diff --git "a/sft/665K/Full_default_moe/checkpoint-16632/trainer_state.json" "b/sft/665K/Full_default_moe/checkpoint-16632/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft/665K/Full_default_moe/checkpoint-16632/trainer_state.json" @@ -0,0 +1,282777 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.05075364, + "auxiliary_loss_mlp": 0.02164498, + "balance_loss_clip": 2.48935461, + "balance_loss_mlp": 1.78512585, + "epoch": 6.012325266796934e-05, + "flos": 29843169474240.0, + "grad_norm": 90.35344605907909, + "language_loss": 2.89324927, + "learning_rate": 0.0, + "loss": 1.9732132, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 25.828125, + "router_z_loss_mlp": 3.79101562, + "step": 1, + "time_per_iteration": 19.313944101333618 + }, + { + "auxiliary_loss_clip": 0.03415321, + "auxiliary_loss_mlp": 0.01412582, + "balance_loss_clip": 1.66284227, + "balance_loss_mlp": 1.17778778, + "epoch": 0.00012024650533593868, + "flos": 24680054544480.0, + "grad_norm": 36.06623228308307, + "language_loss": 1.84125423, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.88953328, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 17.5, + "router_z_loss_mlp": 2.34765625, + "step": 2, + "time_per_iteration": 2.626032590866089 + }, + { + "auxiliary_loss_clip": 0.03356028, + "auxiliary_loss_mlp": 0.01407113, + "balance_loss_clip": 1.65944636, + "balance_loss_mlp": 1.18833995, + "epoch": 0.000180369758003908, + "flos": 27222719133600.0, + "grad_norm": 42.558056813615295, + "language_loss": 1.59598124, + "learning_rate": 7.073439208833112e-07, + "loss": 1.64361274, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 16.953125, + "router_z_loss_mlp": 2.18945312, + "step": 3, + "time_per_iteration": 2.601954460144043 + }, + { + "auxiliary_loss_clip": 0.03374822, + "auxiliary_loss_mlp": 0.01411042, + "balance_loss_clip": 1.65319943, + "balance_loss_mlp": 1.15393138, + "epoch": 0.00024049301067187735, + "flos": 27350937273600.0, + "grad_norm": 53.08902273973465, + "language_loss": 1.70609295, + "learning_rate": 8.925686513863519e-07, + "loss": 1.75395143, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 17.234375, + "router_z_loss_mlp": 2.57226562, + "step": 4, + "time_per_iteration": 2.6800124645233154 + }, + { + "auxiliary_loss_clip": 0.03421716, + "auxiliary_loss_mlp": 0.01439554, + "balance_loss_clip": 1.65258622, + "balance_loss_mlp": 1.19732118, + "epoch": 0.0003006162633398467, + "flos": 26108733728160.0, + "grad_norm": 55.41117633614733, + "language_loss": 1.94123042, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.98984301, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 17.65625, + "router_z_loss_mlp": 2.421875, + "step": 5, + "time_per_iteration": 2.95265793800354 + }, + { + "auxiliary_loss_clip": 0.03384525, + "auxiliary_loss_mlp": 0.01453806, + "balance_loss_clip": 1.64149857, + "balance_loss_mlp": 1.20413399, + "epoch": 0.000360739516007816, + "flos": 26420388710400.0, + "grad_norm": 33.60916184159133, + "language_loss": 1.62005711, + "learning_rate": 1.153628246576487e-06, + "loss": 1.66844046, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 17.40625, + "router_z_loss_mlp": 2.49414062, + "step": 6, + "time_per_iteration": 2.9208219051361084 + }, + { + "auxiliary_loss_clip": 0.03374398, + "auxiliary_loss_mlp": 0.01438659, + "balance_loss_clip": 1.64300013, + "balance_loss_mlp": 1.19299293, + "epoch": 0.0004208627686757854, + "flos": 33141084482880.0, + "grad_norm": 24.22080759837002, + "language_loss": 1.54537678, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.59350729, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 17.3125, + "router_z_loss_mlp": 2.45507812, + "step": 7, + "time_per_iteration": 2.9749228954315186 + }, + { + "auxiliary_loss_clip": 0.03335211, + "auxiliary_loss_mlp": 0.01411031, + "balance_loss_clip": 1.6398468, + "balance_loss_mlp": 1.17108619, + "epoch": 0.0004809860213437547, + "flos": 38216648996640.0, + "grad_norm": 35.805818426560805, + "language_loss": 1.43480098, + "learning_rate": 1.338852977079528e-06, + "loss": 1.48226333, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 16.921875, + "router_z_loss_mlp": 2.3984375, + "step": 8, + "time_per_iteration": 2.9901981353759766 + }, + { + "auxiliary_loss_clip": 0.03398959, + "auxiliary_loss_mlp": 0.01448239, + "balance_loss_clip": 1.63965964, + "balance_loss_mlp": 1.20352602, + "epoch": 0.000541109274011724, + "flos": 39243089260800.0, + "grad_norm": 29.7996717842951, + "language_loss": 1.51206195, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.560534, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 17.59375, + "router_z_loss_mlp": 2.44335938, + "step": 9, + "time_per_iteration": 2.968672037124634 + }, + { + "auxiliary_loss_clip": 0.03333133, + "auxiliary_loss_mlp": 0.01431892, + "balance_loss_clip": 1.64050078, + "balance_loss_mlp": 1.19957685, + "epoch": 0.0006012325266796934, + "flos": 23082160083840.0, + "grad_norm": 27.51953669592014, + "language_loss": 1.45631254, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.50396276, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 16.9140625, + "router_z_loss_mlp": 2.32128906, + "step": 10, + "time_per_iteration": 2.9221670627593994 + }, + { + "auxiliary_loss_clip": 0.03375491, + "auxiliary_loss_mlp": 0.01447785, + "balance_loss_clip": 1.64610744, + "balance_loss_mlp": 1.21089244, + "epoch": 0.0006613557793476627, + "flos": 25349750444160.0, + "grad_norm": 21.12658260959183, + "language_loss": 1.4455204, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.49375319, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 17.296875, + "router_z_loss_mlp": 2.36914062, + "step": 11, + "time_per_iteration": 2.867265224456787 + }, + { + "auxiliary_loss_clip": 0.03330056, + "auxiliary_loss_mlp": 0.01413259, + "balance_loss_clip": 1.63595128, + "balance_loss_mlp": 1.17026246, + "epoch": 0.000721479032015632, + "flos": 20232662068800.0, + "grad_norm": 17.268116829915904, + "language_loss": 1.44001436, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.4874475, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 16.9609375, + "router_z_loss_mlp": 2.42675781, + "step": 12, + "time_per_iteration": 2.8749685287475586 + }, + { + "auxiliary_loss_clip": 0.03330424, + "auxiliary_loss_mlp": 0.01370508, + "balance_loss_clip": 1.6433301, + "balance_loss_mlp": 1.1498282, + "epoch": 0.0007816022846836014, + "flos": 29047721713920.0, + "grad_norm": 9.362272025169979, + "language_loss": 1.20646882, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.25347805, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 16.875, + "router_z_loss_mlp": 2.20898438, + "step": 13, + "time_per_iteration": 2.8924548625946045 + }, + { + "auxiliary_loss_clip": 0.03312184, + "auxiliary_loss_mlp": 0.01430651, + "balance_loss_clip": 1.63948298, + "balance_loss_mlp": 1.19299495, + "epoch": 0.0008417255373515708, + "flos": 23393815066080.0, + "grad_norm": 5.774516397212065, + "language_loss": 1.20367312, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.25110149, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 16.71875, + "router_z_loss_mlp": 2.375, + "step": 14, + "time_per_iteration": 2.890596628189087 + }, + { + "auxiliary_loss_clip": 0.03301363, + "auxiliary_loss_mlp": 0.01390286, + "balance_loss_clip": 1.64342642, + "balance_loss_mlp": 1.15930605, + "epoch": 0.00090184879001954, + "flos": 32208388503840.0, + "grad_norm": 6.094026923195623, + "language_loss": 1.11905706, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.16597354, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 16.578125, + "router_z_loss_mlp": 2.30859375, + "step": 15, + "time_per_iteration": 2.9895102977752686 + }, + { + "auxiliary_loss_clip": 0.03267337, + "auxiliary_loss_mlp": 0.01372697, + "balance_loss_clip": 1.63057756, + "balance_loss_mlp": 1.15506876, + "epoch": 0.0009619720426875094, + "flos": 30114429804000.0, + "grad_norm": 4.753837426690492, + "language_loss": 1.11151004, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.15791035, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 16.359375, + "router_z_loss_mlp": 2.17773438, + "step": 16, + "time_per_iteration": 2.9459683895111084 + }, + { + "auxiliary_loss_clip": 0.03252241, + "auxiliary_loss_mlp": 0.01403514, + "balance_loss_clip": 1.63388944, + "balance_loss_mlp": 1.18874621, + "epoch": 0.0010220952953554788, + "flos": 22725537271200.0, + "grad_norm": 5.731505007025399, + "language_loss": 1.1272862, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.17384374, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 16.171875, + "router_z_loss_mlp": 2.1484375, + "step": 17, + "time_per_iteration": 2.8969786167144775 + }, + { + "auxiliary_loss_clip": 0.03196891, + "auxiliary_loss_mlp": 0.01367492, + "balance_loss_clip": 1.63643289, + "balance_loss_mlp": 1.16397822, + "epoch": 0.001082218548023448, + "flos": 31899407662080.0, + "grad_norm": 4.309460552243762, + "language_loss": 1.07462871, + "learning_rate": 1.860972167459798e-06, + "loss": 1.12027252, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 15.609375, + "router_z_loss_mlp": 2.03808594, + "step": 18, + "time_per_iteration": 4.487032890319824 + }, + { + "auxiliary_loss_clip": 0.03199001, + "auxiliary_loss_mlp": 0.01377215, + "balance_loss_clip": 1.63137031, + "balance_loss_mlp": 1.14232564, + "epoch": 0.0011423418006914173, + "flos": 23927797128960.0, + "grad_norm": 4.450347816038019, + "language_loss": 1.02295792, + "learning_rate": 1.89578346593066e-06, + "loss": 1.0687201, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 15.6640625, + "router_z_loss_mlp": 2.34863281, + "step": 19, + "time_per_iteration": 4.420933246612549 + }, + { + "auxiliary_loss_clip": 0.03153329, + "auxiliary_loss_mlp": 0.01324296, + "balance_loss_clip": 1.63319123, + "balance_loss_mlp": 1.1277442, + "epoch": 0.0012024650533593868, + "flos": 21835702224000.0, + "grad_norm": 3.9563380191697872, + "language_loss": 1.16739035, + "learning_rate": 1.928808765521199e-06, + "loss": 1.21216655, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 15.1875, + "router_z_loss_mlp": 1.96484375, + "step": 20, + "time_per_iteration": 2.8905680179595947 + }, + { + "auxiliary_loss_clip": 0.03136474, + "auxiliary_loss_mlp": 0.01348593, + "balance_loss_clip": 1.61387968, + "balance_loss_mlp": 1.13020205, + "epoch": 0.001262588306027356, + "flos": 25931779650720.0, + "grad_norm": 11.562747999160507, + "language_loss": 1.05458379, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.09943438, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 15.2421875, + "router_z_loss_mlp": 2.18554688, + "step": 21, + "time_per_iteration": 2.9314024448394775 + }, + { + "auxiliary_loss_clip": 0.03032, + "auxiliary_loss_mlp": 0.0135542, + "balance_loss_clip": 1.59199071, + "balance_loss_mlp": 1.14503956, + "epoch": 0.0013227115586953253, + "flos": 31852657071360.0, + "grad_norm": 5.516762530656928, + "language_loss": 1.05628133, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.10015547, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 14.4140625, + "router_z_loss_mlp": 2.10644531, + "step": 22, + "time_per_iteration": 2.8801519870758057 + }, + { + "auxiliary_loss_clip": 0.02979035, + "auxiliary_loss_mlp": 0.01328801, + "balance_loss_clip": 1.59578872, + "balance_loss_mlp": 1.13787508, + "epoch": 0.0013828348113632948, + "flos": 29225769757920.0, + "grad_norm": 2.8139155552122466, + "language_loss": 0.91541886, + "learning_rate": 2.018794797290208e-06, + "loss": 0.95849723, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 13.8359375, + "router_z_loss_mlp": 1.90917969, + "step": 23, + "time_per_iteration": 2.9411375522613525 + }, + { + "auxiliary_loss_clip": 0.02948163, + "auxiliary_loss_mlp": 0.01342524, + "balance_loss_clip": 1.58715093, + "balance_loss_mlp": 1.14444637, + "epoch": 0.001442958064031264, + "flos": 19474002923040.0, + "grad_norm": 2.5920011383199064, + "language_loss": 1.08127117, + "learning_rate": 2.046196897962839e-06, + "loss": 1.12417817, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 13.6328125, + "router_z_loss_mlp": 1.98339844, + "step": 24, + "time_per_iteration": 2.8761208057403564 + }, + { + "auxiliary_loss_clip": 0.02836047, + "auxiliary_loss_mlp": 0.01314326, + "balance_loss_clip": 1.57873523, + "balance_loss_mlp": 1.13131559, + "epoch": 0.0015030813166992333, + "flos": 22096149714720.0, + "grad_norm": 3.7144557546766497, + "language_loss": 1.00866866, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.05017233, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 12.578125, + "router_z_loss_mlp": 1.83007812, + "step": 25, + "time_per_iteration": 2.9235622882843018 + }, + { + "auxiliary_loss_clip": 0.02830522, + "auxiliary_loss_mlp": 0.01298151, + "balance_loss_clip": 1.58184624, + "balance_loss_mlp": 1.11123073, + "epoch": 0.0015632045693672028, + "flos": 27130960195200.0, + "grad_norm": 2.700079905917366, + "language_loss": 1.06415462, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10544133, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 12.5, + "router_z_loss_mlp": 1.87011719, + "step": 26, + "time_per_iteration": 2.897278308868408 + }, + { + "auxiliary_loss_clip": 0.02768305, + "auxiliary_loss_mlp": 0.01312266, + "balance_loss_clip": 1.570894, + "balance_loss_mlp": 1.13144946, + "epoch": 0.001623327822035172, + "flos": 29274910868160.0, + "grad_norm": 2.6383399923190556, + "language_loss": 0.95509094, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99589664, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 11.96875, + "router_z_loss_mlp": 1.80859375, + "step": 27, + "time_per_iteration": 2.9694721698760986 + }, + { + "auxiliary_loss_clip": 0.02747363, + "auxiliary_loss_mlp": 0.0130723, + "balance_loss_clip": 1.57629824, + "balance_loss_mlp": 1.14214909, + "epoch": 0.0016834510747031415, + "flos": 24011087955840.0, + "grad_norm": 2.7852127116957655, + "language_loss": 1.06237817, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.10292411, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 11.7109375, + "router_z_loss_mlp": 1.65136719, + "step": 28, + "time_per_iteration": 2.9603118896484375 + }, + { + "auxiliary_loss_clip": 0.02715081, + "auxiliary_loss_mlp": 0.01314861, + "balance_loss_clip": 1.55893636, + "balance_loss_mlp": 1.14510751, + "epoch": 0.0017435743273711108, + "flos": 25531242456960.0, + "grad_norm": 4.120257775336123, + "language_loss": 1.02399158, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.064291, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 11.578125, + "router_z_loss_mlp": 1.69726562, + "step": 29, + "time_per_iteration": 2.8906047344207764 + }, + { + "auxiliary_loss_clip": 0.02694117, + "auxiliary_loss_mlp": 0.01305456, + "balance_loss_clip": 1.55593443, + "balance_loss_mlp": 1.13474846, + "epoch": 0.00180369758003908, + "flos": 23829920081280.0, + "grad_norm": 2.958050756868074, + "language_loss": 1.18992901, + "learning_rate": 2.189868360711334e-06, + "loss": 1.2299248, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 11.3828125, + "router_z_loss_mlp": 1.70703125, + "step": 30, + "time_per_iteration": 2.8802270889282227 + }, + { + "auxiliary_loss_clip": 0.02625977, + "auxiliary_loss_mlp": 0.01330461, + "balance_loss_clip": 1.54096079, + "balance_loss_mlp": 1.16986179, + "epoch": 0.0018638208327070496, + "flos": 33499003848480.0, + "grad_norm": 2.636111699166659, + "language_loss": 1.02863753, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.06820178, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 10.8359375, + "router_z_loss_mlp": 1.60644531, + "step": 31, + "time_per_iteration": 2.9062626361846924 + }, + { + "auxiliary_loss_clip": 0.02586276, + "auxiliary_loss_mlp": 0.01312351, + "balance_loss_clip": 1.53739643, + "balance_loss_mlp": 1.15060782, + "epoch": 0.0019239440853750188, + "flos": 16581360355200.0, + "grad_norm": 2.332189431626574, + "language_loss": 0.95425963, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.9932459, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 10.484375, + "router_z_loss_mlp": 1.61523438, + "step": 32, + "time_per_iteration": 2.873229503631592 + }, + { + "auxiliary_loss_clip": 0.02566526, + "auxiliary_loss_mlp": 0.01294521, + "balance_loss_clip": 1.53403437, + "balance_loss_mlp": 1.13993049, + "epoch": 0.001984067338042988, + "flos": 13731335615520.0, + "grad_norm": 3.119215211514814, + "language_loss": 0.94997674, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.98858726, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 10.3203125, + "router_z_loss_mlp": 1.54589844, + "step": 33, + "time_per_iteration": 2.884086847305298 + }, + { + "auxiliary_loss_clip": 0.02445346, + "auxiliary_loss_mlp": 0.01298493, + "balance_loss_clip": 1.50761545, + "balance_loss_mlp": 1.1521039, + "epoch": 0.0020441905907109576, + "flos": 27315531521280.0, + "grad_norm": 1.9067773774733798, + "language_loss": 0.91228378, + "learning_rate": 2.270454923596497e-06, + "loss": 0.94972217, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 9.3671875, + "router_z_loss_mlp": 1.46289062, + "step": 34, + "time_per_iteration": 2.9541094303131104 + }, + { + "auxiliary_loss_clip": 0.02389429, + "auxiliary_loss_mlp": 0.01261683, + "balance_loss_clip": 1.47298694, + "balance_loss_mlp": 1.11929965, + "epoch": 0.0021043138433789266, + "flos": 60742542513600.0, + "grad_norm": 2.574388394158631, + "language_loss": 0.76467049, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80118161, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 9.1640625, + "router_z_loss_mlp": 1.42480469, + "step": 35, + "time_per_iteration": 3.1509780883789062 + }, + { + "auxiliary_loss_clip": 0.02356135, + "auxiliary_loss_mlp": 0.01264841, + "balance_loss_clip": 1.4827956, + "balance_loss_mlp": 1.13361549, + "epoch": 0.002164437096046896, + "flos": 25084886569920.0, + "grad_norm": 3.981645929199942, + "language_loss": 0.8899579, + "learning_rate": 2.307256493152974e-06, + "loss": 0.92616761, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 8.73828125, + "router_z_loss_mlp": 1.3125, + "step": 36, + "time_per_iteration": 2.8578379154205322 + }, + { + "auxiliary_loss_clip": 0.02299005, + "auxiliary_loss_mlp": 0.01322311, + "balance_loss_clip": 1.47196412, + "balance_loss_mlp": 1.18803406, + "epoch": 0.0022245603487148656, + "flos": 32387368445280.0, + "grad_norm": 3.609559420029389, + "language_loss": 0.93162656, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.96783978, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 8.2734375, + "router_z_loss_mlp": 1.34277344, + "step": 37, + "time_per_iteration": 2.874316930770874 + }, + { + "auxiliary_loss_clip": 0.02263899, + "auxiliary_loss_mlp": 0.01285339, + "balance_loss_clip": 1.4646709, + "balance_loss_mlp": 1.1713748, + "epoch": 0.0022846836013828346, + "flos": 24816781313280.0, + "grad_norm": 2.014880208432467, + "language_loss": 1.04020298, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07569528, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 7.99609375, + "router_z_loss_mlp": 1.13964844, + "step": 38, + "time_per_iteration": 2.836562156677246 + }, + { + "auxiliary_loss_clip": 0.02229362, + "auxiliary_loss_mlp": 0.01247681, + "balance_loss_clip": 1.45691252, + "balance_loss_mlp": 1.13591051, + "epoch": 0.002344806854050804, + "flos": 32027990457600.0, + "grad_norm": 2.2302169897081154, + "language_loss": 0.85361463, + "learning_rate": 2.358792165262154e-06, + "loss": 0.88838506, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 7.71484375, + "router_z_loss_mlp": 1.11914062, + "step": 39, + "time_per_iteration": 2.915477752685547 + }, + { + "auxiliary_loss_clip": 0.02195545, + "auxiliary_loss_mlp": 0.01244827, + "balance_loss_clip": 1.44230461, + "balance_loss_mlp": 1.12714338, + "epoch": 0.0024049301067187736, + "flos": 14399613410400.0, + "grad_norm": 2.5907057334079306, + "language_loss": 0.89661765, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.93102133, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 7.5390625, + "router_z_loss_mlp": 1.17871094, + "step": 40, + "time_per_iteration": 2.821971893310547 + }, + { + "auxiliary_loss_clip": 0.02154214, + "auxiliary_loss_mlp": 0.01272302, + "balance_loss_clip": 1.43762505, + "balance_loss_mlp": 1.16806567, + "epoch": 0.0024650533593867426, + "flos": 25174579127040.0, + "grad_norm": 4.225306470083118, + "language_loss": 0.93387353, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.96813869, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 7.1796875, + "router_z_loss_mlp": 1.04199219, + "step": 41, + "time_per_iteration": 2.8098654747009277 + }, + { + "auxiliary_loss_clip": 0.02112419, + "auxiliary_loss_mlp": 0.01251408, + "balance_loss_clip": 1.4249208, + "balance_loss_mlp": 1.1528933, + "epoch": 0.002525176612054712, + "flos": 22458971671200.0, + "grad_norm": 3.3103627416131522, + "language_loss": 0.97104627, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.00468457, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 6.87890625, + "router_z_loss_mlp": 0.98486328, + "step": 42, + "time_per_iteration": 2.827666997909546 + }, + { + "auxiliary_loss_clip": 0.02075672, + "auxiliary_loss_mlp": 0.01294826, + "balance_loss_clip": 1.4239254, + "balance_loss_mlp": 1.19583476, + "epoch": 0.0025852998647226816, + "flos": 34390743207840.0, + "grad_norm": 4.587019815507019, + "language_loss": 0.97609675, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.00980175, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 6.52734375, + "router_z_loss_mlp": 0.99023438, + "step": 43, + "time_per_iteration": 2.894254446029663 + }, + { + "auxiliary_loss_clip": 0.02091955, + "auxiliary_loss_mlp": 0.01287777, + "balance_loss_clip": 1.42226863, + "balance_loss_mlp": 1.18539977, + "epoch": 0.0026454231173906506, + "flos": 17427767228640.0, + "grad_norm": 2.273463487462894, + "language_loss": 0.93535686, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.96915412, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 6.69140625, + "router_z_loss_mlp": 1.02392578, + "step": 44, + "time_per_iteration": 2.8050522804260254 + }, + { + "auxiliary_loss_clip": 0.02063394, + "auxiliary_loss_mlp": 0.01271016, + "balance_loss_clip": 1.42614889, + "balance_loss_mlp": 1.17846215, + "epoch": 0.00270554637005862, + "flos": 27356082968160.0, + "grad_norm": 2.39159960929637, + "language_loss": 0.98589307, + "learning_rate": 2.450927955901469e-06, + "loss": 1.01923728, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 6.375, + "router_z_loss_mlp": 0.92578125, + "step": 45, + "time_per_iteration": 2.8413240909576416 + }, + { + "auxiliary_loss_clip": 0.02033983, + "auxiliary_loss_mlp": 0.01216593, + "balance_loss_clip": 1.41165066, + "balance_loss_mlp": 1.13309908, + "epoch": 0.0027656696227265896, + "flos": 29267941896000.0, + "grad_norm": 2.445338434531516, + "language_loss": 1.02678037, + "learning_rate": 2.465079122983384e-06, + "loss": 1.05928612, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 6.23046875, + "router_z_loss_mlp": 0.83496094, + "step": 46, + "time_per_iteration": 2.883270740509033 + }, + { + "auxiliary_loss_clip": 0.01999248, + "auxiliary_loss_mlp": 0.01258863, + "balance_loss_clip": 1.401088, + "balance_loss_mlp": 1.17560697, + "epoch": 0.0028257928753945586, + "flos": 45965729862720.0, + "grad_norm": 3.0555349780229255, + "language_loss": 0.88278103, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.91536212, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 5.9765625, + "router_z_loss_mlp": 0.83300781, + "step": 47, + "time_per_iteration": 2.9915354251861572 + }, + { + "auxiliary_loss_clip": 0.01964483, + "auxiliary_loss_mlp": 0.01248836, + "balance_loss_clip": 1.39352345, + "balance_loss_mlp": 1.16977632, + "epoch": 0.002885916128062528, + "flos": 27399308555520.0, + "grad_norm": 1.8428229998147552, + "language_loss": 0.87866056, + "learning_rate": 2.492481223656015e-06, + "loss": 0.91079372, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 5.7109375, + "router_z_loss_mlp": 0.79003906, + "step": 48, + "time_per_iteration": 2.817441463470459 + }, + { + "auxiliary_loss_clip": 0.01971349, + "auxiliary_loss_mlp": 0.01235785, + "balance_loss_clip": 1.38728189, + "balance_loss_mlp": 1.15190911, + "epoch": 0.0029460393807304976, + "flos": 32961861437760.0, + "grad_norm": 2.663139850793192, + "language_loss": 0.89778459, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.929856, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 5.84375, + "router_z_loss_mlp": 0.83935547, + "step": 49, + "time_per_iteration": 2.9781811237335205 + }, + { + "auxiliary_loss_clip": 0.0195257, + "auxiliary_loss_mlp": 0.0122866, + "balance_loss_clip": 1.37672901, + "balance_loss_mlp": 1.15031576, + "epoch": 0.0030061626333984666, + "flos": 19342057193280.0, + "grad_norm": 2.037392230349484, + "language_loss": 0.90646648, + "learning_rate": 2.51876455396287e-06, + "loss": 0.93827873, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 5.7578125, + "router_z_loss_mlp": 0.78320312, + "step": 50, + "time_per_iteration": 2.881181001663208 + }, + { + "auxiliary_loss_clip": 0.01954041, + "auxiliary_loss_mlp": 0.01188108, + "balance_loss_clip": 1.38052964, + "balance_loss_mlp": 1.11252904, + "epoch": 0.003066285886066436, + "flos": 38836758096000.0, + "grad_norm": 2.713557885971132, + "language_loss": 0.86727047, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.89869195, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 5.7265625, + "router_z_loss_mlp": 0.75683594, + "step": 51, + "time_per_iteration": 2.979857921600342 + }, + { + "auxiliary_loss_clip": 0.01911674, + "auxiliary_loss_mlp": 0.01193997, + "balance_loss_clip": 1.3730154, + "balance_loss_mlp": 1.12323403, + "epoch": 0.0031264091387344056, + "flos": 50551510315680.0, + "grad_norm": 2.2714964767794505, + "language_loss": 0.95164382, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98270059, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 5.390625, + "router_z_loss_mlp": 0.70751953, + "step": 52, + "time_per_iteration": 3.024636745452881 + }, + { + "auxiliary_loss_clip": 0.01913025, + "auxiliary_loss_mlp": 0.01225796, + "balance_loss_clip": 1.37112594, + "balance_loss_mlp": 1.15226805, + "epoch": 0.0031865323914023747, + "flos": 28602864966240.0, + "grad_norm": 1.880675508508894, + "language_loss": 0.92022657, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.95161486, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 5.4140625, + "router_z_loss_mlp": 0.73535156, + "step": 53, + "time_per_iteration": 2.8416264057159424 + }, + { + "auxiliary_loss_clip": 0.01902071, + "auxiliary_loss_mlp": 0.01188198, + "balance_loss_clip": 1.37579787, + "balance_loss_mlp": 1.11343026, + "epoch": 0.003246655644070344, + "flos": 17606017859040.0, + "grad_norm": 2.5078950824802044, + "language_loss": 0.82855946, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.85946214, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 5.26171875, + "router_z_loss_mlp": 0.74755859, + "step": 54, + "time_per_iteration": 2.8315253257751465 + }, + { + "auxiliary_loss_clip": 0.01892589, + "auxiliary_loss_mlp": 0.01190905, + "balance_loss_clip": 1.36130226, + "balance_loss_mlp": 1.11966586, + "epoch": 0.0033067788967383136, + "flos": 43828707644640.0, + "grad_norm": 2.3066083745576083, + "language_loss": 0.81227064, + "learning_rate": 2.580130221340046e-06, + "loss": 0.84310561, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 5.31640625, + "router_z_loss_mlp": 0.71240234, + "step": 55, + "time_per_iteration": 3.091524600982666 + }, + { + "auxiliary_loss_clip": 0.01888506, + "auxiliary_loss_mlp": 0.01184416, + "balance_loss_clip": 1.35923934, + "balance_loss_mlp": 1.1106497, + "epoch": 0.003366902149406283, + "flos": 28015163340480.0, + "grad_norm": 3.8162038305204775, + "language_loss": 0.86693233, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.89766157, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 5.29296875, + "router_z_loss_mlp": 0.73730469, + "step": 56, + "time_per_iteration": 2.900674819946289 + }, + { + "auxiliary_loss_clip": 0.01888047, + "auxiliary_loss_mlp": 0.01155024, + "balance_loss_clip": 1.35259771, + "balance_loss_mlp": 1.08645487, + "epoch": 0.003427025402074252, + "flos": 32436469038240.0, + "grad_norm": 1.745124188715631, + "language_loss": 0.92776102, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.95819175, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 5.359375, + "router_z_loss_mlp": 0.68603516, + "step": 57, + "time_per_iteration": 4.476167678833008 + }, + { + "auxiliary_loss_clip": 0.01850034, + "auxiliary_loss_mlp": 0.01191573, + "balance_loss_clip": 1.35405684, + "balance_loss_mlp": 1.12672353, + "epoch": 0.0034871486547422216, + "flos": 29225567171520.0, + "grad_norm": 2.0906264346928625, + "language_loss": 0.99958622, + "learning_rate": 2.614325098333948e-06, + "loss": 1.03000224, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 4.96484375, + "router_z_loss_mlp": 0.64892578, + "step": 58, + "time_per_iteration": 5.742264032363892 + }, + { + "auxiliary_loss_clip": 0.01829305, + "auxiliary_loss_mlp": 0.01179133, + "balance_loss_clip": 1.340415, + "balance_loss_mlp": 1.11428356, + "epoch": 0.003547271907410191, + "flos": 25886609233920.0, + "grad_norm": 2.1413673807906513, + "language_loss": 0.88193399, + "learning_rate": 2.625331386578098e-06, + "loss": 0.91201836, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 4.89453125, + "router_z_loss_mlp": 0.64794922, + "step": 59, + "time_per_iteration": 2.8503546714782715 + }, + { + "auxiliary_loss_clip": 0.01850078, + "auxiliary_loss_mlp": 0.01151397, + "balance_loss_clip": 1.35059047, + "balance_loss_mlp": 1.08449626, + "epoch": 0.00360739516007816, + "flos": 20139322956480.0, + "grad_norm": 3.2046752235589846, + "language_loss": 0.9325521, + "learning_rate": 2.63615268640451e-06, + "loss": 0.96256679, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 5.0, + "router_z_loss_mlp": 0.66845703, + "step": 60, + "time_per_iteration": 2.7693469524383545 + }, + { + "auxiliary_loss_clip": 0.01825168, + "auxiliary_loss_mlp": 0.01162278, + "balance_loss_clip": 1.33133757, + "balance_loss_mlp": 1.09790492, + "epoch": 0.0036675184127461296, + "flos": 23750802534240.0, + "grad_norm": 2.875777877821645, + "language_loss": 0.89902604, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.92890048, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 4.93359375, + "router_z_loss_mlp": 0.64306641, + "step": 61, + "time_per_iteration": 2.838298797607422 + }, + { + "auxiliary_loss_clip": 0.01814021, + "auxiliary_loss_mlp": 0.01130438, + "balance_loss_clip": 1.33008933, + "balance_loss_mlp": 1.06978416, + "epoch": 0.003727641665414099, + "flos": 25572401663040.0, + "grad_norm": 2.146546700890689, + "language_loss": 0.88577527, + "learning_rate": 2.657264485425803e-06, + "loss": 0.9152199, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 4.84765625, + "router_z_loss_mlp": 0.60644531, + "step": 62, + "time_per_iteration": 2.814664363861084 + }, + { + "auxiliary_loss_clip": 0.01795228, + "auxiliary_loss_mlp": 0.01153143, + "balance_loss_clip": 1.32233107, + "balance_loss_mlp": 1.08981955, + "epoch": 0.003787764918082068, + "flos": 22458890636640.0, + "grad_norm": 1.9021793821528126, + "language_loss": 0.96287704, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.99236071, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 4.7265625, + "router_z_loss_mlp": 0.6328125, + "step": 63, + "time_per_iteration": 2.8458399772644043 + }, + { + "auxiliary_loss_clip": 0.01804353, + "auxiliary_loss_mlp": 0.01163295, + "balance_loss_clip": 1.32873404, + "balance_loss_mlp": 1.10230732, + "epoch": 0.0038478881707500376, + "flos": 15202105902720.0, + "grad_norm": 2.6163182366008106, + "language_loss": 0.99146158, + "learning_rate": 2.677705954159056e-06, + "loss": 1.02113819, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 4.75, + "router_z_loss_mlp": 0.60986328, + "step": 64, + "time_per_iteration": 2.7901804447174072 + }, + { + "auxiliary_loss_clip": 0.01811229, + "auxiliary_loss_mlp": 0.01138675, + "balance_loss_clip": 1.32866788, + "balance_loss_mlp": 1.07630432, + "epoch": 0.003908011423418007, + "flos": 16537243387680.0, + "grad_norm": 2.02792446841672, + "language_loss": 0.85338616, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.88288516, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 4.83203125, + "router_z_loss_mlp": 0.62353516, + "step": 65, + "time_per_iteration": 2.80338191986084 + }, + { + "auxiliary_loss_clip": 0.01782022, + "auxiliary_loss_mlp": 0.01149675, + "balance_loss_clip": 1.31303692, + "balance_loss_mlp": 1.08906853, + "epoch": 0.003968134676085976, + "flos": 22370292046080.0, + "grad_norm": 2.063118309269249, + "language_loss": 0.85286683, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88218379, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 4.69140625, + "router_z_loss_mlp": 0.60595703, + "step": 66, + "time_per_iteration": 2.7796356678009033 + }, + { + "auxiliary_loss_clip": 0.01783266, + "auxiliary_loss_mlp": 0.01149011, + "balance_loss_clip": 1.30898261, + "balance_loss_mlp": 1.08211064, + "epoch": 0.004028257928753946, + "flos": 24996125910240.0, + "grad_norm": 2.2189348849879447, + "language_loss": 0.96183962, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.99116236, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 4.73828125, + "router_z_loss_mlp": 0.66894531, + "step": 67, + "time_per_iteration": 2.8310251235961914 + }, + { + "auxiliary_loss_clip": 0.01754421, + "auxiliary_loss_mlp": 0.01147593, + "balance_loss_clip": 1.30337036, + "balance_loss_mlp": 1.08407807, + "epoch": 0.004088381181421915, + "flos": 22993115803200.0, + "grad_norm": 2.538411971278939, + "language_loss": 0.94299752, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97201765, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 4.5078125, + "router_z_loss_mlp": 0.63525391, + "step": 68, + "time_per_iteration": 2.8272855281829834 + }, + { + "auxiliary_loss_clip": 0.017523, + "auxiliary_loss_mlp": 0.01152274, + "balance_loss_clip": 1.30210948, + "balance_loss_mlp": 1.09047627, + "epoch": 0.004148504434089885, + "flos": 23705713152000.0, + "grad_norm": 2.067480771116571, + "language_loss": 0.95789373, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98693943, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 4.5, + "router_z_loss_mlp": 0.61816406, + "step": 69, + "time_per_iteration": 2.8271875381469727 + }, + { + "auxiliary_loss_clip": 0.017444, + "auxiliary_loss_mlp": 0.01153173, + "balance_loss_clip": 1.3022716, + "balance_loss_mlp": 1.09418869, + "epoch": 0.004208627686757853, + "flos": 25393907928960.0, + "grad_norm": 2.4211937331742766, + "language_loss": 0.97501731, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.00399303, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 4.41796875, + "router_z_loss_mlp": 0.58984375, + "step": 70, + "time_per_iteration": 2.8193652629852295 + }, + { + "auxiliary_loss_clip": 0.01748363, + "auxiliary_loss_mlp": 0.01140783, + "balance_loss_clip": 1.2928462, + "balance_loss_mlp": 1.07860303, + "epoch": 0.004268750939425823, + "flos": 23304081991680.0, + "grad_norm": 3.015928674392508, + "language_loss": 0.93954813, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96843958, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 4.546875, + "router_z_loss_mlp": 0.62158203, + "step": 71, + "time_per_iteration": 2.783710241317749 + }, + { + "auxiliary_loss_clip": 0.01800146, + "auxiliary_loss_mlp": 0.01389993, + "balance_loss_clip": 1.44346309, + "balance_loss_mlp": 1.34583771, + "epoch": 0.004328874192093792, + "flos": 63986628032640.0, + "grad_norm": 2.4022310070021984, + "language_loss": 0.65454173, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68644309, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 3.56640625, + "router_z_loss_mlp": 0.44238281, + "step": 72, + "time_per_iteration": 3.6408872604370117 + }, + { + "auxiliary_loss_clip": 0.01786245, + "auxiliary_loss_mlp": 0.01336167, + "balance_loss_clip": 1.43702865, + "balance_loss_mlp": 1.293419, + "epoch": 0.004388997444761762, + "flos": 81109385112960.0, + "grad_norm": 2.2251625400890007, + "language_loss": 0.63494819, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66617227, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 0.42797852, + "step": 73, + "time_per_iteration": 3.8619563579559326 + }, + { + "auxiliary_loss_clip": 0.01723774, + "auxiliary_loss_mlp": 0.01132447, + "balance_loss_clip": 1.28721881, + "balance_loss_mlp": 1.07160211, + "epoch": 0.004449120697429731, + "flos": 23126317568640.0, + "grad_norm": 2.6902664952271707, + "language_loss": 0.86109471, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88965684, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 4.3671875, + "router_z_loss_mlp": 0.60742188, + "step": 74, + "time_per_iteration": 3.118027925491333 + }, + { + "auxiliary_loss_clip": 0.017274, + "auxiliary_loss_mlp": 0.01156824, + "balance_loss_clip": 1.28497267, + "balance_loss_mlp": 1.09373856, + "epoch": 0.004509243950097701, + "flos": 25753002295680.0, + "grad_norm": 2.078937641766488, + "language_loss": 0.9719497, + "learning_rate": 2.779824149153005e-06, + "loss": 1.00079191, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 4.41796875, + "router_z_loss_mlp": 0.63085938, + "step": 75, + "time_per_iteration": 2.8297815322875977 + }, + { + "auxiliary_loss_clip": 0.01707269, + "auxiliary_loss_mlp": 0.01147252, + "balance_loss_clip": 1.28090417, + "balance_loss_mlp": 1.08717084, + "epoch": 0.004569367202765669, + "flos": 25257140642880.0, + "grad_norm": 2.092928033196904, + "language_loss": 0.87582511, + "learning_rate": 2.788352117317012e-06, + "loss": 0.90437031, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 4.26953125, + "router_z_loss_mlp": 0.60107422, + "step": 76, + "time_per_iteration": 2.814002752304077 + }, + { + "auxiliary_loss_clip": 0.01707613, + "auxiliary_loss_mlp": 0.01134315, + "balance_loss_clip": 1.2801404, + "balance_loss_mlp": 1.072469, + "epoch": 0.004629490455433639, + "flos": 34969693101120.0, + "grad_norm": 1.8008484530316902, + "language_loss": 0.91604555, + "learning_rate": 2.796768605577095e-06, + "loss": 0.9444648, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 4.2734375, + "router_z_loss_mlp": 0.61767578, + "step": 77, + "time_per_iteration": 2.8796350955963135 + }, + { + "auxiliary_loss_clip": 0.01698021, + "auxiliary_loss_mlp": 0.01164206, + "balance_loss_clip": 1.28268504, + "balance_loss_mlp": 1.10164475, + "epoch": 0.004689613708101608, + "flos": 13509818880480.0, + "grad_norm": 2.406784800265504, + "language_loss": 0.9193157, + "learning_rate": 2.80507649095533e-06, + "loss": 0.94793797, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 4.15234375, + "router_z_loss_mlp": 0.62548828, + "step": 78, + "time_per_iteration": 2.7623069286346436 + }, + { + "auxiliary_loss_clip": 0.01691104, + "auxiliary_loss_mlp": 0.01149928, + "balance_loss_clip": 1.27327788, + "balance_loss_mlp": 1.08808208, + "epoch": 0.004749736960769578, + "flos": 26599368651840.0, + "grad_norm": 2.3759546398257365, + "language_loss": 0.82317257, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85158288, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 4.17578125, + "router_z_loss_mlp": 0.61767578, + "step": 79, + "time_per_iteration": 2.803164482116699 + }, + { + "auxiliary_loss_clip": 0.01706953, + "auxiliary_loss_mlp": 0.01130934, + "balance_loss_clip": 1.27965713, + "balance_loss_mlp": 1.067801, + "epoch": 0.004809860213437547, + "flos": 24150853520640.0, + "grad_norm": 1.736311542735376, + "language_loss": 0.91213226, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.94051117, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 4.2734375, + "router_z_loss_mlp": 0.63085938, + "step": 80, + "time_per_iteration": 2.814166784286499 + }, + { + "auxiliary_loss_clip": 0.01675378, + "auxiliary_loss_mlp": 0.01137319, + "balance_loss_clip": 1.27105379, + "balance_loss_mlp": 1.07494879, + "epoch": 0.004869983466105517, + "flos": 32426866442880.0, + "grad_norm": 2.16289047997952, + "language_loss": 0.95199132, + "learning_rate": 2.829375683533245e-06, + "loss": 0.98011833, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 4.03515625, + "router_z_loss_mlp": 0.62304688, + "step": 81, + "time_per_iteration": 2.823190450668335 + }, + { + "auxiliary_loss_clip": 0.01686911, + "auxiliary_loss_mlp": 0.01140579, + "balance_loss_clip": 1.27349305, + "balance_loss_mlp": 1.08126092, + "epoch": 0.004930106718773485, + "flos": 15646516960320.0, + "grad_norm": 3.1308834559412095, + "language_loss": 0.96176922, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.99004418, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 4.13085938, + "router_z_loss_mlp": 0.59375, + "step": 82, + "time_per_iteration": 2.873552083969116 + }, + { + "auxiliary_loss_clip": 0.01670332, + "auxiliary_loss_mlp": 0.01165185, + "balance_loss_clip": 1.26069593, + "balance_loss_mlp": 1.10081196, + "epoch": 0.004990229971441455, + "flos": 31451066428320.0, + "grad_norm": 1.973782864846563, + "language_loss": 0.86498612, + "learning_rate": 2.84508017388607e-06, + "loss": 0.8933413, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 4.09375, + "router_z_loss_mlp": 0.64355469, + "step": 83, + "time_per_iteration": 2.84626841545105 + }, + { + "auxiliary_loss_clip": 0.0166579, + "auxiliary_loss_mlp": 0.01151072, + "balance_loss_clip": 1.26331139, + "balance_loss_mlp": 1.08774805, + "epoch": 0.005050353224109424, + "flos": 21301112401920.0, + "grad_norm": 2.3479813887105614, + "language_loss": 0.91554952, + "learning_rate": 2.852791070641559e-06, + "loss": 0.94371819, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 4.02539062, + "router_z_loss_mlp": 0.63330078, + "step": 84, + "time_per_iteration": 2.780439853668213 + }, + { + "auxiliary_loss_clip": 0.01647703, + "auxiliary_loss_mlp": 0.01124868, + "balance_loss_clip": 1.37149096, + "balance_loss_mlp": 1.08736491, + "epoch": 0.005110476476777394, + "flos": 85177870272000.0, + "grad_norm": 1.4292500744007008, + "language_loss": 0.62515396, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65287971, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 2.7578125, + "router_z_loss_mlp": 0.37426758, + "step": 85, + "time_per_iteration": 3.4897592067718506 + }, + { + "auxiliary_loss_clip": 0.01649835, + "auxiliary_loss_mlp": 0.011219, + "balance_loss_clip": 1.25041091, + "balance_loss_mlp": 1.05900478, + "epoch": 0.005170599729445363, + "flos": 30249292777920.0, + "grad_norm": 1.8947417579530357, + "language_loss": 0.90444183, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93215919, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 3.99414062, + "router_z_loss_mlp": 0.62841797, + "step": 86, + "time_per_iteration": 3.0079801082611084 + }, + { + "auxiliary_loss_clip": 0.01655256, + "auxiliary_loss_mlp": 0.01158323, + "balance_loss_clip": 1.25857496, + "balance_loss_mlp": 1.09409344, + "epoch": 0.005230722982113333, + "flos": 28379727540000.0, + "grad_norm": 2.8443992383125645, + "language_loss": 0.82017779, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.84831357, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 3.96484375, + "router_z_loss_mlp": 0.64160156, + "step": 87, + "time_per_iteration": 2.8648266792297363 + }, + { + "auxiliary_loss_clip": 0.01641315, + "auxiliary_loss_mlp": 0.01157558, + "balance_loss_clip": 1.25554299, + "balance_loss_mlp": 1.09561682, + "epoch": 0.005290846234781301, + "flos": 20411074768320.0, + "grad_norm": 2.4135816238260435, + "language_loss": 0.95828581, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.9862746, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 3.85546875, + "router_z_loss_mlp": 0.61914062, + "step": 88, + "time_per_iteration": 2.8540093898773193 + }, + { + "auxiliary_loss_clip": 0.01654791, + "auxiliary_loss_mlp": 0.01148555, + "balance_loss_clip": 1.25383842, + "balance_loss_mlp": 1.08756781, + "epoch": 0.005350969487449271, + "flos": 25485302211840.0, + "grad_norm": 1.9868498929912293, + "language_loss": 0.86170548, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88973892, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 4.00976562, + "router_z_loss_mlp": 0.60986328, + "step": 89, + "time_per_iteration": 2.83520770072937 + }, + { + "auxiliary_loss_clip": 0.01644156, + "auxiliary_loss_mlp": 0.0112102, + "balance_loss_clip": 1.24595094, + "balance_loss_mlp": 1.06160605, + "epoch": 0.00541109274011724, + "flos": 31982212281600.0, + "grad_norm": 1.9341192640558793, + "language_loss": 0.91604352, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94369531, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 3.98046875, + "router_z_loss_mlp": 0.59375, + "step": 90, + "time_per_iteration": 2.965790271759033 + }, + { + "auxiliary_loss_clip": 0.01627754, + "auxiliary_loss_mlp": 0.01125298, + "balance_loss_clip": 1.24619484, + "balance_loss_mlp": 1.06412017, + "epoch": 0.00547121599278521, + "flos": 25842168128160.0, + "grad_norm": 3.4178404628790715, + "language_loss": 0.85930836, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88683891, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 3.81835938, + "router_z_loss_mlp": 0.61083984, + "step": 91, + "time_per_iteration": 2.8157992362976074 + }, + { + "auxiliary_loss_clip": 0.01623982, + "auxiliary_loss_mlp": 0.0114146, + "balance_loss_clip": 1.238662, + "balance_loss_mlp": 1.08171225, + "epoch": 0.005531339245453179, + "flos": 24060877342560.0, + "grad_norm": 2.4443717567587244, + "language_loss": 0.87134576, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89900017, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 3.85546875, + "router_z_loss_mlp": 0.59765625, + "step": 92, + "time_per_iteration": 2.852381944656372 + }, + { + "auxiliary_loss_clip": 0.01617658, + "auxiliary_loss_mlp": 0.01180628, + "balance_loss_clip": 1.22998929, + "balance_loss_mlp": 1.11863947, + "epoch": 0.005591462498121149, + "flos": 21519671375520.0, + "grad_norm": 3.0186613181736104, + "language_loss": 0.9202221, + "learning_rate": 2.918324080615938e-06, + "loss": 0.94820499, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 3.87695312, + "router_z_loss_mlp": 0.61962891, + "step": 93, + "time_per_iteration": 2.8524281978607178 + }, + { + "auxiliary_loss_clip": 0.01632722, + "auxiliary_loss_mlp": 0.01151222, + "balance_loss_clip": 1.23549724, + "balance_loss_mlp": 1.08684897, + "epoch": 0.005651585750789117, + "flos": 24417662224320.0, + "grad_norm": 2.1865651987494004, + "language_loss": 0.87442029, + "learning_rate": 2.925210265866963e-06, + "loss": 0.90225971, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 3.96679688, + "router_z_loss_mlp": 0.64306641, + "step": 94, + "time_per_iteration": 2.8578548431396484 + }, + { + "auxiliary_loss_clip": 0.0156222, + "auxiliary_loss_mlp": 0.01047784, + "balance_loss_clip": 1.32113934, + "balance_loss_mlp": 1.01214039, + "epoch": 0.005711709003457087, + "flos": 72983827967040.0, + "grad_norm": 1.3636555818662717, + "language_loss": 0.68100286, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70710289, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 0.35717773, + "step": 95, + "time_per_iteration": 3.3257503509521484 + }, + { + "auxiliary_loss_clip": 0.01606611, + "auxiliary_loss_mlp": 0.0114516, + "balance_loss_clip": 1.22315121, + "balance_loss_mlp": 1.08526921, + "epoch": 0.005771832256125056, + "flos": 18985474897920.0, + "grad_norm": 2.560981220068937, + "language_loss": 0.90487701, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.93239468, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 3.83203125, + "router_z_loss_mlp": 0.59912109, + "step": 96, + "time_per_iteration": 2.807335376739502 + }, + { + "auxiliary_loss_clip": 0.01598039, + "auxiliary_loss_mlp": 0.01136253, + "balance_loss_clip": 1.22569227, + "balance_loss_mlp": 1.07927132, + "epoch": 0.005831955508793026, + "flos": 27489325250880.0, + "grad_norm": 2.669557990731899, + "language_loss": 0.90003639, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.92737925, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 3.72265625, + "router_z_loss_mlp": 0.56933594, + "step": 97, + "time_per_iteration": 5.8601744174957275 + }, + { + "auxiliary_loss_clip": 0.0158771, + "auxiliary_loss_mlp": 0.0113488, + "balance_loss_clip": 1.22101879, + "balance_loss_mlp": 1.07355881, + "epoch": 0.005892078761460995, + "flos": 26905756387680.0, + "grad_norm": 1.9170445032732126, + "language_loss": 0.76571935, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79294527, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 3.6640625, + "router_z_loss_mlp": 0.61279297, + "step": 98, + "time_per_iteration": 2.8640878200531006 + }, + { + "auxiliary_loss_clip": 0.0153049, + "auxiliary_loss_mlp": 0.01040946, + "balance_loss_clip": 1.2986722, + "balance_loss_mlp": 1.0054214, + "epoch": 0.005952202014128965, + "flos": 83631628992960.0, + "grad_norm": 1.027430366172882, + "language_loss": 0.65436208, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68007642, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 0.35595703, + "step": 99, + "time_per_iteration": 3.3733863830566406 + }, + { + "auxiliary_loss_clip": 0.01586144, + "auxiliary_loss_mlp": 0.01137392, + "balance_loss_clip": 1.21665394, + "balance_loss_mlp": 1.0739255, + "epoch": 0.006012325266796933, + "flos": 28017594377280.0, + "grad_norm": 2.229193849580398, + "language_loss": 0.90787005, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.93510532, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 3.69335938, + "router_z_loss_mlp": 0.63427734, + "step": 100, + "time_per_iteration": 2.827288866043091 + }, + { + "auxiliary_loss_clip": 0.01599079, + "auxiliary_loss_mlp": 0.01139224, + "balance_loss_clip": 1.21950817, + "balance_loss_mlp": 1.07861853, + "epoch": 0.006072448519464903, + "flos": 21167708050080.0, + "grad_norm": 2.1590596500530235, + "language_loss": 0.90940171, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93678486, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 3.80078125, + "router_z_loss_mlp": 0.60546875, + "step": 101, + "time_per_iteration": 2.8115689754486084 + }, + { + "auxiliary_loss_clip": 0.01586869, + "auxiliary_loss_mlp": 0.01142553, + "balance_loss_clip": 1.22142136, + "balance_loss_mlp": 1.08018279, + "epoch": 0.006132571772132872, + "flos": 30116253081600.0, + "grad_norm": 2.266427158853952, + "language_loss": 0.90684617, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.93414032, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 3.65429688, + "router_z_loss_mlp": 0.62304688, + "step": 102, + "time_per_iteration": 2.872502088546753 + }, + { + "auxiliary_loss_clip": 0.01583009, + "auxiliary_loss_mlp": 0.01127691, + "balance_loss_clip": 1.21749222, + "balance_loss_mlp": 1.06999373, + "epoch": 0.006192695024800842, + "flos": 26192956452480.0, + "grad_norm": 2.031120817123517, + "language_loss": 0.87908411, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.90619111, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 3.65625, + "router_z_loss_mlp": 0.57714844, + "step": 103, + "time_per_iteration": 2.819823980331421 + }, + { + "auxiliary_loss_clip": 0.01578807, + "auxiliary_loss_mlp": 0.01134314, + "balance_loss_clip": 1.21726537, + "balance_loss_mlp": 1.07490039, + "epoch": 0.006252818277468811, + "flos": 21256185088800.0, + "grad_norm": 2.291738310324615, + "language_loss": 0.93616712, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96329832, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 0.59375, + "step": 104, + "time_per_iteration": 2.7958974838256836 + }, + { + "auxiliary_loss_clip": 0.01573919, + "auxiliary_loss_mlp": 0.01141223, + "balance_loss_clip": 1.20927465, + "balance_loss_mlp": 1.08376431, + "epoch": 0.006312941530136781, + "flos": 23305135440960.0, + "grad_norm": 2.636151037975896, + "language_loss": 0.96338975, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.99054116, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 3.64257812, + "router_z_loss_mlp": 0.57470703, + "step": 105, + "time_per_iteration": 2.822502851486206 + }, + { + "auxiliary_loss_clip": 0.01570108, + "auxiliary_loss_mlp": 0.01157149, + "balance_loss_clip": 1.20877457, + "balance_loss_mlp": 1.0925858, + "epoch": 0.006373064782804749, + "flos": 29359336178880.0, + "grad_norm": 25.52088848783741, + "language_loss": 0.8739934, + "learning_rate": 3.002565443382063e-06, + "loss": 0.90126598, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 3.61132812, + "router_z_loss_mlp": 0.64550781, + "step": 106, + "time_per_iteration": 2.898146152496338 + }, + { + "auxiliary_loss_clip": 0.01557271, + "auxiliary_loss_mlp": 0.01140445, + "balance_loss_clip": 1.19679785, + "balance_loss_mlp": 1.07878995, + "epoch": 0.006433188035472719, + "flos": 22369117044960.0, + "grad_norm": 2.215479493421575, + "language_loss": 0.83608967, + "learning_rate": 3.008611048208843e-06, + "loss": 0.86306679, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 3.60742188, + "router_z_loss_mlp": 0.61621094, + "step": 107, + "time_per_iteration": 2.79065203666687 + }, + { + "auxiliary_loss_clip": 0.01464206, + "auxiliary_loss_mlp": 0.01044412, + "balance_loss_clip": 1.25117099, + "balance_loss_mlp": 1.00993681, + "epoch": 0.006493311288140688, + "flos": 76337696263680.0, + "grad_norm": 0.9860753554842147, + "language_loss": 0.64742249, + "learning_rate": 3.014600414036285e-06, + "loss": 0.6725086, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.34521484, + "step": 108, + "time_per_iteration": 3.4799129962921143 + }, + { + "auxiliary_loss_clip": 0.01549068, + "auxiliary_loss_mlp": 0.01125045, + "balance_loss_clip": 1.19901121, + "balance_loss_mlp": 1.0629133, + "epoch": 0.006553434540808658, + "flos": 23794028121600.0, + "grad_norm": 1.9173194798133395, + "language_loss": 0.97535098, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.00209212, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 3.50195312, + "router_z_loss_mlp": 0.62158203, + "step": 109, + "time_per_iteration": 2.785651206970215 + }, + { + "auxiliary_loss_clip": 0.01547699, + "auxiliary_loss_mlp": 0.01138087, + "balance_loss_clip": 1.20047164, + "balance_loss_mlp": 1.07862568, + "epoch": 0.006613557793476627, + "flos": 25752597122880.0, + "grad_norm": 1.728861385535522, + "language_loss": 0.84220767, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.86906546, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 3.46875, + "router_z_loss_mlp": 0.59521484, + "step": 110, + "time_per_iteration": 2.8586788177490234 + }, + { + "auxiliary_loss_clip": 0.01542779, + "auxiliary_loss_mlp": 0.0115194, + "balance_loss_clip": 1.19614029, + "balance_loss_mlp": 1.0925734, + "epoch": 0.006673681046144597, + "flos": 31763207617920.0, + "grad_norm": 1.9318861153406908, + "language_loss": 0.82841533, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85536242, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 3.46679688, + "router_z_loss_mlp": 0.59326172, + "step": 111, + "time_per_iteration": 2.9205515384674072 + }, + { + "auxiliary_loss_clip": 0.01542899, + "auxiliary_loss_mlp": 0.01128285, + "balance_loss_clip": 1.19899893, + "balance_loss_mlp": 1.07139814, + "epoch": 0.006733804298812566, + "flos": 28245229221600.0, + "grad_norm": 2.161195104322131, + "language_loss": 0.93838108, + "learning_rate": 3.0380158011446e-06, + "loss": 0.9650929, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 3.4375, + "router_z_loss_mlp": 0.56958008, + "step": 112, + "time_per_iteration": 2.818359375 + }, + { + "auxiliary_loss_clip": 0.0154571, + "auxiliary_loss_mlp": 0.01133616, + "balance_loss_clip": 1.1937089, + "balance_loss_mlp": 1.07551301, + "epoch": 0.006793927551480535, + "flos": 14354402476320.0, + "grad_norm": 3.8978926581569966, + "language_loss": 0.79060251, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.81739575, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 3.51953125, + "router_z_loss_mlp": 0.58129883, + "step": 113, + "time_per_iteration": 2.7807016372680664 + }, + { + "auxiliary_loss_clip": 0.01534054, + "auxiliary_loss_mlp": 0.01129167, + "balance_loss_clip": 1.18970013, + "balance_loss_mlp": 1.06961, + "epoch": 0.006854050804148504, + "flos": 23393531445120.0, + "grad_norm": 2.8182306314425163, + "language_loss": 0.93163699, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.9582693, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 3.4453125, + "router_z_loss_mlp": 0.59570312, + "step": 114, + "time_per_iteration": 2.7538247108459473 + }, + { + "auxiliary_loss_clip": 0.01542019, + "auxiliary_loss_mlp": 0.0113403, + "balance_loss_clip": 1.19076967, + "balance_loss_mlp": 1.07967091, + "epoch": 0.006914174056816474, + "flos": 26822384526240.0, + "grad_norm": 2.206985615584345, + "language_loss": 0.94599587, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97275639, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 3.515625, + "router_z_loss_mlp": 0.54418945, + "step": 115, + "time_per_iteration": 2.7976157665252686 + }, + { + "auxiliary_loss_clip": 0.01538654, + "auxiliary_loss_mlp": 0.01118301, + "balance_loss_clip": 1.18770754, + "balance_loss_mlp": 1.05817175, + "epoch": 0.006974297309484443, + "flos": 19875755635200.0, + "grad_norm": 2.468865676521545, + "language_loss": 0.81809998, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84466958, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 3.50976562, + "router_z_loss_mlp": 0.60107422, + "step": 116, + "time_per_iteration": 2.7346818447113037 + }, + { + "auxiliary_loss_clip": 0.01530227, + "auxiliary_loss_mlp": 0.01124855, + "balance_loss_clip": 1.19061124, + "balance_loss_mlp": 1.06615603, + "epoch": 0.007034420562152413, + "flos": 31852576036800.0, + "grad_norm": 2.2710519879961595, + "language_loss": 0.88324606, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.90979683, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 3.39453125, + "router_z_loss_mlp": 0.58642578, + "step": 117, + "time_per_iteration": 2.8240585327148438 + }, + { + "auxiliary_loss_clip": 0.01530211, + "auxiliary_loss_mlp": 0.01137696, + "balance_loss_clip": 1.1871227, + "balance_loss_mlp": 1.07694733, + "epoch": 0.007094543814820382, + "flos": 17332483286880.0, + "grad_norm": 2.5299869007183973, + "language_loss": 0.84690273, + "learning_rate": 3.071615712271274e-06, + "loss": 0.87358177, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 3.43554688, + "router_z_loss_mlp": 0.60693359, + "step": 118, + "time_per_iteration": 2.7489993572235107 + }, + { + "auxiliary_loss_clip": 0.01538356, + "auxiliary_loss_mlp": 0.01152778, + "balance_loss_clip": 1.18885124, + "balance_loss_mlp": 1.09508109, + "epoch": 0.007154667067488351, + "flos": 18273323239200.0, + "grad_norm": 2.1465676961960796, + "language_loss": 0.99433106, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.02124226, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 3.49609375, + "router_z_loss_mlp": 0.57714844, + "step": 119, + "time_per_iteration": 2.741521120071411 + }, + { + "auxiliary_loss_clip": 0.01541014, + "auxiliary_loss_mlp": 0.01117106, + "balance_loss_clip": 1.18500233, + "balance_loss_mlp": 1.05983782, + "epoch": 0.00721479032015632, + "flos": 24639867753120.0, + "grad_norm": 2.6366456337289383, + "language_loss": 0.89466941, + "learning_rate": 3.082437012097686e-06, + "loss": 0.92125064, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 3.56445312, + "router_z_loss_mlp": 0.57324219, + "step": 120, + "time_per_iteration": 2.7940354347229004 + }, + { + "auxiliary_loss_clip": 0.01528214, + "auxiliary_loss_mlp": 0.01125253, + "balance_loss_clip": 1.18809843, + "balance_loss_mlp": 1.06765103, + "epoch": 0.00727491357282429, + "flos": 28244621462400.0, + "grad_norm": 1.9091258868705374, + "language_loss": 0.93370473, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.96023935, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 3.40429688, + "router_z_loss_mlp": 0.57568359, + "step": 121, + "time_per_iteration": 2.8017005920410156 + }, + { + "auxiliary_loss_clip": 0.01528412, + "auxiliary_loss_mlp": 0.01153373, + "balance_loss_clip": 1.18665898, + "balance_loss_mlp": 1.09581864, + "epoch": 0.007335036825492259, + "flos": 18939332066400.0, + "grad_norm": 2.2152921451143346, + "language_loss": 0.90231812, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92913598, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 3.41601562, + "router_z_loss_mlp": 0.57568359, + "step": 122, + "time_per_iteration": 2.7490737438201904 + }, + { + "auxiliary_loss_clip": 0.01524264, + "auxiliary_loss_mlp": 0.01134698, + "balance_loss_clip": 1.18134212, + "balance_loss_mlp": 1.07390094, + "epoch": 0.007395160078160229, + "flos": 29582068432320.0, + "grad_norm": 3.175870600585356, + "language_loss": 0.92343712, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.95002669, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 3.42773438, + "router_z_loss_mlp": 0.60791016, + "step": 123, + "time_per_iteration": 2.789691209793091 + }, + { + "auxiliary_loss_clip": 0.01518806, + "auxiliary_loss_mlp": 0.01125327, + "balance_loss_clip": 1.17390609, + "balance_loss_mlp": 1.06863129, + "epoch": 0.007455283330828198, + "flos": 38754156062880.0, + "grad_norm": 2.747849517462492, + "language_loss": 0.71130443, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73774576, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 3.44921875, + "router_z_loss_mlp": 0.56665039, + "step": 124, + "time_per_iteration": 2.892991065979004 + }, + { + "auxiliary_loss_clip": 0.0150939, + "auxiliary_loss_mlp": 0.0112254, + "balance_loss_clip": 1.17712939, + "balance_loss_mlp": 1.06489015, + "epoch": 0.007515406583496167, + "flos": 32476291174080.0, + "grad_norm": 2.158729139768983, + "language_loss": 0.8842864, + "learning_rate": 3.108720342404542e-06, + "loss": 0.91060567, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 3.32421875, + "router_z_loss_mlp": 0.57617188, + "step": 125, + "time_per_iteration": 2.839601993560791 + }, + { + "auxiliary_loss_clip": 0.01524615, + "auxiliary_loss_mlp": 0.01140466, + "balance_loss_clip": 1.1762867, + "balance_loss_mlp": 1.08295989, + "epoch": 0.007575529836164136, + "flos": 22236360969600.0, + "grad_norm": 2.9567824214526057, + "language_loss": 0.82108951, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.84774041, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 3.48242188, + "router_z_loss_mlp": 0.57568359, + "step": 126, + "time_per_iteration": 2.7827911376953125 + }, + { + "auxiliary_loss_clip": 0.01516277, + "auxiliary_loss_mlp": 0.01134896, + "balance_loss_clip": 1.17436838, + "balance_loss_mlp": 1.07924891, + "epoch": 0.007635653088832106, + "flos": 26332478913600.0, + "grad_norm": 2.403709210336058, + "language_loss": 0.67063999, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.69715178, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 3.41796875, + "router_z_loss_mlp": 0.55615234, + "step": 127, + "time_per_iteration": 2.7521352767944336 + }, + { + "auxiliary_loss_clip": 0.01505937, + "auxiliary_loss_mlp": 0.01124499, + "balance_loss_clip": 1.17819619, + "balance_loss_mlp": 1.06694472, + "epoch": 0.007695776341500075, + "flos": 30962984093280.0, + "grad_norm": 2.9409944463250604, + "language_loss": 0.87966621, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90597063, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.57568359, + "step": 128, + "time_per_iteration": 2.808018445968628 + }, + { + "auxiliary_loss_clip": 0.01508552, + "auxiliary_loss_mlp": 0.01131023, + "balance_loss_clip": 1.17408133, + "balance_loss_mlp": 1.07399297, + "epoch": 0.007755899594168045, + "flos": 27264405064320.0, + "grad_norm": 1.6383688769599816, + "language_loss": 0.84630013, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87269592, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 3.34375, + "router_z_loss_mlp": 0.5703125, + "step": 129, + "time_per_iteration": 2.7739012241363525 + }, + { + "auxiliary_loss_clip": 0.01506327, + "auxiliary_loss_mlp": 0.01130313, + "balance_loss_clip": 1.17345691, + "balance_loss_mlp": 1.07142365, + "epoch": 0.007816022846836013, + "flos": 27623337361920.0, + "grad_norm": 2.5901954991611884, + "language_loss": 0.97553295, + "learning_rate": 3.133972684206866e-06, + "loss": 1.00189936, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 3.328125, + "router_z_loss_mlp": 0.58837891, + "step": 130, + "time_per_iteration": 2.758291006088257 + }, + { + "auxiliary_loss_clip": 0.01497067, + "auxiliary_loss_mlp": 0.01126162, + "balance_loss_clip": 1.17024875, + "balance_loss_mlp": 1.06808305, + "epoch": 0.007876146099503984, + "flos": 22186976755680.0, + "grad_norm": 1.889153494531036, + "language_loss": 0.82756174, + "learning_rate": 3.138906441556014e-06, + "loss": 0.85379398, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 3.26757812, + "router_z_loss_mlp": 0.58056641, + "step": 131, + "time_per_iteration": 2.762603998184204 + }, + { + "auxiliary_loss_clip": 0.0150444, + "auxiliary_loss_mlp": 0.01124892, + "balance_loss_clip": 1.17152238, + "balance_loss_mlp": 1.07005572, + "epoch": 0.007936269352171952, + "flos": 33092308028160.0, + "grad_norm": 4.76672180162514, + "language_loss": 0.82933033, + "learning_rate": 3.143802679474861e-06, + "loss": 0.8556236, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 3.32617188, + "router_z_loss_mlp": 0.54882812, + "step": 132, + "time_per_iteration": 2.8256118297576904 + }, + { + "auxiliary_loss_clip": 0.01496293, + "auxiliary_loss_mlp": 0.01120333, + "balance_loss_clip": 1.16710544, + "balance_loss_mlp": 1.06549656, + "epoch": 0.007996392604839923, + "flos": 23215807539360.0, + "grad_norm": 2.2874021251512233, + "language_loss": 0.95323372, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.97939992, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 3.29101562, + "router_z_loss_mlp": 0.54833984, + "step": 133, + "time_per_iteration": 2.770198345184326 + }, + { + "auxiliary_loss_clip": 0.01495057, + "auxiliary_loss_mlp": 0.01119488, + "balance_loss_clip": 1.17497003, + "balance_loss_mlp": 1.06488991, + "epoch": 0.008056515857507891, + "flos": 31096145341440.0, + "grad_norm": 1.820008137326774, + "language_loss": 0.73607415, + "learning_rate": 3.153484849651286e-06, + "loss": 0.76221955, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 3.19921875, + "router_z_loss_mlp": 0.54638672, + "step": 134, + "time_per_iteration": 2.802438735961914 + }, + { + "auxiliary_loss_clip": 0.01490857, + "auxiliary_loss_mlp": 0.01123944, + "balance_loss_clip": 1.1638186, + "balance_loss_mlp": 1.06658018, + "epoch": 0.00811663911017586, + "flos": 25084440879840.0, + "grad_norm": 12.529740784075063, + "language_loss": 0.88859302, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.91474104, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 3.27148438, + "router_z_loss_mlp": 0.57373047, + "step": 135, + "time_per_iteration": 2.738384246826172 + }, + { + "auxiliary_loss_clip": 0.01492621, + "auxiliary_loss_mlp": 0.01132894, + "balance_loss_clip": 1.16750419, + "balance_loss_mlp": 1.07376659, + "epoch": 0.00817676236284383, + "flos": 22940003999520.0, + "grad_norm": 2.0092601313790723, + "language_loss": 0.88958979, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.91584492, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 3.25390625, + "router_z_loss_mlp": 0.59082031, + "step": 136, + "time_per_iteration": 4.283787965774536 + }, + { + "auxiliary_loss_clip": 0.01490556, + "auxiliary_loss_mlp": 0.01104073, + "balance_loss_clip": 1.16257572, + "balance_loss_mlp": 1.04980922, + "epoch": 0.008236885615511799, + "flos": 29131296161760.0, + "grad_norm": 2.4597141522201738, + "language_loss": 0.83923519, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86518151, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 3.27929688, + "router_z_loss_mlp": 0.54272461, + "step": 137, + "time_per_iteration": 5.649913311004639 + }, + { + "auxiliary_loss_clip": 0.01484951, + "auxiliary_loss_mlp": 0.0111182, + "balance_loss_clip": 1.1606729, + "balance_loss_mlp": 1.05691242, + "epoch": 0.00829700886817977, + "flos": 30070596457440.0, + "grad_norm": 1.7245356391623146, + "language_loss": 0.90203494, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.92800266, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 3.24414062, + "router_z_loss_mlp": 0.54907227, + "step": 138, + "time_per_iteration": 2.8145251274108887 + }, + { + "auxiliary_loss_clip": 0.01475754, + "auxiliary_loss_mlp": 0.01121121, + "balance_loss_clip": 1.15991426, + "balance_loss_mlp": 1.06294739, + "epoch": 0.008357132120847738, + "flos": 30825365944320.0, + "grad_norm": 2.4005420985566848, + "language_loss": 0.91164994, + "learning_rate": 3.177071816289865e-06, + "loss": 0.93761867, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 3.15820312, + "router_z_loss_mlp": 0.58154297, + "step": 139, + "time_per_iteration": 2.818398952484131 + }, + { + "auxiliary_loss_clip": 0.01490049, + "auxiliary_loss_mlp": 0.01118582, + "balance_loss_clip": 1.16729248, + "balance_loss_mlp": 1.06190979, + "epoch": 0.008417255373515706, + "flos": 33366774497760.0, + "grad_norm": 2.735026078406745, + "language_loss": 0.85685754, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88294381, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 3.2265625, + "router_z_loss_mlp": 0.56616211, + "step": 140, + "time_per_iteration": 2.8234872817993164 + }, + { + "auxiliary_loss_clip": 0.01479501, + "auxiliary_loss_mlp": 0.01125298, + "balance_loss_clip": 1.16104662, + "balance_loss_mlp": 1.06957972, + "epoch": 0.008477378626183677, + "flos": 21523034309760.0, + "grad_norm": 2.9368035068716134, + "language_loss": 0.84448135, + "learning_rate": 3.186269861057098e-06, + "loss": 0.87052941, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 3.18554688, + "router_z_loss_mlp": 0.55688477, + "step": 141, + "time_per_iteration": 2.731071949005127 + }, + { + "auxiliary_loss_clip": 0.01483318, + "auxiliary_loss_mlp": 0.01130939, + "balance_loss_clip": 1.16000414, + "balance_loss_mlp": 1.07462454, + "epoch": 0.008537501878851645, + "flos": 16937537477760.0, + "grad_norm": 2.6372008380914784, + "language_loss": 0.8133505, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.83949304, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.56298828, + "step": 142, + "time_per_iteration": 2.760749101638794 + }, + { + "auxiliary_loss_clip": 0.01382891, + "auxiliary_loss_mlp": 0.01042392, + "balance_loss_clip": 1.19932961, + "balance_loss_mlp": 1.01249397, + "epoch": 0.008597625131519616, + "flos": 86936477731200.0, + "grad_norm": 1.0458940587063346, + "language_loss": 0.66915894, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69341177, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 1.8359375, + "router_z_loss_mlp": 0.29833984, + "step": 143, + "time_per_iteration": 3.589078903198242 + }, + { + "auxiliary_loss_clip": 0.014751, + "auxiliary_loss_mlp": 0.01118074, + "balance_loss_clip": 1.15803027, + "balance_loss_mlp": 1.06245053, + "epoch": 0.008657748384187584, + "flos": 21478795790400.0, + "grad_norm": 1.8975534896897075, + "language_loss": 0.84059519, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86652696, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 3.16796875, + "router_z_loss_mlp": 0.55639648, + "step": 144, + "time_per_iteration": 3.1309895515441895 + }, + { + "auxiliary_loss_clip": 0.01462735, + "auxiliary_loss_mlp": 0.01110803, + "balance_loss_clip": 1.15190291, + "balance_loss_mlp": 1.05310595, + "epoch": 0.008717871636855555, + "flos": 24056947166400.0, + "grad_norm": 1.7345507527940403, + "language_loss": 0.88542551, + "learning_rate": 3.204280886775619e-06, + "loss": 0.91116095, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 3.10742188, + "router_z_loss_mlp": 0.57666016, + "step": 145, + "time_per_iteration": 2.812284469604492 + }, + { + "auxiliary_loss_clip": 0.0147679, + "auxiliary_loss_mlp": 0.01120648, + "balance_loss_clip": 1.15386558, + "balance_loss_mlp": 1.06330812, + "epoch": 0.008777994889523523, + "flos": 30160572635520.0, + "grad_norm": 1.663803974445575, + "language_loss": 0.85800385, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88397825, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 3.22851562, + "router_z_loss_mlp": 0.57373047, + "step": 146, + "time_per_iteration": 2.774514675140381 + }, + { + "auxiliary_loss_clip": 0.01363532, + "auxiliary_loss_mlp": 0.01028824, + "balance_loss_clip": 1.18580902, + "balance_loss_mlp": 0.99980879, + "epoch": 0.008838118142191492, + "flos": 85576584188160.0, + "grad_norm": 0.8595792553789602, + "language_loss": 0.60052967, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62445325, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.28930664, + "step": 147, + "time_per_iteration": 3.445765733718872 + }, + { + "auxiliary_loss_clip": 0.01468209, + "auxiliary_loss_mlp": 0.01121162, + "balance_loss_clip": 1.15591717, + "balance_loss_mlp": 1.06837583, + "epoch": 0.008898241394859462, + "flos": 24458902464960.0, + "grad_norm": 1.764897358634855, + "language_loss": 0.84309733, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.86899102, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 3.12109375, + "router_z_loss_mlp": 0.52832031, + "step": 148, + "time_per_iteration": 2.757974147796631 + }, + { + "auxiliary_loss_clip": 0.01470938, + "auxiliary_loss_mlp": 0.01134576, + "balance_loss_clip": 1.15805864, + "balance_loss_mlp": 1.07704592, + "epoch": 0.008958364647527431, + "flos": 13108633410240.0, + "grad_norm": 2.0239495405679433, + "language_loss": 0.88579643, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91185158, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 3.1328125, + "router_z_loss_mlp": 0.57446289, + "step": 149, + "time_per_iteration": 2.7326667308807373 + }, + { + "auxiliary_loss_clip": 0.014708, + "auxiliary_loss_mlp": 0.0111274, + "balance_loss_clip": 1.15144885, + "balance_loss_mlp": 1.05911994, + "epoch": 0.009018487900195401, + "flos": 35542484367840.0, + "grad_norm": 1.8691196581676723, + "language_loss": 0.93138266, + "learning_rate": 3.226108474846181e-06, + "loss": 0.95721805, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.53613281, + "step": 150, + "time_per_iteration": 2.825246810913086 + }, + { + "auxiliary_loss_clip": 0.01457858, + "auxiliary_loss_mlp": 0.01112679, + "balance_loss_clip": 1.14789939, + "balance_loss_mlp": 1.06110871, + "epoch": 0.00907861115286337, + "flos": 40223224624320.0, + "grad_norm": 1.8802522625261324, + "language_loss": 0.74528039, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.77098572, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 3.1015625, + "router_z_loss_mlp": 0.51586914, + "step": 151, + "time_per_iteration": 2.866436004638672 + }, + { + "auxiliary_loss_clip": 0.01466552, + "auxiliary_loss_mlp": 0.01120514, + "balance_loss_clip": 1.15138495, + "balance_loss_mlp": 1.06701255, + "epoch": 0.009138734405531338, + "flos": 26554643925120.0, + "grad_norm": 2.2385752825202467, + "language_loss": 0.88596368, + "learning_rate": 3.234636443010188e-06, + "loss": 0.91183436, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 3.15039062, + "router_z_loss_mlp": 0.53442383, + "step": 152, + "time_per_iteration": 2.8693747520446777 + }, + { + "auxiliary_loss_clip": 0.01464482, + "auxiliary_loss_mlp": 0.01117589, + "balance_loss_clip": 1.15613842, + "balance_loss_mlp": 1.06439829, + "epoch": 0.009198857658199309, + "flos": 25432798167360.0, + "grad_norm": 3.5946366701952748, + "language_loss": 0.84145278, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86727345, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 3.08203125, + "router_z_loss_mlp": 0.5324707, + "step": 153, + "time_per_iteration": 2.7679505348205566 + }, + { + "auxiliary_loss_clip": 0.01458388, + "auxiliary_loss_mlp": 0.01135294, + "balance_loss_clip": 1.14959931, + "balance_loss_mlp": 1.08086348, + "epoch": 0.009258980910867277, + "flos": 30294706298400.0, + "grad_norm": 1.8230362685244992, + "language_loss": 0.89699334, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92293018, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 3.0859375, + "router_z_loss_mlp": 0.54467773, + "step": 154, + "time_per_iteration": 2.7782764434814453 + }, + { + "auxiliary_loss_clip": 0.01463729, + "auxiliary_loss_mlp": 0.01150734, + "balance_loss_clip": 1.15229976, + "balance_loss_mlp": 1.09732771, + "epoch": 0.009319104163535248, + "flos": 35103097452960.0, + "grad_norm": 1.9558101395960092, + "language_loss": 0.89448243, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92062706, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 3.11328125, + "router_z_loss_mlp": 0.53369141, + "step": 155, + "time_per_iteration": 2.802318811416626 + }, + { + "auxiliary_loss_clip": 0.01468813, + "auxiliary_loss_mlp": 0.01115266, + "balance_loss_clip": 1.14952397, + "balance_loss_mlp": 1.06309998, + "epoch": 0.009379227416203216, + "flos": 20233026724320.0, + "grad_norm": 3.3594656726632377, + "language_loss": 0.86367047, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.88951129, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 3.19335938, + "router_z_loss_mlp": 0.52197266, + "step": 156, + "time_per_iteration": 2.7294957637786865 + }, + { + "auxiliary_loss_clip": 0.01462384, + "auxiliary_loss_mlp": 0.01113625, + "balance_loss_clip": 1.15264797, + "balance_loss_mlp": 1.06017184, + "epoch": 0.009439350668871187, + "flos": 22366280835360.0, + "grad_norm": 2.577412271728902, + "language_loss": 0.99678802, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.0225482, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 3.09179688, + "router_z_loss_mlp": 0.53466797, + "step": 157, + "time_per_iteration": 2.7279343605041504 + }, + { + "auxiliary_loss_clip": 0.01452254, + "auxiliary_loss_mlp": 0.01136301, + "balance_loss_clip": 1.14927244, + "balance_loss_mlp": 1.08251405, + "epoch": 0.009499473921539155, + "flos": 29712352953600.0, + "grad_norm": 2.463581496189555, + "language_loss": 0.88106179, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.90694726, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 3.03125, + "router_z_loss_mlp": 0.53759766, + "step": 158, + "time_per_iteration": 2.7843501567840576 + }, + { + "auxiliary_loss_clip": 0.01457165, + "auxiliary_loss_mlp": 0.01118895, + "balance_loss_clip": 1.14851677, + "balance_loss_mlp": 1.06556118, + "epoch": 0.009559597174207124, + "flos": 20010132401760.0, + "grad_norm": 2.2200098175774143, + "language_loss": 0.86589855, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.89165914, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 3.0859375, + "router_z_loss_mlp": 0.53344727, + "step": 159, + "time_per_iteration": 2.807440996170044 + }, + { + "auxiliary_loss_clip": 0.01443877, + "auxiliary_loss_mlp": 0.01127453, + "balance_loss_clip": 1.1429745, + "balance_loss_mlp": 1.07378483, + "epoch": 0.009619720426875094, + "flos": 27890186582880.0, + "grad_norm": 1.6888334872608206, + "language_loss": 0.86784416, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.89355743, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 3.01171875, + "router_z_loss_mlp": 0.53686523, + "step": 160, + "time_per_iteration": 2.766827344894409 + }, + { + "auxiliary_loss_clip": 0.01453417, + "auxiliary_loss_mlp": 0.01113681, + "balance_loss_clip": 1.14778042, + "balance_loss_mlp": 1.06370878, + "epoch": 0.009679843679543063, + "flos": 23348482580160.0, + "grad_norm": 3.172451810193117, + "language_loss": 0.91549754, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.94116849, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 3.05859375, + "router_z_loss_mlp": 0.5, + "step": 161, + "time_per_iteration": 2.7386558055877686 + }, + { + "auxiliary_loss_clip": 0.01458998, + "auxiliary_loss_mlp": 0.01108198, + "balance_loss_clip": 1.14880466, + "balance_loss_mlp": 1.05772507, + "epoch": 0.009739966932211033, + "flos": 24773717795040.0, + "grad_norm": 1.714917955658723, + "language_loss": 0.913854, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.93952596, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 3.10351562, + "router_z_loss_mlp": 0.50561523, + "step": 162, + "time_per_iteration": 2.7218196392059326 + }, + { + "auxiliary_loss_clip": 0.01317977, + "auxiliary_loss_mlp": 0.01027195, + "balance_loss_clip": 1.15347457, + "balance_loss_mlp": 1.00094545, + "epoch": 0.009800090184879002, + "flos": 81796051402560.0, + "grad_norm": 1.178121691601743, + "language_loss": 0.72319835, + "learning_rate": 3.279622189013474e-06, + "loss": 0.7466501, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 1.64355469, + "router_z_loss_mlp": 0.26293945, + "step": 163, + "time_per_iteration": 3.3913869857788086 + }, + { + "auxiliary_loss_clip": 0.01442469, + "auxiliary_loss_mlp": 0.01114079, + "balance_loss_clip": 1.14473152, + "balance_loss_mlp": 1.06258059, + "epoch": 0.00986021343754697, + "flos": 20944165451040.0, + "grad_norm": 2.2215304205125164, + "language_loss": 0.84957325, + "learning_rate": 3.283560135133457e-06, + "loss": 0.87513876, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 2.9765625, + "router_z_loss_mlp": 0.51513672, + "step": 164, + "time_per_iteration": 2.715306520462036 + }, + { + "auxiliary_loss_clip": 0.01436514, + "auxiliary_loss_mlp": 0.01101413, + "balance_loss_clip": 1.13923061, + "balance_loss_mlp": 1.05055809, + "epoch": 0.00992033669021494, + "flos": 21657613662720.0, + "grad_norm": 1.9627583151803547, + "language_loss": 0.89320385, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.9185831, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 2.97460938, + "router_z_loss_mlp": 0.5090332, + "step": 165, + "time_per_iteration": 2.751352310180664 + }, + { + "auxiliary_loss_clip": 0.01440964, + "auxiliary_loss_mlp": 0.01120057, + "balance_loss_clip": 1.13904738, + "balance_loss_mlp": 1.06653166, + "epoch": 0.00998045994288291, + "flos": 30868388945280.0, + "grad_norm": 1.727450963555256, + "language_loss": 0.80044967, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82605988, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 3.02148438, + "router_z_loss_mlp": 0.53515625, + "step": 166, + "time_per_iteration": 2.8142685890197754 + }, + { + "auxiliary_loss_clip": 0.01442327, + "auxiliary_loss_mlp": 0.01120949, + "balance_loss_clip": 1.14208221, + "balance_loss_mlp": 1.06775749, + "epoch": 0.01004058319555088, + "flos": 39412831262400.0, + "grad_norm": 2.2143829971183773, + "language_loss": 0.91964877, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94528157, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 3.0, + "router_z_loss_mlp": 0.53222656, + "step": 167, + "time_per_iteration": 2.8196465969085693 + }, + { + "auxiliary_loss_clip": 0.01435326, + "auxiliary_loss_mlp": 0.01131413, + "balance_loss_clip": 1.14031112, + "balance_loss_mlp": 1.08046269, + "epoch": 0.010100706448218848, + "flos": 13812600578400.0, + "grad_norm": 2.523837010126024, + "language_loss": 0.90822697, + "learning_rate": 3.299075396334735e-06, + "loss": 0.9338944, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 2.95117188, + "router_z_loss_mlp": 0.50976562, + "step": 168, + "time_per_iteration": 2.7429299354553223 + }, + { + "auxiliary_loss_clip": 0.01429083, + "auxiliary_loss_mlp": 0.01099679, + "balance_loss_clip": 1.13525486, + "balance_loss_mlp": 1.04791832, + "epoch": 0.010160829700886819, + "flos": 36264157587360.0, + "grad_norm": 1.567346208406122, + "language_loss": 0.86986864, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89515626, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 2.9375, + "router_z_loss_mlp": 0.51831055, + "step": 169, + "time_per_iteration": 2.831648826599121 + }, + { + "auxiliary_loss_clip": 0.01430557, + "auxiliary_loss_mlp": 0.01113186, + "balance_loss_clip": 1.13713765, + "balance_loss_mlp": 1.0618546, + "epoch": 0.010220952953554787, + "flos": 24907162664160.0, + "grad_norm": 1.8100168995821653, + "language_loss": 0.85025561, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87569302, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 2.93554688, + "router_z_loss_mlp": 0.51318359, + "step": 170, + "time_per_iteration": 2.7660326957702637 + }, + { + "auxiliary_loss_clip": 0.01442677, + "auxiliary_loss_mlp": 0.01131817, + "balance_loss_clip": 1.13908446, + "balance_loss_mlp": 1.07929325, + "epoch": 0.010281076206222756, + "flos": 38174963065920.0, + "grad_norm": 2.9359746717155257, + "language_loss": 0.89855087, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.92429578, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 3.03710938, + "router_z_loss_mlp": 0.52539062, + "step": 171, + "time_per_iteration": 2.8263981342315674 + }, + { + "auxiliary_loss_clip": 0.01432606, + "auxiliary_loss_mlp": 0.01102525, + "balance_loss_clip": 1.13943195, + "balance_loss_mlp": 1.05221891, + "epoch": 0.010341199458890726, + "flos": 26822830216320.0, + "grad_norm": 4.341560969412766, + "language_loss": 0.8870244, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91237569, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 2.93359375, + "router_z_loss_mlp": 0.50292969, + "step": 172, + "time_per_iteration": 2.7749245166778564 + }, + { + "auxiliary_loss_clip": 0.01417203, + "auxiliary_loss_mlp": 0.01117905, + "balance_loss_clip": 1.1308893, + "balance_loss_mlp": 1.06893384, + "epoch": 0.010401322711558695, + "flos": 37595851103520.0, + "grad_norm": 1.8421645948117886, + "language_loss": 0.80966568, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83501679, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 2.86132812, + "router_z_loss_mlp": 0.48999023, + "step": 173, + "time_per_iteration": 2.872950792312622 + }, + { + "auxiliary_loss_clip": 0.01434315, + "auxiliary_loss_mlp": 0.01102575, + "balance_loss_clip": 1.13820004, + "balance_loss_mlp": 1.05386567, + "epoch": 0.010461445964226665, + "flos": 30157979529600.0, + "grad_norm": 1.7685574351672646, + "language_loss": 0.82666194, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.85203087, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.48706055, + "step": 174, + "time_per_iteration": 2.7925329208374023 + }, + { + "auxiliary_loss_clip": 0.01426676, + "auxiliary_loss_mlp": 0.01117276, + "balance_loss_clip": 1.13361752, + "balance_loss_mlp": 1.06770861, + "epoch": 0.010521569216894634, + "flos": 33812684694720.0, + "grad_norm": 2.49863255655451, + "language_loss": 0.72630107, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75174069, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 2.9296875, + "router_z_loss_mlp": 0.49584961, + "step": 175, + "time_per_iteration": 4.26935338973999 + }, + { + "auxiliary_loss_clip": 0.01431799, + "auxiliary_loss_mlp": 0.0112293, + "balance_loss_clip": 1.13661098, + "balance_loss_mlp": 1.0701921, + "epoch": 0.010581692469562603, + "flos": 15201092970720.0, + "grad_norm": 7.171426448439728, + "language_loss": 0.9787246, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00427186, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.52709961, + "step": 176, + "time_per_iteration": 5.557633876800537 + }, + { + "auxiliary_loss_clip": 0.0141538, + "auxiliary_loss_mlp": 0.01120991, + "balance_loss_clip": 1.13171232, + "balance_loss_mlp": 1.0734508, + "epoch": 0.010641815722230573, + "flos": 23705470048320.0, + "grad_norm": 3.2595158681305394, + "language_loss": 0.76836538, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.79372907, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.47558594, + "step": 177, + "time_per_iteration": 2.785449504852295 + }, + { + "auxiliary_loss_clip": 0.01425586, + "auxiliary_loss_mlp": 0.01102166, + "balance_loss_clip": 1.13144433, + "balance_loss_mlp": 1.05157351, + "epoch": 0.010701938974898541, + "flos": 22369279114080.0, + "grad_norm": 2.9165560775508723, + "language_loss": 0.7699194, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79519689, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 2.94335938, + "router_z_loss_mlp": 0.50610352, + "step": 178, + "time_per_iteration": 2.7161378860473633 + }, + { + "auxiliary_loss_clip": 0.01435052, + "auxiliary_loss_mlp": 0.01105072, + "balance_loss_clip": 1.13875079, + "balance_loss_mlp": 1.05431306, + "epoch": 0.010762062227566512, + "flos": 23433029442720.0, + "grad_norm": 2.0875272861188283, + "language_loss": 0.84368753, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86908877, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 2.9609375, + "router_z_loss_mlp": 0.50805664, + "step": 179, + "time_per_iteration": 2.7156200408935547 + }, + { + "auxiliary_loss_clip": 0.01426901, + "auxiliary_loss_mlp": 0.01109582, + "balance_loss_clip": 1.13136089, + "balance_loss_mlp": 1.0578692, + "epoch": 0.01082218548023448, + "flos": 38346690414240.0, + "grad_norm": 2.2874030306955904, + "language_loss": 0.838682, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86404681, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 0.51757812, + "step": 180, + "time_per_iteration": 2.791616678237915 + }, + { + "auxiliary_loss_clip": 0.01424045, + "auxiliary_loss_mlp": 0.01112845, + "balance_loss_clip": 1.13321114, + "balance_loss_mlp": 1.06225288, + "epoch": 0.01088230873290245, + "flos": 30562001209440.0, + "grad_norm": 2.084689505120143, + "language_loss": 0.77915221, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80452108, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 2.90820312, + "router_z_loss_mlp": 0.50610352, + "step": 181, + "time_per_iteration": 2.930830478668213 + }, + { + "auxiliary_loss_clip": 0.01422706, + "auxiliary_loss_mlp": 0.01129507, + "balance_loss_clip": 1.13076472, + "balance_loss_mlp": 1.07977295, + "epoch": 0.01094243198557042, + "flos": 27934749240480.0, + "grad_norm": 4.120179983327372, + "language_loss": 0.7658118, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.79133391, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 2.91992188, + "router_z_loss_mlp": 0.49780273, + "step": 182, + "time_per_iteration": 2.779360771179199 + }, + { + "auxiliary_loss_clip": 0.01418375, + "auxiliary_loss_mlp": 0.01126631, + "balance_loss_clip": 1.12917352, + "balance_loss_mlp": 1.07513297, + "epoch": 0.011002555238238388, + "flos": 20944570623840.0, + "grad_norm": 2.2736668677780165, + "language_loss": 0.87809849, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.9035486, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.51489258, + "step": 183, + "time_per_iteration": 2.785749673843384 + }, + { + "auxiliary_loss_clip": 0.01419752, + "auxiliary_loss_mlp": 0.01112308, + "balance_loss_clip": 1.13024652, + "balance_loss_mlp": 1.06581676, + "epoch": 0.011062678490906358, + "flos": 27222962237280.0, + "grad_norm": 2.1890056129810347, + "language_loss": 0.86862469, + "learning_rate": 3.357647774369736e-06, + "loss": 0.89394534, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 2.89453125, + "router_z_loss_mlp": 0.46459961, + "step": 184, + "time_per_iteration": 2.7349140644073486 + }, + { + "auxiliary_loss_clip": 0.01414816, + "auxiliary_loss_mlp": 0.01103606, + "balance_loss_clip": 1.13127744, + "balance_loss_mlp": 1.0547061, + "epoch": 0.011122801743574327, + "flos": 29759103544320.0, + "grad_norm": 1.9925475234588312, + "language_loss": 0.83886838, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86405265, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.48950195, + "step": 185, + "time_per_iteration": 2.760558605194092 + }, + { + "auxiliary_loss_clip": 0.01425759, + "auxiliary_loss_mlp": 0.01104691, + "balance_loss_clip": 1.13085103, + "balance_loss_mlp": 1.05223882, + "epoch": 0.011182924996242297, + "flos": 22146789964320.0, + "grad_norm": 2.5765412317449328, + "language_loss": 0.7106508, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73595524, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 2.9453125, + "router_z_loss_mlp": 0.5246582, + "step": 186, + "time_per_iteration": 2.729583740234375 + }, + { + "auxiliary_loss_clip": 0.0142281, + "auxiliary_loss_mlp": 0.01102273, + "balance_loss_clip": 1.13101971, + "balance_loss_mlp": 1.05506611, + "epoch": 0.011243048248910266, + "flos": 18896633203680.0, + "grad_norm": 2.4187985380906625, + "language_loss": 1.01737714, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04262793, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 2.91796875, + "router_z_loss_mlp": 0.47241211, + "step": 187, + "time_per_iteration": 2.6784796714782715 + }, + { + "auxiliary_loss_clip": 0.01409702, + "auxiliary_loss_mlp": 0.01108885, + "balance_loss_clip": 1.12993681, + "balance_loss_mlp": 1.05912709, + "epoch": 0.011303171501578235, + "flos": 49928402937600.0, + "grad_norm": 1.911109093005095, + "language_loss": 0.74949259, + "learning_rate": 3.371494591560139e-06, + "loss": 0.77467841, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.49780273, + "step": 188, + "time_per_iteration": 2.9176273345947266 + }, + { + "auxiliary_loss_clip": 0.01294927, + "auxiliary_loss_mlp": 0.01038425, + "balance_loss_clip": 1.13838279, + "balance_loss_mlp": 1.01436889, + "epoch": 0.011363294754246205, + "flos": 80903542214880.0, + "grad_norm": 0.7514664918105294, + "language_loss": 0.56167102, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.58500451, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.24047852, + "step": 189, + "time_per_iteration": 3.4012908935546875 + }, + { + "auxiliary_loss_clip": 0.01410525, + "auxiliary_loss_mlp": 0.01120014, + "balance_loss_clip": 1.12713397, + "balance_loss_mlp": 1.06934988, + "epoch": 0.011423418006914174, + "flos": 30383669544480.0, + "grad_norm": 2.4356937411496804, + "language_loss": 0.94911718, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.97442257, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 2.83789062, + "router_z_loss_mlp": 0.50634766, + "step": 190, + "time_per_iteration": 2.771681547164917 + }, + { + "auxiliary_loss_clip": 0.01410277, + "auxiliary_loss_mlp": 0.01096848, + "balance_loss_clip": 1.12864995, + "balance_loss_mlp": 1.04945087, + "epoch": 0.011483541259582144, + "flos": 24150043175040.0, + "grad_norm": 2.7388485106077862, + "language_loss": 0.85116649, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.87623775, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 2.81835938, + "router_z_loss_mlp": 0.47412109, + "step": 191, + "time_per_iteration": 2.7312824726104736 + }, + { + "auxiliary_loss_clip": 0.01411018, + "auxiliary_loss_mlp": 0.01124205, + "balance_loss_clip": 1.12532246, + "balance_loss_mlp": 1.07587802, + "epoch": 0.011543664512250112, + "flos": 31941579800160.0, + "grad_norm": 1.8096899115011762, + "language_loss": 0.91693091, + "learning_rate": 3.385049875042367e-06, + "loss": 0.94228315, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 2.85546875, + "router_z_loss_mlp": 0.48339844, + "step": 192, + "time_per_iteration": 2.7723429203033447 + }, + { + "auxiliary_loss_clip": 0.01405242, + "auxiliary_loss_mlp": 0.01119221, + "balance_loss_clip": 1.12497425, + "balance_loss_mlp": 1.06777072, + "epoch": 0.011603787764918083, + "flos": 29003199573600.0, + "grad_norm": 2.5780362386112494, + "language_loss": 0.8709777, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89622229, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 2.80273438, + "router_z_loss_mlp": 0.51489258, + "step": 193, + "time_per_iteration": 2.757941246032715 + }, + { + "auxiliary_loss_clip": 0.01408809, + "auxiliary_loss_mlp": 0.01101161, + "balance_loss_clip": 1.12411988, + "balance_loss_mlp": 1.0536921, + "epoch": 0.011663911017586051, + "flos": 31670678851200.0, + "grad_norm": 4.219715853363775, + "language_loss": 0.9210546, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.9461543, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 2.84765625, + "router_z_loss_mlp": 0.47460938, + "step": 194, + "time_per_iteration": 2.8066890239715576 + }, + { + "auxiliary_loss_clip": 0.01416627, + "auxiliary_loss_mlp": 0.01103185, + "balance_loss_clip": 1.13066626, + "balance_loss_mlp": 1.05471492, + "epoch": 0.01172403427025402, + "flos": 21835459120320.0, + "grad_norm": 2.1096905950020517, + "language_loss": 0.90089434, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.92609251, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 2.859375, + "router_z_loss_mlp": 0.48486328, + "step": 195, + "time_per_iteration": 2.800358533859253 + }, + { + "auxiliary_loss_clip": 0.01409982, + "auxiliary_loss_mlp": 0.01113275, + "balance_loss_clip": 1.12898004, + "balance_loss_mlp": 1.06339836, + "epoch": 0.01178415752292199, + "flos": 21834770326560.0, + "grad_norm": 3.1338670364815653, + "language_loss": 0.85884631, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88407886, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 2.81054688, + "router_z_loss_mlp": 0.49829102, + "step": 196, + "time_per_iteration": 2.7151882648468018 + }, + { + "auxiliary_loss_clip": 0.01406366, + "auxiliary_loss_mlp": 0.01108167, + "balance_loss_clip": 1.12426877, + "balance_loss_mlp": 1.05855215, + "epoch": 0.011844280775589959, + "flos": 27934911309600.0, + "grad_norm": 2.1976837379869387, + "language_loss": 0.9293443, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95448959, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.49609375, + "step": 197, + "time_per_iteration": 2.751955270767212 + }, + { + "auxiliary_loss_clip": 0.01402683, + "auxiliary_loss_mlp": 0.01108498, + "balance_loss_clip": 1.12481117, + "balance_loss_mlp": 1.06014705, + "epoch": 0.01190440402825793, + "flos": 32921877232800.0, + "grad_norm": 2.145786314156838, + "language_loss": 0.79071838, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81583011, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 2.77929688, + "router_z_loss_mlp": 0.48339844, + "step": 198, + "time_per_iteration": 2.853865146636963 + }, + { + "auxiliary_loss_clip": 0.01400318, + "auxiliary_loss_mlp": 0.01112388, + "balance_loss_clip": 1.12725973, + "balance_loss_mlp": 1.0662303, + "epoch": 0.011964527280925898, + "flos": 25040485981440.0, + "grad_norm": 1.6989237840912155, + "language_loss": 0.88173854, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.9068656, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 2.72851562, + "router_z_loss_mlp": 0.46166992, + "step": 199, + "time_per_iteration": 2.7249083518981934 + }, + { + "auxiliary_loss_clip": 0.01411851, + "auxiliary_loss_mlp": 0.01122754, + "balance_loss_clip": 1.12996578, + "balance_loss_mlp": 1.07173264, + "epoch": 0.012024650533593867, + "flos": 33856234420320.0, + "grad_norm": 2.037073782395418, + "language_loss": 0.81094897, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83629501, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 2.81640625, + "router_z_loss_mlp": 0.51049805, + "step": 200, + "time_per_iteration": 2.7790169715881348 + }, + { + "auxiliary_loss_clip": 0.01410567, + "auxiliary_loss_mlp": 0.01098743, + "balance_loss_clip": 1.12835121, + "balance_loss_mlp": 1.05017734, + "epoch": 0.012084773786261837, + "flos": 12752780425920.0, + "grad_norm": 1.958723002763731, + "language_loss": 0.87610668, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.90119982, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 2.82226562, + "router_z_loss_mlp": 0.48583984, + "step": 201, + "time_per_iteration": 2.7182180881500244 + }, + { + "auxiliary_loss_clip": 0.01407238, + "auxiliary_loss_mlp": 0.01108022, + "balance_loss_clip": 1.12884736, + "balance_loss_mlp": 1.05847883, + "epoch": 0.012144897038929806, + "flos": 28194467420160.0, + "grad_norm": 2.1655289617792026, + "language_loss": 0.84266722, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86781985, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 2.78710938, + "router_z_loss_mlp": 0.49584961, + "step": 202, + "time_per_iteration": 2.746278762817383 + }, + { + "auxiliary_loss_clip": 0.01396716, + "auxiliary_loss_mlp": 0.01100938, + "balance_loss_clip": 1.12056494, + "balance_loss_mlp": 1.05256319, + "epoch": 0.012205020291597776, + "flos": 25664727843360.0, + "grad_norm": 1.9545870161582644, + "language_loss": 0.90033567, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.92531216, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 2.75976562, + "router_z_loss_mlp": 0.48339844, + "step": 203, + "time_per_iteration": 2.7256648540496826 + }, + { + "auxiliary_loss_clip": 0.0127372, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_clip": 1.12696707, + "balance_loss_mlp": 1.01831865, + "epoch": 0.012265143544265745, + "flos": 83521839864960.0, + "grad_norm": 1.0149384007672035, + "language_loss": 0.61192322, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.6350686, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 0.22485352, + "step": 204, + "time_per_iteration": 3.2777273654937744 + }, + { + "auxiliary_loss_clip": 0.01406937, + "auxiliary_loss_mlp": 0.01111238, + "balance_loss_clip": 1.12330556, + "balance_loss_mlp": 1.06224298, + "epoch": 0.012325266796933715, + "flos": 20766644131680.0, + "grad_norm": 2.2740737310844445, + "language_loss": 0.91299617, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.93817794, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 2.83398438, + "router_z_loss_mlp": 0.48999023, + "step": 205, + "time_per_iteration": 2.766155958175659 + }, + { + "auxiliary_loss_clip": 0.01416028, + "auxiliary_loss_mlp": 0.01117289, + "balance_loss_clip": 1.13107872, + "balance_loss_mlp": 1.0684135, + "epoch": 0.012385390049601683, + "flos": 24636221197920.0, + "grad_norm": 2.2055934562527533, + "language_loss": 0.89320445, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.91853774, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 2.8515625, + "router_z_loss_mlp": 0.48828125, + "step": 206, + "time_per_iteration": 2.743853807449341 + }, + { + "auxiliary_loss_clip": 0.0140594, + "auxiliary_loss_mlp": 0.01101023, + "balance_loss_clip": 1.12416816, + "balance_loss_mlp": 1.0535059, + "epoch": 0.012445513302269652, + "flos": 19831760219520.0, + "grad_norm": 2.004934222022271, + "language_loss": 0.95411289, + "learning_rate": 3.43348263905683e-06, + "loss": 0.97918248, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 2.81445312, + "router_z_loss_mlp": 0.4753418, + "step": 207, + "time_per_iteration": 2.7165451049804688 + }, + { + "auxiliary_loss_clip": 0.01403301, + "auxiliary_loss_mlp": 0.01113064, + "balance_loss_clip": 1.12930858, + "balance_loss_mlp": 1.064641, + "epoch": 0.012505636554937622, + "flos": 29003645263680.0, + "grad_norm": 2.1236322556819713, + "language_loss": 0.7576555, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78281909, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.48413086, + "step": 208, + "time_per_iteration": 2.7434539794921875 + }, + { + "auxiliary_loss_clip": 0.01392401, + "auxiliary_loss_mlp": 0.01103654, + "balance_loss_clip": 1.12331963, + "balance_loss_mlp": 1.05573225, + "epoch": 0.012565759807605591, + "flos": 37328637227040.0, + "grad_norm": 2.102350286355916, + "language_loss": 0.98628122, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01124179, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.47924805, + "step": 209, + "time_per_iteration": 2.7866218090057373 + }, + { + "auxiliary_loss_clip": 0.0139692, + "auxiliary_loss_mlp": 0.01115303, + "balance_loss_clip": 1.12399328, + "balance_loss_mlp": 1.0667851, + "epoch": 0.012625883060273561, + "flos": 48948146022240.0, + "grad_norm": 3.8838793783299677, + "language_loss": 0.8503176, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.87543988, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.4855957, + "step": 210, + "time_per_iteration": 2.900022268295288 + }, + { + "auxiliary_loss_clip": 0.01393833, + "auxiliary_loss_mlp": 0.01121588, + "balance_loss_clip": 1.12212157, + "balance_loss_mlp": 1.07562089, + "epoch": 0.01268600631294153, + "flos": 33054754860000.0, + "grad_norm": 2.1569082290119823, + "language_loss": 0.96960759, + "learning_rate": 3.445805545042314e-06, + "loss": 0.99476182, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 2.71679688, + "router_z_loss_mlp": 0.45898438, + "step": 211, + "time_per_iteration": 2.7891323566436768 + }, + { + "auxiliary_loss_clip": 0.01402526, + "auxiliary_loss_mlp": 0.01121587, + "balance_loss_clip": 1.12725699, + "balance_loss_mlp": 1.07211554, + "epoch": 0.012746129565609499, + "flos": 20722121991360.0, + "grad_norm": 2.4548443363979398, + "language_loss": 0.94936955, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97461069, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 2.75390625, + "router_z_loss_mlp": 0.49487305, + "step": 212, + "time_per_iteration": 2.71856951713562 + }, + { + "auxiliary_loss_clip": 0.01393205, + "auxiliary_loss_mlp": 0.01109111, + "balance_loss_clip": 1.12461472, + "balance_loss_mlp": 1.062572, + "epoch": 0.012806252818277469, + "flos": 56782422027360.0, + "grad_norm": 2.0303390383244033, + "language_loss": 0.76120067, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78622383, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 0.46557617, + "step": 213, + "time_per_iteration": 2.970418930053711 + }, + { + "auxiliary_loss_clip": 0.01396536, + "auxiliary_loss_mlp": 0.01106936, + "balance_loss_clip": 1.12261677, + "balance_loss_mlp": 1.06061172, + "epoch": 0.012866376070945438, + "flos": 17556876748800.0, + "grad_norm": 2.4091741492294942, + "language_loss": 0.86639965, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.89143437, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 2.7421875, + "router_z_loss_mlp": 0.46289062, + "step": 214, + "time_per_iteration": 2.679232597351074 + }, + { + "auxiliary_loss_clip": 0.01393532, + "auxiliary_loss_mlp": 0.01122824, + "balance_loss_clip": 1.12869, + "balance_loss_mlp": 1.07330453, + "epoch": 0.012926499323613408, + "flos": 31897462832640.0, + "grad_norm": 2.339408540050906, + "language_loss": 0.77541578, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.80057931, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.49536133, + "step": 215, + "time_per_iteration": 4.28437066078186 + }, + { + "auxiliary_loss_clip": 0.01401349, + "auxiliary_loss_mlp": 0.01114742, + "balance_loss_clip": 1.12589812, + "balance_loss_mlp": 1.06753492, + "epoch": 0.012986622576281377, + "flos": 36749808885600.0, + "grad_norm": 2.6188157200809132, + "language_loss": 0.90728295, + "learning_rate": 3.460884739729461e-06, + "loss": 0.93244386, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 2.75585938, + "router_z_loss_mlp": 0.47192383, + "step": 216, + "time_per_iteration": 5.650654077529907 + }, + { + "auxiliary_loss_clip": 0.0139193, + "auxiliary_loss_mlp": 0.01102206, + "balance_loss_clip": 1.11846268, + "balance_loss_mlp": 1.05521417, + "epoch": 0.013046745828949347, + "flos": 17026257620160.0, + "grad_norm": 2.440220280200938, + "language_loss": 0.9365499, + "learning_rate": 3.463858658104523e-06, + "loss": 0.96149123, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 2.73046875, + "router_z_loss_mlp": 0.46899414, + "step": 217, + "time_per_iteration": 2.70591139793396 + }, + { + "auxiliary_loss_clip": 0.01388704, + "auxiliary_loss_mlp": 0.0110051, + "balance_loss_clip": 1.11891246, + "balance_loss_mlp": 1.05153847, + "epoch": 0.013106869081617315, + "flos": 21167586498240.0, + "grad_norm": 1.92703481022837, + "language_loss": 0.93482554, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.95971769, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 2.69921875, + "router_z_loss_mlp": 0.48950195, + "step": 218, + "time_per_iteration": 2.700800895690918 + }, + { + "auxiliary_loss_clip": 0.01385634, + "auxiliary_loss_mlp": 0.01098173, + "balance_loss_clip": 1.12095976, + "balance_loss_mlp": 1.0526588, + "epoch": 0.013166992334285284, + "flos": 31585240608480.0, + "grad_norm": 1.8774024427219826, + "language_loss": 0.861848, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88668609, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 2.64648438, + "router_z_loss_mlp": 0.45483398, + "step": 219, + "time_per_iteration": 2.7690744400024414 + }, + { + "auxiliary_loss_clip": 0.01390387, + "auxiliary_loss_mlp": 0.01103635, + "balance_loss_clip": 1.12176514, + "balance_loss_mlp": 1.05781078, + "epoch": 0.013227115586953254, + "flos": 38041720783200.0, + "grad_norm": 1.6399458914056606, + "language_loss": 0.87778729, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.90272748, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 2.68945312, + "router_z_loss_mlp": 0.45825195, + "step": 220, + "time_per_iteration": 2.8095834255218506 + }, + { + "auxiliary_loss_clip": 0.01380333, + "auxiliary_loss_mlp": 0.01112019, + "balance_loss_clip": 1.11599016, + "balance_loss_mlp": 1.06910408, + "epoch": 0.013287238839621223, + "flos": 24903921281760.0, + "grad_norm": 1.8073681100463408, + "language_loss": 0.86652446, + "learning_rate": 3.475618842282164e-06, + "loss": 0.89144796, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 2.64453125, + "router_z_loss_mlp": 0.42919922, + "step": 221, + "time_per_iteration": 2.7374722957611084 + }, + { + "auxiliary_loss_clip": 0.01386386, + "auxiliary_loss_mlp": 0.01115513, + "balance_loss_clip": 1.11610055, + "balance_loss_mlp": 1.06885409, + "epoch": 0.013347362092289193, + "flos": 17249638150080.0, + "grad_norm": 2.2325993124515398, + "language_loss": 0.92297614, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94799513, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 2.70117188, + "router_z_loss_mlp": 0.46630859, + "step": 222, + "time_per_iteration": 2.7441158294677734 + }, + { + "auxiliary_loss_clip": 0.01384936, + "auxiliary_loss_mlp": 0.01109714, + "balance_loss_clip": 1.11935329, + "balance_loss_mlp": 1.06007552, + "epoch": 0.013407485344957162, + "flos": 26597788477920.0, + "grad_norm": 2.281548031311958, + "language_loss": 0.95729673, + "learning_rate": 3.481419351635897e-06, + "loss": 0.98224318, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.49584961, + "step": 223, + "time_per_iteration": 2.7376160621643066 + }, + { + "auxiliary_loss_clip": 0.0138384, + "auxiliary_loss_mlp": 0.01105573, + "balance_loss_clip": 1.11773205, + "balance_loss_mlp": 1.05993962, + "epoch": 0.013467608597625132, + "flos": 22722255371520.0, + "grad_norm": 2.2372446883349473, + "language_loss": 0.88169497, + "learning_rate": 3.484300126837776e-06, + "loss": 0.90658903, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.45629883, + "step": 224, + "time_per_iteration": 2.7532975673675537 + }, + { + "auxiliary_loss_clip": 0.01385104, + "auxiliary_loss_mlp": 0.01104151, + "balance_loss_clip": 1.11860728, + "balance_loss_mlp": 1.05446506, + "epoch": 0.013527731850293101, + "flos": 22637343853440.0, + "grad_norm": 2.213110199982116, + "language_loss": 0.89636683, + "learning_rate": 3.487168070036317e-06, + "loss": 0.9212594, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.49682617, + "step": 225, + "time_per_iteration": 2.749796152114868 + }, + { + "auxiliary_loss_clip": 0.01381124, + "auxiliary_loss_mlp": 0.01116068, + "balance_loss_clip": 1.11797309, + "balance_loss_mlp": 1.06750166, + "epoch": 0.01358785510296107, + "flos": 23386116782880.0, + "grad_norm": 1.8980362984810946, + "language_loss": 0.98689389, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01186585, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 2.6328125, + "router_z_loss_mlp": 0.4855957, + "step": 226, + "time_per_iteration": 2.8603384494781494 + }, + { + "auxiliary_loss_clip": 0.01387321, + "auxiliary_loss_mlp": 0.01110371, + "balance_loss_clip": 1.12162304, + "balance_loss_mlp": 1.06039834, + "epoch": 0.01364797835562904, + "flos": 28467556302240.0, + "grad_norm": 2.8450509055151456, + "language_loss": 0.91166854, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93664539, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 0.50024414, + "step": 227, + "time_per_iteration": 2.7960331439971924 + }, + { + "auxiliary_loss_clip": 0.01255939, + "auxiliary_loss_mlp": 0.01054899, + "balance_loss_clip": 1.11851072, + "balance_loss_mlp": 1.03425241, + "epoch": 0.013708101608297009, + "flos": 86625997750080.0, + "grad_norm": 0.9395758301977092, + "language_loss": 0.57675683, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.5998652, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.20654297, + "step": 228, + "time_per_iteration": 3.4486191272735596 + }, + { + "auxiliary_loss_clip": 0.01375414, + "auxiliary_loss_mlp": 0.01102925, + "balance_loss_clip": 1.11631119, + "balance_loss_mlp": 1.05807853, + "epoch": 0.013768224860964979, + "flos": 19919751050880.0, + "grad_norm": 2.2465540167675604, + "language_loss": 0.87814206, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90292537, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 2.59179688, + "router_z_loss_mlp": 0.44848633, + "step": 229, + "time_per_iteration": 2.774000644683838 + }, + { + "auxiliary_loss_clip": 0.01383093, + "auxiliary_loss_mlp": 0.01097354, + "balance_loss_clip": 1.1166563, + "balance_loss_mlp": 1.05172062, + "epoch": 0.013828348113632948, + "flos": 24639584132160.0, + "grad_norm": 4.389477981162218, + "language_loss": 0.84224409, + "learning_rate": 3.501319237118231e-06, + "loss": 0.86704856, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 2.66601562, + "router_z_loss_mlp": 0.45678711, + "step": 230, + "time_per_iteration": 2.751049280166626 + }, + { + "auxiliary_loss_clip": 0.01382656, + "auxiliary_loss_mlp": 0.01111434, + "balance_loss_clip": 1.11881053, + "balance_loss_mlp": 1.06637287, + "epoch": 0.013888471366300916, + "flos": 25308267099840.0, + "grad_norm": 2.121007165411203, + "language_loss": 0.90266299, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.92760384, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 2.63867188, + "router_z_loss_mlp": 0.45043945, + "step": 231, + "time_per_iteration": 2.804349184036255 + }, + { + "auxiliary_loss_clip": 0.01385793, + "auxiliary_loss_mlp": 0.01106542, + "balance_loss_clip": 1.12254632, + "balance_loss_mlp": 1.06162405, + "epoch": 0.013948594618968886, + "flos": 26955140601600.0, + "grad_norm": 3.205488795493729, + "language_loss": 0.83779573, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.86271906, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 2.63671875, + "router_z_loss_mlp": 0.44873047, + "step": 232, + "time_per_iteration": 2.7621612548828125 + }, + { + "auxiliary_loss_clip": 0.01388916, + "auxiliary_loss_mlp": 0.01092266, + "balance_loss_clip": 1.11481225, + "balance_loss_mlp": 1.04482102, + "epoch": 0.014008717871636855, + "flos": 23260653817920.0, + "grad_norm": 2.703605227288361, + "language_loss": 0.74526691, + "learning_rate": 3.509663010692652e-06, + "loss": 0.77007878, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 2.74023438, + "router_z_loss_mlp": 0.47460938, + "step": 233, + "time_per_iteration": 2.7946906089782715 + }, + { + "auxiliary_loss_clip": 0.01387656, + "auxiliary_loss_mlp": 0.01112743, + "balance_loss_clip": 1.11823559, + "balance_loss_mlp": 1.06541717, + "epoch": 0.014068841124304825, + "flos": 17730751512960.0, + "grad_norm": 1.9572540494938344, + "language_loss": 0.85505879, + "learning_rate": 3.512420411838642e-06, + "loss": 0.88006282, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 2.69726562, + "router_z_loss_mlp": 0.47363281, + "step": 234, + "time_per_iteration": 2.710214853286743 + }, + { + "auxiliary_loss_clip": 0.01383579, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_clip": 1.12112808, + "balance_loss_mlp": 1.06212413, + "epoch": 0.014128964376972794, + "flos": 22057867235520.0, + "grad_norm": 2.9065788758652276, + "language_loss": 0.89090699, + "learning_rate": 3.515166054308634e-06, + "loss": 0.91581321, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.44873047, + "step": 235, + "time_per_iteration": 2.7699618339538574 + }, + { + "auxiliary_loss_clip": 0.01382048, + "auxiliary_loss_mlp": 0.01117295, + "balance_loss_clip": 1.12075257, + "balance_loss_mlp": 1.07316363, + "epoch": 0.014189087629640764, + "flos": 30913680913920.0, + "grad_norm": 2.5146334143271045, + "language_loss": 0.85930312, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.8842966, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 2.61132812, + "router_z_loss_mlp": 0.44116211, + "step": 236, + "time_per_iteration": 2.827768325805664 + }, + { + "auxiliary_loss_clip": 0.01378741, + "auxiliary_loss_mlp": 0.01094961, + "balance_loss_clip": 1.11437881, + "balance_loss_mlp": 1.04956579, + "epoch": 0.014249210882308733, + "flos": 44098757730720.0, + "grad_norm": 1.949633523320079, + "language_loss": 0.82656199, + "learning_rate": 3.520622461401154e-06, + "loss": 0.85129905, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 2.64257812, + "router_z_loss_mlp": 0.45385742, + "step": 237, + "time_per_iteration": 2.9053781032562256 + }, + { + "auxiliary_loss_clip": 0.01377648, + "auxiliary_loss_mlp": 0.01113683, + "balance_loss_clip": 1.11674142, + "balance_loss_mlp": 1.0661664, + "epoch": 0.014309334134976702, + "flos": 15780245450400.0, + "grad_norm": 1.9223764966779693, + "language_loss": 0.77040541, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79531872, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 2.61523438, + "router_z_loss_mlp": 0.4753418, + "step": 238, + "time_per_iteration": 2.751877546310425 + }, + { + "auxiliary_loss_clip": 0.0137432, + "auxiliary_loss_mlp": 0.01103189, + "balance_loss_clip": 1.11901259, + "balance_loss_mlp": 1.06020236, + "epoch": 0.014369457387644672, + "flos": 25352748722880.0, + "grad_norm": 1.8288401757823114, + "language_loss": 0.87157476, + "learning_rate": 3.526033015791284e-06, + "loss": 0.89634991, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.43017578, + "step": 239, + "time_per_iteration": 2.7562365531921387 + }, + { + "auxiliary_loss_clip": 0.01358135, + "auxiliary_loss_mlp": 0.01092101, + "balance_loss_clip": 1.10958672, + "balance_loss_mlp": 1.04942465, + "epoch": 0.01442958064031264, + "flos": 31540840020000.0, + "grad_norm": 2.0469862353212487, + "language_loss": 0.93241495, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95691729, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.42700195, + "step": 240, + "time_per_iteration": 2.7949929237365723 + }, + { + "auxiliary_loss_clip": 0.01371647, + "auxiliary_loss_mlp": 0.01098906, + "balance_loss_clip": 1.11655414, + "balance_loss_mlp": 1.05704021, + "epoch": 0.014489703892980611, + "flos": 34434373968000.0, + "grad_norm": 2.6879850771957545, + "language_loss": 0.85002673, + "learning_rate": 3.531398481704111e-06, + "loss": 0.87473226, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 2.55273438, + "router_z_loss_mlp": 0.41894531, + "step": 241, + "time_per_iteration": 2.7979986667633057 + }, + { + "auxiliary_loss_clip": 0.01369014, + "auxiliary_loss_mlp": 0.01114347, + "balance_loss_clip": 1.12160051, + "balance_loss_mlp": 1.06923783, + "epoch": 0.01454982714564858, + "flos": 27443709144000.0, + "grad_norm": 1.8304806915254306, + "language_loss": 0.88314283, + "learning_rate": 3.534064540103573e-06, + "loss": 0.90797639, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 2.47265625, + "router_z_loss_mlp": 0.45117188, + "step": 242, + "time_per_iteration": 2.770249605178833 + }, + { + "auxiliary_loss_clip": 0.01366915, + "auxiliary_loss_mlp": 0.01095923, + "balance_loss_clip": 1.11299312, + "balance_loss_mlp": 1.05038476, + "epoch": 0.014609950398316548, + "flos": 26421320607840.0, + "grad_norm": 2.3574775983815877, + "language_loss": 0.86641157, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89103997, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 2.53710938, + "router_z_loss_mlp": 0.45581055, + "step": 243, + "time_per_iteration": 2.7260701656341553 + }, + { + "auxiliary_loss_clip": 0.01374241, + "auxiliary_loss_mlp": 0.01100067, + "balance_loss_clip": 1.11699665, + "balance_loss_mlp": 1.05448151, + "epoch": 0.014670073650984519, + "flos": 26686062930240.0, + "grad_norm": 1.5624345642713422, + "language_loss": 0.84327352, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.8680166, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.45581055, + "step": 244, + "time_per_iteration": 2.7557361125946045 + }, + { + "auxiliary_loss_clip": 0.01383349, + "auxiliary_loss_mlp": 0.0111303, + "balance_loss_clip": 1.12070727, + "balance_loss_mlp": 1.06656218, + "epoch": 0.014730196903652487, + "flos": 28289143602720.0, + "grad_norm": 2.133833248161333, + "language_loss": 0.78562295, + "learning_rate": 3.54199711087864e-06, + "loss": 0.81058669, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 2.62695312, + "router_z_loss_mlp": 0.46508789, + "step": 245, + "time_per_iteration": 2.7246456146240234 + }, + { + "auxiliary_loss_clip": 0.01381963, + "auxiliary_loss_mlp": 0.01102135, + "balance_loss_clip": 1.11617792, + "balance_loss_mlp": 1.05342603, + "epoch": 0.014790320156320457, + "flos": 28336380400800.0, + "grad_norm": 2.0161062250894513, + "language_loss": 0.84165299, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86649394, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.48681641, + "step": 246, + "time_per_iteration": 2.7974908351898193 + }, + { + "auxiliary_loss_clip": 0.01375174, + "auxiliary_loss_mlp": 0.01098566, + "balance_loss_clip": 1.11416364, + "balance_loss_mlp": 1.05257487, + "epoch": 0.014850443408988426, + "flos": 19297697122080.0, + "grad_norm": 2.741312186033544, + "language_loss": 0.90168178, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.9264192, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.46020508, + "step": 247, + "time_per_iteration": 2.7736682891845703 + }, + { + "auxiliary_loss_clip": 0.01376808, + "auxiliary_loss_mlp": 0.01098713, + "balance_loss_clip": 1.11055803, + "balance_loss_mlp": 1.05465305, + "epoch": 0.014910566661656396, + "flos": 27798589713600.0, + "grad_norm": 5.739045248476814, + "language_loss": 0.78057361, + "learning_rate": 3.549833136812155e-06, + "loss": 0.80532885, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 2.66015625, + "router_z_loss_mlp": 0.44067383, + "step": 248, + "time_per_iteration": 2.763833522796631 + }, + { + "auxiliary_loss_clip": 0.01374419, + "auxiliary_loss_mlp": 0.01104859, + "balance_loss_clip": 1.11886811, + "balance_loss_mlp": 1.06098974, + "epoch": 0.014970689914324365, + "flos": 32781706495200.0, + "grad_norm": 2.938035334054467, + "language_loss": 0.836218, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86101079, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 2.55859375, + "router_z_loss_mlp": 0.43896484, + "step": 249, + "time_per_iteration": 2.784397602081299 + }, + { + "auxiliary_loss_clip": 0.01366858, + "auxiliary_loss_mlp": 0.01100332, + "balance_loss_clip": 1.1120075, + "balance_loss_mlp": 1.05696368, + "epoch": 0.015030813166992334, + "flos": 17065309927680.0, + "grad_norm": 1.992274757189024, + "language_loss": 0.93450969, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.95918155, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 2.55078125, + "router_z_loss_mlp": 0.43383789, + "step": 250, + "time_per_iteration": 2.7216663360595703 + }, + { + "auxiliary_loss_clip": 0.0137723, + "auxiliary_loss_mlp": 0.01115006, + "balance_loss_clip": 1.11733913, + "balance_loss_mlp": 1.06849027, + "epoch": 0.015090936419660304, + "flos": 30161018325600.0, + "grad_norm": 2.2349717959097495, + "language_loss": 0.9700824, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99500477, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 2.59960938, + "router_z_loss_mlp": 0.46508789, + "step": 251, + "time_per_iteration": 2.795107364654541 + }, + { + "auxiliary_loss_clip": 0.01369387, + "auxiliary_loss_mlp": 0.01100828, + "balance_loss_clip": 1.11026287, + "balance_loss_mlp": 1.05660093, + "epoch": 0.015151059672328273, + "flos": 31406949460800.0, + "grad_norm": 1.849771432415494, + "language_loss": 0.84603989, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.87074202, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 2.59570312, + "router_z_loss_mlp": 0.44238281, + "step": 252, + "time_per_iteration": 2.7514994144439697 + }, + { + "auxiliary_loss_clip": 0.01366787, + "auxiliary_loss_mlp": 0.01109818, + "balance_loss_clip": 1.11496997, + "balance_loss_mlp": 1.06485248, + "epoch": 0.015211182924996243, + "flos": 26687440517760.0, + "grad_norm": 2.286197625444377, + "language_loss": 0.9884423, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.01320839, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 2.51757812, + "router_z_loss_mlp": 0.44946289, + "step": 253, + "time_per_iteration": 2.719421625137329 + }, + { + "auxiliary_loss_clip": 0.01242896, + "auxiliary_loss_mlp": 0.01067436, + "balance_loss_clip": 1.11010098, + "balance_loss_mlp": 1.04876757, + "epoch": 0.015271306177664212, + "flos": 81625620607200.0, + "grad_norm": 0.8622760899927249, + "language_loss": 0.55668038, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.57978374, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.1862793, + "step": 254, + "time_per_iteration": 4.757065296173096 + }, + { + "auxiliary_loss_clip": 0.01372714, + "auxiliary_loss_mlp": 0.01113551, + "balance_loss_clip": 1.11240149, + "balance_loss_mlp": 1.06820369, + "epoch": 0.01533142943033218, + "flos": 32743424016000.0, + "grad_norm": 2.6246133413098525, + "language_loss": 0.90535009, + "learning_rate": 3.567754632921479e-06, + "loss": 0.93021274, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 2.6015625, + "router_z_loss_mlp": 0.45336914, + "step": 255, + "time_per_iteration": 5.698214769363403 + }, + { + "auxiliary_loss_clip": 0.01368999, + "auxiliary_loss_mlp": 0.01128672, + "balance_loss_clip": 1.11322117, + "balance_loss_mlp": 1.08339643, + "epoch": 0.01539155268300015, + "flos": 25396987242240.0, + "grad_norm": 2.1518866275744326, + "language_loss": 0.8552686, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.88024533, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.45263672, + "step": 256, + "time_per_iteration": 2.7585415840148926 + }, + { + "auxiliary_loss_clip": 0.01377315, + "auxiliary_loss_mlp": 0.01117097, + "balance_loss_clip": 1.11495924, + "balance_loss_mlp": 1.0718689, + "epoch": 0.01545167593566812, + "flos": 19476271890720.0, + "grad_norm": 2.760960192397453, + "language_loss": 0.71842563, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.74336982, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 2.62304688, + "router_z_loss_mlp": 0.45239258, + "step": 257, + "time_per_iteration": 2.731505870819092 + }, + { + "auxiliary_loss_clip": 0.01368275, + "auxiliary_loss_mlp": 0.01104208, + "balance_loss_clip": 1.11374235, + "balance_loss_mlp": 1.05890822, + "epoch": 0.01551179918833609, + "flos": 27578815221600.0, + "grad_norm": 1.8526073232373905, + "language_loss": 0.94631863, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97104341, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 2.546875, + "router_z_loss_mlp": 0.453125, + "step": 258, + "time_per_iteration": 2.7985525131225586 + }, + { + "auxiliary_loss_clip": 0.01370306, + "auxiliary_loss_mlp": 0.01106065, + "balance_loss_clip": 1.11334193, + "balance_loss_mlp": 1.06319726, + "epoch": 0.015571922441004058, + "flos": 27841491162720.0, + "grad_norm": 2.300811207628998, + "language_loss": 0.92826307, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95302683, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 2.5703125, + "router_z_loss_mlp": 0.42871094, + "step": 259, + "time_per_iteration": 2.8305089473724365 + }, + { + "auxiliary_loss_clip": 0.01361745, + "auxiliary_loss_mlp": 0.0109778, + "balance_loss_clip": 1.11600161, + "balance_loss_mlp": 1.05672407, + "epoch": 0.015632045693672027, + "flos": 23120077907520.0, + "grad_norm": 1.8057607998450653, + "language_loss": 0.97233176, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99692702, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 2.45898438, + "router_z_loss_mlp": 0.41015625, + "step": 260, + "time_per_iteration": 2.73941707611084 + }, + { + "auxiliary_loss_clip": 0.0137561, + "auxiliary_loss_mlp": 0.01114415, + "balance_loss_clip": 1.11460066, + "balance_loss_mlp": 1.0707128, + "epoch": 0.015692168946339995, + "flos": 36572247048960.0, + "grad_norm": 2.015288421787853, + "language_loss": 0.87823832, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.90313858, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 2.609375, + "router_z_loss_mlp": 0.43701172, + "step": 261, + "time_per_iteration": 2.7859835624694824 + }, + { + "auxiliary_loss_clip": 0.01366248, + "auxiliary_loss_mlp": 0.01111148, + "balance_loss_clip": 1.11182547, + "balance_loss_mlp": 1.06718421, + "epoch": 0.015752292199007967, + "flos": 23662041874560.0, + "grad_norm": 2.0022368391403003, + "language_loss": 0.67348856, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69826257, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 2.54101562, + "router_z_loss_mlp": 0.43969727, + "step": 262, + "time_per_iteration": 2.730647087097168 + }, + { + "auxiliary_loss_clip": 0.01367192, + "auxiliary_loss_mlp": 0.01121735, + "balance_loss_clip": 1.11413383, + "balance_loss_mlp": 1.07686496, + "epoch": 0.015812415451675936, + "flos": 24818604590880.0, + "grad_norm": 2.215704401973334, + "language_loss": 0.68132788, + "learning_rate": 3.587643540438383e-06, + "loss": 0.70621717, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 2.52929688, + "router_z_loss_mlp": 0.44897461, + "step": 263, + "time_per_iteration": 2.671121835708618 + }, + { + "auxiliary_loss_clip": 0.01365915, + "auxiliary_loss_mlp": 0.01104394, + "balance_loss_clip": 1.11025095, + "balance_loss_mlp": 1.0610733, + "epoch": 0.015872538704343905, + "flos": 21384727367040.0, + "grad_norm": 2.371077187564973, + "language_loss": 0.85128731, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87599033, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.43359375, + "step": 264, + "time_per_iteration": 2.6744959354400635 + }, + { + "auxiliary_loss_clip": 0.01369801, + "auxiliary_loss_mlp": 0.01091504, + "balance_loss_clip": 1.11370921, + "balance_loss_mlp": 1.04966164, + "epoch": 0.015932661957011873, + "flos": 18621153802080.0, + "grad_norm": 2.577223000157792, + "language_loss": 1.04053164, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06514478, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 2.5625, + "router_z_loss_mlp": 0.41845703, + "step": 265, + "time_per_iteration": 2.7093863487243652 + }, + { + "auxiliary_loss_clip": 0.01375258, + "auxiliary_loss_mlp": 0.01108866, + "balance_loss_clip": 1.11864066, + "balance_loss_mlp": 1.06304157, + "epoch": 0.015992785209679845, + "flos": 24773191070400.0, + "grad_norm": 2.8366110612135422, + "language_loss": 0.75077766, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77561891, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 2.56054688, + "router_z_loss_mlp": 0.45849609, + "step": 266, + "time_per_iteration": 2.7893130779266357 + }, + { + "auxiliary_loss_clip": 0.01360471, + "auxiliary_loss_mlp": 0.01102655, + "balance_loss_clip": 1.11404574, + "balance_loss_mlp": 1.05866647, + "epoch": 0.016052908462347814, + "flos": 28507499989920.0, + "grad_norm": 1.7896687227074362, + "language_loss": 0.90758955, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93222076, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 2.46289062, + "router_z_loss_mlp": 0.43994141, + "step": 267, + "time_per_iteration": 2.785590410232544 + }, + { + "auxiliary_loss_clip": 0.01369994, + "auxiliary_loss_mlp": 0.0111195, + "balance_loss_clip": 1.11553454, + "balance_loss_mlp": 1.07005966, + "epoch": 0.016113031715015783, + "flos": 25974113857920.0, + "grad_norm": 3.0079635096798945, + "language_loss": 0.85612315, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88094264, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 2.54492188, + "router_z_loss_mlp": 0.41894531, + "step": 268, + "time_per_iteration": 2.7075400352478027 + }, + { + "auxiliary_loss_clip": 0.01360857, + "auxiliary_loss_mlp": 0.01095282, + "balance_loss_clip": 1.11528778, + "balance_loss_mlp": 1.05336857, + "epoch": 0.01617315496768375, + "flos": 23079769564320.0, + "grad_norm": 2.3670097874919525, + "language_loss": 0.88264591, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90720731, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 2.45117188, + "router_z_loss_mlp": 0.41918945, + "step": 269, + "time_per_iteration": 2.7443861961364746 + }, + { + "auxiliary_loss_clip": 0.01366284, + "auxiliary_loss_mlp": 0.01100274, + "balance_loss_clip": 1.11295009, + "balance_loss_mlp": 1.05461657, + "epoch": 0.01623327822035172, + "flos": 20189598550560.0, + "grad_norm": 2.0559607461598848, + "language_loss": 0.97153223, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99619782, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 0.45654297, + "step": 270, + "time_per_iteration": 2.7615456581115723 + }, + { + "auxiliary_loss_clip": 0.01366827, + "auxiliary_loss_mlp": 0.01111122, + "balance_loss_clip": 1.11785889, + "balance_loss_mlp": 1.0685643, + "epoch": 0.016293401473019692, + "flos": 28691423039520.0, + "grad_norm": 4.959791746929564, + "language_loss": 0.86088938, + "learning_rate": 3.606936435072361e-06, + "loss": 0.88566887, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 2.49023438, + "router_z_loss_mlp": 0.42578125, + "step": 271, + "time_per_iteration": 2.798658609390259 + }, + { + "auxiliary_loss_clip": 0.01365462, + "auxiliary_loss_mlp": 0.01098621, + "balance_loss_clip": 1.11001253, + "balance_loss_mlp": 1.05568123, + "epoch": 0.01635352472568766, + "flos": 35406730013760.0, + "grad_norm": 3.199893258084364, + "language_loss": 0.81532001, + "learning_rate": 3.609307900676025e-06, + "loss": 0.83996087, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 2.5546875, + "router_z_loss_mlp": 0.42944336, + "step": 272, + "time_per_iteration": 2.9073188304901123 + }, + { + "auxiliary_loss_clip": 0.01358518, + "auxiliary_loss_mlp": 0.01114137, + "balance_loss_clip": 1.11216021, + "balance_loss_mlp": 1.07222319, + "epoch": 0.01641364797835563, + "flos": 16314186996000.0, + "grad_norm": 2.2474830396377743, + "language_loss": 0.81314349, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83787012, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 2.46484375, + "router_z_loss_mlp": 0.41918945, + "step": 273, + "time_per_iteration": 2.8141467571258545 + }, + { + "auxiliary_loss_clip": 0.01356563, + "auxiliary_loss_mlp": 0.01098559, + "balance_loss_clip": 1.10756087, + "balance_loss_mlp": 1.05542874, + "epoch": 0.016473771231023598, + "flos": 23035287941280.0, + "grad_norm": 2.14669611318564, + "language_loss": 0.91473138, + "learning_rate": 3.614024787585744e-06, + "loss": 0.9392826, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 2.49414062, + "router_z_loss_mlp": 0.43115234, + "step": 274, + "time_per_iteration": 2.6937241554260254 + }, + { + "auxiliary_loss_clip": 0.01356888, + "auxiliary_loss_mlp": 0.01106978, + "balance_loss_clip": 1.11150146, + "balance_loss_mlp": 1.06308544, + "epoch": 0.016533894483691566, + "flos": 27578693669760.0, + "grad_norm": 1.8059552861171804, + "language_loss": 0.88449848, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90913713, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 2.45507812, + "router_z_loss_mlp": 0.43896484, + "step": 275, + "time_per_iteration": 2.7421133518218994 + }, + { + "auxiliary_loss_clip": 0.01361368, + "auxiliary_loss_mlp": 0.01099714, + "balance_loss_clip": 1.11275399, + "balance_loss_mlp": 1.05467653, + "epoch": 0.01659401773635954, + "flos": 26243151012000.0, + "grad_norm": 1.567260313192797, + "language_loss": 0.80759758, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.8322084, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 0.44970703, + "step": 276, + "time_per_iteration": 2.7273499965667725 + }, + { + "auxiliary_loss_clip": 0.01351313, + "auxiliary_loss_mlp": 0.01096592, + "balance_loss_clip": 1.11044598, + "balance_loss_mlp": 1.0559895, + "epoch": 0.016654140989027507, + "flos": 40086011648160.0, + "grad_norm": 1.9637932152652398, + "language_loss": 0.81136715, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83584625, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.40625, + "step": 277, + "time_per_iteration": 2.787691116333008 + }, + { + "auxiliary_loss_clip": 0.0134879, + "auxiliary_loss_mlp": 0.010898, + "balance_loss_clip": 1.1042192, + "balance_loss_mlp": 1.0467658, + "epoch": 0.016714264241695476, + "flos": 15016035437280.0, + "grad_norm": 2.8891477302681676, + "language_loss": 0.80921483, + "learning_rate": 3.623356141983041e-06, + "loss": 0.83360082, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.43041992, + "step": 278, + "time_per_iteration": 2.7153420448303223 + }, + { + "auxiliary_loss_clip": 0.01351059, + "auxiliary_loss_mlp": 0.0109633, + "balance_loss_clip": 1.10869479, + "balance_loss_mlp": 1.05420136, + "epoch": 0.016774387494363444, + "flos": 33097170101760.0, + "grad_norm": 1.6858832511257829, + "language_loss": 0.90536684, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.92984074, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.42138672, + "step": 279, + "time_per_iteration": 2.786332607269287 + }, + { + "auxiliary_loss_clip": 0.01360022, + "auxiliary_loss_mlp": 0.01108721, + "balance_loss_clip": 1.10923743, + "balance_loss_mlp": 1.06468463, + "epoch": 0.016834510747031413, + "flos": 24637841889120.0, + "grad_norm": 2.3956985664351715, + "language_loss": 0.93674767, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96143508, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 2.50976562, + "router_z_loss_mlp": 0.44042969, + "step": 280, + "time_per_iteration": 2.7480061054229736 + }, + { + "auxiliary_loss_clip": 0.0135822, + "auxiliary_loss_mlp": 0.0110031, + "balance_loss_clip": 1.10716808, + "balance_loss_mlp": 1.0553441, + "epoch": 0.016894633999699385, + "flos": 33280363840320.0, + "grad_norm": 1.8678597052072532, + "language_loss": 0.74231416, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76689947, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 2.51367188, + "router_z_loss_mlp": 0.44995117, + "step": 281, + "time_per_iteration": 2.7728865146636963 + }, + { + "auxiliary_loss_clip": 0.01355956, + "auxiliary_loss_mlp": 0.01119294, + "balance_loss_clip": 1.11130023, + "balance_loss_mlp": 1.0778811, + "epoch": 0.016954757252367354, + "flos": 18184197924000.0, + "grad_norm": 3.5854610133667775, + "language_loss": 0.80354416, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82829666, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 2.4453125, + "router_z_loss_mlp": 0.41430664, + "step": 282, + "time_per_iteration": 2.6820812225341797 + }, + { + "auxiliary_loss_clip": 0.01361008, + "auxiliary_loss_mlp": 0.01113912, + "balance_loss_clip": 1.11170661, + "balance_loss_mlp": 1.07006657, + "epoch": 0.017014880505035322, + "flos": 26062307275680.0, + "grad_norm": 1.8934845025867448, + "language_loss": 0.77685583, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.80160499, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 2.4921875, + "router_z_loss_mlp": 0.43847656, + "step": 283, + "time_per_iteration": 2.758906841278076 + }, + { + "auxiliary_loss_clip": 0.01360908, + "auxiliary_loss_mlp": 0.01093541, + "balance_loss_clip": 1.11382985, + "balance_loss_mlp": 1.05215192, + "epoch": 0.01707500375770329, + "flos": 43114854260160.0, + "grad_norm": 2.081076557312628, + "language_loss": 0.84284031, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.86738479, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 2.47070312, + "router_z_loss_mlp": 0.41381836, + "step": 284, + "time_per_iteration": 2.8334884643554688 + }, + { + "auxiliary_loss_clip": 0.01352007, + "auxiliary_loss_mlp": 0.010973, + "balance_loss_clip": 1.10896611, + "balance_loss_mlp": 1.05290627, + "epoch": 0.01713512701037126, + "flos": 28775524212000.0, + "grad_norm": 2.221355893459177, + "language_loss": 0.97302592, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99751902, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.44433594, + "step": 285, + "time_per_iteration": 2.7523088455200195 + }, + { + "auxiliary_loss_clip": 0.01354713, + "auxiliary_loss_mlp": 0.0109953, + "balance_loss_clip": 1.11059892, + "balance_loss_mlp": 1.05880797, + "epoch": 0.01719525026303923, + "flos": 33720561100800.0, + "grad_norm": 2.048984921949776, + "language_loss": 0.93914938, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96369183, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 2.44335938, + "router_z_loss_mlp": 0.4074707, + "step": 286, + "time_per_iteration": 2.7718563079833984 + }, + { + "auxiliary_loss_clip": 0.01346712, + "auxiliary_loss_mlp": 0.01085014, + "balance_loss_clip": 1.10580015, + "balance_loss_mlp": 1.04276633, + "epoch": 0.0172553735157072, + "flos": 32921188439040.0, + "grad_norm": 1.737549150927313, + "language_loss": 0.9214623, + "learning_rate": 3.643869982119001e-06, + "loss": 0.9457795, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 2.40820312, + "router_z_loss_mlp": 0.42285156, + "step": 287, + "time_per_iteration": 2.762650489807129 + }, + { + "auxiliary_loss_clip": 0.01353966, + "auxiliary_loss_mlp": 0.01089313, + "balance_loss_clip": 1.10722435, + "balance_loss_mlp": 1.04723167, + "epoch": 0.01731549676837517, + "flos": 17152085240640.0, + "grad_norm": 3.0326610871019577, + "language_loss": 1.01413107, + "learning_rate": 3.646109470232502e-06, + "loss": 1.03856385, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 2.46875, + "router_z_loss_mlp": 0.42041016, + "step": 288, + "time_per_iteration": 2.6587555408477783 + }, + { + "auxiliary_loss_clip": 0.01242378, + "auxiliary_loss_mlp": 0.01089737, + "balance_loss_clip": 1.11122704, + "balance_loss_mlp": 1.07202208, + "epoch": 0.017375620021043137, + "flos": 81155001219840.0, + "grad_norm": 0.9117410157420267, + "language_loss": 0.63817656, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66149771, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.17749023, + "step": 289, + "time_per_iteration": 3.5115158557891846 + }, + { + "auxiliary_loss_clip": 0.01353388, + "auxiliary_loss_mlp": 0.01109382, + "balance_loss_clip": 1.11140347, + "balance_loss_mlp": 1.06789684, + "epoch": 0.01743574327371111, + "flos": 18577928214720.0, + "grad_norm": 2.5473785829121045, + "language_loss": 0.88239682, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.90702456, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 2.41992188, + "router_z_loss_mlp": 0.41479492, + "step": 290, + "time_per_iteration": 2.696962833404541 + }, + { + "auxiliary_loss_clip": 0.01354243, + "auxiliary_loss_mlp": 0.01095283, + "balance_loss_clip": 1.11103022, + "balance_loss_mlp": 1.05317819, + "epoch": 0.017495866526379078, + "flos": 30961647023040.0, + "grad_norm": 2.6903609348039352, + "language_loss": 0.84760571, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.87210095, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.4206543, + "step": 291, + "time_per_iteration": 2.7629806995391846 + }, + { + "auxiliary_loss_clip": 0.01356323, + "auxiliary_loss_mlp": 0.01103471, + "balance_loss_clip": 1.11615515, + "balance_loss_mlp": 1.05879188, + "epoch": 0.017555989779047047, + "flos": 32164312053600.0, + "grad_norm": 1.567497440135688, + "language_loss": 0.72813439, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75273234, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 2.40429688, + "router_z_loss_mlp": 0.44702148, + "step": 292, + "time_per_iteration": 2.787870168685913 + }, + { + "auxiliary_loss_clip": 0.01348103, + "auxiliary_loss_mlp": 0.01100875, + "balance_loss_clip": 1.10908127, + "balance_loss_mlp": 1.05731559, + "epoch": 0.017616113031715015, + "flos": 27258651610560.0, + "grad_norm": 3.359084847187195, + "language_loss": 0.87650871, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.90099853, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 2.38671875, + "router_z_loss_mlp": 0.43554688, + "step": 293, + "time_per_iteration": 4.171454429626465 + }, + { + "auxiliary_loss_clip": 0.01347736, + "auxiliary_loss_mlp": 0.01110017, + "balance_loss_clip": 1.10835826, + "balance_loss_mlp": 1.06755424, + "epoch": 0.017676236284382984, + "flos": 24595467164640.0, + "grad_norm": 2.4659055906343683, + "language_loss": 0.80661219, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83118975, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.42456055, + "step": 294, + "time_per_iteration": 5.705071687698364 + }, + { + "auxiliary_loss_clip": 0.01347029, + "auxiliary_loss_mlp": 0.01112896, + "balance_loss_clip": 1.10467386, + "balance_loss_mlp": 1.07048106, + "epoch": 0.017736359537050956, + "flos": 30778372249920.0, + "grad_norm": 2.096502180864424, + "language_loss": 0.84204847, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.86664772, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 2.42382812, + "router_z_loss_mlp": 0.42382812, + "step": 295, + "time_per_iteration": 2.7386560440063477 + }, + { + "auxiliary_loss_clip": 0.01354217, + "auxiliary_loss_mlp": 0.01113739, + "balance_loss_clip": 1.11698818, + "balance_loss_mlp": 1.07208729, + "epoch": 0.017796482789718925, + "flos": 24817470107040.0, + "grad_norm": 4.468106613543081, + "language_loss": 0.84660363, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.87128317, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 2.37304688, + "router_z_loss_mlp": 0.41674805, + "step": 296, + "time_per_iteration": 2.7783353328704834 + }, + { + "auxiliary_loss_clip": 0.01353435, + "auxiliary_loss_mlp": 0.01100011, + "balance_loss_clip": 1.11095834, + "balance_loss_mlp": 1.0585978, + "epoch": 0.017856606042386893, + "flos": 27307590134400.0, + "grad_norm": 2.411291238208663, + "language_loss": 0.87727892, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90181339, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 2.421875, + "router_z_loss_mlp": 0.41430664, + "step": 297, + "time_per_iteration": 2.726285696029663 + }, + { + "auxiliary_loss_clip": 0.01351335, + "auxiliary_loss_mlp": 0.01104197, + "balance_loss_clip": 1.10855973, + "balance_loss_mlp": 1.06271243, + "epoch": 0.017916729295054862, + "flos": 24684714031680.0, + "grad_norm": 2.696192692698362, + "language_loss": 0.88659906, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.91115433, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 2.4296875, + "router_z_loss_mlp": 0.41479492, + "step": 298, + "time_per_iteration": 2.7133591175079346 + }, + { + "auxiliary_loss_clip": 0.01348908, + "auxiliary_loss_mlp": 0.01125796, + "balance_loss_clip": 1.11088204, + "balance_loss_mlp": 1.08175981, + "epoch": 0.01797685254772283, + "flos": 23660785838880.0, + "grad_norm": 1.9430311290266735, + "language_loss": 0.88572812, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91047513, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.44018555, + "step": 299, + "time_per_iteration": 2.7483291625976562 + }, + { + "auxiliary_loss_clip": 0.01354892, + "auxiliary_loss_mlp": 0.0110661, + "balance_loss_clip": 1.11093032, + "balance_loss_mlp": 1.06364679, + "epoch": 0.018036975800390802, + "flos": 29804922237600.0, + "grad_norm": 4.109941915223728, + "language_loss": 0.64674687, + "learning_rate": 3.672392800539357e-06, + "loss": 0.67136192, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 2.43554688, + "router_z_loss_mlp": 0.42944336, + "step": 300, + "time_per_iteration": 2.76416015625 + }, + { + "auxiliary_loss_clip": 0.01351274, + "auxiliary_loss_mlp": 0.0111244, + "balance_loss_clip": 1.11155701, + "balance_loss_mlp": 1.0703826, + "epoch": 0.01809709905305877, + "flos": 19253093947200.0, + "grad_norm": 4.425478990445218, + "language_loss": 0.88327259, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90790975, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 2.39453125, + "router_z_loss_mlp": 0.4206543, + "step": 301, + "time_per_iteration": 2.693756341934204 + }, + { + "auxiliary_loss_clip": 0.01230185, + "auxiliary_loss_mlp": 0.0105587, + "balance_loss_clip": 1.10462332, + "balance_loss_mlp": 1.03962231, + "epoch": 0.01815722230572674, + "flos": 82177349238720.0, + "grad_norm": 0.9521431412329696, + "language_loss": 0.62244856, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64530915, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.16247559, + "step": 302, + "time_per_iteration": 3.470306634902954 + }, + { + "auxiliary_loss_clip": 0.01342631, + "auxiliary_loss_mlp": 0.01113811, + "balance_loss_clip": 1.1060183, + "balance_loss_mlp": 1.07010865, + "epoch": 0.01821734555839471, + "flos": 18895296133440.0, + "grad_norm": 2.12669954879805, + "language_loss": 0.89611793, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.92068231, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 2.3671875, + "router_z_loss_mlp": 0.43725586, + "step": 303, + "time_per_iteration": 2.7146620750427246 + }, + { + "auxiliary_loss_clip": 0.01350659, + "auxiliary_loss_mlp": 0.01118636, + "balance_loss_clip": 1.11283183, + "balance_loss_mlp": 1.07481456, + "epoch": 0.018277468811062677, + "flos": 29403979871040.0, + "grad_norm": 1.7111379130115116, + "language_loss": 0.80197901, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82667196, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 2.37890625, + "router_z_loss_mlp": 0.43823242, + "step": 304, + "time_per_iteration": 2.748744249343872 + }, + { + "auxiliary_loss_clip": 0.01345194, + "auxiliary_loss_mlp": 0.01098318, + "balance_loss_clip": 1.11367309, + "balance_loss_mlp": 1.05709577, + "epoch": 0.01833759206373065, + "flos": 25574670630720.0, + "grad_norm": 1.5614525644233985, + "language_loss": 0.82872617, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85316122, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.41210938, + "step": 305, + "time_per_iteration": 2.7408607006073 + }, + { + "auxiliary_loss_clip": 0.01345239, + "auxiliary_loss_mlp": 0.01090762, + "balance_loss_clip": 1.10580111, + "balance_loss_mlp": 1.05087447, + "epoch": 0.018397715316398618, + "flos": 23660542735200.0, + "grad_norm": 1.848532217749583, + "language_loss": 0.90616894, + "learning_rate": 3.685142765363119e-06, + "loss": 0.930529, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.39892578, + "step": 306, + "time_per_iteration": 2.6995487213134766 + }, + { + "auxiliary_loss_clip": 0.01338581, + "auxiliary_loss_mlp": 0.010895, + "balance_loss_clip": 1.10439253, + "balance_loss_mlp": 1.04913568, + "epoch": 0.018457838569066586, + "flos": 35548197304320.0, + "grad_norm": 2.273119042297212, + "language_loss": 0.86589783, + "learning_rate": 3.687243426879095e-06, + "loss": 0.89017868, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 2.34179688, + "router_z_loss_mlp": 0.40380859, + "step": 307, + "time_per_iteration": 2.785245418548584 + }, + { + "auxiliary_loss_clip": 0.01341176, + "auxiliary_loss_mlp": 0.01105055, + "balance_loss_clip": 1.10965097, + "balance_loss_mlp": 1.06056583, + "epoch": 0.018517961821734555, + "flos": 23438296689120.0, + "grad_norm": 2.1938543486051865, + "language_loss": 0.7135241, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.73798645, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 2.31445312, + "router_z_loss_mlp": 0.4453125, + "step": 308, + "time_per_iteration": 2.7191102504730225 + }, + { + "auxiliary_loss_clip": 0.01344541, + "auxiliary_loss_mlp": 0.01094829, + "balance_loss_clip": 1.10452902, + "balance_loss_mlp": 1.05386901, + "epoch": 0.018578085074402523, + "flos": 24237021074400.0, + "grad_norm": 1.941343579827699, + "language_loss": 0.91868049, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94307423, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 2.39648438, + "router_z_loss_mlp": 0.40966797, + "step": 309, + "time_per_iteration": 2.7133896350860596 + }, + { + "auxiliary_loss_clip": 0.01357817, + "auxiliary_loss_mlp": 0.01099469, + "balance_loss_clip": 1.10998273, + "balance_loss_mlp": 1.05567133, + "epoch": 0.018638208327070496, + "flos": 36127106680320.0, + "grad_norm": 2.002553683171886, + "language_loss": 0.72782803, + "learning_rate": 3.69350459956065e-06, + "loss": 0.75240093, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 2.4765625, + "router_z_loss_mlp": 0.43798828, + "step": 310, + "time_per_iteration": 2.7977895736694336 + }, + { + "auxiliary_loss_clip": 0.01344984, + "auxiliary_loss_mlp": 0.01106816, + "balance_loss_clip": 1.1122489, + "balance_loss_mlp": 1.06573665, + "epoch": 0.018698331579738464, + "flos": 55803785803200.0, + "grad_norm": 1.9836593778067264, + "language_loss": 0.74261546, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76713347, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 2.33007812, + "router_z_loss_mlp": 0.41040039, + "step": 311, + "time_per_iteration": 2.9515538215637207 + }, + { + "auxiliary_loss_clip": 0.01351798, + "auxiliary_loss_mlp": 0.01103393, + "balance_loss_clip": 1.10822213, + "balance_loss_mlp": 1.06338596, + "epoch": 0.018758454832406433, + "flos": 24994343149920.0, + "grad_norm": 2.4357556596407957, + "language_loss": 0.91601741, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.94056928, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 2.4375, + "router_z_loss_mlp": 0.40014648, + "step": 312, + "time_per_iteration": 2.741560220718384 + }, + { + "auxiliary_loss_clip": 0.01352154, + "auxiliary_loss_mlp": 0.01115859, + "balance_loss_clip": 1.11091793, + "balance_loss_mlp": 1.07241881, + "epoch": 0.0188185780850744, + "flos": 19251797394240.0, + "grad_norm": 2.099671076781367, + "language_loss": 0.89829957, + "learning_rate": 3.699705471087043e-06, + "loss": 0.92297971, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 2.41210938, + "router_z_loss_mlp": 0.43457031, + "step": 313, + "time_per_iteration": 2.6981375217437744 + }, + { + "auxiliary_loss_clip": 0.01351521, + "auxiliary_loss_mlp": 0.01091801, + "balance_loss_clip": 1.10794163, + "balance_loss_mlp": 1.04898143, + "epoch": 0.018878701337742373, + "flos": 27400645625760.0, + "grad_norm": 2.1343211893728244, + "language_loss": 0.73414469, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75857788, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 2.43359375, + "router_z_loss_mlp": 0.42822266, + "step": 314, + "time_per_iteration": 2.7154417037963867 + }, + { + "auxiliary_loss_clip": 0.01343304, + "auxiliary_loss_mlp": 0.01108624, + "balance_loss_clip": 1.10795176, + "balance_loss_mlp": 1.06730604, + "epoch": 0.018938824590410342, + "flos": 37819110081600.0, + "grad_norm": 2.957690164505911, + "language_loss": 0.91006649, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.93458581, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 2.35351562, + "router_z_loss_mlp": 0.41333008, + "step": 315, + "time_per_iteration": 2.7657978534698486 + }, + { + "auxiliary_loss_clip": 0.01345637, + "auxiliary_loss_mlp": 0.0109594, + "balance_loss_clip": 1.10897171, + "balance_loss_mlp": 1.05428863, + "epoch": 0.01899894784307831, + "flos": 28380699954720.0, + "grad_norm": 2.013392991939233, + "language_loss": 0.80761063, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.83202636, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.41674805, + "step": 316, + "time_per_iteration": 2.7500691413879395 + }, + { + "auxiliary_loss_clip": 0.01340309, + "auxiliary_loss_mlp": 0.01095471, + "balance_loss_clip": 1.10693729, + "balance_loss_mlp": 1.05374742, + "epoch": 0.01905907109574628, + "flos": 21301841712960.0, + "grad_norm": 2.6654898341116913, + "language_loss": 0.90488124, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.92923903, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 2.33398438, + "router_z_loss_mlp": 0.41723633, + "step": 317, + "time_per_iteration": 2.8286242485046387 + }, + { + "auxiliary_loss_clip": 0.01337581, + "auxiliary_loss_mlp": 0.01091334, + "balance_loss_clip": 1.10549951, + "balance_loss_mlp": 1.04970574, + "epoch": 0.019119194348414248, + "flos": 18266799957120.0, + "grad_norm": 2.4507267960639902, + "language_loss": 0.9116627, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93595183, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 2.32226562, + "router_z_loss_mlp": 0.41625977, + "step": 318, + "time_per_iteration": 2.7770791053771973 + }, + { + "auxiliary_loss_clip": 0.01340124, + "auxiliary_loss_mlp": 0.01085699, + "balance_loss_clip": 1.10620284, + "balance_loss_mlp": 1.04702747, + "epoch": 0.01917931760108222, + "flos": 31094646202080.0, + "grad_norm": 2.3928010538222697, + "language_loss": 0.94078964, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.96504784, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 2.33789062, + "router_z_loss_mlp": 0.38647461, + "step": 319, + "time_per_iteration": 2.7858471870422363 + }, + { + "auxiliary_loss_clip": 0.01241382, + "auxiliary_loss_mlp": 0.01106239, + "balance_loss_clip": 1.11870956, + "balance_loss_mlp": 1.08945465, + "epoch": 0.01923944085375019, + "flos": 87315176111040.0, + "grad_norm": 0.951417329197908, + "language_loss": 0.59853435, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62201059, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.16796875, + "step": 320, + "time_per_iteration": 3.2794578075408936 + }, + { + "auxiliary_loss_clip": 0.01337264, + "auxiliary_loss_mlp": 0.01098655, + "balance_loss_clip": 1.10522854, + "balance_loss_mlp": 1.05924416, + "epoch": 0.019299564106418157, + "flos": 24016557788640.0, + "grad_norm": 3.4873071133309343, + "language_loss": 0.89944589, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92380512, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.39404297, + "step": 321, + "time_per_iteration": 2.7669084072113037 + }, + { + "auxiliary_loss_clip": 0.01346409, + "auxiliary_loss_mlp": 0.01107624, + "balance_loss_clip": 1.10890913, + "balance_loss_mlp": 1.06678259, + "epoch": 0.019359687359086126, + "flos": 29757725956800.0, + "grad_norm": 1.987990358402138, + "language_loss": 0.82682914, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.8513695, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 2.37695312, + "router_z_loss_mlp": 0.40869141, + "step": 322, + "time_per_iteration": 2.7524900436401367 + }, + { + "auxiliary_loss_clip": 0.01347369, + "auxiliary_loss_mlp": 0.01086541, + "balance_loss_clip": 1.10749543, + "balance_loss_mlp": 1.04810774, + "epoch": 0.019419810611754094, + "flos": 29225769757920.0, + "grad_norm": 2.060950135933644, + "language_loss": 0.73344874, + "learning_rate": 3.719954063833981e-06, + "loss": 0.75778788, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 2.40039062, + "router_z_loss_mlp": 0.3840332, + "step": 323, + "time_per_iteration": 2.7398557662963867 + }, + { + "auxiliary_loss_clip": 0.0133564, + "auxiliary_loss_mlp": 0.01086794, + "balance_loss_clip": 1.10256553, + "balance_loss_mlp": 1.04704952, + "epoch": 0.019479933864422067, + "flos": 27040376257920.0, + "grad_norm": 1.936501671747128, + "language_loss": 0.92321062, + "learning_rate": 3.721944334919596e-06, + "loss": 0.9474349, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 2.33203125, + "router_z_loss_mlp": 0.39746094, + "step": 324, + "time_per_iteration": 2.702643394470215 + }, + { + "auxiliary_loss_clip": 0.01342559, + "auxiliary_loss_mlp": 0.01086095, + "balance_loss_clip": 1.10845542, + "balance_loss_mlp": 1.04851985, + "epoch": 0.019540057117090035, + "flos": 27133512783840.0, + "grad_norm": 2.902325437908421, + "language_loss": 0.64668775, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67097425, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.37573242, + "step": 325, + "time_per_iteration": 2.6937289237976074 + }, + { + "auxiliary_loss_clip": 0.01345288, + "auxiliary_loss_mlp": 0.01102584, + "balance_loss_clip": 1.11543953, + "balance_loss_mlp": 1.06233859, + "epoch": 0.019600180369758004, + "flos": 28158413391360.0, + "grad_norm": 1.6472543373801916, + "language_loss": 0.76293606, + "learning_rate": 3.72590651470665e-06, + "loss": 0.78741479, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 2.29882812, + "router_z_loss_mlp": 0.40234375, + "step": 326, + "time_per_iteration": 2.713787317276001 + }, + { + "auxiliary_loss_clip": 0.01336582, + "auxiliary_loss_mlp": 0.01103561, + "balance_loss_clip": 1.11017156, + "balance_loss_mlp": 1.06336331, + "epoch": 0.019660303622425972, + "flos": 31006736405280.0, + "grad_norm": 2.2086720269193085, + "language_loss": 0.79802966, + "learning_rate": 3.727878498433505e-06, + "loss": 0.82243109, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.40161133, + "step": 327, + "time_per_iteration": 2.756892204284668 + }, + { + "auxiliary_loss_clip": 0.0134441, + "auxiliary_loss_mlp": 0.01109194, + "balance_loss_clip": 1.10982561, + "balance_loss_mlp": 1.07118976, + "epoch": 0.01972042687509394, + "flos": 28867364184960.0, + "grad_norm": 2.05627647731948, + "language_loss": 0.80769616, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.83223218, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 2.34765625, + "router_z_loss_mlp": 0.38012695, + "step": 328, + "time_per_iteration": 2.7059073448181152 + }, + { + "auxiliary_loss_clip": 0.01341446, + "auxiliary_loss_mlp": 0.01092741, + "balance_loss_clip": 1.1052047, + "balance_loss_mlp": 1.05282998, + "epoch": 0.019780550127761913, + "flos": 22236117865920.0, + "grad_norm": 4.695161655858131, + "language_loss": 0.94071913, + "learning_rate": 3.731804438545683e-06, + "loss": 0.96506095, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 2.359375, + "router_z_loss_mlp": 0.39916992, + "step": 329, + "time_per_iteration": 2.6866567134857178 + }, + { + "auxiliary_loss_clip": 0.01351704, + "auxiliary_loss_mlp": 0.01108291, + "balance_loss_clip": 1.11207104, + "balance_loss_mlp": 1.06847477, + "epoch": 0.01984067338042988, + "flos": 27355596760800.0, + "grad_norm": 2.072109328152703, + "language_loss": 0.74404335, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.76864332, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 2.3984375, + "router_z_loss_mlp": 0.39794922, + "step": 330, + "time_per_iteration": 2.738645553588867 + }, + { + "auxiliary_loss_clip": 0.01344247, + "auxiliary_loss_mlp": 0.01114085, + "balance_loss_clip": 1.10806835, + "balance_loss_mlp": 1.07448351, + "epoch": 0.01990079663309785, + "flos": 20810558512800.0, + "grad_norm": 2.9849097254717822, + "language_loss": 0.93744504, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.96202832, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.39599609, + "step": 331, + "time_per_iteration": 2.681051254272461 + }, + { + "auxiliary_loss_clip": 0.01332304, + "auxiliary_loss_mlp": 0.01080627, + "balance_loss_clip": 1.10734367, + "balance_loss_mlp": 1.04276609, + "epoch": 0.01996091988576582, + "flos": 19476190856160.0, + "grad_norm": 2.065799731804318, + "language_loss": 0.92872703, + "learning_rate": 3.737648825272422e-06, + "loss": 0.9528563, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.37841797, + "step": 332, + "time_per_iteration": 2.692413091659546 + }, + { + "auxiliary_loss_clip": 0.01342799, + "auxiliary_loss_mlp": 0.01093864, + "balance_loss_clip": 1.11457753, + "balance_loss_mlp": 1.05433357, + "epoch": 0.02002104313843379, + "flos": 28780467320160.0, + "grad_norm": 2.9532473160340835, + "language_loss": 0.75510085, + "learning_rate": 3.739585224276384e-06, + "loss": 0.77946746, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 2.28515625, + "router_z_loss_mlp": 0.39526367, + "step": 333, + "time_per_iteration": 7.122416019439697 + }, + { + "auxiliary_loss_clip": 0.01341722, + "auxiliary_loss_mlp": 0.01078428, + "balance_loss_clip": 1.10728025, + "balance_loss_mlp": 1.04049516, + "epoch": 0.02008116639110176, + "flos": 41603613560640.0, + "grad_norm": 2.1178167092774483, + "language_loss": 0.78406578, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.80826724, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 2.33984375, + "router_z_loss_mlp": 0.37939453, + "step": 334, + "time_per_iteration": 2.8218472003936768 + }, + { + "auxiliary_loss_clip": 0.01340092, + "auxiliary_loss_mlp": 0.01094023, + "balance_loss_clip": 1.10442483, + "balance_loss_mlp": 1.0533483, + "epoch": 0.020141289643769728, + "flos": 24017935376160.0, + "grad_norm": 1.865383581718395, + "language_loss": 0.83183682, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85617793, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 2.35546875, + "router_z_loss_mlp": 0.40673828, + "step": 335, + "time_per_iteration": 2.698545217514038 + }, + { + "auxiliary_loss_clip": 0.01335558, + "auxiliary_loss_mlp": 0.01083788, + "balance_loss_clip": 1.10600042, + "balance_loss_mlp": 1.04566455, + "epoch": 0.020201412896437697, + "flos": 25307699857920.0, + "grad_norm": 1.9852303706377383, + "language_loss": 0.91950476, + "learning_rate": 3.745359722027911e-06, + "loss": 0.94369817, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 2.296875, + "router_z_loss_mlp": 0.38085938, + "step": 336, + "time_per_iteration": 2.7461371421813965 + }, + { + "auxiliary_loss_clip": 0.01334145, + "auxiliary_loss_mlp": 0.01076578, + "balance_loss_clip": 1.10176551, + "balance_loss_mlp": 1.03888369, + "epoch": 0.020261536149105665, + "flos": 24729195654720.0, + "grad_norm": 1.6680336952484474, + "language_loss": 0.88641059, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.91051787, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 2.32617188, + "router_z_loss_mlp": 0.37646484, + "step": 337, + "time_per_iteration": 2.6969923973083496 + }, + { + "auxiliary_loss_clip": 0.01325459, + "auxiliary_loss_mlp": 0.01099602, + "balance_loss_clip": 1.09985769, + "balance_loss_mlp": 1.05945253, + "epoch": 0.020321659401773638, + "flos": 31541083123680.0, + "grad_norm": 1.421622212171608, + "language_loss": 0.89802516, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.92227578, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.40136719, + "step": 338, + "time_per_iteration": 2.831451177597046 + }, + { + "auxiliary_loss_clip": 0.01334846, + "auxiliary_loss_mlp": 0.0110142, + "balance_loss_clip": 1.10256231, + "balance_loss_mlp": 1.06191373, + "epoch": 0.020381782654441606, + "flos": 21347093164320.0, + "grad_norm": 2.6874606535702106, + "language_loss": 0.85267317, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.87703586, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 2.32421875, + "router_z_loss_mlp": 0.39526367, + "step": 339, + "time_per_iteration": 2.678534746170044 + }, + { + "auxiliary_loss_clip": 0.01341221, + "auxiliary_loss_mlp": 0.01097028, + "balance_loss_clip": 1.10675406, + "balance_loss_mlp": 1.05747473, + "epoch": 0.020441905907109575, + "flos": 29582716708800.0, + "grad_norm": 1.742812128356363, + "language_loss": 0.88683891, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91122139, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 0.39575195, + "step": 340, + "time_per_iteration": 2.7248125076293945 + }, + { + "auxiliary_loss_clip": 0.01340002, + "auxiliary_loss_mlp": 0.01082051, + "balance_loss_clip": 1.10519826, + "balance_loss_mlp": 1.04142463, + "epoch": 0.020502029159777543, + "flos": 27129542090400.0, + "grad_norm": 2.0466570777487747, + "language_loss": 0.88086569, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.90508616, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 2.34960938, + "router_z_loss_mlp": 0.40649414, + "step": 341, + "time_per_iteration": 2.7278361320495605 + }, + { + "auxiliary_loss_clip": 0.0133869, + "auxiliary_loss_mlp": 0.01098202, + "balance_loss_clip": 1.10227931, + "balance_loss_mlp": 1.05867243, + "epoch": 0.020562152412445512, + "flos": 29270291898240.0, + "grad_norm": 2.3863265476893027, + "language_loss": 0.80598676, + "learning_rate": 3.756755633390458e-06, + "loss": 0.8303557, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 2.36328125, + "router_z_loss_mlp": 0.39575195, + "step": 342, + "time_per_iteration": 2.7447617053985596 + }, + { + "auxiliary_loss_clip": 0.01328824, + "auxiliary_loss_mlp": 0.01091868, + "balance_loss_clip": 1.10166168, + "balance_loss_mlp": 1.05012059, + "epoch": 0.020622275665113484, + "flos": 32916204813600.0, + "grad_norm": 1.7423273419819363, + "language_loss": 0.89334285, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.91754979, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.41772461, + "step": 343, + "time_per_iteration": 2.737379312515259 + }, + { + "auxiliary_loss_clip": 0.01340843, + "auxiliary_loss_mlp": 0.01092645, + "balance_loss_clip": 1.11031294, + "balance_loss_mlp": 1.05504632, + "epoch": 0.020682398917781453, + "flos": 27577640220480.0, + "grad_norm": 1.7582725597226116, + "language_loss": 0.78332818, + "learning_rate": 3.7605098841644e-06, + "loss": 0.80766308, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 2.30664062, + "router_z_loss_mlp": 0.37597656, + "step": 344, + "time_per_iteration": 2.7058844566345215 + }, + { + "auxiliary_loss_clip": 0.01330359, + "auxiliary_loss_mlp": 0.01099091, + "balance_loss_clip": 1.10420752, + "balance_loss_mlp": 1.05875063, + "epoch": 0.02074252217044942, + "flos": 18319101415200.0, + "grad_norm": 2.3987058044342815, + "language_loss": 0.75096524, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77525973, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.40356445, + "step": 345, + "time_per_iteration": 2.6830010414123535 + }, + { + "auxiliary_loss_clip": 0.01332583, + "auxiliary_loss_mlp": 0.01097292, + "balance_loss_clip": 1.10711837, + "balance_loss_mlp": 1.05757165, + "epoch": 0.02080264542311739, + "flos": 30917773159200.0, + "grad_norm": 3.999423328656303, + "language_loss": 0.90253299, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92683172, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 2.25585938, + "router_z_loss_mlp": 0.39746094, + "step": 346, + "time_per_iteration": 2.741513729095459 + }, + { + "auxiliary_loss_clip": 0.01332166, + "auxiliary_loss_mlp": 0.01089609, + "balance_loss_clip": 1.10505033, + "balance_loss_mlp": 1.05322683, + "epoch": 0.02086276867578536, + "flos": 29759589751680.0, + "grad_norm": 1.9319063344509455, + "language_loss": 0.79265606, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81687385, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.36401367, + "step": 347, + "time_per_iteration": 2.7663474082946777 + }, + { + "auxiliary_loss_clip": 0.01333515, + "auxiliary_loss_mlp": 0.01097491, + "balance_loss_clip": 1.1079483, + "balance_loss_mlp": 1.05762792, + "epoch": 0.02092289192845333, + "flos": 29849120239680.0, + "grad_norm": 1.7917797065930674, + "language_loss": 0.71329069, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.73760074, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 2.25390625, + "router_z_loss_mlp": 0.39868164, + "step": 348, + "time_per_iteration": 2.69480037689209 + }, + { + "auxiliary_loss_clip": 0.01337436, + "auxiliary_loss_mlp": 0.01100198, + "balance_loss_clip": 1.10555208, + "balance_loss_mlp": 1.06064451, + "epoch": 0.0209830151811213, + "flos": 21294264981600.0, + "grad_norm": 3.961747318679475, + "language_loss": 0.77439821, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79877454, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 2.31835938, + "router_z_loss_mlp": 0.39550781, + "step": 349, + "time_per_iteration": 2.6636102199554443 + }, + { + "auxiliary_loss_clip": 0.01321537, + "auxiliary_loss_mlp": 0.0110263, + "balance_loss_clip": 1.10365045, + "balance_loss_mlp": 1.06505513, + "epoch": 0.021043138433789268, + "flos": 29985320283840.0, + "grad_norm": 1.7032260513003705, + "language_loss": 0.85181218, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87605381, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 2.17773438, + "router_z_loss_mlp": 0.37597656, + "step": 350, + "time_per_iteration": 2.728710412979126 + }, + { + "auxiliary_loss_clip": 0.0133029, + "auxiliary_loss_mlp": 0.01085034, + "balance_loss_clip": 1.10772955, + "balance_loss_mlp": 1.04874694, + "epoch": 0.021103261686457236, + "flos": 29839922817120.0, + "grad_norm": 2.975963588049443, + "language_loss": 0.79908186, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82323515, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 2.22851562, + "router_z_loss_mlp": 0.36303711, + "step": 351, + "time_per_iteration": 2.710705280303955 + }, + { + "auxiliary_loss_clip": 0.01334476, + "auxiliary_loss_mlp": 0.01100078, + "balance_loss_clip": 1.10552692, + "balance_loss_mlp": 1.05973768, + "epoch": 0.021163384939125205, + "flos": 17915971115520.0, + "grad_norm": 2.805437514024386, + "language_loss": 0.87498307, + "learning_rate": 3.775311735671078e-06, + "loss": 0.89932859, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 2.29101562, + "router_z_loss_mlp": 0.40307617, + "step": 352, + "time_per_iteration": 2.643345832824707 + }, + { + "auxiliary_loss_clip": 0.01328548, + "auxiliary_loss_mlp": 0.01101975, + "balance_loss_clip": 1.10630929, + "balance_loss_mlp": 1.06301725, + "epoch": 0.021223508191793177, + "flos": 29887443236160.0, + "grad_norm": 2.153892660128195, + "language_loss": 0.82463461, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.8489399, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.38964844, + "step": 353, + "time_per_iteration": 2.7816567420959473 + }, + { + "auxiliary_loss_clip": 0.01325529, + "auxiliary_loss_mlp": 0.01093988, + "balance_loss_clip": 1.10472715, + "balance_loss_mlp": 1.05598366, + "epoch": 0.021283631444461146, + "flos": 29440682176320.0, + "grad_norm": 1.7723735574509578, + "language_loss": 0.80977255, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83396775, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 2.20507812, + "router_z_loss_mlp": 0.38037109, + "step": 354, + "time_per_iteration": 2.7383406162261963 + }, + { + "auxiliary_loss_clip": 0.01332343, + "auxiliary_loss_mlp": 0.01089524, + "balance_loss_clip": 1.10395956, + "balance_loss_mlp": 1.04939818, + "epoch": 0.021343754697129114, + "flos": 30739927701600.0, + "grad_norm": 4.829212225310324, + "language_loss": 0.81027794, + "learning_rate": 3.780775860546545e-06, + "loss": 0.83449662, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.40136719, + "step": 355, + "time_per_iteration": 2.7256178855895996 + }, + { + "auxiliary_loss_clip": 0.01327632, + "auxiliary_loss_mlp": 0.01087384, + "balance_loss_clip": 1.10261512, + "balance_loss_mlp": 1.04947543, + "epoch": 0.021403877949797083, + "flos": 21077772389280.0, + "grad_norm": 2.450376636172651, + "language_loss": 0.89729941, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.9214496, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 2.25195312, + "router_z_loss_mlp": 0.37915039, + "step": 356, + "time_per_iteration": 2.6887660026550293 + }, + { + "auxiliary_loss_clip": 0.01327335, + "auxiliary_loss_mlp": 0.01078348, + "balance_loss_clip": 1.10643077, + "balance_loss_mlp": 1.03896105, + "epoch": 0.021464001202465055, + "flos": 37726540797600.0, + "grad_norm": 1.8771753248035794, + "language_loss": 0.802827, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82688379, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.39404297, + "step": 357, + "time_per_iteration": 2.775463819503784 + }, + { + "auxiliary_loss_clip": 0.01328556, + "auxiliary_loss_mlp": 0.01080599, + "balance_loss_clip": 1.10336244, + "balance_loss_mlp": 1.0453372, + "epoch": 0.021524124455133024, + "flos": 22190906931840.0, + "grad_norm": 2.275755771379251, + "language_loss": 0.76531297, + "learning_rate": 3.786194003461506e-06, + "loss": 0.78940451, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.3527832, + "step": 358, + "time_per_iteration": 2.68503999710083 + }, + { + "auxiliary_loss_clip": 0.01324944, + "auxiliary_loss_mlp": 0.01083893, + "balance_loss_clip": 1.1008954, + "balance_loss_mlp": 1.04391003, + "epoch": 0.021584247707800992, + "flos": 16845575952960.0, + "grad_norm": 3.8375680000254206, + "language_loss": 0.8872335, + "learning_rate": 3.787989966086264e-06, + "loss": 0.91132182, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 2.24023438, + "router_z_loss_mlp": 0.3996582, + "step": 359, + "time_per_iteration": 2.695612907409668 + }, + { + "auxiliary_loss_clip": 0.01334601, + "auxiliary_loss_mlp": 0.01091734, + "balance_loss_clip": 1.10572612, + "balance_loss_mlp": 1.05547094, + "epoch": 0.02164437096046896, + "flos": 28424452266720.0, + "grad_norm": 2.3027763698627237, + "language_loss": 0.76247013, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78673351, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 2.2890625, + "router_z_loss_mlp": 0.36254883, + "step": 360, + "time_per_iteration": 2.7349913120269775 + }, + { + "auxiliary_loss_clip": 0.01235941, + "auxiliary_loss_mlp": 0.01109122, + "balance_loss_clip": 1.11595535, + "balance_loss_mlp": 1.09544826, + "epoch": 0.02170449421313693, + "flos": 84071866770720.0, + "grad_norm": 0.8549122842825692, + "language_loss": 0.6492545, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67270517, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.13708496, + "step": 361, + "time_per_iteration": 3.443009614944458 + }, + { + "auxiliary_loss_clip": 0.01324721, + "auxiliary_loss_mlp": 0.01081799, + "balance_loss_clip": 1.09851599, + "balance_loss_mlp": 1.04448628, + "epoch": 0.0217646174658049, + "flos": 30784247255520.0, + "grad_norm": 3.1066173038211344, + "language_loss": 0.7818203, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80588555, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 2.26171875, + "router_z_loss_mlp": 0.37280273, + "step": 362, + "time_per_iteration": 2.769745349884033 + }, + { + "auxiliary_loss_clip": 0.01330613, + "auxiliary_loss_mlp": 0.01091581, + "balance_loss_clip": 1.10380721, + "balance_loss_mlp": 1.05412591, + "epoch": 0.02182474071847287, + "flos": 27934425102240.0, + "grad_norm": 2.0897539044925955, + "language_loss": 0.92189771, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.94611967, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 2.26953125, + "router_z_loss_mlp": 0.37475586, + "step": 363, + "time_per_iteration": 2.790285110473633 + }, + { + "auxiliary_loss_clip": 0.01323952, + "auxiliary_loss_mlp": 0.01097318, + "balance_loss_clip": 1.10261655, + "balance_loss_mlp": 1.0609827, + "epoch": 0.02188486397114084, + "flos": 28869025393440.0, + "grad_norm": 8.56920265868681, + "language_loss": 0.90015769, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.92437041, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 2.21484375, + "router_z_loss_mlp": 0.36303711, + "step": 364, + "time_per_iteration": 2.7886602878570557 + }, + { + "auxiliary_loss_clip": 0.01332593, + "auxiliary_loss_mlp": 0.01088715, + "balance_loss_clip": 1.10865045, + "balance_loss_mlp": 1.05073452, + "epoch": 0.021944987223808807, + "flos": 26287632635040.0, + "grad_norm": 2.1387694328040285, + "language_loss": 0.79446471, + "learning_rate": 3.798661793553676e-06, + "loss": 0.81867778, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 2.23828125, + "router_z_loss_mlp": 0.37963867, + "step": 365, + "time_per_iteration": 2.7317941188812256 + }, + { + "auxiliary_loss_clip": 0.01328322, + "auxiliary_loss_mlp": 0.01098284, + "balance_loss_clip": 1.10590279, + "balance_loss_mlp": 1.05825412, + "epoch": 0.022005110476476776, + "flos": 19609352104320.0, + "grad_norm": 2.257735486782864, + "language_loss": 0.84739542, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.87166142, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 2.22265625, + "router_z_loss_mlp": 0.40039062, + "step": 366, + "time_per_iteration": 2.6819469928741455 + }, + { + "auxiliary_loss_clip": 0.01331703, + "auxiliary_loss_mlp": 0.01084667, + "balance_loss_clip": 1.1066525, + "balance_loss_mlp": 1.04964304, + "epoch": 0.022065233729144748, + "flos": 26153539489440.0, + "grad_norm": 1.773627934834949, + "language_loss": 0.86819434, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89235801, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 2.25, + "router_z_loss_mlp": 0.35009766, + "step": 367, + "time_per_iteration": 2.7080345153808594 + }, + { + "auxiliary_loss_clip": 0.01332713, + "auxiliary_loss_mlp": 0.01089756, + "balance_loss_clip": 1.10334301, + "balance_loss_mlp": 1.05127501, + "epoch": 0.022125356981812717, + "flos": 26287713669600.0, + "grad_norm": 3.2792449836264344, + "language_loss": 0.8513329, + "learning_rate": 3.803932100062912e-06, + "loss": 0.8755576, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 2.29492188, + "router_z_loss_mlp": 0.38500977, + "step": 368, + "time_per_iteration": 2.6743085384368896 + }, + { + "auxiliary_loss_clip": 0.01333506, + "auxiliary_loss_mlp": 0.01076533, + "balance_loss_clip": 1.10276413, + "balance_loss_mlp": 1.04072273, + "epoch": 0.022185480234480685, + "flos": 25263258752160.0, + "grad_norm": 2.2005573083616095, + "language_loss": 0.7630291, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.78712946, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 2.30859375, + "router_z_loss_mlp": 0.35791016, + "step": 369, + "time_per_iteration": 2.6924965381622314 + }, + { + "auxiliary_loss_clip": 0.01327544, + "auxiliary_loss_mlp": 0.0109387, + "balance_loss_clip": 1.10383093, + "balance_loss_mlp": 1.05734468, + "epoch": 0.022245603487148654, + "flos": 30741021668160.0, + "grad_norm": 2.0537072048441902, + "language_loss": 0.82745087, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.85166502, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 2.234375, + "router_z_loss_mlp": 0.36523438, + "step": 370, + "time_per_iteration": 2.731492280960083 + }, + { + "auxiliary_loss_clip": 0.01324504, + "auxiliary_loss_mlp": 0.01096657, + "balance_loss_clip": 1.10269988, + "balance_loss_mlp": 1.06006026, + "epoch": 0.022305726739816623, + "flos": 26107558727040.0, + "grad_norm": 1.7165241478262896, + "language_loss": 0.81678706, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.84099865, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 2.21875, + "router_z_loss_mlp": 0.3659668, + "step": 371, + "time_per_iteration": 2.7231714725494385 + }, + { + "auxiliary_loss_clip": 0.01330693, + "auxiliary_loss_mlp": 0.01092307, + "balance_loss_clip": 1.1078887, + "balance_loss_mlp": 1.05518556, + "epoch": 0.022365849992484595, + "flos": 27445086731520.0, + "grad_norm": 1.9292877201796392, + "language_loss": 0.83347434, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.8577044, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 2.2265625, + "router_z_loss_mlp": 0.37133789, + "step": 372, + "time_per_iteration": 4.141528844833374 + }, + { + "auxiliary_loss_clip": 0.01323492, + "auxiliary_loss_mlp": 0.01087417, + "balance_loss_clip": 1.10241437, + "balance_loss_mlp": 1.05012798, + "epoch": 0.022425973245152563, + "flos": 21788586977760.0, + "grad_norm": 3.043433298056022, + "language_loss": 0.79145056, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.81555963, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.37280273, + "step": 373, + "time_per_iteration": 5.545243263244629 + }, + { + "auxiliary_loss_clip": 0.01328584, + "auxiliary_loss_mlp": 0.01088263, + "balance_loss_clip": 1.10436881, + "balance_loss_mlp": 1.04973423, + "epoch": 0.022486096497820532, + "flos": 18892378889280.0, + "grad_norm": 3.5002822309224166, + "language_loss": 0.77667487, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.80084336, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 2.2421875, + "router_z_loss_mlp": 0.38525391, + "step": 374, + "time_per_iteration": 2.678537368774414 + }, + { + "auxiliary_loss_clip": 0.0131582, + "auxiliary_loss_mlp": 0.01070623, + "balance_loss_clip": 1.09400535, + "balance_loss_mlp": 1.03278577, + "epoch": 0.0225462197504885, + "flos": 33902660872800.0, + "grad_norm": 1.6281041211066425, + "language_loss": 0.86261642, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.88648093, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 2.21679688, + "router_z_loss_mlp": 0.37817383, + "step": 375, + "time_per_iteration": 2.758558988571167 + }, + { + "auxiliary_loss_clip": 0.01323181, + "auxiliary_loss_mlp": 0.01091407, + "balance_loss_clip": 1.10219979, + "balance_loss_mlp": 1.05278337, + "epoch": 0.02260634300315647, + "flos": 24373990946880.0, + "grad_norm": 2.0725218737001008, + "language_loss": 0.88745159, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91159749, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 2.2109375, + "router_z_loss_mlp": 0.38574219, + "step": 376, + "time_per_iteration": 2.698462963104248 + }, + { + "auxiliary_loss_clip": 0.01324657, + "auxiliary_loss_mlp": 0.01083163, + "balance_loss_clip": 1.09838128, + "balance_loss_mlp": 1.04740012, + "epoch": 0.02266646625582444, + "flos": 19557820474560.0, + "grad_norm": 2.958343159994528, + "language_loss": 0.75532681, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.779405, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 2.26367188, + "router_z_loss_mlp": 0.35766602, + "step": 377, + "time_per_iteration": 2.6883113384246826 + }, + { + "auxiliary_loss_clip": 0.01317648, + "auxiliary_loss_mlp": 0.01089415, + "balance_loss_clip": 1.10264874, + "balance_loss_mlp": 1.05112481, + "epoch": 0.02272658950849241, + "flos": 24898451448960.0, + "grad_norm": 2.1880730400618345, + "language_loss": 0.99500579, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01907647, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.38256836, + "step": 378, + "time_per_iteration": 2.7389657497406006 + }, + { + "auxiliary_loss_clip": 0.01209225, + "auxiliary_loss_mlp": 0.01027258, + "balance_loss_clip": 1.09615254, + "balance_loss_mlp": 1.01428854, + "epoch": 0.02278671276116038, + "flos": 85228470004320.0, + "grad_norm": 0.9611340655863234, + "language_loss": 0.75416172, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77652657, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.12988281, + "step": 379, + "time_per_iteration": 3.379312038421631 + }, + { + "auxiliary_loss_clip": 0.01325417, + "auxiliary_loss_mlp": 0.0108432, + "balance_loss_clip": 1.09728003, + "balance_loss_mlp": 1.04946327, + "epoch": 0.022846836013828347, + "flos": 46989860641920.0, + "grad_norm": 2.3413723915925764, + "language_loss": 0.78412056, + "learning_rate": 3.824592231451859e-06, + "loss": 0.80821794, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 2.28125, + "router_z_loss_mlp": 0.34887695, + "step": 380, + "time_per_iteration": 2.902622699737549 + }, + { + "auxiliary_loss_clip": 0.0131957, + "auxiliary_loss_mlp": 0.01088301, + "balance_loss_clip": 1.10043979, + "balance_loss_mlp": 1.05272877, + "epoch": 0.02290695926649632, + "flos": 25574832699840.0, + "grad_norm": 2.377716596170663, + "language_loss": 0.96669924, + "learning_rate": 3.826284353801652e-06, + "loss": 0.99077791, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.35571289, + "step": 381, + "time_per_iteration": 2.686882495880127 + }, + { + "auxiliary_loss_clip": 0.01329996, + "auxiliary_loss_mlp": 0.01096218, + "balance_loss_clip": 1.10362935, + "balance_loss_mlp": 1.05954897, + "epoch": 0.022967082519164288, + "flos": 29312626105440.0, + "grad_norm": 2.277936696928791, + "language_loss": 0.88069439, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90495652, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 2.265625, + "router_z_loss_mlp": 0.36669922, + "step": 382, + "time_per_iteration": 2.706787347793579 + }, + { + "auxiliary_loss_clip": 0.01320749, + "auxiliary_loss_mlp": 0.01103292, + "balance_loss_clip": 1.10161138, + "balance_loss_mlp": 1.06750584, + "epoch": 0.023027205771832256, + "flos": 25620691910400.0, + "grad_norm": 2.1170078585777707, + "language_loss": 0.84855461, + "learning_rate": 3.829655315342268e-06, + "loss": 0.8727951, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 2.18945312, + "router_z_loss_mlp": 0.35791016, + "step": 383, + "time_per_iteration": 2.738438367843628 + }, + { + "auxiliary_loss_clip": 0.01321314, + "auxiliary_loss_mlp": 0.01114086, + "balance_loss_clip": 1.1042366, + "balance_loss_mlp": 1.07868111, + "epoch": 0.023087329024500225, + "flos": 26064576243360.0, + "grad_norm": 2.890465049113729, + "language_loss": 0.83055907, + "learning_rate": 3.831334200735543e-06, + "loss": 0.85491312, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.35400391, + "step": 384, + "time_per_iteration": 2.7218971252441406 + }, + { + "auxiliary_loss_clip": 0.01317357, + "auxiliary_loss_mlp": 0.01092081, + "balance_loss_clip": 1.10477448, + "balance_loss_mlp": 1.05872643, + "epoch": 0.023147452277168194, + "flos": 26688818105280.0, + "grad_norm": 1.8257785968799825, + "language_loss": 0.89076519, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91485959, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.33349609, + "step": 385, + "time_per_iteration": 2.8550021648406982 + }, + { + "auxiliary_loss_clip": 0.01322492, + "auxiliary_loss_mlp": 0.01117431, + "balance_loss_clip": 1.1030997, + "balance_loss_mlp": 1.08159709, + "epoch": 0.023207575529836166, + "flos": 23082038532000.0, + "grad_norm": 1.7906504382943458, + "language_loss": 0.69680828, + "learning_rate": 3.83467889492477e-06, + "loss": 0.7212075, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 2.19335938, + "router_z_loss_mlp": 0.35839844, + "step": 386, + "time_per_iteration": 2.668346643447876 + }, + { + "auxiliary_loss_clip": 0.01323613, + "auxiliary_loss_mlp": 0.01085827, + "balance_loss_clip": 1.10353637, + "balance_loss_mlp": 1.0510658, + "epoch": 0.023267698782504134, + "flos": 30561515002080.0, + "grad_norm": 1.8753761555743067, + "language_loss": 0.88023561, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90433002, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 2.20117188, + "router_z_loss_mlp": 0.34790039, + "step": 387, + "time_per_iteration": 2.717545747756958 + }, + { + "auxiliary_loss_clip": 0.01323193, + "auxiliary_loss_mlp": 0.01074018, + "balance_loss_clip": 1.10241127, + "balance_loss_mlp": 1.03789759, + "epoch": 0.023327822035172103, + "flos": 35238892324320.0, + "grad_norm": 2.752986319382434, + "language_loss": 0.83427024, + "learning_rate": 3.838006303795566e-06, + "loss": 0.85824239, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 2.20898438, + "router_z_loss_mlp": 0.36132812, + "step": 388, + "time_per_iteration": 2.766753911972046 + }, + { + "auxiliary_loss_clip": 0.01319476, + "auxiliary_loss_mlp": 0.01089446, + "balance_loss_clip": 1.10074997, + "balance_loss_mlp": 1.0562346, + "epoch": 0.02338794528784007, + "flos": 33094617513120.0, + "grad_norm": 2.0640237199914115, + "language_loss": 0.93982977, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96391904, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.33203125, + "step": 389, + "time_per_iteration": 2.7498021125793457 + }, + { + "auxiliary_loss_clip": 0.01317124, + "auxiliary_loss_mlp": 0.01078093, + "balance_loss_clip": 1.10476315, + "balance_loss_mlp": 1.04287851, + "epoch": 0.02344806854050804, + "flos": 26017825652640.0, + "grad_norm": 1.9924327905148451, + "language_loss": 0.87657571, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90052783, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.35253906, + "step": 390, + "time_per_iteration": 2.705853223800659 + }, + { + "auxiliary_loss_clip": 0.0131864, + "auxiliary_loss_mlp": 0.01085856, + "balance_loss_clip": 1.10399282, + "balance_loss_mlp": 1.05281079, + "epoch": 0.023508191793176012, + "flos": 30249171226080.0, + "grad_norm": 2.410925632162155, + "language_loss": 0.89315856, + "learning_rate": 3.842965395193529e-06, + "loss": 0.91720349, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 2.14648438, + "router_z_loss_mlp": 0.33032227, + "step": 391, + "time_per_iteration": 2.7193400859832764 + }, + { + "auxiliary_loss_clip": 0.01315604, + "auxiliary_loss_mlp": 0.01072551, + "balance_loss_clip": 1.10000181, + "balance_loss_mlp": 1.03843355, + "epoch": 0.02356831504584398, + "flos": 31719941513280.0, + "grad_norm": 1.9011733526853751, + "language_loss": 0.85829037, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88217187, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.34106445, + "step": 392, + "time_per_iteration": 2.8106510639190674 + }, + { + "auxiliary_loss_clip": 0.01312662, + "auxiliary_loss_mlp": 0.01081985, + "balance_loss_clip": 1.10128689, + "balance_loss_mlp": 1.04803395, + "epoch": 0.02362843829851195, + "flos": 23259924506880.0, + "grad_norm": 1.952400138223068, + "language_loss": 0.89040875, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91435528, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.33959961, + "step": 393, + "time_per_iteration": 2.6749141216278076 + }, + { + "auxiliary_loss_clip": 0.01324171, + "auxiliary_loss_mlp": 0.01099574, + "balance_loss_clip": 1.10712767, + "balance_loss_mlp": 1.06283343, + "epoch": 0.023688561551179918, + "flos": 19609514173440.0, + "grad_norm": 1.7445601979655536, + "language_loss": 0.81467032, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83890778, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 2.16796875, + "router_z_loss_mlp": 0.36743164, + "step": 394, + "time_per_iteration": 2.692268133163452 + }, + { + "auxiliary_loss_clip": 0.01314178, + "auxiliary_loss_mlp": 0.01082197, + "balance_loss_clip": 1.09883749, + "balance_loss_mlp": 1.0463866, + "epoch": 0.023748684803847887, + "flos": 26508339024480.0, + "grad_norm": 1.8606350108605711, + "language_loss": 0.86132145, + "learning_rate": 3.84951865465269e-06, + "loss": 0.88528526, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.3581543, + "step": 395, + "time_per_iteration": 2.687107563018799 + }, + { + "auxiliary_loss_clip": 0.01201049, + "auxiliary_loss_mlp": 0.01038532, + "balance_loss_clip": 1.09198153, + "balance_loss_mlp": 1.02580035, + "epoch": 0.02380880805651586, + "flos": 75562830205920.0, + "grad_norm": 0.9359977579214701, + "language_loss": 0.63834596, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66074181, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.12744141, + "step": 396, + "time_per_iteration": 3.1779303550720215 + }, + { + "auxiliary_loss_clip": 0.01312539, + "auxiliary_loss_mlp": 0.0106838, + "balance_loss_clip": 1.09871209, + "balance_loss_mlp": 1.03471518, + "epoch": 0.023868931309183827, + "flos": 24728993068320.0, + "grad_norm": 3.161520261369181, + "language_loss": 0.83666432, + "learning_rate": 3.852770440269372e-06, + "loss": 0.86047357, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.33642578, + "step": 397, + "time_per_iteration": 2.79474139213562 + }, + { + "auxiliary_loss_clip": 0.01316353, + "auxiliary_loss_mlp": 0.01085221, + "balance_loss_clip": 1.10270596, + "balance_loss_mlp": 1.05084109, + "epoch": 0.023929054561851796, + "flos": 25794283053600.0, + "grad_norm": 1.950577658788252, + "language_loss": 0.8417083, + "learning_rate": 3.854390195044404e-06, + "loss": 0.86572409, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 2.1328125, + "router_z_loss_mlp": 0.34423828, + "step": 398, + "time_per_iteration": 2.7324113845825195 + }, + { + "auxiliary_loss_clip": 0.01316634, + "auxiliary_loss_mlp": 0.010783, + "balance_loss_clip": 1.09870291, + "balance_loss_mlp": 1.04189324, + "epoch": 0.023989177814519765, + "flos": 16714481086080.0, + "grad_norm": 2.3522578448250213, + "language_loss": 0.85983199, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88378131, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 2.1796875, + "router_z_loss_mlp": 0.36401367, + "step": 399, + "time_per_iteration": 2.6868348121643066 + }, + { + "auxiliary_loss_clip": 0.01310894, + "auxiliary_loss_mlp": 0.01086603, + "balance_loss_clip": 1.09993744, + "balance_loss_mlp": 1.05207968, + "epoch": 0.024049301067187733, + "flos": 32118695946720.0, + "grad_norm": 10.02665470411138, + "language_loss": 0.86262381, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88659877, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 2.11328125, + "router_z_loss_mlp": 0.3449707, + "step": 400, + "time_per_iteration": 2.7560527324676514 + }, + { + "auxiliary_loss_clip": 0.01319399, + "auxiliary_loss_mlp": 0.01077942, + "balance_loss_clip": 1.10428572, + "balance_loss_mlp": 1.04415774, + "epoch": 0.024109424319855705, + "flos": 29804273961120.0, + "grad_norm": 1.8148508121705524, + "language_loss": 0.793262, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.81723541, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.33764648, + "step": 401, + "time_per_iteration": 2.711412191390991 + }, + { + "auxiliary_loss_clip": 0.0131319, + "auxiliary_loss_mlp": 0.01087632, + "balance_loss_clip": 1.09905243, + "balance_loss_mlp": 1.05341887, + "epoch": 0.024169547572523674, + "flos": 36124229953440.0, + "grad_norm": 1.870774925585359, + "language_loss": 0.78538609, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.80939436, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.34228516, + "step": 402, + "time_per_iteration": 2.7625391483306885 + }, + { + "auxiliary_loss_clip": 0.01315448, + "auxiliary_loss_mlp": 0.01081039, + "balance_loss_clip": 1.09832454, + "balance_loss_mlp": 1.04465663, + "epoch": 0.024229670825191642, + "flos": 27578491083360.0, + "grad_norm": 2.5980113423651963, + "language_loss": 0.94971985, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97368467, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.36401367, + "step": 403, + "time_per_iteration": 2.698162317276001 + }, + { + "auxiliary_loss_clip": 0.01318883, + "auxiliary_loss_mlp": 0.01077407, + "balance_loss_clip": 1.09787083, + "balance_loss_mlp": 1.04269373, + "epoch": 0.02428979407785961, + "flos": 22146789964320.0, + "grad_norm": 2.2146968367647024, + "language_loss": 0.99656153, + "learning_rate": 3.864024073288798e-06, + "loss": 1.02052438, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 2.21289062, + "router_z_loss_mlp": 0.34716797, + "step": 404, + "time_per_iteration": 2.6760830879211426 + }, + { + "auxiliary_loss_clip": 0.01318633, + "auxiliary_loss_mlp": 0.01083941, + "balance_loss_clip": 1.10121632, + "balance_loss_mlp": 1.04937053, + "epoch": 0.024349917330527583, + "flos": 18674711295840.0, + "grad_norm": 2.409272166337564, + "language_loss": 0.87676746, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90079319, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 2.17382812, + "router_z_loss_mlp": 0.34545898, + "step": 405, + "time_per_iteration": 2.6758790016174316 + }, + { + "auxiliary_loss_clip": 0.0132471, + "auxiliary_loss_mlp": 0.0109, + "balance_loss_clip": 1.10427332, + "balance_loss_mlp": 1.0554297, + "epoch": 0.024410040583195552, + "flos": 25348656477600.0, + "grad_norm": 2.274935427980461, + "language_loss": 0.93423975, + "learning_rate": 3.867203596705844e-06, + "loss": 0.95838678, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 2.20703125, + "router_z_loss_mlp": 0.34594727, + "step": 406, + "time_per_iteration": 2.660433530807495 + }, + { + "auxiliary_loss_clip": 0.01317846, + "auxiliary_loss_mlp": 0.01090873, + "balance_loss_clip": 1.10288978, + "balance_loss_mlp": 1.05491972, + "epoch": 0.02447016383586352, + "flos": 26599085030880.0, + "grad_norm": 2.0121014794964984, + "language_loss": 0.87219775, + "learning_rate": 3.86878748971496e-06, + "loss": 0.896285, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.35986328, + "step": 407, + "time_per_iteration": 2.773972511291504 + }, + { + "auxiliary_loss_clip": 0.01316307, + "auxiliary_loss_mlp": 0.01083206, + "balance_loss_clip": 1.10492682, + "balance_loss_mlp": 1.0483259, + "epoch": 0.02453028708853149, + "flos": 41424593101920.0, + "grad_norm": 1.8308826525808364, + "language_loss": 0.73953605, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76353121, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 2.11523438, + "router_z_loss_mlp": 0.34863281, + "step": 408, + "time_per_iteration": 2.8194351196289062 + }, + { + "auxiliary_loss_clip": 0.01318427, + "auxiliary_loss_mlp": 0.01091011, + "balance_loss_clip": 1.10225236, + "balance_loss_mlp": 1.05567718, + "epoch": 0.024590410341199458, + "flos": 26592480714240.0, + "grad_norm": 2.7110670642106363, + "language_loss": 0.92917562, + "learning_rate": 3.871943634189376e-06, + "loss": 0.95326996, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 2.16015625, + "router_z_loss_mlp": 0.35351562, + "step": 409, + "time_per_iteration": 2.7217047214508057 + }, + { + "auxiliary_loss_clip": 0.01317633, + "auxiliary_loss_mlp": 0.01072267, + "balance_loss_clip": 1.10221434, + "balance_loss_mlp": 1.04034233, + "epoch": 0.02465053359386743, + "flos": 42848329177440.0, + "grad_norm": 2.1083457467915223, + "language_loss": 0.82770449, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85160345, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 2.15332031, + "router_z_loss_mlp": 0.31933594, + "step": 410, + "time_per_iteration": 2.8990888595581055 + }, + { + "auxiliary_loss_clip": 0.01317075, + "auxiliary_loss_mlp": 0.01086441, + "balance_loss_clip": 1.10188138, + "balance_loss_mlp": 1.05251372, + "epoch": 0.0247106568465354, + "flos": 33497018501760.0, + "grad_norm": 2.3204332175927225, + "language_loss": 0.77848887, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80252409, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.33935547, + "step": 411, + "time_per_iteration": 2.7357869148254395 + }, + { + "auxiliary_loss_clip": 0.0131614, + "auxiliary_loss_mlp": 0.01094622, + "balance_loss_clip": 1.09931087, + "balance_loss_mlp": 1.05888319, + "epoch": 0.024770780099203367, + "flos": 25262853579360.0, + "grad_norm": 2.259730010016184, + "language_loss": 0.86745632, + "learning_rate": 3.87664903040738e-06, + "loss": 0.89156389, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 2.16992188, + "router_z_loss_mlp": 0.35742188, + "step": 412, + "time_per_iteration": 5.54777455329895 + }, + { + "auxiliary_loss_clip": 0.01195357, + "auxiliary_loss_mlp": 0.01026032, + "balance_loss_clip": 1.0895189, + "balance_loss_mlp": 1.01424253, + "epoch": 0.024830903351871336, + "flos": 84868241153760.0, + "grad_norm": 0.8592416596942217, + "language_loss": 0.58578801, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60800189, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.11779785, + "step": 413, + "time_per_iteration": 4.795031547546387 + }, + { + "auxiliary_loss_clip": 0.01311606, + "auxiliary_loss_mlp": 0.01083499, + "balance_loss_clip": 1.09911466, + "balance_loss_mlp": 1.04744983, + "epoch": 0.024891026604539304, + "flos": 39377344475520.0, + "grad_norm": 1.846265229616242, + "language_loss": 0.806027, + "learning_rate": 3.879766964750006e-06, + "loss": 0.82997799, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 2.12695312, + "router_z_loss_mlp": 0.3605957, + "step": 414, + "time_per_iteration": 2.7844290733337402 + }, + { + "auxiliary_loss_clip": 0.01305644, + "auxiliary_loss_mlp": 0.01094761, + "balance_loss_clip": 1.09692097, + "balance_loss_mlp": 1.06112039, + "epoch": 0.024951149857207276, + "flos": 22989104592480.0, + "grad_norm": 2.4229549661624983, + "language_loss": 0.80540943, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82941347, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 2.08398438, + "router_z_loss_mlp": 0.33666992, + "step": 415, + "time_per_iteration": 2.6364126205444336 + }, + { + "auxiliary_loss_clip": 0.01324657, + "auxiliary_loss_mlp": 0.01084152, + "balance_loss_clip": 1.10386765, + "balance_loss_mlp": 1.04962897, + "epoch": 0.025011273109875245, + "flos": 18762904713600.0, + "grad_norm": 2.807622936665876, + "language_loss": 0.96162307, + "learning_rate": 3.882869872844723e-06, + "loss": 0.98571122, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 2.20800781, + "router_z_loss_mlp": 0.34545898, + "step": 416, + "time_per_iteration": 2.6564533710479736 + }, + { + "auxiliary_loss_clip": 0.01314405, + "auxiliary_loss_mlp": 0.01073318, + "balance_loss_clip": 1.09930098, + "balance_loss_mlp": 1.03698254, + "epoch": 0.025071396362543213, + "flos": 23080701461760.0, + "grad_norm": 2.2482035291262905, + "language_loss": 0.77467632, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79855359, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.36352539, + "step": 417, + "time_per_iteration": 2.7118186950683594 + }, + { + "auxiliary_loss_clip": 0.0131074, + "auxiliary_loss_mlp": 0.01085828, + "balance_loss_clip": 1.10327005, + "balance_loss_mlp": 1.0513525, + "epoch": 0.025131519615211182, + "flos": 30917773159200.0, + "grad_norm": 1.5986795218684475, + "language_loss": 0.76963758, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.7936033, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 2.07617188, + "router_z_loss_mlp": 0.34472656, + "step": 418, + "time_per_iteration": 2.7442195415496826 + }, + { + "auxiliary_loss_clip": 0.01318356, + "auxiliary_loss_mlp": 0.0107149, + "balance_loss_clip": 1.10297048, + "balance_loss_mlp": 1.03877902, + "epoch": 0.02519164286787915, + "flos": 23126520155040.0, + "grad_norm": 2.6950038451704295, + "language_loss": 0.81404483, + "learning_rate": 3.887496375507294e-06, + "loss": 0.83794332, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.3269043, + "step": 419, + "time_per_iteration": 2.673841953277588 + }, + { + "auxiliary_loss_clip": 0.01313393, + "auxiliary_loss_mlp": 0.01089471, + "balance_loss_clip": 1.10338664, + "balance_loss_mlp": 1.0537318, + "epoch": 0.025251766120547123, + "flos": 21256954917120.0, + "grad_norm": 1.8341848882366667, + "language_loss": 0.73647916, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.76050782, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.35717773, + "step": 420, + "time_per_iteration": 2.688203811645508 + }, + { + "auxiliary_loss_clip": 0.01312414, + "auxiliary_loss_mlp": 0.01093842, + "balance_loss_clip": 1.09825015, + "balance_loss_mlp": 1.06024837, + "epoch": 0.02531188937321509, + "flos": 30560866725600.0, + "grad_norm": 1.785673359968983, + "language_loss": 0.78858584, + "learning_rate": 3.890562344079484e-06, + "loss": 0.81264842, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.3359375, + "step": 421, + "time_per_iteration": 2.751253843307495 + }, + { + "auxiliary_loss_clip": 0.01311798, + "auxiliary_loss_mlp": 0.01086946, + "balance_loss_clip": 1.1015327, + "balance_loss_mlp": 1.05204129, + "epoch": 0.02537201262588306, + "flos": 37329407055360.0, + "grad_norm": 2.104886627598096, + "language_loss": 0.81733549, + "learning_rate": 3.89208987073549e-06, + "loss": 0.8413229, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 2.10546875, + "router_z_loss_mlp": 0.34863281, + "step": 422, + "time_per_iteration": 2.7843968868255615 + }, + { + "auxiliary_loss_clip": 0.01313646, + "auxiliary_loss_mlp": 0.01079438, + "balance_loss_clip": 1.09820485, + "balance_loss_mlp": 1.04737055, + "epoch": 0.02543213587855103, + "flos": 31805622859680.0, + "grad_norm": 1.7004692566132533, + "language_loss": 0.83409512, + "learning_rate": 3.893613781940409e-06, + "loss": 0.85802597, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 2.15234375, + "router_z_loss_mlp": 0.32055664, + "step": 423, + "time_per_iteration": 2.70981502532959 + }, + { + "auxiliary_loss_clip": 0.01303611, + "auxiliary_loss_mlp": 0.01075339, + "balance_loss_clip": 1.09524202, + "balance_loss_mlp": 1.04201961, + "epoch": 0.025492259131218997, + "flos": 43956763715520.0, + "grad_norm": 1.9630281398423046, + "language_loss": 0.73985505, + "learning_rate": 3.895134094768415e-06, + "loss": 0.76364458, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 2.08300781, + "router_z_loss_mlp": 0.33312988, + "step": 424, + "time_per_iteration": 2.8134777545928955 + }, + { + "auxiliary_loss_clip": 0.01316259, + "auxiliary_loss_mlp": 0.0109129, + "balance_loss_clip": 1.10135865, + "balance_loss_mlp": 1.05819798, + "epoch": 0.02555238238388697, + "flos": 22681015130880.0, + "grad_norm": 2.681175921213918, + "language_loss": 0.83224261, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85631812, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 2.14453125, + "router_z_loss_mlp": 0.33105469, + "step": 425, + "time_per_iteration": 2.7387542724609375 + }, + { + "auxiliary_loss_clip": 0.01311195, + "auxiliary_loss_mlp": 0.01084069, + "balance_loss_clip": 1.09317565, + "balance_loss_mlp": 1.04904521, + "epoch": 0.025612505636554938, + "flos": 29581865845920.0, + "grad_norm": 2.3925421965282045, + "language_loss": 0.85573369, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87968636, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 2.18554688, + "router_z_loss_mlp": 0.35009766, + "step": 426, + "time_per_iteration": 2.7384824752807617 + }, + { + "auxiliary_loss_clip": 0.01196314, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.09130979, + "balance_loss_mlp": 1.03974426, + "epoch": 0.025672628889222907, + "flos": 73927585163520.0, + "grad_norm": 0.8937576243759789, + "language_loss": 0.57251197, + "learning_rate": 3.899673611929491e-06, + "loss": 0.59498405, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.1114502, + "step": 427, + "time_per_iteration": 3.4215924739837646 + }, + { + "auxiliary_loss_clip": 0.01313435, + "auxiliary_loss_mlp": 0.01096375, + "balance_loss_clip": 1.10329819, + "balance_loss_mlp": 1.06404567, + "epoch": 0.025732752141890875, + "flos": 23883234471360.0, + "grad_norm": 2.2795828909869553, + "language_loss": 0.88118708, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90528524, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.32348633, + "step": 428, + "time_per_iteration": 2.731351852416992 + }, + { + "auxiliary_loss_clip": 0.01304211, + "auxiliary_loss_mlp": 0.01071572, + "balance_loss_clip": 1.09684038, + "balance_loss_mlp": 1.03681076, + "epoch": 0.025792875394558847, + "flos": 38173990651200.0, + "grad_norm": 1.5796165352680953, + "language_loss": 0.85680044, + "learning_rate": 3.902682272467353e-06, + "loss": 0.88055831, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.34741211, + "step": 429, + "time_per_iteration": 2.861532211303711 + }, + { + "auxiliary_loss_clip": 0.01311575, + "auxiliary_loss_mlp": 0.01078914, + "balance_loss_clip": 1.09584939, + "balance_loss_mlp": 1.04429603, + "epoch": 0.025852998647226816, + "flos": 39510546240960.0, + "grad_norm": 2.7103950519835167, + "language_loss": 0.88075191, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90465677, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 2.15625, + "router_z_loss_mlp": 0.34643555, + "step": 430, + "time_per_iteration": 2.79115891456604 + }, + { + "auxiliary_loss_clip": 0.01310859, + "auxiliary_loss_mlp": 0.01080439, + "balance_loss_clip": 1.10192084, + "balance_loss_mlp": 1.04851508, + "epoch": 0.025913121899894784, + "flos": 24639867753120.0, + "grad_norm": 1.7570816321363136, + "language_loss": 0.83964533, + "learning_rate": 3.905676939184698e-06, + "loss": 0.86355829, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 0.31933594, + "step": 431, + "time_per_iteration": 2.714344024658203 + }, + { + "auxiliary_loss_clip": 0.01306692, + "auxiliary_loss_mlp": 0.01074739, + "balance_loss_clip": 1.09754133, + "balance_loss_mlp": 1.04388785, + "epoch": 0.025973245152562753, + "flos": 17961465670560.0, + "grad_norm": 4.115413059859651, + "language_loss": 0.86852127, + "learning_rate": 3.907169065422638e-06, + "loss": 0.89233553, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 2.09179688, + "router_z_loss_mlp": 0.30834961, + "step": 432, + "time_per_iteration": 2.6827938556671143 + }, + { + "auxiliary_loss_clip": 0.01307844, + "auxiliary_loss_mlp": 0.01063849, + "balance_loss_clip": 1.09696293, + "balance_loss_mlp": 1.03324819, + "epoch": 0.02603336840523072, + "flos": 37818502322400.0, + "grad_norm": 1.8652926362799416, + "language_loss": 0.76373684, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78745377, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 0.30578613, + "step": 433, + "time_per_iteration": 2.8062708377838135 + }, + { + "auxiliary_loss_clip": 0.01309847, + "auxiliary_loss_mlp": 0.01089452, + "balance_loss_clip": 1.09574544, + "balance_loss_mlp": 1.05442858, + "epoch": 0.026093491657898694, + "flos": 21566097828000.0, + "grad_norm": 1.9028346160610015, + "language_loss": 0.89756155, + "learning_rate": 3.910142983797699e-06, + "loss": 0.92155457, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 2.13867188, + "router_z_loss_mlp": 0.3503418, + "step": 434, + "time_per_iteration": 2.6861796379089355 + }, + { + "auxiliary_loss_clip": 0.01311597, + "auxiliary_loss_mlp": 0.01097756, + "balance_loss_clip": 1.10178018, + "balance_loss_mlp": 1.06375802, + "epoch": 0.026153614910566662, + "flos": 21790045599840.0, + "grad_norm": 2.04824826155189, + "language_loss": 0.79928386, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82337737, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 2.09570312, + "router_z_loss_mlp": 0.34008789, + "step": 435, + "time_per_iteration": 2.673910140991211 + }, + { + "auxiliary_loss_clip": 0.01305645, + "auxiliary_loss_mlp": 0.01080141, + "balance_loss_clip": 1.09427834, + "balance_loss_mlp": 1.04640532, + "epoch": 0.02621373816323463, + "flos": 24417540672480.0, + "grad_norm": 2.5533586258088525, + "language_loss": 0.86373419, + "learning_rate": 3.913103228936546e-06, + "loss": 0.88759202, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 2.11621094, + "router_z_loss_mlp": 0.33764648, + "step": 436, + "time_per_iteration": 2.7198662757873535 + }, + { + "auxiliary_loss_clip": 0.0130868, + "auxiliary_loss_mlp": 0.0108817, + "balance_loss_clip": 1.09850395, + "balance_loss_mlp": 1.05512476, + "epoch": 0.0262738614159026, + "flos": 23526854762400.0, + "grad_norm": 3.3141317197698674, + "language_loss": 0.74757612, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77154464, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 2.1015625, + "router_z_loss_mlp": 0.33007812, + "step": 437, + "time_per_iteration": 2.675748109817505 + }, + { + "auxiliary_loss_clip": 0.01305477, + "auxiliary_loss_mlp": 0.01082531, + "balance_loss_clip": 1.09823346, + "balance_loss_mlp": 1.04807997, + "epoch": 0.026333984668570568, + "flos": 22948431593760.0, + "grad_norm": 2.212490426804267, + "language_loss": 0.9142229, + "learning_rate": 3.916049925995316e-06, + "loss": 0.93810296, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 2.07519531, + "router_z_loss_mlp": 0.34472656, + "step": 438, + "time_per_iteration": 2.671436309814453 + }, + { + "auxiliary_loss_clip": 0.01184498, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.08063769, + "balance_loss_mlp": 1.02122116, + "epoch": 0.02639410792123854, + "flos": 78789736398240.0, + "grad_norm": 0.8750989259610006, + "language_loss": 0.62593156, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64808869, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.09985352, + "step": 439, + "time_per_iteration": 3.4224295616149902 + }, + { + "auxiliary_loss_clip": 0.01314684, + "auxiliary_loss_mlp": 0.0108581, + "balance_loss_clip": 1.10234284, + "balance_loss_mlp": 1.05164421, + "epoch": 0.02645423117390651, + "flos": 34744165155360.0, + "grad_norm": 1.8988645849567505, + "language_loss": 0.75634009, + "learning_rate": 3.918983198419573e-06, + "loss": 0.78034502, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 2.12304688, + "router_z_loss_mlp": 0.34179688, + "step": 440, + "time_per_iteration": 2.7619760036468506 + }, + { + "auxiliary_loss_clip": 0.01306803, + "auxiliary_loss_mlp": 0.01077132, + "balance_loss_clip": 1.09860992, + "balance_loss_mlp": 1.04380071, + "epoch": 0.026514354426574478, + "flos": 22636128335040.0, + "grad_norm": 2.0700029301674676, + "language_loss": 0.83057946, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85441875, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.33349609, + "step": 441, + "time_per_iteration": 2.69832181930542 + }, + { + "auxiliary_loss_clip": 0.01308097, + "auxiliary_loss_mlp": 0.01084837, + "balance_loss_clip": 1.09514713, + "balance_loss_mlp": 1.05033743, + "epoch": 0.026574477679242446, + "flos": 25218898680960.0, + "grad_norm": 1.8326232689116537, + "language_loss": 0.78108227, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80501163, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 2.13085938, + "router_z_loss_mlp": 0.34472656, + "step": 442, + "time_per_iteration": 2.7561450004577637 + }, + { + "auxiliary_loss_clip": 0.01180621, + "auxiliary_loss_mlp": 0.01011699, + "balance_loss_clip": 1.07877266, + "balance_loss_mlp": 1.00158989, + "epoch": 0.026634600931910415, + "flos": 70717615194240.0, + "grad_norm": 0.959385971786589, + "language_loss": 0.64483988, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66676313, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.10107422, + "step": 443, + "time_per_iteration": 3.130131959915161 + }, + { + "auxiliary_loss_clip": 0.01309705, + "auxiliary_loss_mlp": 0.01082103, + "balance_loss_clip": 1.10122454, + "balance_loss_mlp": 1.04924893, + "epoch": 0.026694724184578387, + "flos": 19297291949280.0, + "grad_norm": 2.0088112591176985, + "language_loss": 0.82576656, + "learning_rate": 3.924809954779425e-06, + "loss": 0.8496846, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 2.08203125, + "router_z_loss_mlp": 0.32836914, + "step": 444, + "time_per_iteration": 2.6811628341674805 + }, + { + "auxiliary_loss_clip": 0.01312594, + "auxiliary_loss_mlp": 0.01084983, + "balance_loss_clip": 1.09730721, + "balance_loss_mlp": 1.04938686, + "epoch": 0.026754847437246355, + "flos": 28602095137920.0, + "grad_norm": 2.0472117837386876, + "language_loss": 0.95625859, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.98023438, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 2.15429688, + "router_z_loss_mlp": 0.35620117, + "step": 445, + "time_per_iteration": 2.7155370712280273 + }, + { + "auxiliary_loss_clip": 0.01310901, + "auxiliary_loss_mlp": 0.01088643, + "balance_loss_clip": 1.10025227, + "balance_loss_mlp": 1.05321383, + "epoch": 0.026814970689914324, + "flos": 21160779595200.0, + "grad_norm": 2.6617924794246477, + "language_loss": 0.91942495, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.94342041, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 2.10449219, + "router_z_loss_mlp": 0.35424805, + "step": 446, + "time_per_iteration": 2.662635087966919 + }, + { + "auxiliary_loss_clip": 0.01303555, + "auxiliary_loss_mlp": 0.01079766, + "balance_loss_clip": 1.09702444, + "balance_loss_mlp": 1.04595828, + "epoch": 0.026875093942582293, + "flos": 21834972912960.0, + "grad_norm": 1.9129262533300682, + "language_loss": 0.79829812, + "learning_rate": 3.92914567610317e-06, + "loss": 0.82213128, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 2.06347656, + "router_z_loss_mlp": 0.33813477, + "step": 447, + "time_per_iteration": 2.68815279006958 + }, + { + "auxiliary_loss_clip": 0.01304863, + "auxiliary_loss_mlp": 0.01074787, + "balance_loss_clip": 1.09662628, + "balance_loss_mlp": 1.04305339, + "epoch": 0.026935217195250265, + "flos": 26507893334400.0, + "grad_norm": 2.9472349274694634, + "language_loss": 0.86558855, + "learning_rate": 3.930584452530952e-06, + "loss": 0.88938504, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 2.08105469, + "router_z_loss_mlp": 0.31713867, + "step": 448, + "time_per_iteration": 2.7006638050079346 + }, + { + "auxiliary_loss_clip": 0.01298253, + "auxiliary_loss_mlp": 0.01099381, + "balance_loss_clip": 1.09544551, + "balance_loss_mlp": 1.06855392, + "epoch": 0.026995340447918233, + "flos": 28824705839520.0, + "grad_norm": 3.2376171633262856, + "language_loss": 0.88600135, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.90997773, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.30786133, + "step": 449, + "time_per_iteration": 2.691148042678833 + }, + { + "auxiliary_loss_clip": 0.01312597, + "auxiliary_loss_mlp": 0.0109828, + "balance_loss_clip": 1.09754491, + "balance_loss_mlp": 1.06335175, + "epoch": 0.027055463700586202, + "flos": 21879576087840.0, + "grad_norm": 1.9320006131913436, + "language_loss": 0.80548477, + "learning_rate": 3.933452395729493e-06, + "loss": 0.82959354, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 2.15039062, + "router_z_loss_mlp": 0.34924316, + "step": 450, + "time_per_iteration": 2.734354257583618 + }, + { + "auxiliary_loss_clip": 0.01302201, + "auxiliary_loss_mlp": 0.01079317, + "balance_loss_clip": 1.09842491, + "balance_loss_mlp": 1.04558074, + "epoch": 0.02711558695325417, + "flos": 30650478248160.0, + "grad_norm": 2.1253086466737123, + "language_loss": 0.81748879, + "learning_rate": 3.934881590952304e-06, + "loss": 0.84130394, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 2.03515625, + "router_z_loss_mlp": 0.3371582, + "step": 451, + "time_per_iteration": 4.250251770019531 + }, + { + "auxiliary_loss_clip": 0.01302216, + "auxiliary_loss_mlp": 0.01094525, + "balance_loss_clip": 1.09993362, + "balance_loss_mlp": 1.05969262, + "epoch": 0.02717571020592214, + "flos": 29577773600640.0, + "grad_norm": 1.8730700928423623, + "language_loss": 0.7705822, + "learning_rate": 3.936307620734599e-06, + "loss": 0.79454958, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 2.02050781, + "router_z_loss_mlp": 0.34814453, + "step": 452, + "time_per_iteration": 5.637980222702026 + }, + { + "auxiliary_loss_clip": 0.01301258, + "auxiliary_loss_mlp": 0.01079663, + "balance_loss_clip": 1.09731841, + "balance_loss_mlp": 1.04614162, + "epoch": 0.02723583345859011, + "flos": 30650518765440.0, + "grad_norm": 2.178076095337244, + "language_loss": 0.72892344, + "learning_rate": 3.937730499067294e-06, + "loss": 0.75273263, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 2.0390625, + "router_z_loss_mlp": 0.33520508, + "step": 453, + "time_per_iteration": 2.7385756969451904 + }, + { + "auxiliary_loss_clip": 0.0129651, + "auxiliary_loss_mlp": 0.0108448, + "balance_loss_clip": 1.09390354, + "balance_loss_mlp": 1.05179286, + "epoch": 0.02729595671125808, + "flos": 52155036678240.0, + "grad_norm": 2.3670588672196176, + "language_loss": 0.82134509, + "learning_rate": 3.939150239848748e-06, + "loss": 0.845155, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.32714844, + "step": 454, + "time_per_iteration": 2.9392929077148438 + }, + { + "auxiliary_loss_clip": 0.01299401, + "auxiliary_loss_mlp": 0.01073946, + "balance_loss_clip": 1.0964191, + "balance_loss_mlp": 1.0431428, + "epoch": 0.02735607996392605, + "flos": 26150460176160.0, + "grad_norm": 2.163153881465569, + "language_loss": 0.75353843, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.77727193, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 2.03027344, + "router_z_loss_mlp": 0.30786133, + "step": 455, + "time_per_iteration": 2.689523220062256 + }, + { + "auxiliary_loss_clip": 0.01300025, + "auxiliary_loss_mlp": 0.01091315, + "balance_loss_clip": 1.09377658, + "balance_loss_mlp": 1.05939102, + "epoch": 0.027416203216594017, + "flos": 25442684383680.0, + "grad_norm": 1.9337401730422419, + "language_loss": 0.80561829, + "learning_rate": 3.941980363893499e-06, + "loss": 0.82953167, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 2.0625, + "router_z_loss_mlp": 0.3190918, + "step": 456, + "time_per_iteration": 2.6815125942230225 + }, + { + "auxiliary_loss_clip": 0.0129703, + "auxiliary_loss_mlp": 0.01076274, + "balance_loss_clip": 1.09451199, + "balance_loss_mlp": 1.04308653, + "epoch": 0.027476326469261986, + "flos": 16136017400160.0, + "grad_norm": 2.0899474985130517, + "language_loss": 0.81767797, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.84141099, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.33154297, + "step": 457, + "time_per_iteration": 2.659430980682373 + }, + { + "auxiliary_loss_clip": 0.01301859, + "auxiliary_loss_mlp": 0.01080298, + "balance_loss_clip": 1.09437895, + "balance_loss_mlp": 1.04730082, + "epoch": 0.027536449721929958, + "flos": 29315016624960.0, + "grad_norm": 1.8781098018799458, + "language_loss": 0.94030917, + "learning_rate": 3.944798102235412e-06, + "loss": 0.9641307, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 2.07421875, + "router_z_loss_mlp": 0.33007812, + "step": 458, + "time_per_iteration": 2.7205848693847656 + }, + { + "auxiliary_loss_clip": 0.01299599, + "auxiliary_loss_mlp": 0.01096122, + "balance_loss_clip": 1.09456539, + "balance_loss_mlp": 1.06395948, + "epoch": 0.027596572974597926, + "flos": 15869411282880.0, + "grad_norm": 2.1602814500873926, + "language_loss": 0.78888774, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.81284499, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 2.05273438, + "router_z_loss_mlp": 0.3215332, + "step": 459, + "time_per_iteration": 2.6662230491638184 + }, + { + "auxiliary_loss_clip": 0.01303363, + "auxiliary_loss_mlp": 0.01081429, + "balance_loss_clip": 1.09997523, + "balance_loss_mlp": 1.04604816, + "epoch": 0.027656696227265895, + "flos": 31900015421280.0, + "grad_norm": 1.8461897986395963, + "language_loss": 0.83131731, + "learning_rate": 3.947603562811407e-06, + "loss": 0.8551653, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 2.03320312, + "router_z_loss_mlp": 0.35400391, + "step": 460, + "time_per_iteration": 2.7573962211608887 + }, + { + "auxiliary_loss_clip": 0.01171396, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.0724318, + "balance_loss_mlp": 1.02592444, + "epoch": 0.027716819479933864, + "flos": 74062083481920.0, + "grad_norm": 1.5888828878039833, + "language_loss": 0.73587447, + "learning_rate": 3.949001722282675e-06, + "loss": 0.75793898, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.09130859, + "step": 461, + "time_per_iteration": 3.302452802658081 + }, + { + "auxiliary_loss_clip": 0.01301307, + "auxiliary_loss_mlp": 0.01076955, + "balance_loss_clip": 1.10261178, + "balance_loss_mlp": 1.04665184, + "epoch": 0.027776942732601832, + "flos": 38085675681600.0, + "grad_norm": 2.5530269629460167, + "language_loss": 0.80922312, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83300573, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 1.98535156, + "router_z_loss_mlp": 0.30297852, + "step": 462, + "time_per_iteration": 2.780465602874756 + }, + { + "auxiliary_loss_clip": 0.01297149, + "auxiliary_loss_mlp": 0.01070481, + "balance_loss_clip": 1.09630251, + "balance_loss_mlp": 1.0406307, + "epoch": 0.027837065985269804, + "flos": 27667778467680.0, + "grad_norm": 3.164843041182973, + "language_loss": 0.90101528, + "learning_rate": 3.951788965525118e-06, + "loss": 0.92469156, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.29858398, + "step": 463, + "time_per_iteration": 2.6890289783477783 + }, + { + "auxiliary_loss_clip": 0.01169032, + "auxiliary_loss_mlp": 0.01015295, + "balance_loss_clip": 1.07031083, + "balance_loss_mlp": 1.00640249, + "epoch": 0.027897189237937773, + "flos": 75875498120160.0, + "grad_norm": 0.8824241095421125, + "language_loss": 0.59059751, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61244082, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.08905029, + "step": 464, + "time_per_iteration": 3.237894296646118 + }, + { + "auxiliary_loss_clip": 0.01311177, + "auxiliary_loss_mlp": 0.01089784, + "balance_loss_clip": 1.10112548, + "balance_loss_mlp": 1.05614328, + "epoch": 0.02795731249060574, + "flos": 29887402718880.0, + "grad_norm": 2.1535742238801387, + "language_loss": 0.81132662, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83533627, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 2.09960938, + "router_z_loss_mlp": 0.33666992, + "step": 465, + "time_per_iteration": 2.7457072734832764 + }, + { + "auxiliary_loss_clip": 0.01296359, + "auxiliary_loss_mlp": 0.01080474, + "balance_loss_clip": 1.09348392, + "balance_loss_mlp": 1.04802537, + "epoch": 0.02801743574327371, + "flos": 28959244675200.0, + "grad_norm": 1.966540440097469, + "language_loss": 0.78086495, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80463332, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 2.02929688, + "router_z_loss_mlp": 0.32470703, + "step": 466, + "time_per_iteration": 2.7037246227264404 + }, + { + "auxiliary_loss_clip": 0.01295383, + "auxiliary_loss_mlp": 0.01078499, + "balance_loss_clip": 1.09596896, + "balance_loss_mlp": 1.04674149, + "epoch": 0.02807755899594168, + "flos": 25171864469280.0, + "grad_norm": 2.66130347189982, + "language_loss": 0.87685502, + "learning_rate": 3.957327513084761e-06, + "loss": 0.90059382, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 1.9921875, + "router_z_loss_mlp": 0.31738281, + "step": 467, + "time_per_iteration": 2.678809881210327 + }, + { + "auxiliary_loss_clip": 0.01300694, + "auxiliary_loss_mlp": 0.01090063, + "balance_loss_clip": 1.09763443, + "balance_loss_mlp": 1.05580246, + "epoch": 0.02813768224860965, + "flos": 23482170552960.0, + "grad_norm": 2.1871474992269366, + "language_loss": 0.86398995, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88789755, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.34277344, + "step": 468, + "time_per_iteration": 2.6674821376800537 + }, + { + "auxiliary_loss_clip": 0.01295365, + "auxiliary_loss_mlp": 0.01078898, + "balance_loss_clip": 1.09275556, + "balance_loss_mlp": 1.0456624, + "epoch": 0.02819780550127762, + "flos": 25397230345920.0, + "grad_norm": 2.4744135500166484, + "language_loss": 0.91767001, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.94141263, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 2.02539062, + "router_z_loss_mlp": 0.33251953, + "step": 469, + "time_per_iteration": 2.7523372173309326 + }, + { + "auxiliary_loss_clip": 0.01294785, + "auxiliary_loss_mlp": 0.01076888, + "balance_loss_clip": 1.09487653, + "balance_loss_mlp": 1.04324698, + "epoch": 0.028257928753945588, + "flos": 24372451290240.0, + "grad_norm": 2.7142614224005737, + "language_loss": 0.81699759, + "learning_rate": 3.96145038000181e-06, + "loss": 0.84071434, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.33618164, + "step": 470, + "time_per_iteration": 2.713249683380127 + }, + { + "auxiliary_loss_clip": 0.01294559, + "auxiliary_loss_mlp": 0.01084034, + "balance_loss_clip": 1.09221923, + "balance_loss_mlp": 1.0500828, + "epoch": 0.028318052006613557, + "flos": 24995275047360.0, + "grad_norm": 1.723348852040956, + "language_loss": 0.93418801, + "learning_rate": 3.962818822989861e-06, + "loss": 0.95797396, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 2.0234375, + "router_z_loss_mlp": 0.33935547, + "step": 471, + "time_per_iteration": 2.729320764541626 + }, + { + "auxiliary_loss_clip": 0.01292353, + "auxiliary_loss_mlp": 0.010889, + "balance_loss_clip": 1.09260511, + "balance_loss_mlp": 1.05630827, + "epoch": 0.02837817525928153, + "flos": 34794197645760.0, + "grad_norm": 1.8143202238085754, + "language_loss": 0.76398915, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78780168, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 1.99804688, + "router_z_loss_mlp": 0.32592773, + "step": 472, + "time_per_iteration": 2.7598140239715576 + }, + { + "auxiliary_loss_clip": 0.01297918, + "auxiliary_loss_mlp": 0.01066421, + "balance_loss_clip": 1.09164715, + "balance_loss_mlp": 1.03506851, + "epoch": 0.028438298511949497, + "flos": 22636614542400.0, + "grad_norm": 1.933377629258672, + "language_loss": 0.93347466, + "learning_rate": 3.965547014290071e-06, + "loss": 0.95711803, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 2.06542969, + "router_z_loss_mlp": 0.31347656, + "step": 473, + "time_per_iteration": 2.6424691677093506 + }, + { + "auxiliary_loss_clip": 0.01303056, + "auxiliary_loss_mlp": 0.01102667, + "balance_loss_clip": 1.09619784, + "balance_loss_mlp": 1.07090998, + "epoch": 0.028498421764617466, + "flos": 20633563918080.0, + "grad_norm": 2.827818690607038, + "language_loss": 0.88356346, + "learning_rate": 3.96690678709433e-06, + "loss": 0.90762067, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 2.06835938, + "router_z_loss_mlp": 0.31762695, + "step": 474, + "time_per_iteration": 2.6588034629821777 + }, + { + "auxiliary_loss_clip": 0.01295315, + "auxiliary_loss_mlp": 0.01081573, + "balance_loss_clip": 1.09445167, + "balance_loss_mlp": 1.04871929, + "epoch": 0.028558545017285435, + "flos": 33896502246240.0, + "grad_norm": 2.5389673579731804, + "language_loss": 0.78996503, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81373394, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 2.00878906, + "router_z_loss_mlp": 0.32861328, + "step": 475, + "time_per_iteration": 2.78212833404541 + }, + { + "auxiliary_loss_clip": 0.01163678, + "auxiliary_loss_mlp": 0.0101335, + "balance_loss_clip": 1.06674337, + "balance_loss_mlp": 1.00475478, + "epoch": 0.028618668269953403, + "flos": 80111624732640.0, + "grad_norm": 0.929698046604829, + "language_loss": 0.67027795, + "learning_rate": 3.969617747661569e-06, + "loss": 0.69204831, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.08605957, + "step": 476, + "time_per_iteration": 3.2297282218933105 + }, + { + "auxiliary_loss_clip": 0.01297672, + "auxiliary_loss_mlp": 0.01080644, + "balance_loss_clip": 1.09477437, + "balance_loss_mlp": 1.04666901, + "epoch": 0.028678791522621375, + "flos": 26768462376960.0, + "grad_norm": 3.348334695748732, + "language_loss": 0.83955288, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86333609, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 2.02832031, + "router_z_loss_mlp": 0.33984375, + "step": 477, + "time_per_iteration": 2.675302028656006 + }, + { + "auxiliary_loss_clip": 0.01305179, + "auxiliary_loss_mlp": 0.0109257, + "balance_loss_clip": 1.0995425, + "balance_loss_mlp": 1.05878639, + "epoch": 0.028738914775289344, + "flos": 29983213385280.0, + "grad_norm": 1.90255275838756, + "language_loss": 0.82331336, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84729081, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 2.05664062, + "router_z_loss_mlp": 0.33789062, + "step": 478, + "time_per_iteration": 2.7120907306671143 + }, + { + "auxiliary_loss_clip": 0.01295812, + "auxiliary_loss_mlp": 0.01075638, + "balance_loss_clip": 1.0946157, + "balance_loss_mlp": 1.04342771, + "epoch": 0.028799038027957313, + "flos": 28379646505440.0, + "grad_norm": 1.952160103791701, + "language_loss": 0.81125897, + "learning_rate": 3.973662905576082e-06, + "loss": 0.83497345, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 2.01269531, + "router_z_loss_mlp": 0.32202148, + "step": 479, + "time_per_iteration": 2.685967445373535 + }, + { + "auxiliary_loss_clip": 0.01293026, + "auxiliary_loss_mlp": 0.01085532, + "balance_loss_clip": 1.09365654, + "balance_loss_mlp": 1.05115247, + "epoch": 0.02885916128062528, + "flos": 27044752124160.0, + "grad_norm": 2.236689186814991, + "language_loss": 0.73624945, + "learning_rate": 3.975005663484038e-06, + "loss": 0.76003504, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.34375, + "step": 480, + "time_per_iteration": 2.74837589263916 + }, + { + "auxiliary_loss_clip": 0.01291792, + "auxiliary_loss_mlp": 0.01072871, + "balance_loss_clip": 1.09437346, + "balance_loss_mlp": 1.04280663, + "epoch": 0.02891928453329325, + "flos": 27982593797760.0, + "grad_norm": 1.6108233346414733, + "language_loss": 0.87988627, + "learning_rate": 3.976345626888605e-06, + "loss": 0.90353292, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.3001709, + "step": 481, + "time_per_iteration": 2.7582058906555176 + }, + { + "auxiliary_loss_clip": 0.0116248, + "auxiliary_loss_mlp": 0.0100943, + "balance_loss_clip": 1.0664711, + "balance_loss_mlp": 1.00105548, + "epoch": 0.028979407785961222, + "flos": 70080326768160.0, + "grad_norm": 0.8215221439365239, + "language_loss": 0.66074032, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68245941, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.08380127, + "step": 482, + "time_per_iteration": 3.0921177864074707 + }, + { + "auxiliary_loss_clip": 0.01304199, + "auxiliary_loss_mlp": 0.01073694, + "balance_loss_clip": 1.09569991, + "balance_loss_mlp": 1.04227066, + "epoch": 0.02903953103862919, + "flos": 20405240280000.0, + "grad_norm": 2.3152776918434075, + "language_loss": 0.78717256, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81095147, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 2.08496094, + "router_z_loss_mlp": 0.31396484, + "step": 483, + "time_per_iteration": 2.6932523250579834 + }, + { + "auxiliary_loss_clip": 0.01301563, + "auxiliary_loss_mlp": 0.01090694, + "balance_loss_clip": 1.09732938, + "balance_loss_mlp": 1.05700517, + "epoch": 0.02909965429129716, + "flos": 20454746045760.0, + "grad_norm": 2.1027425954435572, + "language_loss": 0.75616586, + "learning_rate": 3.980348865796749e-06, + "loss": 0.78008848, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 2.04003906, + "router_z_loss_mlp": 0.33666992, + "step": 484, + "time_per_iteration": 2.742981433868408 + }, + { + "auxiliary_loss_clip": 0.01296462, + "auxiliary_loss_mlp": 0.01080289, + "balance_loss_clip": 1.0950644, + "balance_loss_mlp": 1.04891348, + "epoch": 0.029159777543965128, + "flos": 24142669030080.0, + "grad_norm": 2.906623564848116, + "language_loss": 0.8372339, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86100143, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 2.01367188, + "router_z_loss_mlp": 0.31420898, + "step": 485, + "time_per_iteration": 2.7264597415924072 + }, + { + "auxiliary_loss_clip": 0.01302371, + "auxiliary_loss_mlp": 0.01079069, + "balance_loss_clip": 1.10223341, + "balance_loss_mlp": 1.04650092, + "epoch": 0.029219900796633096, + "flos": 23968267541280.0, + "grad_norm": 2.053520632018532, + "language_loss": 0.84634387, + "learning_rate": 3.983003930109732e-06, + "loss": 0.87015837, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 2.00097656, + "router_z_loss_mlp": 0.32519531, + "step": 486, + "time_per_iteration": 2.724743604660034 + }, + { + "auxiliary_loss_clip": 0.01295464, + "auxiliary_loss_mlp": 0.01088338, + "balance_loss_clip": 1.09502506, + "balance_loss_mlp": 1.05555522, + "epoch": 0.02928002404930107, + "flos": 31586131988640.0, + "grad_norm": 1.7350456909030068, + "language_loss": 0.88742113, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91125917, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 2.00390625, + "router_z_loss_mlp": 0.32763672, + "step": 487, + "time_per_iteration": 2.8243248462677 + }, + { + "auxiliary_loss_clip": 0.01300261, + "auxiliary_loss_mlp": 0.01067382, + "balance_loss_clip": 1.09884286, + "balance_loss_mlp": 1.03759146, + "epoch": 0.029340147301969037, + "flos": 25210349534880.0, + "grad_norm": 2.801898816706939, + "language_loss": 0.88590324, + "learning_rate": 3.985648090637122e-06, + "loss": 0.90957963, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 2.01464844, + "router_z_loss_mlp": 0.29772949, + "step": 488, + "time_per_iteration": 2.721602201461792 + }, + { + "auxiliary_loss_clip": 0.01294527, + "auxiliary_loss_mlp": 0.01075568, + "balance_loss_clip": 1.09578252, + "balance_loss_mlp": 1.04450202, + "epoch": 0.029400270554637006, + "flos": 29807191205280.0, + "grad_norm": 1.8773352597680604, + "language_loss": 0.88946033, + "learning_rate": 3.986966109896785e-06, + "loss": 0.91316128, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 1.98730469, + "router_z_loss_mlp": 0.31030273, + "step": 489, + "time_per_iteration": 2.7507829666137695 + }, + { + "auxiliary_loss_clip": 0.01288958, + "auxiliary_loss_mlp": 0.0107234, + "balance_loss_clip": 1.09114242, + "balance_loss_mlp": 1.04010582, + "epoch": 0.029460393807304974, + "flos": 24551471748960.0, + "grad_norm": 1.9506457752922493, + "language_loss": 0.88496828, + "learning_rate": 3.988281436571815e-06, + "loss": 0.90858126, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 1.97949219, + "router_z_loss_mlp": 0.32226562, + "step": 490, + "time_per_iteration": 2.7089924812316895 + }, + { + "auxiliary_loss_clip": 0.0129609, + "auxiliary_loss_mlp": 0.01080259, + "balance_loss_clip": 1.09388971, + "balance_loss_mlp": 1.04916894, + "epoch": 0.029520517059972943, + "flos": 21567961622880.0, + "grad_norm": 2.3255072398898036, + "language_loss": 0.91649914, + "learning_rate": 3.989594081641164e-06, + "loss": 0.94026268, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 2.02441406, + "router_z_loss_mlp": 0.31054688, + "step": 491, + "time_per_iteration": 5.684825658798218 + }, + { + "auxiliary_loss_clip": 0.01285859, + "auxiliary_loss_mlp": 0.01072828, + "balance_loss_clip": 1.09357119, + "balance_loss_mlp": 1.04288268, + "epoch": 0.029580640312640915, + "flos": 23126560672320.0, + "grad_norm": 1.8434331028253435, + "language_loss": 0.85468102, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.87826794, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 1.92382812, + "router_z_loss_mlp": 0.29956055, + "step": 492, + "time_per_iteration": 5.643540382385254 + }, + { + "auxiliary_loss_clip": 0.01295349, + "auxiliary_loss_mlp": 0.01087155, + "balance_loss_clip": 1.09677112, + "balance_loss_mlp": 1.0555644, + "epoch": 0.029640763565308884, + "flos": 22850230407840.0, + "grad_norm": 3.230256102599805, + "language_loss": 0.84333551, + "learning_rate": 3.992211370544093e-06, + "loss": 0.8671605, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 1.98242188, + "router_z_loss_mlp": 0.31591797, + "step": 493, + "time_per_iteration": 2.6664443016052246 + }, + { + "auxiliary_loss_clip": 0.01294865, + "auxiliary_loss_mlp": 0.01078235, + "balance_loss_clip": 1.09488606, + "balance_loss_mlp": 1.04783654, + "epoch": 0.029700886817976852, + "flos": 25130381124960.0, + "grad_norm": 1.688239394912692, + "language_loss": 0.86506581, + "learning_rate": 3.99351603600268e-06, + "loss": 0.88879681, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 0.30419922, + "step": 494, + "time_per_iteration": 2.6787681579589844 + }, + { + "auxiliary_loss_clip": 0.01298933, + "auxiliary_loss_mlp": 0.01077611, + "balance_loss_clip": 1.09691179, + "balance_loss_mlp": 1.04878664, + "epoch": 0.02976101007064482, + "flos": 27133553301120.0, + "grad_norm": 2.0737454504865918, + "language_loss": 0.86803353, + "learning_rate": 3.994818063106668e-06, + "loss": 0.89179897, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 2.01855469, + "router_z_loss_mlp": 0.28808594, + "step": 495, + "time_per_iteration": 2.693882942199707 + }, + { + "auxiliary_loss_clip": 0.01287697, + "auxiliary_loss_mlp": 0.01075372, + "balance_loss_clip": 1.09469318, + "balance_loss_mlp": 1.04566455, + "epoch": 0.029821133323312793, + "flos": 28958839502400.0, + "grad_norm": 2.2032508229255208, + "language_loss": 0.61835563, + "learning_rate": 3.99611746250533e-06, + "loss": 0.64198637, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.29736328, + "step": 496, + "time_per_iteration": 2.752462863922119 + }, + { + "auxiliary_loss_clip": 0.01288685, + "auxiliary_loss_mlp": 0.01081476, + "balance_loss_clip": 1.09588099, + "balance_loss_mlp": 1.05149519, + "epoch": 0.02988125657598076, + "flos": 27356407106400.0, + "grad_norm": 1.8805517235837548, + "language_loss": 0.88759118, + "learning_rate": 3.997414244783595e-06, + "loss": 0.91129279, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.29956055, + "step": 497, + "time_per_iteration": 2.7599902153015137 + }, + { + "auxiliary_loss_clip": 0.01295272, + "auxiliary_loss_mlp": 0.01077777, + "balance_loss_clip": 1.09594035, + "balance_loss_mlp": 1.04714012, + "epoch": 0.02994137982864873, + "flos": 16893298958400.0, + "grad_norm": 2.375657560529604, + "language_loss": 0.850348, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87407845, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 1.99121094, + "router_z_loss_mlp": 0.30615234, + "step": 498, + "time_per_iteration": 2.654350996017456 + }, + { + "auxiliary_loss_clip": 0.01290591, + "auxiliary_loss_mlp": 0.01075172, + "balance_loss_clip": 1.09341526, + "balance_loss_mlp": 1.04565549, + "epoch": 0.0300015030813167, + "flos": 29173873472640.0, + "grad_norm": 3.3468984694669057, + "language_loss": 0.77949709, + "learning_rate": 4e-06, + "loss": 0.80315471, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 1.96875, + "router_z_loss_mlp": 0.29492188, + "step": 499, + "time_per_iteration": 2.70377516746521 + }, + { + "auxiliary_loss_clip": 0.01293455, + "auxiliary_loss_mlp": 0.01074712, + "balance_loss_clip": 1.09655881, + "balance_loss_mlp": 1.04495716, + "epoch": 0.030061626333984667, + "flos": 26865123906240.0, + "grad_norm": 1.7447271866745713, + "language_loss": 0.82806611, + "learning_rate": 3.9999999620799e-06, + "loss": 0.85174775, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 1.96777344, + "router_z_loss_mlp": 0.29760742, + "step": 500, + "time_per_iteration": 2.7171671390533447 + }, + { + "auxiliary_loss_clip": 0.01286831, + "auxiliary_loss_mlp": 0.01086348, + "balance_loss_clip": 1.0926379, + "balance_loss_mlp": 1.05351806, + "epoch": 0.03012174958665264, + "flos": 28113283491840.0, + "grad_norm": 2.420563675769102, + "language_loss": 0.88010347, + "learning_rate": 3.9999998483196e-06, + "loss": 0.9038353, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.328125, + "step": 501, + "time_per_iteration": 2.7150890827178955 + }, + { + "auxiliary_loss_clip": 0.01293153, + "auxiliary_loss_mlp": 0.01071472, + "balance_loss_clip": 1.09315753, + "balance_loss_mlp": 1.0418843, + "epoch": 0.030181872839320608, + "flos": 23126641706880.0, + "grad_norm": 2.3022373488079495, + "language_loss": 0.86446756, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.8881138, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.29589844, + "step": 502, + "time_per_iteration": 2.6985464096069336 + }, + { + "auxiliary_loss_clip": 0.01289416, + "auxiliary_loss_mlp": 0.0107395, + "balance_loss_clip": 1.09549642, + "balance_loss_mlp": 1.04347968, + "epoch": 0.030241996091988577, + "flos": 20411196320160.0, + "grad_norm": 2.283734430296965, + "language_loss": 0.84513319, + "learning_rate": 3.999999393278425e-06, + "loss": 0.86876678, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.30444336, + "step": 503, + "time_per_iteration": 2.660702705383301 + }, + { + "auxiliary_loss_clip": 0.01283532, + "auxiliary_loss_mlp": 0.01084383, + "balance_loss_clip": 1.09341204, + "balance_loss_mlp": 1.05437803, + "epoch": 0.030302119344656545, + "flos": 34924360615200.0, + "grad_norm": 1.5951376941930644, + "language_loss": 0.88255727, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90623641, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 1.90039062, + "router_z_loss_mlp": 0.29992676, + "step": 504, + "time_per_iteration": 2.774080991744995 + }, + { + "auxiliary_loss_clip": 0.01286309, + "auxiliary_loss_mlp": 0.01086064, + "balance_loss_clip": 1.09337389, + "balance_loss_mlp": 1.05633354, + "epoch": 0.030362242597324514, + "flos": 19119365457120.0, + "grad_norm": 2.182523659220912, + "language_loss": 0.78091925, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80464292, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.29736328, + "step": 505, + "time_per_iteration": 2.6820194721221924 + }, + { + "auxiliary_loss_clip": 0.01157273, + "auxiliary_loss_mlp": 0.01007987, + "balance_loss_clip": 1.0649122, + "balance_loss_mlp": 1.00044119, + "epoch": 0.030422365849992486, + "flos": 88007521170240.0, + "grad_norm": 0.8496684663822287, + "language_loss": 0.55062658, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57227916, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.07531738, + "step": 506, + "time_per_iteration": 3.50586199760437 + }, + { + "auxiliary_loss_clip": 0.01286675, + "auxiliary_loss_mlp": 0.01085472, + "balance_loss_clip": 1.09146202, + "balance_loss_mlp": 1.05442941, + "epoch": 0.030482489102660455, + "flos": 23705713152000.0, + "grad_norm": 1.9187681172315312, + "language_loss": 0.83147061, + "learning_rate": 3.999997573114069e-06, + "loss": 0.85519207, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.31030273, + "step": 507, + "time_per_iteration": 2.7072973251342773 + }, + { + "auxiliary_loss_clip": 0.01290388, + "auxiliary_loss_mlp": 0.01073987, + "balance_loss_clip": 1.09275937, + "balance_loss_mlp": 1.04366016, + "epoch": 0.030542612355328423, + "flos": 24862600006560.0, + "grad_norm": 6.0864237708440845, + "language_loss": 0.88436508, + "learning_rate": 3.999996928472659e-06, + "loss": 0.90800875, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 1.97558594, + "router_z_loss_mlp": 0.30297852, + "step": 508, + "time_per_iteration": 2.7570302486419678 + }, + { + "auxiliary_loss_clip": 0.01292165, + "auxiliary_loss_mlp": 0.01060614, + "balance_loss_clip": 1.09298182, + "balance_loss_mlp": 1.03073967, + "epoch": 0.030602735607996392, + "flos": 42314509183680.0, + "grad_norm": 1.802100712350147, + "language_loss": 0.71508986, + "learning_rate": 3.999996207991165e-06, + "loss": 0.73861766, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.29858398, + "step": 509, + "time_per_iteration": 2.8419761657714844 + }, + { + "auxiliary_loss_clip": 0.01283876, + "auxiliary_loss_mlp": 0.01066057, + "balance_loss_clip": 1.09195042, + "balance_loss_mlp": 1.03742313, + "epoch": 0.03066285886066436, + "flos": 28868660737920.0, + "grad_norm": 1.9266868557364785, + "language_loss": 0.82277578, + "learning_rate": 3.999995411669614e-06, + "loss": 0.84627509, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 1.91894531, + "router_z_loss_mlp": 0.28625488, + "step": 510, + "time_per_iteration": 2.7010416984558105 + }, + { + "auxiliary_loss_clip": 0.01289437, + "auxiliary_loss_mlp": 0.01080779, + "balance_loss_clip": 1.0960865, + "balance_loss_mlp": 1.050547, + "epoch": 0.030722982113332332, + "flos": 28068437213280.0, + "grad_norm": 2.0492911830746596, + "language_loss": 0.83828473, + "learning_rate": 3.999994539508036e-06, + "loss": 0.86198688, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 1.93164062, + "router_z_loss_mlp": 0.30224609, + "step": 511, + "time_per_iteration": 2.716562271118164 + }, + { + "auxiliary_loss_clip": 0.01290163, + "auxiliary_loss_mlp": 0.01075311, + "balance_loss_clip": 1.09131527, + "balance_loss_mlp": 1.04584265, + "epoch": 0.0307831053660003, + "flos": 30201650807040.0, + "grad_norm": 2.038121649527241, + "language_loss": 0.82236314, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.8460179, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 1.98925781, + "router_z_loss_mlp": 0.29467773, + "step": 512, + "time_per_iteration": 2.7243590354919434 + }, + { + "auxiliary_loss_clip": 0.01285628, + "auxiliary_loss_mlp": 0.01078828, + "balance_loss_clip": 1.09138095, + "balance_loss_mlp": 1.0480957, + "epoch": 0.03084322861866827, + "flos": 31897543867200.0, + "grad_norm": 1.8562818431585277, + "language_loss": 0.87150109, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89514565, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 1.94042969, + "router_z_loss_mlp": 0.30737305, + "step": 513, + "time_per_iteration": 2.813098430633545 + }, + { + "auxiliary_loss_clip": 0.01295045, + "auxiliary_loss_mlp": 0.01074854, + "balance_loss_clip": 1.09455037, + "balance_loss_mlp": 1.04395473, + "epoch": 0.03090335187133624, + "flos": 22904557729920.0, + "grad_norm": 1.7249915942942007, + "language_loss": 0.79275829, + "learning_rate": 3.999991467983491e-06, + "loss": 0.81645727, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 2.00683594, + "router_z_loss_mlp": 0.30908203, + "step": 514, + "time_per_iteration": 2.6912553310394287 + }, + { + "auxiliary_loss_clip": 0.01285105, + "auxiliary_loss_mlp": 0.01061827, + "balance_loss_clip": 1.09376895, + "balance_loss_mlp": 1.03341901, + "epoch": 0.030963475124004207, + "flos": 28335570055200.0, + "grad_norm": 8.349238606640155, + "language_loss": 0.77685082, + "learning_rate": 3.999990292462167e-06, + "loss": 0.80032015, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 1.91113281, + "router_z_loss_mlp": 0.28381348, + "step": 515, + "time_per_iteration": 2.691058874130249 + }, + { + "auxiliary_loss_clip": 0.01284217, + "auxiliary_loss_mlp": 0.01066664, + "balance_loss_clip": 1.08914375, + "balance_loss_mlp": 1.03645647, + "epoch": 0.03102359837667218, + "flos": 51891185736000.0, + "grad_norm": 1.933656339290599, + "language_loss": 0.82418036, + "learning_rate": 3.999989041101011e-06, + "loss": 0.84768915, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 1.95117188, + "router_z_loss_mlp": 0.30200195, + "step": 516, + "time_per_iteration": 2.8771228790283203 + }, + { + "auxiliary_loss_clip": 0.01283102, + "auxiliary_loss_mlp": 0.01080655, + "balance_loss_clip": 1.09140897, + "balance_loss_mlp": 1.05035162, + "epoch": 0.031083721629340148, + "flos": 25840344850560.0, + "grad_norm": 1.7870348128302003, + "language_loss": 0.78719902, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81083655, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 1.91699219, + "router_z_loss_mlp": 0.30310059, + "step": 517, + "time_per_iteration": 2.688565492630005 + }, + { + "auxiliary_loss_clip": 0.01278783, + "auxiliary_loss_mlp": 0.01070757, + "balance_loss_clip": 1.09067035, + "balance_loss_mlp": 1.04131222, + "epoch": 0.031143844882008116, + "flos": 36260429997600.0, + "grad_norm": 2.09135419025503, + "language_loss": 0.90623808, + "learning_rate": 3.999986310859396e-06, + "loss": 0.92973351, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.29467773, + "step": 518, + "time_per_iteration": 2.7601451873779297 + }, + { + "auxiliary_loss_clip": 0.01291619, + "auxiliary_loss_mlp": 0.01087784, + "balance_loss_clip": 1.09896398, + "balance_loss_mlp": 1.05543077, + "epoch": 0.031203968134676085, + "flos": 28202044151520.0, + "grad_norm": 2.0580966494448694, + "language_loss": 0.86092174, + "learning_rate": 3.999984831979039e-06, + "loss": 0.8847158, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.32348633, + "step": 519, + "time_per_iteration": 2.6775901317596436 + }, + { + "auxiliary_loss_clip": 0.01290318, + "auxiliary_loss_mlp": 0.0108243, + "balance_loss_clip": 1.09200549, + "balance_loss_mlp": 1.05336642, + "epoch": 0.03126409138734405, + "flos": 25569079246080.0, + "grad_norm": 2.0950791897501038, + "language_loss": 0.86893618, + "learning_rate": 3.999983277259057e-06, + "loss": 0.89266372, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 1.98144531, + "router_z_loss_mlp": 0.29052734, + "step": 520, + "time_per_iteration": 2.746575117111206 + }, + { + "auxiliary_loss_clip": 0.01289296, + "auxiliary_loss_mlp": 0.01089083, + "balance_loss_clip": 1.0918057, + "balance_loss_mlp": 1.05842233, + "epoch": 0.031324214640012026, + "flos": 26417836121760.0, + "grad_norm": 2.451974560387822, + "language_loss": 0.89508533, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91886908, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.30688477, + "step": 521, + "time_per_iteration": 2.7223355770111084 + }, + { + "auxiliary_loss_clip": 0.01286915, + "auxiliary_loss_mlp": 0.0107951, + "balance_loss_clip": 1.09448862, + "balance_loss_mlp": 1.04830074, + "epoch": 0.03138433789267999, + "flos": 28603310656320.0, + "grad_norm": 2.8832178530838912, + "language_loss": 0.7128737, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73653793, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.31176758, + "step": 522, + "time_per_iteration": 2.7236719131469727 + }, + { + "auxiliary_loss_clip": 0.01287344, + "auxiliary_loss_mlp": 0.01074691, + "balance_loss_clip": 1.08993411, + "balance_loss_mlp": 1.04531777, + "epoch": 0.03144446114534796, + "flos": 23162614701120.0, + "grad_norm": 2.9895658718885243, + "language_loss": 0.84799325, + "learning_rate": 3.999978158061963e-06, + "loss": 0.87161356, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 1.97265625, + "router_z_loss_mlp": 0.29382324, + "step": 523, + "time_per_iteration": 2.6787009239196777 + }, + { + "auxiliary_loss_clip": 0.01292965, + "auxiliary_loss_mlp": 0.01069037, + "balance_loss_clip": 1.09280944, + "balance_loss_mlp": 1.03959227, + "epoch": 0.031504584398015935, + "flos": 27622729602720.0, + "grad_norm": 2.1278600383897164, + "language_loss": 0.90369958, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92731965, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 1.99902344, + "router_z_loss_mlp": 0.29443359, + "step": 524, + "time_per_iteration": 2.7257301807403564 + }, + { + "auxiliary_loss_clip": 0.01296323, + "auxiliary_loss_mlp": 0.01078115, + "balance_loss_clip": 1.09669769, + "balance_loss_mlp": 1.04794323, + "epoch": 0.0315647076506839, + "flos": 30868267393440.0, + "grad_norm": 2.269109663585886, + "language_loss": 0.8056981, + "learning_rate": 3.999974366066933e-06, + "loss": 0.8294425, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 1.99707031, + "router_z_loss_mlp": 0.30163574, + "step": 525, + "time_per_iteration": 2.7086076736450195 + }, + { + "auxiliary_loss_clip": 0.01286034, + "auxiliary_loss_mlp": 0.01077662, + "balance_loss_clip": 1.09026027, + "balance_loss_mlp": 1.04760897, + "epoch": 0.03162483090335187, + "flos": 20722203025920.0, + "grad_norm": 2.3101428646348814, + "language_loss": 0.80829293, + "learning_rate": 3.999972356310538e-06, + "loss": 0.83192986, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 1.95605469, + "router_z_loss_mlp": 0.30065918, + "step": 526, + "time_per_iteration": 2.6521966457366943 + }, + { + "auxiliary_loss_clip": 0.01294566, + "auxiliary_loss_mlp": 0.01062807, + "balance_loss_clip": 1.09717906, + "balance_loss_mlp": 1.03085947, + "epoch": 0.03168495415601984, + "flos": 22859670934080.0, + "grad_norm": 1.9451273429322677, + "language_loss": 0.81614447, + "learning_rate": 3.999970270714991e-06, + "loss": 0.83971816, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 1.97363281, + "router_z_loss_mlp": 0.31958008, + "step": 527, + "time_per_iteration": 2.6760141849517822 + }, + { + "auxiliary_loss_clip": 0.01283607, + "auxiliary_loss_mlp": 0.01078233, + "balance_loss_clip": 1.08933449, + "balance_loss_mlp": 1.04757237, + "epoch": 0.03174507740868781, + "flos": 25886487682080.0, + "grad_norm": 1.9816403408339125, + "language_loss": 0.93536168, + "learning_rate": 3.999968109280371e-06, + "loss": 0.95898008, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 1.94335938, + "router_z_loss_mlp": 0.30688477, + "step": 528, + "time_per_iteration": 2.702369213104248 + }, + { + "auxiliary_loss_clip": 0.01284965, + "auxiliary_loss_mlp": 0.01072616, + "balance_loss_clip": 1.09096885, + "balance_loss_mlp": 1.04292083, + "epoch": 0.03180520066135578, + "flos": 30246537602880.0, + "grad_norm": 1.8330292604753082, + "language_loss": 0.84045988, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86403567, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 1.94140625, + "router_z_loss_mlp": 0.29675293, + "step": 529, + "time_per_iteration": 2.707425117492676 + }, + { + "auxiliary_loss_clip": 0.01288253, + "auxiliary_loss_mlp": 0.01080738, + "balance_loss_clip": 1.09773123, + "balance_loss_mlp": 1.05088854, + "epoch": 0.03186532391402375, + "flos": 30339674128800.0, + "grad_norm": 1.8228277608868515, + "language_loss": 0.90749401, + "learning_rate": 3.999963558894243e-06, + "loss": 0.93118393, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 1.90429688, + "router_z_loss_mlp": 0.29846191, + "step": 530, + "time_per_iteration": 4.157374382019043 + }, + { + "auxiliary_loss_clip": 0.01282534, + "auxiliary_loss_mlp": 0.01071117, + "balance_loss_clip": 1.08759701, + "balance_loss_mlp": 1.0397172, + "epoch": 0.03192544716669172, + "flos": 25886690268480.0, + "grad_norm": 2.2707148896417917, + "language_loss": 0.76120454, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78474104, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 1.94921875, + "router_z_loss_mlp": 0.3137207, + "step": 531, + "time_per_iteration": 5.573816537857056 + }, + { + "auxiliary_loss_clip": 0.01284459, + "auxiliary_loss_mlp": 0.01060794, + "balance_loss_clip": 1.09079695, + "balance_loss_mlp": 1.03022861, + "epoch": 0.03198557041935969, + "flos": 29716080543360.0, + "grad_norm": 2.6534020493711763, + "language_loss": 0.90563679, + "learning_rate": 3.999958705152843e-06, + "loss": 0.92908931, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.30566406, + "step": 532, + "time_per_iteration": 2.7183306217193604 + }, + { + "auxiliary_loss_clip": 0.01156631, + "auxiliary_loss_mlp": 0.01008297, + "balance_loss_clip": 1.06570315, + "balance_loss_mlp": 1.00138235, + "epoch": 0.032045693672027656, + "flos": 75441256899840.0, + "grad_norm": 0.7312067720038417, + "language_loss": 0.5794974, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.6011467, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.06933594, + "step": 533, + "time_per_iteration": 3.5429389476776123 + }, + { + "auxiliary_loss_clip": 0.01282554, + "auxiliary_loss_mlp": 0.01078332, + "balance_loss_clip": 1.09064353, + "balance_loss_mlp": 1.04857755, + "epoch": 0.03210581692469563, + "flos": 34655890703040.0, + "grad_norm": 1.7965105376443884, + "language_loss": 0.86711097, + "learning_rate": 3.999953548056907e-06, + "loss": 0.89071977, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 1.91796875, + "router_z_loss_mlp": 0.29711914, + "step": 534, + "time_per_iteration": 2.764211416244507 + }, + { + "auxiliary_loss_clip": 0.01284266, + "auxiliary_loss_mlp": 0.01064391, + "balance_loss_clip": 1.09210873, + "balance_loss_mlp": 1.03460002, + "epoch": 0.03216594017736359, + "flos": 30160896773760.0, + "grad_norm": 2.3771307638820125, + "language_loss": 0.77455962, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79804623, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.29772949, + "step": 535, + "time_per_iteration": 2.7505033016204834 + }, + { + "auxiliary_loss_clip": 0.01283907, + "auxiliary_loss_mlp": 0.01078088, + "balance_loss_clip": 1.09115458, + "balance_loss_mlp": 1.04809499, + "epoch": 0.032226063430031565, + "flos": 38041153541280.0, + "grad_norm": 2.7453817804652494, + "language_loss": 0.80639583, + "learning_rate": 3.999948087607219e-06, + "loss": 0.83001578, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.29980469, + "step": 536, + "time_per_iteration": 2.7733471393585205 + }, + { + "auxiliary_loss_clip": 0.01286062, + "auxiliary_loss_mlp": 0.01078998, + "balance_loss_clip": 1.09306884, + "balance_loss_mlp": 1.04678798, + "epoch": 0.03228618668269954, + "flos": 39289110540480.0, + "grad_norm": 1.7616989825125837, + "language_loss": 0.69799447, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72164512, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 1.92871094, + "router_z_loss_mlp": 0.32202148, + "step": 537, + "time_per_iteration": 2.7808544635772705 + }, + { + "auxiliary_loss_clip": 0.01284087, + "auxiliary_loss_mlp": 0.01076308, + "balance_loss_clip": 1.09575665, + "balance_loss_mlp": 1.04688668, + "epoch": 0.0323463099353675, + "flos": 27662713807680.0, + "grad_norm": 1.9928763736667272, + "language_loss": 0.82828778, + "learning_rate": 3.999942323804607e-06, + "loss": 0.85189176, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 1.8828125, + "router_z_loss_mlp": 0.29394531, + "step": 538, + "time_per_iteration": 2.6789090633392334 + }, + { + "auxiliary_loss_clip": 0.01293, + "auxiliary_loss_mlp": 0.01076304, + "balance_loss_clip": 1.09344292, + "balance_loss_mlp": 1.04583442, + "epoch": 0.032406433188035474, + "flos": 32830361398080.0, + "grad_norm": 1.8126056081972506, + "language_loss": 0.79475737, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81845045, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 1.99511719, + "router_z_loss_mlp": 0.3046875, + "step": 539, + "time_per_iteration": 2.700892925262451 + }, + { + "auxiliary_loss_clip": 0.01285783, + "auxiliary_loss_mlp": 0.01068065, + "balance_loss_clip": 1.09352636, + "balance_loss_mlp": 1.03668928, + "epoch": 0.03246655644070344, + "flos": 38441974356000.0, + "grad_norm": 2.2369403981674987, + "language_loss": 0.7770803, + "learning_rate": 3.999936256649943e-06, + "loss": 0.80061877, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 1.91992188, + "router_z_loss_mlp": 0.3137207, + "step": 540, + "time_per_iteration": 2.745432138442993 + }, + { + "auxiliary_loss_clip": 0.01289585, + "auxiliary_loss_mlp": 0.01070376, + "balance_loss_clip": 1.09473956, + "balance_loss_mlp": 1.04045463, + "epoch": 0.03252667969337141, + "flos": 28331558844480.0, + "grad_norm": 2.3085903883721204, + "language_loss": 0.85473299, + "learning_rate": 3.999933109315878e-06, + "loss": 0.87833256, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 1.94824219, + "router_z_loss_mlp": 0.29956055, + "step": 541, + "time_per_iteration": 2.6990392208099365 + }, + { + "auxiliary_loss_clip": 0.01282418, + "auxiliary_loss_mlp": 0.0107752, + "balance_loss_clip": 1.09246969, + "balance_loss_mlp": 1.04578662, + "epoch": 0.032586802946039384, + "flos": 18006595570080.0, + "grad_norm": 2.509568278819291, + "language_loss": 0.89065266, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91425204, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 1.90136719, + "router_z_loss_mlp": 0.31713867, + "step": 542, + "time_per_iteration": 2.639021635055542 + }, + { + "auxiliary_loss_clip": 0.01283682, + "auxiliary_loss_mlp": 0.01075475, + "balance_loss_clip": 1.09157538, + "balance_loss_mlp": 1.04507661, + "epoch": 0.03264692619870735, + "flos": 29627522470080.0, + "grad_norm": 1.9249774116557412, + "language_loss": 0.71297204, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73656368, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.30432129, + "step": 543, + "time_per_iteration": 2.7212932109832764 + }, + { + "auxiliary_loss_clip": 0.01281756, + "auxiliary_loss_mlp": 0.01082234, + "balance_loss_clip": 1.08663535, + "balance_loss_mlp": 1.05236018, + "epoch": 0.03270704945137532, + "flos": 27934465619520.0, + "grad_norm": 9.517345738762137, + "language_loss": 0.91365469, + "learning_rate": 3.999923212288192e-06, + "loss": 0.93729454, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 1.95019531, + "router_z_loss_mlp": 0.29858398, + "step": 544, + "time_per_iteration": 2.680330276489258 + }, + { + "auxiliary_loss_clip": 0.01288012, + "auxiliary_loss_mlp": 0.01078175, + "balance_loss_clip": 1.09422934, + "balance_loss_mlp": 1.04970777, + "epoch": 0.032767172704043286, + "flos": 22013466647040.0, + "grad_norm": 2.4847655443584173, + "language_loss": 0.66348958, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68715143, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 1.93652344, + "router_z_loss_mlp": 0.28491211, + "step": 545, + "time_per_iteration": 2.7060916423797607 + }, + { + "auxiliary_loss_clip": 0.01282851, + "auxiliary_loss_mlp": 0.01064618, + "balance_loss_clip": 1.08942819, + "balance_loss_mlp": 1.03538835, + "epoch": 0.03282729595671126, + "flos": 27489730423680.0, + "grad_norm": 2.1273377088862917, + "language_loss": 0.92075419, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94422889, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 1.9375, + "router_z_loss_mlp": 0.29223633, + "step": 546, + "time_per_iteration": 2.6823947429656982 + }, + { + "auxiliary_loss_clip": 0.01282516, + "auxiliary_loss_mlp": 0.01070721, + "balance_loss_clip": 1.08758378, + "balance_loss_mlp": 1.03915453, + "epoch": 0.03288741920937923, + "flos": 24905217834720.0, + "grad_norm": 3.766090682343355, + "language_loss": 0.81405032, + "learning_rate": 3.999912632724925e-06, + "loss": 0.83758271, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 1.95214844, + "router_z_loss_mlp": 0.31567383, + "step": 547, + "time_per_iteration": 2.6843996047973633 + }, + { + "auxiliary_loss_clip": 0.01283814, + "auxiliary_loss_mlp": 0.01072716, + "balance_loss_clip": 1.09050441, + "balance_loss_mlp": 1.04210341, + "epoch": 0.032947542462047195, + "flos": 25353275447520.0, + "grad_norm": 1.7937653877092594, + "language_loss": 0.81258762, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83615291, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 1.93457031, + "router_z_loss_mlp": 0.30615234, + "step": 548, + "time_per_iteration": 2.719539165496826 + }, + { + "auxiliary_loss_clip": 0.01285927, + "auxiliary_loss_mlp": 0.01073803, + "balance_loss_clip": 1.0922184, + "balance_loss_mlp": 1.04235578, + "epoch": 0.03300766571471517, + "flos": 24283528561440.0, + "grad_norm": 2.9066667093667946, + "language_loss": 0.6750918, + "learning_rate": 3.999905200498087e-06, + "loss": 0.6986891, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 1.93945312, + "router_z_loss_mlp": 0.31445312, + "step": 549, + "time_per_iteration": 2.752063274383545 + }, + { + "auxiliary_loss_clip": 0.01279415, + "auxiliary_loss_mlp": 0.01070349, + "balance_loss_clip": 1.09190762, + "balance_loss_mlp": 1.04040337, + "epoch": 0.03306778896738313, + "flos": 21924098228160.0, + "grad_norm": 1.8289086915661628, + "language_loss": 0.85989219, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88338983, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 1.87304688, + "router_z_loss_mlp": 0.29931641, + "step": 550, + "time_per_iteration": 2.699363946914673 + }, + { + "auxiliary_loss_clip": 0.01284977, + "auxiliary_loss_mlp": 0.0108424, + "balance_loss_clip": 1.09503663, + "balance_loss_mlp": 1.05455637, + "epoch": 0.033127912220051105, + "flos": 26421523194240.0, + "grad_norm": 1.7264742110897138, + "language_loss": 0.81474316, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83843529, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 1.8984375, + "router_z_loss_mlp": 0.29711914, + "step": 551, + "time_per_iteration": 2.6710565090179443 + }, + { + "auxiliary_loss_clip": 0.01289741, + "auxiliary_loss_mlp": 0.01082747, + "balance_loss_clip": 1.09443688, + "balance_loss_mlp": 1.0518955, + "epoch": 0.03318803547271908, + "flos": 36836827302240.0, + "grad_norm": 6.818695197150097, + "language_loss": 0.86461425, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88833916, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 1.953125, + "router_z_loss_mlp": 0.30834961, + "step": 552, + "time_per_iteration": 2.746349573135376 + }, + { + "auxiliary_loss_clip": 0.01288462, + "auxiliary_loss_mlp": 0.01079394, + "balance_loss_clip": 1.09475029, + "balance_loss_mlp": 1.04670715, + "epoch": 0.03324815872538704, + "flos": 24862843110240.0, + "grad_norm": 2.230141557260979, + "language_loss": 0.92990017, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95357871, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.3269043, + "step": 553, + "time_per_iteration": 2.6787567138671875 + }, + { + "auxiliary_loss_clip": 0.01285283, + "auxiliary_loss_mlp": 0.01077805, + "balance_loss_clip": 1.09215295, + "balance_loss_mlp": 1.0451417, + "epoch": 0.033308281978055014, + "flos": 29888982892800.0, + "grad_norm": 2.381155859951256, + "language_loss": 0.78652179, + "learning_rate": 3.999885292792986e-06, + "loss": 0.81015259, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.32666016, + "step": 554, + "time_per_iteration": 2.700812339782715 + }, + { + "auxiliary_loss_clip": 0.01280378, + "auxiliary_loss_mlp": 0.01082167, + "balance_loss_clip": 1.09064162, + "balance_loss_mlp": 1.05088615, + "epoch": 0.03336840523072298, + "flos": 28553683338720.0, + "grad_norm": 2.1761513140457187, + "language_loss": 0.82241291, + "learning_rate": 3.999881083743795e-06, + "loss": 0.84603834, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.31274414, + "step": 555, + "time_per_iteration": 2.6895320415496826 + }, + { + "auxiliary_loss_clip": 0.01285872, + "auxiliary_loss_mlp": 0.0108043, + "balance_loss_clip": 1.08972883, + "balance_loss_mlp": 1.04881561, + "epoch": 0.03342852848339095, + "flos": 37279050426720.0, + "grad_norm": 2.1586327326262564, + "language_loss": 0.88678348, + "learning_rate": 3.999876798858914e-06, + "loss": 0.91044652, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 1.9609375, + "router_z_loss_mlp": 0.31591797, + "step": 556, + "time_per_iteration": 2.785430431365967 + }, + { + "auxiliary_loss_clip": 0.0128322, + "auxiliary_loss_mlp": 0.0108019, + "balance_loss_clip": 1.09075403, + "balance_loss_mlp": 1.04781246, + "epoch": 0.03348865173605892, + "flos": 27934627688640.0, + "grad_norm": 2.8577193109985433, + "language_loss": 0.83959568, + "learning_rate": 3.999872438138503e-06, + "loss": 0.86322975, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 1.92480469, + "router_z_loss_mlp": 0.32373047, + "step": 557, + "time_per_iteration": 2.690833806991577 + }, + { + "auxiliary_loss_clip": 0.01287773, + "auxiliary_loss_mlp": 0.01065488, + "balance_loss_clip": 1.09550893, + "balance_loss_mlp": 1.03611469, + "epoch": 0.03354877498872689, + "flos": 21568083174720.0, + "grad_norm": 2.631272445350443, + "language_loss": 0.94355863, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96709126, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 1.92285156, + "router_z_loss_mlp": 0.29394531, + "step": 558, + "time_per_iteration": 2.6726233959198 + }, + { + "auxiliary_loss_clip": 0.0128072, + "auxiliary_loss_mlp": 0.01066314, + "balance_loss_clip": 1.08869028, + "balance_loss_mlp": 1.03558207, + "epoch": 0.03360889824139486, + "flos": 26421077504160.0, + "grad_norm": 2.0428071590603345, + "language_loss": 0.77516067, + "learning_rate": 3.99986348919176e-06, + "loss": 0.79863101, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.30688477, + "step": 559, + "time_per_iteration": 2.6673333644866943 + }, + { + "auxiliary_loss_clip": 0.01281017, + "auxiliary_loss_mlp": 0.01075833, + "balance_loss_clip": 1.08893991, + "balance_loss_mlp": 1.04717469, + "epoch": 0.033669021494062826, + "flos": 26595560027520.0, + "grad_norm": 1.9981880266817866, + "language_loss": 0.87657893, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.90014744, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.28649902, + "step": 560, + "time_per_iteration": 2.6971116065979004 + }, + { + "auxiliary_loss_clip": 0.01278014, + "auxiliary_loss_mlp": 0.01067379, + "balance_loss_clip": 1.08968413, + "balance_loss_mlp": 1.03948426, + "epoch": 0.0337291447467308, + "flos": 26681808615840.0, + "grad_norm": 2.355398859865772, + "language_loss": 0.81693417, + "learning_rate": 3.999854236904925e-06, + "loss": 0.84038818, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.27905273, + "step": 561, + "time_per_iteration": 2.6777572631835938 + }, + { + "auxiliary_loss_clip": 0.01276636, + "auxiliary_loss_mlp": 0.01067644, + "balance_loss_clip": 1.08971238, + "balance_loss_mlp": 1.03893805, + "epoch": 0.03378926799939877, + "flos": 29584985676480.0, + "grad_norm": 1.6479577158092693, + "language_loss": 0.8231945, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84663737, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 1.8671875, + "router_z_loss_mlp": 0.28710938, + "step": 562, + "time_per_iteration": 2.716874599456787 + }, + { + "auxiliary_loss_clip": 0.01282925, + "auxiliary_loss_mlp": 0.01078288, + "balance_loss_clip": 1.09186339, + "balance_loss_mlp": 1.04786551, + "epoch": 0.033849391252066735, + "flos": 20142604856160.0, + "grad_norm": 2.055922752384587, + "language_loss": 0.84412706, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86773914, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 1.90917969, + "router_z_loss_mlp": 0.30419922, + "step": 563, + "time_per_iteration": 2.6525497436523438 + }, + { + "auxiliary_loss_clip": 0.01280391, + "auxiliary_loss_mlp": 0.01074727, + "balance_loss_clip": 1.091223, + "balance_loss_mlp": 1.0449959, + "epoch": 0.03390951450473471, + "flos": 19119324939840.0, + "grad_norm": 2.3313177973530745, + "language_loss": 0.94108915, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96464038, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 1.89355469, + "router_z_loss_mlp": 0.296875, + "step": 564, + "time_per_iteration": 2.625429391860962 + }, + { + "auxiliary_loss_clip": 0.01280022, + "auxiliary_loss_mlp": 0.01072312, + "balance_loss_clip": 1.0870775, + "balance_loss_mlp": 1.04267693, + "epoch": 0.03396963775740267, + "flos": 27667940536800.0, + "grad_norm": 2.4414548200392407, + "language_loss": 0.93989569, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96341908, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 1.93066406, + "router_z_loss_mlp": 0.29614258, + "step": 565, + "time_per_iteration": 2.6809895038604736 + }, + { + "auxiliary_loss_clip": 0.01161744, + "auxiliary_loss_mlp": 0.01013263, + "balance_loss_clip": 1.07116115, + "balance_loss_mlp": 1.00633681, + "epoch": 0.034029761010070644, + "flos": 78570610182720.0, + "grad_norm": 1.1057659261155788, + "language_loss": 0.54841501, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.57016504, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 0.90576172, + "router_z_loss_mlp": 0.06945801, + "step": 566, + "time_per_iteration": 3.3661303520202637 + }, + { + "auxiliary_loss_clip": 0.01282821, + "auxiliary_loss_mlp": 0.01071112, + "balance_loss_clip": 1.09057474, + "balance_loss_mlp": 1.04004645, + "epoch": 0.034089884262738616, + "flos": 30510996304320.0, + "grad_norm": 3.0877072117303155, + "language_loss": 0.76907945, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79261875, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 1.92089844, + "router_z_loss_mlp": 0.31054688, + "step": 567, + "time_per_iteration": 2.8109772205352783 + }, + { + "auxiliary_loss_clip": 0.01275923, + "auxiliary_loss_mlp": 0.01077567, + "balance_loss_clip": 1.08941233, + "balance_loss_mlp": 1.04871798, + "epoch": 0.03415000751540658, + "flos": 34702033534560.0, + "grad_norm": 1.918602500676127, + "language_loss": 0.80732584, + "learning_rate": 3.999819465118447e-06, + "loss": 0.83086073, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 1.86523438, + "router_z_loss_mlp": 0.28857422, + "step": 568, + "time_per_iteration": 2.7368645668029785 + }, + { + "auxiliary_loss_clip": 0.01277653, + "auxiliary_loss_mlp": 0.01080034, + "balance_loss_clip": 1.09173632, + "balance_loss_mlp": 1.04992163, + "epoch": 0.034210130768074554, + "flos": 26196400421280.0, + "grad_norm": 2.1774131672285804, + "language_loss": 0.86588842, + "learning_rate": 3.999814194385413e-06, + "loss": 0.88946521, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.30126953, + "step": 569, + "time_per_iteration": 5.728067398071289 + }, + { + "auxiliary_loss_clip": 0.0127865, + "auxiliary_loss_mlp": 0.01076838, + "balance_loss_clip": 1.08925557, + "balance_loss_mlp": 1.04786968, + "epoch": 0.03427025402074252, + "flos": 22814581551840.0, + "grad_norm": 1.6509610671669555, + "language_loss": 0.9590705, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98262537, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 1.89160156, + "router_z_loss_mlp": 0.28955078, + "step": 570, + "time_per_iteration": 4.207317113876343 + }, + { + "auxiliary_loss_clip": 0.01279624, + "auxiliary_loss_mlp": 0.01076814, + "balance_loss_clip": 1.0861007, + "balance_loss_mlp": 1.04462779, + "epoch": 0.03433037727341049, + "flos": 25441225761600.0, + "grad_norm": 2.095081585101255, + "language_loss": 0.79678583, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.82035023, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 1.93554688, + "router_z_loss_mlp": 0.32177734, + "step": 571, + "time_per_iteration": 4.135289907455444 + }, + { + "auxiliary_loss_clip": 0.01277709, + "auxiliary_loss_mlp": 0.01079341, + "balance_loss_clip": 1.09001708, + "balance_loss_mlp": 1.04856133, + "epoch": 0.03439050052607846, + "flos": 31006088128800.0, + "grad_norm": 2.6327614967484743, + "language_loss": 0.80364472, + "learning_rate": 3.999797927188199e-06, + "loss": 0.82721519, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 1.87695312, + "router_z_loss_mlp": 0.30737305, + "step": 572, + "time_per_iteration": 2.7468202114105225 + }, + { + "auxiliary_loss_clip": 0.01286881, + "auxiliary_loss_mlp": 0.01071018, + "balance_loss_clip": 1.09421659, + "balance_loss_mlp": 1.04128742, + "epoch": 0.03445062377874643, + "flos": 21524249828160.0, + "grad_norm": 1.823775015462354, + "language_loss": 0.84518087, + "learning_rate": 3.999792353123774e-06, + "loss": 0.86875987, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 1.92578125, + "router_z_loss_mlp": 0.29736328, + "step": 573, + "time_per_iteration": 2.653341770172119 + }, + { + "auxiliary_loss_clip": 0.01279189, + "auxiliary_loss_mlp": 0.01063492, + "balance_loss_clip": 1.08686447, + "balance_loss_mlp": 1.03528702, + "epoch": 0.0345107470314144, + "flos": 20455718460480.0, + "grad_norm": 2.62433188903846, + "language_loss": 0.76555479, + "learning_rate": 3.999786703227023e-06, + "loss": 0.78898156, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 1.921875, + "router_z_loss_mlp": 0.28222656, + "step": 574, + "time_per_iteration": 2.6905345916748047 + }, + { + "auxiliary_loss_clip": 0.01278569, + "auxiliary_loss_mlp": 0.01064916, + "balance_loss_clip": 1.088781, + "balance_loss_mlp": 1.03609097, + "epoch": 0.03457087028408237, + "flos": 17917510772160.0, + "grad_norm": 11.892087102224158, + "language_loss": 0.83824927, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.86168414, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 1.89746094, + "router_z_loss_mlp": 0.28857422, + "step": 575, + "time_per_iteration": 2.6563708782196045 + }, + { + "auxiliary_loss_clip": 0.0127505, + "auxiliary_loss_mlp": 0.01070195, + "balance_loss_clip": 1.09055221, + "balance_loss_mlp": 1.04163194, + "epoch": 0.03463099353675034, + "flos": 24417743258880.0, + "grad_norm": 2.4866163582031926, + "language_loss": 0.84052527, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86397773, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 1.84375, + "router_z_loss_mlp": 0.28564453, + "step": 576, + "time_per_iteration": 2.7170403003692627 + }, + { + "auxiliary_loss_clip": 0.01280758, + "auxiliary_loss_mlp": 0.01072177, + "balance_loss_clip": 1.09762669, + "balance_loss_mlp": 1.04473495, + "epoch": 0.03469111678941831, + "flos": 30872886363360.0, + "grad_norm": 2.4122427079873496, + "language_loss": 0.86019319, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88372254, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.2746582, + "step": 577, + "time_per_iteration": 2.7063393592834473 + }, + { + "auxiliary_loss_clip": 0.01275449, + "auxiliary_loss_mlp": 0.01069945, + "balance_loss_clip": 1.09237504, + "balance_loss_mlp": 1.04055977, + "epoch": 0.034751240042086275, + "flos": 28381267196640.0, + "grad_norm": 1.9574580388044396, + "language_loss": 0.7212972, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.7447511, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.29333496, + "step": 578, + "time_per_iteration": 2.688535451889038 + }, + { + "auxiliary_loss_clip": 0.01277283, + "auxiliary_loss_mlp": 0.0107569, + "balance_loss_clip": 1.08876121, + "balance_loss_mlp": 1.04526806, + "epoch": 0.03481136329475425, + "flos": 29003483194560.0, + "grad_norm": 1.7703243039454695, + "language_loss": 0.77898484, + "learning_rate": 3.999757316265973e-06, + "loss": 0.80251455, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 1.88574219, + "router_z_loss_mlp": 0.30419922, + "step": 579, + "time_per_iteration": 2.726383924484253 + }, + { + "auxiliary_loss_clip": 0.01277228, + "auxiliary_loss_mlp": 0.01080276, + "balance_loss_clip": 1.09001958, + "balance_loss_mlp": 1.0500679, + "epoch": 0.03487148654742222, + "flos": 24596155958400.0, + "grad_norm": 1.8558797742605744, + "language_loss": 0.86920363, + "learning_rate": 3.999751211379863e-06, + "loss": 0.89277869, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 1.86914062, + "router_z_loss_mlp": 0.30200195, + "step": 580, + "time_per_iteration": 2.6553971767425537 + }, + { + "auxiliary_loss_clip": 0.01280435, + "auxiliary_loss_mlp": 0.01064549, + "balance_loss_clip": 1.08969879, + "balance_loss_mlp": 1.03719091, + "epoch": 0.034931609800090184, + "flos": 19119649078080.0, + "grad_norm": 2.2367426748409023, + "language_loss": 0.82125473, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84470451, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 1.90820312, + "router_z_loss_mlp": 0.27380371, + "step": 581, + "time_per_iteration": 2.6477763652801514 + }, + { + "auxiliary_loss_clip": 0.01276615, + "auxiliary_loss_mlp": 0.01068308, + "balance_loss_clip": 1.09165263, + "balance_loss_mlp": 1.04069889, + "epoch": 0.034991733052758156, + "flos": 19965083536800.0, + "grad_norm": 2.2476095077273794, + "language_loss": 0.77251971, + "learning_rate": 3.99973877411558e-06, + "loss": 0.79596889, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.27636719, + "step": 582, + "time_per_iteration": 2.673813581466675 + }, + { + "auxiliary_loss_clip": 0.01274686, + "auxiliary_loss_mlp": 0.01075763, + "balance_loss_clip": 1.09119558, + "balance_loss_mlp": 1.04627061, + "epoch": 0.03505185630542612, + "flos": 23656653076320.0, + "grad_norm": 2.1273706388975273, + "language_loss": 0.88139856, + "learning_rate": 3.999732441737877e-06, + "loss": 0.90490305, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 1.83691406, + "router_z_loss_mlp": 0.29467773, + "step": 583, + "time_per_iteration": 2.6483027935028076 + }, + { + "auxiliary_loss_clip": 0.01278453, + "auxiliary_loss_mlp": 0.010814, + "balance_loss_clip": 1.09026527, + "balance_loss_mlp": 1.05216944, + "epoch": 0.03511197955809409, + "flos": 26019486861120.0, + "grad_norm": 2.1824597613153585, + "language_loss": 0.80961359, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83321214, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 1.88378906, + "router_z_loss_mlp": 0.29223633, + "step": 584, + "time_per_iteration": 2.700709581375122 + }, + { + "auxiliary_loss_clip": 0.01277598, + "auxiliary_loss_mlp": 0.01065098, + "balance_loss_clip": 1.08890617, + "balance_loss_mlp": 1.03672624, + "epoch": 0.035172102810762065, + "flos": 17249030390880.0, + "grad_norm": 3.2417880764959244, + "language_loss": 0.93469167, + "learning_rate": 3.999719549492551e-06, + "loss": 0.95811868, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 1.88964844, + "router_z_loss_mlp": 0.28369141, + "step": 585, + "time_per_iteration": 2.685749053955078 + }, + { + "auxiliary_loss_clip": 0.01276264, + "auxiliary_loss_mlp": 0.01070283, + "balance_loss_clip": 1.08899522, + "balance_loss_mlp": 1.04211426, + "epoch": 0.03523222606343003, + "flos": 24766829857440.0, + "grad_norm": 2.288352712580851, + "language_loss": 0.87589228, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.89935774, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 1.87109375, + "router_z_loss_mlp": 0.28186035, + "step": 586, + "time_per_iteration": 2.697291374206543 + }, + { + "auxiliary_loss_clip": 0.0128067, + "auxiliary_loss_mlp": 0.01078358, + "balance_loss_clip": 1.09206247, + "balance_loss_mlp": 1.0497005, + "epoch": 0.035292349316098, + "flos": 24862600006560.0, + "grad_norm": 1.8287166414156326, + "language_loss": 0.76809025, + "learning_rate": 3.999706353928965e-06, + "loss": 0.79168046, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.28662109, + "step": 587, + "time_per_iteration": 2.678344249725342 + }, + { + "auxiliary_loss_clip": 0.01281518, + "auxiliary_loss_mlp": 0.01061821, + "balance_loss_clip": 1.09030879, + "balance_loss_mlp": 1.03216124, + "epoch": 0.03535247256876597, + "flos": 26196562490400.0, + "grad_norm": 2.601853853306201, + "language_loss": 0.79089653, + "learning_rate": 3.999699642403449e-06, + "loss": 0.81432992, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 1.91308594, + "router_z_loss_mlp": 0.296875, + "step": 588, + "time_per_iteration": 2.7052602767944336 + }, + { + "auxiliary_loss_clip": 0.01277121, + "auxiliary_loss_mlp": 0.01071336, + "balance_loss_clip": 1.08704138, + "balance_loss_mlp": 1.04043722, + "epoch": 0.03541259582143394, + "flos": 28825111012320.0, + "grad_norm": 4.214337985667146, + "language_loss": 0.94613171, + "learning_rate": 3.99969285504912e-06, + "loss": 0.96961629, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 1.89941406, + "router_z_loss_mlp": 0.30908203, + "step": 589, + "time_per_iteration": 2.691126823425293 + }, + { + "auxiliary_loss_clip": 0.0127972, + "auxiliary_loss_mlp": 0.01066536, + "balance_loss_clip": 1.08936155, + "balance_loss_mlp": 1.03856921, + "epoch": 0.03547271907410191, + "flos": 41154259394880.0, + "grad_norm": 3.608047768939718, + "language_loss": 0.83906978, + "learning_rate": 3.99968599186624e-06, + "loss": 0.86253238, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 1.90136719, + "router_z_loss_mlp": 0.27941895, + "step": 590, + "time_per_iteration": 2.8138773441314697 + }, + { + "auxiliary_loss_clip": 0.0127297, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.08962595, + "balance_loss_mlp": 1.03059387, + "epoch": 0.03553284232676988, + "flos": 25798334781600.0, + "grad_norm": 2.094178854861681, + "language_loss": 0.87242734, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89573598, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 1.83496094, + "router_z_loss_mlp": 0.27282715, + "step": 591, + "time_per_iteration": 2.703568935394287 + }, + { + "auxiliary_loss_clip": 0.01277099, + "auxiliary_loss_mlp": 0.010731, + "balance_loss_clip": 1.08670163, + "balance_loss_mlp": 1.04338121, + "epoch": 0.03559296557943785, + "flos": 24460928328960.0, + "grad_norm": 1.9912676051039615, + "language_loss": 0.82943928, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85294122, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 1.90234375, + "router_z_loss_mlp": 0.29675293, + "step": 592, + "time_per_iteration": 2.757136344909668 + }, + { + "auxiliary_loss_clip": 0.01159587, + "auxiliary_loss_mlp": 0.01016794, + "balance_loss_clip": 1.0707128, + "balance_loss_mlp": 1.00959969, + "epoch": 0.035653088832105814, + "flos": 73619374150080.0, + "grad_norm": 0.8634754123949849, + "language_loss": 0.59750628, + "learning_rate": 3.999664947348893e-06, + "loss": 0.61927003, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 0.88964844, + "router_z_loss_mlp": 0.07177734, + "step": 593, + "time_per_iteration": 3.3315484523773193 + }, + { + "auxiliary_loss_clip": 0.0127656, + "auxiliary_loss_mlp": 0.01068974, + "balance_loss_clip": 1.0924201, + "balance_loss_mlp": 1.03910017, + "epoch": 0.035713212084773786, + "flos": 24542720016480.0, + "grad_norm": 2.2059338810350977, + "language_loss": 0.87394243, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89739782, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.29833984, + "step": 594, + "time_per_iteration": 2.729391098022461 + }, + { + "auxiliary_loss_clip": 0.01275254, + "auxiliary_loss_mlp": 0.01069656, + "balance_loss_clip": 1.08719862, + "balance_loss_mlp": 1.04154646, + "epoch": 0.03577333533744176, + "flos": 32074457427360.0, + "grad_norm": 2.0364411644669165, + "language_loss": 0.83711725, + "learning_rate": 3.999650538532742e-06, + "loss": 0.86056638, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 1.87988281, + "router_z_loss_mlp": 0.28100586, + "step": 595, + "time_per_iteration": 2.713217258453369 + }, + { + "auxiliary_loss_clip": 0.01274818, + "auxiliary_loss_mlp": 0.01071615, + "balance_loss_clip": 1.08967805, + "balance_loss_mlp": 1.04225397, + "epoch": 0.035833458590109724, + "flos": 13286924557920.0, + "grad_norm": 2.6649101846395173, + "language_loss": 0.96359259, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98705697, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 1.85058594, + "router_z_loss_mlp": 0.2935791, + "step": 596, + "time_per_iteration": 2.679187059402466 + }, + { + "auxiliary_loss_clip": 0.01276864, + "auxiliary_loss_mlp": 0.01076037, + "balance_loss_clip": 1.09085917, + "balance_loss_mlp": 1.04914308, + "epoch": 0.035893581842777696, + "flos": 26954654394240.0, + "grad_norm": 2.0225505876164624, + "language_loss": 0.83096135, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85449034, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.26928711, + "step": 597, + "time_per_iteration": 2.6673526763916016 + }, + { + "auxiliary_loss_clip": 0.01274363, + "auxiliary_loss_mlp": 0.01065925, + "balance_loss_clip": 1.09140849, + "balance_loss_mlp": 1.03701639, + "epoch": 0.03595370509544566, + "flos": 28111703317920.0, + "grad_norm": 1.7207133606444958, + "language_loss": 0.81202722, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83543003, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.28918457, + "step": 598, + "time_per_iteration": 2.7410941123962402 + }, + { + "auxiliary_loss_clip": 0.01269812, + "auxiliary_loss_mlp": 0.01066342, + "balance_loss_clip": 1.09099472, + "balance_loss_mlp": 1.03680182, + "epoch": 0.03601382834811363, + "flos": 25352991826560.0, + "grad_norm": 1.7742852852307383, + "language_loss": 0.81428796, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83764952, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.2956543, + "step": 599, + "time_per_iteration": 2.7471773624420166 + }, + { + "auxiliary_loss_clip": 0.01278172, + "auxiliary_loss_mlp": 0.01068987, + "balance_loss_clip": 1.0889281, + "balance_loss_mlp": 1.04173565, + "epoch": 0.036073951600781605, + "flos": 29226053378880.0, + "grad_norm": 2.152615264043934, + "language_loss": 0.86195755, + "learning_rate": 3.999613189525668e-06, + "loss": 0.88542914, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 1.890625, + "router_z_loss_mlp": 0.27233887, + "step": 600, + "time_per_iteration": 2.742819309234619 + }, + { + "auxiliary_loss_clip": 0.01270615, + "auxiliary_loss_mlp": 0.01082862, + "balance_loss_clip": 1.08431387, + "balance_loss_mlp": 1.05434752, + "epoch": 0.03613407485344957, + "flos": 23077054906560.0, + "grad_norm": 2.0104020307211594, + "language_loss": 0.81975871, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84329355, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 1.86132812, + "router_z_loss_mlp": 0.28491211, + "step": 601, + "time_per_iteration": 2.687736988067627 + }, + { + "auxiliary_loss_clip": 0.01266307, + "auxiliary_loss_mlp": 0.01058876, + "balance_loss_clip": 1.08430362, + "balance_loss_mlp": 1.03062367, + "epoch": 0.03619419810611754, + "flos": 28111905904320.0, + "grad_norm": 2.659402548518176, + "language_loss": 0.75284672, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77609861, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 1.81933594, + "router_z_loss_mlp": 0.2824707, + "step": 602, + "time_per_iteration": 2.7119109630584717 + }, + { + "auxiliary_loss_clip": 0.01265944, + "auxiliary_loss_mlp": 0.01056405, + "balance_loss_clip": 1.08384383, + "balance_loss_mlp": 1.02867699, + "epoch": 0.03625432135878551, + "flos": 35945371563840.0, + "grad_norm": 2.09591037699073, + "language_loss": 0.80168074, + "learning_rate": 3.999589870212761e-06, + "loss": 0.8249042, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 1.8203125, + "router_z_loss_mlp": 0.27697754, + "step": 603, + "time_per_iteration": 2.8042044639587402 + }, + { + "auxiliary_loss_clip": 0.01274127, + "auxiliary_loss_mlp": 0.01065948, + "balance_loss_clip": 1.09125304, + "balance_loss_mlp": 1.03837514, + "epoch": 0.03631444461145348, + "flos": 28685831654880.0, + "grad_norm": 2.513624659204753, + "language_loss": 0.8697899, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89319062, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.27600098, + "step": 604, + "time_per_iteration": 2.69154691696167 + }, + { + "auxiliary_loss_clip": 0.01272509, + "auxiliary_loss_mlp": 0.01064163, + "balance_loss_clip": 1.08934498, + "balance_loss_mlp": 1.03405094, + "epoch": 0.03637456786412145, + "flos": 20277143691840.0, + "grad_norm": 3.301097823189932, + "language_loss": 0.8025775, + "learning_rate": 3.999573944880424e-06, + "loss": 0.82594419, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.30126953, + "step": 605, + "time_per_iteration": 2.6767818927764893 + }, + { + "auxiliary_loss_clip": 0.01269968, + "auxiliary_loss_mlp": 0.01069191, + "balance_loss_clip": 1.08733141, + "balance_loss_mlp": 1.04204679, + "epoch": 0.03643469111678942, + "flos": 19342057193280.0, + "grad_norm": 3.358727569806673, + "language_loss": 0.85863745, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.88202906, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 1.82617188, + "router_z_loss_mlp": 0.27124023, + "step": 606, + "time_per_iteration": 2.6540863513946533 + }, + { + "auxiliary_loss_clip": 0.01276133, + "auxiliary_loss_mlp": 0.01072519, + "balance_loss_clip": 1.0892092, + "balance_loss_mlp": 1.04426658, + "epoch": 0.03649481436945739, + "flos": 28822112733600.0, + "grad_norm": 2.002556456097771, + "language_loss": 0.82649839, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84998494, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 1.87011719, + "router_z_loss_mlp": 0.28271484, + "step": 607, + "time_per_iteration": 2.703338146209717 + }, + { + "auxiliary_loss_clip": 0.01269668, + "auxiliary_loss_mlp": 0.01066589, + "balance_loss_clip": 1.08859611, + "balance_loss_mlp": 1.03917098, + "epoch": 0.036554937622125354, + "flos": 26546905124640.0, + "grad_norm": 2.2408060947180477, + "language_loss": 0.83793163, + "learning_rate": 3.999549488202358e-06, + "loss": 0.86129421, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.27429199, + "step": 608, + "time_per_iteration": 4.154517412185669 + }, + { + "auxiliary_loss_clip": 0.01273049, + "auxiliary_loss_mlp": 0.01068359, + "balance_loss_clip": 1.08902776, + "balance_loss_mlp": 1.03798437, + "epoch": 0.036615060874793326, + "flos": 21743700181920.0, + "grad_norm": 1.9765746849912775, + "language_loss": 0.82166225, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84507638, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 1.83886719, + "router_z_loss_mlp": 0.30395508, + "step": 609, + "time_per_iteration": 4.114210605621338 + }, + { + "auxiliary_loss_clip": 0.01279457, + "auxiliary_loss_mlp": 0.01079946, + "balance_loss_clip": 1.09517598, + "balance_loss_mlp": 1.05264688, + "epoch": 0.0366751841274613, + "flos": 32646154727520.0, + "grad_norm": 2.497378941986542, + "language_loss": 0.79445595, + "learning_rate": 3.999532804634215e-06, + "loss": 0.81804997, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 1.84179688, + "router_z_loss_mlp": 0.27270508, + "step": 610, + "time_per_iteration": 4.1082422733306885 + }, + { + "auxiliary_loss_clip": 0.01279476, + "auxiliary_loss_mlp": 0.01081273, + "balance_loss_clip": 1.09077072, + "balance_loss_mlp": 1.0523293, + "epoch": 0.03673530738012926, + "flos": 27085182019200.0, + "grad_norm": 4.273636352243052, + "language_loss": 0.87421787, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89782536, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 1.88476562, + "router_z_loss_mlp": 0.28955078, + "step": 611, + "time_per_iteration": 4.193069934844971 + }, + { + "auxiliary_loss_clip": 0.01269631, + "auxiliary_loss_mlp": 0.01088097, + "balance_loss_clip": 1.08997619, + "balance_loss_mlp": 1.05995178, + "epoch": 0.036795430632797235, + "flos": 30116860840800.0, + "grad_norm": 2.0812734536294446, + "language_loss": 0.72438401, + "learning_rate": 3.999515817776136e-06, + "loss": 0.74796128, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.28161621, + "step": 612, + "time_per_iteration": 2.7595958709716797 + }, + { + "auxiliary_loss_clip": 0.01270343, + "auxiliary_loss_mlp": 0.01069766, + "balance_loss_clip": 1.08589482, + "balance_loss_mlp": 1.04060721, + "epoch": 0.0368555538854652, + "flos": 21657046420800.0, + "grad_norm": 2.8486266160187528, + "language_loss": 0.79571277, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81911385, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 1.84570312, + "router_z_loss_mlp": 0.29125977, + "step": 613, + "time_per_iteration": 2.6804330348968506 + }, + { + "auxiliary_loss_clip": 0.01266251, + "auxiliary_loss_mlp": 0.01075015, + "balance_loss_clip": 1.08555853, + "balance_loss_mlp": 1.04723954, + "epoch": 0.03691567713813317, + "flos": 25129530262080.0, + "grad_norm": 1.9228225839657707, + "language_loss": 0.93751377, + "learning_rate": 3.9994985276307e-06, + "loss": 0.96092635, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.27770996, + "step": 614, + "time_per_iteration": 2.73637318611145 + }, + { + "auxiliary_loss_clip": 0.01274603, + "auxiliary_loss_mlp": 0.01076463, + "balance_loss_clip": 1.09012568, + "balance_loss_mlp": 1.04647028, + "epoch": 0.036975800390801145, + "flos": 41061811662720.0, + "grad_norm": 3.316629240571562, + "language_loss": 0.72955447, + "learning_rate": 3.999489768826041e-06, + "loss": 0.75306511, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.29980469, + "step": 615, + "time_per_iteration": 2.7792551517486572 + }, + { + "auxiliary_loss_clip": 0.0127108, + "auxiliary_loss_mlp": 0.01071722, + "balance_loss_clip": 1.08575666, + "balance_loss_mlp": 1.04377961, + "epoch": 0.03703592364346911, + "flos": 34524552732480.0, + "grad_norm": 1.7208145770053107, + "language_loss": 0.82100582, + "learning_rate": 3.999480934200528e-06, + "loss": 0.84443378, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 1.85351562, + "router_z_loss_mlp": 0.2791748, + "step": 616, + "time_per_iteration": 2.7702178955078125 + }, + { + "auxiliary_loss_clip": 0.01272366, + "auxiliary_loss_mlp": 0.01068999, + "balance_loss_clip": 1.08864093, + "balance_loss_mlp": 1.04200971, + "epoch": 0.03709604689613708, + "flos": 38442298494240.0, + "grad_norm": 2.183063919381532, + "language_loss": 0.67878491, + "learning_rate": 3.999472023754499e-06, + "loss": 0.70219851, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 1.83789062, + "router_z_loss_mlp": 0.27001953, + "step": 617, + "time_per_iteration": 2.765986442565918 + }, + { + "auxiliary_loss_clip": 0.01275974, + "auxiliary_loss_mlp": 0.01067847, + "balance_loss_clip": 1.09210277, + "balance_loss_mlp": 1.03819966, + "epoch": 0.03715617014880505, + "flos": 23927351438880.0, + "grad_norm": 2.419053770702172, + "language_loss": 0.80003393, + "learning_rate": 3.99946303748829e-06, + "loss": 0.8234722, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 1.83984375, + "router_z_loss_mlp": 0.29675293, + "step": 618, + "time_per_iteration": 2.7065589427948 + }, + { + "auxiliary_loss_clip": 0.01274529, + "auxiliary_loss_mlp": 0.0107507, + "balance_loss_clip": 1.08700609, + "balance_loss_mlp": 1.04438567, + "epoch": 0.03721629340147302, + "flos": 19430412680160.0, + "grad_norm": 3.077668470532438, + "language_loss": 0.91296566, + "learning_rate": 3.999453975402242e-06, + "loss": 0.93646169, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 0.30664062, + "step": 619, + "time_per_iteration": 2.66670298576355 + }, + { + "auxiliary_loss_clip": 0.01271846, + "auxiliary_loss_mlp": 0.01075417, + "balance_loss_clip": 1.09067559, + "balance_loss_mlp": 1.04690242, + "epoch": 0.03727641665414099, + "flos": 25751422121760.0, + "grad_norm": 2.8974068589298483, + "language_loss": 0.94098699, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96445966, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 1.80957031, + "router_z_loss_mlp": 0.28564453, + "step": 620, + "time_per_iteration": 2.6714487075805664 + }, + { + "auxiliary_loss_clip": 0.01270106, + "auxiliary_loss_mlp": 0.0108009, + "balance_loss_clip": 1.08784628, + "balance_loss_mlp": 1.05028749, + "epoch": 0.037336539906808956, + "flos": 29445989940000.0, + "grad_norm": 1.8167590474548059, + "language_loss": 0.77256954, + "learning_rate": 3.999435623772008e-06, + "loss": 0.79607153, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.29785156, + "step": 621, + "time_per_iteration": 2.7979655265808105 + }, + { + "auxiliary_loss_clip": 0.0126638, + "auxiliary_loss_mlp": 0.01066885, + "balance_loss_clip": 1.08857799, + "balance_loss_mlp": 1.03777409, + "epoch": 0.03739666315947693, + "flos": 27267200756640.0, + "grad_norm": 2.518071212534249, + "language_loss": 0.86267614, + "learning_rate": 3.999426334228518e-06, + "loss": 0.8860088, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 1.77929688, + "router_z_loss_mlp": 0.29125977, + "step": 622, + "time_per_iteration": 2.7005112171173096 + }, + { + "auxiliary_loss_clip": 0.01265655, + "auxiliary_loss_mlp": 0.01065606, + "balance_loss_clip": 1.08536935, + "balance_loss_mlp": 1.03697157, + "epoch": 0.0374567864121449, + "flos": 24952454632800.0, + "grad_norm": 2.6651791583019295, + "language_loss": 0.90288782, + "learning_rate": 3.999416968866581e-06, + "loss": 0.92620039, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 1.80175781, + "router_z_loss_mlp": 0.28649902, + "step": 623, + "time_per_iteration": 2.7220232486724854 + }, + { + "auxiliary_loss_clip": 0.01270485, + "auxiliary_loss_mlp": 0.0108368, + "balance_loss_clip": 1.08923268, + "balance_loss_mlp": 1.05442595, + "epoch": 0.037516909664812866, + "flos": 23437607895360.0, + "grad_norm": 1.8800418917964286, + "language_loss": 0.84203279, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86557448, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.29248047, + "step": 624, + "time_per_iteration": 2.7099368572235107 + }, + { + "auxiliary_loss_clip": 0.01268796, + "auxiliary_loss_mlp": 0.01072308, + "balance_loss_clip": 1.08414924, + "balance_loss_mlp": 1.04362607, + "epoch": 0.03757703291748084, + "flos": 42714641204640.0, + "grad_norm": 8.52181208823571, + "language_loss": 0.67481607, + "learning_rate": 3.999398010688788e-06, + "loss": 0.69822711, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.28710938, + "step": 625, + "time_per_iteration": 2.8122754096984863 + }, + { + "auxiliary_loss_clip": 0.01265312, + "auxiliary_loss_mlp": 0.01068085, + "balance_loss_clip": 1.08704138, + "balance_loss_mlp": 1.03773451, + "epoch": 0.0376371561701488, + "flos": 31095861720480.0, + "grad_norm": 2.1431100413721373, + "language_loss": 0.77028704, + "learning_rate": 3.999388417873652e-06, + "loss": 0.79362094, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 1.78222656, + "router_z_loss_mlp": 0.30371094, + "step": 626, + "time_per_iteration": 2.7422754764556885 + }, + { + "auxiliary_loss_clip": 0.01267864, + "auxiliary_loss_mlp": 0.01072022, + "balance_loss_clip": 1.08615768, + "balance_loss_mlp": 1.04255366, + "epoch": 0.037697279422816775, + "flos": 22190056068960.0, + "grad_norm": 2.8839534466967938, + "language_loss": 0.81340426, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83680308, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.29431152, + "step": 627, + "time_per_iteration": 2.6625783443450928 + }, + { + "auxiliary_loss_clip": 0.01271788, + "auxiliary_loss_mlp": 0.01078594, + "balance_loss_clip": 1.08872962, + "balance_loss_mlp": 1.04931605, + "epoch": 0.03775740267548475, + "flos": 30071325768480.0, + "grad_norm": 1.927379078580675, + "language_loss": 0.88672054, + "learning_rate": 3.999369004792719e-06, + "loss": 0.91022438, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.29272461, + "step": 628, + "time_per_iteration": 2.734402656555176 + }, + { + "auxiliary_loss_clip": 0.01265594, + "auxiliary_loss_mlp": 0.01069267, + "balance_loss_clip": 1.08228731, + "balance_loss_mlp": 1.04108644, + "epoch": 0.03781752592815271, + "flos": 25975937135520.0, + "grad_norm": 2.6355778526637663, + "language_loss": 0.79902464, + "learning_rate": 3.999359184527658e-06, + "loss": 0.82237321, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 1.83203125, + "router_z_loss_mlp": 0.28173828, + "step": 629, + "time_per_iteration": 2.7525250911712646 + }, + { + "auxiliary_loss_clip": 0.01265951, + "auxiliary_loss_mlp": 0.01064984, + "balance_loss_clip": 1.08384979, + "balance_loss_mlp": 1.03735137, + "epoch": 0.037877649180820684, + "flos": 26954006117760.0, + "grad_norm": 1.6874190427700813, + "language_loss": 0.77178878, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79509813, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.27636719, + "step": 630, + "time_per_iteration": 2.7119295597076416 + }, + { + "auxiliary_loss_clip": 0.01269794, + "auxiliary_loss_mlp": 0.01069125, + "balance_loss_clip": 1.08526969, + "balance_loss_mlp": 1.04111123, + "epoch": 0.03793777243348865, + "flos": 17694130242240.0, + "grad_norm": 3.0502747021424073, + "language_loss": 0.91630781, + "learning_rate": 3.99933931655021e-06, + "loss": 0.93969697, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 1.84472656, + "router_z_loss_mlp": 0.28039551, + "step": 631, + "time_per_iteration": 2.648026704788208 + }, + { + "auxiliary_loss_clip": 0.01260114, + "auxiliary_loss_mlp": 0.01078424, + "balance_loss_clip": 1.08218908, + "balance_loss_mlp": 1.04764438, + "epoch": 0.03799789568615662, + "flos": 26732773003680.0, + "grad_norm": 1.5708529716536201, + "language_loss": 0.92237663, + "learning_rate": 3.999329268838575e-06, + "loss": 0.9457621, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.30773926, + "step": 632, + "time_per_iteration": 2.693638801574707 + }, + { + "auxiliary_loss_clip": 0.01265458, + "auxiliary_loss_mlp": 0.01062124, + "balance_loss_clip": 1.08571124, + "balance_loss_mlp": 1.03380001, + "epoch": 0.03805801893882459, + "flos": 30294301125600.0, + "grad_norm": 2.1066707304362158, + "language_loss": 0.83294046, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85621631, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 1.79785156, + "router_z_loss_mlp": 0.2833252, + "step": 633, + "time_per_iteration": 2.697425365447998 + }, + { + "auxiliary_loss_clip": 0.01264612, + "auxiliary_loss_mlp": 0.01068867, + "balance_loss_clip": 1.08286858, + "balance_loss_mlp": 1.04059029, + "epoch": 0.03811814219149256, + "flos": 37196043220800.0, + "grad_norm": 1.6484035643286514, + "language_loss": 0.69716716, + "learning_rate": 3.999308945971392e-06, + "loss": 0.72050196, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 1.81835938, + "router_z_loss_mlp": 0.28320312, + "step": 634, + "time_per_iteration": 2.7406227588653564 + }, + { + "auxiliary_loss_clip": 0.01145234, + "auxiliary_loss_mlp": 0.01015086, + "balance_loss_clip": 1.05582905, + "balance_loss_mlp": 1.00844049, + "epoch": 0.03817826544416053, + "flos": 81745694773920.0, + "grad_norm": 0.8846167304950187, + "language_loss": 0.61621606, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63781929, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.06658936, + "step": 635, + "time_per_iteration": 3.3189480304718018 + }, + { + "auxiliary_loss_clip": 0.01260946, + "auxiliary_loss_mlp": 0.01069422, + "balance_loss_clip": 1.08178711, + "balance_loss_mlp": 1.04119325, + "epoch": 0.038238388696828496, + "flos": 24996247462080.0, + "grad_norm": 2.1030891013006943, + "language_loss": 0.83805943, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.86136311, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 1.79199219, + "router_z_loss_mlp": 0.28222656, + "step": 636, + "time_per_iteration": 2.680189371109009 + }, + { + "auxiliary_loss_clip": 0.01263332, + "auxiliary_loss_mlp": 0.01084902, + "balance_loss_clip": 1.08245063, + "balance_loss_mlp": 1.0568167, + "epoch": 0.03829851194949647, + "flos": 21921991329600.0, + "grad_norm": 3.1686138215466686, + "language_loss": 0.79274607, + "learning_rate": 3.999277893066632e-06, + "loss": 0.81622839, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 1.80761719, + "router_z_loss_mlp": 0.28100586, + "step": 637, + "time_per_iteration": 2.635582208633423 + }, + { + "auxiliary_loss_clip": 0.01260655, + "auxiliary_loss_mlp": 0.01081485, + "balance_loss_clip": 1.07977867, + "balance_loss_mlp": 1.0523026, + "epoch": 0.03835863520216444, + "flos": 27401537005920.0, + "grad_norm": 1.913370177565359, + "language_loss": 0.83850068, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86192203, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 1.80664062, + "router_z_loss_mlp": 0.29174805, + "step": 638, + "time_per_iteration": 2.7251670360565186 + }, + { + "auxiliary_loss_clip": 0.01266986, + "auxiliary_loss_mlp": 0.01070933, + "balance_loss_clip": 1.08127475, + "balance_loss_mlp": 1.04179835, + "epoch": 0.038418758454832405, + "flos": 27045116779680.0, + "grad_norm": 5.8685340791211225, + "language_loss": 0.70503503, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72841424, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 1.85839844, + "router_z_loss_mlp": 0.29125977, + "step": 639, + "time_per_iteration": 2.700291633605957 + }, + { + "auxiliary_loss_clip": 0.01264154, + "auxiliary_loss_mlp": 0.01082088, + "balance_loss_clip": 1.08256602, + "balance_loss_mlp": 1.05188012, + "epoch": 0.03847888170750038, + "flos": 27756620161920.0, + "grad_norm": 2.2291591428425503, + "language_loss": 0.8524096, + "learning_rate": 3.999246157846526e-06, + "loss": 0.87587202, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 1.81445312, + "router_z_loss_mlp": 0.30224609, + "step": 640, + "time_per_iteration": 2.6920595169067383 + }, + { + "auxiliary_loss_clip": 0.01267689, + "auxiliary_loss_mlp": 0.01077686, + "balance_loss_clip": 1.08416724, + "balance_loss_mlp": 1.04737115, + "epoch": 0.03853900496016834, + "flos": 27712381642560.0, + "grad_norm": 2.1558277299051114, + "language_loss": 0.81976473, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84321851, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 1.83105469, + "router_z_loss_mlp": 0.30297852, + "step": 641, + "time_per_iteration": 2.6954829692840576 + }, + { + "auxiliary_loss_clip": 0.01141701, + "auxiliary_loss_mlp": 0.01026376, + "balance_loss_clip": 1.05461717, + "balance_loss_mlp": 1.01979589, + "epoch": 0.038599128212836314, + "flos": 85901407286400.0, + "grad_norm": 0.8999374052547324, + "language_loss": 0.65397614, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67565691, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.06591797, + "step": 642, + "time_per_iteration": 3.280838966369629 + }, + { + "auxiliary_loss_clip": 0.01260518, + "auxiliary_loss_mlp": 0.01063355, + "balance_loss_clip": 1.08003616, + "balance_loss_mlp": 1.03612733, + "epoch": 0.03865925146550429, + "flos": 28424938474080.0, + "grad_norm": 1.793817300376825, + "language_loss": 0.79193819, + "learning_rate": 3.999213740321906e-06, + "loss": 0.81517696, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.27233887, + "step": 643, + "time_per_iteration": 2.7169189453125 + }, + { + "auxiliary_loss_clip": 0.01256902, + "auxiliary_loss_mlp": 0.01076835, + "balance_loss_clip": 1.07742, + "balance_loss_mlp": 1.04950011, + "epoch": 0.03871937471817225, + "flos": 26149852416960.0, + "grad_norm": 1.9471483685444493, + "language_loss": 0.82653892, + "learning_rate": 3.999202782859046e-06, + "loss": 0.84987634, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.27307129, + "step": 644, + "time_per_iteration": 2.6948134899139404 + }, + { + "auxiliary_loss_clip": 0.01259821, + "auxiliary_loss_mlp": 0.01066436, + "balance_loss_clip": 1.07803273, + "balance_loss_mlp": 1.03713417, + "epoch": 0.038779497970840224, + "flos": 41826224262240.0, + "grad_norm": 9.883351949016879, + "language_loss": 0.82576823, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84903085, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 1.81738281, + "router_z_loss_mlp": 0.29321289, + "step": 645, + "time_per_iteration": 2.7623400688171387 + }, + { + "auxiliary_loss_clip": 0.01263427, + "auxiliary_loss_mlp": 0.01067669, + "balance_loss_clip": 1.08023846, + "balance_loss_mlp": 1.03929698, + "epoch": 0.03883962122350819, + "flos": 27757471024800.0, + "grad_norm": 2.589446272414667, + "language_loss": 0.81879336, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.84210432, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 1.82910156, + "router_z_loss_mlp": 0.2833252, + "step": 646, + "time_per_iteration": 2.682685375213623 + }, + { + "auxiliary_loss_clip": 0.01262874, + "auxiliary_loss_mlp": 0.01078186, + "balance_loss_clip": 1.08364022, + "balance_loss_mlp": 1.0492419, + "epoch": 0.03889974447617616, + "flos": 26778348593280.0, + "grad_norm": 1.9293781107895809, + "language_loss": 0.81985217, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84326279, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.28979492, + "step": 647, + "time_per_iteration": 2.7258567810058594 + }, + { + "auxiliary_loss_clip": 0.01257952, + "auxiliary_loss_mlp": 0.01066732, + "balance_loss_clip": 1.07858539, + "balance_loss_mlp": 1.03915882, + "epoch": 0.03895986772884413, + "flos": 38442987288000.0, + "grad_norm": 2.062818989125037, + "language_loss": 0.84705931, + "learning_rate": 3.999158194912106e-06, + "loss": 0.87030619, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.27575684, + "step": 648, + "time_per_iteration": 5.734619140625 + }, + { + "auxiliary_loss_clip": 0.01258533, + "auxiliary_loss_mlp": 0.01069797, + "balance_loss_clip": 1.07876277, + "balance_loss_mlp": 1.04165137, + "epoch": 0.0390199909815121, + "flos": 24283244940480.0, + "grad_norm": 2.8255780488345414, + "language_loss": 0.84536225, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.86864555, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 1.79882812, + "router_z_loss_mlp": 0.28100586, + "step": 649, + "time_per_iteration": 2.683356523513794 + }, + { + "auxiliary_loss_clip": 0.01259055, + "auxiliary_loss_mlp": 0.01067413, + "balance_loss_clip": 1.07784581, + "balance_loss_mlp": 1.03813481, + "epoch": 0.03908011423418007, + "flos": 26372017428480.0, + "grad_norm": 1.9312687131901716, + "language_loss": 0.79733831, + "learning_rate": 3.999135446087263e-06, + "loss": 0.82060295, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 1.81152344, + "router_z_loss_mlp": 0.29321289, + "step": 650, + "time_per_iteration": 5.533164024353027 + }, + { + "auxiliary_loss_clip": 0.01253441, + "auxiliary_loss_mlp": 0.01070115, + "balance_loss_clip": 1.07427287, + "balance_loss_mlp": 1.04182696, + "epoch": 0.039140237486848035, + "flos": 22770505101600.0, + "grad_norm": 2.3206360647458775, + "language_loss": 0.78845263, + "learning_rate": 3.9991239579635e-06, + "loss": 0.81168818, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 1.79101562, + "router_z_loss_mlp": 0.28283691, + "step": 651, + "time_per_iteration": 2.675487756729126 + }, + { + "auxiliary_loss_clip": 0.01254305, + "auxiliary_loss_mlp": 0.0107363, + "balance_loss_clip": 1.07565594, + "balance_loss_mlp": 1.04389906, + "epoch": 0.03920036073951601, + "flos": 22770180963360.0, + "grad_norm": 2.4090934889062248, + "language_loss": 0.87396008, + "learning_rate": 3.999112394032757e-06, + "loss": 0.89723939, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 1.78613281, + "router_z_loss_mlp": 0.29711914, + "step": 652, + "time_per_iteration": 2.649022340774536 + }, + { + "auxiliary_loss_clip": 0.01247659, + "auxiliary_loss_mlp": 0.01067364, + "balance_loss_clip": 1.07404077, + "balance_loss_mlp": 1.04054224, + "epoch": 0.03926048399218398, + "flos": 38259915101280.0, + "grad_norm": 2.8035751392009405, + "language_loss": 0.78645521, + "learning_rate": 3.999100754295471e-06, + "loss": 0.80960548, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.26831055, + "step": 653, + "time_per_iteration": 2.7227275371551514 + }, + { + "auxiliary_loss_clip": 0.01263656, + "auxiliary_loss_mlp": 0.01067116, + "balance_loss_clip": 1.07909071, + "balance_loss_mlp": 1.03838611, + "epoch": 0.039320607244851945, + "flos": 36123217021440.0, + "grad_norm": 2.536557877344623, + "language_loss": 0.8585763, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88188404, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 1.84765625, + "router_z_loss_mlp": 0.28735352, + "step": 654, + "time_per_iteration": 2.7206761837005615 + }, + { + "auxiliary_loss_clip": 0.01142013, + "auxiliary_loss_mlp": 0.01005237, + "balance_loss_clip": 1.05310774, + "balance_loss_mlp": 0.99865043, + "epoch": 0.03938073049751992, + "flos": 81185342312160.0, + "grad_norm": 0.7205375368589922, + "language_loss": 0.49965563, + "learning_rate": 3.999077247403041e-06, + "loss": 0.52112818, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.065979, + "step": 655, + "time_per_iteration": 3.3587586879730225 + }, + { + "auxiliary_loss_clip": 0.01250038, + "auxiliary_loss_mlp": 0.01064438, + "balance_loss_clip": 1.07655907, + "balance_loss_mlp": 1.03773522, + "epoch": 0.03944085375018788, + "flos": 28513658616480.0, + "grad_norm": 2.3041891448979315, + "language_loss": 0.80691028, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.830055, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.2668457, + "step": 656, + "time_per_iteration": 2.696866035461426 + }, + { + "auxiliary_loss_clip": 0.0126237, + "auxiliary_loss_mlp": 0.01087758, + "balance_loss_clip": 1.07938004, + "balance_loss_mlp": 1.05566657, + "epoch": 0.039500977002855854, + "flos": 22632805918080.0, + "grad_norm": 2.40935049121461, + "language_loss": 0.76115239, + "learning_rate": 3.999053437289776e-06, + "loss": 0.7846536, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 1.828125, + "router_z_loss_mlp": 0.32104492, + "step": 657, + "time_per_iteration": 2.6448593139648438 + }, + { + "auxiliary_loss_clip": 0.01258235, + "auxiliary_loss_mlp": 0.0106786, + "balance_loss_clip": 1.07695675, + "balance_loss_mlp": 1.03932118, + "epoch": 0.039561100255523826, + "flos": 30918016262880.0, + "grad_norm": 1.9916451715050245, + "language_loss": 0.81648839, + "learning_rate": 3.999041418526457e-06, + "loss": 0.83974934, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 0.28540039, + "step": 658, + "time_per_iteration": 2.7346479892730713 + }, + { + "auxiliary_loss_clip": 0.01249861, + "auxiliary_loss_mlp": 0.01072811, + "balance_loss_clip": 1.07442272, + "balance_loss_mlp": 1.04334259, + "epoch": 0.03962122350819179, + "flos": 22232066137920.0, + "grad_norm": 2.203554782724889, + "language_loss": 0.9082377, + "learning_rate": 3.999029323959287e-06, + "loss": 0.93146443, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.29431152, + "step": 659, + "time_per_iteration": 2.6965105533599854 + }, + { + "auxiliary_loss_clip": 0.01256636, + "auxiliary_loss_mlp": 0.01064922, + "balance_loss_clip": 1.07669628, + "balance_loss_mlp": 1.03756285, + "epoch": 0.03968134676085976, + "flos": 25041215292480.0, + "grad_norm": 2.0841762957240353, + "language_loss": 0.79165936, + "learning_rate": 3.999017153588724e-06, + "loss": 0.81487495, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 1.80078125, + "router_z_loss_mlp": 0.27368164, + "step": 660, + "time_per_iteration": 2.67425274848938 + }, + { + "auxiliary_loss_clip": 0.01255249, + "auxiliary_loss_mlp": 0.01067008, + "balance_loss_clip": 1.07895303, + "balance_loss_mlp": 1.03827834, + "epoch": 0.03974147001352773, + "flos": 27359891592480.0, + "grad_norm": 2.022768222426913, + "language_loss": 0.81768578, + "learning_rate": 3.999004907415231e-06, + "loss": 0.84090829, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 1.76367188, + "router_z_loss_mlp": 0.28759766, + "step": 661, + "time_per_iteration": 2.71549129486084 + }, + { + "auxiliary_loss_clip": 0.0112866, + "auxiliary_loss_mlp": 0.01014119, + "balance_loss_clip": 1.04256582, + "balance_loss_mlp": 1.00750256, + "epoch": 0.0398015932661957, + "flos": 86791769058240.0, + "grad_norm": 0.9104349586707359, + "language_loss": 0.694471, + "learning_rate": 3.998992585439272e-06, + "loss": 0.71589875, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.06628418, + "step": 662, + "time_per_iteration": 3.4522359371185303 + }, + { + "auxiliary_loss_clip": 0.01260671, + "auxiliary_loss_mlp": 0.01069957, + "balance_loss_clip": 1.08122754, + "balance_loss_mlp": 1.04225326, + "epoch": 0.03986171651886367, + "flos": 20500200083520.0, + "grad_norm": 1.8506968227122, + "language_loss": 0.83180058, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85510683, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 1.79589844, + "router_z_loss_mlp": 0.27709961, + "step": 663, + "time_per_iteration": 2.6624343395233154 + }, + { + "auxiliary_loss_clip": 0.01262973, + "auxiliary_loss_mlp": 0.01062186, + "balance_loss_clip": 1.08081961, + "balance_loss_mlp": 1.0336597, + "epoch": 0.03992183977153164, + "flos": 29938042968480.0, + "grad_norm": 2.833317751009039, + "language_loss": 0.8726989, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89595044, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 1.82128906, + "router_z_loss_mlp": 0.28540039, + "step": 664, + "time_per_iteration": 2.687558174133301 + }, + { + "auxiliary_loss_clip": 0.01249619, + "auxiliary_loss_mlp": 0.01066392, + "balance_loss_clip": 1.0753305, + "balance_loss_mlp": 1.03751993, + "epoch": 0.03998196302419961, + "flos": 19029794451840.0, + "grad_norm": 2.792961484046672, + "language_loss": 0.84803057, + "learning_rate": 3.998955164701281e-06, + "loss": 0.87119067, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 1.7421875, + "router_z_loss_mlp": 0.28869629, + "step": 665, + "time_per_iteration": 2.6389927864074707 + }, + { + "auxiliary_loss_clip": 0.01263016, + "auxiliary_loss_mlp": 0.01081434, + "balance_loss_clip": 1.0796113, + "balance_loss_mlp": 1.0510354, + "epoch": 0.04004208627686758, + "flos": 30877302746880.0, + "grad_norm": 1.9514634488680993, + "language_loss": 0.81838739, + "learning_rate": 3.998942539520158e-06, + "loss": 0.8418318, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 1.83398438, + "router_z_loss_mlp": 0.30395508, + "step": 666, + "time_per_iteration": 2.7357535362243652 + }, + { + "auxiliary_loss_clip": 0.01252808, + "auxiliary_loss_mlp": 0.01073638, + "balance_loss_clip": 1.07586873, + "balance_loss_mlp": 1.04307258, + "epoch": 0.04010220952953555, + "flos": 28645604346240.0, + "grad_norm": 2.034137673158652, + "language_loss": 0.86954808, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89281249, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.30566406, + "step": 667, + "time_per_iteration": 2.678462505340576 + }, + { + "auxiliary_loss_clip": 0.01253712, + "auxiliary_loss_mlp": 0.01069372, + "balance_loss_clip": 1.08052266, + "balance_loss_mlp": 1.04158401, + "epoch": 0.04016233278220352, + "flos": 22717069159680.0, + "grad_norm": 2.3081768571238332, + "language_loss": 0.80688179, + "learning_rate": 3.998917061758087e-06, + "loss": 0.83011258, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.2779541, + "step": 668, + "time_per_iteration": 2.628709316253662 + }, + { + "auxiliary_loss_clip": 0.01123706, + "auxiliary_loss_mlp": 0.01009249, + "balance_loss_clip": 1.04009652, + "balance_loss_mlp": 1.00276947, + "epoch": 0.040222456034871484, + "flos": 86520503453760.0, + "grad_norm": 0.7878063250874157, + "language_loss": 0.60052383, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62185341, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.06488037, + "step": 669, + "time_per_iteration": 3.6405727863311768 + }, + { + "auxiliary_loss_clip": 0.01252053, + "auxiliary_loss_mlp": 0.01072247, + "balance_loss_clip": 1.073524, + "balance_loss_mlp": 1.04437566, + "epoch": 0.040282579287539456, + "flos": 28998378017280.0, + "grad_norm": 1.8233078670969354, + "language_loss": 0.86281443, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88605738, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 1.78417969, + "router_z_loss_mlp": 0.27880859, + "step": 670, + "time_per_iteration": 2.8797714710235596 + }, + { + "auxiliary_loss_clip": 0.01250011, + "auxiliary_loss_mlp": 0.01068936, + "balance_loss_clip": 1.07572389, + "balance_loss_mlp": 1.04152966, + "epoch": 0.04034270254020743, + "flos": 22547367675360.0, + "grad_norm": 1.7018310321618333, + "language_loss": 0.75261927, + "learning_rate": 3.998878276622692e-06, + "loss": 0.77580869, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 1.74121094, + "router_z_loss_mlp": 0.27404785, + "step": 671, + "time_per_iteration": 2.739025831222534 + }, + { + "auxiliary_loss_clip": 0.01258327, + "auxiliary_loss_mlp": 0.01072338, + "balance_loss_clip": 1.07933867, + "balance_loss_mlp": 1.04451466, + "epoch": 0.040402825792875394, + "flos": 20989457419680.0, + "grad_norm": 2.809433357310436, + "language_loss": 0.92178226, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94508898, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 1.79003906, + "router_z_loss_mlp": 0.27819824, + "step": 672, + "time_per_iteration": 2.6605732440948486 + }, + { + "auxiliary_loss_clip": 0.01254656, + "auxiliary_loss_mlp": 0.01066421, + "balance_loss_clip": 1.07812142, + "balance_loss_mlp": 1.03690517, + "epoch": 0.040462949045543366, + "flos": 23393774548800.0, + "grad_norm": 1.8557508020804918, + "language_loss": 0.90139234, + "learning_rate": 3.998852040876622e-06, + "loss": 0.9246031, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 1.765625, + "router_z_loss_mlp": 0.29516602, + "step": 673, + "time_per_iteration": 2.7474770545959473 + }, + { + "auxiliary_loss_clip": 0.01247995, + "auxiliary_loss_mlp": 0.01073227, + "balance_loss_clip": 1.07473755, + "balance_loss_mlp": 1.0438776, + "epoch": 0.04052307229821133, + "flos": 29309708861280.0, + "grad_norm": 2.5383887373010317, + "language_loss": 0.74842978, + "learning_rate": 3.998838809308334e-06, + "loss": 0.77164197, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 1.73144531, + "router_z_loss_mlp": 0.29321289, + "step": 674, + "time_per_iteration": 2.7309443950653076 + }, + { + "auxiliary_loss_clip": 0.0126032, + "auxiliary_loss_mlp": 0.0106252, + "balance_loss_clip": 1.07809973, + "balance_loss_mlp": 1.03431535, + "epoch": 0.0405831955508793, + "flos": 20055545922240.0, + "grad_norm": 2.383197745915199, + "language_loss": 0.78182691, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80505526, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 1.82324219, + "router_z_loss_mlp": 0.28186035, + "step": 675, + "time_per_iteration": 2.797966241836548 + }, + { + "auxiliary_loss_clip": 0.01251083, + "auxiliary_loss_mlp": 0.01073931, + "balance_loss_clip": 1.07489657, + "balance_loss_mlp": 1.04396141, + "epoch": 0.040643318803547275, + "flos": 29626671607200.0, + "grad_norm": 1.7066018483725751, + "language_loss": 0.7689501, + "learning_rate": 3.998812118783757e-06, + "loss": 0.79220033, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.29943848, + "step": 676, + "time_per_iteration": 2.6875102519989014 + }, + { + "auxiliary_loss_clip": 0.01256461, + "auxiliary_loss_mlp": 0.01070374, + "balance_loss_clip": 1.07851267, + "balance_loss_mlp": 1.04116774, + "epoch": 0.04070344205621524, + "flos": 21612119107680.0, + "grad_norm": 4.259657026402786, + "language_loss": 0.85342222, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.87669063, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.29174805, + "step": 677, + "time_per_iteration": 2.676729917526245 + }, + { + "auxiliary_loss_clip": 0.01251448, + "auxiliary_loss_mlp": 0.01061549, + "balance_loss_clip": 1.07774389, + "balance_loss_mlp": 1.03332067, + "epoch": 0.04076356530888321, + "flos": 31942390145760.0, + "grad_norm": 1.8357489297268905, + "language_loss": 0.76579654, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78892648, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.28234863, + "step": 678, + "time_per_iteration": 2.7007594108581543 + }, + { + "auxiliary_loss_clip": 0.01251512, + "auxiliary_loss_mlp": 0.01069854, + "balance_loss_clip": 1.074965, + "balance_loss_mlp": 1.04263902, + "epoch": 0.04082368856155118, + "flos": 43651915636320.0, + "grad_norm": 2.74595571265895, + "language_loss": 0.82062918, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84384286, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 1.76464844, + "router_z_loss_mlp": 0.27233887, + "step": 679, + "time_per_iteration": 2.793701171875 + }, + { + "auxiliary_loss_clip": 0.01256695, + "auxiliary_loss_mlp": 0.01060053, + "balance_loss_clip": 1.08177233, + "balance_loss_mlp": 1.03213453, + "epoch": 0.04088381181421915, + "flos": 34345208135520.0, + "grad_norm": 1.8523208152761066, + "language_loss": 0.76591051, + "learning_rate": 3.998757828196835e-06, + "loss": 0.78907806, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 1.74902344, + "router_z_loss_mlp": 0.27929688, + "step": 680, + "time_per_iteration": 2.733106851577759 + }, + { + "auxiliary_loss_clip": 0.01251879, + "auxiliary_loss_mlp": 0.01065268, + "balance_loss_clip": 1.0739193, + "balance_loss_mlp": 1.03596652, + "epoch": 0.04094393506688712, + "flos": 33675471718560.0, + "grad_norm": 1.8744313824271461, + "language_loss": 0.83420503, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.85737646, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 1.77832031, + "router_z_loss_mlp": 0.29260254, + "step": 681, + "time_per_iteration": 2.749056816101074 + }, + { + "auxiliary_loss_clip": 0.01254021, + "auxiliary_loss_mlp": 0.01064239, + "balance_loss_clip": 1.07600749, + "balance_loss_mlp": 1.03465152, + "epoch": 0.04100405831955509, + "flos": 28201557944160.0, + "grad_norm": 1.5500480452653127, + "language_loss": 0.71143109, + "learning_rate": 3.998730228142726e-06, + "loss": 0.73461366, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 1.78027344, + "router_z_loss_mlp": 0.29577637, + "step": 682, + "time_per_iteration": 2.693429708480835 + }, + { + "auxiliary_loss_clip": 0.01250023, + "auxiliary_loss_mlp": 0.0106585, + "balance_loss_clip": 1.07506812, + "balance_loss_mlp": 1.03878951, + "epoch": 0.04106418157222306, + "flos": 24595264578240.0, + "grad_norm": 1.883302828496311, + "language_loss": 0.72441578, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74757457, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.27075195, + "step": 683, + "time_per_iteration": 2.803751230239868 + }, + { + "auxiliary_loss_clip": 0.01251597, + "auxiliary_loss_mlp": 0.01073121, + "balance_loss_clip": 1.08164442, + "balance_loss_mlp": 1.04520226, + "epoch": 0.041124304824891024, + "flos": 21256509227040.0, + "grad_norm": 2.4253447464827356, + "language_loss": 0.81597334, + "learning_rate": 3.998702324920417e-06, + "loss": 0.83922052, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.27929688, + "step": 684, + "time_per_iteration": 2.7720375061035156 + }, + { + "auxiliary_loss_clip": 0.01250618, + "auxiliary_loss_mlp": 0.01061347, + "balance_loss_clip": 1.07890368, + "balance_loss_mlp": 1.03243923, + "epoch": 0.041184428077558996, + "flos": 31456981951200.0, + "grad_norm": 1.5665166885008224, + "language_loss": 0.90683651, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.92995614, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.28918457, + "step": 685, + "time_per_iteration": 2.7756052017211914 + }, + { + "auxiliary_loss_clip": 0.01252345, + "auxiliary_loss_mlp": 0.01067876, + "balance_loss_clip": 1.0774579, + "balance_loss_mlp": 1.03928995, + "epoch": 0.04124455133022697, + "flos": 28021970243520.0, + "grad_norm": 2.0307959341733546, + "language_loss": 0.87724876, + "learning_rate": 3.998674118534141e-06, + "loss": 0.90045094, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.28588867, + "step": 686, + "time_per_iteration": 2.7054779529571533 + }, + { + "auxiliary_loss_clip": 0.01256622, + "auxiliary_loss_mlp": 0.0106701, + "balance_loss_clip": 1.07735538, + "balance_loss_mlp": 1.03935301, + "epoch": 0.04130467458289493, + "flos": 25976909550240.0, + "grad_norm": 3.0631016357898204, + "language_loss": 0.71472669, + "learning_rate": 3.998659901655851e-06, + "loss": 0.73796308, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 1.79296875, + "router_z_loss_mlp": 0.27685547, + "step": 687, + "time_per_iteration": 4.123210191726685 + }, + { + "auxiliary_loss_clip": 0.01249197, + "auxiliary_loss_mlp": 0.01069991, + "balance_loss_clip": 1.08085716, + "balance_loss_mlp": 1.04396796, + "epoch": 0.041364797835562905, + "flos": 24372572842080.0, + "grad_norm": 1.7687091479938541, + "language_loss": 0.85920739, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88239932, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 1.68261719, + "router_z_loss_mlp": 0.26049805, + "step": 688, + "time_per_iteration": 4.066043853759766 + }, + { + "auxiliary_loss_clip": 0.01246782, + "auxiliary_loss_mlp": 0.01070875, + "balance_loss_clip": 1.0769695, + "balance_loss_mlp": 1.04462504, + "epoch": 0.04142492108823087, + "flos": 26730301449600.0, + "grad_norm": 2.0591922922537607, + "language_loss": 0.83343053, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85660708, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.26245117, + "step": 689, + "time_per_iteration": 4.224620342254639 + }, + { + "auxiliary_loss_clip": 0.01247884, + "auxiliary_loss_mlp": 0.01074606, + "balance_loss_clip": 1.07520688, + "balance_loss_mlp": 1.04840362, + "epoch": 0.04148504434089884, + "flos": 33726152485440.0, + "grad_norm": 1.920033355513434, + "language_loss": 0.68327653, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70650148, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.26220703, + "step": 690, + "time_per_iteration": 4.183804988861084 + }, + { + "auxiliary_loss_clip": 0.01245376, + "auxiliary_loss_mlp": 0.01070656, + "balance_loss_clip": 1.07457674, + "balance_loss_mlp": 1.04456067, + "epoch": 0.041545167593566815, + "flos": 25036879943520.0, + "grad_norm": 1.6986469879912904, + "language_loss": 0.7494241, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77258438, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 1.70800781, + "router_z_loss_mlp": 0.2611084, + "step": 691, + "time_per_iteration": 2.716343402862549 + }, + { + "auxiliary_loss_clip": 0.01246087, + "auxiliary_loss_mlp": 0.01073925, + "balance_loss_clip": 1.07525289, + "balance_loss_mlp": 1.04489708, + "epoch": 0.04160529084623478, + "flos": 14482336995360.0, + "grad_norm": 2.013633267160689, + "language_loss": 0.84695435, + "learning_rate": 3.998587680434526e-06, + "loss": 0.8701545, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.29016113, + "step": 692, + "time_per_iteration": 2.6200244426727295 + }, + { + "auxiliary_loss_clip": 0.0125008, + "auxiliary_loss_mlp": 0.01070061, + "balance_loss_clip": 1.07320476, + "balance_loss_mlp": 1.04067636, + "epoch": 0.04166541409890275, + "flos": 18093249331200.0, + "grad_norm": 2.9758540173614545, + "language_loss": 0.88908362, + "learning_rate": 3.99857300882812e-06, + "loss": 0.91228509, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 1.76953125, + "router_z_loss_mlp": 0.29382324, + "step": 693, + "time_per_iteration": 2.720207929611206 + }, + { + "auxiliary_loss_clip": 0.01253799, + "auxiliary_loss_mlp": 0.01060859, + "balance_loss_clip": 1.0804795, + "balance_loss_mlp": 1.03431082, + "epoch": 0.04172553735157072, + "flos": 31491901496160.0, + "grad_norm": 2.584582775666956, + "language_loss": 0.81863779, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84178436, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.26574707, + "step": 694, + "time_per_iteration": 2.7245473861694336 + }, + { + "auxiliary_loss_clip": 0.01250365, + "auxiliary_loss_mlp": 0.01066824, + "balance_loss_clip": 1.07411957, + "balance_loss_mlp": 1.03977513, + "epoch": 0.04178566060423869, + "flos": 29626550055360.0, + "grad_norm": 2.0612269515538792, + "language_loss": 0.8388828, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.86205471, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 1.76269531, + "router_z_loss_mlp": 0.27062988, + "step": 695, + "time_per_iteration": 2.7101669311523438 + }, + { + "auxiliary_loss_clip": 0.01246577, + "auxiliary_loss_mlp": 0.01069354, + "balance_loss_clip": 1.07471895, + "balance_loss_mlp": 1.04155445, + "epoch": 0.04184578385690666, + "flos": 22234456657440.0, + "grad_norm": 2.3620067107875276, + "language_loss": 0.84617597, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86933529, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 1.72070312, + "router_z_loss_mlp": 0.27819824, + "step": 696, + "time_per_iteration": 2.6775496006011963 + }, + { + "auxiliary_loss_clip": 0.01247284, + "auxiliary_loss_mlp": 0.01075984, + "balance_loss_clip": 1.07468104, + "balance_loss_mlp": 1.04828024, + "epoch": 0.041905907109574626, + "flos": 27792593156160.0, + "grad_norm": 2.224103064342064, + "language_loss": 0.93171358, + "learning_rate": 3.998513564547216e-06, + "loss": 0.95494628, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.27709961, + "step": 697, + "time_per_iteration": 2.701293468475342 + }, + { + "auxiliary_loss_clip": 0.01244525, + "auxiliary_loss_mlp": 0.0106049, + "balance_loss_clip": 1.07562613, + "balance_loss_mlp": 1.03490734, + "epoch": 0.0419660303622426, + "flos": 24595507681920.0, + "grad_norm": 2.095383032507615, + "language_loss": 0.8350625, + "learning_rate": 3.998498514015987e-06, + "loss": 0.85811269, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.25585938, + "step": 698, + "time_per_iteration": 2.6822426319122314 + }, + { + "auxiliary_loss_clip": 0.01247777, + "auxiliary_loss_mlp": 0.0107972, + "balance_loss_clip": 1.07427454, + "balance_loss_mlp": 1.05163431, + "epoch": 0.042026153614910564, + "flos": 28157967701280.0, + "grad_norm": 3.4618849290278595, + "language_loss": 0.91134453, + "learning_rate": 3.998483387701495e-06, + "loss": 0.93461949, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 1.734375, + "router_z_loss_mlp": 0.28076172, + "step": 699, + "time_per_iteration": 2.704845428466797 + }, + { + "auxiliary_loss_clip": 0.01126625, + "auxiliary_loss_mlp": 0.01015306, + "balance_loss_clip": 1.04690266, + "balance_loss_mlp": 1.0089761, + "epoch": 0.042086276867578536, + "flos": 78696397285920.0, + "grad_norm": 0.9060610349828258, + "language_loss": 0.67878157, + "learning_rate": 3.998468185604312e-06, + "loss": 0.70020086, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.06335449, + "step": 700, + "time_per_iteration": 3.3523426055908203 + }, + { + "auxiliary_loss_clip": 0.012535, + "auxiliary_loss_mlp": 0.01078671, + "balance_loss_clip": 1.07765555, + "balance_loss_mlp": 1.050156, + "epoch": 0.04214640012024651, + "flos": 18896876307360.0, + "grad_norm": 2.4513468845136757, + "language_loss": 0.89171493, + "learning_rate": 3.998452907725016e-06, + "loss": 0.91503662, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 1.75585938, + "router_z_loss_mlp": 0.28491211, + "step": 701, + "time_per_iteration": 2.7153913974761963 + }, + { + "auxiliary_loss_clip": 0.0125043, + "auxiliary_loss_mlp": 0.01070989, + "balance_loss_clip": 1.08036339, + "balance_loss_mlp": 1.04273605, + "epoch": 0.04220652337291447, + "flos": 29135185820640.0, + "grad_norm": 2.0233268685518406, + "language_loss": 0.67251414, + "learning_rate": 3.998437554064184e-06, + "loss": 0.6957283, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 1.70117188, + "router_z_loss_mlp": 0.2824707, + "step": 702, + "time_per_iteration": 2.7676823139190674 + }, + { + "auxiliary_loss_clip": 0.01119845, + "auxiliary_loss_mlp": 0.01006325, + "balance_loss_clip": 1.04071224, + "balance_loss_mlp": 1.00008476, + "epoch": 0.042266646625582445, + "flos": 77843061957600.0, + "grad_norm": 0.8412938282132233, + "language_loss": 0.60789156, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62915325, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.06234741, + "step": 703, + "time_per_iteration": 3.3463385105133057 + }, + { + "auxiliary_loss_clip": 0.01120989, + "auxiliary_loss_mlp": 0.01006767, + "balance_loss_clip": 1.04056716, + "balance_loss_mlp": 1.00043118, + "epoch": 0.04232676987825041, + "flos": 61030426553280.0, + "grad_norm": 1.015726988229099, + "language_loss": 0.57772577, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59900331, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.06341553, + "step": 704, + "time_per_iteration": 3.1539883613586426 + }, + { + "auxiliary_loss_clip": 0.01252859, + "auxiliary_loss_mlp": 0.0105993, + "balance_loss_clip": 1.07813001, + "balance_loss_mlp": 1.03177261, + "epoch": 0.04238689313091838, + "flos": 26376109673760.0, + "grad_norm": 2.2656377744927934, + "language_loss": 0.87388366, + "learning_rate": 3.998391038398319e-06, + "loss": 0.89701152, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.28173828, + "step": 705, + "time_per_iteration": 2.726149082183838 + }, + { + "auxiliary_loss_clip": 0.01239832, + "auxiliary_loss_mlp": 0.01067412, + "balance_loss_clip": 1.07376552, + "balance_loss_mlp": 1.04206848, + "epoch": 0.042447016383586354, + "flos": 23348604132000.0, + "grad_norm": 1.7774286841280187, + "language_loss": 0.71628571, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73935819, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 1.65820312, + "router_z_loss_mlp": 0.25354004, + "step": 706, + "time_per_iteration": 2.6655936241149902 + }, + { + "auxiliary_loss_clip": 0.01242077, + "auxiliary_loss_mlp": 0.01060938, + "balance_loss_clip": 1.07347941, + "balance_loss_mlp": 1.03321004, + "epoch": 0.04250713963625432, + "flos": 29804922237600.0, + "grad_norm": 2.3217690622671823, + "language_loss": 0.93692386, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.95995402, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.27709961, + "step": 707, + "time_per_iteration": 2.7320189476013184 + }, + { + "auxiliary_loss_clip": 0.01246608, + "auxiliary_loss_mlp": 0.01061151, + "balance_loss_clip": 1.07142925, + "balance_loss_mlp": 1.03279126, + "epoch": 0.04256726288892229, + "flos": 37055588862240.0, + "grad_norm": 2.0006451280148347, + "language_loss": 0.81259298, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83567065, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 1.75097656, + "router_z_loss_mlp": 0.28356934, + "step": 708, + "time_per_iteration": 2.7906363010406494 + }, + { + "auxiliary_loss_clip": 0.01250419, + "auxiliary_loss_mlp": 0.01065356, + "balance_loss_clip": 1.0750947, + "balance_loss_mlp": 1.03748512, + "epoch": 0.04262738614159026, + "flos": 19965407675040.0, + "grad_norm": 2.348643748822729, + "language_loss": 0.82042301, + "learning_rate": 3.998327956604666e-06, + "loss": 0.84358072, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 1.75292969, + "router_z_loss_mlp": 0.27868652, + "step": 709, + "time_per_iteration": 2.7343177795410156 + }, + { + "auxiliary_loss_clip": 0.01255411, + "auxiliary_loss_mlp": 0.01061912, + "balance_loss_clip": 1.07774282, + "balance_loss_mlp": 1.03365946, + "epoch": 0.04268750939425823, + "flos": 24907203181440.0, + "grad_norm": 2.707732500690642, + "language_loss": 0.85262585, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87579918, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 1.77636719, + "router_z_loss_mlp": 0.28259277, + "step": 710, + "time_per_iteration": 2.6631696224212646 + }, + { + "auxiliary_loss_clip": 0.01254764, + "auxiliary_loss_mlp": 0.01065722, + "balance_loss_clip": 1.08104837, + "balance_loss_mlp": 1.03955567, + "epoch": 0.0427476326469262, + "flos": 24773677277760.0, + "grad_norm": 2.3612905989666175, + "language_loss": 0.8462429, + "learning_rate": 3.998295961044662e-06, + "loss": 0.86944777, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 1.73535156, + "router_z_loss_mlp": 0.26196289, + "step": 711, + "time_per_iteration": 2.703124761581421 + }, + { + "auxiliary_loss_clip": 0.01244716, + "auxiliary_loss_mlp": 0.01063511, + "balance_loss_clip": 1.07230639, + "balance_loss_mlp": 1.03559244, + "epoch": 0.042807755899594166, + "flos": 26418605950080.0, + "grad_norm": 1.8703189602548358, + "language_loss": 0.85663402, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.87971628, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 1.72460938, + "router_z_loss_mlp": 0.27929688, + "step": 712, + "time_per_iteration": 2.697035312652588 + }, + { + "auxiliary_loss_clip": 0.0125222, + "auxiliary_loss_mlp": 0.0106866, + "balance_loss_clip": 1.07403064, + "balance_loss_mlp": 1.04257739, + "epoch": 0.04286787915226214, + "flos": 26154998111520.0, + "grad_norm": 5.427825693772452, + "language_loss": 0.90498227, + "learning_rate": 3.998263662382328e-06, + "loss": 0.92819107, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 1.78320312, + "router_z_loss_mlp": 0.26074219, + "step": 713, + "time_per_iteration": 2.6984124183654785 + }, + { + "auxiliary_loss_clip": 0.01137237, + "auxiliary_loss_mlp": 0.0101547, + "balance_loss_clip": 1.05073357, + "balance_loss_mlp": 1.0105381, + "epoch": 0.04292800240493011, + "flos": 81017747726400.0, + "grad_norm": 0.8802871205814905, + "language_loss": 0.63784713, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.65937418, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.04928589, + "step": 714, + "time_per_iteration": 3.394625186920166 + }, + { + "auxiliary_loss_clip": 0.01246144, + "auxiliary_loss_mlp": 0.01074498, + "balance_loss_clip": 1.07683098, + "balance_loss_mlp": 1.04808116, + "epoch": 0.042988125657598075, + "flos": 38620508607360.0, + "grad_norm": 2.0691012499513715, + "language_loss": 0.75023389, + "learning_rate": 3.998231060622563e-06, + "loss": 0.7734403, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.2644043, + "step": 715, + "time_per_iteration": 2.753211736679077 + }, + { + "auxiliary_loss_clip": 0.01252102, + "auxiliary_loss_mlp": 0.01073214, + "balance_loss_clip": 1.07950997, + "balance_loss_mlp": 1.04418612, + "epoch": 0.04304824891026605, + "flos": 40571419842720.0, + "grad_norm": 2.078784114797202, + "language_loss": 0.72944117, + "learning_rate": 3.998214646082688e-06, + "loss": 0.75269437, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.29016113, + "step": 716, + "time_per_iteration": 2.8075919151306152 + }, + { + "auxiliary_loss_clip": 0.01137002, + "auxiliary_loss_mlp": 0.01018199, + "balance_loss_clip": 1.04766417, + "balance_loss_mlp": 1.01335049, + "epoch": 0.04310837216293401, + "flos": 78170437644480.0, + "grad_norm": 0.9180540778969299, + "language_loss": 0.65596682, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67751878, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.0484314, + "step": 717, + "time_per_iteration": 3.2924201488494873 + }, + { + "auxiliary_loss_clip": 0.01136977, + "auxiliary_loss_mlp": 0.01015241, + "balance_loss_clip": 1.04700518, + "balance_loss_mlp": 1.01001358, + "epoch": 0.043168495415601985, + "flos": 74851975100160.0, + "grad_norm": 0.9876484211548102, + "language_loss": 0.58892965, + "learning_rate": 3.998181589686065e-06, + "loss": 0.61045182, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.05227661, + "step": 718, + "time_per_iteration": 3.1028294563293457 + }, + { + "auxiliary_loss_clip": 0.0124913, + "auxiliary_loss_mlp": 0.01072745, + "balance_loss_clip": 1.07888114, + "balance_loss_mlp": 1.04303813, + "epoch": 0.04322861866826996, + "flos": 25263380304000.0, + "grad_norm": 2.164081537227795, + "language_loss": 0.91353393, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93675274, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 1.70019531, + "router_z_loss_mlp": 0.296875, + "step": 719, + "time_per_iteration": 2.711247682571411 + }, + { + "auxiliary_loss_clip": 0.01244344, + "auxiliary_loss_mlp": 0.01063296, + "balance_loss_clip": 1.0731678, + "balance_loss_mlp": 1.03723681, + "epoch": 0.04328874192093792, + "flos": 37062314730720.0, + "grad_norm": 1.7024918963934184, + "language_loss": 0.66345906, + "learning_rate": 3.99814823020446e-06, + "loss": 0.68653548, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 1.71386719, + "router_z_loss_mlp": 0.26049805, + "step": 720, + "time_per_iteration": 2.7442774772644043 + }, + { + "auxiliary_loss_clip": 0.012457, + "auxiliary_loss_mlp": 0.01070935, + "balance_loss_clip": 1.0758276, + "balance_loss_mlp": 1.04296827, + "epoch": 0.043348865173605894, + "flos": 26777173592160.0, + "grad_norm": 2.2249877526030213, + "language_loss": 0.77763903, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.80080539, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 1.69824219, + "router_z_loss_mlp": 0.27966309, + "step": 721, + "time_per_iteration": 2.6920151710510254 + }, + { + "auxiliary_loss_clip": 0.01248966, + "auxiliary_loss_mlp": 0.01079961, + "balance_loss_clip": 1.07756138, + "balance_loss_mlp": 1.05338931, + "epoch": 0.04340898842627386, + "flos": 18625367599200.0, + "grad_norm": 2.7414302802270925, + "language_loss": 0.88106734, + "learning_rate": 3.998114567642933e-06, + "loss": 0.9043566, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 1.71289062, + "router_z_loss_mlp": 0.26586914, + "step": 722, + "time_per_iteration": 2.6865944862365723 + }, + { + "auxiliary_loss_clip": 0.01256811, + "auxiliary_loss_mlp": 0.01067781, + "balance_loss_clip": 1.08190084, + "balance_loss_mlp": 1.04126859, + "epoch": 0.04346911167894183, + "flos": 34123975021440.0, + "grad_norm": 1.8643716846648373, + "language_loss": 0.8413744, + "learning_rate": 3.998097622708792e-06, + "loss": 0.86462033, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 1.75, + "router_z_loss_mlp": 0.26513672, + "step": 723, + "time_per_iteration": 2.751919746398926 + }, + { + "auxiliary_loss_clip": 0.01257824, + "auxiliary_loss_mlp": 0.01077165, + "balance_loss_clip": 1.08308196, + "balance_loss_mlp": 1.05029535, + "epoch": 0.0435292349316098, + "flos": 35682006828960.0, + "grad_norm": 1.8252543253617763, + "language_loss": 0.82701182, + "learning_rate": 3.99808060200659e-06, + "loss": 0.8503617, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 1.74707031, + "router_z_loss_mlp": 0.26879883, + "step": 724, + "time_per_iteration": 2.755279541015625 + }, + { + "auxiliary_loss_clip": 0.01252409, + "auxiliary_loss_mlp": 0.01084214, + "balance_loss_clip": 1.07913613, + "balance_loss_mlp": 1.05559218, + "epoch": 0.04358935818427777, + "flos": 25085332260000.0, + "grad_norm": 2.4446123163862232, + "language_loss": 0.79783201, + "learning_rate": 3.998063505536971e-06, + "loss": 0.82119828, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 1.73046875, + "router_z_loss_mlp": 0.28588867, + "step": 725, + "time_per_iteration": 2.72099232673645 + }, + { + "auxiliary_loss_clip": 0.01261798, + "auxiliary_loss_mlp": 0.0107637, + "balance_loss_clip": 1.08129168, + "balance_loss_mlp": 1.0478549, + "epoch": 0.04364948143694574, + "flos": 17649689136480.0, + "grad_norm": 3.3319775233045696, + "language_loss": 0.86878151, + "learning_rate": 3.998046333300584e-06, + "loss": 0.89216316, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 1.8046875, + "router_z_loss_mlp": 0.28540039, + "step": 726, + "time_per_iteration": 2.7139859199523926 + }, + { + "auxiliary_loss_clip": 0.01146017, + "auxiliary_loss_mlp": 0.01037445, + "balance_loss_clip": 1.06033838, + "balance_loss_mlp": 1.03251874, + "epoch": 0.043709604689613706, + "flos": 61091757014400.0, + "grad_norm": 0.9146354565564684, + "language_loss": 0.55848134, + "learning_rate": 3.998029085298079e-06, + "loss": 0.58031595, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.04922485, + "step": 727, + "time_per_iteration": 4.853659629821777 + }, + { + "auxiliary_loss_clip": 0.01252668, + "auxiliary_loss_mlp": 0.01076406, + "balance_loss_clip": 1.07887387, + "balance_loss_mlp": 1.04864252, + "epoch": 0.04376972794228168, + "flos": 17071995278880.0, + "grad_norm": 1.9875455349322173, + "language_loss": 0.82062864, + "learning_rate": 3.998011761530112e-06, + "loss": 0.8439194, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.27770996, + "step": 728, + "time_per_iteration": 5.548681974411011 + }, + { + "auxiliary_loss_clip": 0.0124795, + "auxiliary_loss_mlp": 0.01068255, + "balance_loss_clip": 1.07925963, + "balance_loss_mlp": 1.04184973, + "epoch": 0.04382985119494965, + "flos": 26857344588480.0, + "grad_norm": 2.1627354155108542, + "language_loss": 0.77046293, + "learning_rate": 3.997994361997338e-06, + "loss": 0.793625, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 1.68652344, + "router_z_loss_mlp": 0.2644043, + "step": 729, + "time_per_iteration": 4.140490293502808 + }, + { + "auxiliary_loss_clip": 0.01255829, + "auxiliary_loss_mlp": 0.01070086, + "balance_loss_clip": 1.07883024, + "balance_loss_mlp": 1.04238188, + "epoch": 0.043889974447617615, + "flos": 29537141119200.0, + "grad_norm": 2.1275080861165496, + "language_loss": 0.95132107, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97458023, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 1.77050781, + "router_z_loss_mlp": 0.27722168, + "step": 730, + "time_per_iteration": 2.7020530700683594 + }, + { + "auxiliary_loss_clip": 0.01247548, + "auxiliary_loss_mlp": 0.01062693, + "balance_loss_clip": 1.07300472, + "balance_loss_mlp": 1.03365326, + "epoch": 0.04395009770028559, + "flos": 21078177562080.0, + "grad_norm": 3.0349314734241077, + "language_loss": 0.88123715, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90433955, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 1.74414062, + "router_z_loss_mlp": 0.29052734, + "step": 731, + "time_per_iteration": 2.657618761062622 + }, + { + "auxiliary_loss_clip": 0.01249715, + "auxiliary_loss_mlp": 0.01067496, + "balance_loss_clip": 1.07732105, + "balance_loss_mlp": 1.04260552, + "epoch": 0.04401022095295355, + "flos": 15020613889920.0, + "grad_norm": 3.178745997184453, + "language_loss": 0.8855778, + "learning_rate": 3.997941708816791e-06, + "loss": 0.90874988, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.24890137, + "step": 732, + "time_per_iteration": 2.654717206954956 + }, + { + "auxiliary_loss_clip": 0.01251977, + "auxiliary_loss_mlp": 0.01070041, + "balance_loss_clip": 1.07734454, + "balance_loss_mlp": 1.04317164, + "epoch": 0.044070344205621524, + "flos": 25575035286240.0, + "grad_norm": 2.3637160060087443, + "language_loss": 0.85888159, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88210177, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 1.74804688, + "router_z_loss_mlp": 0.26855469, + "step": 733, + "time_per_iteration": 2.6691277027130127 + }, + { + "auxiliary_loss_clip": 0.01254888, + "auxiliary_loss_mlp": 0.01078428, + "balance_loss_clip": 1.07839608, + "balance_loss_mlp": 1.04926968, + "epoch": 0.044130467458289496, + "flos": 16893501544800.0, + "grad_norm": 2.158072339062125, + "language_loss": 0.91264057, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.9359737, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 1.76171875, + "router_z_loss_mlp": 0.29150391, + "step": 734, + "time_per_iteration": 2.705151319503784 + }, + { + "auxiliary_loss_clip": 0.01248099, + "auxiliary_loss_mlp": 0.01060208, + "balance_loss_clip": 1.07921576, + "balance_loss_mlp": 1.03438687, + "epoch": 0.04419059071095746, + "flos": 34965843959520.0, + "grad_norm": 1.9713834217498247, + "language_loss": 0.77994633, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.80302942, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 1.68847656, + "router_z_loss_mlp": 0.25830078, + "step": 735, + "time_per_iteration": 2.8812949657440186 + }, + { + "auxiliary_loss_clip": 0.01243367, + "auxiliary_loss_mlp": 0.01056333, + "balance_loss_clip": 1.07378554, + "balance_loss_mlp": 1.03010714, + "epoch": 0.04425071396362543, + "flos": 34389932862240.0, + "grad_norm": 2.3993561635138425, + "language_loss": 0.8840884, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90708542, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.2623291, + "step": 736, + "time_per_iteration": 2.7238473892211914 + }, + { + "auxiliary_loss_clip": 0.01245511, + "auxiliary_loss_mlp": 0.01072938, + "balance_loss_clip": 1.07834435, + "balance_loss_mlp": 1.04727209, + "epoch": 0.0443108372162934, + "flos": 28868417634240.0, + "grad_norm": 1.7681584589236274, + "language_loss": 0.84270746, + "learning_rate": 3.997852438281901e-06, + "loss": 0.86589193, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 1.67089844, + "router_z_loss_mlp": 0.25695801, + "step": 737, + "time_per_iteration": 2.7262260913848877 + }, + { + "auxiliary_loss_clip": 0.01251804, + "auxiliary_loss_mlp": 0.01066499, + "balance_loss_clip": 1.07933998, + "balance_loss_mlp": 1.03750694, + "epoch": 0.04437096046896137, + "flos": 41463766961280.0, + "grad_norm": 2.1331892764193796, + "language_loss": 0.84811813, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87130111, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.28991699, + "step": 738, + "time_per_iteration": 2.762143611907959 + }, + { + "auxiliary_loss_clip": 0.01131016, + "auxiliary_loss_mlp": 0.01011925, + "balance_loss_clip": 1.04341364, + "balance_loss_mlp": 1.00746703, + "epoch": 0.04443108372162934, + "flos": 82239490045440.0, + "grad_norm": 0.8737906484288759, + "language_loss": 0.59187376, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61330312, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.04464722, + "step": 739, + "time_per_iteration": 3.282494068145752 + }, + { + "auxiliary_loss_clip": 0.01252063, + "auxiliary_loss_mlp": 0.01066302, + "balance_loss_clip": 1.08092594, + "balance_loss_mlp": 1.03990865, + "epoch": 0.04449120697429731, + "flos": 36304952137920.0, + "grad_norm": 2.202493073274629, + "language_loss": 0.91097254, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93415618, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 1.71191406, + "router_z_loss_mlp": 0.2635498, + "step": 740, + "time_per_iteration": 2.7221734523773193 + }, + { + "auxiliary_loss_clip": 0.01252698, + "auxiliary_loss_mlp": 0.01062437, + "balance_loss_clip": 1.08042252, + "balance_loss_mlp": 1.03672338, + "epoch": 0.04455133022696528, + "flos": 44541183441600.0, + "grad_norm": 2.032957713969487, + "language_loss": 0.72116548, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74431682, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.25708008, + "step": 741, + "time_per_iteration": 2.813366174697876 + }, + { + "auxiliary_loss_clip": 0.01241615, + "auxiliary_loss_mlp": 0.01068216, + "balance_loss_clip": 1.07402718, + "balance_loss_mlp": 1.04253876, + "epoch": 0.044611453479633245, + "flos": 35146930799520.0, + "grad_norm": 2.4463746186400486, + "language_loss": 0.88522565, + "learning_rate": 3.997761273778037e-06, + "loss": 0.90832394, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 1.67675781, + "router_z_loss_mlp": 0.25671387, + "step": 742, + "time_per_iteration": 2.7291269302368164 + }, + { + "auxiliary_loss_clip": 0.0124426, + "auxiliary_loss_mlp": 0.01059933, + "balance_loss_clip": 1.0752852, + "balance_loss_mlp": 1.03340888, + "epoch": 0.04467157673230122, + "flos": 24417378603360.0, + "grad_norm": 1.7939844425662774, + "language_loss": 0.83702743, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86006939, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 1.68945312, + "router_z_loss_mlp": 0.26550293, + "step": 743, + "time_per_iteration": 2.660159111022949 + }, + { + "auxiliary_loss_clip": 0.01249949, + "auxiliary_loss_mlp": 0.01074401, + "balance_loss_clip": 1.07669187, + "balance_loss_mlp": 1.04793644, + "epoch": 0.04473169998496919, + "flos": 21969430714080.0, + "grad_norm": 2.0623266114629355, + "language_loss": 0.80035782, + "learning_rate": 3.997724277684479e-06, + "loss": 0.82360131, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.26513672, + "step": 744, + "time_per_iteration": 2.774104595184326 + }, + { + "auxiliary_loss_clip": 0.0124312, + "auxiliary_loss_mlp": 0.01064923, + "balance_loss_clip": 1.07533002, + "balance_loss_mlp": 1.03986549, + "epoch": 0.044791823237637154, + "flos": 25174822230720.0, + "grad_norm": 3.4335940990456493, + "language_loss": 0.85250223, + "learning_rate": 3.99770566600649e-06, + "loss": 0.8755827, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 1.67773438, + "router_z_loss_mlp": 0.25036621, + "step": 745, + "time_per_iteration": 2.677554130554199 + }, + { + "auxiliary_loss_clip": 0.01244812, + "auxiliary_loss_mlp": 0.01065309, + "balance_loss_clip": 1.07521808, + "balance_loss_mlp": 1.03848672, + "epoch": 0.04485194649030513, + "flos": 38041963886880.0, + "grad_norm": 2.915985610766518, + "language_loss": 0.69226158, + "learning_rate": 3.997686978575302e-06, + "loss": 0.71536279, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 1.69433594, + "router_z_loss_mlp": 0.26818848, + "step": 746, + "time_per_iteration": 2.8518435955047607 + }, + { + "auxiliary_loss_clip": 0.01250869, + "auxiliary_loss_mlp": 0.01072814, + "balance_loss_clip": 1.07985091, + "balance_loss_mlp": 1.04525304, + "epoch": 0.04491206974297309, + "flos": 31900258524960.0, + "grad_norm": 2.4266158096073185, + "language_loss": 0.68481672, + "learning_rate": 3.997668215391625e-06, + "loss": 0.70805359, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 1.7109375, + "router_z_loss_mlp": 0.27539062, + "step": 747, + "time_per_iteration": 2.7578628063201904 + }, + { + "auxiliary_loss_clip": 0.01252361, + "auxiliary_loss_mlp": 0.01079181, + "balance_loss_clip": 1.07946086, + "balance_loss_mlp": 1.05218065, + "epoch": 0.044972192995641064, + "flos": 25218736611840.0, + "grad_norm": 2.1261535319267195, + "language_loss": 0.66593635, + "learning_rate": 3.997649376456168e-06, + "loss": 0.68925178, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 1.7265625, + "router_z_loss_mlp": 0.2701416, + "step": 748, + "time_per_iteration": 2.7073819637298584 + }, + { + "auxiliary_loss_clip": 0.01254033, + "auxiliary_loss_mlp": 0.01076025, + "balance_loss_clip": 1.08189631, + "balance_loss_mlp": 1.04920328, + "epoch": 0.045032316248309036, + "flos": 19653266485440.0, + "grad_norm": 2.3540199050208295, + "language_loss": 0.76668, + "learning_rate": 3.997630461769647e-06, + "loss": 0.78998065, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 1.72363281, + "router_z_loss_mlp": 0.26867676, + "step": 749, + "time_per_iteration": 2.631240129470825 + }, + { + "auxiliary_loss_clip": 0.01252682, + "auxiliary_loss_mlp": 0.0107274, + "balance_loss_clip": 1.08101213, + "balance_loss_mlp": 1.04677618, + "epoch": 0.045092439500977, + "flos": 21791139566400.0, + "grad_norm": 2.181551000142309, + "language_loss": 0.88906872, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91232294, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 1.71582031, + "router_z_loss_mlp": 0.25976562, + "step": 750, + "time_per_iteration": 2.6542234420776367 + }, + { + "auxiliary_loss_clip": 0.01248981, + "auxiliary_loss_mlp": 0.01068524, + "balance_loss_clip": 1.07546151, + "balance_loss_mlp": 1.03953242, + "epoch": 0.04515256275364497, + "flos": 29849809033440.0, + "grad_norm": 1.8623701419823822, + "language_loss": 0.74640644, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.76958144, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 1.73632812, + "router_z_loss_mlp": 0.28991699, + "step": 751, + "time_per_iteration": 2.6997156143188477 + }, + { + "auxiliary_loss_clip": 0.01247711, + "auxiliary_loss_mlp": 0.01073848, + "balance_loss_clip": 1.07560313, + "balance_loss_mlp": 1.04706168, + "epoch": 0.04521268600631294, + "flos": 25522855380000.0, + "grad_norm": 2.0561935636702984, + "language_loss": 0.69227391, + "learning_rate": 3.997573263210883e-06, + "loss": 0.71548951, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 1.72265625, + "router_z_loss_mlp": 0.26794434, + "step": 752, + "time_per_iteration": 2.764549970626831 + }, + { + "auxiliary_loss_clip": 0.01245123, + "auxiliary_loss_mlp": 0.01056311, + "balance_loss_clip": 1.07444286, + "balance_loss_mlp": 1.03039479, + "epoch": 0.04527280925898091, + "flos": 16315159410720.0, + "grad_norm": 3.2730481733190104, + "language_loss": 0.92558908, + "learning_rate": 3.997554045527305e-06, + "loss": 0.94860339, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 1.70996094, + "router_z_loss_mlp": 0.2590332, + "step": 753, + "time_per_iteration": 2.6243648529052734 + }, + { + "auxiliary_loss_clip": 0.01251159, + "auxiliary_loss_mlp": 0.01080848, + "balance_loss_clip": 1.07898557, + "balance_loss_mlp": 1.05439591, + "epoch": 0.04533293251164888, + "flos": 28375797363840.0, + "grad_norm": 1.963071757376533, + "language_loss": 0.91346216, + "learning_rate": 3.997534752096277e-06, + "loss": 0.93678224, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.26464844, + "step": 754, + "time_per_iteration": 2.7092244625091553 + }, + { + "auxiliary_loss_clip": 0.01239419, + "auxiliary_loss_mlp": 0.01071853, + "balance_loss_clip": 1.07606053, + "balance_loss_mlp": 1.04472089, + "epoch": 0.04539305576431685, + "flos": 15156732899520.0, + "grad_norm": 2.9804098104616235, + "language_loss": 0.78529274, + "learning_rate": 3.997515382918531e-06, + "loss": 0.80840546, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 1.63378906, + "router_z_loss_mlp": 0.27124023, + "step": 755, + "time_per_iteration": 2.649143934249878 + }, + { + "auxiliary_loss_clip": 0.01251815, + "auxiliary_loss_mlp": 0.01074934, + "balance_loss_clip": 1.07872832, + "balance_loss_mlp": 1.04807639, + "epoch": 0.04545317901698482, + "flos": 19609473656160.0, + "grad_norm": 2.173051490759667, + "language_loss": 0.7890234, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.81229091, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 1.73242188, + "router_z_loss_mlp": 0.26818848, + "step": 756, + "time_per_iteration": 2.6430509090423584 + }, + { + "auxiliary_loss_clip": 0.01127802, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.04723763, + "balance_loss_mlp": 1.02793407, + "epoch": 0.045513302269652785, + "flos": 81015681345120.0, + "grad_norm": 0.8172287595005798, + "language_loss": 0.62698025, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64858156, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.04403687, + "step": 757, + "time_per_iteration": 3.392024040222168 + }, + { + "auxiliary_loss_clip": 0.01245942, + "auxiliary_loss_mlp": 0.01066168, + "balance_loss_clip": 1.07666826, + "balance_loss_mlp": 1.04083657, + "epoch": 0.04557342552232076, + "flos": 26199155596320.0, + "grad_norm": 1.491458819701312, + "language_loss": 0.84169674, + "learning_rate": 3.997456820912346e-06, + "loss": 0.8648178, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 1.69238281, + "router_z_loss_mlp": 0.25354004, + "step": 758, + "time_per_iteration": 2.7144267559051514 + }, + { + "auxiliary_loss_clip": 0.01239823, + "auxiliary_loss_mlp": 0.01061677, + "balance_loss_clip": 1.07173014, + "balance_loss_mlp": 1.03672624, + "epoch": 0.04563354877498873, + "flos": 28958880019680.0, + "grad_norm": 1.8450664413540212, + "language_loss": 0.88280857, + "learning_rate": 3.997437148755101e-06, + "loss": 0.90582359, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 1.68066406, + "router_z_loss_mlp": 0.24987793, + "step": 759, + "time_per_iteration": 2.7239432334899902 + }, + { + "auxiliary_loss_clip": 0.01250016, + "auxiliary_loss_mlp": 0.01070542, + "balance_loss_clip": 1.08087325, + "balance_loss_mlp": 1.04344606, + "epoch": 0.045693672027656694, + "flos": 31403505492000.0, + "grad_norm": 2.382421992736469, + "language_loss": 0.73422849, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.75743413, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 1.69335938, + "router_z_loss_mlp": 0.27099609, + "step": 760, + "time_per_iteration": 2.733196496963501 + }, + { + "auxiliary_loss_clip": 0.01250764, + "auxiliary_loss_mlp": 0.01068232, + "balance_loss_clip": 1.08195734, + "balance_loss_mlp": 1.04342461, + "epoch": 0.045753795280324666, + "flos": 24061120446240.0, + "grad_norm": 2.0956960561808775, + "language_loss": 0.82567596, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84886593, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 1.6875, + "router_z_loss_mlp": 0.24816895, + "step": 761, + "time_per_iteration": 2.6809468269348145 + }, + { + "auxiliary_loss_clip": 0.0124153, + "auxiliary_loss_mlp": 0.01068035, + "balance_loss_clip": 1.07505488, + "balance_loss_mlp": 1.04163039, + "epoch": 0.04581391853299264, + "flos": 28377174951360.0, + "grad_norm": 1.9028661478436681, + "language_loss": 0.79872918, + "learning_rate": 3.997377677828266e-06, + "loss": 0.82182485, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 1.66308594, + "router_z_loss_mlp": 0.2635498, + "step": 762, + "time_per_iteration": 2.7098872661590576 + }, + { + "auxiliary_loss_clip": 0.01123313, + "auxiliary_loss_mlp": 0.01005488, + "balance_loss_clip": 1.04287732, + "balance_loss_mlp": 1.00114238, + "epoch": 0.0458740417856606, + "flos": 78374937121920.0, + "grad_norm": 1.0080513808548113, + "language_loss": 0.58746487, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.60875291, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.04351807, + "step": 763, + "time_per_iteration": 3.354146718978882 + }, + { + "auxiliary_loss_clip": 0.01246595, + "auxiliary_loss_mlp": 0.01067495, + "balance_loss_clip": 1.07609618, + "balance_loss_mlp": 1.04161477, + "epoch": 0.045934165038328575, + "flos": 25344402163200.0, + "grad_norm": 3.277020061547608, + "language_loss": 0.87616515, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.89930612, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 1.70507812, + "router_z_loss_mlp": 0.2590332, + "step": 764, + "time_per_iteration": 2.69775652885437 + }, + { + "auxiliary_loss_clip": 0.01248185, + "auxiliary_loss_mlp": 0.01077003, + "balance_loss_clip": 1.07762408, + "balance_loss_mlp": 1.05139673, + "epoch": 0.04599428829099654, + "flos": 36927613825920.0, + "grad_norm": 2.1057384960821937, + "language_loss": 0.8579734, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88122523, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.2557373, + "step": 765, + "time_per_iteration": 2.749006986618042 + }, + { + "auxiliary_loss_clip": 0.01250906, + "auxiliary_loss_mlp": 0.01065737, + "balance_loss_clip": 1.07861137, + "balance_loss_mlp": 1.03682876, + "epoch": 0.04605441154366451, + "flos": 28112351594400.0, + "grad_norm": 3.0508703619716115, + "language_loss": 0.88260293, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90576935, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 1.72167969, + "router_z_loss_mlp": 0.2890625, + "step": 766, + "time_per_iteration": 4.269926071166992 + }, + { + "auxiliary_loss_clip": 0.01244175, + "auxiliary_loss_mlp": 0.01066081, + "balance_loss_clip": 1.07581735, + "balance_loss_mlp": 1.04008126, + "epoch": 0.046114534796332485, + "flos": 26865812700000.0, + "grad_norm": 2.4056247448413544, + "language_loss": 0.84373623, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86683881, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 1.68457031, + "router_z_loss_mlp": 0.26013184, + "step": 767, + "time_per_iteration": 4.128887176513672 + }, + { + "auxiliary_loss_clip": 0.01244921, + "auxiliary_loss_mlp": 0.01057829, + "balance_loss_clip": 1.07854688, + "balance_loss_mlp": 1.03153145, + "epoch": 0.04617465804900045, + "flos": 36438802179840.0, + "grad_norm": 2.009262549464076, + "language_loss": 0.86952996, + "learning_rate": 3.99725669099461e-06, + "loss": 0.8925575, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.26293945, + "step": 768, + "time_per_iteration": 4.176476001739502 + }, + { + "auxiliary_loss_clip": 0.01241486, + "auxiliary_loss_mlp": 0.01064537, + "balance_loss_clip": 1.0713309, + "balance_loss_mlp": 1.03934824, + "epoch": 0.04623478130166842, + "flos": 31274193385440.0, + "grad_norm": 2.234251251322822, + "language_loss": 0.75050819, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.77356839, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 1.70214844, + "router_z_loss_mlp": 0.25158691, + "step": 769, + "time_per_iteration": 4.196349382400513 + }, + { + "auxiliary_loss_clip": 0.01239231, + "auxiliary_loss_mlp": 0.01065043, + "balance_loss_clip": 1.07577586, + "balance_loss_mlp": 1.04151106, + "epoch": 0.04629490455433639, + "flos": 24951968425440.0, + "grad_norm": 1.854924032041225, + "language_loss": 0.86126435, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88430715, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.23522949, + "step": 770, + "time_per_iteration": 2.67380952835083 + }, + { + "auxiliary_loss_clip": 0.01246911, + "auxiliary_loss_mlp": 0.01067098, + "balance_loss_clip": 1.07412517, + "balance_loss_mlp": 1.04127717, + "epoch": 0.04635502780700436, + "flos": 28246323188160.0, + "grad_norm": 2.0939118713878306, + "language_loss": 0.87145555, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89459562, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 1.72753906, + "router_z_loss_mlp": 0.25805664, + "step": 771, + "time_per_iteration": 2.7738635540008545 + }, + { + "auxiliary_loss_clip": 0.01246576, + "auxiliary_loss_mlp": 0.01075478, + "balance_loss_clip": 1.07317472, + "balance_loss_mlp": 1.04852438, + "epoch": 0.04641515105967233, + "flos": 28421413470720.0, + "grad_norm": 3.171186004367874, + "language_loss": 0.84142953, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86465007, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 1.73339844, + "router_z_loss_mlp": 0.26965332, + "step": 772, + "time_per_iteration": 2.6935322284698486 + }, + { + "auxiliary_loss_clip": 0.01246508, + "auxiliary_loss_mlp": 0.01061585, + "balance_loss_clip": 1.07986927, + "balance_loss_mlp": 1.0365392, + "epoch": 0.046475274312340296, + "flos": 30646304968320.0, + "grad_norm": 2.6339751169963375, + "language_loss": 0.73923093, + "learning_rate": 3.997153785881557e-06, + "loss": 0.76231188, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 1.66601562, + "router_z_loss_mlp": 0.25036621, + "step": 773, + "time_per_iteration": 2.7424960136413574 + }, + { + "auxiliary_loss_clip": 0.01239483, + "auxiliary_loss_mlp": 0.01064253, + "balance_loss_clip": 1.07606912, + "balance_loss_mlp": 1.03745532, + "epoch": 0.04653539756500827, + "flos": 30828647844000.0, + "grad_norm": 2.004763467687063, + "language_loss": 0.7856468, + "learning_rate": 3.997132977658996e-06, + "loss": 0.80868411, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 1.63671875, + "router_z_loss_mlp": 0.26806641, + "step": 774, + "time_per_iteration": 2.7091541290283203 + }, + { + "auxiliary_loss_clip": 0.01238617, + "auxiliary_loss_mlp": 0.01060722, + "balance_loss_clip": 1.07501507, + "balance_loss_mlp": 1.03660607, + "epoch": 0.046595520817676234, + "flos": 43201670090400.0, + "grad_norm": 2.6894493257517627, + "language_loss": 0.73512959, + "learning_rate": 3.997112093704952e-06, + "loss": 0.75812298, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.24133301, + "step": 775, + "time_per_iteration": 2.7768311500549316 + }, + { + "auxiliary_loss_clip": 0.01241348, + "auxiliary_loss_mlp": 0.01053718, + "balance_loss_clip": 1.07479417, + "balance_loss_mlp": 1.02804041, + "epoch": 0.046655644070344206, + "flos": 22100849719200.0, + "grad_norm": 1.588744988065128, + "language_loss": 0.77424157, + "learning_rate": 3.997091134020217e-06, + "loss": 0.79719228, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 1.66503906, + "router_z_loss_mlp": 0.2565918, + "step": 776, + "time_per_iteration": 2.6935343742370605 + }, + { + "auxiliary_loss_clip": 0.0123693, + "auxiliary_loss_mlp": 0.01066131, + "balance_loss_clip": 1.07420778, + "balance_loss_mlp": 1.04215801, + "epoch": 0.04671576732301218, + "flos": 35636998481280.0, + "grad_norm": 2.067155150080586, + "language_loss": 0.7100625, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73309314, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 1.62792969, + "router_z_loss_mlp": 0.23974609, + "step": 777, + "time_per_iteration": 2.7715775966644287 + }, + { + "auxiliary_loss_clip": 0.01241408, + "auxiliary_loss_mlp": 0.01077738, + "balance_loss_clip": 1.07631326, + "balance_loss_mlp": 1.05077326, + "epoch": 0.04677589057568014, + "flos": 37190816491680.0, + "grad_norm": 1.8305331893922518, + "language_loss": 0.76558518, + "learning_rate": 3.997048987461856e-06, + "loss": 0.78877664, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.26928711, + "step": 778, + "time_per_iteration": 2.765292167663574 + }, + { + "auxiliary_loss_clip": 0.01236702, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.07450032, + "balance_loss_mlp": 1.03365111, + "epoch": 0.046836013828348115, + "flos": 25084562431680.0, + "grad_norm": 1.963462118208894, + "language_loss": 0.78793591, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81089097, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.25158691, + "step": 779, + "time_per_iteration": 2.691596269607544 + }, + { + "auxiliary_loss_clip": 0.01231995, + "auxiliary_loss_mlp": 0.01063268, + "balance_loss_clip": 1.07277918, + "balance_loss_mlp": 1.03954554, + "epoch": 0.04689613708101608, + "flos": 31051299062880.0, + "grad_norm": 1.7832276002535028, + "language_loss": 0.77200687, + "learning_rate": 3.997006537990308e-06, + "loss": 0.79495955, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.23742676, + "step": 780, + "time_per_iteration": 2.7697412967681885 + }, + { + "auxiliary_loss_clip": 0.01236484, + "auxiliary_loss_mlp": 0.01063455, + "balance_loss_clip": 1.07547188, + "balance_loss_mlp": 1.04068613, + "epoch": 0.04695626033368405, + "flos": 28066654452960.0, + "grad_norm": 1.723423722305139, + "language_loss": 0.76463735, + "learning_rate": 3.996985199664099e-06, + "loss": 0.7876367, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.22766113, + "step": 781, + "time_per_iteration": 2.6834561824798584 + }, + { + "auxiliary_loss_clip": 0.01245018, + "auxiliary_loss_mlp": 0.01065019, + "balance_loss_clip": 1.07648158, + "balance_loss_mlp": 1.03988981, + "epoch": 0.047016383586352024, + "flos": 35549129201760.0, + "grad_norm": 2.710237394638598, + "language_loss": 0.73632705, + "learning_rate": 3.99696378561201e-06, + "loss": 0.75942743, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 1.68554688, + "router_z_loss_mlp": 0.25146484, + "step": 782, + "time_per_iteration": 2.766160011291504 + }, + { + "auxiliary_loss_clip": 0.01240309, + "auxiliary_loss_mlp": 0.0105946, + "balance_loss_clip": 1.07835102, + "balance_loss_mlp": 1.03601146, + "epoch": 0.04707650683901999, + "flos": 18273687894720.0, + "grad_norm": 2.2347751770233497, + "language_loss": 0.80622822, + "learning_rate": 3.996942295834855e-06, + "loss": 0.8292259, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 1.62109375, + "router_z_loss_mlp": 0.23425293, + "step": 783, + "time_per_iteration": 2.6550657749176025 + }, + { + "auxiliary_loss_clip": 0.01233473, + "auxiliary_loss_mlp": 0.01059417, + "balance_loss_clip": 1.07613218, + "balance_loss_mlp": 1.03559887, + "epoch": 0.04713663009168796, + "flos": 26418970605600.0, + "grad_norm": 2.1179056872820685, + "language_loss": 0.81616819, + "learning_rate": 3.996920730333448e-06, + "loss": 0.83909708, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.23828125, + "step": 784, + "time_per_iteration": 2.7349956035614014 + }, + { + "auxiliary_loss_clip": 0.01238917, + "auxiliary_loss_mlp": 0.01063972, + "balance_loss_clip": 1.07341444, + "balance_loss_mlp": 1.03984392, + "epoch": 0.04719675334435593, + "flos": 26021269621440.0, + "grad_norm": 2.5036190342096556, + "language_loss": 0.80921251, + "learning_rate": 3.996899089108607e-06, + "loss": 0.83224136, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 1.65722656, + "router_z_loss_mlp": 0.24157715, + "step": 785, + "time_per_iteration": 2.6617002487182617 + }, + { + "auxiliary_loss_clip": 0.01243136, + "auxiliary_loss_mlp": 0.01058718, + "balance_loss_clip": 1.08161449, + "balance_loss_mlp": 1.03555536, + "epoch": 0.0472568765970239, + "flos": 21879697639680.0, + "grad_norm": 1.8273374545673091, + "language_loss": 0.89245433, + "learning_rate": 3.996877372161152e-06, + "loss": 0.91547287, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 1.61425781, + "router_z_loss_mlp": 0.23156738, + "step": 786, + "time_per_iteration": 2.6789512634277344 + }, + { + "auxiliary_loss_clip": 0.01239118, + "auxiliary_loss_mlp": 0.01062924, + "balance_loss_clip": 1.06959748, + "balance_loss_mlp": 1.03619707, + "epoch": 0.04731699984969187, + "flos": 22057786200960.0, + "grad_norm": 5.054278169551639, + "language_loss": 0.7698313, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.79285175, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 1.6953125, + "router_z_loss_mlp": 0.26782227, + "step": 787, + "time_per_iteration": 2.6376538276672363 + }, + { + "auxiliary_loss_clip": 0.012479, + "auxiliary_loss_mlp": 0.01063984, + "balance_loss_clip": 1.08192539, + "balance_loss_mlp": 1.03890204, + "epoch": 0.047377123102359836, + "flos": 28290683259360.0, + "grad_norm": 2.3037110687028877, + "language_loss": 0.8119812, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83510005, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.25085449, + "step": 788, + "time_per_iteration": 2.70719838142395 + }, + { + "auxiliary_loss_clip": 0.01236503, + "auxiliary_loss_mlp": 0.0106943, + "balance_loss_clip": 1.07695436, + "balance_loss_mlp": 1.0438956, + "epoch": 0.04743724635502781, + "flos": 27757025334720.0, + "grad_norm": 3.3688726913878995, + "language_loss": 0.84756434, + "learning_rate": 3.996811766991355e-06, + "loss": 0.87062371, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.25500488, + "step": 789, + "time_per_iteration": 2.6517109870910645 + }, + { + "auxiliary_loss_clip": 0.0124245, + "auxiliary_loss_mlp": 0.01064781, + "balance_loss_clip": 1.07791448, + "balance_loss_mlp": 1.04142773, + "epoch": 0.04749736960769577, + "flos": 21034384732800.0, + "grad_norm": 2.09005410332846, + "language_loss": 0.82051039, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84358275, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 1.64453125, + "router_z_loss_mlp": 0.23364258, + "step": 790, + "time_per_iteration": 2.681994915008545 + }, + { + "auxiliary_loss_clip": 0.01236391, + "auxiliary_loss_mlp": 0.01061963, + "balance_loss_clip": 1.07416821, + "balance_loss_mlp": 1.03690553, + "epoch": 0.047557492860363745, + "flos": 49394380357440.0, + "grad_norm": 2.3043244748210014, + "language_loss": 0.87902188, + "learning_rate": 3.996767651613597e-06, + "loss": 0.90200531, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.25048828, + "step": 791, + "time_per_iteration": 2.907400608062744 + }, + { + "auxiliary_loss_clip": 0.01240744, + "auxiliary_loss_mlp": 0.01061579, + "balance_loss_clip": 1.07896626, + "balance_loss_mlp": 1.03671241, + "epoch": 0.04761761611303172, + "flos": 22815148793760.0, + "grad_norm": 2.054021028553834, + "language_loss": 0.90071696, + "learning_rate": 3.996745480347854e-06, + "loss": 0.92374021, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 1.6171875, + "router_z_loss_mlp": 0.24902344, + "step": 792, + "time_per_iteration": 2.6550238132476807 + }, + { + "auxiliary_loss_clip": 0.01240235, + "auxiliary_loss_mlp": 0.01071357, + "balance_loss_clip": 1.07567835, + "balance_loss_mlp": 1.04808748, + "epoch": 0.04767773936569968, + "flos": 25529176075680.0, + "grad_norm": 2.1504564826684147, + "language_loss": 0.73675668, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75987267, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 1.64550781, + "router_z_loss_mlp": 0.23254395, + "step": 793, + "time_per_iteration": 2.69620680809021 + }, + { + "auxiliary_loss_clip": 0.01243979, + "auxiliary_loss_mlp": 0.01065403, + "balance_loss_clip": 1.07801044, + "balance_loss_mlp": 1.04048872, + "epoch": 0.047737862618367655, + "flos": 28958880019680.0, + "grad_norm": 1.9715906442873177, + "language_loss": 0.86096406, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88405782, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 1.65917969, + "router_z_loss_mlp": 0.24902344, + "step": 794, + "time_per_iteration": 2.6852715015411377 + }, + { + "auxiliary_loss_clip": 0.0123931, + "auxiliary_loss_mlp": 0.01068653, + "balance_loss_clip": 1.07434857, + "balance_loss_mlp": 1.0438571, + "epoch": 0.04779798587103562, + "flos": 28913871672000.0, + "grad_norm": 2.5886572119518343, + "language_loss": 0.69769418, + "learning_rate": 3.996678512253272e-06, + "loss": 0.72077382, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 1.65136719, + "router_z_loss_mlp": 0.24804688, + "step": 795, + "time_per_iteration": 2.6856513023376465 + }, + { + "auxiliary_loss_clip": 0.0123632, + "auxiliary_loss_mlp": 0.010669, + "balance_loss_clip": 1.07437491, + "balance_loss_mlp": 1.042593, + "epoch": 0.04785810912370359, + "flos": 28289062568160.0, + "grad_norm": 1.795772282802024, + "language_loss": 0.80725729, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83028954, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.24316406, + "step": 796, + "time_per_iteration": 2.7209928035736084 + }, + { + "auxiliary_loss_clip": 0.01239642, + "auxiliary_loss_mlp": 0.01057545, + "balance_loss_clip": 1.07587731, + "balance_loss_mlp": 1.03291667, + "epoch": 0.047918232376371564, + "flos": 22013020956960.0, + "grad_norm": 3.698634331531907, + "language_loss": 0.81343818, + "learning_rate": 3.996633488284228e-06, + "loss": 0.83641005, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 1.63574219, + "router_z_loss_mlp": 0.24658203, + "step": 797, + "time_per_iteration": 2.6925137042999268 + }, + { + "auxiliary_loss_clip": 0.01136813, + "auxiliary_loss_mlp": 0.01015574, + "balance_loss_clip": 1.05824852, + "balance_loss_mlp": 1.01076365, + "epoch": 0.04797835562903953, + "flos": 76192460866080.0, + "grad_norm": 0.9188712664347837, + "language_loss": 0.64449632, + "learning_rate": 3.996610862730465e-06, + "loss": 0.66602027, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.04803467, + "step": 798, + "time_per_iteration": 3.2340128421783447 + }, + { + "auxiliary_loss_clip": 0.01242422, + "auxiliary_loss_mlp": 0.01064044, + "balance_loss_clip": 1.07194471, + "balance_loss_mlp": 1.03989267, + "epoch": 0.0480384788817075, + "flos": 26243272563840.0, + "grad_norm": 2.038082174607587, + "language_loss": 0.91549921, + "learning_rate": 3.996588161465018e-06, + "loss": 0.93856388, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 1.70703125, + "router_z_loss_mlp": 0.24169922, + "step": 799, + "time_per_iteration": 2.706191062927246 + }, + { + "auxiliary_loss_clip": 0.01241657, + "auxiliary_loss_mlp": 0.01066953, + "balance_loss_clip": 1.07941556, + "balance_loss_mlp": 1.04154921, + "epoch": 0.048098602134375466, + "flos": 26510972647680.0, + "grad_norm": 2.1797890155591864, + "language_loss": 0.86574817, + "learning_rate": 3.996565384488748e-06, + "loss": 0.8888343, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 1.62304688, + "router_z_loss_mlp": 0.25390625, + "step": 800, + "time_per_iteration": 2.6819448471069336 + }, + { + "auxiliary_loss_clip": 0.01240633, + "auxiliary_loss_mlp": 0.01066068, + "balance_loss_clip": 1.07511783, + "balance_loss_mlp": 1.04260802, + "epoch": 0.04815872538704344, + "flos": 27980608451040.0, + "grad_norm": 3.2743987049681853, + "language_loss": 0.84548062, + "learning_rate": 3.996542531802518e-06, + "loss": 0.86854762, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 1.65625, + "router_z_loss_mlp": 0.23461914, + "step": 801, + "time_per_iteration": 2.6767354011535645 + }, + { + "auxiliary_loss_clip": 0.01239325, + "auxiliary_loss_mlp": 0.0106802, + "balance_loss_clip": 1.0750668, + "balance_loss_mlp": 1.04391575, + "epoch": 0.04821884863971141, + "flos": 52683994598400.0, + "grad_norm": 2.1645218101793544, + "language_loss": 0.79849935, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82157284, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 1.64257812, + "router_z_loss_mlp": 0.24108887, + "step": 802, + "time_per_iteration": 2.876420259475708 + }, + { + "auxiliary_loss_clip": 0.01240792, + "auxiliary_loss_mlp": 0.01059398, + "balance_loss_clip": 1.07788444, + "balance_loss_mlp": 1.03618813, + "epoch": 0.048278971892379376, + "flos": 22720432093920.0, + "grad_norm": 1.9902330533740793, + "language_loss": 0.86446714, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88746905, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.23205566, + "step": 803, + "time_per_iteration": 2.652531147003174 + }, + { + "auxiliary_loss_clip": 0.01235799, + "auxiliary_loss_mlp": 0.01053333, + "balance_loss_clip": 1.07671034, + "balance_loss_mlp": 1.02936029, + "epoch": 0.04833909514504735, + "flos": 24684714031680.0, + "grad_norm": 2.431784121543883, + "language_loss": 0.85320556, + "learning_rate": 3.996473519492753e-06, + "loss": 0.8760969, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.23986816, + "step": 804, + "time_per_iteration": 2.6855125427246094 + }, + { + "auxiliary_loss_clip": 0.01239107, + "auxiliary_loss_mlp": 0.01065179, + "balance_loss_clip": 1.07612097, + "balance_loss_mlp": 1.04138446, + "epoch": 0.04839921839771532, + "flos": 30072338700480.0, + "grad_norm": 1.9763538410060866, + "language_loss": 0.86255372, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88559651, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 1.62988281, + "router_z_loss_mlp": 0.23791504, + "step": 805, + "time_per_iteration": 2.704782247543335 + }, + { + "auxiliary_loss_clip": 0.01235152, + "auxiliary_loss_mlp": 0.01060511, + "balance_loss_clip": 1.07342577, + "balance_loss_mlp": 1.03665733, + "epoch": 0.048459341650383285, + "flos": 30294787332960.0, + "grad_norm": 1.8870842353681978, + "language_loss": 0.68248004, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.70543665, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 1.61816406, + "router_z_loss_mlp": 0.23852539, + "step": 806, + "time_per_iteration": 4.166409254074097 + }, + { + "auxiliary_loss_clip": 0.01234982, + "auxiliary_loss_mlp": 0.01058787, + "balance_loss_clip": 1.07549834, + "balance_loss_mlp": 1.03446829, + "epoch": 0.04851946490305126, + "flos": 27043739192160.0, + "grad_norm": 4.753124855347904, + "language_loss": 0.76446474, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.78740245, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.24316406, + "step": 807, + "time_per_iteration": 5.734745502471924 + }, + { + "auxiliary_loss_clip": 0.01231936, + "auxiliary_loss_mlp": 0.01063989, + "balance_loss_clip": 1.07166874, + "balance_loss_mlp": 1.03992105, + "epoch": 0.04857958815571922, + "flos": 24150205244160.0, + "grad_norm": 2.4895142369309164, + "language_loss": 0.87003052, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.89298975, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 1.60253906, + "router_z_loss_mlp": 0.24084473, + "step": 808, + "time_per_iteration": 4.123759746551514 + }, + { + "auxiliary_loss_clip": 0.01241024, + "auxiliary_loss_mlp": 0.01062891, + "balance_loss_clip": 1.07610488, + "balance_loss_mlp": 1.03898978, + "epoch": 0.048639711408387194, + "flos": 22815108276480.0, + "grad_norm": 1.7809220124619416, + "language_loss": 0.90044206, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92348123, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 1.65039062, + "router_z_loss_mlp": 0.23925781, + "step": 809, + "time_per_iteration": 2.725801944732666 + }, + { + "auxiliary_loss_clip": 0.01237008, + "auxiliary_loss_mlp": 0.01062583, + "balance_loss_clip": 1.07698989, + "balance_loss_mlp": 1.03892004, + "epoch": 0.048699834661055166, + "flos": 30338823265920.0, + "grad_norm": 2.233286968004601, + "language_loss": 0.84604317, + "learning_rate": 3.996333450822208e-06, + "loss": 0.86903912, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.23681641, + "step": 810, + "time_per_iteration": 2.725553274154663 + }, + { + "auxiliary_loss_clip": 0.01237658, + "auxiliary_loss_mlp": 0.01058277, + "balance_loss_clip": 1.07438898, + "balance_loss_mlp": 1.03416061, + "epoch": 0.04875995791372313, + "flos": 25263096683040.0, + "grad_norm": 1.7141367402903165, + "language_loss": 0.80648053, + "learning_rate": 3.99630984108452e-06, + "loss": 0.82943988, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.24108887, + "step": 811, + "time_per_iteration": 2.6832752227783203 + }, + { + "auxiliary_loss_clip": 0.01229982, + "auxiliary_loss_mlp": 0.01062527, + "balance_loss_clip": 1.07166111, + "balance_loss_mlp": 1.0391624, + "epoch": 0.048820081166391104, + "flos": 22681298751840.0, + "grad_norm": 1.9362607879391265, + "language_loss": 0.74671382, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.7696389, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.23352051, + "step": 812, + "time_per_iteration": 2.711787462234497 + }, + { + "auxiliary_loss_clip": 0.01238167, + "auxiliary_loss_mlp": 0.01072394, + "balance_loss_clip": 1.08030164, + "balance_loss_mlp": 1.04890943, + "epoch": 0.04888020441905907, + "flos": 27708005776320.0, + "grad_norm": 2.074933863702776, + "language_loss": 0.90592271, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92902833, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.23486328, + "step": 813, + "time_per_iteration": 2.674884557723999 + }, + { + "auxiliary_loss_clip": 0.01231177, + "auxiliary_loss_mlp": 0.0105616, + "balance_loss_clip": 1.07350123, + "balance_loss_mlp": 1.03297377, + "epoch": 0.04894032767172704, + "flos": 27801628509600.0, + "grad_norm": 2.3087175557957695, + "language_loss": 0.74823296, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.77110636, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.23205566, + "step": 814, + "time_per_iteration": 2.676614761352539 + }, + { + "auxiliary_loss_clip": 0.01235211, + "auxiliary_loss_mlp": 0.01065299, + "balance_loss_clip": 1.0756793, + "balance_loss_mlp": 1.04132581, + "epoch": 0.04900045092439501, + "flos": 31135602821760.0, + "grad_norm": 2.3002455694170036, + "language_loss": 0.83670676, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.85971189, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 1.59570312, + "router_z_loss_mlp": 0.23974609, + "step": 815, + "time_per_iteration": 2.736072063446045 + }, + { + "auxiliary_loss_clip": 0.01237633, + "auxiliary_loss_mlp": 0.01055602, + "balance_loss_clip": 1.07531261, + "balance_loss_mlp": 1.03115201, + "epoch": 0.04906057417706298, + "flos": 31674568510080.0, + "grad_norm": 2.7195454049250976, + "language_loss": 0.90861315, + "learning_rate": 3.996190656910043e-06, + "loss": 0.9315455, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.24462891, + "step": 816, + "time_per_iteration": 2.6948797702789307 + }, + { + "auxiliary_loss_clip": 0.0124001, + "auxiliary_loss_mlp": 0.01055166, + "balance_loss_clip": 1.07678699, + "balance_loss_mlp": 1.03121722, + "epoch": 0.04912069742973095, + "flos": 22725658823040.0, + "grad_norm": 2.4094732591624894, + "language_loss": 0.79828995, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82124174, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 1.63085938, + "router_z_loss_mlp": 0.23950195, + "step": 817, + "time_per_iteration": 2.695582866668701 + }, + { + "auxiliary_loss_clip": 0.01236529, + "auxiliary_loss_mlp": 0.01071629, + "balance_loss_clip": 1.07759094, + "balance_loss_mlp": 1.04720259, + "epoch": 0.049180820682398915, + "flos": 28918409607360.0, + "grad_norm": 3.384452485850364, + "language_loss": 0.84937733, + "learning_rate": 3.996142453363656e-06, + "loss": 0.87245888, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.24414062, + "step": 818, + "time_per_iteration": 2.7030885219573975 + }, + { + "auxiliary_loss_clip": 0.01240399, + "auxiliary_loss_mlp": 0.01060073, + "balance_loss_clip": 1.07397532, + "balance_loss_mlp": 1.03483629, + "epoch": 0.04924094393506689, + "flos": 27357420038400.0, + "grad_norm": 1.9167220223514174, + "language_loss": 0.75731957, + "learning_rate": 3.996118238049124e-06, + "loss": 0.78032422, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 1.66210938, + "router_z_loss_mlp": 0.25244141, + "step": 819, + "time_per_iteration": 2.7195587158203125 + }, + { + "auxiliary_loss_clip": 0.01238348, + "auxiliary_loss_mlp": 0.01058197, + "balance_loss_clip": 1.07822394, + "balance_loss_mlp": 1.03635812, + "epoch": 0.04930106718773486, + "flos": 19203831285120.0, + "grad_norm": 2.471103340131062, + "language_loss": 0.85006154, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87302697, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 1.60058594, + "router_z_loss_mlp": 0.21850586, + "step": 820, + "time_per_iteration": 2.7223563194274902 + }, + { + "auxiliary_loss_clip": 0.01235691, + "auxiliary_loss_mlp": 0.01060448, + "balance_loss_clip": 1.07304823, + "balance_loss_mlp": 1.03651094, + "epoch": 0.049361190440402825, + "flos": 32031434426400.0, + "grad_norm": 1.8661757364922895, + "language_loss": 0.90896696, + "learning_rate": 3.996069580341966e-06, + "loss": 0.93192834, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 1.62695312, + "router_z_loss_mlp": 0.23937988, + "step": 821, + "time_per_iteration": 2.715564250946045 + }, + { + "auxiliary_loss_clip": 0.01234139, + "auxiliary_loss_mlp": 0.01074981, + "balance_loss_clip": 1.07347751, + "balance_loss_mlp": 1.05255771, + "epoch": 0.0494213136930708, + "flos": 25932144306240.0, + "grad_norm": 1.96134699184454, + "language_loss": 0.89617443, + "learning_rate": 3.996045137951188e-06, + "loss": 0.91926563, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 1.60644531, + "router_z_loss_mlp": 0.22387695, + "step": 822, + "time_per_iteration": 2.715949773788452 + }, + { + "auxiliary_loss_clip": 0.01236119, + "auxiliary_loss_mlp": 0.01059563, + "balance_loss_clip": 1.07690442, + "balance_loss_mlp": 1.03368223, + "epoch": 0.04948143694573876, + "flos": 34124056056000.0, + "grad_norm": 2.8184711800920708, + "language_loss": 0.6734283, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69638503, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 1.59277344, + "router_z_loss_mlp": 0.25878906, + "step": 823, + "time_per_iteration": 2.714160680770874 + }, + { + "auxiliary_loss_clip": 0.0111306, + "auxiliary_loss_mlp": 0.01007325, + "balance_loss_clip": 1.03824615, + "balance_loss_mlp": 1.00237155, + "epoch": 0.049541560198406734, + "flos": 80756159401440.0, + "grad_norm": 1.310302585589464, + "language_loss": 0.62253881, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64374268, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.04949951, + "step": 824, + "time_per_iteration": 3.3694863319396973 + }, + { + "auxiliary_loss_clip": 0.01242101, + "auxiliary_loss_mlp": 0.01071688, + "balance_loss_clip": 1.07734489, + "balance_loss_mlp": 1.04566479, + "epoch": 0.049601683451074706, + "flos": 27933938894880.0, + "grad_norm": 1.876186232871792, + "language_loss": 0.90528035, + "learning_rate": 3.995971356641185e-06, + "loss": 0.92841822, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 1.6484375, + "router_z_loss_mlp": 0.26013184, + "step": 825, + "time_per_iteration": 2.6963553428649902 + }, + { + "auxiliary_loss_clip": 0.01237365, + "auxiliary_loss_mlp": 0.01065964, + "balance_loss_clip": 1.07520032, + "balance_loss_mlp": 1.04054844, + "epoch": 0.04966180670374267, + "flos": 26153458454880.0, + "grad_norm": 2.4435526706516293, + "language_loss": 0.66746414, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.6904974, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.25415039, + "step": 826, + "time_per_iteration": 2.682192087173462 + }, + { + "auxiliary_loss_clip": 0.0123877, + "auxiliary_loss_mlp": 0.01064028, + "balance_loss_clip": 1.07560325, + "balance_loss_mlp": 1.03945863, + "epoch": 0.04972192995641064, + "flos": 28196290697760.0, + "grad_norm": 3.1179496750530946, + "language_loss": 0.78213537, + "learning_rate": 3.995921790662459e-06, + "loss": 0.80516326, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 1.6328125, + "router_z_loss_mlp": 0.24584961, + "step": 827, + "time_per_iteration": 2.7182819843292236 + }, + { + "auxiliary_loss_clip": 0.01242933, + "auxiliary_loss_mlp": 0.01078472, + "balance_loss_clip": 1.0760349, + "balance_loss_mlp": 1.05194771, + "epoch": 0.04978205320907861, + "flos": 49305174007680.0, + "grad_norm": 1.8875862228035332, + "language_loss": 0.78455317, + "learning_rate": 3.995896894144294e-06, + "loss": 0.80776721, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 1.66894531, + "router_z_loss_mlp": 0.26538086, + "step": 828, + "time_per_iteration": 2.8709065914154053 + }, + { + "auxiliary_loss_clip": 0.01227599, + "auxiliary_loss_mlp": 0.01062157, + "balance_loss_clip": 1.0707109, + "balance_loss_mlp": 1.03830361, + "epoch": 0.04984217646174658, + "flos": 30784247255520.0, + "grad_norm": 1.7561958247022864, + "language_loss": 0.83671427, + "learning_rate": 3.995871921941519e-06, + "loss": 0.85961187, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.23852539, + "step": 829, + "time_per_iteration": 2.761532783508301 + }, + { + "auxiliary_loss_clip": 0.01236654, + "auxiliary_loss_mlp": 0.01076361, + "balance_loss_clip": 1.07263458, + "balance_loss_mlp": 1.04857278, + "epoch": 0.04990229971441455, + "flos": 19473759819360.0, + "grad_norm": 4.884802668428238, + "language_loss": 0.75158048, + "learning_rate": 3.99584687405508e-06, + "loss": 0.7747106, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 1.63769531, + "router_z_loss_mlp": 0.2779541, + "step": 830, + "time_per_iteration": 2.6499128341674805 + }, + { + "auxiliary_loss_clip": 0.01235857, + "auxiliary_loss_mlp": 0.01069446, + "balance_loss_clip": 1.07351613, + "balance_loss_mlp": 1.04442418, + "epoch": 0.04996242296708252, + "flos": 22457837187360.0, + "grad_norm": 2.0263045784959814, + "language_loss": 0.79650676, + "learning_rate": 3.995821750485929e-06, + "loss": 0.81955981, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 1.625, + "router_z_loss_mlp": 0.25048828, + "step": 831, + "time_per_iteration": 2.7069523334503174 + }, + { + "auxiliary_loss_clip": 0.01239477, + "auxiliary_loss_mlp": 0.01075472, + "balance_loss_clip": 1.07591152, + "balance_loss_mlp": 1.05108142, + "epoch": 0.05002254621975049, + "flos": 21790248186240.0, + "grad_norm": 2.7529048784208014, + "language_loss": 0.91238493, + "learning_rate": 3.995796551235016e-06, + "loss": 0.93553448, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.24414062, + "step": 832, + "time_per_iteration": 2.6824703216552734 + }, + { + "auxiliary_loss_clip": 0.01230411, + "auxiliary_loss_mlp": 0.01074219, + "balance_loss_clip": 1.07326853, + "balance_loss_mlp": 1.05055642, + "epoch": 0.050082669472418455, + "flos": 55716645834720.0, + "grad_norm": 2.006302466966254, + "language_loss": 0.83275223, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85579848, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 1.57226562, + "router_z_loss_mlp": 0.23681641, + "step": 833, + "time_per_iteration": 2.9545626640319824 + }, + { + "auxiliary_loss_clip": 0.01237079, + "auxiliary_loss_mlp": 0.01057174, + "balance_loss_clip": 1.07482767, + "balance_loss_mlp": 1.03209233, + "epoch": 0.05014279272508643, + "flos": 46322676813600.0, + "grad_norm": 2.118221836300739, + "language_loss": 0.81970036, + "learning_rate": 3.995745925691733e-06, + "loss": 0.8426429, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 1.62207031, + "router_z_loss_mlp": 0.25085449, + "step": 834, + "time_per_iteration": 2.8352882862091064 + }, + { + "auxiliary_loss_clip": 0.01241537, + "auxiliary_loss_mlp": 0.0106173, + "balance_loss_clip": 1.0757097, + "balance_loss_mlp": 1.03593326, + "epoch": 0.0502029159777544, + "flos": 25619597943840.0, + "grad_norm": 2.245117388654245, + "language_loss": 0.92225331, + "learning_rate": 3.995720499401282e-06, + "loss": 0.94528604, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.25817871, + "step": 835, + "time_per_iteration": 2.675410509109497 + }, + { + "auxiliary_loss_clip": 0.01237688, + "auxiliary_loss_mlp": 0.010691, + "balance_loss_clip": 1.07164562, + "balance_loss_mlp": 1.04255247, + "epoch": 0.050263039230422364, + "flos": 19386295712640.0, + "grad_norm": 2.195102466314977, + "language_loss": 0.76098579, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78405368, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 1.66015625, + "router_z_loss_mlp": 0.26513672, + "step": 836, + "time_per_iteration": 2.6464922428131104 + }, + { + "auxiliary_loss_clip": 0.01230358, + "auxiliary_loss_mlp": 0.01066205, + "balance_loss_clip": 1.07454109, + "balance_loss_mlp": 1.04291129, + "epoch": 0.050323162483090336, + "flos": 28958961054240.0, + "grad_norm": 2.089686974652765, + "language_loss": 0.83569485, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.85866046, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.23278809, + "step": 837, + "time_per_iteration": 2.7851524353027344 + }, + { + "auxiliary_loss_clip": 0.01236032, + "auxiliary_loss_mlp": 0.01064372, + "balance_loss_clip": 1.07617342, + "balance_loss_mlp": 1.04110205, + "epoch": 0.0503832857357583, + "flos": 24728871516480.0, + "grad_norm": 2.5270943426788057, + "language_loss": 0.73423314, + "learning_rate": 3.995643766466275e-06, + "loss": 0.7572372, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 1.59960938, + "router_z_loss_mlp": 0.23266602, + "step": 838, + "time_per_iteration": 2.6646041870117188 + }, + { + "auxiliary_loss_clip": 0.01231416, + "auxiliary_loss_mlp": 0.01068415, + "balance_loss_clip": 1.07030451, + "balance_loss_mlp": 1.04372704, + "epoch": 0.05044340898842627, + "flos": 21701122871040.0, + "grad_norm": 1.7335985773202074, + "language_loss": 0.83090407, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85390234, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 1.61035156, + "router_z_loss_mlp": 0.24682617, + "step": 839, + "time_per_iteration": 2.6870315074920654 + }, + { + "auxiliary_loss_clip": 0.0122857, + "auxiliary_loss_mlp": 0.01067328, + "balance_loss_clip": 1.07146466, + "balance_loss_mlp": 1.04315233, + "epoch": 0.050503532241094246, + "flos": 28023955590240.0, + "grad_norm": 1.9735722303445842, + "language_loss": 0.85417557, + "learning_rate": 3.995592232799595e-06, + "loss": 0.87713456, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.24182129, + "step": 840, + "time_per_iteration": 2.73046612739563 + }, + { + "auxiliary_loss_clip": 0.01234939, + "auxiliary_loss_mlp": 0.01058952, + "balance_loss_clip": 1.07453227, + "balance_loss_mlp": 1.03323889, + "epoch": 0.05056365549376221, + "flos": 27793322467200.0, + "grad_norm": 2.0594820994746232, + "language_loss": 0.94351137, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96645033, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.25732422, + "step": 841, + "time_per_iteration": 2.659517526626587 + }, + { + "auxiliary_loss_clip": 0.01235251, + "auxiliary_loss_mlp": 0.01063477, + "balance_loss_clip": 1.07479942, + "balance_loss_mlp": 1.03828752, + "epoch": 0.05062377874643018, + "flos": 37726014072960.0, + "grad_norm": 2.0858968122704393, + "language_loss": 0.77848518, + "learning_rate": 3.995540396440688e-06, + "loss": 0.80147243, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 1.60449219, + "router_z_loss_mlp": 0.25158691, + "step": 842, + "time_per_iteration": 2.7686989307403564 + }, + { + "auxiliary_loss_clip": 0.0123825, + "auxiliary_loss_mlp": 0.01065101, + "balance_loss_clip": 1.07467794, + "balance_loss_mlp": 1.03987622, + "epoch": 0.05068390199909815, + "flos": 23973332201280.0, + "grad_norm": 2.368557628804456, + "language_loss": 0.78094995, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80398345, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 1.63476562, + "router_z_loss_mlp": 0.25219727, + "step": 843, + "time_per_iteration": 2.6525399684906006 + }, + { + "auxiliary_loss_clip": 0.01237935, + "auxiliary_loss_mlp": 0.01059386, + "balance_loss_clip": 1.07599807, + "balance_loss_mlp": 1.03631902, + "epoch": 0.05074402525176612, + "flos": 46322717330880.0, + "grad_norm": 1.872068722156484, + "language_loss": 0.82902676, + "learning_rate": 3.995488257397417e-06, + "loss": 0.852, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 1.61914062, + "router_z_loss_mlp": 0.23059082, + "step": 844, + "time_per_iteration": 2.860447645187378 + }, + { + "auxiliary_loss_clip": 0.01233432, + "auxiliary_loss_mlp": 0.01058436, + "balance_loss_clip": 1.0759958, + "balance_loss_mlp": 1.03529716, + "epoch": 0.05080414850443409, + "flos": 26911145185920.0, + "grad_norm": 2.1446407474306284, + "language_loss": 0.76456898, + "learning_rate": 3.995462074371614e-06, + "loss": 0.78748763, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.23132324, + "step": 845, + "time_per_iteration": 2.681673288345337 + }, + { + "auxiliary_loss_clip": 0.01231309, + "auxiliary_loss_mlp": 0.01065847, + "balance_loss_clip": 1.07204199, + "balance_loss_mlp": 1.04118216, + "epoch": 0.05086427175710206, + "flos": 24679892475360.0, + "grad_norm": 1.8441394619667257, + "language_loss": 0.87901151, + "learning_rate": 3.99543581567769e-06, + "loss": 0.90198308, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 1.59179688, + "router_z_loss_mlp": 0.24658203, + "step": 846, + "time_per_iteration": 5.635085821151733 + }, + { + "auxiliary_loss_clip": 0.01234389, + "auxiliary_loss_mlp": 0.01065702, + "balance_loss_clip": 1.07592154, + "balance_loss_mlp": 1.04211104, + "epoch": 0.05092439500977003, + "flos": 18496825320960.0, + "grad_norm": 1.7139965887476352, + "language_loss": 0.87777191, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.90077287, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.23608398, + "step": 847, + "time_per_iteration": 5.610147953033447 + }, + { + "auxiliary_loss_clip": 0.01232091, + "auxiliary_loss_mlp": 0.01062406, + "balance_loss_clip": 1.07668579, + "balance_loss_mlp": 1.0378015, + "epoch": 0.050984518262437994, + "flos": 26911752945120.0, + "grad_norm": 2.5093738837818176, + "language_loss": 0.82392406, + "learning_rate": 3.995383071289462e-06, + "loss": 0.84686911, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 1.55273438, + "router_z_loss_mlp": 0.24584961, + "step": 848, + "time_per_iteration": 2.6540520191192627 + }, + { + "auxiliary_loss_clip": 0.01235775, + "auxiliary_loss_mlp": 0.0106785, + "balance_loss_clip": 1.07828391, + "balance_loss_mlp": 1.04418707, + "epoch": 0.05104464151510597, + "flos": 36881876167200.0, + "grad_norm": 1.7320029468635585, + "language_loss": 0.87540114, + "learning_rate": 3.995356585597158e-06, + "loss": 0.89843738, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 1.57617188, + "router_z_loss_mlp": 0.23669434, + "step": 849, + "time_per_iteration": 2.756585121154785 + }, + { + "auxiliary_loss_clip": 0.01228012, + "auxiliary_loss_mlp": 0.01058657, + "balance_loss_clip": 1.07197905, + "balance_loss_mlp": 1.03524423, + "epoch": 0.05110476476777394, + "flos": 22369643769600.0, + "grad_norm": 1.8755462877501956, + "language_loss": 0.83055234, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85341895, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.23413086, + "step": 850, + "time_per_iteration": 2.638155460357666 + }, + { + "auxiliary_loss_clip": 0.01230721, + "auxiliary_loss_mlp": 0.0105591, + "balance_loss_clip": 1.07259512, + "balance_loss_mlp": 1.0326879, + "epoch": 0.051164888020441904, + "flos": 46367401540320.0, + "grad_norm": 2.2885226388611124, + "language_loss": 0.65051746, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67338383, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 1.58105469, + "router_z_loss_mlp": 0.23217773, + "step": 851, + "time_per_iteration": 2.861701488494873 + }, + { + "auxiliary_loss_clip": 0.01229886, + "auxiliary_loss_mlp": 0.01066307, + "balance_loss_clip": 1.07132328, + "balance_loss_mlp": 1.04031944, + "epoch": 0.051225011273109876, + "flos": 28112027456160.0, + "grad_norm": 2.514438930907691, + "language_loss": 0.83440745, + "learning_rate": 3.995276674539547e-06, + "loss": 0.8573693, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 1.58398438, + "router_z_loss_mlp": 0.2598877, + "step": 852, + "time_per_iteration": 2.7114439010620117 + }, + { + "auxiliary_loss_clip": 0.01231985, + "auxiliary_loss_mlp": 0.01069685, + "balance_loss_clip": 1.07245088, + "balance_loss_mlp": 1.0445441, + "epoch": 0.05128513452577785, + "flos": 22280477937120.0, + "grad_norm": 2.0493018921754445, + "language_loss": 0.80366063, + "learning_rate": 3.995249886196811e-06, + "loss": 0.82667726, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.25146484, + "step": 853, + "time_per_iteration": 2.6856634616851807 + }, + { + "auxiliary_loss_clip": 0.01227251, + "auxiliary_loss_mlp": 0.01065338, + "balance_loss_clip": 1.07026279, + "balance_loss_mlp": 1.0407691, + "epoch": 0.05134525777844581, + "flos": 33188685936480.0, + "grad_norm": 1.906375439796247, + "language_loss": 0.75891, + "learning_rate": 3.995223022193999e-06, + "loss": 0.78183579, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.24572754, + "step": 854, + "time_per_iteration": 2.691706418991089 + }, + { + "auxiliary_loss_clip": 0.01235947, + "auxiliary_loss_mlp": 0.01062783, + "balance_loss_clip": 1.07693982, + "balance_loss_mlp": 1.03770137, + "epoch": 0.051405381031113785, + "flos": 34608613387680.0, + "grad_norm": 2.061242386520503, + "language_loss": 0.81361032, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83659756, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.25085449, + "step": 855, + "time_per_iteration": 2.7123804092407227 + }, + { + "auxiliary_loss_clip": 0.01120209, + "auxiliary_loss_mlp": 0.01014095, + "balance_loss_clip": 1.04724717, + "balance_loss_mlp": 1.00907636, + "epoch": 0.05146550428378175, + "flos": 80067426730560.0, + "grad_norm": 0.9845404069754101, + "language_loss": 0.65658838, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67793143, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.0501709, + "step": 856, + "time_per_iteration": 3.2934584617614746 + }, + { + "auxiliary_loss_clip": 0.01226249, + "auxiliary_loss_mlp": 0.010547, + "balance_loss_clip": 1.07172465, + "balance_loss_mlp": 1.03073931, + "epoch": 0.05152562753644972, + "flos": 26911428806880.0, + "grad_norm": 1.8970241180509388, + "language_loss": 0.77063859, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.79344809, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.23974609, + "step": 857, + "time_per_iteration": 2.666167974472046 + }, + { + "auxiliary_loss_clip": 0.01230151, + "auxiliary_loss_mlp": 0.01055424, + "balance_loss_clip": 1.07180786, + "balance_loss_mlp": 1.03061664, + "epoch": 0.051585750789117694, + "flos": 22585731189120.0, + "grad_norm": 2.2257987517656908, + "language_loss": 0.89175689, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91461265, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.24780273, + "step": 858, + "time_per_iteration": 2.647526979446411 + }, + { + "auxiliary_loss_clip": 0.01227332, + "auxiliary_loss_mlp": 0.0105111, + "balance_loss_clip": 1.07140625, + "balance_loss_mlp": 1.02711368, + "epoch": 0.05164587404178566, + "flos": 28956124844640.0, + "grad_norm": 2.780831230493106, + "language_loss": 0.75268388, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77546829, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.23999023, + "step": 859, + "time_per_iteration": 2.6993870735168457 + }, + { + "auxiliary_loss_clip": 0.01238308, + "auxiliary_loss_mlp": 0.01072244, + "balance_loss_clip": 1.0745405, + "balance_loss_mlp": 1.0465064, + "epoch": 0.05170599729445363, + "flos": 19831476598560.0, + "grad_norm": 2.1869175820003424, + "language_loss": 0.90684652, + "learning_rate": 3.995060249372788e-06, + "loss": 0.92995203, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 1.63867188, + "router_z_loss_mlp": 0.25744629, + "step": 860, + "time_per_iteration": 2.673708438873291 + }, + { + "auxiliary_loss_clip": 0.0123335, + "auxiliary_loss_mlp": 0.01054316, + "balance_loss_clip": 1.07643533, + "balance_loss_mlp": 1.03085577, + "epoch": 0.0517661205471216, + "flos": 29268103965120.0, + "grad_norm": 2.027335943197945, + "language_loss": 0.82479495, + "learning_rate": 3.99503285577813e-06, + "loss": 0.84767163, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.23461914, + "step": 861, + "time_per_iteration": 2.676659107208252 + }, + { + "auxiliary_loss_clip": 0.01232403, + "auxiliary_loss_mlp": 0.0105733, + "balance_loss_clip": 1.07207394, + "balance_loss_mlp": 1.03344011, + "epoch": 0.05182624379978957, + "flos": 35725435002720.0, + "grad_norm": 1.6792009928911698, + "language_loss": 0.7866472, + "learning_rate": 3.995005386531627e-06, + "loss": 0.8095445, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.23901367, + "step": 862, + "time_per_iteration": 2.8489010334014893 + }, + { + "auxiliary_loss_clip": 0.0122641, + "auxiliary_loss_mlp": 0.01061358, + "balance_loss_clip": 1.07309055, + "balance_loss_mlp": 1.03851748, + "epoch": 0.05188636705245754, + "flos": 29493510359040.0, + "grad_norm": 1.8105886445054133, + "language_loss": 0.89069438, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91357207, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 1.53125, + "router_z_loss_mlp": 0.22839355, + "step": 863, + "time_per_iteration": 2.6959354877471924 + }, + { + "auxiliary_loss_clip": 0.0123384, + "auxiliary_loss_mlp": 0.01062984, + "balance_loss_clip": 1.07602167, + "balance_loss_mlp": 1.03773546, + "epoch": 0.051946490305125506, + "flos": 32653934045280.0, + "grad_norm": 2.2243919122099762, + "language_loss": 0.75746334, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78043163, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 1.57910156, + "router_z_loss_mlp": 0.25244141, + "step": 864, + "time_per_iteration": 2.7299516201019287 + }, + { + "auxiliary_loss_clip": 0.01233626, + "auxiliary_loss_mlp": 0.0106408, + "balance_loss_clip": 1.07454753, + "balance_loss_mlp": 1.03910542, + "epoch": 0.05200661355779348, + "flos": 26237883765600.0, + "grad_norm": 1.9549024554695067, + "language_loss": 0.78811157, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81108868, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.24963379, + "step": 865, + "time_per_iteration": 2.6822612285614014 + }, + { + "auxiliary_loss_clip": 0.01230347, + "auxiliary_loss_mlp": 0.01067508, + "balance_loss_clip": 1.07181168, + "balance_loss_mlp": 1.04267669, + "epoch": 0.05206673681046144, + "flos": 22103078169600.0, + "grad_norm": 2.344132133308601, + "language_loss": 0.85944593, + "learning_rate": 3.994894753048032e-06, + "loss": 0.88242441, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.24841309, + "step": 866, + "time_per_iteration": 2.6990599632263184 + }, + { + "auxiliary_loss_clip": 0.01234615, + "auxiliary_loss_mlp": 0.01063709, + "balance_loss_clip": 1.07893395, + "balance_loss_mlp": 1.03958142, + "epoch": 0.052126860063129415, + "flos": 21383349779520.0, + "grad_norm": 2.410983070975139, + "language_loss": 0.87562418, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89860749, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.24108887, + "step": 867, + "time_per_iteration": 2.7025105953216553 + }, + { + "auxiliary_loss_clip": 0.01229177, + "auxiliary_loss_mlp": 0.01069847, + "balance_loss_clip": 1.07741046, + "balance_loss_mlp": 1.04811525, + "epoch": 0.05218698331579739, + "flos": 39777152358240.0, + "grad_norm": 1.4300113528210423, + "language_loss": 0.6359458, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.65893602, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 1.51953125, + "router_z_loss_mlp": 0.21728516, + "step": 868, + "time_per_iteration": 2.8144266605377197 + }, + { + "auxiliary_loss_clip": 0.01232502, + "auxiliary_loss_mlp": 0.01070177, + "balance_loss_clip": 1.07355535, + "balance_loss_mlp": 1.0437367, + "epoch": 0.05224710656846535, + "flos": 26999865328320.0, + "grad_norm": 2.1986076141094295, + "language_loss": 0.83043122, + "learning_rate": 3.994810983642281e-06, + "loss": 0.85345805, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 1.58984375, + "router_z_loss_mlp": 0.2644043, + "step": 869, + "time_per_iteration": 2.703404664993286 + }, + { + "auxiliary_loss_clip": 0.0123407, + "auxiliary_loss_mlp": 0.01057166, + "balance_loss_clip": 1.07371008, + "balance_loss_mlp": 1.03315687, + "epoch": 0.052307229821133325, + "flos": 13598539022880.0, + "grad_norm": 2.170106255178385, + "language_loss": 0.87428427, + "learning_rate": 3.994782909218751e-06, + "loss": 0.89719665, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.24023438, + "step": 870, + "time_per_iteration": 2.644404888153076 + }, + { + "auxiliary_loss_clip": 0.01233672, + "auxiliary_loss_mlp": 0.01060147, + "balance_loss_clip": 1.07485414, + "balance_loss_mlp": 1.03647184, + "epoch": 0.05236735307380129, + "flos": 23340662745120.0, + "grad_norm": 1.9603451472545517, + "language_loss": 0.80972135, + "learning_rate": 3.994754759152854e-06, + "loss": 0.8326596, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 1.58789062, + "router_z_loss_mlp": 0.23693848, + "step": 871, + "time_per_iteration": 2.669196128845215 + }, + { + "auxiliary_loss_clip": 0.01231815, + "auxiliary_loss_mlp": 0.01065113, + "balance_loss_clip": 1.07759762, + "balance_loss_mlp": 1.04253459, + "epoch": 0.05242747632646926, + "flos": 25576169770080.0, + "grad_norm": 1.6162570496730164, + "language_loss": 0.81472754, + "learning_rate": 3.994726533445656e-06, + "loss": 0.83769679, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.22595215, + "step": 872, + "time_per_iteration": 2.701725482940674 + }, + { + "auxiliary_loss_clip": 0.01113493, + "auxiliary_loss_mlp": 0.01009623, + "balance_loss_clip": 1.04058886, + "balance_loss_mlp": 1.00481319, + "epoch": 0.052487599579137234, + "flos": 79339432815360.0, + "grad_norm": 0.9368558670961133, + "language_loss": 0.61695373, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63818491, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.04803467, + "step": 873, + "time_per_iteration": 3.224742889404297 + }, + { + "auxiliary_loss_clip": 0.01229138, + "auxiliary_loss_mlp": 0.01054194, + "balance_loss_clip": 1.07322884, + "balance_loss_mlp": 1.03084052, + "epoch": 0.0525477228318052, + "flos": 28417847950080.0, + "grad_norm": 1.8598534666138127, + "language_loss": 0.88943154, + "learning_rate": 3.994669855111643e-06, + "loss": 0.91226488, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.23352051, + "step": 874, + "time_per_iteration": 2.726720094680786 + }, + { + "auxiliary_loss_clip": 0.01230684, + "auxiliary_loss_mlp": 0.01058542, + "balance_loss_clip": 1.07193995, + "balance_loss_mlp": 1.03440213, + "epoch": 0.05260784608447317, + "flos": 39328568020800.0, + "grad_norm": 1.7301785094615167, + "language_loss": 0.74472833, + "learning_rate": 3.994641402486977e-06, + "loss": 0.76762062, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.24145508, + "step": 875, + "time_per_iteration": 2.7686917781829834 + }, + { + "auxiliary_loss_clip": 0.01227276, + "auxiliary_loss_mlp": 0.01057961, + "balance_loss_clip": 1.07182848, + "balance_loss_mlp": 1.03292668, + "epoch": 0.052667969337141136, + "flos": 29850254723520.0, + "grad_norm": 2.087056584068154, + "language_loss": 0.92779374, + "learning_rate": 3.99461287422531e-06, + "loss": 0.9506461, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 1.55664062, + "router_z_loss_mlp": 0.25048828, + "step": 876, + "time_per_iteration": 2.7374775409698486 + }, + { + "auxiliary_loss_clip": 0.01110015, + "auxiliary_loss_mlp": 0.01005227, + "balance_loss_clip": 1.03768778, + "balance_loss_mlp": 1.00033355, + "epoch": 0.05272809258980911, + "flos": 70506997607520.0, + "grad_norm": 0.8191126507682237, + "language_loss": 0.62917268, + "learning_rate": 3.994584270327722e-06, + "loss": 0.65032512, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.04888916, + "step": 877, + "time_per_iteration": 3.300004005432129 + }, + { + "auxiliary_loss_clip": 0.01228867, + "auxiliary_loss_mlp": 0.01066468, + "balance_loss_clip": 1.07154226, + "balance_loss_mlp": 1.04101706, + "epoch": 0.05278821584247708, + "flos": 21256630778880.0, + "grad_norm": 2.8556915462078134, + "language_loss": 0.85537851, + "learning_rate": 3.994555590795299e-06, + "loss": 0.87833184, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.25439453, + "step": 878, + "time_per_iteration": 2.7420012950897217 + }, + { + "auxiliary_loss_clip": 0.01229874, + "auxiliary_loss_mlp": 0.01065387, + "balance_loss_clip": 1.0714612, + "balance_loss_mlp": 1.04158044, + "epoch": 0.052848339095145046, + "flos": 31893451621920.0, + "grad_norm": 3.1645010395729996, + "language_loss": 0.82957852, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.85253119, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 1.58496094, + "router_z_loss_mlp": 0.23828125, + "step": 879, + "time_per_iteration": 2.739737033843994 + }, + { + "auxiliary_loss_clip": 0.01226767, + "auxiliary_loss_mlp": 0.01064842, + "balance_loss_clip": 1.0712862, + "balance_loss_mlp": 1.04001081, + "epoch": 0.05290846234781302, + "flos": 20098933578720.0, + "grad_norm": 3.093532731268023, + "language_loss": 0.84144592, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.864362, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 1.55566406, + "router_z_loss_mlp": 0.24816895, + "step": 880, + "time_per_iteration": 2.665121555328369 + }, + { + "auxiliary_loss_clip": 0.01232311, + "auxiliary_loss_mlp": 0.01064673, + "balance_loss_clip": 1.07412338, + "balance_loss_mlp": 1.04092622, + "epoch": 0.05296858560048098, + "flos": 24239452111200.0, + "grad_norm": 2.04741497840018, + "language_loss": 0.87052131, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89349115, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 1.58203125, + "router_z_loss_mlp": 0.23779297, + "step": 881, + "time_per_iteration": 2.648505926132202 + }, + { + "auxiliary_loss_clip": 0.01226918, + "auxiliary_loss_mlp": 0.01062489, + "balance_loss_clip": 1.07001853, + "balance_loss_mlp": 1.03700185, + "epoch": 0.053028708853148955, + "flos": 29759346648000.0, + "grad_norm": 2.5464858372622188, + "language_loss": 0.88029802, + "learning_rate": 3.994440116339046e-06, + "loss": 0.90319216, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 1.56933594, + "router_z_loss_mlp": 0.25476074, + "step": 882, + "time_per_iteration": 2.7147574424743652 + }, + { + "auxiliary_loss_clip": 0.0123206, + "auxiliary_loss_mlp": 0.01062886, + "balance_loss_clip": 1.07305098, + "balance_loss_mlp": 1.03780437, + "epoch": 0.05308883210581693, + "flos": 44408184262560.0, + "grad_norm": 2.198254836580769, + "language_loss": 0.6959601, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71890962, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 1.59082031, + "router_z_loss_mlp": 0.25061035, + "step": 883, + "time_per_iteration": 2.8229777812957764 + }, + { + "auxiliary_loss_clip": 0.01225324, + "auxiliary_loss_mlp": 0.01057691, + "balance_loss_clip": 1.07256615, + "balance_loss_mlp": 1.03443336, + "epoch": 0.05314895535848489, + "flos": 27888160718880.0, + "grad_norm": 1.8874056173723681, + "language_loss": 0.75902843, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78185856, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.23242188, + "step": 884, + "time_per_iteration": 2.7109413146972656 + }, + { + "auxiliary_loss_clip": 0.0122625, + "auxiliary_loss_mlp": 0.01060314, + "balance_loss_clip": 1.07376361, + "balance_loss_mlp": 1.0372709, + "epoch": 0.053209078611152864, + "flos": 15691565825280.0, + "grad_norm": 1.9938102557834545, + "language_loss": 0.86044145, + "learning_rate": 3.994352716384659e-06, + "loss": 0.8833071, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.23059082, + "step": 885, + "time_per_iteration": 4.160953521728516 + }, + { + "auxiliary_loss_clip": 0.01229142, + "auxiliary_loss_mlp": 0.0106364, + "balance_loss_clip": 1.07053387, + "balance_loss_mlp": 1.03938127, + "epoch": 0.05326920186382083, + "flos": 14842808949600.0, + "grad_norm": 8.398383007446796, + "language_loss": 0.85764319, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88057101, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 1.58691406, + "router_z_loss_mlp": 0.24255371, + "step": 886, + "time_per_iteration": 5.540072441101074 + }, + { + "auxiliary_loss_clip": 0.01228629, + "auxiliary_loss_mlp": 0.01065487, + "balance_loss_clip": 1.0723691, + "balance_loss_mlp": 1.0405004, + "epoch": 0.0533293251164888, + "flos": 27704804911200.0, + "grad_norm": 2.24544978653459, + "language_loss": 0.89055121, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91349232, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.25, + "step": 887, + "time_per_iteration": 4.165618896484375 + }, + { + "auxiliary_loss_clip": 0.01226917, + "auxiliary_loss_mlp": 0.01066585, + "balance_loss_clip": 1.07003391, + "balance_loss_mlp": 1.04032278, + "epoch": 0.053389448369156774, + "flos": 32874073192800.0, + "grad_norm": 1.8863334377644418, + "language_loss": 0.75256771, + "learning_rate": 3.994264635795796e-06, + "loss": 0.7755028, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 1.56835938, + "router_z_loss_mlp": 0.26245117, + "step": 888, + "time_per_iteration": 2.732959747314453 + }, + { + "auxiliary_loss_clip": 0.01230478, + "auxiliary_loss_mlp": 0.01072118, + "balance_loss_clip": 1.07458401, + "balance_loss_mlp": 1.04620206, + "epoch": 0.05344957162182474, + "flos": 31184298241920.0, + "grad_norm": 1.8436979053794682, + "language_loss": 0.88587165, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90889764, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 1.56054688, + "router_z_loss_mlp": 0.2590332, + "step": 889, + "time_per_iteration": 2.7432544231414795 + }, + { + "auxiliary_loss_clip": 0.01224049, + "auxiliary_loss_mlp": 0.01052444, + "balance_loss_clip": 1.07143772, + "balance_loss_mlp": 1.02915025, + "epoch": 0.05350969487449271, + "flos": 24061768722720.0, + "grad_norm": 1.7896905183061775, + "language_loss": 0.88688141, + "learning_rate": 3.994205537287791e-06, + "loss": 0.90964627, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.23303223, + "step": 890, + "time_per_iteration": 2.744985580444336 + }, + { + "auxiliary_loss_clip": 0.01229057, + "auxiliary_loss_mlp": 0.01071336, + "balance_loss_clip": 1.07196045, + "balance_loss_mlp": 1.04762578, + "epoch": 0.053569818127160676, + "flos": 32965872648480.0, + "grad_norm": 1.9427063932143498, + "language_loss": 0.93359077, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95659471, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 1.5703125, + "router_z_loss_mlp": 0.23706055, + "step": 891, + "time_per_iteration": 2.7631187438964844 + }, + { + "auxiliary_loss_clip": 0.01225794, + "auxiliary_loss_mlp": 0.0106289, + "balance_loss_clip": 1.07062674, + "balance_loss_mlp": 1.03672314, + "epoch": 0.05362994137982865, + "flos": 16091859915360.0, + "grad_norm": 2.9834348066219905, + "language_loss": 0.71426463, + "learning_rate": 3.994146136297893e-06, + "loss": 0.73715138, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.26171875, + "step": 892, + "time_per_iteration": 2.70951247215271 + }, + { + "auxiliary_loss_clip": 0.01227123, + "auxiliary_loss_mlp": 0.01068032, + "balance_loss_clip": 1.0711962, + "balance_loss_mlp": 1.0443449, + "epoch": 0.05369006463249662, + "flos": 34969166376480.0, + "grad_norm": 1.8747575944802157, + "language_loss": 0.82154608, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84449756, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.23681641, + "step": 893, + "time_per_iteration": 2.7580277919769287 + }, + { + "auxiliary_loss_clip": 0.01229397, + "auxiliary_loss_mlp": 0.01061418, + "balance_loss_clip": 1.06999731, + "balance_loss_mlp": 1.03740954, + "epoch": 0.053750187885164585, + "flos": 35280537737760.0, + "grad_norm": 1.9172494343832946, + "language_loss": 0.81522876, + "learning_rate": 3.994086432835114e-06, + "loss": 0.83813691, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.2401123, + "step": 894, + "time_per_iteration": 2.7628488540649414 + }, + { + "auxiliary_loss_clip": 0.01227146, + "auxiliary_loss_mlp": 0.01055556, + "balance_loss_clip": 1.0707581, + "balance_loss_mlp": 1.03134489, + "epoch": 0.05381031113783256, + "flos": 18496379630880.0, + "grad_norm": 2.2807758976281054, + "language_loss": 0.75505042, + "learning_rate": 3.994056467679221e-06, + "loss": 0.77787745, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.24230957, + "step": 895, + "time_per_iteration": 2.657341957092285 + }, + { + "auxiliary_loss_clip": 0.0123471, + "auxiliary_loss_mlp": 0.01056915, + "balance_loss_clip": 1.07390547, + "balance_loss_mlp": 1.03270388, + "epoch": 0.05387043439050053, + "flos": 26643485619360.0, + "grad_norm": 2.3955568193113064, + "language_loss": 0.86445212, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.88736832, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 1.60742188, + "router_z_loss_mlp": 0.2421875, + "step": 896, + "time_per_iteration": 2.7602357864379883 + }, + { + "auxiliary_loss_clip": 0.01231137, + "auxiliary_loss_mlp": 0.01056604, + "balance_loss_clip": 1.07094073, + "balance_loss_mlp": 1.03106904, + "epoch": 0.053930557643168495, + "flos": 21121889356800.0, + "grad_norm": 2.323341152147343, + "language_loss": 0.87777281, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.9006502, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 1.60351562, + "router_z_loss_mlp": 0.25549316, + "step": 897, + "time_per_iteration": 2.6313376426696777 + }, + { + "auxiliary_loss_clip": 0.01225741, + "auxiliary_loss_mlp": 0.01059538, + "balance_loss_clip": 1.06986713, + "balance_loss_mlp": 1.03338337, + "epoch": 0.05399068089583647, + "flos": 21168315809280.0, + "grad_norm": 1.6261067488137193, + "language_loss": 0.90365493, + "learning_rate": 3.993966118527175e-06, + "loss": 0.92650771, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.26159668, + "step": 898, + "time_per_iteration": 2.6874654293060303 + }, + { + "auxiliary_loss_clip": 0.01231067, + "auxiliary_loss_mlp": 0.0107451, + "balance_loss_clip": 1.0702126, + "balance_loss_mlp": 1.05026293, + "epoch": 0.05405080414850443, + "flos": 21337976776320.0, + "grad_norm": 3.010883265424236, + "language_loss": 0.92175829, + "learning_rate": 3.993935850918845e-06, + "loss": 0.94481403, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 1.61132812, + "router_z_loss_mlp": 0.24243164, + "step": 899, + "time_per_iteration": 2.649160385131836 + }, + { + "auxiliary_loss_clip": 0.0122469, + "auxiliary_loss_mlp": 0.01064423, + "balance_loss_clip": 1.06990802, + "balance_loss_mlp": 1.04066467, + "epoch": 0.054110927401172404, + "flos": 29891494964160.0, + "grad_norm": 2.8958497001407393, + "language_loss": 0.75865114, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.78154224, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.23754883, + "step": 900, + "time_per_iteration": 2.728673219680786 + }, + { + "auxiliary_loss_clip": 0.01227479, + "auxiliary_loss_mlp": 0.01054012, + "balance_loss_clip": 1.06863546, + "balance_loss_mlp": 1.03058743, + "epoch": 0.054171050653840376, + "flos": 27979433449920.0, + "grad_norm": 2.264721369551573, + "language_loss": 0.74192309, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76473802, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 1.58886719, + "router_z_loss_mlp": 0.234375, + "step": 901, + "time_per_iteration": 2.6601107120513916 + }, + { + "auxiliary_loss_clip": 0.01221699, + "auxiliary_loss_mlp": 0.01062171, + "balance_loss_clip": 1.07033038, + "balance_loss_mlp": 1.03842461, + "epoch": 0.05423117390650834, + "flos": 15780488554080.0, + "grad_norm": 2.165397648567261, + "language_loss": 0.84726179, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.8701005, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.23730469, + "step": 902, + "time_per_iteration": 2.6574690341949463 + }, + { + "auxiliary_loss_clip": 0.01222186, + "auxiliary_loss_mlp": 0.01059619, + "balance_loss_clip": 1.067379, + "balance_loss_mlp": 1.03525281, + "epoch": 0.05429129715917631, + "flos": 24283609596000.0, + "grad_norm": 1.8140379049663964, + "language_loss": 0.8654629, + "learning_rate": 3.993814024394569e-06, + "loss": 0.88828087, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.24353027, + "step": 903, + "time_per_iteration": 2.692800283432007 + }, + { + "auxiliary_loss_clip": 0.01226282, + "auxiliary_loss_mlp": 0.01057305, + "balance_loss_clip": 1.07115746, + "balance_loss_mlp": 1.03352308, + "epoch": 0.05435142041184428, + "flos": 20631740640480.0, + "grad_norm": 1.9948589386082238, + "language_loss": 0.75002849, + "learning_rate": 3.993783378746537e-06, + "loss": 0.7728644, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 1.55175781, + "router_z_loss_mlp": 0.23791504, + "step": 904, + "time_per_iteration": 2.707833766937256 + }, + { + "auxiliary_loss_clip": 0.01229099, + "auxiliary_loss_mlp": 0.01065211, + "balance_loss_clip": 1.07151461, + "balance_loss_mlp": 1.04136944, + "epoch": 0.05441154366451225, + "flos": 29222325789120.0, + "grad_norm": 4.562035085489466, + "language_loss": 0.86405039, + "learning_rate": 3.993752657494039e-06, + "loss": 0.88699341, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 1.57421875, + "router_z_loss_mlp": 0.23852539, + "step": 905, + "time_per_iteration": 2.6684627532958984 + }, + { + "auxiliary_loss_clip": 0.0122743, + "auxiliary_loss_mlp": 0.01069701, + "balance_loss_clip": 1.0753026, + "balance_loss_mlp": 1.04701519, + "epoch": 0.05447166691718022, + "flos": 24373869395040.0, + "grad_norm": 1.7678625012513778, + "language_loss": 0.74221539, + "learning_rate": 3.993721860638241e-06, + "loss": 0.76518679, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 1.52246094, + "router_z_loss_mlp": 0.22680664, + "step": 906, + "time_per_iteration": 2.651613235473633 + }, + { + "auxiliary_loss_clip": 0.01229034, + "auxiliary_loss_mlp": 0.01062511, + "balance_loss_clip": 1.07184815, + "balance_loss_mlp": 1.03866935, + "epoch": 0.05453179016984819, + "flos": 30427908063840.0, + "grad_norm": 2.1376925162826934, + "language_loss": 0.87465, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89756536, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 1.57324219, + "router_z_loss_mlp": 0.23852539, + "step": 907, + "time_per_iteration": 2.697657585144043 + }, + { + "auxiliary_loss_clip": 0.0122363, + "auxiliary_loss_mlp": 0.01068048, + "balance_loss_clip": 1.0697974, + "balance_loss_mlp": 1.04383683, + "epoch": 0.05459191342251616, + "flos": 22103766963360.0, + "grad_norm": 1.8349293018175166, + "language_loss": 0.8705622, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89347899, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.24230957, + "step": 908, + "time_per_iteration": 2.7212629318237305 + }, + { + "auxiliary_loss_clip": 0.01224296, + "auxiliary_loss_mlp": 0.01066601, + "balance_loss_clip": 1.07106006, + "balance_loss_mlp": 1.04169846, + "epoch": 0.054652036675184125, + "flos": 23437891516320.0, + "grad_norm": 2.3682918609239074, + "language_loss": 0.89757138, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.92048031, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.2487793, + "step": 909, + "time_per_iteration": 2.667241334915161 + }, + { + "auxiliary_loss_clip": 0.01225263, + "auxiliary_loss_mlp": 0.01072222, + "balance_loss_clip": 1.06911922, + "balance_loss_mlp": 1.04781938, + "epoch": 0.0547121599278521, + "flos": 19920399327360.0, + "grad_norm": 2.376530510286428, + "language_loss": 0.71113002, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73410475, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.24414062, + "step": 910, + "time_per_iteration": 2.658383846282959 + }, + { + "auxiliary_loss_clip": 0.01222776, + "auxiliary_loss_mlp": 0.01053057, + "balance_loss_clip": 1.06866384, + "balance_loss_mlp": 1.02997828, + "epoch": 0.05477228318052007, + "flos": 24817713210720.0, + "grad_norm": 1.6811588049928907, + "language_loss": 0.83761203, + "learning_rate": 3.993566742350714e-06, + "loss": 0.8603704, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.23071289, + "step": 911, + "time_per_iteration": 2.678471803665161 + }, + { + "auxiliary_loss_clip": 0.01222671, + "auxiliary_loss_mlp": 0.01065678, + "balance_loss_clip": 1.06683969, + "balance_loss_mlp": 1.04088235, + "epoch": 0.054832406433188034, + "flos": 26812457792640.0, + "grad_norm": 2.6845625073972057, + "language_loss": 0.76251239, + "learning_rate": 3.993535491899736e-06, + "loss": 0.78539586, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 1.55859375, + "router_z_loss_mlp": 0.24829102, + "step": 912, + "time_per_iteration": 2.6734306812286377 + }, + { + "auxiliary_loss_clip": 0.01217193, + "auxiliary_loss_mlp": 0.01052813, + "balance_loss_clip": 1.0676825, + "balance_loss_mlp": 1.02913833, + "epoch": 0.054892529685856006, + "flos": 20009686711680.0, + "grad_norm": 2.2510986048195893, + "language_loss": 0.82638025, + "learning_rate": 3.993504165853694e-06, + "loss": 0.84908032, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 1.49414062, + "router_z_loss_mlp": 0.23669434, + "step": 913, + "time_per_iteration": 2.6532788276672363 + }, + { + "auxiliary_loss_clip": 0.01224018, + "auxiliary_loss_mlp": 0.01056303, + "balance_loss_clip": 1.07269943, + "balance_loss_mlp": 1.03348601, + "epoch": 0.05495265293852397, + "flos": 29178370890720.0, + "grad_norm": 1.5561256612356877, + "language_loss": 0.83458781, + "learning_rate": 3.993472764213772e-06, + "loss": 0.85739106, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.22802734, + "step": 914, + "time_per_iteration": 2.6703438758850098 + }, + { + "auxiliary_loss_clip": 0.01223933, + "auxiliary_loss_mlp": 0.01053732, + "balance_loss_clip": 1.06969786, + "balance_loss_mlp": 1.03103483, + "epoch": 0.055012776191191944, + "flos": 28781034562080.0, + "grad_norm": 2.4703828781520936, + "language_loss": 0.90259242, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.92536902, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 1.54199219, + "router_z_loss_mlp": 0.22680664, + "step": 915, + "time_per_iteration": 2.6622543334960938 + }, + { + "auxiliary_loss_clip": 0.01223931, + "auxiliary_loss_mlp": 0.01055238, + "balance_loss_clip": 1.07184577, + "balance_loss_mlp": 1.03234982, + "epoch": 0.055072899443859916, + "flos": 21388211853120.0, + "grad_norm": 1.6552488152096188, + "language_loss": 0.89895976, + "learning_rate": 3.993409734157064e-06, + "loss": 0.9217515, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 1.52148438, + "router_z_loss_mlp": 0.22875977, + "step": 916, + "time_per_iteration": 2.6389920711517334 + }, + { + "auxiliary_loss_clip": 0.01228914, + "auxiliary_loss_mlp": 0.01064687, + "balance_loss_clip": 1.07275128, + "balance_loss_mlp": 1.0416919, + "epoch": 0.05513302269652788, + "flos": 26463006538560.0, + "grad_norm": 1.895334670317242, + "language_loss": 0.80189168, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82482773, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 1.56152344, + "router_z_loss_mlp": 0.2298584, + "step": 917, + "time_per_iteration": 2.691939115524292 + }, + { + "auxiliary_loss_clip": 0.01223034, + "auxiliary_loss_mlp": 0.01060433, + "balance_loss_clip": 1.06692076, + "balance_loss_mlp": 1.03667474, + "epoch": 0.05519314594919585, + "flos": 26372989843200.0, + "grad_norm": 1.8970832086561942, + "language_loss": 0.79516882, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.81800348, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 0.23742676, + "step": 918, + "time_per_iteration": 2.66218638420105 + }, + { + "auxiliary_loss_clip": 0.01222183, + "auxiliary_loss_mlp": 0.0105203, + "balance_loss_clip": 1.0688498, + "balance_loss_mlp": 1.02932048, + "epoch": 0.05525326920186382, + "flos": 26598760892640.0, + "grad_norm": 2.0703212698577533, + "language_loss": 0.88852471, + "learning_rate": 3.99331462214778e-06, + "loss": 0.9112668, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.22705078, + "step": 919, + "time_per_iteration": 2.6894309520721436 + }, + { + "auxiliary_loss_clip": 0.01219982, + "auxiliary_loss_mlp": 0.01062382, + "balance_loss_clip": 1.06683791, + "balance_loss_mlp": 1.03883862, + "epoch": 0.05531339245453179, + "flos": 34702641293760.0, + "grad_norm": 3.170561232059453, + "language_loss": 0.87782383, + "learning_rate": 3.993282766969699e-06, + "loss": 0.9006474, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.23535156, + "step": 920, + "time_per_iteration": 2.721057176589966 + }, + { + "auxiliary_loss_clip": 0.01222474, + "auxiliary_loss_mlp": 0.0106224, + "balance_loss_clip": 1.07156563, + "balance_loss_mlp": 1.03898263, + "epoch": 0.05537351570719976, + "flos": 45606757047840.0, + "grad_norm": 2.408883313727936, + "language_loss": 0.65981138, + "learning_rate": 3.993250836206136e-06, + "loss": 0.68265849, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.23278809, + "step": 921, + "time_per_iteration": 2.7930164337158203 + }, + { + "auxiliary_loss_clip": 0.01228963, + "auxiliary_loss_mlp": 0.01061129, + "balance_loss_clip": 1.07195616, + "balance_loss_mlp": 1.03491485, + "epoch": 0.05543363895986773, + "flos": 24507030643200.0, + "grad_norm": 1.7802418966871043, + "language_loss": 0.71778244, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74068332, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 1.56738281, + "router_z_loss_mlp": 0.26196289, + "step": 922, + "time_per_iteration": 2.6496682167053223 + }, + { + "auxiliary_loss_clip": 0.01226494, + "auxiliary_loss_mlp": 0.01064636, + "balance_loss_clip": 1.07005763, + "balance_loss_mlp": 1.04057968, + "epoch": 0.0554937622125357, + "flos": 29935936069920.0, + "grad_norm": 2.685381503013976, + "language_loss": 0.82795858, + "learning_rate": 3.993186747927408e-06, + "loss": 0.85086989, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 1.56542969, + "router_z_loss_mlp": 0.24072266, + "step": 923, + "time_per_iteration": 2.654003143310547 + }, + { + "auxiliary_loss_clip": 0.01219946, + "auxiliary_loss_mlp": 0.01062102, + "balance_loss_clip": 1.06570339, + "balance_loss_mlp": 1.03824854, + "epoch": 0.055553885465203665, + "flos": 17472816093600.0, + "grad_norm": 1.8653130104995788, + "language_loss": 0.78734636, + "learning_rate": 3.993154590414675e-06, + "loss": 0.81016684, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.23864746, + "step": 924, + "time_per_iteration": 4.045341491699219 + }, + { + "auxiliary_loss_clip": 0.01220914, + "auxiliary_loss_mlp": 0.01058324, + "balance_loss_clip": 1.06854844, + "balance_loss_mlp": 1.0341121, + "epoch": 0.05561400871787164, + "flos": 33411013017120.0, + "grad_norm": 1.9633634707746241, + "language_loss": 1.02295089, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04574323, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.24194336, + "step": 925, + "time_per_iteration": 4.118370294570923 + }, + { + "auxiliary_loss_clip": 0.01218748, + "auxiliary_loss_mlp": 0.01049446, + "balance_loss_clip": 1.06389344, + "balance_loss_mlp": 1.0264504, + "epoch": 0.05567413197053961, + "flos": 28334678675040.0, + "grad_norm": 1.9383528304080992, + "language_loss": 0.81217468, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83485657, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 1.54589844, + "router_z_loss_mlp": 0.2298584, + "step": 926, + "time_per_iteration": 4.109630346298218 + }, + { + "auxiliary_loss_clip": 0.0122902, + "auxiliary_loss_mlp": 0.01064129, + "balance_loss_clip": 1.06935203, + "balance_loss_mlp": 1.03922582, + "epoch": 0.055734255223207574, + "flos": 30828688361280.0, + "grad_norm": 2.232898248588751, + "language_loss": 0.73730731, + "learning_rate": 3.993057664397634e-06, + "loss": 0.76023877, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 1.59472656, + "router_z_loss_mlp": 0.24890137, + "step": 927, + "time_per_iteration": 2.724970579147339 + }, + { + "auxiliary_loss_clip": 0.01106984, + "auxiliary_loss_mlp": 0.01009542, + "balance_loss_clip": 1.0389204, + "balance_loss_mlp": 1.00470233, + "epoch": 0.055794378475875546, + "flos": 81146452073760.0, + "grad_norm": 0.8126272938326128, + "language_loss": 0.59903938, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.62020463, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.04833984, + "step": 928, + "time_per_iteration": 3.3748018741607666 + }, + { + "auxiliary_loss_clip": 0.01223837, + "auxiliary_loss_mlp": 0.01062784, + "balance_loss_clip": 1.06924689, + "balance_loss_mlp": 1.03914464, + "epoch": 0.05585450172854351, + "flos": 30917408503680.0, + "grad_norm": 2.2542623707419427, + "language_loss": 0.94900417, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97187042, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 1.54394531, + "router_z_loss_mlp": 0.23632812, + "step": 929, + "time_per_iteration": 2.6703977584838867 + }, + { + "auxiliary_loss_clip": 0.0122044, + "auxiliary_loss_mlp": 0.01068488, + "balance_loss_clip": 1.065992, + "balance_loss_mlp": 1.04285836, + "epoch": 0.05591462498121148, + "flos": 41380192513440.0, + "grad_norm": 2.2504997091657817, + "language_loss": 0.71809042, + "learning_rate": 3.992960058188094e-06, + "loss": 0.74097967, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.25646973, + "step": 930, + "time_per_iteration": 2.792081594467163 + }, + { + "auxiliary_loss_clip": 0.01227369, + "auxiliary_loss_mlp": 0.01063546, + "balance_loss_clip": 1.07151747, + "balance_loss_mlp": 1.03892922, + "epoch": 0.055974748233879455, + "flos": 21879130397760.0, + "grad_norm": 2.4362370573768812, + "language_loss": 0.85215837, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87506753, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 1.55957031, + "router_z_loss_mlp": 0.24621582, + "step": 931, + "time_per_iteration": 2.6774351596832275 + }, + { + "auxiliary_loss_clip": 0.01224987, + "auxiliary_loss_mlp": 0.01069718, + "balance_loss_clip": 1.06798136, + "balance_loss_mlp": 1.04527998, + "epoch": 0.05603487148654742, + "flos": 29312139898080.0, + "grad_norm": 1.7232279024411765, + "language_loss": 0.83732116, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.86026818, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 1.57128906, + "router_z_loss_mlp": 0.2442627, + "step": 932, + "time_per_iteration": 2.691572904586792 + }, + { + "auxiliary_loss_clip": 0.01224774, + "auxiliary_loss_mlp": 0.01065331, + "balance_loss_clip": 1.07007718, + "balance_loss_mlp": 1.03980827, + "epoch": 0.05609499473921539, + "flos": 21119012629920.0, + "grad_norm": 2.041276274303933, + "language_loss": 0.73402894, + "learning_rate": 3.992861771819365e-06, + "loss": 0.75692999, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.25500488, + "step": 933, + "time_per_iteration": 2.656883716583252 + }, + { + "auxiliary_loss_clip": 0.01219209, + "auxiliary_loss_mlp": 0.01070275, + "balance_loss_clip": 1.06604671, + "balance_loss_mlp": 1.04575348, + "epoch": 0.05615511799188336, + "flos": 25617977252640.0, + "grad_norm": 2.5017649717460233, + "language_loss": 0.8691951, + "learning_rate": 3.99282885855576e-06, + "loss": 0.89208996, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.24560547, + "step": 934, + "time_per_iteration": 2.6520731449127197 + }, + { + "auxiliary_loss_clip": 0.01218621, + "auxiliary_loss_mlp": 0.01068004, + "balance_loss_clip": 1.06899357, + "balance_loss_mlp": 1.04441261, + "epoch": 0.05621524124455133, + "flos": 21077043078240.0, + "grad_norm": 2.18873973554652, + "language_loss": 0.80683619, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82970244, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.23583984, + "step": 935, + "time_per_iteration": 2.6486122608184814 + }, + { + "auxiliary_loss_clip": 0.01096558, + "auxiliary_loss_mlp": 0.01008487, + "balance_loss_clip": 1.02912772, + "balance_loss_mlp": 1.00414515, + "epoch": 0.0562753644972193, + "flos": 71649257724000.0, + "grad_norm": 0.8231054512109617, + "language_loss": 0.69207352, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71312392, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 0.67529297, + "router_z_loss_mlp": 0.04348755, + "step": 936, + "time_per_iteration": 3.1672515869140625 + }, + { + "auxiliary_loss_clip": 0.01219044, + "auxiliary_loss_mlp": 0.01058909, + "balance_loss_clip": 1.06522465, + "balance_loss_mlp": 1.03555632, + "epoch": 0.05633548774988727, + "flos": 21300869298240.0, + "grad_norm": 3.007398433511199, + "language_loss": 0.75998199, + "learning_rate": 3.992729665360331e-06, + "loss": 0.78276151, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.23327637, + "step": 937, + "time_per_iteration": 2.729503870010376 + }, + { + "auxiliary_loss_clip": 0.01095711, + "auxiliary_loss_mlp": 0.01006649, + "balance_loss_clip": 1.02782011, + "balance_loss_mlp": 1.00240207, + "epoch": 0.05639561100255524, + "flos": 86213507958720.0, + "grad_norm": 0.8585793722097247, + "language_loss": 0.64345402, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66447759, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.04251099, + "step": 938, + "time_per_iteration": 3.227611541748047 + }, + { + "auxiliary_loss_clip": 0.01224284, + "auxiliary_loss_mlp": 0.01064141, + "balance_loss_clip": 1.06679678, + "balance_loss_mlp": 1.04000092, + "epoch": 0.056455734255223204, + "flos": 24996976773120.0, + "grad_norm": 2.640378826001165, + "language_loss": 0.79447138, + "learning_rate": 3.992663158738745e-06, + "loss": 0.81735575, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 1.57519531, + "router_z_loss_mlp": 0.24133301, + "step": 939, + "time_per_iteration": 2.6741178035736084 + }, + { + "auxiliary_loss_clip": 0.0121984, + "auxiliary_loss_mlp": 0.01059342, + "balance_loss_clip": 1.06748414, + "balance_loss_mlp": 1.03678727, + "epoch": 0.056515857507891176, + "flos": 26909970184800.0, + "grad_norm": 1.7799859955343322, + "language_loss": 0.7426275, + "learning_rate": 3.992629792084341e-06, + "loss": 0.7654193, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 1.5234375, + "router_z_loss_mlp": 0.22558594, + "step": 940, + "time_per_iteration": 2.7022907733917236 + }, + { + "auxiliary_loss_clip": 0.01217872, + "auxiliary_loss_mlp": 0.0106377, + "balance_loss_clip": 1.06693935, + "balance_loss_mlp": 1.03996384, + "epoch": 0.05657598076055915, + "flos": 29315908005120.0, + "grad_norm": 1.9029329600643416, + "language_loss": 0.71325421, + "learning_rate": 3.992596349869216e-06, + "loss": 0.73607063, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.23828125, + "step": 941, + "time_per_iteration": 2.678008794784546 + }, + { + "auxiliary_loss_clip": 0.01213789, + "auxiliary_loss_mlp": 0.01057175, + "balance_loss_clip": 1.0644052, + "balance_loss_mlp": 1.03341639, + "epoch": 0.05663610401322711, + "flos": 24990899181120.0, + "grad_norm": 1.8440488114205627, + "language_loss": 0.80482811, + "learning_rate": 3.992562832094637e-06, + "loss": 0.82753778, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 1.49316406, + "router_z_loss_mlp": 0.2376709, + "step": 942, + "time_per_iteration": 2.641923666000366 + }, + { + "auxiliary_loss_clip": 0.01212098, + "auxiliary_loss_mlp": 0.01060224, + "balance_loss_clip": 1.0636096, + "balance_loss_mlp": 1.03726375, + "epoch": 0.056696227265895086, + "flos": 25708520672640.0, + "grad_norm": 2.041249494120725, + "language_loss": 0.88234115, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.90506434, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 1.48242188, + "router_z_loss_mlp": 0.2298584, + "step": 943, + "time_per_iteration": 2.6932499408721924 + }, + { + "auxiliary_loss_clip": 0.01218603, + "auxiliary_loss_mlp": 0.01059179, + "balance_loss_clip": 1.06751287, + "balance_loss_mlp": 1.03670835, + "epoch": 0.05675635051856306, + "flos": 21745442424960.0, + "grad_norm": 2.4013605291466655, + "language_loss": 0.75309014, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77586794, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.22473145, + "step": 944, + "time_per_iteration": 2.623682975769043 + }, + { + "auxiliary_loss_clip": 0.0121428, + "auxiliary_loss_mlp": 0.0106129, + "balance_loss_clip": 1.06355786, + "balance_loss_mlp": 1.03922451, + "epoch": 0.05681647377123102, + "flos": 28204839843840.0, + "grad_norm": 2.67730649376376, + "language_loss": 0.79404795, + "learning_rate": 3.992461825426906e-06, + "loss": 0.81680369, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 1.50585938, + "router_z_loss_mlp": 0.22070312, + "step": 945, + "time_per_iteration": 2.6826303005218506 + }, + { + "auxiliary_loss_clip": 0.01218402, + "auxiliary_loss_mlp": 0.01059226, + "balance_loss_clip": 1.06546974, + "balance_loss_mlp": 1.03598058, + "epoch": 0.056876597023898995, + "flos": 19608946931520.0, + "grad_norm": 2.1615652623319885, + "language_loss": 0.82688469, + "learning_rate": 3.992428005427252e-06, + "loss": 0.84966099, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 1.52929688, + "router_z_loss_mlp": 0.2322998, + "step": 946, + "time_per_iteration": 2.6327455043792725 + }, + { + "auxiliary_loss_clip": 0.01219451, + "auxiliary_loss_mlp": 0.01052975, + "balance_loss_clip": 1.06570041, + "balance_loss_mlp": 1.02904963, + "epoch": 0.05693672027656696, + "flos": 20542696359840.0, + "grad_norm": 1.8081480287712814, + "language_loss": 0.78797394, + "learning_rate": 3.992394109874529e-06, + "loss": 0.81069821, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 1.53808594, + "router_z_loss_mlp": 0.23950195, + "step": 947, + "time_per_iteration": 2.633730888366699 + }, + { + "auxiliary_loss_clip": 0.01223269, + "auxiliary_loss_mlp": 0.01065035, + "balance_loss_clip": 1.06860602, + "balance_loss_mlp": 1.04214692, + "epoch": 0.05699684352923493, + "flos": 26099860443840.0, + "grad_norm": 2.7814884163545, + "language_loss": 0.85712194, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.880005, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 1.546875, + "router_z_loss_mlp": 0.22900391, + "step": 948, + "time_per_iteration": 2.6741552352905273 + }, + { + "auxiliary_loss_clip": 0.0121811, + "auxiliary_loss_mlp": 0.01068566, + "balance_loss_clip": 1.06562841, + "balance_loss_mlp": 1.04270983, + "epoch": 0.057056966781902904, + "flos": 18986285243520.0, + "grad_norm": 2.107926117755927, + "language_loss": 0.87611675, + "learning_rate": 3.992326092115019e-06, + "loss": 0.89898354, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.25866699, + "step": 949, + "time_per_iteration": 2.6894099712371826 + }, + { + "auxiliary_loss_clip": 0.01212776, + "auxiliary_loss_mlp": 0.01063553, + "balance_loss_clip": 1.06582618, + "balance_loss_mlp": 1.04226196, + "epoch": 0.05711709003457087, + "flos": 24328172253600.0, + "grad_norm": 3.130857000119996, + "language_loss": 0.79333305, + "learning_rate": 3.992291969910811e-06, + "loss": 0.81609631, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.2130127, + "step": 950, + "time_per_iteration": 2.650230884552002 + }, + { + "auxiliary_loss_clip": 0.01218981, + "auxiliary_loss_mlp": 0.01068498, + "balance_loss_clip": 1.06458628, + "balance_loss_mlp": 1.04528844, + "epoch": 0.05717721328723884, + "flos": 37013092585920.0, + "grad_norm": 2.1790990538388377, + "language_loss": 0.82470429, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84757906, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 1.54492188, + "router_z_loss_mlp": 0.23205566, + "step": 951, + "time_per_iteration": 2.6917006969451904 + }, + { + "auxiliary_loss_clip": 0.01211058, + "auxiliary_loss_mlp": 0.01055925, + "balance_loss_clip": 1.0597558, + "balance_loss_mlp": 1.03173745, + "epoch": 0.05723733653990681, + "flos": 28863798664320.0, + "grad_norm": 2.644175210750251, + "language_loss": 0.86412853, + "learning_rate": 3.992223498859958e-06, + "loss": 0.88679838, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 1.51171875, + "router_z_loss_mlp": 0.24182129, + "step": 952, + "time_per_iteration": 2.641084671020508 + }, + { + "auxiliary_loss_clip": 0.01216469, + "auxiliary_loss_mlp": 0.01059007, + "balance_loss_clip": 1.0607003, + "balance_loss_mlp": 1.0337224, + "epoch": 0.05729745979257478, + "flos": 26912441738880.0, + "grad_norm": 1.7965712846989212, + "language_loss": 0.79168719, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81444198, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 1.55761719, + "router_z_loss_mlp": 0.25317383, + "step": 953, + "time_per_iteration": 2.666898012161255 + }, + { + "auxiliary_loss_clip": 0.01217132, + "auxiliary_loss_mlp": 0.01061008, + "balance_loss_clip": 1.06555343, + "balance_loss_mlp": 1.03716636, + "epoch": 0.05735758304524275, + "flos": 23921679019680.0, + "grad_norm": 1.9969448024453644, + "language_loss": 0.86908114, + "learning_rate": 3.992154725627848e-06, + "loss": 0.89186257, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.23815918, + "step": 954, + "time_per_iteration": 2.6436972618103027 + }, + { + "auxiliary_loss_clip": 0.01218208, + "auxiliary_loss_mlp": 0.0105802, + "balance_loss_clip": 1.064466, + "balance_loss_mlp": 1.0352627, + "epoch": 0.057417706297910716, + "flos": 23304527681760.0, + "grad_norm": 3.895959882429902, + "language_loss": 0.88564909, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90841144, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 1.53710938, + "router_z_loss_mlp": 0.22741699, + "step": 955, + "time_per_iteration": 2.8272786140441895 + }, + { + "auxiliary_loss_clip": 0.01214599, + "auxiliary_loss_mlp": 0.01063439, + "balance_loss_clip": 1.06229222, + "balance_loss_mlp": 1.03980017, + "epoch": 0.05747782955057869, + "flos": 20321544280320.0, + "grad_norm": 2.2908634224881492, + "language_loss": 0.89535481, + "learning_rate": 3.992085650224914e-06, + "loss": 0.91813517, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 1.52441406, + "router_z_loss_mlp": 0.23657227, + "step": 956, + "time_per_iteration": 2.584447145462036 + }, + { + "auxiliary_loss_clip": 0.01209539, + "auxiliary_loss_mlp": 0.01055852, + "balance_loss_clip": 1.0633347, + "balance_loss_mlp": 1.03206968, + "epoch": 0.05753795280324665, + "flos": 17694940587840.0, + "grad_norm": 1.9492440826682398, + "language_loss": 0.75622171, + "learning_rate": 3.99205099921266e-06, + "loss": 0.77887559, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.23791504, + "step": 957, + "time_per_iteration": 2.6468379497528076 + }, + { + "auxiliary_loss_clip": 0.01217243, + "auxiliary_loss_mlp": 0.01072286, + "balance_loss_clip": 1.06257749, + "balance_loss_mlp": 1.04631066, + "epoch": 0.057598076055914625, + "flos": 22056651717120.0, + "grad_norm": 2.022292233896851, + "language_loss": 0.79787064, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82076591, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 1.54785156, + "router_z_loss_mlp": 0.26000977, + "step": 958, + "time_per_iteration": 2.624433994293213 + }, + { + "auxiliary_loss_clip": 0.01210509, + "auxiliary_loss_mlp": 0.01057491, + "balance_loss_clip": 1.06028509, + "balance_loss_mlp": 1.03527021, + "epoch": 0.0576581993085826, + "flos": 26996826532320.0, + "grad_norm": 10.601009482157643, + "language_loss": 0.88152492, + "learning_rate": 3.99198147057315e-06, + "loss": 0.90420496, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 1.50195312, + "router_z_loss_mlp": 0.22216797, + "step": 959, + "time_per_iteration": 2.642660140991211 + }, + { + "auxiliary_loss_clip": 0.01206642, + "auxiliary_loss_mlp": 0.01057132, + "balance_loss_clip": 1.06106591, + "balance_loss_mlp": 1.03445876, + "epoch": 0.05771832256125056, + "flos": 40489425568800.0, + "grad_norm": 2.242643755421649, + "language_loss": 0.78685462, + "learning_rate": 3.991946592948529e-06, + "loss": 0.80949235, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.2265625, + "step": 960, + "time_per_iteration": 2.7836368083953857 + }, + { + "auxiliary_loss_clip": 0.01214492, + "auxiliary_loss_mlp": 0.01058296, + "balance_loss_clip": 1.06272149, + "balance_loss_mlp": 1.03415668, + "epoch": 0.057778445813918534, + "flos": 29492740530720.0, + "grad_norm": 2.3940790150161706, + "language_loss": 0.93355691, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95628482, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.24133301, + "step": 961, + "time_per_iteration": 2.7045345306396484 + }, + { + "auxiliary_loss_clip": 0.01211901, + "auxiliary_loss_mlp": 0.01062874, + "balance_loss_clip": 1.06020045, + "balance_loss_mlp": 1.03875792, + "epoch": 0.0578385690665865, + "flos": 36171183130560.0, + "grad_norm": 2.8911854920036286, + "language_loss": 0.68016088, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70290869, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 1.51660156, + "router_z_loss_mlp": 0.24121094, + "step": 962, + "time_per_iteration": 2.737760066986084 + }, + { + "auxiliary_loss_clip": 0.01212704, + "auxiliary_loss_mlp": 0.01063805, + "balance_loss_clip": 1.06314814, + "balance_loss_mlp": 1.04144084, + "epoch": 0.05789869231925447, + "flos": 25484572900800.0, + "grad_norm": 2.0821147086051126, + "language_loss": 0.88422662, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90699172, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.22351074, + "step": 963, + "time_per_iteration": 4.0711493492126465 + }, + { + "auxiliary_loss_clip": 0.01220103, + "auxiliary_loss_mlp": 0.01057584, + "balance_loss_clip": 1.06728196, + "balance_loss_mlp": 1.03473163, + "epoch": 0.057958815571922444, + "flos": 31763612790720.0, + "grad_norm": 2.7828531166746466, + "language_loss": 0.85068321, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87346005, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 1.53027344, + "router_z_loss_mlp": 0.22851562, + "step": 964, + "time_per_iteration": 2.6806201934814453 + }, + { + "auxiliary_loss_clip": 0.01211983, + "auxiliary_loss_mlp": 0.01062471, + "balance_loss_clip": 1.06287432, + "balance_loss_mlp": 1.03877211, + "epoch": 0.05801893882459041, + "flos": 22058191373760.0, + "grad_norm": 3.0793763029160353, + "language_loss": 0.7825259, + "learning_rate": 3.99177107182976e-06, + "loss": 0.80527043, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 0.23706055, + "step": 965, + "time_per_iteration": 4.09394383430481 + }, + { + "auxiliary_loss_clip": 0.01207085, + "auxiliary_loss_mlp": 0.01065808, + "balance_loss_clip": 1.05967891, + "balance_loss_mlp": 1.04312217, + "epoch": 0.05807906207725838, + "flos": 21657005903520.0, + "grad_norm": 1.8990846979184524, + "language_loss": 0.81118912, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83391809, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.22680664, + "step": 966, + "time_per_iteration": 4.140699625015259 + }, + { + "auxiliary_loss_clip": 0.01209945, + "auxiliary_loss_mlp": 0.01055356, + "balance_loss_clip": 1.06156373, + "balance_loss_mlp": 1.03399372, + "epoch": 0.058139185329926346, + "flos": 22012737336000.0, + "grad_norm": 2.8508206497580284, + "language_loss": 0.7607975, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78345048, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.21350098, + "step": 967, + "time_per_iteration": 2.622432231903076 + }, + { + "auxiliary_loss_clip": 0.01096854, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.03127706, + "balance_loss_mlp": 1.02386105, + "epoch": 0.05819930858259432, + "flos": 76096285544160.0, + "grad_norm": 0.7947204207803475, + "language_loss": 0.57383615, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59508508, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 0.04180908, + "step": 968, + "time_per_iteration": 3.2082273960113525 + }, + { + "auxiliary_loss_clip": 0.01217289, + "auxiliary_loss_mlp": 0.01056597, + "balance_loss_clip": 1.06725883, + "balance_loss_mlp": 1.03262413, + "epoch": 0.05825943183526229, + "flos": 23348887752960.0, + "grad_norm": 1.9705320314838286, + "language_loss": 0.82195467, + "learning_rate": 3.991629295419945e-06, + "loss": 0.84469354, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.2401123, + "step": 969, + "time_per_iteration": 2.6543562412261963 + }, + { + "auxiliary_loss_clip": 0.01213561, + "auxiliary_loss_mlp": 0.01052886, + "balance_loss_clip": 1.06331789, + "balance_loss_mlp": 1.03012896, + "epoch": 0.058319555087930255, + "flos": 35413942089600.0, + "grad_norm": 2.129071475165484, + "language_loss": 0.77921098, + "learning_rate": 3.991593662507167e-06, + "loss": 0.80187541, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.22753906, + "step": 970, + "time_per_iteration": 2.7248406410217285 + }, + { + "auxiliary_loss_clip": 0.01214857, + "auxiliary_loss_mlp": 0.01066318, + "balance_loss_clip": 1.0634824, + "balance_loss_mlp": 1.04198694, + "epoch": 0.05837967834059823, + "flos": 23037273288000.0, + "grad_norm": 3.7943662237545763, + "language_loss": 0.9216305, + "learning_rate": 3.991557954072958e-06, + "loss": 0.94444221, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 1.51464844, + "router_z_loss_mlp": 0.24365234, + "step": 971, + "time_per_iteration": 2.6270885467529297 + }, + { + "auxiliary_loss_clip": 0.01207522, + "auxiliary_loss_mlp": 0.01057365, + "balance_loss_clip": 1.05861807, + "balance_loss_mlp": 1.03433418, + "epoch": 0.05843980159326619, + "flos": 31363278183360.0, + "grad_norm": 2.052310439958296, + "language_loss": 0.86374986, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88639867, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.23034668, + "step": 972, + "time_per_iteration": 2.6998472213745117 + }, + { + "auxiliary_loss_clip": 0.01210418, + "auxiliary_loss_mlp": 0.01064546, + "balance_loss_clip": 1.0641619, + "balance_loss_mlp": 1.04268265, + "epoch": 0.058499924845934165, + "flos": 31179557720160.0, + "grad_norm": 2.104067306391991, + "language_loss": 0.87757462, + "learning_rate": 3.991486310645667e-06, + "loss": 0.90032429, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.21862793, + "step": 973, + "time_per_iteration": 2.6822669506073 + }, + { + "auxiliary_loss_clip": 0.01212323, + "auxiliary_loss_mlp": 0.01059597, + "balance_loss_clip": 1.06206417, + "balance_loss_mlp": 1.03558803, + "epoch": 0.05856004809860214, + "flos": 20055059714880.0, + "grad_norm": 1.7788022791722906, + "language_loss": 0.74635297, + "learning_rate": 3.991450375655301e-06, + "loss": 0.76907218, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 1.50097656, + "router_z_loss_mlp": 0.2401123, + "step": 974, + "time_per_iteration": 2.762666702270508 + }, + { + "auxiliary_loss_clip": 0.01210668, + "auxiliary_loss_mlp": 0.01057837, + "balance_loss_clip": 1.0624404, + "balance_loss_mlp": 1.0351038, + "epoch": 0.0586201713512701, + "flos": 48149056981440.0, + "grad_norm": 1.5592020181120718, + "language_loss": 0.76687622, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78956127, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.22729492, + "step": 975, + "time_per_iteration": 2.846220016479492 + }, + { + "auxiliary_loss_clip": 0.01212837, + "auxiliary_loss_mlp": 0.01059939, + "balance_loss_clip": 1.06118965, + "balance_loss_mlp": 1.03681207, + "epoch": 0.058680294603938074, + "flos": 28510660337760.0, + "grad_norm": 1.8445716719061283, + "language_loss": 0.76836526, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79109299, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 1.51367188, + "router_z_loss_mlp": 0.23120117, + "step": 976, + "time_per_iteration": 2.671225070953369 + }, + { + "auxiliary_loss_clip": 0.01204152, + "auxiliary_loss_mlp": 0.01065995, + "balance_loss_clip": 1.05585742, + "balance_loss_mlp": 1.04351199, + "epoch": 0.05874041785660604, + "flos": 39330593884800.0, + "grad_norm": 1.8889301402207566, + "language_loss": 0.87803441, + "learning_rate": 3.991342117593679e-06, + "loss": 0.90073591, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.22473145, + "step": 977, + "time_per_iteration": 2.7389533519744873 + }, + { + "auxiliary_loss_clip": 0.01209346, + "auxiliary_loss_mlp": 0.01063375, + "balance_loss_clip": 1.06185138, + "balance_loss_mlp": 1.03965199, + "epoch": 0.05880054110927401, + "flos": 27222840685440.0, + "grad_norm": 1.6020970522916531, + "language_loss": 0.79392219, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81664944, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 1.47558594, + "router_z_loss_mlp": 0.23718262, + "step": 978, + "time_per_iteration": 2.661644458770752 + }, + { + "auxiliary_loss_clip": 0.01217261, + "auxiliary_loss_mlp": 0.01065894, + "balance_loss_clip": 1.06434941, + "balance_loss_mlp": 1.04161096, + "epoch": 0.05886066436194198, + "flos": 33410243188800.0, + "grad_norm": 1.8626391984728183, + "language_loss": 0.81109941, + "learning_rate": 3.991269567990855e-06, + "loss": 0.83393097, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.24255371, + "step": 979, + "time_per_iteration": 2.696528196334839 + }, + { + "auxiliary_loss_clip": 0.01087774, + "auxiliary_loss_mlp": 0.0100817, + "balance_loss_clip": 1.02142763, + "balance_loss_mlp": 1.00440264, + "epoch": 0.05892078761460995, + "flos": 72705917528640.0, + "grad_norm": 0.9261947942621087, + "language_loss": 0.59034741, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61130691, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 0.66357422, + "router_z_loss_mlp": 0.03759766, + "step": 980, + "time_per_iteration": 3.1901206970214844 + }, + { + "auxiliary_loss_clip": 0.01207503, + "auxiliary_loss_mlp": 0.01062534, + "balance_loss_clip": 1.06249034, + "balance_loss_mlp": 1.03906178, + "epoch": 0.05898091086727792, + "flos": 18807872544000.0, + "grad_norm": 2.4188138986006953, + "language_loss": 0.86851025, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.89121068, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.23474121, + "step": 981, + "time_per_iteration": 2.66041898727417 + }, + { + "auxiliary_loss_clip": 0.01208512, + "auxiliary_loss_mlp": 0.01052062, + "balance_loss_clip": 1.06125331, + "balance_loss_mlp": 1.03079486, + "epoch": 0.059041034119945886, + "flos": 28864892630880.0, + "grad_norm": 2.0627044369805136, + "language_loss": 0.79567659, + "learning_rate": 3.991160177271513e-06, + "loss": 0.81828237, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.21276855, + "step": 982, + "time_per_iteration": 2.6684420108795166 + }, + { + "auxiliary_loss_clip": 0.01215095, + "auxiliary_loss_mlp": 0.01056412, + "balance_loss_clip": 1.06190538, + "balance_loss_mlp": 1.03391731, + "epoch": 0.05910115737261386, + "flos": 30205297362240.0, + "grad_norm": 6.222703010969878, + "language_loss": 0.8480829, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.87079799, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 1.53222656, + "router_z_loss_mlp": 0.22497559, + "step": 983, + "time_per_iteration": 2.690295696258545 + }, + { + "auxiliary_loss_clip": 0.01207243, + "auxiliary_loss_mlp": 0.01061553, + "balance_loss_clip": 1.06179941, + "balance_loss_mlp": 1.03936779, + "epoch": 0.05916128062528183, + "flos": 14310771716160.0, + "grad_norm": 1.8444542640438792, + "language_loss": 0.84397465, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.86666262, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.22180176, + "step": 984, + "time_per_iteration": 2.635728120803833 + }, + { + "auxiliary_loss_clip": 0.01204151, + "auxiliary_loss_mlp": 0.01055761, + "balance_loss_clip": 1.06048203, + "balance_loss_mlp": 1.03413677, + "epoch": 0.059221403877949795, + "flos": 26726654894400.0, + "grad_norm": 2.369312335459147, + "language_loss": 0.77307659, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.79567569, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.21594238, + "step": 985, + "time_per_iteration": 2.652271032333374 + }, + { + "auxiliary_loss_clip": 0.01214646, + "auxiliary_loss_mlp": 0.01065176, + "balance_loss_clip": 1.06460023, + "balance_loss_mlp": 1.04223979, + "epoch": 0.05928152713061777, + "flos": 25032585111840.0, + "grad_norm": 3.0742724883577237, + "language_loss": 0.90680438, + "learning_rate": 3.991013265915661e-06, + "loss": 0.92960262, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.22949219, + "step": 986, + "time_per_iteration": 2.6332643032073975 + }, + { + "auxiliary_loss_clip": 0.01212259, + "auxiliary_loss_mlp": 0.0105999, + "balance_loss_clip": 1.05950153, + "balance_loss_mlp": 1.03580284, + "epoch": 0.05934165038328574, + "flos": 29889631169280.0, + "grad_norm": 1.9937481124815626, + "language_loss": 0.75897592, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.78169841, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 1.52636719, + "router_z_loss_mlp": 0.24169922, + "step": 987, + "time_per_iteration": 2.685492515563965 + }, + { + "auxiliary_loss_clip": 0.01214437, + "auxiliary_loss_mlp": 0.01054099, + "balance_loss_clip": 1.06061721, + "balance_loss_mlp": 1.03147292, + "epoch": 0.059401773635953704, + "flos": 47257398656640.0, + "grad_norm": 2.3671311541824562, + "language_loss": 0.71930647, + "learning_rate": 3.990939357235621e-06, + "loss": 0.74199182, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 1.5390625, + "router_z_loss_mlp": 0.22644043, + "step": 988, + "time_per_iteration": 2.7691650390625 + }, + { + "auxiliary_loss_clip": 0.0108949, + "auxiliary_loss_mlp": 0.01009762, + "balance_loss_clip": 1.0257206, + "balance_loss_mlp": 1.00590539, + "epoch": 0.059461896888621676, + "flos": 70800419813760.0, + "grad_norm": 0.935686638765388, + "language_loss": 0.71203768, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73303014, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 0.63867188, + "router_z_loss_mlp": 0.03851318, + "step": 989, + "time_per_iteration": 3.107513427734375 + }, + { + "auxiliary_loss_clip": 0.01216495, + "auxiliary_loss_mlp": 0.01064834, + "balance_loss_clip": 1.06297064, + "balance_loss_mlp": 1.04007435, + "epoch": 0.05952202014128964, + "flos": 27000392052960.0, + "grad_norm": 5.678345898488745, + "language_loss": 0.78075206, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80356532, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 1.53515625, + "router_z_loss_mlp": 0.24780273, + "step": 990, + "time_per_iteration": 2.671734571456909 + }, + { + "auxiliary_loss_clip": 0.0121067, + "auxiliary_loss_mlp": 0.01057823, + "balance_loss_clip": 1.06327605, + "balance_loss_mlp": 1.03445756, + "epoch": 0.059582143393957614, + "flos": 24948240835680.0, + "grad_norm": 2.7187277384872917, + "language_loss": 0.86510599, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88779092, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.23364258, + "step": 991, + "time_per_iteration": 2.641115188598633 + }, + { + "auxiliary_loss_clip": 0.01214141, + "auxiliary_loss_mlp": 0.01060543, + "balance_loss_clip": 1.06225979, + "balance_loss_mlp": 1.03813183, + "epoch": 0.059642266646625586, + "flos": 25129894917600.0, + "grad_norm": 1.8824122157288987, + "language_loss": 0.76623505, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.78898185, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 1.51855469, + "router_z_loss_mlp": 0.22399902, + "step": 992, + "time_per_iteration": 2.6670923233032227 + }, + { + "auxiliary_loss_clip": 0.01213383, + "auxiliary_loss_mlp": 0.01060842, + "balance_loss_clip": 1.06613588, + "balance_loss_mlp": 1.03981388, + "epoch": 0.05970238989929355, + "flos": 23614197317280.0, + "grad_norm": 2.3476750629792518, + "language_loss": 0.74848306, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.77122533, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.21044922, + "step": 993, + "time_per_iteration": 2.6220645904541016 + }, + { + "auxiliary_loss_clip": 0.01212189, + "auxiliary_loss_mlp": 0.0106886, + "balance_loss_clip": 1.06700253, + "balance_loss_mlp": 1.04579329, + "epoch": 0.05976251315196152, + "flos": 36928910378880.0, + "grad_norm": 1.9910040509835896, + "language_loss": 0.7897504, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81256092, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.23083496, + "step": 994, + "time_per_iteration": 2.7134687900543213 + }, + { + "auxiliary_loss_clip": 0.01212755, + "auxiliary_loss_mlp": 0.01080436, + "balance_loss_clip": 1.06573367, + "balance_loss_mlp": 1.057917, + "epoch": 0.05982263640462949, + "flos": 28290764293920.0, + "grad_norm": 3.3419611126134323, + "language_loss": 0.79884213, + "learning_rate": 3.99067829878596e-06, + "loss": 0.82177401, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.2253418, + "step": 995, + "time_per_iteration": 2.65392804145813 + }, + { + "auxiliary_loss_clip": 0.01207248, + "auxiliary_loss_mlp": 0.01060994, + "balance_loss_clip": 1.0607667, + "balance_loss_mlp": 1.03753364, + "epoch": 0.05988275965729746, + "flos": 33984655146720.0, + "grad_norm": 1.911876681315164, + "language_loss": 0.87175357, + "learning_rate": 3.990640702763487e-06, + "loss": 0.894436, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 1.46582031, + "router_z_loss_mlp": 0.23474121, + "step": 996, + "time_per_iteration": 2.6859259605407715 + }, + { + "auxiliary_loss_clip": 0.01215912, + "auxiliary_loss_mlp": 0.01071694, + "balance_loss_clip": 1.06690359, + "balance_loss_mlp": 1.04482388, + "epoch": 0.05994288290996543, + "flos": 30114997045920.0, + "grad_norm": 3.365060366468712, + "language_loss": 0.88135064, + "learning_rate": 3.990603031255718e-06, + "loss": 0.90422672, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.26879883, + "step": 997, + "time_per_iteration": 2.6626222133636475 + }, + { + "auxiliary_loss_clip": 0.01088268, + "auxiliary_loss_mlp": 0.01006091, + "balance_loss_clip": 1.02456439, + "balance_loss_mlp": 1.00219584, + "epoch": 0.0600030061626334, + "flos": 85327967743200.0, + "grad_norm": 1.0243320868379326, + "language_loss": 0.75430119, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77524483, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 0.63769531, + "router_z_loss_mlp": 0.03890991, + "step": 998, + "time_per_iteration": 3.3401670455932617 + }, + { + "auxiliary_loss_clip": 0.01213126, + "auxiliary_loss_mlp": 0.01061111, + "balance_loss_clip": 1.06831098, + "balance_loss_mlp": 1.03843725, + "epoch": 0.06006312941530137, + "flos": 32384937408480.0, + "grad_norm": 2.150926697411245, + "language_loss": 0.75709331, + "learning_rate": 3.990527461790013e-06, + "loss": 0.7798357, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.22680664, + "step": 999, + "time_per_iteration": 2.6901674270629883 + }, + { + "auxiliary_loss_clip": 0.012116, + "auxiliary_loss_mlp": 0.01057142, + "balance_loss_clip": 1.06286466, + "balance_loss_mlp": 1.03374112, + "epoch": 0.060123252667969335, + "flos": 33365761565760.0, + "grad_norm": 1.8785006665054824, + "language_loss": 0.82823551, + "learning_rate": 3.990489563834943e-06, + "loss": 0.85092294, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.23413086, + "step": 1000, + "time_per_iteration": 2.7388241291046143 + }, + { + "auxiliary_loss_clip": 0.01216371, + "auxiliary_loss_mlp": 0.01056415, + "balance_loss_clip": 1.0677284, + "balance_loss_mlp": 1.03458774, + "epoch": 0.06018337592063731, + "flos": 32966277821280.0, + "grad_norm": 2.468897784650073, + "language_loss": 0.86341286, + "learning_rate": 3.990451590400309e-06, + "loss": 0.8861407, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 1.48535156, + "router_z_loss_mlp": 0.21813965, + "step": 1001, + "time_per_iteration": 2.7672672271728516 + }, + { + "auxiliary_loss_clip": 0.01211265, + "auxiliary_loss_mlp": 0.01054195, + "balance_loss_clip": 1.06626582, + "balance_loss_mlp": 1.03241539, + "epoch": 0.06024349917330528, + "flos": 31228982451360.0, + "grad_norm": 3.464375699323498, + "language_loss": 0.74263537, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76529002, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.21789551, + "step": 1002, + "time_per_iteration": 2.6924285888671875 + }, + { + "auxiliary_loss_clip": 0.01212014, + "auxiliary_loss_mlp": 0.01060526, + "balance_loss_clip": 1.0652895, + "balance_loss_mlp": 1.03724408, + "epoch": 0.060303622425973244, + "flos": 31888954203840.0, + "grad_norm": 2.0511572561171088, + "language_loss": 0.7578128, + "learning_rate": 3.990375417098112e-06, + "loss": 0.78053826, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.23303223, + "step": 1003, + "time_per_iteration": 4.242711544036865 + }, + { + "auxiliary_loss_clip": 0.01218598, + "auxiliary_loss_mlp": 0.01059039, + "balance_loss_clip": 1.06842864, + "balance_loss_mlp": 1.03631735, + "epoch": 0.060363745678641216, + "flos": 24864139663200.0, + "grad_norm": 6.303652736080117, + "language_loss": 0.6983192, + "learning_rate": 3.990337217233437e-06, + "loss": 0.72109562, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 1.50292969, + "router_z_loss_mlp": 0.22717285, + "step": 1004, + "time_per_iteration": 5.5661396980285645 + }, + { + "auxiliary_loss_clip": 0.01219277, + "auxiliary_loss_mlp": 0.010677, + "balance_loss_clip": 1.07012677, + "balance_loss_mlp": 1.0446924, + "epoch": 0.06042386893130918, + "flos": 21657330041760.0, + "grad_norm": 2.3932314820999845, + "language_loss": 0.83999449, + "learning_rate": 3.990298941894976e-06, + "loss": 0.86286426, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 1.49023438, + "router_z_loss_mlp": 0.23022461, + "step": 1005, + "time_per_iteration": 4.114614963531494 + }, + { + "auxiliary_loss_clip": 0.01085134, + "auxiliary_loss_mlp": 0.01005377, + "balance_loss_clip": 1.02172804, + "balance_loss_mlp": 1.00149047, + "epoch": 0.06048399218397715, + "flos": 83630899681920.0, + "grad_norm": 0.9071610839737785, + "language_loss": 0.5904637, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61136878, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 0.63378906, + "router_z_loss_mlp": 0.03881836, + "step": 1006, + "time_per_iteration": 3.3712029457092285 + }, + { + "auxiliary_loss_clip": 0.01213613, + "auxiliary_loss_mlp": 0.01052702, + "balance_loss_clip": 1.06513286, + "balance_loss_mlp": 1.0302546, + "epoch": 0.060544115436645125, + "flos": 28380254264640.0, + "grad_norm": 1.846902340933698, + "language_loss": 0.74923265, + "learning_rate": 3.990222164802503e-06, + "loss": 0.77189577, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 1.484375, + "router_z_loss_mlp": 0.22460938, + "step": 1007, + "time_per_iteration": 2.667793035507202 + }, + { + "auxiliary_loss_clip": 0.01214349, + "auxiliary_loss_mlp": 0.01051832, + "balance_loss_clip": 1.06664741, + "balance_loss_mlp": 1.02956355, + "epoch": 0.06060423868931309, + "flos": 29136765994560.0, + "grad_norm": 1.8769864607590927, + "language_loss": 0.80575126, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.82841313, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 1.47460938, + "router_z_loss_mlp": 0.22265625, + "step": 1008, + "time_per_iteration": 2.6824183464050293 + }, + { + "auxiliary_loss_clip": 0.0121315, + "auxiliary_loss_mlp": 0.01053961, + "balance_loss_clip": 1.06757069, + "balance_loss_mlp": 1.03151357, + "epoch": 0.06066436194198106, + "flos": 22852418340960.0, + "grad_norm": 1.8245846064509745, + "language_loss": 0.77691674, + "learning_rate": 3.990145085832335e-06, + "loss": 0.79958785, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 1.45605469, + "router_z_loss_mlp": 0.22436523, + "step": 1009, + "time_per_iteration": 2.6231725215911865 + }, + { + "auxiliary_loss_clip": 0.01213852, + "auxiliary_loss_mlp": 0.0105443, + "balance_loss_clip": 1.0687511, + "balance_loss_mlp": 1.03207839, + "epoch": 0.06072448519464903, + "flos": 30067030936800.0, + "grad_norm": 1.8685040693357848, + "language_loss": 0.93104601, + "learning_rate": 3.990106433146769e-06, + "loss": 0.95372885, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 1.45117188, + "router_z_loss_mlp": 0.22363281, + "step": 1010, + "time_per_iteration": 2.6930408477783203 + }, + { + "auxiliary_loss_clip": 0.01220726, + "auxiliary_loss_mlp": 0.01061878, + "balance_loss_clip": 1.06644857, + "balance_loss_mlp": 1.03654575, + "epoch": 0.060784608447317, + "flos": 21205909494720.0, + "grad_norm": 3.337286344175029, + "language_loss": 0.71907699, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.74190295, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 1.54296875, + "router_z_loss_mlp": 0.25341797, + "step": 1011, + "time_per_iteration": 2.653823137283325 + }, + { + "auxiliary_loss_clip": 0.0121816, + "auxiliary_loss_mlp": 0.01063003, + "balance_loss_clip": 1.06844592, + "balance_loss_mlp": 1.03856564, + "epoch": 0.06084473169998497, + "flos": 28909860461280.0, + "grad_norm": 1.9128063870987007, + "language_loss": 0.87536949, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89818114, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 1.49707031, + "router_z_loss_mlp": 0.24462891, + "step": 1012, + "time_per_iteration": 2.7115707397460938 + }, + { + "auxiliary_loss_clip": 0.01210857, + "auxiliary_loss_mlp": 0.01060115, + "balance_loss_clip": 1.06380773, + "balance_loss_mlp": 1.0375843, + "epoch": 0.06090485495265294, + "flos": 28731852934560.0, + "grad_norm": 2.1677803443087984, + "language_loss": 0.77475643, + "learning_rate": 3.989990022305734e-06, + "loss": 0.79746616, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.2253418, + "step": 1013, + "time_per_iteration": 2.7074568271636963 + }, + { + "auxiliary_loss_clip": 0.01217914, + "auxiliary_loss_mlp": 0.0105944, + "balance_loss_clip": 1.06691813, + "balance_loss_mlp": 1.035074, + "epoch": 0.06096497820532091, + "flos": 24818037348960.0, + "grad_norm": 2.9416563427768536, + "language_loss": 0.86116141, + "learning_rate": 3.98995106776885e-06, + "loss": 0.88393492, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 1.50976562, + "router_z_loss_mlp": 0.24377441, + "step": 1014, + "time_per_iteration": 2.652062177658081 + }, + { + "auxiliary_loss_clip": 0.0122107, + "auxiliary_loss_mlp": 0.01067384, + "balance_loss_clip": 1.06834316, + "balance_loss_mlp": 1.04258871, + "epoch": 0.061025101457988874, + "flos": 32871925776960.0, + "grad_norm": 2.0585167562474602, + "language_loss": 0.73857307, + "learning_rate": 3.98991203777282e-06, + "loss": 0.76145762, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 1.52832031, + "router_z_loss_mlp": 0.24804688, + "step": 1015, + "time_per_iteration": 2.69447660446167 + }, + { + "auxiliary_loss_clip": 0.01207483, + "auxiliary_loss_mlp": 0.01056531, + "balance_loss_clip": 1.06386447, + "balance_loss_mlp": 1.03459692, + "epoch": 0.061085224710656846, + "flos": 30962984093280.0, + "grad_norm": 1.7655728544721092, + "language_loss": 0.79249191, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.81513202, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.21936035, + "step": 1016, + "time_per_iteration": 2.685985565185547 + }, + { + "auxiliary_loss_clip": 0.01215179, + "auxiliary_loss_mlp": 0.01057018, + "balance_loss_clip": 1.06716061, + "balance_loss_mlp": 1.03500044, + "epoch": 0.06114534796332482, + "flos": 30291748536960.0, + "grad_norm": 1.6279595849702064, + "language_loss": 0.75900173, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78172368, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 1.48144531, + "router_z_loss_mlp": 0.22021484, + "step": 1017, + "time_per_iteration": 2.6851425170898438 + }, + { + "auxiliary_loss_clip": 0.01221034, + "auxiliary_loss_mlp": 0.01069017, + "balance_loss_clip": 1.06826341, + "balance_loss_mlp": 1.0455327, + "epoch": 0.061205471215992784, + "flos": 25174498092480.0, + "grad_norm": 2.168245994609695, + "language_loss": 0.85752738, + "learning_rate": 3.989794495044685e-06, + "loss": 0.88042784, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 1.52734375, + "router_z_loss_mlp": 0.23486328, + "step": 1018, + "time_per_iteration": 2.702202320098877 + }, + { + "auxiliary_loss_clip": 0.01212407, + "auxiliary_loss_mlp": 0.01072244, + "balance_loss_clip": 1.06613827, + "balance_loss_mlp": 1.04855704, + "epoch": 0.061265594468660756, + "flos": 20143212615360.0, + "grad_norm": 2.5161217118451282, + "language_loss": 0.76968265, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79252917, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.23681641, + "step": 1019, + "time_per_iteration": 2.724871873855591 + }, + { + "auxiliary_loss_clip": 0.01211505, + "auxiliary_loss_mlp": 0.01059204, + "balance_loss_clip": 1.06276226, + "balance_loss_mlp": 1.03568411, + "epoch": 0.06132571772132872, + "flos": 32026450800960.0, + "grad_norm": 2.385478432456181, + "language_loss": 0.84190679, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86461389, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 1.48730469, + "router_z_loss_mlp": 0.23535156, + "step": 1020, + "time_per_iteration": 2.7037782669067383 + }, + { + "auxiliary_loss_clip": 0.01216896, + "auxiliary_loss_mlp": 0.01057019, + "balance_loss_clip": 1.06796718, + "balance_loss_mlp": 1.03417814, + "epoch": 0.06138584097399669, + "flos": 45654844708800.0, + "grad_norm": 1.8865399083647267, + "language_loss": 0.79433066, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81706977, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.22827148, + "step": 1021, + "time_per_iteration": 2.8196094036102295 + }, + { + "auxiliary_loss_clip": 0.01211369, + "auxiliary_loss_mlp": 0.01058825, + "balance_loss_clip": 1.06590569, + "balance_loss_mlp": 1.03901207, + "epoch": 0.061445964226664665, + "flos": 21256468709760.0, + "grad_norm": 2.8229689572709638, + "language_loss": 0.875633, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.89833492, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.19812012, + "step": 1022, + "time_per_iteration": 2.6301867961883545 + }, + { + "auxiliary_loss_clip": 0.01214865, + "auxiliary_loss_mlp": 0.01058581, + "balance_loss_clip": 1.06807017, + "balance_loss_mlp": 1.03693318, + "epoch": 0.06150608747933263, + "flos": 27576870392160.0, + "grad_norm": 1.8639174833628531, + "language_loss": 0.83207369, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85480821, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.21643066, + "step": 1023, + "time_per_iteration": 2.6812973022460938 + }, + { + "auxiliary_loss_clip": 0.01093134, + "auxiliary_loss_mlp": 0.01011884, + "balance_loss_clip": 1.02988875, + "balance_loss_mlp": 1.00802422, + "epoch": 0.0615662107320006, + "flos": 69236391448800.0, + "grad_norm": 0.8909294524734522, + "language_loss": 0.65060842, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67165864, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 0.0385437, + "step": 1024, + "time_per_iteration": 3.3442494869232178 + }, + { + "auxiliary_loss_clip": 0.01214078, + "auxiliary_loss_mlp": 0.01064195, + "balance_loss_clip": 1.06690538, + "balance_loss_mlp": 1.04097271, + "epoch": 0.06162633398466857, + "flos": 27533766356640.0, + "grad_norm": 1.8400214998649396, + "language_loss": 0.8825826, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90536535, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.2322998, + "step": 1025, + "time_per_iteration": 2.6960723400115967 + }, + { + "auxiliary_loss_clip": 0.0121412, + "auxiliary_loss_mlp": 0.01057777, + "balance_loss_clip": 1.06676626, + "balance_loss_mlp": 1.03609252, + "epoch": 0.06168645723733654, + "flos": 31229144520480.0, + "grad_norm": 1.502388295860123, + "language_loss": 0.84538144, + "learning_rate": 3.989477727938335e-06, + "loss": 0.8681004, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 1.47167969, + "router_z_loss_mlp": 0.21691895, + "step": 1026, + "time_per_iteration": 2.7316172122955322 + }, + { + "auxiliary_loss_clip": 0.01212039, + "auxiliary_loss_mlp": 0.01061574, + "balance_loss_clip": 1.06333566, + "balance_loss_mlp": 1.03899622, + "epoch": 0.06174658049000451, + "flos": 19520307823680.0, + "grad_norm": 1.828680059792196, + "language_loss": 0.82098961, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84372574, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.22570801, + "step": 1027, + "time_per_iteration": 2.6780455112457275 + }, + { + "auxiliary_loss_clip": 0.01207817, + "auxiliary_loss_mlp": 0.01052636, + "balance_loss_clip": 1.06223214, + "balance_loss_mlp": 1.03120232, + "epoch": 0.06180670374267248, + "flos": 13770914647680.0, + "grad_norm": 2.18825485085231, + "language_loss": 0.83961916, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86222363, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.21435547, + "step": 1028, + "time_per_iteration": 2.6340508460998535 + }, + { + "auxiliary_loss_clip": 0.01085121, + "auxiliary_loss_mlp": 0.01010468, + "balance_loss_clip": 1.02297008, + "balance_loss_mlp": 1.00686812, + "epoch": 0.06186682699534045, + "flos": 79473161305440.0, + "grad_norm": 0.9374458273549718, + "language_loss": 0.60516214, + "learning_rate": 3.989357695452323e-06, + "loss": 0.626118, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 0.62207031, + "router_z_loss_mlp": 0.03598022, + "step": 1029, + "time_per_iteration": 3.0538463592529297 + }, + { + "auxiliary_loss_clip": 0.01204035, + "auxiliary_loss_mlp": 0.01057443, + "balance_loss_clip": 1.05962849, + "balance_loss_mlp": 1.0357945, + "epoch": 0.061926950248008414, + "flos": 25753285916640.0, + "grad_norm": 1.98221123582432, + "language_loss": 0.82784206, + "learning_rate": 3.98931753374834e-06, + "loss": 0.85045683, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.21630859, + "step": 1030, + "time_per_iteration": 2.6637611389160156 + }, + { + "auxiliary_loss_clip": 0.012142, + "auxiliary_loss_mlp": 0.01063667, + "balance_loss_clip": 1.0653882, + "balance_loss_mlp": 1.04094553, + "epoch": 0.061987073500676386, + "flos": 21656519696160.0, + "grad_norm": 2.8737903384570496, + "language_loss": 0.79713613, + "learning_rate": 3.989277296609237e-06, + "loss": 0.81991476, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.22729492, + "step": 1031, + "time_per_iteration": 2.6303000450134277 + }, + { + "auxiliary_loss_clip": 0.01209237, + "auxiliary_loss_mlp": 0.01066769, + "balance_loss_clip": 1.06180334, + "balance_loss_mlp": 1.04312992, + "epoch": 0.06204719675334436, + "flos": 26644620103200.0, + "grad_norm": 1.4987407216961028, + "language_loss": 0.77178633, + "learning_rate": 3.98923698403654e-06, + "loss": 0.79454637, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.23669434, + "step": 1032, + "time_per_iteration": 2.6657817363739014 + }, + { + "auxiliary_loss_clip": 0.01207525, + "auxiliary_loss_mlp": 0.01065445, + "balance_loss_clip": 1.05970967, + "balance_loss_mlp": 1.04254508, + "epoch": 0.06210732000601232, + "flos": 23615372318400.0, + "grad_norm": 2.2786765925241674, + "language_loss": 0.89634138, + "learning_rate": 3.989196596031776e-06, + "loss": 0.91907108, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.22912598, + "step": 1033, + "time_per_iteration": 2.675481081008911 + }, + { + "auxiliary_loss_clip": 0.01208817, + "auxiliary_loss_mlp": 0.01054599, + "balance_loss_clip": 1.06096423, + "balance_loss_mlp": 1.03366566, + "epoch": 0.062167443258680295, + "flos": 30199989598560.0, + "grad_norm": 2.3244576501451313, + "language_loss": 0.84984076, + "learning_rate": 3.989156132596479e-06, + "loss": 0.87247497, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 1.47949219, + "router_z_loss_mlp": 0.20935059, + "step": 1034, + "time_per_iteration": 2.6710915565490723 + }, + { + "auxiliary_loss_clip": 0.01202219, + "auxiliary_loss_mlp": 0.01058186, + "balance_loss_clip": 1.06225216, + "balance_loss_mlp": 1.03530955, + "epoch": 0.06222756651134827, + "flos": 42047416859040.0, + "grad_norm": 1.8885310031475715, + "language_loss": 0.81114566, + "learning_rate": 3.989115593732182e-06, + "loss": 0.83374971, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.22875977, + "step": 1035, + "time_per_iteration": 2.8282299041748047 + }, + { + "auxiliary_loss_clip": 0.01209593, + "auxiliary_loss_mlp": 0.01059698, + "balance_loss_clip": 1.06337965, + "balance_loss_mlp": 1.03522468, + "epoch": 0.06228768976401623, + "flos": 31318634491200.0, + "grad_norm": 2.4926926707296015, + "language_loss": 0.78361589, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80630875, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.24475098, + "step": 1036, + "time_per_iteration": 2.6858036518096924 + }, + { + "auxiliary_loss_clip": 0.01205512, + "auxiliary_loss_mlp": 0.01063471, + "balance_loss_clip": 1.06254315, + "balance_loss_mlp": 1.04073834, + "epoch": 0.062347813016684205, + "flos": 30866646702240.0, + "grad_norm": 1.6748389044882852, + "language_loss": 0.86687034, + "learning_rate": 3.989034289722739e-06, + "loss": 0.88956022, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.22753906, + "step": 1037, + "time_per_iteration": 2.669933319091797 + }, + { + "auxiliary_loss_clip": 0.01204759, + "auxiliary_loss_mlp": 0.01048911, + "balance_loss_clip": 1.06068885, + "balance_loss_mlp": 1.02549803, + "epoch": 0.06240793626935217, + "flos": 32832468296640.0, + "grad_norm": 2.1925663871945864, + "language_loss": 0.81002939, + "learning_rate": 3.988993524580676e-06, + "loss": 0.83256608, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.23413086, + "step": 1038, + "time_per_iteration": 2.8147003650665283 + }, + { + "auxiliary_loss_clip": 0.01205533, + "auxiliary_loss_mlp": 0.01064988, + "balance_loss_clip": 1.06398129, + "balance_loss_mlp": 1.04158711, + "epoch": 0.06246805952202014, + "flos": 26375745018240.0, + "grad_norm": 1.8617788070920192, + "language_loss": 0.85617268, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87887788, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 1.41601562, + "router_z_loss_mlp": 0.23376465, + "step": 1039, + "time_per_iteration": 2.734415292739868 + }, + { + "auxiliary_loss_clip": 0.01207763, + "auxiliary_loss_mlp": 0.01066311, + "balance_loss_clip": 1.06332648, + "balance_loss_mlp": 1.04326773, + "epoch": 0.0625281827746881, + "flos": 23564853620640.0, + "grad_norm": 1.8862068637478084, + "language_loss": 0.80774027, + "learning_rate": 3.9889117680296e-06, + "loss": 0.83048099, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 1.4453125, + "router_z_loss_mlp": 0.23046875, + "step": 1040, + "time_per_iteration": 2.6424155235290527 + }, + { + "auxiliary_loss_clip": 0.01211817, + "auxiliary_loss_mlp": 0.01059848, + "balance_loss_clip": 1.06694329, + "balance_loss_mlp": 1.03719854, + "epoch": 0.06258830602735609, + "flos": 33855991316640.0, + "grad_norm": 2.422279481614191, + "language_loss": 0.69374669, + "learning_rate": 3.988870776623685e-06, + "loss": 0.71646333, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.22644043, + "step": 1041, + "time_per_iteration": 2.6910560131073 + }, + { + "auxiliary_loss_clip": 0.01209422, + "auxiliary_loss_mlp": 0.01053407, + "balance_loss_clip": 1.06113279, + "balance_loss_mlp": 1.02997017, + "epoch": 0.06264842928002405, + "flos": 28336380400800.0, + "grad_norm": 2.1144459142080345, + "language_loss": 0.81703097, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83965921, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.23449707, + "step": 1042, + "time_per_iteration": 4.204681873321533 + }, + { + "auxiliary_loss_clip": 0.01207854, + "auxiliary_loss_mlp": 0.01047666, + "balance_loss_clip": 1.06257081, + "balance_loss_mlp": 1.0263989, + "epoch": 0.06270855253269202, + "flos": 46856496807360.0, + "grad_norm": 1.6865158523012425, + "language_loss": 0.76202595, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78458118, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.21252441, + "step": 1043, + "time_per_iteration": 4.267367601394653 + }, + { + "auxiliary_loss_clip": 0.01203917, + "auxiliary_loss_mlp": 0.0105938, + "balance_loss_clip": 1.06404853, + "balance_loss_mlp": 1.03862596, + "epoch": 0.06276867578535998, + "flos": 27398174071680.0, + "grad_norm": 1.9598151465348446, + "language_loss": 0.92266512, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94529808, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.2076416, + "step": 1044, + "time_per_iteration": 5.561589479446411 + }, + { + "auxiliary_loss_clip": 0.01206182, + "auxiliary_loss_mlp": 0.01068228, + "balance_loss_clip": 1.061571, + "balance_loss_mlp": 1.0460434, + "epoch": 0.06282879903802796, + "flos": 27979879140000.0, + "grad_norm": 1.9656213916045575, + "language_loss": 0.8603605, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88310456, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 1.44726562, + "router_z_loss_mlp": 0.2220459, + "step": 1045, + "time_per_iteration": 2.7709250450134277 + }, + { + "auxiliary_loss_clip": 0.01203809, + "auxiliary_loss_mlp": 0.01062516, + "balance_loss_clip": 1.06203008, + "balance_loss_mlp": 1.04089129, + "epoch": 0.06288892229069593, + "flos": 42488019292320.0, + "grad_norm": 1.936884834877687, + "language_loss": 0.78479743, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.80746067, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.21618652, + "step": 1046, + "time_per_iteration": 2.773391008377075 + }, + { + "auxiliary_loss_clip": 0.01203105, + "auxiliary_loss_mlp": 0.0106244, + "balance_loss_clip": 1.0618124, + "balance_loss_mlp": 1.04119647, + "epoch": 0.06294904554336389, + "flos": 23704943323680.0, + "grad_norm": 2.2831907165702865, + "language_loss": 0.77253747, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79519296, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 1.41503906, + "router_z_loss_mlp": 0.21240234, + "step": 1047, + "time_per_iteration": 2.63908052444458 + }, + { + "auxiliary_loss_clip": 0.01212539, + "auxiliary_loss_mlp": 0.01056118, + "balance_loss_clip": 1.06373858, + "balance_loss_mlp": 1.03408825, + "epoch": 0.06300916879603187, + "flos": 49617193645440.0, + "grad_norm": 2.151805500911245, + "language_loss": 0.77626026, + "learning_rate": 3.988581725160672e-06, + "loss": 0.7989468, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 1.48925781, + "router_z_loss_mlp": 0.22033691, + "step": 1048, + "time_per_iteration": 2.833899736404419 + }, + { + "auxiliary_loss_clip": 0.01205792, + "auxiliary_loss_mlp": 0.01062072, + "balance_loss_clip": 1.0603888, + "balance_loss_mlp": 1.03980386, + "epoch": 0.06306929204869983, + "flos": 29046222574560.0, + "grad_norm": 3.8313145371441384, + "language_loss": 0.77519822, + "learning_rate": 3.988540130453087e-06, + "loss": 0.79787683, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.22265625, + "step": 1049, + "time_per_iteration": 2.689478874206543 + }, + { + "auxiliary_loss_clip": 0.01206008, + "auxiliary_loss_mlp": 0.01055689, + "balance_loss_clip": 1.06032026, + "balance_loss_mlp": 1.03355193, + "epoch": 0.0631294153013678, + "flos": 23081025600000.0, + "grad_norm": 2.0020937614317313, + "language_loss": 0.82910877, + "learning_rate": 3.988498460339862e-06, + "loss": 0.8517257, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.22131348, + "step": 1050, + "time_per_iteration": 2.6694369316101074 + }, + { + "auxiliary_loss_clip": 0.01207693, + "auxiliary_loss_mlp": 0.01062115, + "balance_loss_clip": 1.06466401, + "balance_loss_mlp": 1.04066956, + "epoch": 0.06318953855403578, + "flos": 29626955228160.0, + "grad_norm": 1.8029269835907018, + "language_loss": 0.76770496, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79040301, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 1.43261719, + "router_z_loss_mlp": 0.21447754, + "step": 1051, + "time_per_iteration": 2.655285358428955 + }, + { + "auxiliary_loss_clip": 0.01207022, + "auxiliary_loss_mlp": 0.01068954, + "balance_loss_clip": 1.06185246, + "balance_loss_mlp": 1.04637599, + "epoch": 0.06324966180670374, + "flos": 27489933010080.0, + "grad_norm": 1.9631014537487566, + "language_loss": 0.80129373, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82405353, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.22583008, + "step": 1052, + "time_per_iteration": 2.653707265853882 + }, + { + "auxiliary_loss_clip": 0.01206839, + "auxiliary_loss_mlp": 0.01057543, + "balance_loss_clip": 1.06085491, + "balance_loss_mlp": 1.03584707, + "epoch": 0.06330978505937171, + "flos": 19252162049760.0, + "grad_norm": 2.281623288373757, + "language_loss": 0.78148991, + "learning_rate": 3.988372997582155e-06, + "loss": 0.80413371, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 1.45996094, + "router_z_loss_mlp": 0.21716309, + "step": 1053, + "time_per_iteration": 2.636154890060425 + }, + { + "auxiliary_loss_clip": 0.01205442, + "auxiliary_loss_mlp": 0.01052612, + "balance_loss_clip": 1.06022286, + "balance_loss_mlp": 1.03064203, + "epoch": 0.06336990831203967, + "flos": 26199074561760.0, + "grad_norm": 1.8426566792048447, + "language_loss": 0.84761167, + "learning_rate": 3.988331025862195e-06, + "loss": 0.87019223, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.21960449, + "step": 1054, + "time_per_iteration": 2.6943633556365967 + }, + { + "auxiliary_loss_clip": 0.0120437, + "auxiliary_loss_mlp": 0.01061067, + "balance_loss_clip": 1.06237102, + "balance_loss_mlp": 1.03908455, + "epoch": 0.06343003156470765, + "flos": 22547772848160.0, + "grad_norm": 2.5328486234433045, + "language_loss": 0.85633439, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.87898874, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 1.41894531, + "router_z_loss_mlp": 0.21972656, + "step": 1055, + "time_per_iteration": 2.6346514225006104 + }, + { + "auxiliary_loss_clip": 0.01208836, + "auxiliary_loss_mlp": 0.01064788, + "balance_loss_clip": 1.06022036, + "balance_loss_mlp": 1.04223394, + "epoch": 0.06349015481737562, + "flos": 30694433146560.0, + "grad_norm": 2.457127476985612, + "language_loss": 0.81054538, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83328164, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.22546387, + "step": 1056, + "time_per_iteration": 2.706029176712036 + }, + { + "auxiliary_loss_clip": 0.01209703, + "auxiliary_loss_mlp": 0.01057357, + "balance_loss_clip": 1.05801702, + "balance_loss_mlp": 1.03345537, + "epoch": 0.06355027807004358, + "flos": 32789323743840.0, + "grad_norm": 2.3096581328945978, + "language_loss": 0.81311983, + "learning_rate": 3.988204658322426e-06, + "loss": 0.8357904, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 1.51757812, + "router_z_loss_mlp": 0.2388916, + "step": 1057, + "time_per_iteration": 2.7014009952545166 + }, + { + "auxiliary_loss_clip": 0.01196935, + "auxiliary_loss_mlp": 0.01064586, + "balance_loss_clip": 1.05777395, + "balance_loss_mlp": 1.04460621, + "epoch": 0.06361040132271156, + "flos": 26108409589920.0, + "grad_norm": 1.6787091053190555, + "language_loss": 0.83574444, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85835963, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.19970703, + "step": 1058, + "time_per_iteration": 2.646103858947754 + }, + { + "auxiliary_loss_clip": 0.0120399, + "auxiliary_loss_mlp": 0.01057027, + "balance_loss_clip": 1.05801928, + "balance_loss_mlp": 1.03294683, + "epoch": 0.06367052457537953, + "flos": 31401277041600.0, + "grad_norm": 1.9405901208167773, + "language_loss": 0.87329173, + "learning_rate": 3.988120036328651e-06, + "loss": 0.89590192, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.24084473, + "step": 1059, + "time_per_iteration": 2.6974077224731445 + }, + { + "auxiliary_loss_clip": 0.01212104, + "auxiliary_loss_mlp": 0.01059279, + "balance_loss_clip": 1.06374466, + "balance_loss_mlp": 1.03635502, + "epoch": 0.0637306478280475, + "flos": 21514404129120.0, + "grad_norm": 2.2105213115614566, + "language_loss": 0.9123559, + "learning_rate": 3.988077612246394e-06, + "loss": 0.93506968, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 1.48632812, + "router_z_loss_mlp": 0.22924805, + "step": 1060, + "time_per_iteration": 2.6230857372283936 + }, + { + "auxiliary_loss_clip": 0.01204263, + "auxiliary_loss_mlp": 0.01061555, + "balance_loss_clip": 1.05881643, + "balance_loss_mlp": 1.03854799, + "epoch": 0.06379077108071547, + "flos": 16670607222240.0, + "grad_norm": 2.4688201740752747, + "language_loss": 0.87235588, + "learning_rate": 3.988035112776035e-06, + "loss": 0.89501405, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.2298584, + "step": 1061, + "time_per_iteration": 2.6598598957061768 + }, + { + "auxiliary_loss_clip": 0.01207867, + "auxiliary_loss_mlp": 0.01053825, + "balance_loss_clip": 1.05697298, + "balance_loss_mlp": 1.03030467, + "epoch": 0.06385089433338344, + "flos": 34750242747360.0, + "grad_norm": 2.1419127994921494, + "language_loss": 0.77255833, + "learning_rate": 3.987992537919185e-06, + "loss": 0.79517525, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 1.50878906, + "router_z_loss_mlp": 0.23547363, + "step": 1062, + "time_per_iteration": 2.7419800758361816 + }, + { + "auxiliary_loss_clip": 0.01206215, + "auxiliary_loss_mlp": 0.01055519, + "balance_loss_clip": 1.05897045, + "balance_loss_mlp": 1.03364456, + "epoch": 0.0639110175860514, + "flos": 29665156672800.0, + "grad_norm": 2.031948586339304, + "language_loss": 0.86517811, + "learning_rate": 3.987949887677459e-06, + "loss": 0.88779545, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 1.47265625, + "router_z_loss_mlp": 0.21875, + "step": 1063, + "time_per_iteration": 2.6874773502349854 + }, + { + "auxiliary_loss_clip": 0.0120353, + "auxiliary_loss_mlp": 0.01060143, + "balance_loss_clip": 1.05818367, + "balance_loss_mlp": 1.03700447, + "epoch": 0.06397114083871938, + "flos": 26955302670720.0, + "grad_norm": 1.9681214175796984, + "language_loss": 0.80319428, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82583106, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 1.45410156, + "router_z_loss_mlp": 0.23120117, + "step": 1064, + "time_per_iteration": 2.667550563812256 + }, + { + "auxiliary_loss_clip": 0.01208058, + "auxiliary_loss_mlp": 0.01064262, + "balance_loss_clip": 1.06149387, + "balance_loss_mlp": 1.04012179, + "epoch": 0.06403126409138735, + "flos": 23882748264000.0, + "grad_norm": 3.199654721697041, + "language_loss": 0.8431887, + "learning_rate": 3.987864361045851e-06, + "loss": 0.8659119, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 1.46679688, + "router_z_loss_mlp": 0.24133301, + "step": 1065, + "time_per_iteration": 2.7156002521514893 + }, + { + "auxiliary_loss_clip": 0.01205096, + "auxiliary_loss_mlp": 0.01049143, + "balance_loss_clip": 1.06169641, + "balance_loss_mlp": 1.02854359, + "epoch": 0.06409138734405531, + "flos": 49794836516640.0, + "grad_norm": 1.6066880724881156, + "language_loss": 0.68086147, + "learning_rate": 3.987821484659211e-06, + "loss": 0.70340383, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.20593262, + "step": 1066, + "time_per_iteration": 2.8595082759857178 + }, + { + "auxiliary_loss_clip": 0.01206189, + "auxiliary_loss_mlp": 0.01067851, + "balance_loss_clip": 1.06187868, + "balance_loss_mlp": 1.04467678, + "epoch": 0.06415151059672328, + "flos": 24943540831200.0, + "grad_norm": 1.9346135815448988, + "language_loss": 0.90249294, + "learning_rate": 3.987778532894181e-06, + "loss": 0.92523336, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 1.44628906, + "router_z_loss_mlp": 0.23168945, + "step": 1067, + "time_per_iteration": 2.7015371322631836 + }, + { + "auxiliary_loss_clip": 0.01206398, + "auxiliary_loss_mlp": 0.01062336, + "balance_loss_clip": 1.06041932, + "balance_loss_mlp": 1.041224, + "epoch": 0.06421163384939126, + "flos": 22051830160800.0, + "grad_norm": 2.15785346601583, + "language_loss": 0.83567369, + "learning_rate": 3.987735505752391e-06, + "loss": 0.85836101, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 1.4609375, + "router_z_loss_mlp": 0.21118164, + "step": 1068, + "time_per_iteration": 2.6622350215911865 + }, + { + "auxiliary_loss_clip": 0.01204338, + "auxiliary_loss_mlp": 0.0105564, + "balance_loss_clip": 1.06347704, + "balance_loss_mlp": 1.03432524, + "epoch": 0.06427175710205922, + "flos": 30651086007360.0, + "grad_norm": 3.3082071672096833, + "language_loss": 0.8955676, + "learning_rate": 3.987692403235471e-06, + "loss": 0.91816735, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.21313477, + "step": 1069, + "time_per_iteration": 2.695676803588867 + }, + { + "auxiliary_loss_clip": 0.01207547, + "auxiliary_loss_mlp": 0.01069799, + "balance_loss_clip": 1.0606935, + "balance_loss_mlp": 1.04613626, + "epoch": 0.06433188035472719, + "flos": 21207854324160.0, + "grad_norm": 2.88229511149387, + "language_loss": 0.95853525, + "learning_rate": 3.987649225345056e-06, + "loss": 0.9813087, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 1.46777344, + "router_z_loss_mlp": 0.23669434, + "step": 1070, + "time_per_iteration": 2.6288187503814697 + }, + { + "auxiliary_loss_clip": 0.01206501, + "auxiliary_loss_mlp": 0.01050577, + "balance_loss_clip": 1.06081963, + "balance_loss_mlp": 1.0276649, + "epoch": 0.06439200360739517, + "flos": 28731771900000.0, + "grad_norm": 1.8179612233623315, + "language_loss": 0.88261127, + "learning_rate": 3.987605972082782e-06, + "loss": 0.905182, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.22900391, + "step": 1071, + "time_per_iteration": 2.6792924404144287 + }, + { + "auxiliary_loss_clip": 0.01203388, + "auxiliary_loss_mlp": 0.01047787, + "balance_loss_clip": 1.059955, + "balance_loss_mlp": 1.02741385, + "epoch": 0.06445212686006313, + "flos": 26819994006720.0, + "grad_norm": 1.5986062587858143, + "language_loss": 0.76070571, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78321755, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.20373535, + "step": 1072, + "time_per_iteration": 2.657161235809326 + }, + { + "auxiliary_loss_clip": 0.01206957, + "auxiliary_loss_mlp": 0.01060218, + "balance_loss_clip": 1.05967522, + "balance_loss_mlp": 1.03682923, + "epoch": 0.0645122501127311, + "flos": 31629641196960.0, + "grad_norm": 1.926407278026174, + "language_loss": 0.80290818, + "learning_rate": 3.987519239449226e-06, + "loss": 0.82557994, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.23376465, + "step": 1073, + "time_per_iteration": 2.6683573722839355 + }, + { + "auxiliary_loss_clip": 0.01196723, + "auxiliary_loss_mlp": 0.01058795, + "balance_loss_clip": 1.05754924, + "balance_loss_mlp": 1.03787351, + "epoch": 0.06457237336539907, + "flos": 31269655450080.0, + "grad_norm": 1.825921679590525, + "language_loss": 0.80257928, + "learning_rate": 3.987475760081233e-06, + "loss": 0.8251344, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.20935059, + "step": 1074, + "time_per_iteration": 2.6763763427734375 + }, + { + "auxiliary_loss_clip": 0.01205651, + "auxiliary_loss_mlp": 0.01058964, + "balance_loss_clip": 1.06071925, + "balance_loss_mlp": 1.03671968, + "epoch": 0.06463249661806704, + "flos": 23749870636800.0, + "grad_norm": 1.7993722803251349, + "language_loss": 0.78831589, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81096202, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.22241211, + "step": 1075, + "time_per_iteration": 2.648155927658081 + }, + { + "auxiliary_loss_clip": 0.0120977, + "auxiliary_loss_mlp": 0.01060881, + "balance_loss_clip": 1.06572819, + "balance_loss_mlp": 1.04042506, + "epoch": 0.064692619870735, + "flos": 29892913068960.0, + "grad_norm": 2.8097377424754884, + "language_loss": 0.87547576, + "learning_rate": 3.987388575251055e-06, + "loss": 0.89818227, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.20446777, + "step": 1076, + "time_per_iteration": 2.7041327953338623 + }, + { + "auxiliary_loss_clip": 0.01199693, + "auxiliary_loss_mlp": 0.01049229, + "balance_loss_clip": 1.05902135, + "balance_loss_mlp": 1.02780747, + "epoch": 0.06475274312340297, + "flos": 20765995855200.0, + "grad_norm": 1.7489084566218847, + "language_loss": 0.8069241, + "learning_rate": 3.98734486979218e-06, + "loss": 0.82941335, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.2142334, + "step": 1077, + "time_per_iteration": 2.6265716552734375 + }, + { + "auxiliary_loss_clip": 0.01209622, + "auxiliary_loss_mlp": 0.01059423, + "balance_loss_clip": 1.06079674, + "balance_loss_mlp": 1.03642774, + "epoch": 0.06481286637607095, + "flos": 29983497006240.0, + "grad_norm": 3.4502546526915907, + "language_loss": 0.91942823, + "learning_rate": 3.987301088972986e-06, + "loss": 0.9421187, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 1.48828125, + "router_z_loss_mlp": 0.23010254, + "step": 1078, + "time_per_iteration": 2.729199171066284 + }, + { + "auxiliary_loss_clip": 0.01214002, + "auxiliary_loss_mlp": 0.01057012, + "balance_loss_clip": 1.06389153, + "balance_loss_mlp": 1.03492224, + "epoch": 0.06487298962873891, + "flos": 25753164364800.0, + "grad_norm": 3.5776742994059303, + "language_loss": 0.78751373, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81022388, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 1.5, + "router_z_loss_mlp": 0.22094727, + "step": 1079, + "time_per_iteration": 2.6550416946411133 + }, + { + "auxiliary_loss_clip": 0.01205469, + "auxiliary_loss_mlp": 0.01059562, + "balance_loss_clip": 1.06103444, + "balance_loss_mlp": 1.03740048, + "epoch": 0.06493311288140688, + "flos": 30027249318240.0, + "grad_norm": 2.347592763654615, + "language_loss": 0.69494987, + "learning_rate": 3.987213301260294e-06, + "loss": 0.71760011, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.22131348, + "step": 1080, + "time_per_iteration": 2.6968190670013428 + }, + { + "auxiliary_loss_clip": 0.01205353, + "auxiliary_loss_mlp": 0.01058808, + "balance_loss_clip": 1.05988872, + "balance_loss_mlp": 1.03544307, + "epoch": 0.06499323613407486, + "flos": 30917813676480.0, + "grad_norm": 1.9355350554822601, + "language_loss": 0.71894944, + "learning_rate": 3.987169294370123e-06, + "loss": 0.7415911, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 1.45703125, + "router_z_loss_mlp": 0.23352051, + "step": 1081, + "time_per_iteration": 2.6573286056518555 + }, + { + "auxiliary_loss_clip": 0.01203624, + "auxiliary_loss_mlp": 0.01057607, + "balance_loss_clip": 1.06051123, + "balance_loss_mlp": 1.03431356, + "epoch": 0.06505335938674282, + "flos": 24862478454720.0, + "grad_norm": 2.990380574666302, + "language_loss": 0.84442461, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86703694, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 1.4296875, + "router_z_loss_mlp": 0.2331543, + "step": 1082, + "time_per_iteration": 4.086202383041382 + }, + { + "auxiliary_loss_clip": 0.01216219, + "auxiliary_loss_mlp": 0.01061611, + "balance_loss_clip": 1.0641799, + "balance_loss_mlp": 1.03872299, + "epoch": 0.06511348263941079, + "flos": 30917732641920.0, + "grad_norm": 2.608390054200457, + "language_loss": 0.82528538, + "learning_rate": 3.987081054530478e-06, + "loss": 0.84806371, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 1.52050781, + "router_z_loss_mlp": 0.22875977, + "step": 1083, + "time_per_iteration": 5.571670293807983 + }, + { + "auxiliary_loss_clip": 0.01209844, + "auxiliary_loss_mlp": 0.01060816, + "balance_loss_clip": 1.06457508, + "balance_loss_mlp": 1.03770161, + "epoch": 0.06517360589207877, + "flos": 24810703721280.0, + "grad_norm": 2.931718023625281, + "language_loss": 0.79469568, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81740224, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 1.45507812, + "router_z_loss_mlp": 0.23144531, + "step": 1084, + "time_per_iteration": 4.196600437164307 + }, + { + "auxiliary_loss_clip": 0.01208045, + "auxiliary_loss_mlp": 0.01058358, + "balance_loss_clip": 1.06316209, + "balance_loss_mlp": 1.03581512, + "epoch": 0.06523372914474673, + "flos": 38659277293920.0, + "grad_norm": 3.9678282960309237, + "language_loss": 0.66332376, + "learning_rate": 3.986992513289584e-06, + "loss": 0.68598777, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.2253418, + "step": 1085, + "time_per_iteration": 2.7451565265655518 + }, + { + "auxiliary_loss_clip": 0.01201401, + "auxiliary_loss_mlp": 0.01065976, + "balance_loss_clip": 1.06210113, + "balance_loss_mlp": 1.0444113, + "epoch": 0.0652938523974147, + "flos": 25353478033920.0, + "grad_norm": 2.323965149478883, + "language_loss": 0.76813346, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.79080719, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.21557617, + "step": 1086, + "time_per_iteration": 2.672363758087158 + }, + { + "auxiliary_loss_clip": 0.01203447, + "auxiliary_loss_mlp": 0.01054916, + "balance_loss_clip": 1.06043899, + "balance_loss_mlp": 1.03236127, + "epoch": 0.06535397565008266, + "flos": 20366066420640.0, + "grad_norm": 2.1727950223986228, + "language_loss": 0.85028678, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87287039, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.22546387, + "step": 1087, + "time_per_iteration": 2.6681830883026123 + }, + { + "auxiliary_loss_clip": 0.01210051, + "auxiliary_loss_mlp": 0.01057877, + "balance_loss_clip": 1.06512022, + "balance_loss_mlp": 1.03612113, + "epoch": 0.06541409890275064, + "flos": 32785312533120.0, + "grad_norm": 1.7745136225703495, + "language_loss": 0.77985257, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.80253184, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.21765137, + "step": 1088, + "time_per_iteration": 2.732170343399048 + }, + { + "auxiliary_loss_clip": 0.01210458, + "auxiliary_loss_mlp": 0.01063786, + "balance_loss_clip": 1.06616521, + "balance_loss_mlp": 1.04311538, + "epoch": 0.06547422215541861, + "flos": 25040526498720.0, + "grad_norm": 1.8450798585528494, + "language_loss": 0.71067637, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73341882, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 1.44140625, + "router_z_loss_mlp": 0.20666504, + "step": 1089, + "time_per_iteration": 2.676898241043091 + }, + { + "auxiliary_loss_clip": 0.01207694, + "auxiliary_loss_mlp": 0.01054916, + "balance_loss_clip": 1.0658567, + "balance_loss_mlp": 1.03397131, + "epoch": 0.06553434540808657, + "flos": 26864921319840.0, + "grad_norm": 1.9373873319200783, + "language_loss": 0.85717422, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.87980032, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.20947266, + "step": 1090, + "time_per_iteration": 2.647359609603882 + }, + { + "auxiliary_loss_clip": 0.0121402, + "auxiliary_loss_mlp": 0.01057743, + "balance_loss_clip": 1.06778193, + "balance_loss_mlp": 1.03567755, + "epoch": 0.06559446866075455, + "flos": 30027776042880.0, + "grad_norm": 1.7497295579111416, + "language_loss": 0.72238749, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.74510515, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 1.46289062, + "router_z_loss_mlp": 0.22070312, + "step": 1091, + "time_per_iteration": 2.7250354290008545 + }, + { + "auxiliary_loss_clip": 0.01208411, + "auxiliary_loss_mlp": 0.01062364, + "balance_loss_clip": 1.06462646, + "balance_loss_mlp": 1.04016733, + "epoch": 0.06565459191342252, + "flos": 29620756084320.0, + "grad_norm": 2.323759095263997, + "language_loss": 0.83037853, + "learning_rate": 3.986680245605936e-06, + "loss": 0.85308629, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.22192383, + "step": 1092, + "time_per_iteration": 2.66995906829834 + }, + { + "auxiliary_loss_clip": 0.01212121, + "auxiliary_loss_mlp": 0.01057389, + "balance_loss_clip": 1.0640229, + "balance_loss_mlp": 1.03369045, + "epoch": 0.06571471516609048, + "flos": 30245929843680.0, + "grad_norm": 1.9017352963813017, + "language_loss": 0.71594489, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73864001, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 1.48046875, + "router_z_loss_mlp": 0.23706055, + "step": 1093, + "time_per_iteration": 2.67146372795105 + }, + { + "auxiliary_loss_clip": 0.01211543, + "auxiliary_loss_mlp": 0.01057782, + "balance_loss_clip": 1.06709754, + "balance_loss_mlp": 1.03386903, + "epoch": 0.06577483841875846, + "flos": 31986831251520.0, + "grad_norm": 1.6780703967609656, + "language_loss": 0.88058001, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90327322, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 1.44335938, + "router_z_loss_mlp": 0.23925781, + "step": 1094, + "time_per_iteration": 2.6919033527374268 + }, + { + "auxiliary_loss_clip": 0.0121193, + "auxiliary_loss_mlp": 0.01065351, + "balance_loss_clip": 1.06660891, + "balance_loss_mlp": 1.04069889, + "epoch": 0.06583496167142643, + "flos": 30604132830240.0, + "grad_norm": 1.6466260488267386, + "language_loss": 0.8152101, + "learning_rate": 3.986545286538044e-06, + "loss": 0.83798289, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.2467041, + "step": 1095, + "time_per_iteration": 2.65901255607605 + }, + { + "auxiliary_loss_clip": 0.0120846, + "auxiliary_loss_mlp": 0.01055963, + "balance_loss_clip": 1.06520367, + "balance_loss_mlp": 1.03452921, + "epoch": 0.06589508492409439, + "flos": 31273909764480.0, + "grad_norm": 2.4220777585853996, + "language_loss": 0.69942623, + "learning_rate": 3.986500149519811e-06, + "loss": 0.72207052, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 1.43164062, + "router_z_loss_mlp": 0.21435547, + "step": 1096, + "time_per_iteration": 2.7121262550354004 + }, + { + "auxiliary_loss_clip": 0.01211765, + "auxiliary_loss_mlp": 0.01062276, + "balance_loss_clip": 1.06772757, + "balance_loss_mlp": 1.0396378, + "epoch": 0.06595520817676236, + "flos": 28823085148320.0, + "grad_norm": 1.7567053343012335, + "language_loss": 0.77602595, + "learning_rate": 3.986454937173292e-06, + "loss": 0.79876637, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.22631836, + "step": 1097, + "time_per_iteration": 2.7063393592834473 + }, + { + "auxiliary_loss_clip": 0.01212004, + "auxiliary_loss_mlp": 0.01063201, + "balance_loss_clip": 1.0658617, + "balance_loss_mlp": 1.04120684, + "epoch": 0.06601533142943034, + "flos": 41245977816000.0, + "grad_norm": 1.6069076640227713, + "language_loss": 0.78572339, + "learning_rate": 3.986409649500203e-06, + "loss": 0.80847549, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 1.46191406, + "router_z_loss_mlp": 0.2199707, + "step": 1098, + "time_per_iteration": 2.891512155532837 + }, + { + "auxiliary_loss_clip": 0.01211425, + "auxiliary_loss_mlp": 0.01064221, + "balance_loss_clip": 1.0674355, + "balance_loss_mlp": 1.04058146, + "epoch": 0.0660754546820983, + "flos": 24720930129600.0, + "grad_norm": 1.9697571717383497, + "language_loss": 0.81911349, + "learning_rate": 3.986364286502261e-06, + "loss": 0.84186989, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.23632812, + "step": 1099, + "time_per_iteration": 2.7848076820373535 + }, + { + "auxiliary_loss_clip": 0.01203658, + "auxiliary_loss_mlp": 0.01048308, + "balance_loss_clip": 1.0622226, + "balance_loss_mlp": 1.02576518, + "epoch": 0.06613557793476627, + "flos": 23615899043040.0, + "grad_norm": 2.2015527090177742, + "language_loss": 0.82891554, + "learning_rate": 3.986318848181186e-06, + "loss": 0.85143518, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.22546387, + "step": 1100, + "time_per_iteration": 2.751789093017578 + }, + { + "auxiliary_loss_clip": 0.01210628, + "auxiliary_loss_mlp": 0.01057405, + "balance_loss_clip": 1.06631291, + "balance_loss_mlp": 1.03505373, + "epoch": 0.06619570118743424, + "flos": 16804740885120.0, + "grad_norm": 3.1770950732022687, + "language_loss": 0.73473525, + "learning_rate": 3.986273334538702e-06, + "loss": 0.75741553, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.22351074, + "step": 1101, + "time_per_iteration": 2.6265010833740234 + }, + { + "auxiliary_loss_clip": 0.01206747, + "auxiliary_loss_mlp": 0.01055834, + "balance_loss_clip": 1.06416047, + "balance_loss_mlp": 1.03356612, + "epoch": 0.06625582444010221, + "flos": 21790045599840.0, + "grad_norm": 2.3268780589490285, + "language_loss": 0.86463261, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88725841, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.22277832, + "step": 1102, + "time_per_iteration": 2.6544268131256104 + }, + { + "auxiliary_loss_clip": 0.01206758, + "auxiliary_loss_mlp": 0.01058567, + "balance_loss_clip": 1.06427467, + "balance_loss_mlp": 1.03588164, + "epoch": 0.06631594769277017, + "flos": 14444905379040.0, + "grad_norm": 2.216731314160965, + "language_loss": 0.81693327, + "learning_rate": 3.98618208129641e-06, + "loss": 0.8395865, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.22692871, + "step": 1103, + "time_per_iteration": 2.6214759349823 + }, + { + "auxiliary_loss_clip": 0.01209311, + "auxiliary_loss_mlp": 0.01061811, + "balance_loss_clip": 1.06669998, + "balance_loss_mlp": 1.04051971, + "epoch": 0.06637607094543815, + "flos": 24151704383520.0, + "grad_norm": 1.7919544239342289, + "language_loss": 0.82074183, + "learning_rate": 3.986136341700063e-06, + "loss": 0.84345305, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.2130127, + "step": 1104, + "time_per_iteration": 2.660534620285034 + }, + { + "auxiliary_loss_clip": 0.01201608, + "auxiliary_loss_mlp": 0.01047169, + "balance_loss_clip": 1.06019688, + "balance_loss_mlp": 1.02398252, + "epoch": 0.06643619419810612, + "flos": 31097847067200.0, + "grad_norm": 1.5556374074222905, + "language_loss": 0.80519629, + "learning_rate": 3.986090526789227e-06, + "loss": 0.82768404, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.23181152, + "step": 1105, + "time_per_iteration": 2.7241930961608887 + }, + { + "auxiliary_loss_clip": 0.01201167, + "auxiliary_loss_mlp": 0.01055579, + "balance_loss_clip": 1.06286895, + "balance_loss_mlp": 1.03443098, + "epoch": 0.06649631745077408, + "flos": 20677883472000.0, + "grad_norm": 1.9860919532388916, + "language_loss": 0.96862233, + "learning_rate": 3.986044636565639e-06, + "loss": 0.99118972, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.21142578, + "step": 1106, + "time_per_iteration": 2.641599416732788 + }, + { + "auxiliary_loss_clip": 0.01210352, + "auxiliary_loss_mlp": 0.01058611, + "balance_loss_clip": 1.06342673, + "balance_loss_mlp": 1.03562701, + "epoch": 0.06655644070344206, + "flos": 21790855945440.0, + "grad_norm": 1.9350580679783613, + "language_loss": 0.82380557, + "learning_rate": 3.985998671031039e-06, + "loss": 0.84649515, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 1.47070312, + "router_z_loss_mlp": 0.22998047, + "step": 1107, + "time_per_iteration": 2.65852427482605 + }, + { + "auxiliary_loss_clip": 0.01097529, + "auxiliary_loss_mlp": 0.01020994, + "balance_loss_clip": 1.03440416, + "balance_loss_mlp": 1.01745081, + "epoch": 0.06661656395611003, + "flos": 74943977142240.0, + "grad_norm": 0.7932185913605557, + "language_loss": 0.56674397, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.58792919, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 0.63232422, + "router_z_loss_mlp": 0.03546143, + "step": 1108, + "time_per_iteration": 3.2164530754089355 + }, + { + "auxiliary_loss_clip": 0.01207133, + "auxiliary_loss_mlp": 0.01059181, + "balance_loss_clip": 1.06198823, + "balance_loss_mlp": 1.036376, + "epoch": 0.066676687208778, + "flos": 25213307296320.0, + "grad_norm": 3.118470673174893, + "language_loss": 0.72267973, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.74534285, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.2277832, + "step": 1109, + "time_per_iteration": 2.672045946121216 + }, + { + "auxiliary_loss_clip": 0.01202456, + "auxiliary_loss_mlp": 0.01055472, + "balance_loss_clip": 1.06032872, + "balance_loss_mlp": 1.03247643, + "epoch": 0.06673681046144596, + "flos": 25530634697760.0, + "grad_norm": 2.137836288867359, + "language_loss": 0.78037351, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80295277, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.2298584, + "step": 1110, + "time_per_iteration": 2.7061052322387695 + }, + { + "auxiliary_loss_clip": 0.0120521, + "auxiliary_loss_mlp": 0.01054569, + "balance_loss_clip": 1.062289, + "balance_loss_mlp": 1.03275383, + "epoch": 0.06679693371411394, + "flos": 37907384533920.0, + "grad_norm": 2.173288288678995, + "language_loss": 0.71499979, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73759758, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.21813965, + "step": 1111, + "time_per_iteration": 2.766788959503174 + }, + { + "auxiliary_loss_clip": 0.01208448, + "auxiliary_loss_mlp": 0.01063146, + "balance_loss_clip": 1.06413412, + "balance_loss_mlp": 1.04136646, + "epoch": 0.0668570569667819, + "flos": 26510324371200.0, + "grad_norm": 1.7471808906696986, + "language_loss": 0.78512907, + "learning_rate": 3.985767713753971e-06, + "loss": 0.80784506, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.21765137, + "step": 1112, + "time_per_iteration": 2.8306260108947754 + }, + { + "auxiliary_loss_clip": 0.01208341, + "auxiliary_loss_mlp": 0.0106468, + "balance_loss_clip": 1.06583381, + "balance_loss_mlp": 1.04292417, + "epoch": 0.06691718021944987, + "flos": 27756417575520.0, + "grad_norm": 1.9124972002030625, + "language_loss": 0.78863323, + "learning_rate": 3.985721296390005e-06, + "loss": 0.81136346, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.21765137, + "step": 1113, + "time_per_iteration": 2.881781816482544 + }, + { + "auxiliary_loss_clip": 0.01199397, + "auxiliary_loss_mlp": 0.0105576, + "balance_loss_clip": 1.06027746, + "balance_loss_mlp": 1.0352205, + "epoch": 0.06697730347211785, + "flos": 20188626135840.0, + "grad_norm": 2.18076402310086, + "language_loss": 0.82494998, + "learning_rate": 3.985674803727289e-06, + "loss": 0.84750152, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.20556641, + "step": 1114, + "time_per_iteration": 2.6362764835357666 + }, + { + "auxiliary_loss_clip": 0.01094376, + "auxiliary_loss_mlp": 0.01022421, + "balance_loss_clip": 1.03258967, + "balance_loss_mlp": 1.01897264, + "epoch": 0.06703742672478581, + "flos": 72945910143360.0, + "grad_norm": 0.8275302679260741, + "language_loss": 0.58152753, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60269552, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 0.0345459, + "step": 1115, + "time_per_iteration": 3.318481206893921 + }, + { + "auxiliary_loss_clip": 0.01204483, + "auxiliary_loss_mlp": 0.01062757, + "balance_loss_clip": 1.0622952, + "balance_loss_mlp": 1.03978527, + "epoch": 0.06709754997745378, + "flos": 20499349220640.0, + "grad_norm": 3.732453690292813, + "language_loss": 0.91363388, + "learning_rate": 3.985581592512658e-06, + "loss": 0.9363063, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.22973633, + "step": 1116, + "time_per_iteration": 2.6522536277770996 + }, + { + "auxiliary_loss_clip": 0.01213447, + "auxiliary_loss_mlp": 0.01058161, + "balance_loss_clip": 1.06650066, + "balance_loss_mlp": 1.03607142, + "epoch": 0.06715767323012176, + "flos": 26996056704000.0, + "grad_norm": 2.0177706004214793, + "language_loss": 0.87232214, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89503825, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 1.46972656, + "router_z_loss_mlp": 0.22070312, + "step": 1117, + "time_per_iteration": 2.6717522144317627 + }, + { + "auxiliary_loss_clip": 0.01092, + "auxiliary_loss_mlp": 0.01012772, + "balance_loss_clip": 1.03059697, + "balance_loss_mlp": 1.00933909, + "epoch": 0.06721779648278972, + "flos": 81289614739680.0, + "grad_norm": 0.8689549292095824, + "language_loss": 0.59737498, + "learning_rate": 3.985488080124218e-06, + "loss": 0.61842269, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 0.61376953, + "router_z_loss_mlp": 0.03442383, + "step": 1118, + "time_per_iteration": 3.185849905014038 + }, + { + "auxiliary_loss_clip": 0.01203125, + "auxiliary_loss_mlp": 0.01051659, + "balance_loss_clip": 1.05827522, + "balance_loss_mlp": 1.02958155, + "epoch": 0.06727791973545769, + "flos": 27311277206880.0, + "grad_norm": 3.0290253846417743, + "language_loss": 0.82953155, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85207939, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 1.44921875, + "router_z_loss_mlp": 0.22058105, + "step": 1119, + "time_per_iteration": 2.6466376781463623 + }, + { + "auxiliary_loss_clip": 0.01203483, + "auxiliary_loss_mlp": 0.01059653, + "balance_loss_clip": 1.06405699, + "balance_loss_mlp": 1.03918457, + "epoch": 0.06733804298812565, + "flos": 29626995745440.0, + "grad_norm": 1.8059318647497227, + "language_loss": 0.84912258, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87175393, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.20458984, + "step": 1120, + "time_per_iteration": 2.7089860439300537 + }, + { + "auxiliary_loss_clip": 0.01213524, + "auxiliary_loss_mlp": 0.01066451, + "balance_loss_clip": 1.06775379, + "balance_loss_mlp": 1.04375315, + "epoch": 0.06739816624079363, + "flos": 19423686811680.0, + "grad_norm": 1.8299149206186325, + "language_loss": 0.78637439, + "learning_rate": 3.985347246871708e-06, + "loss": 0.80917418, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 1.45898438, + "router_z_loss_mlp": 0.22692871, + "step": 1121, + "time_per_iteration": 4.113529920578003 + }, + { + "auxiliary_loss_clip": 0.01089192, + "auxiliary_loss_mlp": 0.01003063, + "balance_loss_clip": 1.0278945, + "balance_loss_mlp": 0.99970412, + "epoch": 0.0674582894934616, + "flos": 87119671469760.0, + "grad_norm": 0.7495527082858795, + "language_loss": 0.58371341, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60463595, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 0.61279297, + "router_z_loss_mlp": 0.03366089, + "step": 1122, + "time_per_iteration": 4.918056011199951 + }, + { + "auxiliary_loss_clip": 0.01206799, + "auxiliary_loss_mlp": 0.01056327, + "balance_loss_clip": 1.06432939, + "balance_loss_mlp": 1.03494096, + "epoch": 0.06751841274612956, + "flos": 30828688361280.0, + "grad_norm": 2.7500927734594254, + "language_loss": 0.71879101, + "learning_rate": 3.985252981610901e-06, + "loss": 0.7414223, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.21398926, + "step": 1123, + "time_per_iteration": 5.921875715255737 + }, + { + "auxiliary_loss_clip": 0.01205826, + "auxiliary_loss_mlp": 0.01057266, + "balance_loss_clip": 1.06142867, + "balance_loss_mlp": 1.03374565, + "epoch": 0.06757853599879754, + "flos": 29044642400640.0, + "grad_norm": 1.941617117913146, + "language_loss": 0.79160321, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81423414, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 1.44238281, + "router_z_loss_mlp": 0.23547363, + "step": 1124, + "time_per_iteration": 2.6672918796539307 + }, + { + "auxiliary_loss_clip": 0.01203832, + "auxiliary_loss_mlp": 0.01050098, + "balance_loss_clip": 1.06276608, + "balance_loss_mlp": 1.02920079, + "epoch": 0.0676386592514655, + "flos": 25665335602560.0, + "grad_norm": 2.194535924262332, + "language_loss": 0.71416306, + "learning_rate": 3.985158415226128e-06, + "loss": 0.73670238, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.20910645, + "step": 1125, + "time_per_iteration": 2.6907241344451904 + }, + { + "auxiliary_loss_clip": 0.01203482, + "auxiliary_loss_mlp": 0.01064322, + "balance_loss_clip": 1.06337166, + "balance_loss_mlp": 1.04068279, + "epoch": 0.06769878250413347, + "flos": 31184298241920.0, + "grad_norm": 2.6508661482230096, + "language_loss": 0.81255984, + "learning_rate": 3.985111019116736e-06, + "loss": 0.83523792, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.23620605, + "step": 1126, + "time_per_iteration": 2.736978769302368 + }, + { + "auxiliary_loss_clip": 0.01084945, + "auxiliary_loss_mlp": 0.01004652, + "balance_loss_clip": 1.02428651, + "balance_loss_mlp": 1.00128758, + "epoch": 0.06775890575680145, + "flos": 86213872614240.0, + "grad_norm": 1.0368117707732092, + "language_loss": 0.59806883, + "learning_rate": 3.985063547731735e-06, + "loss": 0.61896473, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 0.60742188, + "router_z_loss_mlp": 0.03372192, + "step": 1127, + "time_per_iteration": 3.254908323287964 + }, + { + "auxiliary_loss_clip": 0.01202683, + "auxiliary_loss_mlp": 0.01052301, + "balance_loss_clip": 1.06284606, + "balance_loss_mlp": 1.03021181, + "epoch": 0.06781902900946941, + "flos": 29573195148000.0, + "grad_norm": 4.690506407850664, + "language_loss": 0.81797886, + "learning_rate": 3.985016001072925e-06, + "loss": 0.84052873, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.22094727, + "step": 1128, + "time_per_iteration": 2.6759068965911865 + }, + { + "auxiliary_loss_clip": 0.01210019, + "auxiliary_loss_mlp": 0.01045959, + "balance_loss_clip": 1.06527686, + "balance_loss_mlp": 1.02286768, + "epoch": 0.06787915226213738, + "flos": 27354178656000.0, + "grad_norm": 4.509264238887223, + "language_loss": 0.76014924, + "learning_rate": 3.984968379142109e-06, + "loss": 0.78270912, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.2310791, + "step": 1129, + "time_per_iteration": 2.687709331512451 + }, + { + "auxiliary_loss_clip": 0.01203085, + "auxiliary_loss_mlp": 0.01054704, + "balance_loss_clip": 1.05962098, + "balance_loss_mlp": 1.03282952, + "epoch": 0.06793927551480534, + "flos": 46011305452320.0, + "grad_norm": 1.7572261837462018, + "language_loss": 0.72023624, + "learning_rate": 3.984920681941094e-06, + "loss": 0.74281406, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 1.43457031, + "router_z_loss_mlp": 0.21875, + "step": 1130, + "time_per_iteration": 2.831613540649414 + }, + { + "auxiliary_loss_clip": 0.0119907, + "auxiliary_loss_mlp": 0.01057304, + "balance_loss_clip": 1.05956984, + "balance_loss_mlp": 1.03522682, + "epoch": 0.06799939876747332, + "flos": 25174457575200.0, + "grad_norm": 1.9753448392981527, + "language_loss": 0.80770129, + "learning_rate": 3.984872909471688e-06, + "loss": 0.8302651, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.22070312, + "step": 1131, + "time_per_iteration": 2.688992977142334 + }, + { + "auxiliary_loss_clip": 0.01197166, + "auxiliary_loss_mlp": 0.0106065, + "balance_loss_clip": 1.05899191, + "balance_loss_mlp": 1.03888226, + "epoch": 0.06805952202014129, + "flos": 18137730954240.0, + "grad_norm": 2.3596687081350725, + "language_loss": 0.80048019, + "learning_rate": 3.984825061735701e-06, + "loss": 0.82305837, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.2175293, + "step": 1132, + "time_per_iteration": 2.6634206771850586 + }, + { + "auxiliary_loss_clip": 0.01200274, + "auxiliary_loss_mlp": 0.01063757, + "balance_loss_clip": 1.05994296, + "balance_loss_mlp": 1.04179859, + "epoch": 0.06811964527280925, + "flos": 59681344773600.0, + "grad_norm": 1.6031151260952203, + "language_loss": 0.63529325, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65793359, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.21960449, + "step": 1133, + "time_per_iteration": 2.8898348808288574 + }, + { + "auxiliary_loss_clip": 0.01201698, + "auxiliary_loss_mlp": 0.01053652, + "balance_loss_clip": 1.05648875, + "balance_loss_mlp": 1.0292623, + "epoch": 0.06817976852547723, + "flos": 18762985748160.0, + "grad_norm": 2.1645498460577763, + "language_loss": 0.74644065, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.76899415, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 1.45214844, + "router_z_loss_mlp": 0.24389648, + "step": 1134, + "time_per_iteration": 2.647580146789551 + }, + { + "auxiliary_loss_clip": 0.01199093, + "auxiliary_loss_mlp": 0.01052738, + "balance_loss_clip": 1.06095362, + "balance_loss_mlp": 1.03154254, + "epoch": 0.0682398917781452, + "flos": 24594089577120.0, + "grad_norm": 1.838974332193539, + "language_loss": 0.87334841, + "learning_rate": 3.984681066946423e-06, + "loss": 0.89586675, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.21203613, + "step": 1135, + "time_per_iteration": 2.9422662258148193 + }, + { + "auxiliary_loss_clip": 0.01201408, + "auxiliary_loss_mlp": 0.0104814, + "balance_loss_clip": 1.05749154, + "balance_loss_mlp": 1.02533495, + "epoch": 0.06830001503081316, + "flos": 28602500310720.0, + "grad_norm": 2.7817642809343126, + "language_loss": 0.77726227, + "learning_rate": 3.984632918162291e-06, + "loss": 0.79975772, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.22802734, + "step": 1136, + "time_per_iteration": 2.6548638343811035 + }, + { + "auxiliary_loss_clip": 0.01204228, + "auxiliary_loss_mlp": 0.01063654, + "balance_loss_clip": 1.06289959, + "balance_loss_mlp": 1.04169607, + "epoch": 0.06836013828348114, + "flos": 41914741818240.0, + "grad_norm": 2.3545453189514847, + "language_loss": 0.84169245, + "learning_rate": 3.984584694120679e-06, + "loss": 0.8643713, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.21960449, + "step": 1137, + "time_per_iteration": 2.7909786701202393 + }, + { + "auxiliary_loss_clip": 0.01197658, + "auxiliary_loss_mlp": 0.01055041, + "balance_loss_clip": 1.05941796, + "balance_loss_mlp": 1.0332135, + "epoch": 0.06842026153614911, + "flos": 28246444740000.0, + "grad_norm": 2.5666980759742613, + "language_loss": 0.78694129, + "learning_rate": 3.984536394823418e-06, + "loss": 0.80946827, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.21826172, + "step": 1138, + "time_per_iteration": 2.6806766986846924 + }, + { + "auxiliary_loss_clip": 0.01201903, + "auxiliary_loss_mlp": 0.01052904, + "balance_loss_clip": 1.05955601, + "balance_loss_mlp": 1.03055215, + "epoch": 0.06848038478881707, + "flos": 30028140698400.0, + "grad_norm": 2.2517586489775625, + "language_loss": 0.85780811, + "learning_rate": 3.984488020272336e-06, + "loss": 0.88035613, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.22338867, + "step": 1139, + "time_per_iteration": 2.691990375518799 + }, + { + "auxiliary_loss_clip": 0.01197153, + "auxiliary_loss_mlp": 0.01055054, + "balance_loss_clip": 1.05700028, + "balance_loss_mlp": 1.03317928, + "epoch": 0.06854050804148504, + "flos": 49884569591040.0, + "grad_norm": 2.1754311418118086, + "language_loss": 0.7464422, + "learning_rate": 3.984439570469271e-06, + "loss": 0.76896423, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.21850586, + "step": 1140, + "time_per_iteration": 2.8196260929107666 + }, + { + "auxiliary_loss_clip": 0.01196663, + "auxiliary_loss_mlp": 0.01062886, + "balance_loss_clip": 1.05938709, + "balance_loss_mlp": 1.03979516, + "epoch": 0.06860063129415302, + "flos": 38664585057600.0, + "grad_norm": 2.51970530094294, + "language_loss": 0.68479204, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.70738757, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.23083496, + "step": 1141, + "time_per_iteration": 2.749300479888916 + }, + { + "auxiliary_loss_clip": 0.01201618, + "auxiliary_loss_mlp": 0.0106558, + "balance_loss_clip": 1.05956435, + "balance_loss_mlp": 1.04277492, + "epoch": 0.06866075454682098, + "flos": 32387044307040.0, + "grad_norm": 2.109691957528365, + "language_loss": 0.79554105, + "learning_rate": 3.984342445114538e-06, + "loss": 0.81821311, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.22790527, + "step": 1142, + "time_per_iteration": 2.706677198410034 + }, + { + "auxiliary_loss_clip": 0.01197922, + "auxiliary_loss_mlp": 0.01057542, + "balance_loss_clip": 1.05761075, + "balance_loss_mlp": 1.03581023, + "epoch": 0.06872087779948895, + "flos": 36350163072000.0, + "grad_norm": 1.748077291657766, + "language_loss": 0.68629813, + "learning_rate": 3.984293769566553e-06, + "loss": 0.70885283, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 1.40527344, + "router_z_loss_mlp": 0.21728516, + "step": 1143, + "time_per_iteration": 2.73832106590271 + }, + { + "auxiliary_loss_clip": 0.01192766, + "auxiliary_loss_mlp": 0.01059028, + "balance_loss_clip": 1.05835509, + "balance_loss_mlp": 1.03896475, + "epoch": 0.06878100105215693, + "flos": 32874275779200.0, + "grad_norm": 1.7269112609737234, + "language_loss": 0.74468887, + "learning_rate": 3.98424501877395e-06, + "loss": 0.76720679, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.20068359, + "step": 1144, + "time_per_iteration": 2.718014717102051 + }, + { + "auxiliary_loss_clip": 0.01202417, + "auxiliary_loss_mlp": 0.01061785, + "balance_loss_clip": 1.05825782, + "balance_loss_mlp": 1.03913534, + "epoch": 0.06884112430482489, + "flos": 13019710681440.0, + "grad_norm": 2.1070655006468537, + "language_loss": 0.91712666, + "learning_rate": 3.984196192738577e-06, + "loss": 0.93976867, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.22631836, + "step": 1145, + "time_per_iteration": 2.63156795501709 + }, + { + "auxiliary_loss_clip": 0.01203112, + "auxiliary_loss_mlp": 0.0106258, + "balance_loss_clip": 1.05790257, + "balance_loss_mlp": 1.03939378, + "epoch": 0.06890124755749286, + "flos": 24640272925920.0, + "grad_norm": 2.2924238101749475, + "language_loss": 0.82441664, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84707355, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 1.453125, + "router_z_loss_mlp": 0.23193359, + "step": 1146, + "time_per_iteration": 2.680490732192993 + }, + { + "auxiliary_loss_clip": 0.01197368, + "auxiliary_loss_mlp": 0.01062548, + "balance_loss_clip": 1.05972791, + "balance_loss_mlp": 1.04168642, + "epoch": 0.06896137081016084, + "flos": 24952130494560.0, + "grad_norm": 2.60011275559131, + "language_loss": 0.85338748, + "learning_rate": 3.98409831494693e-06, + "loss": 0.8759867, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.20861816, + "step": 1147, + "time_per_iteration": 2.6317505836486816 + }, + { + "auxiliary_loss_clip": 0.01198288, + "auxiliary_loss_mlp": 0.01059422, + "balance_loss_clip": 1.05749965, + "balance_loss_mlp": 1.03803563, + "epoch": 0.0690214940628288, + "flos": 22413396081600.0, + "grad_norm": 1.987667800994039, + "language_loss": 0.85975444, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88233161, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.21386719, + "step": 1148, + "time_per_iteration": 2.9072017669677734 + }, + { + "auxiliary_loss_clip": 0.01197928, + "auxiliary_loss_mlp": 0.01056086, + "balance_loss_clip": 1.05756688, + "balance_loss_mlp": 1.03483081, + "epoch": 0.06908161731549677, + "flos": 25084967604480.0, + "grad_norm": 2.813625770127041, + "language_loss": 0.69830525, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.7208454, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.21252441, + "step": 1149, + "time_per_iteration": 2.6586766242980957 + }, + { + "auxiliary_loss_clip": 0.01201982, + "auxiliary_loss_mlp": 0.01048152, + "balance_loss_clip": 1.0581708, + "balance_loss_mlp": 1.02575243, + "epoch": 0.06914174056816474, + "flos": 33633745270560.0, + "grad_norm": 2.082209126562221, + "language_loss": 0.83736312, + "learning_rate": 3.983950933985064e-06, + "loss": 0.85986447, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 1.4375, + "router_z_loss_mlp": 0.22399902, + "step": 1150, + "time_per_iteration": 2.7231364250183105 + }, + { + "auxiliary_loss_clip": 0.01204297, + "auxiliary_loss_mlp": 0.01056097, + "balance_loss_clip": 1.06222248, + "balance_loss_mlp": 1.03350699, + "epoch": 0.06920186382083271, + "flos": 18673617329280.0, + "grad_norm": 4.037148235745495, + "language_loss": 0.81288123, + "learning_rate": 3.983901656532052e-06, + "loss": 0.83548516, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.22595215, + "step": 1151, + "time_per_iteration": 2.640592336654663 + }, + { + "auxiliary_loss_clip": 0.01200086, + "auxiliary_loss_mlp": 0.01056739, + "balance_loss_clip": 1.06128788, + "balance_loss_mlp": 1.03529286, + "epoch": 0.06926198707350067, + "flos": 30738793217760.0, + "grad_norm": 2.195633173802712, + "language_loss": 0.85832882, + "learning_rate": 3.983852303849291e-06, + "loss": 0.8808971, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.21435547, + "step": 1152, + "time_per_iteration": 2.785871982574463 + }, + { + "auxiliary_loss_clip": 0.01196841, + "auxiliary_loss_mlp": 0.01058315, + "balance_loss_clip": 1.05888665, + "balance_loss_mlp": 1.03771567, + "epoch": 0.06932211032616864, + "flos": 16175069707680.0, + "grad_norm": 3.545847728665485, + "language_loss": 0.91116893, + "learning_rate": 3.983802875938651e-06, + "loss": 0.93372047, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.20593262, + "step": 1153, + "time_per_iteration": 2.628016948699951 + }, + { + "auxiliary_loss_clip": 0.01200474, + "auxiliary_loss_mlp": 0.01053021, + "balance_loss_clip": 1.06100619, + "balance_loss_mlp": 1.03119385, + "epoch": 0.06938223357883662, + "flos": 30294706298400.0, + "grad_norm": 2.0947504630351097, + "language_loss": 0.81570208, + "learning_rate": 3.983753372802008e-06, + "loss": 0.83823705, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.21813965, + "step": 1154, + "time_per_iteration": 2.7035579681396484 + }, + { + "auxiliary_loss_clip": 0.01203171, + "auxiliary_loss_mlp": 0.0105816, + "balance_loss_clip": 1.06405449, + "balance_loss_mlp": 1.03723848, + "epoch": 0.06944235683150458, + "flos": 33273273316320.0, + "grad_norm": 2.1126579154278717, + "language_loss": 0.75694078, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77955413, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.20922852, + "step": 1155, + "time_per_iteration": 2.706737756729126 + }, + { + "auxiliary_loss_clip": 0.01196874, + "auxiliary_loss_mlp": 0.01057703, + "balance_loss_clip": 1.05760777, + "balance_loss_mlp": 1.0369246, + "epoch": 0.06950248008417255, + "flos": 31491820461600.0, + "grad_norm": 1.8335684870181772, + "language_loss": 0.7097795, + "learning_rate": 3.98365414085822e-06, + "loss": 0.73232532, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.2076416, + "step": 1156, + "time_per_iteration": 2.698868989944458 + }, + { + "auxiliary_loss_clip": 0.01202929, + "auxiliary_loss_mlp": 0.01058122, + "balance_loss_clip": 1.06207013, + "balance_loss_mlp": 1.03566289, + "epoch": 0.06956260333684053, + "flos": 27176049577440.0, + "grad_norm": 2.2578966233164395, + "language_loss": 0.74990106, + "learning_rate": 3.98360441205484e-06, + "loss": 0.7725116, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.22460938, + "step": 1157, + "time_per_iteration": 2.7806060314178467 + }, + { + "auxiliary_loss_clip": 0.01200762, + "auxiliary_loss_mlp": 0.010542, + "balance_loss_clip": 1.05901563, + "balance_loss_mlp": 1.03200364, + "epoch": 0.0696227265895085, + "flos": 36217812169440.0, + "grad_norm": 1.7515081490363675, + "language_loss": 0.71461523, + "learning_rate": 3.983554608032982e-06, + "loss": 0.73716486, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.2220459, + "step": 1158, + "time_per_iteration": 2.7382471561431885 + }, + { + "auxiliary_loss_clip": 0.01203002, + "auxiliary_loss_mlp": 0.01056441, + "balance_loss_clip": 1.06074333, + "balance_loss_mlp": 1.03454244, + "epoch": 0.06968284984217646, + "flos": 31143949381440.0, + "grad_norm": 1.9606476716043029, + "language_loss": 0.79857266, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82116711, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.21911621, + "step": 1159, + "time_per_iteration": 2.70021390914917 + }, + { + "auxiliary_loss_clip": 0.01203476, + "auxiliary_loss_mlp": 0.01062977, + "balance_loss_clip": 1.06073928, + "balance_loss_mlp": 1.03821719, + "epoch": 0.06974297309484444, + "flos": 25256532883680.0, + "grad_norm": 3.0487367721451535, + "language_loss": 0.80540168, + "learning_rate": 3.983454774341387e-06, + "loss": 0.82806623, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 1.42480469, + "router_z_loss_mlp": 0.24780273, + "step": 1160, + "time_per_iteration": 2.680645227432251 + }, + { + "auxiliary_loss_clip": 0.01200154, + "auxiliary_loss_mlp": 0.01056181, + "balance_loss_clip": 1.05912542, + "balance_loss_mlp": 1.03388929, + "epoch": 0.0698030963475124, + "flos": 32342157511200.0, + "grad_norm": 1.7419063192207136, + "language_loss": 0.76428163, + "learning_rate": 3.983404744675437e-06, + "loss": 0.78684509, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.22277832, + "step": 1161, + "time_per_iteration": 4.30124306678772 + }, + { + "auxiliary_loss_clip": 0.01199103, + "auxiliary_loss_mlp": 0.01061795, + "balance_loss_clip": 1.05916595, + "balance_loss_mlp": 1.03935933, + "epoch": 0.06986321960018037, + "flos": 28114053320160.0, + "grad_norm": 1.8876226169150616, + "language_loss": 0.82805771, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.85066676, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.22424316, + "step": 1162, + "time_per_iteration": 6.971064567565918 + }, + { + "auxiliary_loss_clip": 0.01196666, + "auxiliary_loss_mlp": 0.01051693, + "balance_loss_clip": 1.05743027, + "balance_loss_mlp": 1.02895987, + "epoch": 0.06992334285284833, + "flos": 34879878992160.0, + "grad_norm": 2.1859724411110855, + "language_loss": 0.79542196, + "learning_rate": 3.983304459712716e-06, + "loss": 0.8179056, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.22741699, + "step": 1163, + "time_per_iteration": 2.7301690578460693 + }, + { + "auxiliary_loss_clip": 0.01201501, + "auxiliary_loss_mlp": 0.01059585, + "balance_loss_clip": 1.05989289, + "balance_loss_mlp": 1.03606486, + "epoch": 0.06998346610551631, + "flos": 25130421642240.0, + "grad_norm": 1.9530425446698525, + "language_loss": 0.79138601, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81399691, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.23535156, + "step": 1164, + "time_per_iteration": 2.702863931655884 + }, + { + "auxiliary_loss_clip": 0.01201186, + "auxiliary_loss_mlp": 0.01060133, + "balance_loss_clip": 1.06097007, + "balance_loss_mlp": 1.03717375, + "epoch": 0.07004358935818428, + "flos": 27489811458240.0, + "grad_norm": 1.4043871436807944, + "language_loss": 0.72720641, + "learning_rate": 3.983203873921583e-06, + "loss": 0.74981964, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 1.40332031, + "router_z_loss_mlp": 0.22961426, + "step": 1165, + "time_per_iteration": 2.6618356704711914 + }, + { + "auxiliary_loss_clip": 0.01200022, + "auxiliary_loss_mlp": 0.01054627, + "balance_loss_clip": 1.06062603, + "balance_loss_mlp": 1.03297842, + "epoch": 0.07010371261085224, + "flos": 35325465050880.0, + "grad_norm": 1.8550191471371078, + "language_loss": 0.81148726, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83403373, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.21643066, + "step": 1166, + "time_per_iteration": 2.7687835693359375 + }, + { + "auxiliary_loss_clip": 0.01198503, + "auxiliary_loss_mlp": 0.01047875, + "balance_loss_clip": 1.0593915, + "balance_loss_mlp": 1.02503419, + "epoch": 0.07016383586352022, + "flos": 28869227979840.0, + "grad_norm": 2.004331093762954, + "language_loss": 0.84311557, + "learning_rate": 3.983102987317295e-06, + "loss": 0.86557931, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.22827148, + "step": 1167, + "time_per_iteration": 2.6564669609069824 + }, + { + "auxiliary_loss_clip": 0.01202596, + "auxiliary_loss_mlp": 0.01052368, + "balance_loss_clip": 1.06174564, + "balance_loss_mlp": 1.03030217, + "epoch": 0.07022395911618819, + "flos": 24150326796000.0, + "grad_norm": 3.1629059132039186, + "language_loss": 0.89571822, + "learning_rate": 3.983052431214997e-06, + "loss": 0.91826785, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.22045898, + "step": 1168, + "time_per_iteration": 2.6759722232818604 + }, + { + "auxiliary_loss_clip": 0.01204848, + "auxiliary_loss_mlp": 0.01063398, + "balance_loss_clip": 1.06002402, + "balance_loss_mlp": 1.03861392, + "epoch": 0.07028408236885615, + "flos": 26465234988960.0, + "grad_norm": 2.00128390042675, + "language_loss": 0.8901853, + "learning_rate": 3.983001799915153e-06, + "loss": 0.91286778, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 1.44824219, + "router_z_loss_mlp": 0.24768066, + "step": 1169, + "time_per_iteration": 2.6441688537597656 + }, + { + "auxiliary_loss_clip": 0.01202606, + "auxiliary_loss_mlp": 0.0106243, + "balance_loss_clip": 1.06015134, + "balance_loss_mlp": 1.0395422, + "epoch": 0.07034420562152413, + "flos": 31274436489120.0, + "grad_norm": 2.1006749643873386, + "language_loss": 0.83955157, + "learning_rate": 3.982951093419681e-06, + "loss": 0.86220193, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 1.421875, + "router_z_loss_mlp": 0.2286377, + "step": 1170, + "time_per_iteration": 2.6934659481048584 + }, + { + "auxiliary_loss_clip": 0.01201594, + "auxiliary_loss_mlp": 0.01061493, + "balance_loss_clip": 1.0610528, + "balance_loss_mlp": 1.03814006, + "epoch": 0.0704043288741921, + "flos": 25393259652480.0, + "grad_norm": 1.9231090671051432, + "language_loss": 0.75639677, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77902764, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.23376465, + "step": 1171, + "time_per_iteration": 2.8547983169555664 + }, + { + "auxiliary_loss_clip": 0.01201724, + "auxiliary_loss_mlp": 0.01051978, + "balance_loss_clip": 1.06209731, + "balance_loss_mlp": 1.03001988, + "epoch": 0.07046445212686006, + "flos": 31229225555040.0, + "grad_norm": 1.9893009567804851, + "language_loss": 0.89123559, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91377264, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.21948242, + "step": 1172, + "time_per_iteration": 2.7676594257354736 + }, + { + "auxiliary_loss_clip": 0.01200162, + "auxiliary_loss_mlp": 0.01048945, + "balance_loss_clip": 1.05665183, + "balance_loss_mlp": 1.02605665, + "epoch": 0.07052457537952803, + "flos": 31185027552960.0, + "grad_norm": 1.7103562169955107, + "language_loss": 0.8209666, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84345764, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 1.43554688, + "router_z_loss_mlp": 0.22851562, + "step": 1173, + "time_per_iteration": 2.6941158771514893 + }, + { + "auxiliary_loss_clip": 0.01197502, + "auxiliary_loss_mlp": 0.01050798, + "balance_loss_clip": 1.05747211, + "balance_loss_mlp": 1.02839828, + "epoch": 0.070584698632196, + "flos": 21924949091040.0, + "grad_norm": 1.952156745638519, + "language_loss": 0.82284749, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.84533048, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 1.40234375, + "router_z_loss_mlp": 0.22412109, + "step": 1174, + "time_per_iteration": 2.6502630710601807 + }, + { + "auxiliary_loss_clip": 0.01197465, + "auxiliary_loss_mlp": 0.010559, + "balance_loss_clip": 1.05779552, + "balance_loss_mlp": 1.03330982, + "epoch": 0.07064482188486397, + "flos": 30958284088800.0, + "grad_norm": 1.9017579790682992, + "language_loss": 0.85378015, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87631381, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.22595215, + "step": 1175, + "time_per_iteration": 2.657379627227783 + }, + { + "auxiliary_loss_clip": 0.0120214, + "auxiliary_loss_mlp": 0.01064037, + "balance_loss_clip": 1.06145382, + "balance_loss_mlp": 1.04228175, + "epoch": 0.07070494513753194, + "flos": 30383183337120.0, + "grad_norm": 1.8910876488406942, + "language_loss": 0.83176708, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85442889, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 1.40722656, + "router_z_loss_mlp": 0.2175293, + "step": 1176, + "time_per_iteration": 2.751729726791382 + }, + { + "auxiliary_loss_clip": 0.01199954, + "auxiliary_loss_mlp": 0.01060775, + "balance_loss_clip": 1.06108761, + "balance_loss_mlp": 1.03848255, + "epoch": 0.07076506839019991, + "flos": 27258246437760.0, + "grad_norm": 2.264098548410351, + "language_loss": 0.74230814, + "learning_rate": 3.982594042635701e-06, + "loss": 0.76491535, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.22314453, + "step": 1177, + "time_per_iteration": 2.67647385597229 + }, + { + "auxiliary_loss_clip": 0.0120408, + "auxiliary_loss_mlp": 0.01054515, + "balance_loss_clip": 1.06135869, + "balance_loss_mlp": 1.03212786, + "epoch": 0.07082519164286788, + "flos": 22769816307840.0, + "grad_norm": 1.777349471002737, + "language_loss": 0.86012214, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88270807, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 1.42675781, + "router_z_loss_mlp": 0.22399902, + "step": 1178, + "time_per_iteration": 2.6664414405822754 + }, + { + "auxiliary_loss_clip": 0.01091159, + "auxiliary_loss_mlp": 0.01044515, + "balance_loss_clip": 1.03081965, + "balance_loss_mlp": 1.04078984, + "epoch": 0.07088531489553584, + "flos": 77670159608160.0, + "grad_norm": 0.8447762173635521, + "language_loss": 0.63238329, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65374005, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 0.60351562, + "router_z_loss_mlp": 0.03720093, + "step": 1179, + "time_per_iteration": 3.3684868812561035 + }, + { + "auxiliary_loss_clip": 0.01206724, + "auxiliary_loss_mlp": 0.01056221, + "balance_loss_clip": 1.06276393, + "balance_loss_mlp": 1.03475142, + "epoch": 0.07094543814820382, + "flos": 26323767698400.0, + "grad_norm": 2.663309535824297, + "language_loss": 0.83797115, + "learning_rate": 3.98243989312991e-06, + "loss": 0.86060059, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 1.44042969, + "router_z_loss_mlp": 0.21459961, + "step": 1180, + "time_per_iteration": 2.655388355255127 + }, + { + "auxiliary_loss_clip": 0.01198696, + "auxiliary_loss_mlp": 0.0106031, + "balance_loss_clip": 1.05904329, + "balance_loss_mlp": 1.0374217, + "epoch": 0.07100556140087179, + "flos": 26954370773280.0, + "grad_norm": 2.822893873626081, + "language_loss": 0.8837949, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90638494, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.22888184, + "step": 1181, + "time_per_iteration": 2.654195547103882 + }, + { + "auxiliary_loss_clip": 0.01202768, + "auxiliary_loss_mlp": 0.01057233, + "balance_loss_clip": 1.06292582, + "balance_loss_mlp": 1.03539348, + "epoch": 0.07106568465353975, + "flos": 58476937500000.0, + "grad_norm": 1.816304967081431, + "language_loss": 0.83629811, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.85889804, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.21813965, + "step": 1182, + "time_per_iteration": 2.95530366897583 + }, + { + "auxiliary_loss_clip": 0.01197149, + "auxiliary_loss_mlp": 0.01053363, + "balance_loss_clip": 1.06268382, + "balance_loss_mlp": 1.03046322, + "epoch": 0.07112580790620772, + "flos": 28602702897120.0, + "grad_norm": 2.38835680178343, + "language_loss": 0.79145664, + "learning_rate": 3.982285067055262e-06, + "loss": 0.81396174, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.22912598, + "step": 1183, + "time_per_iteration": 2.765684127807617 + }, + { + "auxiliary_loss_clip": 0.01202307, + "auxiliary_loss_mlp": 0.01056645, + "balance_loss_clip": 1.05831122, + "balance_loss_mlp": 1.03392351, + "epoch": 0.0711859311588757, + "flos": 38887317311040.0, + "grad_norm": 2.064427706271616, + "language_loss": 0.79131782, + "learning_rate": 3.982233308024204e-06, + "loss": 0.81390738, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 1.43847656, + "router_z_loss_mlp": 0.22729492, + "step": 1184, + "time_per_iteration": 2.9756531715393066 + }, + { + "auxiliary_loss_clip": 0.01197144, + "auxiliary_loss_mlp": 0.01050918, + "balance_loss_clip": 1.06054807, + "balance_loss_mlp": 1.02991295, + "epoch": 0.07124605441154366, + "flos": 23928404888160.0, + "grad_norm": 3.7167830187131954, + "language_loss": 0.76948166, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79196227, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.21008301, + "step": 1185, + "time_per_iteration": 2.636483907699585 + }, + { + "auxiliary_loss_clip": 0.01200058, + "auxiliary_loss_mlp": 0.01060887, + "balance_loss_clip": 1.06010413, + "balance_loss_mlp": 1.03916681, + "epoch": 0.07130617766421163, + "flos": 17917510772160.0, + "grad_norm": 2.874046236397004, + "language_loss": 0.65543061, + "learning_rate": 3.982129564464596e-06, + "loss": 0.67804003, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.21716309, + "step": 1186, + "time_per_iteration": 2.6855297088623047 + }, + { + "auxiliary_loss_clip": 0.01198891, + "auxiliary_loss_mlp": 0.01049392, + "balance_loss_clip": 1.06093454, + "balance_loss_mlp": 1.02833939, + "epoch": 0.07136630091687961, + "flos": 31808499586560.0, + "grad_norm": 1.8524399290834723, + "language_loss": 0.69752586, + "learning_rate": 3.98207757993998e-06, + "loss": 0.72000873, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.21057129, + "step": 1187, + "time_per_iteration": 2.687141180038452 + }, + { + "auxiliary_loss_clip": 0.01194671, + "auxiliary_loss_mlp": 0.01053008, + "balance_loss_clip": 1.05986941, + "balance_loss_mlp": 1.03271818, + "epoch": 0.07142642416954757, + "flos": 19119203388000.0, + "grad_norm": 2.3804450520902276, + "language_loss": 0.78785276, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.81032956, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.20288086, + "step": 1188, + "time_per_iteration": 2.6331963539123535 + }, + { + "auxiliary_loss_clip": 0.01200548, + "auxiliary_loss_mlp": 0.01056305, + "balance_loss_clip": 1.0619297, + "balance_loss_mlp": 1.03401279, + "epoch": 0.07148654742221554, + "flos": 24105440000160.0, + "grad_norm": 4.3922623994132275, + "language_loss": 0.85035825, + "learning_rate": 3.981973385410981e-06, + "loss": 0.87292683, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.22290039, + "step": 1189, + "time_per_iteration": 2.6448569297790527 + }, + { + "auxiliary_loss_clip": 0.01196011, + "auxiliary_loss_mlp": 0.01052471, + "balance_loss_clip": 1.05883324, + "balance_loss_mlp": 1.02966666, + "epoch": 0.07154667067488352, + "flos": 28641066410880.0, + "grad_norm": 1.7117509489949978, + "language_loss": 0.76768124, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.79016614, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.22802734, + "step": 1190, + "time_per_iteration": 2.6880176067352295 + }, + { + "auxiliary_loss_clip": 0.01199968, + "auxiliary_loss_mlp": 0.01065478, + "balance_loss_clip": 1.05888963, + "balance_loss_mlp": 1.04224443, + "epoch": 0.07160679392755148, + "flos": 22369684286880.0, + "grad_norm": 2.2405932296500177, + "language_loss": 0.75816607, + "learning_rate": 3.981868890255468e-06, + "loss": 0.78082061, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.23242188, + "step": 1191, + "time_per_iteration": 2.6373441219329834 + }, + { + "auxiliary_loss_clip": 0.01197002, + "auxiliary_loss_mlp": 0.01054427, + "balance_loss_clip": 1.05759633, + "balance_loss_mlp": 1.03158641, + "epoch": 0.07166691718021945, + "flos": 21654899004960.0, + "grad_norm": 4.2316906272370165, + "language_loss": 0.73923802, + "learning_rate": 3.981816529947719e-06, + "loss": 0.76175225, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.22827148, + "step": 1192, + "time_per_iteration": 2.660207509994507 + }, + { + "auxiliary_loss_clip": 0.01194068, + "auxiliary_loss_mlp": 0.01048972, + "balance_loss_clip": 1.05530584, + "balance_loss_mlp": 1.02780032, + "epoch": 0.07172704043288743, + "flos": 27396350794080.0, + "grad_norm": 2.267157933903019, + "language_loss": 0.78253859, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.80496901, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.21154785, + "step": 1193, + "time_per_iteration": 2.668971538543701 + }, + { + "auxiliary_loss_clip": 0.0120061, + "auxiliary_loss_mlp": 0.01050206, + "balance_loss_clip": 1.06138003, + "balance_loss_mlp": 1.02784181, + "epoch": 0.07178716368555539, + "flos": 28335934710720.0, + "grad_norm": 2.743263934655498, + "language_loss": 0.85413754, + "learning_rate": 3.981711583882166e-06, + "loss": 0.87664568, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.22363281, + "step": 1194, + "time_per_iteration": 2.677536964416504 + }, + { + "auxiliary_loss_clip": 0.01196842, + "auxiliary_loss_mlp": 0.01054906, + "balance_loss_clip": 1.05901957, + "balance_loss_mlp": 1.0330075, + "epoch": 0.07184728693822336, + "flos": 30689854693920.0, + "grad_norm": 2.1040304033304644, + "language_loss": 0.81542253, + "learning_rate": 3.981658998128341e-06, + "loss": 0.83794004, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.21911621, + "step": 1195, + "time_per_iteration": 2.7116289138793945 + }, + { + "auxiliary_loss_clip": 0.01199722, + "auxiliary_loss_mlp": 0.01050049, + "balance_loss_clip": 1.0623759, + "balance_loss_mlp": 1.02965271, + "epoch": 0.07190741019089132, + "flos": 27712422159840.0, + "grad_norm": 1.8278739926232555, + "language_loss": 0.7991302, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82162797, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.20397949, + "step": 1196, + "time_per_iteration": 2.672168731689453 + }, + { + "auxiliary_loss_clip": 0.01196378, + "auxiliary_loss_mlp": 0.010623, + "balance_loss_clip": 1.05927241, + "balance_loss_mlp": 1.03872037, + "epoch": 0.0719675334435593, + "flos": 35813466351360.0, + "grad_norm": 2.8862338181288734, + "language_loss": 0.71447009, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.73705685, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.23583984, + "step": 1197, + "time_per_iteration": 2.9258649349212646 + }, + { + "auxiliary_loss_clip": 0.01198858, + "auxiliary_loss_mlp": 0.010483, + "balance_loss_clip": 1.06069183, + "balance_loss_mlp": 1.02700913, + "epoch": 0.07202765669622727, + "flos": 21523844655360.0, + "grad_norm": 2.8623120574750467, + "language_loss": 0.85672152, + "learning_rate": 3.98150079000661e-06, + "loss": 0.87919313, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.2130127, + "step": 1198, + "time_per_iteration": 2.599213123321533 + }, + { + "auxiliary_loss_clip": 0.01198643, + "auxiliary_loss_mlp": 0.01056317, + "balance_loss_clip": 1.06073737, + "balance_loss_mlp": 1.03418016, + "epoch": 0.07208777994889523, + "flos": 26153904144960.0, + "grad_norm": 2.014292342593333, + "language_loss": 0.83699185, + "learning_rate": 3.981447903685947e-06, + "loss": 0.85954142, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.22131348, + "step": 1199, + "time_per_iteration": 2.6655373573303223 + }, + { + "auxiliary_loss_clip": 0.01204648, + "auxiliary_loss_mlp": 0.01049729, + "balance_loss_clip": 1.0655427, + "balance_loss_mlp": 1.02885532, + "epoch": 0.07214790320156321, + "flos": 32873992158240.0, + "grad_norm": 2.2742347540672965, + "language_loss": 0.76565349, + "learning_rate": 3.981394942228581e-06, + "loss": 0.78819722, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.20874023, + "step": 1200, + "time_per_iteration": 4.116399765014648 + }, + { + "auxiliary_loss_clip": 0.0120103, + "auxiliary_loss_mlp": 0.01062133, + "balance_loss_clip": 1.06251514, + "balance_loss_mlp": 1.0398643, + "epoch": 0.07220802645423118, + "flos": 29138467720320.0, + "grad_norm": 2.594885626716726, + "language_loss": 0.83048046, + "learning_rate": 3.98134190563652e-06, + "loss": 0.85311204, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.22265625, + "step": 1201, + "time_per_iteration": 2.745274782180786 + }, + { + "auxiliary_loss_clip": 0.01202314, + "auxiliary_loss_mlp": 0.01056864, + "balance_loss_clip": 1.06012917, + "balance_loss_mlp": 1.03326046, + "epoch": 0.07226814970689914, + "flos": 23481117103680.0, + "grad_norm": 2.393706770717269, + "language_loss": 0.68722188, + "learning_rate": 3.981288793911775e-06, + "loss": 0.70981365, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 1.42285156, + "router_z_loss_mlp": 0.23596191, + "step": 1202, + "time_per_iteration": 7.110603094100952 + }, + { + "auxiliary_loss_clip": 0.01203468, + "auxiliary_loss_mlp": 0.01059372, + "balance_loss_clip": 1.06231999, + "balance_loss_mlp": 1.03523219, + "epoch": 0.07232827295956712, + "flos": 23393612479680.0, + "grad_norm": 2.0311036115511705, + "language_loss": 0.87850642, + "learning_rate": 3.98123560705636e-06, + "loss": 0.90113485, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 1.4140625, + "router_z_loss_mlp": 0.24145508, + "step": 1203, + "time_per_iteration": 2.6761038303375244 + }, + { + "auxiliary_loss_clip": 0.01204051, + "auxiliary_loss_mlp": 0.01059508, + "balance_loss_clip": 1.06051528, + "balance_loss_mlp": 1.03683436, + "epoch": 0.07238839621223508, + "flos": 21523804138080.0, + "grad_norm": 10.36599832641088, + "language_loss": 0.7847476, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80738318, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 1.43652344, + "router_z_loss_mlp": 0.22668457, + "step": 1204, + "time_per_iteration": 2.6620333194732666 + }, + { + "auxiliary_loss_clip": 0.01199979, + "auxiliary_loss_mlp": 0.010642, + "balance_loss_clip": 1.06014681, + "balance_loss_mlp": 1.04238474, + "epoch": 0.07244851946490305, + "flos": 34523094110400.0, + "grad_norm": 1.542743392252352, + "language_loss": 0.82340264, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84604448, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.21801758, + "step": 1205, + "time_per_iteration": 2.703396797180176 + }, + { + "auxiliary_loss_clip": 0.01204729, + "auxiliary_loss_mlp": 0.01063322, + "balance_loss_clip": 1.06214595, + "balance_loss_mlp": 1.04017138, + "epoch": 0.07250864271757101, + "flos": 27533806873920.0, + "grad_norm": 1.7236173043000431, + "language_loss": 0.76075351, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78343403, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.23156738, + "step": 1206, + "time_per_iteration": 2.675240993499756 + }, + { + "auxiliary_loss_clip": 0.01202855, + "auxiliary_loss_mlp": 0.01058003, + "balance_loss_clip": 1.06252265, + "balance_loss_mlp": 1.0345546, + "epoch": 0.072568765970239, + "flos": 26554603407840.0, + "grad_norm": 1.8304673945042855, + "language_loss": 0.77072734, + "learning_rate": 3.981022108368387e-06, + "loss": 0.79333591, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 1.40429688, + "router_z_loss_mlp": 0.23461914, + "step": 1207, + "time_per_iteration": 2.726236343383789 + }, + { + "auxiliary_loss_clip": 0.01198296, + "auxiliary_loss_mlp": 0.01051008, + "balance_loss_clip": 1.05986083, + "balance_loss_mlp": 1.02993155, + "epoch": 0.07262888922290696, + "flos": 31139614032480.0, + "grad_norm": 3.3065214347527436, + "language_loss": 0.79290348, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.81539655, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.21081543, + "step": 1208, + "time_per_iteration": 2.811807155609131 + }, + { + "auxiliary_loss_clip": 0.0119763, + "auxiliary_loss_mlp": 0.01056595, + "balance_loss_clip": 1.05999148, + "balance_loss_mlp": 1.03578115, + "epoch": 0.07268901247557492, + "flos": 25925580506880.0, + "grad_norm": 1.9472163751275304, + "language_loss": 0.78467858, + "learning_rate": 3.980914908292955e-06, + "loss": 0.80722082, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.20812988, + "step": 1209, + "time_per_iteration": 2.718797206878662 + }, + { + "auxiliary_loss_clip": 0.01200121, + "auxiliary_loss_mlp": 0.01062624, + "balance_loss_clip": 1.06127477, + "balance_loss_mlp": 1.0411427, + "epoch": 0.0727491357282429, + "flos": 31090675508640.0, + "grad_norm": 2.4539242016811746, + "language_loss": 0.81316376, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83579117, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.21484375, + "step": 1210, + "time_per_iteration": 2.7080042362213135 + }, + { + "auxiliary_loss_clip": 0.01198363, + "auxiliary_loss_mlp": 0.01060552, + "balance_loss_clip": 1.06147659, + "balance_loss_mlp": 1.03856993, + "epoch": 0.07280925898091087, + "flos": 29849079722400.0, + "grad_norm": 1.910842401748803, + "language_loss": 0.845972, + "learning_rate": 3.98080740775156e-06, + "loss": 0.86856121, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.21984863, + "step": 1211, + "time_per_iteration": 2.657660722732544 + }, + { + "auxiliary_loss_clip": 0.01197067, + "auxiliary_loss_mlp": 0.01050106, + "balance_loss_clip": 1.0593549, + "balance_loss_mlp": 1.02908933, + "epoch": 0.07286938223357883, + "flos": 22315843172160.0, + "grad_norm": 3.13370610538659, + "language_loss": 0.91005659, + "learning_rate": 3.98075354481122e-06, + "loss": 0.93252832, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 1.37695312, + "router_z_loss_mlp": 0.21020508, + "step": 1212, + "time_per_iteration": 2.6301708221435547 + }, + { + "auxiliary_loss_clip": 0.01197981, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.06018353, + "balance_loss_mlp": 1.034639, + "epoch": 0.07292950548624681, + "flos": 25886406647520.0, + "grad_norm": 1.7936388445905744, + "language_loss": 0.72675288, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.74929667, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.21740723, + "step": 1213, + "time_per_iteration": 2.6763627529144287 + }, + { + "auxiliary_loss_clip": 0.01199787, + "auxiliary_loss_mlp": 0.0105072, + "balance_loss_clip": 1.05970025, + "balance_loss_mlp": 1.0286181, + "epoch": 0.07298962873891478, + "flos": 30068773179840.0, + "grad_norm": 1.821017125074721, + "language_loss": 0.84277022, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86527532, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.22106934, + "step": 1214, + "time_per_iteration": 2.6950953006744385 + }, + { + "auxiliary_loss_clip": 0.0120778, + "auxiliary_loss_mlp": 0.01057783, + "balance_loss_clip": 1.06576145, + "balance_loss_mlp": 1.03555048, + "epoch": 0.07304975199158274, + "flos": 33010313754240.0, + "grad_norm": 2.0708428938186145, + "language_loss": 0.84348011, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86613566, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 1.41992188, + "router_z_loss_mlp": 0.22229004, + "step": 1215, + "time_per_iteration": 2.7687137126922607 + }, + { + "auxiliary_loss_clip": 0.01199014, + "auxiliary_loss_mlp": 0.0105832, + "balance_loss_clip": 1.05911279, + "balance_loss_mlp": 1.03546715, + "epoch": 0.07310987524425071, + "flos": 40937969388960.0, + "grad_norm": 2.031546092015698, + "language_loss": 0.81094718, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83352047, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.22851562, + "step": 1216, + "time_per_iteration": 2.7790281772613525 + }, + { + "auxiliary_loss_clip": 0.01202267, + "auxiliary_loss_mlp": 0.01050848, + "balance_loss_clip": 1.06249952, + "balance_loss_mlp": 1.02966404, + "epoch": 0.07316999849691869, + "flos": 34301577375360.0, + "grad_norm": 1.9371129973936134, + "language_loss": 0.75694537, + "learning_rate": 3.980483103494872e-06, + "loss": 0.77947652, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.21179199, + "step": 1217, + "time_per_iteration": 2.744704484939575 + }, + { + "auxiliary_loss_clip": 0.01198984, + "auxiliary_loss_mlp": 0.01053112, + "balance_loss_clip": 1.06134295, + "balance_loss_mlp": 1.03310871, + "epoch": 0.07323012174958665, + "flos": 17561374166880.0, + "grad_norm": 2.730551217915444, + "language_loss": 0.86375642, + "learning_rate": 3.98042878992303e-06, + "loss": 0.88627732, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.20019531, + "step": 1218, + "time_per_iteration": 2.6443886756896973 + }, + { + "auxiliary_loss_clip": 0.01198861, + "auxiliary_loss_mlp": 0.01058641, + "balance_loss_clip": 1.05895281, + "balance_loss_mlp": 1.03768456, + "epoch": 0.07329024500225462, + "flos": 26376474329280.0, + "grad_norm": 1.8568658355057592, + "language_loss": 0.86721557, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.88979059, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.20959473, + "step": 1219, + "time_per_iteration": 2.682075023651123 + }, + { + "auxiliary_loss_clip": 0.01200392, + "auxiliary_loss_mlp": 0.01054828, + "balance_loss_clip": 1.06129003, + "balance_loss_mlp": 1.03401411, + "epoch": 0.0733503682549226, + "flos": 16135612227360.0, + "grad_norm": 2.5650837961524444, + "language_loss": 0.84629989, + "learning_rate": 3.980319937487235e-06, + "loss": 0.86885214, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.20800781, + "step": 1220, + "time_per_iteration": 2.891373634338379 + }, + { + "auxiliary_loss_clip": 0.01201622, + "auxiliary_loss_mlp": 0.01057738, + "balance_loss_clip": 1.06164169, + "balance_loss_mlp": 1.03664947, + "epoch": 0.07341049150759056, + "flos": 25486315143840.0, + "grad_norm": 2.898952498301209, + "language_loss": 0.77479625, + "learning_rate": 3.98026539862741e-06, + "loss": 0.79738975, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 1.39941406, + "router_z_loss_mlp": 0.2109375, + "step": 1221, + "time_per_iteration": 2.722506523132324 + }, + { + "auxiliary_loss_clip": 0.01204453, + "auxiliary_loss_mlp": 0.01060103, + "balance_loss_clip": 1.0643971, + "balance_loss_mlp": 1.03845465, + "epoch": 0.07347061476025853, + "flos": 18807345819360.0, + "grad_norm": 2.1879538661079736, + "language_loss": 0.9183197, + "learning_rate": 3.980210784675722e-06, + "loss": 0.94096529, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 1.40039062, + "router_z_loss_mlp": 0.21643066, + "step": 1222, + "time_per_iteration": 2.620753049850464 + }, + { + "auxiliary_loss_clip": 0.01204576, + "auxiliary_loss_mlp": 0.01056044, + "balance_loss_clip": 1.06421697, + "balance_loss_mlp": 1.03515816, + "epoch": 0.0735307380129265, + "flos": 13554300503520.0, + "grad_norm": 2.264772800722224, + "language_loss": 0.908566, + "learning_rate": 3.980156095634242e-06, + "loss": 0.93117213, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.20898438, + "step": 1223, + "time_per_iteration": 2.664818048477173 + }, + { + "auxiliary_loss_clip": 0.01199993, + "auxiliary_loss_mlp": 0.01068487, + "balance_loss_clip": 1.06269395, + "balance_loss_mlp": 1.04704094, + "epoch": 0.07359086126559447, + "flos": 28958555881440.0, + "grad_norm": 1.9735247978138635, + "language_loss": 0.82111704, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84380186, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.21459961, + "step": 1224, + "time_per_iteration": 2.6798243522644043 + }, + { + "auxiliary_loss_clip": 0.01198246, + "auxiliary_loss_mlp": 0.01055277, + "balance_loss_clip": 1.05963159, + "balance_loss_mlp": 1.03269875, + "epoch": 0.07365098451826244, + "flos": 25616316044160.0, + "grad_norm": 2.1793245555517458, + "language_loss": 0.83085108, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.85338628, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.22570801, + "step": 1225, + "time_per_iteration": 2.660938262939453 + }, + { + "auxiliary_loss_clip": 0.01201805, + "auxiliary_loss_mlp": 0.01054797, + "balance_loss_clip": 1.06219482, + "balance_loss_mlp": 1.03305316, + "epoch": 0.0737111077709304, + "flos": 24323350697280.0, + "grad_norm": 5.177247993408391, + "language_loss": 0.90214276, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92470884, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.21740723, + "step": 1226, + "time_per_iteration": 2.6597588062286377 + }, + { + "auxiliary_loss_clip": 0.01208388, + "auxiliary_loss_mlp": 0.01049606, + "balance_loss_clip": 1.06112731, + "balance_loss_mlp": 1.02706349, + "epoch": 0.07377123102359838, + "flos": 20232499999680.0, + "grad_norm": 3.279722136218082, + "language_loss": 0.76459956, + "learning_rate": 3.97993658861193e-06, + "loss": 0.78717947, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 1.47363281, + "router_z_loss_mlp": 0.22546387, + "step": 1227, + "time_per_iteration": 2.653803825378418 + }, + { + "auxiliary_loss_clip": 0.01201051, + "auxiliary_loss_mlp": 0.01052645, + "balance_loss_clip": 1.06478274, + "balance_loss_mlp": 1.0312829, + "epoch": 0.07383135427626634, + "flos": 34567251595200.0, + "grad_norm": 2.5323436680858933, + "language_loss": 0.85589689, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.87843388, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.21374512, + "step": 1228, + "time_per_iteration": 2.741788625717163 + }, + { + "auxiliary_loss_clip": 0.01201822, + "auxiliary_loss_mlp": 0.0106109, + "balance_loss_clip": 1.06053126, + "balance_loss_mlp": 1.03939342, + "epoch": 0.07389147752893431, + "flos": 24461576605440.0, + "grad_norm": 2.160867908147033, + "language_loss": 0.80116254, + "learning_rate": 3.97982638461608e-06, + "loss": 0.82379168, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.21704102, + "step": 1229, + "time_per_iteration": 2.665271520614624 + }, + { + "auxiliary_loss_clip": 0.01204329, + "auxiliary_loss_mlp": 0.01059374, + "balance_loss_clip": 1.06320691, + "balance_loss_mlp": 1.03691506, + "epoch": 0.07395160078160229, + "flos": 22102672996800.0, + "grad_norm": 2.918173482626889, + "language_loss": 0.78303552, + "learning_rate": 3.979771170004287e-06, + "loss": 0.80567253, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.22460938, + "step": 1230, + "time_per_iteration": 2.64599347114563 + }, + { + "auxiliary_loss_clip": 0.01199429, + "auxiliary_loss_mlp": 0.01051111, + "balance_loss_clip": 1.06252527, + "balance_loss_mlp": 1.0288074, + "epoch": 0.07401172403427025, + "flos": 28780994044800.0, + "grad_norm": 1.8716671572585768, + "language_loss": 0.81222683, + "learning_rate": 3.979715880319372e-06, + "loss": 0.83473229, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.2232666, + "step": 1231, + "time_per_iteration": 2.759472131729126 + }, + { + "auxiliary_loss_clip": 0.01203161, + "auxiliary_loss_mlp": 0.01061839, + "balance_loss_clip": 1.06046164, + "balance_loss_mlp": 1.03986919, + "epoch": 0.07407184728693822, + "flos": 32164798260960.0, + "grad_norm": 3.269334886128877, + "language_loss": 0.94934642, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97199637, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 1.42871094, + "router_z_loss_mlp": 0.21984863, + "step": 1232, + "time_per_iteration": 2.711080312728882 + }, + { + "auxiliary_loss_clip": 0.01201603, + "auxiliary_loss_mlp": 0.01058618, + "balance_loss_clip": 1.06427956, + "balance_loss_mlp": 1.03856707, + "epoch": 0.0741319705396062, + "flos": 27089112195360.0, + "grad_norm": 2.31553964292577, + "language_loss": 0.80983484, + "learning_rate": 3.979605075738569e-06, + "loss": 0.83243704, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.20056152, + "step": 1233, + "time_per_iteration": 2.9286115169525146 + }, + { + "auxiliary_loss_clip": 0.01203821, + "auxiliary_loss_mlp": 0.01055221, + "balance_loss_clip": 1.06117058, + "balance_loss_mlp": 1.03180826, + "epoch": 0.07419209379227416, + "flos": 47834079582240.0, + "grad_norm": 2.1901508079936356, + "language_loss": 0.70613253, + "learning_rate": 3.979549560846883e-06, + "loss": 0.72872287, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.234375, + "step": 1234, + "time_per_iteration": 2.825446128845215 + }, + { + "auxiliary_loss_clip": 0.01202026, + "auxiliary_loss_mlp": 0.01059272, + "balance_loss_clip": 1.06359935, + "balance_loss_mlp": 1.03796971, + "epoch": 0.07425221704494213, + "flos": 27798589713600.0, + "grad_norm": 1.7958200685141634, + "language_loss": 0.76956236, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79217535, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.21313477, + "step": 1235, + "time_per_iteration": 2.662508487701416 + }, + { + "auxiliary_loss_clip": 0.0119835, + "auxiliary_loss_mlp": 0.01055145, + "balance_loss_clip": 1.06297851, + "balance_loss_mlp": 1.03433108, + "epoch": 0.0743123402976101, + "flos": 27177670268640.0, + "grad_norm": 2.0088885946045227, + "language_loss": 0.82420838, + "learning_rate": 3.979438305871464e-06, + "loss": 0.84674335, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.20812988, + "step": 1236, + "time_per_iteration": 2.679166793823242 + }, + { + "auxiliary_loss_clip": 0.01203955, + "auxiliary_loss_mlp": 0.01057403, + "balance_loss_clip": 1.06267142, + "balance_loss_mlp": 1.0353967, + "epoch": 0.07437246355027807, + "flos": 35770889040480.0, + "grad_norm": 1.9992783650801491, + "language_loss": 0.7560308, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77864444, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 1.41210938, + "router_z_loss_mlp": 0.2199707, + "step": 1237, + "time_per_iteration": 2.7117867469787598 + }, + { + "auxiliary_loss_clip": 0.01197983, + "auxiliary_loss_mlp": 0.01059066, + "balance_loss_clip": 1.05914271, + "balance_loss_mlp": 1.03831136, + "epoch": 0.07443258680294604, + "flos": 38979602974080.0, + "grad_norm": 1.7207343341595545, + "language_loss": 0.77704775, + "learning_rate": 3.979326750654053e-06, + "loss": 0.79961824, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.2076416, + "step": 1238, + "time_per_iteration": 2.7752552032470703 + }, + { + "auxiliary_loss_clip": 0.01206972, + "auxiliary_loss_mlp": 0.01053218, + "balance_loss_clip": 1.06386387, + "balance_loss_mlp": 1.03148615, + "epoch": 0.074492710055614, + "flos": 27668507778720.0, + "grad_norm": 2.5276679812493374, + "language_loss": 0.86594421, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.88854611, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.21728516, + "step": 1239, + "time_per_iteration": 2.6843667030334473 + }, + { + "auxiliary_loss_clip": 0.01199061, + "auxiliary_loss_mlp": 0.01049391, + "balance_loss_clip": 1.05986857, + "balance_loss_mlp": 1.02672911, + "epoch": 0.07455283330828198, + "flos": 25972209545760.0, + "grad_norm": 2.180740079650229, + "language_loss": 0.88510597, + "learning_rate": 3.979214895211569e-06, + "loss": 0.90759045, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 1.38867188, + "router_z_loss_mlp": 0.2265625, + "step": 1240, + "time_per_iteration": 4.08722448348999 + }, + { + "auxiliary_loss_clip": 0.01204291, + "auxiliary_loss_mlp": 0.01057501, + "balance_loss_clip": 1.0648644, + "balance_loss_mlp": 1.03517342, + "epoch": 0.07461295656094995, + "flos": 29759144061600.0, + "grad_norm": 3.086612489966232, + "language_loss": 0.88843238, + "learning_rate": 3.979158854911225e-06, + "loss": 0.91105032, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.22338867, + "step": 1241, + "time_per_iteration": 4.242807149887085 + }, + { + "auxiliary_loss_clip": 0.01101084, + "auxiliary_loss_mlp": 0.01007301, + "balance_loss_clip": 1.03872633, + "balance_loss_mlp": 1.00372744, + "epoch": 0.07467307981361791, + "flos": 75786615908640.0, + "grad_norm": 0.898979614842683, + "language_loss": 0.63064069, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65172458, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 0.62304688, + "router_z_loss_mlp": 0.03570557, + "step": 1242, + "time_per_iteration": 6.273852586746216 + }, + { + "auxiliary_loss_clip": 0.01212112, + "auxiliary_loss_mlp": 0.01057863, + "balance_loss_clip": 1.06412363, + "balance_loss_mlp": 1.03386593, + "epoch": 0.07473320306628589, + "flos": 30338418093120.0, + "grad_norm": 2.9130377280005164, + "language_loss": 0.62878114, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65148085, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 1.47851562, + "router_z_loss_mlp": 0.2401123, + "step": 1243, + "time_per_iteration": 2.791933298110962 + }, + { + "auxiliary_loss_clip": 0.01201512, + "auxiliary_loss_mlp": 0.01050209, + "balance_loss_clip": 1.06346273, + "balance_loss_mlp": 1.02882254, + "epoch": 0.07479332631895386, + "flos": 30380468679360.0, + "grad_norm": 1.876068656940475, + "language_loss": 0.76518846, + "learning_rate": 3.978990283719296e-06, + "loss": 0.7877056, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.21411133, + "step": 1244, + "time_per_iteration": 2.8496463298797607 + }, + { + "auxiliary_loss_clip": 0.01205019, + "auxiliary_loss_mlp": 0.0105444, + "balance_loss_clip": 1.06369376, + "balance_loss_mlp": 1.03288674, + "epoch": 0.07485344957162182, + "flos": 21738595004640.0, + "grad_norm": 5.240437460167169, + "language_loss": 0.68171239, + "learning_rate": 3.978933943232123e-06, + "loss": 0.70430696, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.21533203, + "step": 1245, + "time_per_iteration": 2.6553447246551514 + }, + { + "auxiliary_loss_clip": 0.01201157, + "auxiliary_loss_mlp": 0.01057368, + "balance_loss_clip": 1.06205845, + "balance_loss_mlp": 1.03530216, + "epoch": 0.0749135728242898, + "flos": 30517762690080.0, + "grad_norm": 1.7912991301055439, + "language_loss": 0.88714874, + "learning_rate": 3.978877527703576e-06, + "loss": 0.90973401, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.22070312, + "step": 1246, + "time_per_iteration": 2.716329574584961 + }, + { + "auxiliary_loss_clip": 0.01213427, + "auxiliary_loss_mlp": 0.01063826, + "balance_loss_clip": 1.06517386, + "balance_loss_mlp": 1.03930509, + "epoch": 0.07497369607695777, + "flos": 21746293287840.0, + "grad_norm": 2.9612529734195383, + "language_loss": 0.88454771, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.90732026, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 1.48339844, + "router_z_loss_mlp": 0.24523926, + "step": 1247, + "time_per_iteration": 2.6865904331207275 + }, + { + "auxiliary_loss_clip": 0.01202465, + "auxiliary_loss_mlp": 0.01060463, + "balance_loss_clip": 1.06293225, + "balance_loss_mlp": 1.0377655, + "epoch": 0.07503381932962573, + "flos": 18450439385760.0, + "grad_norm": 2.2906660956190543, + "language_loss": 0.64344287, + "learning_rate": 3.978764471530921e-06, + "loss": 0.66607213, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 1.39355469, + "router_z_loss_mlp": 0.22692871, + "step": 1248, + "time_per_iteration": 2.649332046508789 + }, + { + "auxiliary_loss_clip": 0.01202027, + "auxiliary_loss_mlp": 0.01059198, + "balance_loss_clip": 1.0665741, + "balance_loss_mlp": 1.03942096, + "epoch": 0.0750939425822937, + "flos": 15639021263520.0, + "grad_norm": 2.214715728894331, + "language_loss": 0.74455094, + "learning_rate": 3.978707830891102e-06, + "loss": 0.76716322, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.19775391, + "step": 1249, + "time_per_iteration": 2.6200790405273438 + }, + { + "auxiliary_loss_clip": 0.01205536, + "auxiliary_loss_mlp": 0.01067552, + "balance_loss_clip": 1.06327271, + "balance_loss_mlp": 1.04493833, + "epoch": 0.07515406583496168, + "flos": 29537505774720.0, + "grad_norm": 2.5880971196729847, + "language_loss": 0.82189035, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84462124, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.22595215, + "step": 1250, + "time_per_iteration": 2.685040235519409 + }, + { + "auxiliary_loss_clip": 0.01203888, + "auxiliary_loss_mlp": 0.01056851, + "balance_loss_clip": 1.06598222, + "balance_loss_mlp": 1.03514302, + "epoch": 0.07521418908762964, + "flos": 32565375972000.0, + "grad_norm": 2.113717932858604, + "language_loss": 0.67009485, + "learning_rate": 3.978594324515215e-06, + "loss": 0.69270229, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.21704102, + "step": 1251, + "time_per_iteration": 2.6769466400146484 + }, + { + "auxiliary_loss_clip": 0.01091761, + "auxiliary_loss_mlp": 0.01004351, + "balance_loss_clip": 1.03229833, + "balance_loss_mlp": 1.00115895, + "epoch": 0.0752743123402976, + "flos": 72106796380320.0, + "grad_norm": 0.9336427246050035, + "language_loss": 0.70405197, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72501314, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 0.59472656, + "router_z_loss_mlp": 0.0319519, + "step": 1252, + "time_per_iteration": 3.327927350997925 + }, + { + "auxiliary_loss_clip": 0.01200402, + "auxiliary_loss_mlp": 0.01061772, + "balance_loss_clip": 1.06137919, + "balance_loss_mlp": 1.03951526, + "epoch": 0.07533443559296558, + "flos": 28646941416480.0, + "grad_norm": 2.1960444630685965, + "language_loss": 0.79860461, + "learning_rate": 3.97848051802535e-06, + "loss": 0.82122636, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 1.390625, + "router_z_loss_mlp": 0.22253418, + "step": 1253, + "time_per_iteration": 2.6735002994537354 + }, + { + "auxiliary_loss_clip": 0.01205612, + "auxiliary_loss_mlp": 0.01061159, + "balance_loss_clip": 1.06485105, + "balance_loss_mlp": 1.03935528, + "epoch": 0.07539455884563355, + "flos": 25477441859520.0, + "grad_norm": 2.4819416078057417, + "language_loss": 0.93764925, + "learning_rate": 3.978423502243069e-06, + "loss": 0.96031696, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 1.40820312, + "router_z_loss_mlp": 0.21789551, + "step": 1254, + "time_per_iteration": 2.653294563293457 + }, + { + "auxiliary_loss_clip": 0.01195724, + "auxiliary_loss_mlp": 0.01059457, + "balance_loss_clip": 1.06050444, + "balance_loss_mlp": 1.03805947, + "epoch": 0.07545468209830151, + "flos": 33767068587840.0, + "grad_norm": 2.095723185318906, + "language_loss": 0.88108027, + "learning_rate": 3.97836641143877e-06, + "loss": 0.9036321, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 1.3515625, + "router_z_loss_mlp": 0.21411133, + "step": 1255, + "time_per_iteration": 2.718101739883423 + }, + { + "auxiliary_loss_clip": 0.0119666, + "auxiliary_loss_mlp": 0.01063061, + "balance_loss_clip": 1.06056976, + "balance_loss_mlp": 1.04088783, + "epoch": 0.0755148053509695, + "flos": 17249476080960.0, + "grad_norm": 2.10744564463702, + "language_loss": 0.79492563, + "learning_rate": 3.978309245614618e-06, + "loss": 0.81752282, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.22155762, + "step": 1256, + "time_per_iteration": 2.8765013217926025 + }, + { + "auxiliary_loss_clip": 0.01089011, + "auxiliary_loss_mlp": 0.01003324, + "balance_loss_clip": 1.03019345, + "balance_loss_mlp": 1.00008798, + "epoch": 0.07557492860363746, + "flos": 71059408682400.0, + "grad_norm": 0.7703073823374661, + "language_loss": 0.58022934, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60115266, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 0.58837891, + "router_z_loss_mlp": 0.03240967, + "step": 1257, + "time_per_iteration": 3.386434555053711 + }, + { + "auxiliary_loss_clip": 0.01203469, + "auxiliary_loss_mlp": 0.01062949, + "balance_loss_clip": 1.0663203, + "balance_loss_mlp": 1.04097831, + "epoch": 0.07563505185630542, + "flos": 30071447320320.0, + "grad_norm": 2.1496525887282565, + "language_loss": 0.8974911, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92015523, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 1.37109375, + "router_z_loss_mlp": 0.21960449, + "step": 1258, + "time_per_iteration": 2.7777047157287598 + }, + { + "auxiliary_loss_clip": 0.01200385, + "auxiliary_loss_mlp": 0.01054751, + "balance_loss_clip": 1.06507039, + "balance_loss_mlp": 1.03394914, + "epoch": 0.07569517510897339, + "flos": 18940831205760.0, + "grad_norm": 1.9149704995600265, + "language_loss": 0.81122613, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83377755, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 1.35351562, + "router_z_loss_mlp": 0.20800781, + "step": 1259, + "time_per_iteration": 2.6751928329467773 + }, + { + "auxiliary_loss_clip": 0.01202701, + "auxiliary_loss_mlp": 0.01058614, + "balance_loss_clip": 1.06403434, + "balance_loss_mlp": 1.03758502, + "epoch": 0.07575529836164137, + "flos": 27977731724160.0, + "grad_norm": 3.467575950979985, + "language_loss": 0.76066005, + "learning_rate": 3.978079832162885e-06, + "loss": 0.78327322, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.21020508, + "step": 1260, + "time_per_iteration": 2.6621203422546387 + }, + { + "auxiliary_loss_clip": 0.01200463, + "auxiliary_loss_mlp": 0.01057015, + "balance_loss_clip": 1.06166005, + "balance_loss_mlp": 1.03419828, + "epoch": 0.07581542161430933, + "flos": 23794352259840.0, + "grad_norm": 1.8141752053445712, + "language_loss": 0.8465488, + "learning_rate": 3.978022291272044e-06, + "loss": 0.86912358, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.22802734, + "step": 1261, + "time_per_iteration": 2.645848035812378 + }, + { + "auxiliary_loss_clip": 0.01206935, + "auxiliary_loss_mlp": 0.01058718, + "balance_loss_clip": 1.06750274, + "balance_loss_mlp": 1.03768992, + "epoch": 0.0758755448669773, + "flos": 30472956928800.0, + "grad_norm": 2.22313429949623, + "language_loss": 0.82538569, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84804225, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.21032715, + "step": 1262, + "time_per_iteration": 2.7255606651306152 + }, + { + "auxiliary_loss_clip": 0.01200252, + "auxiliary_loss_mlp": 0.01056722, + "balance_loss_clip": 1.06161344, + "balance_loss_mlp": 1.03497863, + "epoch": 0.07593566811964528, + "flos": 27757227921120.0, + "grad_norm": 2.800291500697572, + "language_loss": 0.82494605, + "learning_rate": 3.977906984472136e-06, + "loss": 0.84751582, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.21728516, + "step": 1263, + "time_per_iteration": 2.67413592338562 + }, + { + "auxiliary_loss_clip": 0.01204571, + "auxiliary_loss_mlp": 0.01057482, + "balance_loss_clip": 1.06347942, + "balance_loss_mlp": 1.03620338, + "epoch": 0.07599579137231324, + "flos": 28201963116960.0, + "grad_norm": 1.9533786073904262, + "language_loss": 0.75903398, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78165448, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 1.41113281, + "router_z_loss_mlp": 0.21276855, + "step": 1264, + "time_per_iteration": 2.689699649810791 + }, + { + "auxiliary_loss_clip": 0.01203131, + "auxiliary_loss_mlp": 0.010607, + "balance_loss_clip": 1.06338501, + "balance_loss_mlp": 1.03886044, + "epoch": 0.07605591462498121, + "flos": 17694940587840.0, + "grad_norm": 2.0991820638643257, + "language_loss": 0.80989408, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83253241, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 1.39746094, + "router_z_loss_mlp": 0.21850586, + "step": 1265, + "time_per_iteration": 2.625455379486084 + }, + { + "auxiliary_loss_clip": 0.01204198, + "auxiliary_loss_mlp": 0.01053996, + "balance_loss_clip": 1.06312513, + "balance_loss_mlp": 1.03272891, + "epoch": 0.07611603787764919, + "flos": 28692233385120.0, + "grad_norm": 2.119865323237123, + "language_loss": 0.65515935, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67774129, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.21289062, + "step": 1266, + "time_per_iteration": 2.669536590576172 + }, + { + "auxiliary_loss_clip": 0.01200141, + "auxiliary_loss_mlp": 0.01059872, + "balance_loss_clip": 1.06028199, + "balance_loss_mlp": 1.03831923, + "epoch": 0.07617616113031715, + "flos": 26243151012000.0, + "grad_norm": 3.025382845819438, + "language_loss": 0.79308909, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81568921, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.21557617, + "step": 1267, + "time_per_iteration": 2.644402265548706 + }, + { + "auxiliary_loss_clip": 0.01198731, + "auxiliary_loss_mlp": 0.01047965, + "balance_loss_clip": 1.0599854, + "balance_loss_mlp": 1.02767563, + "epoch": 0.07623628438298512, + "flos": 17781877969920.0, + "grad_norm": 3.0449297431563638, + "language_loss": 0.72830236, + "learning_rate": 3.977617404968205e-06, + "loss": 0.75076932, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.20275879, + "step": 1268, + "time_per_iteration": 2.6619601249694824 + }, + { + "auxiliary_loss_clip": 0.01199869, + "auxiliary_loss_mlp": 0.0105411, + "balance_loss_clip": 1.06074834, + "balance_loss_mlp": 1.03271163, + "epoch": 0.07629640763565308, + "flos": 17961425153280.0, + "grad_norm": 2.296619662613228, + "language_loss": 0.82217896, + "learning_rate": 3.977559264084269e-06, + "loss": 0.84471869, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.21398926, + "step": 1269, + "time_per_iteration": 2.892094850540161 + }, + { + "auxiliary_loss_clip": 0.01201755, + "auxiliary_loss_mlp": 0.01057504, + "balance_loss_clip": 1.06394863, + "balance_loss_mlp": 1.03583157, + "epoch": 0.07635653088832106, + "flos": 18184157406720.0, + "grad_norm": 2.2205575457304074, + "language_loss": 0.88350177, + "learning_rate": 3.977501048211088e-06, + "loss": 0.90609443, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.2166748, + "step": 1270, + "time_per_iteration": 2.6459062099456787 + }, + { + "auxiliary_loss_clip": 0.01204328, + "auxiliary_loss_mlp": 0.0105311, + "balance_loss_clip": 1.06294668, + "balance_loss_mlp": 1.03195, + "epoch": 0.07641665414098903, + "flos": 32520732279840.0, + "grad_norm": 2.0152054152625616, + "language_loss": 0.71159887, + "learning_rate": 3.977442757350869e-06, + "loss": 0.7341733, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 1.41308594, + "router_z_loss_mlp": 0.21118164, + "step": 1271, + "time_per_iteration": 2.6813459396362305 + }, + { + "auxiliary_loss_clip": 0.01197601, + "auxiliary_loss_mlp": 0.01060937, + "balance_loss_clip": 1.06456244, + "balance_loss_mlp": 1.03944373, + "epoch": 0.07647677739365699, + "flos": 30740575978080.0, + "grad_norm": 1.672955048401239, + "language_loss": 0.82930768, + "learning_rate": 3.977384391505823e-06, + "loss": 0.85189307, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.21496582, + "step": 1272, + "time_per_iteration": 2.675403356552124 + }, + { + "auxiliary_loss_clip": 0.01199121, + "auxiliary_loss_mlp": 0.01058811, + "balance_loss_clip": 1.06050456, + "balance_loss_mlp": 1.0379492, + "epoch": 0.07653690064632497, + "flos": 25085089156320.0, + "grad_norm": 1.8573335194456024, + "language_loss": 0.79990935, + "learning_rate": 3.977325950678162e-06, + "loss": 0.82248873, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.20874023, + "step": 1273, + "time_per_iteration": 2.6358067989349365 + }, + { + "auxiliary_loss_clip": 0.01202202, + "auxiliary_loss_mlp": 0.01058483, + "balance_loss_clip": 1.06303906, + "balance_loss_mlp": 1.03677475, + "epoch": 0.07659702389899294, + "flos": 27173537506080.0, + "grad_norm": 1.9954510256914955, + "language_loss": 0.81180555, + "learning_rate": 3.977267434870103e-06, + "loss": 0.8344124, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.21704102, + "step": 1274, + "time_per_iteration": 2.7449004650115967 + }, + { + "auxiliary_loss_clip": 0.0120062, + "auxiliary_loss_mlp": 0.01061898, + "balance_loss_clip": 1.06216681, + "balance_loss_mlp": 1.0396173, + "epoch": 0.0766571471516609, + "flos": 39823457258880.0, + "grad_norm": 1.7765455043607146, + "language_loss": 0.73036671, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75299186, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.22290039, + "step": 1275, + "time_per_iteration": 2.7389655113220215 + }, + { + "auxiliary_loss_clip": 0.01201099, + "auxiliary_loss_mlp": 0.01060094, + "balance_loss_clip": 1.06293535, + "balance_loss_mlp": 1.03788519, + "epoch": 0.07671727040432888, + "flos": 18628933119840.0, + "grad_norm": 4.65115759895111, + "language_loss": 0.79475182, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.81736374, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 1.38183594, + "router_z_loss_mlp": 0.2220459, + "step": 1276, + "time_per_iteration": 2.6303746700286865 + }, + { + "auxiliary_loss_clip": 0.01203973, + "auxiliary_loss_mlp": 0.01052125, + "balance_loss_clip": 1.06422305, + "balance_loss_mlp": 1.03082204, + "epoch": 0.07677739365699685, + "flos": 34390824242400.0, + "grad_norm": 2.6590077955270144, + "language_loss": 0.59568143, + "learning_rate": 3.97709143758574e-06, + "loss": 0.61824244, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 1.39648438, + "router_z_loss_mlp": 0.21289062, + "step": 1277, + "time_per_iteration": 2.78387713432312 + }, + { + "auxiliary_loss_clip": 0.01207046, + "auxiliary_loss_mlp": 0.01053959, + "balance_loss_clip": 1.06416047, + "balance_loss_mlp": 1.03282285, + "epoch": 0.07683751690966481, + "flos": 22324797491040.0, + "grad_norm": 2.5773501621796293, + "language_loss": 0.75019121, + "learning_rate": 3.977032621878305e-06, + "loss": 0.77280128, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 1.43066406, + "router_z_loss_mlp": 0.21142578, + "step": 1278, + "time_per_iteration": 2.6445071697235107 + }, + { + "auxiliary_loss_clip": 0.01199756, + "auxiliary_loss_mlp": 0.01053928, + "balance_loss_clip": 1.06286931, + "balance_loss_mlp": 1.03279257, + "epoch": 0.07689764016233278, + "flos": 26821493146080.0, + "grad_norm": 2.0761155098163533, + "language_loss": 0.88460904, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90714586, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.21130371, + "step": 1279, + "time_per_iteration": 4.111357927322388 + }, + { + "auxiliary_loss_clip": 0.01198111, + "auxiliary_loss_mlp": 0.01056074, + "balance_loss_clip": 1.06268239, + "balance_loss_mlp": 1.0349381, + "epoch": 0.07695776341500075, + "flos": 27132580886400.0, + "grad_norm": 2.648517521559764, + "language_loss": 0.82679766, + "learning_rate": 3.976914765557845e-06, + "loss": 0.84933949, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.21118164, + "step": 1280, + "time_per_iteration": 4.1533379554748535 + }, + { + "auxiliary_loss_clip": 0.011981, + "auxiliary_loss_mlp": 0.01055173, + "balance_loss_clip": 1.06392765, + "balance_loss_mlp": 1.03351235, + "epoch": 0.07701788666766872, + "flos": 19698274833120.0, + "grad_norm": 2.3237088423192187, + "language_loss": 0.76036161, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.78289437, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.21655273, + "step": 1281, + "time_per_iteration": 4.3687639236450195 + }, + { + "auxiliary_loss_clip": 0.01207061, + "auxiliary_loss_mlp": 0.0105554, + "balance_loss_clip": 1.0646311, + "balance_loss_mlp": 1.03373623, + "epoch": 0.07707800992033668, + "flos": 23748817187520.0, + "grad_norm": 1.9164257803453284, + "language_loss": 0.75065851, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77328449, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 1.42578125, + "router_z_loss_mlp": 0.21801758, + "step": 1282, + "time_per_iteration": 2.635479688644409 + }, + { + "auxiliary_loss_clip": 0.01202477, + "auxiliary_loss_mlp": 0.0106195, + "balance_loss_clip": 1.06567097, + "balance_loss_mlp": 1.04052854, + "epoch": 0.07713813317300466, + "flos": 23171771606400.0, + "grad_norm": 2.1488727097323976, + "language_loss": 0.83860511, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86124939, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.2142334, + "step": 1283, + "time_per_iteration": 2.6458563804626465 + }, + { + "auxiliary_loss_clip": 0.01203113, + "auxiliary_loss_mlp": 0.01065467, + "balance_loss_clip": 1.06434882, + "balance_loss_mlp": 1.04229271, + "epoch": 0.07719825642567263, + "flos": 22102470410400.0, + "grad_norm": 2.8312663388063477, + "language_loss": 0.74791282, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77059865, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 1.38769531, + "router_z_loss_mlp": 0.23193359, + "step": 1284, + "time_per_iteration": 2.623908758163452 + }, + { + "auxiliary_loss_clip": 0.01199191, + "auxiliary_loss_mlp": 0.01065517, + "balance_loss_clip": 1.06373596, + "balance_loss_mlp": 1.04476309, + "epoch": 0.0772583796783406, + "flos": 51487326125280.0, + "grad_norm": 3.022681937910575, + "language_loss": 0.76409191, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78673899, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.20751953, + "step": 1285, + "time_per_iteration": 2.8615782260894775 + }, + { + "auxiliary_loss_clip": 0.01208766, + "auxiliary_loss_mlp": 0.01062391, + "balance_loss_clip": 1.06964302, + "balance_loss_mlp": 1.04188669, + "epoch": 0.07731850293100857, + "flos": 30204851672160.0, + "grad_norm": 1.854089964989223, + "language_loss": 0.83956546, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86227709, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 1.39160156, + "router_z_loss_mlp": 0.2052002, + "step": 1286, + "time_per_iteration": 2.6802892684936523 + }, + { + "auxiliary_loss_clip": 0.01205751, + "auxiliary_loss_mlp": 0.01056301, + "balance_loss_clip": 1.06372666, + "balance_loss_mlp": 1.03474796, + "epoch": 0.07737862618367654, + "flos": 21433746925440.0, + "grad_norm": 3.076929374921907, + "language_loss": 0.77718282, + "learning_rate": 3.97649990716259e-06, + "loss": 0.79980332, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 1.42089844, + "router_z_loss_mlp": 0.21557617, + "step": 1287, + "time_per_iteration": 2.6500489711761475 + }, + { + "auxiliary_loss_clip": 0.01199279, + "auxiliary_loss_mlp": 0.01053125, + "balance_loss_clip": 1.06352854, + "balance_loss_mlp": 1.03271651, + "epoch": 0.0774387494363445, + "flos": 31270911485760.0, + "grad_norm": 1.9216686461522723, + "language_loss": 0.84662437, + "learning_rate": 3.976440341863237e-06, + "loss": 0.86914837, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.20422363, + "step": 1288, + "time_per_iteration": 2.703702688217163 + }, + { + "auxiliary_loss_clip": 0.012008, + "auxiliary_loss_mlp": 0.01056953, + "balance_loss_clip": 1.06133783, + "balance_loss_mlp": 1.03612733, + "epoch": 0.07749887268901248, + "flos": 14934446336160.0, + "grad_norm": 2.341836076523423, + "language_loss": 0.84925789, + "learning_rate": 3.976380701617068e-06, + "loss": 0.87183541, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 1.39257812, + "router_z_loss_mlp": 0.20837402, + "step": 1289, + "time_per_iteration": 2.6385018825531006 + }, + { + "auxiliary_loss_clip": 0.01200967, + "auxiliary_loss_mlp": 0.01048148, + "balance_loss_clip": 1.06298113, + "balance_loss_mlp": 1.02775145, + "epoch": 0.07755899594168045, + "flos": 30605186279520.0, + "grad_norm": 2.197190321085421, + "language_loss": 0.85413986, + "learning_rate": 3.976320986426344e-06, + "loss": 0.87663096, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.20385742, + "step": 1290, + "time_per_iteration": 2.689152956008911 + }, + { + "auxiliary_loss_clip": 0.01197297, + "auxiliary_loss_mlp": 0.01055208, + "balance_loss_clip": 1.06453836, + "balance_loss_mlp": 1.03299952, + "epoch": 0.07761911919434841, + "flos": 17383001984640.0, + "grad_norm": 2.9622268727120824, + "language_loss": 0.90980363, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.93232864, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.22192383, + "step": 1291, + "time_per_iteration": 2.6529293060302734 + }, + { + "auxiliary_loss_clip": 0.01091735, + "auxiliary_loss_mlp": 0.01017163, + "balance_loss_clip": 1.03352678, + "balance_loss_mlp": 1.01411688, + "epoch": 0.07767924244701638, + "flos": 82041837988320.0, + "grad_norm": 0.890301713392948, + "language_loss": 0.65061605, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67170501, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 0.58105469, + "router_z_loss_mlp": 0.03044128, + "step": 1292, + "time_per_iteration": 3.3774068355560303 + }, + { + "auxiliary_loss_clip": 0.01200701, + "auxiliary_loss_mlp": 0.01052857, + "balance_loss_clip": 1.06316411, + "balance_loss_mlp": 1.03175688, + "epoch": 0.07773936569968436, + "flos": 34837949957760.0, + "grad_norm": 1.627977617686155, + "language_loss": 0.87774712, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.90028262, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.21081543, + "step": 1293, + "time_per_iteration": 3.0061373710632324 + }, + { + "auxiliary_loss_clip": 0.01201119, + "auxiliary_loss_mlp": 0.01058949, + "balance_loss_clip": 1.06382656, + "balance_loss_mlp": 1.03658557, + "epoch": 0.07779948895235232, + "flos": 33544214782560.0, + "grad_norm": 2.799523247353814, + "language_loss": 0.84912986, + "learning_rate": 3.976081376263239e-06, + "loss": 0.87173051, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 1.37402344, + "router_z_loss_mlp": 0.22375488, + "step": 1294, + "time_per_iteration": 2.7448997497558594 + }, + { + "auxiliary_loss_clip": 0.01202587, + "auxiliary_loss_mlp": 0.0105993, + "balance_loss_clip": 1.06449831, + "balance_loss_mlp": 1.0379833, + "epoch": 0.07785961220502029, + "flos": 22235915279520.0, + "grad_norm": 2.8215275593466353, + "language_loss": 0.79312962, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81575477, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.21960449, + "step": 1295, + "time_per_iteration": 2.6228480339050293 + }, + { + "auxiliary_loss_clip": 0.01196326, + "auxiliary_loss_mlp": 0.01053843, + "balance_loss_clip": 1.06081235, + "balance_loss_mlp": 1.03205121, + "epoch": 0.07791973545768827, + "flos": 30027289835520.0, + "grad_norm": 2.374450671230686, + "language_loss": 0.87834489, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90084654, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.21813965, + "step": 1296, + "time_per_iteration": 2.67250394821167 + }, + { + "auxiliary_loss_clip": 0.01198703, + "auxiliary_loss_mlp": 0.01054235, + "balance_loss_clip": 1.06003618, + "balance_loss_mlp": 1.03193092, + "epoch": 0.07797985871035623, + "flos": 17426592227520.0, + "grad_norm": 2.554126707217665, + "language_loss": 0.9608885, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.98341787, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 1.38671875, + "router_z_loss_mlp": 0.2232666, + "step": 1297, + "time_per_iteration": 2.6054089069366455 + }, + { + "auxiliary_loss_clip": 0.01196309, + "auxiliary_loss_mlp": 0.01049433, + "balance_loss_clip": 1.05772829, + "balance_loss_mlp": 1.02873778, + "epoch": 0.0780399819630242, + "flos": 32470983410400.0, + "grad_norm": 2.4198723636207538, + "language_loss": 0.76039517, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78285265, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 1.38574219, + "router_z_loss_mlp": 0.20678711, + "step": 1298, + "time_per_iteration": 2.7001380920410156 + }, + { + "auxiliary_loss_clip": 0.01197746, + "auxiliary_loss_mlp": 0.01060612, + "balance_loss_clip": 1.06248629, + "balance_loss_mlp": 1.04019165, + "epoch": 0.07810010521569218, + "flos": 26642675273760.0, + "grad_norm": 1.693620391451028, + "language_loss": 0.80614531, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.82872891, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.20422363, + "step": 1299, + "time_per_iteration": 2.6833701133728027 + }, + { + "auxiliary_loss_clip": 0.01193156, + "auxiliary_loss_mlp": 0.01054113, + "balance_loss_clip": 1.06037545, + "balance_loss_mlp": 1.03286982, + "epoch": 0.07816022846836014, + "flos": 30604619037600.0, + "grad_norm": 1.933669613032188, + "language_loss": 0.86319256, + "learning_rate": 3.975719713068202e-06, + "loss": 0.8856653, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.21240234, + "step": 1300, + "time_per_iteration": 2.6774864196777344 + }, + { + "auxiliary_loss_clip": 0.01194527, + "auxiliary_loss_mlp": 0.01049196, + "balance_loss_clip": 1.05905974, + "balance_loss_mlp": 1.02759504, + "epoch": 0.0782203517210281, + "flos": 49929132248640.0, + "grad_norm": 5.588691233243714, + "language_loss": 0.7162751, + "learning_rate": 3.975659173637458e-06, + "loss": 0.73871231, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.21594238, + "step": 1301, + "time_per_iteration": 2.942925214767456 + }, + { + "auxiliary_loss_clip": 0.01200852, + "auxiliary_loss_mlp": 0.0105829, + "balance_loss_clip": 1.06268501, + "balance_loss_mlp": 1.03724957, + "epoch": 0.07828047497369607, + "flos": 50239774298880.0, + "grad_norm": 1.5500079965882343, + "language_loss": 0.70646816, + "learning_rate": 3.97559855928952e-06, + "loss": 0.72905958, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 1.38085938, + "router_z_loss_mlp": 0.21032715, + "step": 1302, + "time_per_iteration": 2.872512102127075 + }, + { + "auxiliary_loss_clip": 0.01196551, + "auxiliary_loss_mlp": 0.01054127, + "balance_loss_clip": 1.06112707, + "balance_loss_mlp": 1.03257442, + "epoch": 0.07834059822636405, + "flos": 28685385964800.0, + "grad_norm": 2.393619482371972, + "language_loss": 0.8199954, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.84250218, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.2154541, + "step": 1303, + "time_per_iteration": 2.6832313537597656 + }, + { + "auxiliary_loss_clip": 0.0119354, + "auxiliary_loss_mlp": 0.01061239, + "balance_loss_clip": 1.05707812, + "balance_loss_mlp": 1.03978086, + "epoch": 0.07840072147903202, + "flos": 24640556546880.0, + "grad_norm": 1.7420028485325985, + "language_loss": 0.74982226, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77236998, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.21472168, + "step": 1304, + "time_per_iteration": 2.663405179977417 + }, + { + "auxiliary_loss_clip": 0.01195523, + "auxiliary_loss_mlp": 0.01063992, + "balance_loss_clip": 1.06116223, + "balance_loss_mlp": 1.04227209, + "epoch": 0.07846084473169998, + "flos": 26064576243360.0, + "grad_norm": 1.875844924552238, + "language_loss": 0.76148057, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78407568, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.21728516, + "step": 1305, + "time_per_iteration": 2.7969439029693604 + }, + { + "auxiliary_loss_clip": 0.01197407, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_clip": 1.0598042, + "balance_loss_mlp": 1.0435915, + "epoch": 0.07852096798436796, + "flos": 31006857957120.0, + "grad_norm": 2.167006619368622, + "language_loss": 0.84836674, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87098664, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.20983887, + "step": 1306, + "time_per_iteration": 2.7868010997772217 + }, + { + "auxiliary_loss_clip": 0.01194602, + "auxiliary_loss_mlp": 0.01044294, + "balance_loss_clip": 1.06097054, + "balance_loss_mlp": 1.02436256, + "epoch": 0.07858109123703592, + "flos": 29982484074240.0, + "grad_norm": 2.5860973608921523, + "language_loss": 0.9027096, + "learning_rate": 3.975294363872468e-06, + "loss": 0.92509854, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.19946289, + "step": 1307, + "time_per_iteration": 2.685561180114746 + }, + { + "auxiliary_loss_clip": 0.01195772, + "auxiliary_loss_mlp": 0.01056363, + "balance_loss_clip": 1.05946445, + "balance_loss_mlp": 1.03448844, + "epoch": 0.07864121448970389, + "flos": 25257302712000.0, + "grad_norm": 2.1492051167609807, + "language_loss": 0.83299392, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85551524, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.21875, + "step": 1308, + "time_per_iteration": 2.656215190887451 + }, + { + "auxiliary_loss_clip": 0.0119418, + "auxiliary_loss_mlp": 0.01055699, + "balance_loss_clip": 1.05981505, + "balance_loss_mlp": 1.03573143, + "epoch": 0.07870133774237187, + "flos": 28023955590240.0, + "grad_norm": 1.489929909341998, + "language_loss": 0.77244556, + "learning_rate": 3.975172161365958e-06, + "loss": 0.79494441, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.19970703, + "step": 1309, + "time_per_iteration": 2.7101123332977295 + }, + { + "auxiliary_loss_clip": 0.01202548, + "auxiliary_loss_mlp": 0.01063404, + "balance_loss_clip": 1.0614841, + "balance_loss_mlp": 1.04186273, + "epoch": 0.07876146099503983, + "flos": 22991981319360.0, + "grad_norm": 1.9977439016989593, + "language_loss": 0.80511397, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82777351, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 1.41015625, + "router_z_loss_mlp": 0.21533203, + "step": 1310, + "time_per_iteration": 2.64890456199646 + }, + { + "auxiliary_loss_clip": 0.01192769, + "auxiliary_loss_mlp": 0.01055467, + "balance_loss_clip": 1.06196785, + "balance_loss_mlp": 1.03558302, + "epoch": 0.0788215842477078, + "flos": 29048086369440.0, + "grad_norm": 1.7136377371828209, + "language_loss": 0.72917831, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75166065, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 1.30566406, + "router_z_loss_mlp": 0.19885254, + "step": 1311, + "time_per_iteration": 2.6606760025024414 + }, + { + "auxiliary_loss_clip": 0.01195524, + "auxiliary_loss_mlp": 0.01068078, + "balance_loss_clip": 1.06137705, + "balance_loss_mlp": 1.04631019, + "epoch": 0.07888170750037576, + "flos": 26331911671680.0, + "grad_norm": 2.5184955084016893, + "language_loss": 0.85781562, + "learning_rate": 3.974988295871553e-06, + "loss": 0.88045168, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.2175293, + "step": 1312, + "time_per_iteration": 2.691161632537842 + }, + { + "auxiliary_loss_clip": 0.0119471, + "auxiliary_loss_mlp": 0.01060237, + "balance_loss_clip": 1.06097555, + "balance_loss_mlp": 1.04003072, + "epoch": 0.07894183075304374, + "flos": 24239087455680.0, + "grad_norm": 1.6691674038594575, + "language_loss": 0.81778169, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.8403312, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.2019043, + "step": 1313, + "time_per_iteration": 2.6515886783599854 + }, + { + "auxiliary_loss_clip": 0.01196967, + "auxiliary_loss_mlp": 0.01052421, + "balance_loss_clip": 1.05708718, + "balance_loss_mlp": 1.03029609, + "epoch": 0.07900195400571171, + "flos": 19697950694880.0, + "grad_norm": 4.5465264218545265, + "language_loss": 0.73487389, + "learning_rate": 3.97486534441264e-06, + "loss": 0.75736785, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 1.3984375, + "router_z_loss_mlp": 0.22131348, + "step": 1314, + "time_per_iteration": 2.714892625808716 + }, + { + "auxiliary_loss_clip": 0.011928, + "auxiliary_loss_mlp": 0.01055339, + "balance_loss_clip": 1.05760884, + "balance_loss_mlp": 1.03530025, + "epoch": 0.07906207725837967, + "flos": 28956327431040.0, + "grad_norm": 1.8726906334229771, + "language_loss": 0.796058, + "learning_rate": 3.974803756351379e-06, + "loss": 0.81853938, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.20031738, + "step": 1315, + "time_per_iteration": 2.6843883991241455 + }, + { + "auxiliary_loss_clip": 0.01193314, + "auxiliary_loss_mlp": 0.01059838, + "balance_loss_clip": 1.05716395, + "balance_loss_mlp": 1.03746271, + "epoch": 0.07912220051104765, + "flos": 29670626505600.0, + "grad_norm": 1.7021534574792114, + "language_loss": 0.73991084, + "learning_rate": 3.974742093405362e-06, + "loss": 0.76244241, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.22387695, + "step": 1316, + "time_per_iteration": 2.711799144744873 + }, + { + "auxiliary_loss_clip": 0.01198189, + "auxiliary_loss_mlp": 0.01059165, + "balance_loss_clip": 1.0596168, + "balance_loss_mlp": 1.03790998, + "epoch": 0.07918232376371562, + "flos": 23037313805280.0, + "grad_norm": 2.5338473814058995, + "language_loss": 0.65687615, + "learning_rate": 3.974680355576927e-06, + "loss": 0.67944968, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 1.38476562, + "router_z_loss_mlp": 0.21252441, + "step": 1317, + "time_per_iteration": 2.6547436714172363 + }, + { + "auxiliary_loss_clip": 0.01199929, + "auxiliary_loss_mlp": 0.01061252, + "balance_loss_clip": 1.06037855, + "balance_loss_mlp": 1.03924632, + "epoch": 0.07924244701638358, + "flos": 33405543184320.0, + "grad_norm": 2.121845087745217, + "language_loss": 0.73216313, + "learning_rate": 3.974618542868415e-06, + "loss": 0.75477493, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 1.39453125, + "router_z_loss_mlp": 0.2199707, + "step": 1318, + "time_per_iteration": 2.9797725677490234 + }, + { + "auxiliary_loss_clip": 0.01193715, + "auxiliary_loss_mlp": 0.0105547, + "balance_loss_clip": 1.05862141, + "balance_loss_mlp": 1.03553796, + "epoch": 0.07930257026905156, + "flos": 30652098939360.0, + "grad_norm": 1.8189414867499665, + "language_loss": 0.901546, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92403793, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.19909668, + "step": 1319, + "time_per_iteration": 4.1776649951934814 + }, + { + "auxiliary_loss_clip": 0.01191728, + "auxiliary_loss_mlp": 0.0104946, + "balance_loss_clip": 1.05574667, + "balance_loss_mlp": 1.02875292, + "epoch": 0.07936269352171953, + "flos": 26642634756480.0, + "grad_norm": 2.146524259964104, + "language_loss": 0.80104649, + "learning_rate": 3.974494692820539e-06, + "loss": 0.82345831, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.20715332, + "step": 1320, + "time_per_iteration": 5.5400331020355225 + }, + { + "auxiliary_loss_clip": 0.01192719, + "auxiliary_loss_mlp": 0.01053748, + "balance_loss_clip": 1.05876184, + "balance_loss_mlp": 1.03337467, + "epoch": 0.07942281677438749, + "flos": 20670387775200.0, + "grad_norm": 2.2932819796052155, + "language_loss": 0.69785023, + "learning_rate": 3.974432655485872e-06, + "loss": 0.72031486, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.20373535, + "step": 1321, + "time_per_iteration": 4.092014312744141 + }, + { + "auxiliary_loss_clip": 0.01190357, + "auxiliary_loss_mlp": 0.01055473, + "balance_loss_clip": 1.05828738, + "balance_loss_mlp": 1.03525472, + "epoch": 0.07948294002705546, + "flos": 23168003499360.0, + "grad_norm": 2.0394335125706506, + "language_loss": 0.84068739, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.86314565, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.20227051, + "step": 1322, + "time_per_iteration": 2.7833821773529053 + }, + { + "auxiliary_loss_clip": 0.01191633, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_clip": 1.05581474, + "balance_loss_mlp": 1.0307157, + "epoch": 0.07954306327972344, + "flos": 26421604228800.0, + "grad_norm": 2.267611742840802, + "language_loss": 0.90686381, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92930067, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.21350098, + "step": 1323, + "time_per_iteration": 2.68612003326416 + }, + { + "auxiliary_loss_clip": 0.01190592, + "auxiliary_loss_mlp": 0.01058778, + "balance_loss_clip": 1.05851519, + "balance_loss_mlp": 1.03810704, + "epoch": 0.0796031865323914, + "flos": 28333746777600.0, + "grad_norm": 1.6384639430894483, + "language_loss": 0.82685828, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84935188, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.20654297, + "step": 1324, + "time_per_iteration": 2.627777576446533 + }, + { + "auxiliary_loss_clip": 0.01193833, + "auxiliary_loss_mlp": 0.01050073, + "balance_loss_clip": 1.05785871, + "balance_loss_mlp": 1.02857947, + "epoch": 0.07966330978505937, + "flos": 28424249680320.0, + "grad_norm": 3.1727332402837463, + "language_loss": 0.79383612, + "learning_rate": 3.974183757463925e-06, + "loss": 0.81627518, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.21520996, + "step": 1325, + "time_per_iteration": 2.6900651454925537 + }, + { + "auxiliary_loss_clip": 0.01196317, + "auxiliary_loss_mlp": 0.01064735, + "balance_loss_clip": 1.06143403, + "balance_loss_mlp": 1.04263401, + "epoch": 0.07972343303772735, + "flos": 22407845214240.0, + "grad_norm": 2.032751331531557, + "language_loss": 0.88420284, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90681338, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.22106934, + "step": 1326, + "time_per_iteration": 2.6285340785980225 + }, + { + "auxiliary_loss_clip": 0.0118899, + "auxiliary_loss_mlp": 0.01053574, + "balance_loss_clip": 1.05660772, + "balance_loss_mlp": 1.03155661, + "epoch": 0.07978355629039531, + "flos": 26554724959680.0, + "grad_norm": 1.8250645126432765, + "language_loss": 0.82970822, + "learning_rate": 3.974058859276032e-06, + "loss": 0.85213387, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.22009277, + "step": 1327, + "time_per_iteration": 2.6785411834716797 + }, + { + "auxiliary_loss_clip": 0.01194932, + "auxiliary_loss_mlp": 0.01052614, + "balance_loss_clip": 1.05876839, + "balance_loss_mlp": 1.03058434, + "epoch": 0.07984367954306328, + "flos": 22636209369600.0, + "grad_norm": 4.391740738813166, + "language_loss": 0.7884748, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.81095016, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 1.36132812, + "router_z_loss_mlp": 0.22021484, + "step": 1328, + "time_per_iteration": 2.6193134784698486 + }, + { + "auxiliary_loss_clip": 0.01195178, + "auxiliary_loss_mlp": 0.01049946, + "balance_loss_clip": 1.05864894, + "balance_loss_mlp": 1.02754712, + "epoch": 0.07990380279573125, + "flos": 20626594945920.0, + "grad_norm": 2.2431292228820885, + "language_loss": 0.74338418, + "learning_rate": 3.973933661662101e-06, + "loss": 0.7658354, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.22387695, + "step": 1329, + "time_per_iteration": 2.8960981369018555 + }, + { + "auxiliary_loss_clip": 0.01190002, + "auxiliary_loss_mlp": 0.01061001, + "balance_loss_clip": 1.05617774, + "balance_loss_mlp": 1.04072368, + "epoch": 0.07996392604839922, + "flos": 29405033320320.0, + "grad_norm": 1.5782496849894514, + "language_loss": 0.81265658, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83516663, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.20275879, + "step": 1330, + "time_per_iteration": 2.6988372802734375 + }, + { + "auxiliary_loss_clip": 0.01194199, + "auxiliary_loss_mlp": 0.01059491, + "balance_loss_clip": 1.05828691, + "balance_loss_mlp": 1.03816473, + "epoch": 0.08002404930106718, + "flos": 17422824120480.0, + "grad_norm": 1.8752825069014467, + "language_loss": 0.88499367, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.90753055, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.21337891, + "step": 1331, + "time_per_iteration": 2.633877992630005 + }, + { + "auxiliary_loss_clip": 0.01198968, + "auxiliary_loss_mlp": 0.01056502, + "balance_loss_clip": 1.05824757, + "balance_loss_mlp": 1.03440106, + "epoch": 0.08008417255373516, + "flos": 49305255042240.0, + "grad_norm": 2.1074130631368018, + "language_loss": 0.73034626, + "learning_rate": 3.973745303858942e-06, + "loss": 0.75290096, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 1.40625, + "router_z_loss_mlp": 0.22106934, + "step": 1332, + "time_per_iteration": 2.84100604057312 + }, + { + "auxiliary_loss_clip": 0.0119454, + "auxiliary_loss_mlp": 0.01055332, + "balance_loss_clip": 1.05895662, + "balance_loss_mlp": 1.03436315, + "epoch": 0.08014429580640313, + "flos": 22547813365440.0, + "grad_norm": 2.1247704695582756, + "language_loss": 0.82395518, + "learning_rate": 3.973682368232138e-06, + "loss": 0.84645391, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 1.35546875, + "router_z_loss_mlp": 0.20959473, + "step": 1333, + "time_per_iteration": 2.62992787361145 + }, + { + "auxiliary_loss_clip": 0.01193732, + "auxiliary_loss_mlp": 0.01049996, + "balance_loss_clip": 1.05730987, + "balance_loss_mlp": 1.02890801, + "epoch": 0.0802044190590711, + "flos": 26909848632960.0, + "grad_norm": 2.8346531934725205, + "language_loss": 0.7496056, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.77204287, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.2109375, + "step": 1334, + "time_per_iteration": 2.7386956214904785 + }, + { + "auxiliary_loss_clip": 0.01194723, + "auxiliary_loss_mlp": 0.01058296, + "balance_loss_clip": 1.06096792, + "balance_loss_mlp": 1.03763771, + "epoch": 0.08026454231173906, + "flos": 29983213385280.0, + "grad_norm": 1.9938221899288942, + "language_loss": 0.79753911, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82006931, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 1.33886719, + "router_z_loss_mlp": 0.20678711, + "step": 1335, + "time_per_iteration": 2.675563335418701 + }, + { + "auxiliary_loss_clip": 0.01080435, + "auxiliary_loss_mlp": 0.0101556, + "balance_loss_clip": 1.02528906, + "balance_loss_mlp": 1.01221061, + "epoch": 0.08032466556440704, + "flos": 64160502796800.0, + "grad_norm": 0.7478471528492707, + "language_loss": 0.56042373, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58138365, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 0.55126953, + "router_z_loss_mlp": 0.03356934, + "step": 1336, + "time_per_iteration": 3.3285927772521973 + }, + { + "auxiliary_loss_clip": 0.01196277, + "auxiliary_loss_mlp": 0.01062473, + "balance_loss_clip": 1.06136286, + "balance_loss_mlp": 1.04226708, + "epoch": 0.080384788817075, + "flos": 29092973165280.0, + "grad_norm": 2.2828487854868302, + "language_loss": 0.67499816, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.6975857, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 1.34863281, + "router_z_loss_mlp": 0.2019043, + "step": 1337, + "time_per_iteration": 2.6795296669006348 + }, + { + "auxiliary_loss_clip": 0.01193702, + "auxiliary_loss_mlp": 0.01065137, + "balance_loss_clip": 1.06058097, + "balance_loss_mlp": 1.04450154, + "epoch": 0.08044491206974297, + "flos": 30872400156000.0, + "grad_norm": 2.0764311242853193, + "language_loss": 0.87040722, + "learning_rate": 3.973366567512453e-06, + "loss": 0.8929956, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.2064209, + "step": 1338, + "time_per_iteration": 2.7623398303985596 + }, + { + "auxiliary_loss_clip": 0.01195104, + "auxiliary_loss_mlp": 0.01065223, + "balance_loss_clip": 1.05681157, + "balance_loss_mlp": 1.042835, + "epoch": 0.08050503532241095, + "flos": 27304591855680.0, + "grad_norm": 2.620583123269744, + "language_loss": 0.87243199, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89503527, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.22375488, + "step": 1339, + "time_per_iteration": 2.7619893550872803 + }, + { + "auxiliary_loss_clip": 0.0119057, + "auxiliary_loss_mlp": 0.01057471, + "balance_loss_clip": 1.05853879, + "balance_loss_mlp": 1.0372417, + "epoch": 0.08056515857507891, + "flos": 22414206427200.0, + "grad_norm": 2.2328130655433043, + "language_loss": 0.89237839, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91485876, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.20227051, + "step": 1340, + "time_per_iteration": 2.64253568649292 + }, + { + "auxiliary_loss_clip": 0.01078196, + "auxiliary_loss_mlp": 0.01005252, + "balance_loss_clip": 1.02351665, + "balance_loss_mlp": 1.00183022, + "epoch": 0.08062528182774688, + "flos": 62654407791840.0, + "grad_norm": 0.8872068182874866, + "language_loss": 0.64882237, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66965687, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.03430176, + "step": 1341, + "time_per_iteration": 3.239278554916382 + }, + { + "auxiliary_loss_clip": 0.01198775, + "auxiliary_loss_mlp": 0.01057118, + "balance_loss_clip": 1.0569613, + "balance_loss_mlp": 1.03358626, + "epoch": 0.08068540508041486, + "flos": 21475270787040.0, + "grad_norm": 3.0079249104976955, + "language_loss": 0.89237231, + "learning_rate": 3.973112579977733e-06, + "loss": 0.91493124, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 1.41699219, + "router_z_loss_mlp": 0.23522949, + "step": 1342, + "time_per_iteration": 2.838376045227051 + }, + { + "auxiliary_loss_clip": 0.0120223, + "auxiliary_loss_mlp": 0.01062427, + "balance_loss_clip": 1.06394887, + "balance_loss_mlp": 1.03946686, + "epoch": 0.08074552833308282, + "flos": 12886022708640.0, + "grad_norm": 2.4070176497368188, + "language_loss": 0.76159382, + "learning_rate": 3.973048896036459e-06, + "loss": 0.78424037, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 1.3828125, + "router_z_loss_mlp": 0.22961426, + "step": 1343, + "time_per_iteration": 2.6562163829803467 + }, + { + "auxiliary_loss_clip": 0.01076773, + "auxiliary_loss_mlp": 0.0100415, + "balance_loss_clip": 1.02212119, + "balance_loss_mlp": 1.00080609, + "epoch": 0.08080565158575079, + "flos": 74236890143520.0, + "grad_norm": 0.7963498619552527, + "language_loss": 0.57435536, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59516454, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 0.54589844, + "router_z_loss_mlp": 0.03347778, + "step": 1344, + "time_per_iteration": 3.1830759048461914 + }, + { + "auxiliary_loss_clip": 0.0119844, + "auxiliary_loss_mlp": 0.01059415, + "balance_loss_clip": 1.06174088, + "balance_loss_mlp": 1.03787386, + "epoch": 0.08086577483841875, + "flos": 22369036010400.0, + "grad_norm": 2.566614877510493, + "language_loss": 0.86470628, + "learning_rate": 3.972921303701695e-06, + "loss": 0.88728487, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 1.36816406, + "router_z_loss_mlp": 0.21533203, + "step": 1345, + "time_per_iteration": 2.652639150619507 + }, + { + "auxiliary_loss_clip": 0.01194922, + "auxiliary_loss_mlp": 0.0105339, + "balance_loss_clip": 1.06024885, + "balance_loss_mlp": 1.03341007, + "epoch": 0.08092589809108673, + "flos": 26287592117760.0, + "grad_norm": 1.9280484668353546, + "language_loss": 0.87918961, + "learning_rate": 3.972857395313042e-06, + "loss": 0.90167272, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.20007324, + "step": 1346, + "time_per_iteration": 2.674386739730835 + }, + { + "auxiliary_loss_clip": 0.01193781, + "auxiliary_loss_mlp": 0.01054546, + "balance_loss_clip": 1.05785728, + "balance_loss_mlp": 1.0333147, + "epoch": 0.0809860213437547, + "flos": 27000675673920.0, + "grad_norm": 1.666356658212976, + "language_loss": 0.92754143, + "learning_rate": 3.972793412113439e-06, + "loss": 0.95002466, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.21240234, + "step": 1347, + "time_per_iteration": 2.679724931716919 + }, + { + "auxiliary_loss_clip": 0.01193875, + "auxiliary_loss_mlp": 0.01060831, + "balance_loss_clip": 1.05988014, + "balance_loss_mlp": 1.03771675, + "epoch": 0.08104614459642266, + "flos": 26509595060160.0, + "grad_norm": 1.9003745353470831, + "language_loss": 0.89394039, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91648746, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 1.34179688, + "router_z_loss_mlp": 0.2310791, + "step": 1348, + "time_per_iteration": 2.6732547283172607 + }, + { + "auxiliary_loss_clip": 0.0119142, + "auxiliary_loss_mlp": 0.01055435, + "balance_loss_clip": 1.06120896, + "balance_loss_mlp": 1.03469229, + "epoch": 0.08110626784909064, + "flos": 29226336999840.0, + "grad_norm": 1.9095785232764173, + "language_loss": 0.76981843, + "learning_rate": 3.97266522129109e-06, + "loss": 0.79228693, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.20739746, + "step": 1349, + "time_per_iteration": 2.6600568294525146 + }, + { + "auxiliary_loss_clip": 0.01195753, + "auxiliary_loss_mlp": 0.01067125, + "balance_loss_clip": 1.05926299, + "balance_loss_mlp": 1.04545295, + "epoch": 0.0811663911017586, + "flos": 23215078228320.0, + "grad_norm": 1.9088646674562018, + "language_loss": 0.88503969, + "learning_rate": 3.972601013673205e-06, + "loss": 0.90766841, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 1.3671875, + "router_z_loss_mlp": 0.21679688, + "step": 1350, + "time_per_iteration": 2.6763956546783447 + }, + { + "auxiliary_loss_clip": 0.01191073, + "auxiliary_loss_mlp": 0.01060881, + "balance_loss_clip": 1.0581665, + "balance_loss_mlp": 1.03895843, + "epoch": 0.08122651435442657, + "flos": 18719030849760.0, + "grad_norm": 1.9173582056183935, + "language_loss": 0.82380772, + "learning_rate": 3.972536731254092e-06, + "loss": 0.8463273, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.21911621, + "step": 1351, + "time_per_iteration": 2.666922092437744 + }, + { + "auxiliary_loss_clip": 0.01190294, + "auxiliary_loss_mlp": 0.01050137, + "balance_loss_clip": 1.05432642, + "balance_loss_mlp": 1.0270226, + "epoch": 0.08128663760709455, + "flos": 28335205399680.0, + "grad_norm": 2.2665480356328476, + "language_loss": 0.75043142, + "learning_rate": 3.972472374036189e-06, + "loss": 0.77283573, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.23120117, + "step": 1352, + "time_per_iteration": 2.6761655807495117 + }, + { + "auxiliary_loss_clip": 0.01200283, + "auxiliary_loss_mlp": 0.01059519, + "balance_loss_clip": 1.06289303, + "balance_loss_mlp": 1.03733397, + "epoch": 0.08134676085976252, + "flos": 28022942658240.0, + "grad_norm": 1.8061005261084189, + "language_loss": 0.83023477, + "learning_rate": 3.972407942021935e-06, + "loss": 0.85283285, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 1.37304688, + "router_z_loss_mlp": 0.22180176, + "step": 1353, + "time_per_iteration": 2.6772778034210205 + }, + { + "auxiliary_loss_clip": 0.01076552, + "auxiliary_loss_mlp": 0.01032039, + "balance_loss_clip": 1.02184463, + "balance_loss_mlp": 1.02852547, + "epoch": 0.08140688411243048, + "flos": 78487637143680.0, + "grad_norm": 0.8427916982527842, + "language_loss": 0.59687656, + "learning_rate": 3.972343435213775e-06, + "loss": 0.61796248, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 0.03518677, + "step": 1354, + "time_per_iteration": 3.433544635772705 + }, + { + "auxiliary_loss_clip": 0.01188615, + "auxiliary_loss_mlp": 0.01057618, + "balance_loss_clip": 1.05712879, + "balance_loss_mlp": 1.03616071, + "epoch": 0.08146700736509845, + "flos": 27444762593280.0, + "grad_norm": 1.6669224050847653, + "language_loss": 0.83009875, + "learning_rate": 3.972278853614154e-06, + "loss": 0.85256112, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.21459961, + "step": 1355, + "time_per_iteration": 2.705888271331787 + }, + { + "auxiliary_loss_clip": 0.0119268, + "auxiliary_loss_mlp": 0.01062862, + "balance_loss_clip": 1.05765557, + "balance_loss_mlp": 1.03968823, + "epoch": 0.08152713061776642, + "flos": 24950550320640.0, + "grad_norm": 2.0609757836796407, + "language_loss": 0.70656002, + "learning_rate": 3.972214197225521e-06, + "loss": 0.72911549, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.23168945, + "step": 1356, + "time_per_iteration": 2.665520191192627 + }, + { + "auxiliary_loss_clip": 0.01195868, + "auxiliary_loss_mlp": 0.01058706, + "balance_loss_clip": 1.05882919, + "balance_loss_mlp": 1.03685498, + "epoch": 0.08158725387043439, + "flos": 28736228800800.0, + "grad_norm": 2.034649920972828, + "language_loss": 0.70195913, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72450483, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.21862793, + "step": 1357, + "time_per_iteration": 2.6854326725006104 + }, + { + "auxiliary_loss_clip": 0.0119658, + "auxiliary_loss_mlp": 0.01053211, + "balance_loss_clip": 1.06023192, + "balance_loss_mlp": 1.0325048, + "epoch": 0.08164737712310235, + "flos": 26865731665440.0, + "grad_norm": 2.5157109556152033, + "language_loss": 0.84053266, + "learning_rate": 3.97208466009103e-06, + "loss": 0.86303061, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 1.36328125, + "router_z_loss_mlp": 0.20703125, + "step": 1358, + "time_per_iteration": 4.049339771270752 + }, + { + "auxiliary_loss_clip": 0.01196624, + "auxiliary_loss_mlp": 0.01061302, + "balance_loss_clip": 1.05895507, + "balance_loss_mlp": 1.03748405, + "epoch": 0.08170750037577033, + "flos": 28513658616480.0, + "grad_norm": 1.9696010813106162, + "language_loss": 1.0267483, + "learning_rate": 3.972019779350084e-06, + "loss": 1.04932749, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 1.37597656, + "router_z_loss_mlp": 0.23803711, + "step": 1359, + "time_per_iteration": 4.1134021282196045 + }, + { + "auxiliary_loss_clip": 0.01190293, + "auxiliary_loss_mlp": 0.01050268, + "balance_loss_clip": 1.05548143, + "balance_loss_mlp": 1.02822626, + "epoch": 0.0817676236284383, + "flos": 34652284665120.0, + "grad_norm": 2.045362398877046, + "language_loss": 0.83978373, + "learning_rate": 3.971954823829951e-06, + "loss": 0.86218929, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.22070312, + "step": 1360, + "time_per_iteration": 5.582982778549194 + }, + { + "auxiliary_loss_clip": 0.01193762, + "auxiliary_loss_mlp": 0.01069623, + "balance_loss_clip": 1.05677342, + "balance_loss_mlp": 1.0477128, + "epoch": 0.08182774688110626, + "flos": 23438580310080.0, + "grad_norm": 2.0837797922548944, + "language_loss": 0.72459054, + "learning_rate": 3.971889793533093e-06, + "loss": 0.74722439, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 1.36914062, + "router_z_loss_mlp": 0.21923828, + "step": 1361, + "time_per_iteration": 2.640895128250122 + }, + { + "auxiliary_loss_clip": 0.01183691, + "auxiliary_loss_mlp": 0.01060162, + "balance_loss_clip": 1.05112624, + "balance_loss_mlp": 1.0375005, + "epoch": 0.08188787013377424, + "flos": 27801344888640.0, + "grad_norm": 2.3042974844736586, + "language_loss": 0.76145589, + "learning_rate": 3.971824688461976e-06, + "loss": 0.78389442, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.2265625, + "step": 1362, + "time_per_iteration": 2.642575740814209 + }, + { + "auxiliary_loss_clip": 0.01191802, + "auxiliary_loss_mlp": 0.01053918, + "balance_loss_clip": 1.05849016, + "balance_loss_mlp": 1.03294933, + "epoch": 0.08194799338644221, + "flos": 20094436160640.0, + "grad_norm": 4.5981646088890535, + "language_loss": 0.72644627, + "learning_rate": 3.971759508619069e-06, + "loss": 0.74890345, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.2097168, + "step": 1363, + "time_per_iteration": 2.620806932449341 + }, + { + "auxiliary_loss_clip": 0.01193359, + "auxiliary_loss_mlp": 0.01068823, + "balance_loss_clip": 1.06047845, + "balance_loss_mlp": 1.04468298, + "epoch": 0.08200811663911017, + "flos": 29179829512800.0, + "grad_norm": 1.9175980495388374, + "language_loss": 0.7707181, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79333991, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 1.32910156, + "router_z_loss_mlp": 0.24133301, + "step": 1364, + "time_per_iteration": 2.67631196975708 + }, + { + "auxiliary_loss_clip": 0.0119342, + "auxiliary_loss_mlp": 0.0106078, + "balance_loss_clip": 1.05832911, + "balance_loss_mlp": 1.03819001, + "epoch": 0.08206823989177814, + "flos": 21837039294240.0, + "grad_norm": 1.745500598952301, + "language_loss": 0.82417214, + "learning_rate": 3.971628924627776e-06, + "loss": 0.8467142, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.22607422, + "step": 1365, + "time_per_iteration": 2.95294189453125 + }, + { + "auxiliary_loss_clip": 0.01193631, + "auxiliary_loss_mlp": 0.01061128, + "balance_loss_clip": 1.06212819, + "balance_loss_mlp": 1.04015923, + "epoch": 0.08212836314444612, + "flos": 26952385426560.0, + "grad_norm": 1.6043777565142927, + "language_loss": 0.81877285, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84132046, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.2097168, + "step": 1366, + "time_per_iteration": 2.6635074615478516 + }, + { + "auxiliary_loss_clip": 0.01192722, + "auxiliary_loss_mlp": 0.01070037, + "balance_loss_clip": 1.05635715, + "balance_loss_mlp": 1.04970014, + "epoch": 0.08218848639711408, + "flos": 21472758715680.0, + "grad_norm": 1.7239125776932547, + "language_loss": 0.81715441, + "learning_rate": 3.97149804157902e-06, + "loss": 0.839782, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 1.36230469, + "router_z_loss_mlp": 0.20349121, + "step": 1367, + "time_per_iteration": 2.603233575820923 + }, + { + "auxiliary_loss_clip": 0.01195443, + "auxiliary_loss_mlp": 0.01066556, + "balance_loss_clip": 1.05756044, + "balance_loss_mlp": 1.04384685, + "epoch": 0.08224860964978205, + "flos": 21790005082560.0, + "grad_norm": 2.5484258448645374, + "language_loss": 0.83905268, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.86167264, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 1.37890625, + "router_z_loss_mlp": 0.22717285, + "step": 1368, + "time_per_iteration": 2.623457670211792 + }, + { + "auxiliary_loss_clip": 0.01188427, + "auxiliary_loss_mlp": 0.01053285, + "balance_loss_clip": 1.05952251, + "balance_loss_mlp": 1.03255427, + "epoch": 0.08230873290245003, + "flos": 30783112771680.0, + "grad_norm": 1.6606729648903038, + "language_loss": 0.81350231, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83591938, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.20727539, + "step": 1369, + "time_per_iteration": 2.6572256088256836 + }, + { + "auxiliary_loss_clip": 0.01188705, + "auxiliary_loss_mlp": 0.01055296, + "balance_loss_clip": 1.05900884, + "balance_loss_mlp": 1.03492284, + "epoch": 0.08236885615511799, + "flos": 38754318132000.0, + "grad_norm": 2.081147505916121, + "language_loss": 0.74471331, + "learning_rate": 3.971301156316582e-06, + "loss": 0.76715326, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.20373535, + "step": 1370, + "time_per_iteration": 2.7117536067962646 + }, + { + "auxiliary_loss_clip": 0.01194683, + "auxiliary_loss_mlp": 0.01066547, + "balance_loss_clip": 1.05988717, + "balance_loss_mlp": 1.04390931, + "epoch": 0.08242897940778596, + "flos": 28291534122240.0, + "grad_norm": 1.7625194688361905, + "language_loss": 0.74453807, + "learning_rate": 3.971235378388573e-06, + "loss": 0.7671504, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.22644043, + "step": 1371, + "time_per_iteration": 2.6245665550231934 + }, + { + "auxiliary_loss_clip": 0.01192577, + "auxiliary_loss_mlp": 0.01057797, + "balance_loss_clip": 1.05766487, + "balance_loss_mlp": 1.03562438, + "epoch": 0.08248910266045394, + "flos": 42088008823200.0, + "grad_norm": 1.9657503944831842, + "language_loss": 0.70484436, + "learning_rate": 3.971169525711122e-06, + "loss": 0.72734809, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.22192383, + "step": 1372, + "time_per_iteration": 2.7581632137298584 + }, + { + "auxiliary_loss_clip": 0.01194362, + "auxiliary_loss_mlp": 0.01054993, + "balance_loss_clip": 1.05771828, + "balance_loss_mlp": 1.03284383, + "epoch": 0.0825492259131219, + "flos": 16395046786080.0, + "grad_norm": 2.59542464181169, + "language_loss": 0.87603301, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.89852661, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 1.36621094, + "router_z_loss_mlp": 0.22143555, + "step": 1373, + "time_per_iteration": 2.611953020095825 + }, + { + "auxiliary_loss_clip": 0.01190888, + "auxiliary_loss_mlp": 0.01054785, + "balance_loss_clip": 1.05714166, + "balance_loss_mlp": 1.03261173, + "epoch": 0.08260934916578987, + "flos": 31496763569760.0, + "grad_norm": 1.69596058279211, + "language_loss": 0.82056183, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84301859, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.22167969, + "step": 1374, + "time_per_iteration": 2.701967239379883 + }, + { + "auxiliary_loss_clip": 0.010758, + "auxiliary_loss_mlp": 0.01020255, + "balance_loss_clip": 1.02165985, + "balance_loss_mlp": 1.01699162, + "epoch": 0.08266947241845783, + "flos": 77434380790560.0, + "grad_norm": 0.8227520788885505, + "language_loss": 0.60654902, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62750959, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 0.54101562, + "router_z_loss_mlp": 0.03268433, + "step": 1375, + "time_per_iteration": 3.249593496322632 + }, + { + "auxiliary_loss_clip": 0.01075857, + "auxiliary_loss_mlp": 0.01016919, + "balance_loss_clip": 1.02124298, + "balance_loss_mlp": 1.01370907, + "epoch": 0.08272959567112581, + "flos": 85407044772960.0, + "grad_norm": 0.9098576434001814, + "language_loss": 0.62290525, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64383292, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 0.54638672, + "router_z_loss_mlp": 0.03213501, + "step": 1376, + "time_per_iteration": 3.1956191062927246 + }, + { + "auxiliary_loss_clip": 0.01202299, + "auxiliary_loss_mlp": 0.01063382, + "balance_loss_clip": 1.0643878, + "balance_loss_mlp": 1.04111373, + "epoch": 0.08278971892379378, + "flos": 24908135078880.0, + "grad_norm": 1.819243172401896, + "language_loss": 0.82760596, + "learning_rate": 3.970839141169718e-06, + "loss": 0.85026276, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 1.37988281, + "router_z_loss_mlp": 0.22277832, + "step": 1377, + "time_per_iteration": 2.903763771057129 + }, + { + "auxiliary_loss_clip": 0.01191423, + "auxiliary_loss_mlp": 0.01057715, + "balance_loss_clip": 1.05703545, + "balance_loss_mlp": 1.03524446, + "epoch": 0.08284984217646174, + "flos": 32030988736320.0, + "grad_norm": 1.9611483254785798, + "language_loss": 0.84811723, + "learning_rate": 3.970772840048147e-06, + "loss": 0.87060869, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.22473145, + "step": 1378, + "time_per_iteration": 2.7384989261627197 + }, + { + "auxiliary_loss_clip": 0.01192279, + "auxiliary_loss_mlp": 0.01065042, + "balance_loss_clip": 1.05650306, + "balance_loss_mlp": 1.04204679, + "epoch": 0.08290996542912972, + "flos": 33183864380160.0, + "grad_norm": 2.2079163001466107, + "language_loss": 0.87883371, + "learning_rate": 3.970706464194672e-06, + "loss": 0.90140688, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.23010254, + "step": 1379, + "time_per_iteration": 2.721296548843384 + }, + { + "auxiliary_loss_clip": 0.01191139, + "auxiliary_loss_mlp": 0.01060079, + "balance_loss_clip": 1.05955052, + "balance_loss_mlp": 1.03856158, + "epoch": 0.08297008868179769, + "flos": 47123386545600.0, + "grad_norm": 2.0127053935645725, + "language_loss": 0.78500187, + "learning_rate": 3.970640013611812e-06, + "loss": 0.80751407, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.21508789, + "step": 1380, + "time_per_iteration": 2.796032667160034 + }, + { + "auxiliary_loss_clip": 0.01191433, + "auxiliary_loss_mlp": 0.01056659, + "balance_loss_clip": 1.05995512, + "balance_loss_mlp": 1.0333302, + "epoch": 0.08303021193446565, + "flos": 24373140084000.0, + "grad_norm": 2.6663191717198225, + "language_loss": 0.86183333, + "learning_rate": 3.970573488302083e-06, + "loss": 0.8843143, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.23352051, + "step": 1381, + "time_per_iteration": 2.6511142253875732 + }, + { + "auxiliary_loss_clip": 0.01200887, + "auxiliary_loss_mlp": 0.01066843, + "balance_loss_clip": 1.06249189, + "balance_loss_mlp": 1.04405046, + "epoch": 0.08309033518713363, + "flos": 16670647739520.0, + "grad_norm": 3.125415929672311, + "language_loss": 0.87527144, + "learning_rate": 3.970506888268011e-06, + "loss": 0.89794874, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.22814941, + "step": 1382, + "time_per_iteration": 2.605982780456543 + }, + { + "auxiliary_loss_clip": 0.01194678, + "auxiliary_loss_mlp": 0.01060387, + "balance_loss_clip": 1.06046164, + "balance_loss_mlp": 1.04020488, + "epoch": 0.0831504584398016, + "flos": 21924989608320.0, + "grad_norm": 2.642046389412983, + "language_loss": 0.77013087, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79268152, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.20202637, + "step": 1383, + "time_per_iteration": 2.6606101989746094 + }, + { + "auxiliary_loss_clip": 0.01196548, + "auxiliary_loss_mlp": 0.01059909, + "balance_loss_clip": 1.06000686, + "balance_loss_mlp": 1.0374738, + "epoch": 0.08321058169246956, + "flos": 27578410048800.0, + "grad_norm": 1.8069371569031951, + "language_loss": 0.82815289, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85071743, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 1.36523438, + "router_z_loss_mlp": 0.22412109, + "step": 1384, + "time_per_iteration": 2.7085015773773193 + }, + { + "auxiliary_loss_clip": 0.01199719, + "auxiliary_loss_mlp": 0.01056883, + "balance_loss_clip": 1.06068182, + "balance_loss_mlp": 1.03385234, + "epoch": 0.08327070494513754, + "flos": 27882893472480.0, + "grad_norm": 2.995960915825651, + "language_loss": 0.85123867, + "learning_rate": 3.970306639845e-06, + "loss": 0.87380469, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.23034668, + "step": 1385, + "time_per_iteration": 2.66475772857666 + }, + { + "auxiliary_loss_clip": 0.01198281, + "auxiliary_loss_mlp": 0.01069479, + "balance_loss_clip": 1.06053972, + "balance_loss_mlp": 1.04696012, + "epoch": 0.0833308281978055, + "flos": 27800088852960.0, + "grad_norm": 1.7171010199197387, + "language_loss": 0.6858452, + "learning_rate": 3.970239740938835e-06, + "loss": 0.70852286, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.2253418, + "step": 1386, + "time_per_iteration": 2.6630797386169434 + }, + { + "auxiliary_loss_clip": 0.01191125, + "auxiliary_loss_mlp": 0.0106025, + "balance_loss_clip": 1.05574048, + "balance_loss_mlp": 1.03764796, + "epoch": 0.08339095145047347, + "flos": 25395690689280.0, + "grad_norm": 1.6941398657276423, + "language_loss": 0.81979966, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84231341, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 1.35449219, + "router_z_loss_mlp": 0.22595215, + "step": 1387, + "time_per_iteration": 2.670732021331787 + }, + { + "auxiliary_loss_clip": 0.01196756, + "auxiliary_loss_mlp": 0.0107621, + "balance_loss_clip": 1.05784512, + "balance_loss_mlp": 1.05259502, + "epoch": 0.08345107470314143, + "flos": 22592254471200.0, + "grad_norm": 2.030176350835959, + "language_loss": 0.77480978, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79753947, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 1.38964844, + "router_z_loss_mlp": 0.23608398, + "step": 1388, + "time_per_iteration": 2.6532788276672363 + }, + { + "auxiliary_loss_clip": 0.01190254, + "auxiliary_loss_mlp": 0.0106693, + "balance_loss_clip": 1.05909014, + "balance_loss_mlp": 1.04349351, + "epoch": 0.08351119795580941, + "flos": 22948836766560.0, + "grad_norm": 2.0488470037020874, + "language_loss": 0.79349661, + "learning_rate": 3.970038595960369e-06, + "loss": 0.81606847, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.23461914, + "step": 1389, + "time_per_iteration": 2.632722854614258 + }, + { + "auxiliary_loss_clip": 0.01199451, + "auxiliary_loss_mlp": 0.01062941, + "balance_loss_clip": 1.06193399, + "balance_loss_mlp": 1.04060102, + "epoch": 0.08357132120847738, + "flos": 22502602431360.0, + "grad_norm": 3.0453264692914006, + "language_loss": 0.86937249, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89199638, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 1.375, + "router_z_loss_mlp": 0.22351074, + "step": 1390, + "time_per_iteration": 2.9080917835235596 + }, + { + "auxiliary_loss_clip": 0.01195307, + "auxiliary_loss_mlp": 0.01064241, + "balance_loss_clip": 1.05958939, + "balance_loss_mlp": 1.04106688, + "epoch": 0.08363144446114534, + "flos": 31630816198080.0, + "grad_norm": 1.7563769988314277, + "language_loss": 0.86592603, + "learning_rate": 3.969904125783517e-06, + "loss": 0.88852155, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.23181152, + "step": 1391, + "time_per_iteration": 2.697110652923584 + }, + { + "auxiliary_loss_clip": 0.01201016, + "auxiliary_loss_mlp": 0.0107911, + "balance_loss_clip": 1.06095016, + "balance_loss_mlp": 1.05684185, + "epoch": 0.08369156771381332, + "flos": 22013790785280.0, + "grad_norm": 2.3045876371489475, + "language_loss": 0.87565053, + "learning_rate": 3.969836778645371e-06, + "loss": 0.89845181, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 1.40136719, + "router_z_loss_mlp": 0.22265625, + "step": 1392, + "time_per_iteration": 2.635349988937378 + }, + { + "auxiliary_loss_clip": 0.01194427, + "auxiliary_loss_mlp": 0.01059145, + "balance_loss_clip": 1.05857456, + "balance_loss_mlp": 1.03676915, + "epoch": 0.08375169096648129, + "flos": 27668305192320.0, + "grad_norm": 2.4751081384303224, + "language_loss": 0.80395418, + "learning_rate": 3.969769356810819e-06, + "loss": 0.82648993, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 1.359375, + "router_z_loss_mlp": 0.22375488, + "step": 1393, + "time_per_iteration": 2.6631107330322266 + }, + { + "auxiliary_loss_clip": 0.01196037, + "auxiliary_loss_mlp": 0.01054263, + "balance_loss_clip": 1.06390345, + "balance_loss_mlp": 1.03290057, + "epoch": 0.08381181421914925, + "flos": 31851684656640.0, + "grad_norm": 1.761322651865671, + "language_loss": 0.84914923, + "learning_rate": 3.969701860282415e-06, + "loss": 0.87165225, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 1.3203125, + "router_z_loss_mlp": 0.21374512, + "step": 1394, + "time_per_iteration": 2.6980793476104736 + }, + { + "auxiliary_loss_clip": 0.01197262, + "auxiliary_loss_mlp": 0.01054524, + "balance_loss_clip": 1.06277812, + "balance_loss_mlp": 1.03198147, + "epoch": 0.08387193747181723, + "flos": 25172634297600.0, + "grad_norm": 1.7913911757665002, + "language_loss": 0.82989085, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85240865, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 1.34375, + "router_z_loss_mlp": 0.22521973, + "step": 1395, + "time_per_iteration": 2.627490997314453 + }, + { + "auxiliary_loss_clip": 0.01199198, + "auxiliary_loss_mlp": 0.01064339, + "balance_loss_clip": 1.06128299, + "balance_loss_mlp": 1.03994846, + "epoch": 0.0839320607244852, + "flos": 16402907138400.0, + "grad_norm": 2.284004044005246, + "language_loss": 0.82422745, + "learning_rate": 3.969566643154293e-06, + "loss": 0.84686285, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 1.37792969, + "router_z_loss_mlp": 0.24401855, + "step": 1396, + "time_per_iteration": 2.612034797668457 + }, + { + "auxiliary_loss_clip": 0.01197685, + "auxiliary_loss_mlp": 0.01055051, + "balance_loss_clip": 1.06494474, + "balance_loss_mlp": 1.03088713, + "epoch": 0.08399218397715316, + "flos": 28647184520160.0, + "grad_norm": 2.098310299380084, + "language_loss": 0.76946402, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79199135, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 1.32714844, + "router_z_loss_mlp": 0.24157715, + "step": 1397, + "time_per_iteration": 4.127553939819336 + }, + { + "auxiliary_loss_clip": 0.0119632, + "auxiliary_loss_mlp": 0.01048486, + "balance_loss_clip": 1.06150532, + "balance_loss_mlp": 1.02524006, + "epoch": 0.08405230722982113, + "flos": 31628992920480.0, + "grad_norm": 2.9097194336777092, + "language_loss": 0.77799296, + "learning_rate": 3.969431127281516e-06, + "loss": 0.80044109, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 1.34667969, + "router_z_loss_mlp": 0.23254395, + "step": 1398, + "time_per_iteration": 2.6999268531799316 + }, + { + "auxiliary_loss_clip": 0.01190173, + "auxiliary_loss_mlp": 0.01056271, + "balance_loss_clip": 1.05957794, + "balance_loss_mlp": 1.03463459, + "epoch": 0.0841124304824891, + "flos": 21923733572640.0, + "grad_norm": 2.217114805397134, + "language_loss": 0.95053369, + "learning_rate": 3.969363257322304e-06, + "loss": 0.97299814, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.21655273, + "step": 1399, + "time_per_iteration": 5.5859880447387695 + }, + { + "auxiliary_loss_clip": 0.01196076, + "auxiliary_loss_mlp": 0.01063847, + "balance_loss_clip": 1.05768287, + "balance_loss_mlp": 1.03939724, + "epoch": 0.08417255373515707, + "flos": 31273747695360.0, + "grad_norm": 1.8807788307258264, + "language_loss": 0.81684065, + "learning_rate": 3.96929531268464e-06, + "loss": 0.83943987, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 1.38378906, + "router_z_loss_mlp": 0.24450684, + "step": 1400, + "time_per_iteration": 4.092692613601685 + }, + { + "auxiliary_loss_clip": 0.01195332, + "auxiliary_loss_mlp": 0.01062008, + "balance_loss_clip": 1.05899787, + "balance_loss_mlp": 1.03911996, + "epoch": 0.08423267698782504, + "flos": 32030421494400.0, + "grad_norm": 1.917899674684965, + "language_loss": 0.86912596, + "learning_rate": 3.969227293371099e-06, + "loss": 0.89169937, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 1.36425781, + "router_z_loss_mlp": 0.22875977, + "step": 1401, + "time_per_iteration": 2.9404921531677246 + }, + { + "auxiliary_loss_clip": 0.01193446, + "auxiliary_loss_mlp": 0.01068625, + "balance_loss_clip": 1.05792689, + "balance_loss_mlp": 1.04438972, + "epoch": 0.08429280024049302, + "flos": 24550134678720.0, + "grad_norm": 1.9458798974453622, + "language_loss": 0.87549156, + "learning_rate": 3.969159199384263e-06, + "loss": 0.8981123, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 1.35644531, + "router_z_loss_mlp": 0.2421875, + "step": 1402, + "time_per_iteration": 2.65602970123291 + }, + { + "auxiliary_loss_clip": 0.01191747, + "auxiliary_loss_mlp": 0.01054529, + "balance_loss_clip": 1.05843639, + "balance_loss_mlp": 1.03301215, + "epoch": 0.08435292349316098, + "flos": 52378214621760.0, + "grad_norm": 2.565033151056627, + "language_loss": 0.8906523, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.91311508, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.21508789, + "step": 1403, + "time_per_iteration": 2.8535215854644775 + }, + { + "auxiliary_loss_clip": 0.01193212, + "auxiliary_loss_mlp": 0.01056818, + "balance_loss_clip": 1.05608141, + "balance_loss_mlp": 1.03370357, + "epoch": 0.08441304674582895, + "flos": 27890632272960.0, + "grad_norm": 2.023077071241266, + "language_loss": 0.8047784, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82727873, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 1.37207031, + "router_z_loss_mlp": 0.23144531, + "step": 1404, + "time_per_iteration": 2.6669023036956787 + }, + { + "auxiliary_loss_clip": 0.01202725, + "auxiliary_loss_mlp": 0.01070405, + "balance_loss_clip": 1.06309474, + "balance_loss_mlp": 1.04656303, + "epoch": 0.08447316999849692, + "flos": 22814581551840.0, + "grad_norm": 2.175116245529138, + "language_loss": 0.83387369, + "learning_rate": 3.968954469409811e-06, + "loss": 0.85660505, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 1.39550781, + "router_z_loss_mlp": 0.23840332, + "step": 1405, + "time_per_iteration": 2.624776601791382 + }, + { + "auxiliary_loss_clip": 0.01194778, + "auxiliary_loss_mlp": 0.01058092, + "balance_loss_clip": 1.05874479, + "balance_loss_mlp": 1.03656244, + "epoch": 0.08453329325116489, + "flos": 31095942755040.0, + "grad_norm": 1.6159992302932908, + "language_loss": 0.80098391, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82351255, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 1.36035156, + "router_z_loss_mlp": 0.21533203, + "step": 1406, + "time_per_iteration": 2.6772687435150146 + }, + { + "auxiliary_loss_clip": 0.01200174, + "auxiliary_loss_mlp": 0.01070793, + "balance_loss_clip": 1.06481671, + "balance_loss_mlp": 1.04803562, + "epoch": 0.08459341650383286, + "flos": 25526988142560.0, + "grad_norm": 2.6448132225549217, + "language_loss": 0.79984355, + "learning_rate": 3.96881760944111e-06, + "loss": 0.82255328, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.22741699, + "step": 1407, + "time_per_iteration": 2.6586451530456543 + }, + { + "auxiliary_loss_clip": 0.01191091, + "auxiliary_loss_mlp": 0.01054657, + "balance_loss_clip": 1.05815506, + "balance_loss_mlp": 1.03337848, + "epoch": 0.08465353975650082, + "flos": 15914541182400.0, + "grad_norm": 2.1665565760418835, + "language_loss": 0.91853386, + "learning_rate": 3.968749067468819e-06, + "loss": 0.9409914, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 1.33007812, + "router_z_loss_mlp": 0.21276855, + "step": 1408, + "time_per_iteration": 2.7142539024353027 + }, + { + "auxiliary_loss_clip": 0.01078976, + "auxiliary_loss_mlp": 0.01058872, + "balance_loss_clip": 1.02737427, + "balance_loss_mlp": 1.05621636, + "epoch": 0.0847136630091688, + "flos": 74282182112160.0, + "grad_norm": 0.9073275588795946, + "language_loss": 0.61834049, + "learning_rate": 3.968680450841368e-06, + "loss": 0.63971901, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 0.02656555, + "step": 1409, + "time_per_iteration": 3.3205885887145996 + }, + { + "auxiliary_loss_clip": 0.01189813, + "auxiliary_loss_mlp": 0.01065275, + "balance_loss_clip": 1.06038404, + "balance_loss_mlp": 1.04370975, + "epoch": 0.08477378626183676, + "flos": 26902028797920.0, + "grad_norm": 1.694225633377852, + "language_loss": 0.86633146, + "learning_rate": 3.968611759561355e-06, + "loss": 0.88888228, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.21569824, + "step": 1410, + "time_per_iteration": 2.861323595046997 + }, + { + "auxiliary_loss_clip": 0.01195248, + "auxiliary_loss_mlp": 0.01052124, + "balance_loss_clip": 1.06019175, + "balance_loss_mlp": 1.02855599, + "epoch": 0.08483390951450473, + "flos": 20365012971360.0, + "grad_norm": 1.9680859421907797, + "language_loss": 0.7465356, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76900935, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 1.35058594, + "router_z_loss_mlp": 0.2355957, + "step": 1411, + "time_per_iteration": 2.638946771621704 + }, + { + "auxiliary_loss_clip": 0.01075969, + "auxiliary_loss_mlp": 0.01019325, + "balance_loss_clip": 1.02465379, + "balance_loss_mlp": 1.01663256, + "epoch": 0.08489403276717271, + "flos": 62941394100960.0, + "grad_norm": 0.9064151282210208, + "language_loss": 0.56806219, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58901513, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 0.51416016, + "router_z_loss_mlp": 0.02696228, + "step": 1412, + "time_per_iteration": 3.176448345184326 + }, + { + "auxiliary_loss_clip": 0.01189741, + "auxiliary_loss_mlp": 0.01056144, + "balance_loss_clip": 1.05907488, + "balance_loss_mlp": 1.03496087, + "epoch": 0.08495415601984067, + "flos": 20855809964160.0, + "grad_norm": 3.071976805522662, + "language_loss": 0.89466506, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91712391, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.21166992, + "step": 1413, + "time_per_iteration": 2.813044309616089 + }, + { + "auxiliary_loss_clip": 0.01191698, + "auxiliary_loss_mlp": 0.01057241, + "balance_loss_clip": 1.06018329, + "balance_loss_mlp": 1.03457916, + "epoch": 0.08501427927250864, + "flos": 28245958532640.0, + "grad_norm": 1.8172498401818362, + "language_loss": 0.8833257, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90581506, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 1.31445312, + "router_z_loss_mlp": 0.22644043, + "step": 1414, + "time_per_iteration": 2.8262195587158203 + }, + { + "auxiliary_loss_clip": 0.0119209, + "auxiliary_loss_mlp": 0.01058736, + "balance_loss_clip": 1.06003332, + "balance_loss_mlp": 1.0381608, + "epoch": 0.08507440252517662, + "flos": 23261058990720.0, + "grad_norm": 1.877754177197206, + "language_loss": 0.77409595, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79660428, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 1.31933594, + "router_z_loss_mlp": 0.20581055, + "step": 1415, + "time_per_iteration": 2.674513101577759 + }, + { + "auxiliary_loss_clip": 0.01190075, + "auxiliary_loss_mlp": 0.01057443, + "balance_loss_clip": 1.06046891, + "balance_loss_mlp": 1.03642607, + "epoch": 0.08513452577784458, + "flos": 73437470614080.0, + "grad_norm": 1.6880078239819911, + "language_loss": 0.70848429, + "learning_rate": 3.968198044323587e-06, + "loss": 0.73095948, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.21008301, + "step": 1416, + "time_per_iteration": 3.024916172027588 + }, + { + "auxiliary_loss_clip": 0.01195976, + "auxiliary_loss_mlp": 0.01062261, + "balance_loss_clip": 1.06009841, + "balance_loss_mlp": 1.03875256, + "epoch": 0.08519464903051255, + "flos": 33322333392000.0, + "grad_norm": 2.198724771595957, + "language_loss": 0.74845457, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77103692, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.23510742, + "step": 1417, + "time_per_iteration": 2.713344097137451 + }, + { + "auxiliary_loss_clip": 0.01190362, + "auxiliary_loss_mlp": 0.01060825, + "balance_loss_clip": 1.05863357, + "balance_loss_mlp": 1.03889012, + "epoch": 0.08525477228318051, + "flos": 24729114620160.0, + "grad_norm": 2.2016730725709777, + "language_loss": 0.81808031, + "learning_rate": 3.968059542142265e-06, + "loss": 0.84059215, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 1.31542969, + "router_z_loss_mlp": 0.21923828, + "step": 1418, + "time_per_iteration": 2.7415823936462402 + }, + { + "auxiliary_loss_clip": 0.01077031, + "auxiliary_loss_mlp": 0.01022306, + "balance_loss_clip": 1.02442932, + "balance_loss_mlp": 1.01911974, + "epoch": 0.08531489553584849, + "flos": 82505042713440.0, + "grad_norm": 0.8697178221815276, + "language_loss": 0.56663495, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58762836, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 0.52587891, + "router_z_loss_mlp": 0.03186035, + "step": 1419, + "time_per_iteration": 3.2157657146453857 + }, + { + "auxiliary_loss_clip": 0.01190574, + "auxiliary_loss_mlp": 0.0106674, + "balance_loss_clip": 1.05706108, + "balance_loss_mlp": 1.04455543, + "epoch": 0.08537501878851646, + "flos": 33588980026560.0, + "grad_norm": 1.92914964654019, + "language_loss": 0.70163506, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72420824, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.22192383, + "step": 1420, + "time_per_iteration": 2.7164571285247803 + }, + { + "auxiliary_loss_clip": 0.01189413, + "auxiliary_loss_mlp": 0.01049847, + "balance_loss_clip": 1.05801201, + "balance_loss_mlp": 1.0284493, + "epoch": 0.08543514204118442, + "flos": 27801506957760.0, + "grad_norm": 2.1528489970686198, + "language_loss": 0.88298762, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90538025, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.21386719, + "step": 1421, + "time_per_iteration": 2.6507956981658936 + }, + { + "auxiliary_loss_clip": 0.01076261, + "auxiliary_loss_mlp": 0.01002995, + "balance_loss_clip": 1.02401829, + "balance_loss_mlp": 1.00014865, + "epoch": 0.0854952652938524, + "flos": 74420083882080.0, + "grad_norm": 0.7993174320311015, + "language_loss": 0.63436061, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65515316, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 0.52246094, + "router_z_loss_mlp": 0.02842712, + "step": 1422, + "time_per_iteration": 3.209974765777588 + }, + { + "auxiliary_loss_clip": 0.01190157, + "auxiliary_loss_mlp": 0.01061827, + "balance_loss_clip": 1.06187892, + "balance_loss_mlp": 1.0406673, + "epoch": 0.08555538854652037, + "flos": 34212614129280.0, + "grad_norm": 1.9617940050626022, + "language_loss": 0.82850254, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85102236, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.21166992, + "step": 1423, + "time_per_iteration": 2.6897807121276855 + }, + { + "auxiliary_loss_clip": 0.01194741, + "auxiliary_loss_mlp": 0.01060388, + "balance_loss_clip": 1.06093788, + "balance_loss_mlp": 1.038764, + "epoch": 0.08561551179918833, + "flos": 28686682517760.0, + "grad_norm": 1.6704219197495336, + "language_loss": 0.74735969, + "learning_rate": 3.967642244586213e-06, + "loss": 0.76991099, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 1.33691406, + "router_z_loss_mlp": 0.21630859, + "step": 1424, + "time_per_iteration": 2.69999098777771 + }, + { + "auxiliary_loss_clip": 0.01192814, + "auxiliary_loss_mlp": 0.0105988, + "balance_loss_clip": 1.06016779, + "balance_loss_mlp": 1.03900695, + "epoch": 0.08567563505185631, + "flos": 21874673496960.0, + "grad_norm": 2.8644595459868416, + "language_loss": 0.76451957, + "learning_rate": 3.96757243383196e-06, + "loss": 0.78704643, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.20874023, + "step": 1425, + "time_per_iteration": 2.673682689666748 + }, + { + "auxiliary_loss_clip": 0.01188132, + "auxiliary_loss_mlp": 0.01056957, + "balance_loss_clip": 1.05913234, + "balance_loss_mlp": 1.03628588, + "epoch": 0.08573575830452428, + "flos": 24061687688160.0, + "grad_norm": 2.1401376559980942, + "language_loss": 0.93160808, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.954059, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 1.29101562, + "router_z_loss_mlp": 0.20666504, + "step": 1426, + "time_per_iteration": 2.8895628452301025 + }, + { + "auxiliary_loss_clip": 0.01197503, + "auxiliary_loss_mlp": 0.01064902, + "balance_loss_clip": 1.06282902, + "balance_loss_mlp": 1.0412035, + "epoch": 0.08579588155719224, + "flos": 21879900226080.0, + "grad_norm": 2.3963452859088106, + "language_loss": 0.75378454, + "learning_rate": 3.967432588494471e-06, + "loss": 0.77640861, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 1.34570312, + "router_z_loss_mlp": 0.23706055, + "step": 1427, + "time_per_iteration": 2.6339612007141113 + }, + { + "auxiliary_loss_clip": 0.01188269, + "auxiliary_loss_mlp": 0.01059175, + "balance_loss_clip": 1.05833185, + "balance_loss_mlp": 1.03838515, + "epoch": 0.08585600480986022, + "flos": 19563614445600.0, + "grad_norm": 2.4313651655078212, + "language_loss": 0.82323331, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84570771, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.20776367, + "step": 1428, + "time_per_iteration": 2.6572976112365723 + }, + { + "auxiliary_loss_clip": 0.01193461, + "auxiliary_loss_mlp": 0.01068318, + "balance_loss_clip": 1.05850625, + "balance_loss_mlp": 1.04596651, + "epoch": 0.08591612806252819, + "flos": 34968518100000.0, + "grad_norm": 2.298014112232165, + "language_loss": 0.79944277, + "learning_rate": 3.967292444736023e-06, + "loss": 0.82206059, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 1.34960938, + "router_z_loss_mlp": 0.22351074, + "step": 1429, + "time_per_iteration": 2.844911813735962 + }, + { + "auxiliary_loss_clip": 0.01194086, + "auxiliary_loss_mlp": 0.01066478, + "balance_loss_clip": 1.06167579, + "balance_loss_mlp": 1.04519939, + "epoch": 0.08597625131519615, + "flos": 25574427527040.0, + "grad_norm": 1.8795335373162538, + "language_loss": 0.88272715, + "learning_rate": 3.967222260955578e-06, + "loss": 0.9053328, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 1.32324219, + "router_z_loss_mlp": 0.21276855, + "step": 1430, + "time_per_iteration": 2.6939878463745117 + }, + { + "auxiliary_loss_clip": 0.01190912, + "auxiliary_loss_mlp": 0.01073921, + "balance_loss_clip": 1.06274462, + "balance_loss_mlp": 1.05295181, + "epoch": 0.08603637456786412, + "flos": 28377823227840.0, + "grad_norm": 1.6555348675753252, + "language_loss": 0.81687164, + "learning_rate": 3.96715200257787e-06, + "loss": 0.83951992, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 1.28222656, + "router_z_loss_mlp": 0.2097168, + "step": 1431, + "time_per_iteration": 2.7077739238739014 + }, + { + "auxiliary_loss_clip": 0.01188568, + "auxiliary_loss_mlp": 0.01059374, + "balance_loss_clip": 1.05761147, + "balance_loss_mlp": 1.03809488, + "epoch": 0.0860964978205321, + "flos": 35013323861280.0, + "grad_norm": 1.6730273009089716, + "language_loss": 0.77876389, + "learning_rate": 3.967081669605559e-06, + "loss": 0.80124331, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 1.30859375, + "router_z_loss_mlp": 0.21276855, + "step": 1432, + "time_per_iteration": 2.703986644744873 + }, + { + "auxiliary_loss_clip": 0.01188977, + "auxiliary_loss_mlp": 0.01068255, + "balance_loss_clip": 1.05626011, + "balance_loss_mlp": 1.04649973, + "epoch": 0.08615662107320006, + "flos": 23572187248320.0, + "grad_norm": 3.7787538017969724, + "language_loss": 0.73403335, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75660563, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.2175293, + "step": 1433, + "time_per_iteration": 2.6433603763580322 + }, + { + "auxiliary_loss_clip": 0.01192658, + "auxiliary_loss_mlp": 0.01067807, + "balance_loss_clip": 1.05941331, + "balance_loss_mlp": 1.0446682, + "epoch": 0.08621674432586802, + "flos": 19341895124160.0, + "grad_norm": 2.6395277481080113, + "language_loss": 0.85753179, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.88013643, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 1.33300781, + "router_z_loss_mlp": 0.23132324, + "step": 1434, + "time_per_iteration": 2.6100552082061768 + }, + { + "auxiliary_loss_clip": 0.01190169, + "auxiliary_loss_mlp": 0.01059051, + "balance_loss_clip": 1.05770218, + "balance_loss_mlp": 1.03776062, + "epoch": 0.086276867578536, + "flos": 17205116009760.0, + "grad_norm": 2.3844132067639454, + "language_loss": 0.78824484, + "learning_rate": 3.966870223147707e-06, + "loss": 0.81073701, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.21289062, + "step": 1435, + "time_per_iteration": 2.6725635528564453 + }, + { + "auxiliary_loss_clip": 0.01074404, + "auxiliary_loss_mlp": 0.01068622, + "balance_loss_clip": 1.02245474, + "balance_loss_mlp": 1.0658865, + "epoch": 0.08633699083120397, + "flos": 85639501173600.0, + "grad_norm": 1.094834195781812, + "language_loss": 0.57923067, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60066098, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 0.51904297, + "router_z_loss_mlp": 0.02738953, + "step": 1436, + "time_per_iteration": 3.3930041790008545 + }, + { + "auxiliary_loss_clip": 0.01192093, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.05861259, + "balance_loss_mlp": 1.02610135, + "epoch": 0.08639711408387193, + "flos": 36971568724320.0, + "grad_norm": 2.4093485105460815, + "language_loss": 0.69065118, + "learning_rate": 3.966728885918437e-06, + "loss": 0.7130484, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.21533203, + "step": 1437, + "time_per_iteration": 4.240413188934326 + }, + { + "auxiliary_loss_clip": 0.01189204, + "auxiliary_loss_mlp": 0.0105574, + "balance_loss_clip": 1.05698192, + "balance_loss_mlp": 1.03568864, + "epoch": 0.08645723733653991, + "flos": 24767721237600.0, + "grad_norm": 2.298795148692288, + "language_loss": 0.73325932, + "learning_rate": 3.966658105434627e-06, + "loss": 0.75570869, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.20043945, + "step": 1438, + "time_per_iteration": 4.2292585372924805 + }, + { + "auxiliary_loss_clip": 0.01187392, + "auxiliary_loss_mlp": 0.01052086, + "balance_loss_clip": 1.05917811, + "balance_loss_mlp": 1.03040159, + "epoch": 0.08651736058920788, + "flos": 40134058791840.0, + "grad_norm": 2.0347259849525163, + "language_loss": 0.64117885, + "learning_rate": 3.966587250374945e-06, + "loss": 0.66357362, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.21679688, + "step": 1439, + "time_per_iteration": 5.67034649848938 + }, + { + "auxiliary_loss_clip": 0.01189657, + "auxiliary_loss_mlp": 0.01054446, + "balance_loss_clip": 1.05926275, + "balance_loss_mlp": 1.03223681, + "epoch": 0.08657748384187584, + "flos": 27622364947200.0, + "grad_norm": 2.1038500705333516, + "language_loss": 0.87624657, + "learning_rate": 3.966516320742077e-06, + "loss": 0.8986876, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.2220459, + "step": 1440, + "time_per_iteration": 2.704383134841919 + }, + { + "auxiliary_loss_clip": 0.0119481, + "auxiliary_loss_mlp": 0.0105704, + "balance_loss_clip": 1.05916023, + "balance_loss_mlp": 1.03391373, + "epoch": 0.08663760709454381, + "flos": 28868336599680.0, + "grad_norm": 2.9508726895901325, + "language_loss": 0.83583081, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85834938, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 1.35742188, + "router_z_loss_mlp": 0.23120117, + "step": 1441, + "time_per_iteration": 2.635464906692505 + }, + { + "auxiliary_loss_clip": 0.01071685, + "auxiliary_loss_mlp": 0.01010904, + "balance_loss_clip": 1.0198282, + "balance_loss_mlp": 1.00833488, + "epoch": 0.08669773034721179, + "flos": 76491231353280.0, + "grad_norm": 0.843486402004246, + "language_loss": 0.60455942, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62538528, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 0.51708984, + "router_z_loss_mlp": 0.0256958, + "step": 1442, + "time_per_iteration": 3.3652637004852295 + }, + { + "auxiliary_loss_clip": 0.01194289, + "auxiliary_loss_mlp": 0.0105399, + "balance_loss_clip": 1.05831099, + "balance_loss_mlp": 1.03267527, + "epoch": 0.08675785359987975, + "flos": 25218534025440.0, + "grad_norm": 3.3886503104426935, + "language_loss": 0.78514099, + "learning_rate": 3.96630308443127e-06, + "loss": 0.80762374, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 1.35839844, + "router_z_loss_mlp": 0.2130127, + "step": 1443, + "time_per_iteration": 2.6527199745178223 + }, + { + "auxiliary_loss_clip": 0.01188207, + "auxiliary_loss_mlp": 0.01051767, + "balance_loss_clip": 1.05560291, + "balance_loss_mlp": 1.03017831, + "epoch": 0.08681797685254772, + "flos": 32874275779200.0, + "grad_norm": 1.6303189636395594, + "language_loss": 0.82235944, + "learning_rate": 3.966231856532584e-06, + "loss": 0.84475917, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.21557617, + "step": 1444, + "time_per_iteration": 2.719001054763794 + }, + { + "auxiliary_loss_clip": 0.0119257, + "auxiliary_loss_mlp": 0.01054681, + "balance_loss_clip": 1.05816078, + "balance_loss_mlp": 1.03356886, + "epoch": 0.0868781001052157, + "flos": 21612969970560.0, + "grad_norm": 2.0154077856171293, + "language_loss": 0.87211466, + "learning_rate": 3.966160554074189e-06, + "loss": 0.89458716, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 1.34277344, + "router_z_loss_mlp": 0.21081543, + "step": 1445, + "time_per_iteration": 2.6517679691314697 + }, + { + "auxiliary_loss_clip": 0.01194072, + "auxiliary_loss_mlp": 0.01055762, + "balance_loss_clip": 1.06226182, + "balance_loss_mlp": 1.0361402, + "epoch": 0.08693822335788366, + "flos": 24278180280480.0, + "grad_norm": 2.1517430268988367, + "language_loss": 0.8188414, + "learning_rate": 3.96608917705879e-06, + "loss": 0.84133977, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.19616699, + "step": 1446, + "time_per_iteration": 2.6198177337646484 + }, + { + "auxiliary_loss_clip": 0.01070508, + "auxiliary_loss_mlp": 0.01006888, + "balance_loss_clip": 1.01826537, + "balance_loss_mlp": 1.00426841, + "epoch": 0.08699834661055163, + "flos": 81784220356800.0, + "grad_norm": 0.7314179056517691, + "language_loss": 0.54763931, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56841326, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 0.52294922, + "router_z_loss_mlp": 0.0262146, + "step": 1447, + "time_per_iteration": 3.2669641971588135 + }, + { + "auxiliary_loss_clip": 0.01185315, + "auxiliary_loss_mlp": 0.01058161, + "balance_loss_clip": 1.05813026, + "balance_loss_mlp": 1.03778851, + "epoch": 0.0870584698632196, + "flos": 16448361176160.0, + "grad_norm": 3.1585708117752147, + "language_loss": 0.84442818, + "learning_rate": 3.965946199367804e-06, + "loss": 0.86686295, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.20385742, + "step": 1448, + "time_per_iteration": 2.6506009101867676 + }, + { + "auxiliary_loss_clip": 0.01192722, + "auxiliary_loss_mlp": 0.01053985, + "balance_loss_clip": 1.05919981, + "balance_loss_mlp": 1.03277731, + "epoch": 0.08711859311588757, + "flos": 19653469071840.0, + "grad_norm": 3.7787120915888375, + "language_loss": 0.8072812, + "learning_rate": 3.965874598697638e-06, + "loss": 0.82974827, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 1.33496094, + "router_z_loss_mlp": 0.21203613, + "step": 1449, + "time_per_iteration": 2.8661766052246094 + }, + { + "auxiliary_loss_clip": 0.01189673, + "auxiliary_loss_mlp": 0.01053324, + "balance_loss_clip": 1.06052589, + "balance_loss_mlp": 1.03246236, + "epoch": 0.08717871636855554, + "flos": 46945419536160.0, + "grad_norm": 1.6198203430244686, + "language_loss": 0.71167809, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73410809, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 1.29199219, + "router_z_loss_mlp": 0.20861816, + "step": 1450, + "time_per_iteration": 2.790604829788208 + }, + { + "auxiliary_loss_clip": 0.01190573, + "auxiliary_loss_mlp": 0.01055365, + "balance_loss_clip": 1.06135714, + "balance_loss_mlp": 1.03511143, + "epoch": 0.0872388396212235, + "flos": 21476243201760.0, + "grad_norm": 1.8202512531445179, + "language_loss": 0.83635849, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85881793, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.20251465, + "step": 1451, + "time_per_iteration": 2.675311326980591 + }, + { + "auxiliary_loss_clip": 0.01184763, + "auxiliary_loss_mlp": 0.01050252, + "balance_loss_clip": 1.05724883, + "balance_loss_mlp": 1.03030872, + "epoch": 0.08729896287389148, + "flos": 30822408182880.0, + "grad_norm": 1.9453782922026412, + "language_loss": 0.74709761, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76944774, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.19946289, + "step": 1452, + "time_per_iteration": 2.689758062362671 + }, + { + "auxiliary_loss_clip": 0.01192062, + "auxiliary_loss_mlp": 0.01056384, + "balance_loss_clip": 1.05918717, + "balance_loss_mlp": 1.0352478, + "epoch": 0.08735908612655945, + "flos": 19074397626720.0, + "grad_norm": 2.8905564299581528, + "language_loss": 0.79929912, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82178354, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 1.328125, + "router_z_loss_mlp": 0.21130371, + "step": 1453, + "time_per_iteration": 2.6506402492523193 + }, + { + "auxiliary_loss_clip": 0.01190493, + "auxiliary_loss_mlp": 0.01060754, + "balance_loss_clip": 1.06088901, + "balance_loss_mlp": 1.03920126, + "epoch": 0.08741920937922741, + "flos": 24818442521760.0, + "grad_norm": 1.9582383276706583, + "language_loss": 0.70925254, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73176503, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 1.29492188, + "router_z_loss_mlp": 0.2154541, + "step": 1454, + "time_per_iteration": 2.664128065109253 + }, + { + "auxiliary_loss_clip": 0.01070022, + "auxiliary_loss_mlp": 0.01023584, + "balance_loss_clip": 1.01870406, + "balance_loss_mlp": 1.0207665, + "epoch": 0.08747933263189539, + "flos": 76651620213600.0, + "grad_norm": 0.8009222993388483, + "language_loss": 0.5860886, + "learning_rate": 3.96544342930248e-06, + "loss": 0.60702467, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 0.51269531, + "router_z_loss_mlp": 0.02816772, + "step": 1455, + "time_per_iteration": 3.28842830657959 + }, + { + "auxiliary_loss_clip": 0.01192036, + "auxiliary_loss_mlp": 0.01060798, + "balance_loss_clip": 1.05972767, + "balance_loss_mlp": 1.03994799, + "epoch": 0.08753945588456336, + "flos": 40311418042080.0, + "grad_norm": 1.7822453200196167, + "language_loss": 0.7747668, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79729515, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 1.32421875, + "router_z_loss_mlp": 0.20849609, + "step": 1456, + "time_per_iteration": 2.7928335666656494 + }, + { + "auxiliary_loss_clip": 0.01188904, + "auxiliary_loss_mlp": 0.01049745, + "balance_loss_clip": 1.05864418, + "balance_loss_mlp": 1.02926528, + "epoch": 0.08759957913723132, + "flos": 43376476752000.0, + "grad_norm": 1.9924348584165705, + "language_loss": 0.72213411, + "learning_rate": 3.96529910990316e-06, + "loss": 0.7445206, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 1.29980469, + "router_z_loss_mlp": 0.20495605, + "step": 1457, + "time_per_iteration": 2.7594399452209473 + }, + { + "auxiliary_loss_clip": 0.01183709, + "auxiliary_loss_mlp": 0.01045766, + "balance_loss_clip": 1.05734634, + "balance_loss_mlp": 1.02576303, + "epoch": 0.0876597023898993, + "flos": 29176993303200.0, + "grad_norm": 1.5503113708136311, + "language_loss": 0.86737621, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88967097, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.20007324, + "step": 1458, + "time_per_iteration": 2.7060205936431885 + }, + { + "auxiliary_loss_clip": 0.01193202, + "auxiliary_loss_mlp": 0.01057913, + "balance_loss_clip": 1.06268001, + "balance_loss_mlp": 1.0377543, + "epoch": 0.08771982564256726, + "flos": 22102551444960.0, + "grad_norm": 2.033364000066896, + "language_loss": 0.80252898, + "learning_rate": 3.965154492406486e-06, + "loss": 0.82504016, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.20166016, + "step": 1459, + "time_per_iteration": 2.653327703475952 + }, + { + "auxiliary_loss_clip": 0.01194811, + "auxiliary_loss_mlp": 0.01054134, + "balance_loss_clip": 1.06063449, + "balance_loss_mlp": 1.03329623, + "epoch": 0.08777994889523523, + "flos": 21612038073120.0, + "grad_norm": 2.491419258659155, + "language_loss": 0.8396765, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86216599, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 1.34082031, + "router_z_loss_mlp": 0.20825195, + "step": 1460, + "time_per_iteration": 2.6753973960876465 + }, + { + "auxiliary_loss_clip": 0.01184261, + "auxiliary_loss_mlp": 0.0105949, + "balance_loss_clip": 1.05614924, + "balance_loss_mlp": 1.04034495, + "epoch": 0.0878400721479032, + "flos": 15642627301440.0, + "grad_norm": 2.528272202062766, + "language_loss": 0.80338997, + "learning_rate": 3.965009576834394e-06, + "loss": 0.82582748, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.19152832, + "step": 1461, + "time_per_iteration": 2.8812408447265625 + }, + { + "auxiliary_loss_clip": 0.01192907, + "auxiliary_loss_mlp": 0.01055385, + "balance_loss_clip": 1.06110454, + "balance_loss_mlp": 1.03517902, + "epoch": 0.08790019540057117, + "flos": 32205147121440.0, + "grad_norm": 1.987969007231633, + "language_loss": 0.76212883, + "learning_rate": 3.964937007276932e-06, + "loss": 0.7846117, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 1.31835938, + "router_z_loss_mlp": 0.2019043, + "step": 1462, + "time_per_iteration": 2.7232964038848877 + }, + { + "auxiliary_loss_clip": 0.01192825, + "auxiliary_loss_mlp": 0.0106117, + "balance_loss_clip": 1.06057835, + "balance_loss_mlp": 1.03949738, + "epoch": 0.08796031865323914, + "flos": 23347388613600.0, + "grad_norm": 2.6093941637788034, + "language_loss": 0.74395514, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.76649511, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.21643066, + "step": 1463, + "time_per_iteration": 2.6436359882354736 + }, + { + "auxiliary_loss_clip": 0.01194252, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_clip": 1.05914259, + "balance_loss_mlp": 1.03606689, + "epoch": 0.0880204419059071, + "flos": 31804407341280.0, + "grad_norm": 2.6107270819384625, + "language_loss": 0.83279061, + "learning_rate": 3.964791644632941e-06, + "loss": 0.85530728, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 1.35253906, + "router_z_loss_mlp": 0.21362305, + "step": 1464, + "time_per_iteration": 2.6842644214630127 + }, + { + "auxiliary_loss_clip": 0.01188303, + "auxiliary_loss_mlp": 0.01059908, + "balance_loss_clip": 1.0586369, + "balance_loss_mlp": 1.04016697, + "epoch": 0.08808056515857508, + "flos": 27306050477760.0, + "grad_norm": 2.2697884074102443, + "language_loss": 0.78213149, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80461359, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 1.29589844, + "router_z_loss_mlp": 0.19750977, + "step": 1465, + "time_per_iteration": 2.6731650829315186 + }, + { + "auxiliary_loss_clip": 0.01195481, + "auxiliary_loss_mlp": 0.01057594, + "balance_loss_clip": 1.06083846, + "balance_loss_mlp": 1.03810263, + "epoch": 0.08814068841124305, + "flos": 28290926363040.0, + "grad_norm": 2.5874473709855987, + "language_loss": 0.85308158, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.87561238, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 1.34472656, + "router_z_loss_mlp": 0.19470215, + "step": 1466, + "time_per_iteration": 2.6394999027252197 + }, + { + "auxiliary_loss_clip": 0.01189498, + "auxiliary_loss_mlp": 0.01049048, + "balance_loss_clip": 1.05847347, + "balance_loss_mlp": 1.02853179, + "epoch": 0.08820081166391101, + "flos": 30694878836640.0, + "grad_norm": 2.2905384224622614, + "language_loss": 0.8403464, + "learning_rate": 3.964573041885641e-06, + "loss": 0.86273181, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 0.20507812, + "step": 1467, + "time_per_iteration": 2.6897804737091064 + }, + { + "auxiliary_loss_clip": 0.01190767, + "auxiliary_loss_mlp": 0.01054135, + "balance_loss_clip": 1.06034184, + "balance_loss_mlp": 1.03411996, + "epoch": 0.08826093491657899, + "flos": 27127718812800.0, + "grad_norm": 1.7799277878379085, + "language_loss": 0.75190306, + "learning_rate": 3.964500025305907e-06, + "loss": 0.77435207, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.20019531, + "step": 1468, + "time_per_iteration": 2.646702289581299 + }, + { + "auxiliary_loss_clip": 0.01188329, + "auxiliary_loss_mlp": 0.01052669, + "balance_loss_clip": 1.06147122, + "balance_loss_mlp": 1.03361917, + "epoch": 0.08832105816924696, + "flos": 26999338603680.0, + "grad_norm": 1.8523596172046968, + "language_loss": 0.80390251, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.82631242, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.19055176, + "step": 1469, + "time_per_iteration": 2.680569887161255 + }, + { + "auxiliary_loss_clip": 0.01191547, + "auxiliary_loss_mlp": 0.01055388, + "balance_loss_clip": 1.05997503, + "balance_loss_mlp": 1.03580177, + "epoch": 0.08838118142191492, + "flos": 21434435719200.0, + "grad_norm": 2.070829612976428, + "language_loss": 0.77611637, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.79858565, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 1.31347656, + "router_z_loss_mlp": 0.19580078, + "step": 1470, + "time_per_iteration": 2.613910436630249 + }, + { + "auxiliary_loss_clip": 0.01186052, + "auxiliary_loss_mlp": 0.0105992, + "balance_loss_clip": 1.05943751, + "balance_loss_mlp": 1.0394516, + "epoch": 0.0884413046745829, + "flos": 25352384067360.0, + "grad_norm": 1.9910060932756855, + "language_loss": 0.84192097, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86438072, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.20471191, + "step": 1471, + "time_per_iteration": 2.652069568634033 + }, + { + "auxiliary_loss_clip": 0.01183711, + "auxiliary_loss_mlp": 0.01048129, + "balance_loss_clip": 1.06153214, + "balance_loss_mlp": 1.03077209, + "epoch": 0.08850142792725087, + "flos": 26997798947040.0, + "grad_norm": 1.628077177457495, + "language_loss": 0.83388734, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85620576, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.17358398, + "step": 1472, + "time_per_iteration": 2.6537251472473145 + }, + { + "auxiliary_loss_clip": 0.01189923, + "auxiliary_loss_mlp": 0.01051347, + "balance_loss_clip": 1.06131494, + "balance_loss_mlp": 1.03059292, + "epoch": 0.08856155117991883, + "flos": 27355556243520.0, + "grad_norm": 2.9275788167063883, + "language_loss": 0.8285712, + "learning_rate": 3.964133825052146e-06, + "loss": 0.85098392, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.20739746, + "step": 1473, + "time_per_iteration": 2.656031608581543 + }, + { + "auxiliary_loss_clip": 0.01188428, + "auxiliary_loss_mlp": 0.01055455, + "balance_loss_clip": 1.05895472, + "balance_loss_mlp": 1.0373826, + "epoch": 0.0886216744325868, + "flos": 36529710255360.0, + "grad_norm": 2.1756453654151424, + "language_loss": 0.78825212, + "learning_rate": 3.964060361549816e-06, + "loss": 0.81069088, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.18066406, + "step": 1474, + "time_per_iteration": 2.9850428104400635 + }, + { + "auxiliary_loss_clip": 0.01189003, + "auxiliary_loss_mlp": 0.01051019, + "balance_loss_clip": 1.06286645, + "balance_loss_mlp": 1.03107548, + "epoch": 0.08868179768525478, + "flos": 29264619479040.0, + "grad_norm": 1.9966510998523883, + "language_loss": 0.78643584, + "learning_rate": 3.963986823570121e-06, + "loss": 0.8088361, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.19958496, + "step": 1475, + "time_per_iteration": 2.7421751022338867 + }, + { + "auxiliary_loss_clip": 0.01188154, + "auxiliary_loss_mlp": 0.01048996, + "balance_loss_clip": 1.05936325, + "balance_loss_mlp": 1.02863467, + "epoch": 0.08874192093792274, + "flos": 52687195463520.0, + "grad_norm": 1.793213040879682, + "language_loss": 0.74143815, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76380962, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.20361328, + "step": 1476, + "time_per_iteration": 4.373602867126465 + }, + { + "auxiliary_loss_clip": 0.0118858, + "auxiliary_loss_mlp": 0.01059997, + "balance_loss_clip": 1.06091332, + "balance_loss_mlp": 1.03939807, + "epoch": 0.0888020441905907, + "flos": 40087672856640.0, + "grad_norm": 3.8203093168685234, + "language_loss": 0.7481066, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.77059245, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.20593262, + "step": 1477, + "time_per_iteration": 4.2261643409729 + }, + { + "auxiliary_loss_clip": 0.01187252, + "auxiliary_loss_mlp": 0.01051868, + "balance_loss_clip": 1.0591135, + "balance_loss_mlp": 1.03120899, + "epoch": 0.08886216744325869, + "flos": 28247011981920.0, + "grad_norm": 2.0498767146101584, + "language_loss": 0.87193692, + "learning_rate": 3.963765762794739e-06, + "loss": 0.89432812, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 1.28125, + "router_z_loss_mlp": 0.20654297, + "step": 1478, + "time_per_iteration": 4.164849281311035 + }, + { + "auxiliary_loss_clip": 0.01185482, + "auxiliary_loss_mlp": 0.01055822, + "balance_loss_clip": 1.05808473, + "balance_loss_mlp": 1.03691506, + "epoch": 0.08892229069592665, + "flos": 28469420097120.0, + "grad_norm": 1.6642550356276582, + "language_loss": 0.77609801, + "learning_rate": 3.963691926933495e-06, + "loss": 0.79851103, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.18920898, + "step": 1479, + "time_per_iteration": 4.125023126602173 + }, + { + "auxiliary_loss_clip": 0.01183512, + "auxiliary_loss_mlp": 0.01050581, + "balance_loss_clip": 1.05637729, + "balance_loss_mlp": 1.02995801, + "epoch": 0.08898241394859462, + "flos": 31986507113280.0, + "grad_norm": 2.3472057728389832, + "language_loss": 0.78022724, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.8025682, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.20617676, + "step": 1480, + "time_per_iteration": 2.7134556770324707 + }, + { + "auxiliary_loss_clip": 0.01187327, + "auxiliary_loss_mlp": 0.01061114, + "balance_loss_clip": 1.05692983, + "balance_loss_mlp": 1.03966856, + "epoch": 0.0890425372012626, + "flos": 28736674490880.0, + "grad_norm": 1.5579719707445572, + "language_loss": 0.66558105, + "learning_rate": 3.963544031823624e-06, + "loss": 0.68806541, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 1.3046875, + "router_z_loss_mlp": 0.21447754, + "step": 1481, + "time_per_iteration": 2.7673556804656982 + }, + { + "auxiliary_loss_clip": 0.01187944, + "auxiliary_loss_mlp": 0.0104791, + "balance_loss_clip": 1.06055546, + "balance_loss_mlp": 1.02838326, + "epoch": 0.08910266045393056, + "flos": 28068234626880.0, + "grad_norm": 1.9334127044619898, + "language_loss": 0.96303725, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98539579, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 1.27246094, + "router_z_loss_mlp": 0.1953125, + "step": 1482, + "time_per_iteration": 2.6561548709869385 + }, + { + "auxiliary_loss_clip": 0.01193438, + "auxiliary_loss_mlp": 0.01051205, + "balance_loss_clip": 1.06107271, + "balance_loss_mlp": 1.03098762, + "epoch": 0.08916278370659853, + "flos": 38970324516960.0, + "grad_norm": 2.0976845433295344, + "language_loss": 0.78728855, + "learning_rate": 3.96339583888261e-06, + "loss": 0.80973494, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 1.32226562, + "router_z_loss_mlp": 0.20202637, + "step": 1483, + "time_per_iteration": 2.722421407699585 + }, + { + "auxiliary_loss_clip": 0.01186024, + "auxiliary_loss_mlp": 0.01077552, + "balance_loss_clip": 1.05834794, + "balance_loss_mlp": 1.05652356, + "epoch": 0.08922290695926649, + "flos": 21390359268960.0, + "grad_norm": 2.3101786787902925, + "language_loss": 0.85263419, + "learning_rate": 3.963321630732448e-06, + "loss": 0.87526995, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 1.27636719, + "router_z_loss_mlp": 0.21020508, + "step": 1484, + "time_per_iteration": 2.6242170333862305 + }, + { + "auxiliary_loss_clip": 0.01194976, + "auxiliary_loss_mlp": 0.01063574, + "balance_loss_clip": 1.06241369, + "balance_loss_mlp": 1.0421164, + "epoch": 0.08928303021193447, + "flos": 39199863673440.0, + "grad_norm": 1.7454580416960586, + "language_loss": 0.80232984, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82491535, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 1.32617188, + "router_z_loss_mlp": 0.21447754, + "step": 1485, + "time_per_iteration": 2.923107147216797 + }, + { + "auxiliary_loss_clip": 0.01186651, + "auxiliary_loss_mlp": 0.0105547, + "balance_loss_clip": 1.05778837, + "balance_loss_mlp": 1.03483474, + "epoch": 0.08934315346460243, + "flos": 26997515326080.0, + "grad_norm": 2.022631538628077, + "language_loss": 0.83321089, + "learning_rate": 3.96317299108688e-06, + "loss": 0.85563207, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.2064209, + "step": 1486, + "time_per_iteration": 2.6613306999206543 + }, + { + "auxiliary_loss_clip": 0.01187168, + "auxiliary_loss_mlp": 0.01061732, + "balance_loss_clip": 1.05991268, + "balance_loss_mlp": 1.04127598, + "epoch": 0.0894032767172704, + "flos": 27534657736800.0, + "grad_norm": 2.0755629856644906, + "language_loss": 0.76498717, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78747618, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.2043457, + "step": 1487, + "time_per_iteration": 2.6650376319885254 + }, + { + "auxiliary_loss_clip": 0.01185409, + "auxiliary_loss_mlp": 0.01055263, + "balance_loss_clip": 1.05767572, + "balance_loss_mlp": 1.0332334, + "epoch": 0.08946339996993838, + "flos": 24639989304960.0, + "grad_norm": 2.1258787958925085, + "language_loss": 0.83023512, + "learning_rate": 3.963024053666449e-06, + "loss": 0.85264188, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 1.27929688, + "router_z_loss_mlp": 0.22033691, + "step": 1488, + "time_per_iteration": 2.638496160507202 + }, + { + "auxiliary_loss_clip": 0.01184725, + "auxiliary_loss_mlp": 0.01047492, + "balance_loss_clip": 1.05916142, + "balance_loss_mlp": 1.02761972, + "epoch": 0.08952352322260634, + "flos": 59011284218400.0, + "grad_norm": 1.8029261824400327, + "language_loss": 0.71902817, + "learning_rate": 3.962949473297718e-06, + "loss": 0.74135035, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.19848633, + "step": 1489, + "time_per_iteration": 2.8997654914855957 + }, + { + "auxiliary_loss_clip": 0.01183875, + "auxiliary_loss_mlp": 0.01048277, + "balance_loss_clip": 1.05578637, + "balance_loss_mlp": 1.02811861, + "epoch": 0.08958364647527431, + "flos": 38795072165280.0, + "grad_norm": 1.980865655902791, + "language_loss": 0.89758098, + "learning_rate": 3.962874818493745e-06, + "loss": 0.91990244, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.20153809, + "step": 1490, + "time_per_iteration": 2.76466965675354 + }, + { + "auxiliary_loss_clip": 0.01193972, + "auxiliary_loss_mlp": 0.0106359, + "balance_loss_clip": 1.05933833, + "balance_loss_mlp": 1.0432651, + "epoch": 0.08964376972794229, + "flos": 28514063789280.0, + "grad_norm": 3.100375676689709, + "language_loss": 0.73333806, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.75591367, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 1.34765625, + "router_z_loss_mlp": 0.20324707, + "step": 1491, + "time_per_iteration": 2.8720383644104004 + }, + { + "auxiliary_loss_clip": 0.01185689, + "auxiliary_loss_mlp": 0.0105016, + "balance_loss_clip": 1.05866241, + "balance_loss_mlp": 1.03093195, + "epoch": 0.08970389298061025, + "flos": 28424533301280.0, + "grad_norm": 1.7417692502126605, + "language_loss": 0.77483302, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.7971915, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.19213867, + "step": 1492, + "time_per_iteration": 2.9044039249420166 + }, + { + "auxiliary_loss_clip": 0.0118474, + "auxiliary_loss_mlp": 0.01054049, + "balance_loss_clip": 1.05985427, + "balance_loss_mlp": 1.03390229, + "epoch": 0.08976401623327822, + "flos": 41197484982240.0, + "grad_norm": 2.232194745436692, + "language_loss": 0.70968068, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73206854, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.20141602, + "step": 1493, + "time_per_iteration": 2.795586585998535 + }, + { + "auxiliary_loss_clip": 0.01184444, + "auxiliary_loss_mlp": 0.01055412, + "balance_loss_clip": 1.0577836, + "balance_loss_mlp": 1.03433597, + "epoch": 0.08982413948594618, + "flos": 29177236406880.0, + "grad_norm": 2.1395411425045148, + "language_loss": 0.86602616, + "learning_rate": 3.962575454982109e-06, + "loss": 0.88842475, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.21081543, + "step": 1494, + "time_per_iteration": 2.7226953506469727 + }, + { + "auxiliary_loss_clip": 0.01184779, + "auxiliary_loss_mlp": 0.01066259, + "balance_loss_clip": 1.05836225, + "balance_loss_mlp": 1.04474235, + "epoch": 0.08988426273861416, + "flos": 20544843775680.0, + "grad_norm": 1.85898035827943, + "language_loss": 0.83116812, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85367852, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.21520996, + "step": 1495, + "time_per_iteration": 2.6291050910949707 + }, + { + "auxiliary_loss_clip": 0.01191233, + "auxiliary_loss_mlp": 0.01052983, + "balance_loss_clip": 1.06279898, + "balance_loss_mlp": 1.03301597, + "epoch": 0.08994438599128213, + "flos": 18050469433920.0, + "grad_norm": 2.212579686978234, + "language_loss": 0.70337522, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72581744, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.19958496, + "step": 1496, + "time_per_iteration": 2.6326401233673096 + }, + { + "auxiliary_loss_clip": 0.01185169, + "auxiliary_loss_mlp": 0.01048474, + "balance_loss_clip": 1.05872703, + "balance_loss_mlp": 1.02996063, + "epoch": 0.09000450924395009, + "flos": 21212108638560.0, + "grad_norm": 1.7026312703155735, + "language_loss": 0.7950424, + "learning_rate": 3.962350150917351e-06, + "loss": 0.81737888, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.18518066, + "step": 1497, + "time_per_iteration": 2.8187637329101562 + }, + { + "auxiliary_loss_clip": 0.01188109, + "auxiliary_loss_mlp": 0.01060445, + "balance_loss_clip": 1.05614519, + "balance_loss_mlp": 1.03944063, + "epoch": 0.09006463249661807, + "flos": 29627117297280.0, + "grad_norm": 4.660869836107259, + "language_loss": 0.82678163, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.84926713, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 1.32128906, + "router_z_loss_mlp": 0.21008301, + "step": 1498, + "time_per_iteration": 2.738529920578003 + }, + { + "auxiliary_loss_clip": 0.01189078, + "auxiliary_loss_mlp": 0.01065335, + "balance_loss_clip": 1.05932307, + "balance_loss_mlp": 1.04453301, + "epoch": 0.09012475574928604, + "flos": 16669837393920.0, + "grad_norm": 2.1001195518249514, + "language_loss": 0.78892708, + "learning_rate": 3.962199576140195e-06, + "loss": 0.81147116, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.20812988, + "step": 1499, + "time_per_iteration": 2.598039150238037 + }, + { + "auxiliary_loss_clip": 0.01184147, + "auxiliary_loss_mlp": 0.01056319, + "balance_loss_clip": 1.06036496, + "balance_loss_mlp": 1.03658962, + "epoch": 0.090184879001954, + "flos": 28465084748160.0, + "grad_norm": 1.682480519243795, + "language_loss": 0.93175441, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95415902, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.19726562, + "step": 1500, + "time_per_iteration": 2.7372186183929443 + }, + { + "auxiliary_loss_clip": 0.0118477, + "auxiliary_loss_mlp": 0.01050768, + "balance_loss_clip": 1.05621696, + "balance_loss_mlp": 1.02942991, + "epoch": 0.09024500225462198, + "flos": 28068113075040.0, + "grad_norm": 2.151039558515049, + "language_loss": 0.74104446, + "learning_rate": 3.962048703735822e-06, + "loss": 0.7633999, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.21374512, + "step": 1501, + "time_per_iteration": 2.659773349761963 + }, + { + "auxiliary_loss_clip": 0.01076048, + "auxiliary_loss_mlp": 0.01004769, + "balance_loss_clip": 1.02609515, + "balance_loss_mlp": 1.00161552, + "epoch": 0.09030512550728995, + "flos": 75883074851520.0, + "grad_norm": 0.7307515794943135, + "language_loss": 0.58322883, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60403699, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 0.49926758, + "router_z_loss_mlp": 0.03153992, + "step": 1502, + "time_per_iteration": 3.383147954940796 + }, + { + "auxiliary_loss_clip": 0.01180916, + "auxiliary_loss_mlp": 0.01052361, + "balance_loss_clip": 1.05584145, + "balance_loss_mlp": 1.03334713, + "epoch": 0.09036524875995791, + "flos": 47346078281760.0, + "grad_norm": 2.777336057011816, + "language_loss": 0.69783592, + "learning_rate": 3.961897533727119e-06, + "loss": 0.72016871, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.19006348, + "step": 1503, + "time_per_iteration": 2.850828170776367 + }, + { + "auxiliary_loss_clip": 0.0118665, + "auxiliary_loss_mlp": 0.01054099, + "balance_loss_clip": 1.05734527, + "balance_loss_mlp": 1.03471613, + "epoch": 0.09042537201262588, + "flos": 26466288438240.0, + "grad_norm": 2.076380292948186, + "language_loss": 0.85876918, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88117665, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.19372559, + "step": 1504, + "time_per_iteration": 2.7055087089538574 + }, + { + "auxiliary_loss_clip": 0.01193098, + "auxiliary_loss_mlp": 0.0106402, + "balance_loss_clip": 1.05897355, + "balance_loss_mlp": 1.04049993, + "epoch": 0.09048549526529386, + "flos": 27170944400160.0, + "grad_norm": 2.0596362983389476, + "language_loss": 0.72415352, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74672472, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 1.33984375, + "router_z_loss_mlp": 0.23522949, + "step": 1505, + "time_per_iteration": 2.672253131866455 + }, + { + "auxiliary_loss_clip": 0.01180335, + "auxiliary_loss_mlp": 0.01055259, + "balance_loss_clip": 1.05593657, + "balance_loss_mlp": 1.03464818, + "epoch": 0.09054561851796182, + "flos": 17827858732320.0, + "grad_norm": 2.7944491148241317, + "language_loss": 0.8081705, + "learning_rate": 3.961670220756114e-06, + "loss": 0.83052647, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.20629883, + "step": 1506, + "time_per_iteration": 2.68074369430542 + }, + { + "auxiliary_loss_clip": 0.01182425, + "auxiliary_loss_mlp": 0.01051923, + "balance_loss_clip": 1.05763221, + "balance_loss_mlp": 1.0330162, + "epoch": 0.09060574177062979, + "flos": 33722262826560.0, + "grad_norm": 1.9538448220353772, + "language_loss": 0.76164967, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78399312, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.18933105, + "step": 1507, + "time_per_iteration": 2.6980600357055664 + }, + { + "auxiliary_loss_clip": 0.0107106, + "auxiliary_loss_mlp": 0.01002633, + "balance_loss_clip": 1.02156067, + "balance_loss_mlp": 0.99940574, + "epoch": 0.09066586502329776, + "flos": 80638678340640.0, + "grad_norm": 0.7672055522207717, + "language_loss": 0.57674873, + "learning_rate": 3.961518306836998e-06, + "loss": 0.59748566, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 0.49487305, + "router_z_loss_mlp": 0.03230286, + "step": 1508, + "time_per_iteration": 3.1570231914520264 + }, + { + "auxiliary_loss_clip": 0.01183648, + "auxiliary_loss_mlp": 0.01054301, + "balance_loss_clip": 1.05666614, + "balance_loss_mlp": 1.03433323, + "epoch": 0.09072598827596573, + "flos": 23081754911040.0, + "grad_norm": 1.968524830647236, + "language_loss": 0.8505336, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87291306, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.19970703, + "step": 1509, + "time_per_iteration": 2.6470675468444824 + }, + { + "auxiliary_loss_clip": 0.01193913, + "auxiliary_loss_mlp": 0.01064781, + "balance_loss_clip": 1.06031418, + "balance_loss_mlp": 1.04325175, + "epoch": 0.0907861115286337, + "flos": 30288345085440.0, + "grad_norm": 6.153655075533721, + "language_loss": 0.8408075, + "learning_rate": 3.961366095394002e-06, + "loss": 0.86339444, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 1.3359375, + "router_z_loss_mlp": 0.21520996, + "step": 1510, + "time_per_iteration": 2.886477470397949 + }, + { + "auxiliary_loss_clip": 0.01187475, + "auxiliary_loss_mlp": 0.01055435, + "balance_loss_clip": 1.05852222, + "balance_loss_mlp": 1.03557479, + "epoch": 0.09084623478130167, + "flos": 26420915435040.0, + "grad_norm": 2.570576821988955, + "language_loss": 0.85548311, + "learning_rate": 3.961289878108262e-06, + "loss": 0.87791228, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.19873047, + "step": 1511, + "time_per_iteration": 2.6353700160980225 + }, + { + "auxiliary_loss_clip": 0.01185548, + "auxiliary_loss_mlp": 0.01052666, + "balance_loss_clip": 1.05987215, + "balance_loss_mlp": 1.03255582, + "epoch": 0.09090635803396964, + "flos": 33725747312640.0, + "grad_norm": 4.595991341876054, + "language_loss": 0.85184872, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87423086, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.2010498, + "step": 1512, + "time_per_iteration": 2.7780444622039795 + }, + { + "auxiliary_loss_clip": 0.01178678, + "auxiliary_loss_mlp": 0.01050573, + "balance_loss_clip": 1.05540216, + "balance_loss_mlp": 1.03190446, + "epoch": 0.0909664812866376, + "flos": 21560303856960.0, + "grad_norm": 2.7006989820604113, + "language_loss": 0.86464489, + "learning_rate": 3.961137220422749e-06, + "loss": 0.88693738, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.18664551, + "step": 1513, + "time_per_iteration": 2.604193925857544 + }, + { + "auxiliary_loss_clip": 0.0118368, + "auxiliary_loss_mlp": 0.01054369, + "balance_loss_clip": 1.05794382, + "balance_loss_mlp": 1.03551054, + "epoch": 0.09102660453930557, + "flos": 29225850792480.0, + "grad_norm": 1.7964081381304604, + "language_loss": 0.86503685, + "learning_rate": 3.961060780028764e-06, + "loss": 0.88741732, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.18859863, + "step": 1514, + "time_per_iteration": 2.7033655643463135 + }, + { + "auxiliary_loss_clip": 0.0118483, + "auxiliary_loss_mlp": 0.01059306, + "balance_loss_clip": 1.06002545, + "balance_loss_mlp": 1.04060245, + "epoch": 0.09108672779197355, + "flos": 31497330811680.0, + "grad_norm": 1.8649641596601256, + "language_loss": 0.90190125, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92434263, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.18701172, + "step": 1515, + "time_per_iteration": 2.686361312866211 + }, + { + "auxiliary_loss_clip": 0.01186249, + "auxiliary_loss_mlp": 0.01055567, + "balance_loss_clip": 1.05916762, + "balance_loss_mlp": 1.03481221, + "epoch": 0.09114685104464151, + "flos": 36167171919840.0, + "grad_norm": 2.048811794243098, + "language_loss": 0.85616875, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87858689, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.2076416, + "step": 1516, + "time_per_iteration": 5.682703256607056 + }, + { + "auxiliary_loss_clip": 0.011889, + "auxiliary_loss_mlp": 0.01057518, + "balance_loss_clip": 1.05821884, + "balance_loss_mlp": 1.03778863, + "epoch": 0.09120697429730948, + "flos": 41157946467360.0, + "grad_norm": 1.4896506566144583, + "language_loss": 0.80897337, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83143753, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 1.30761719, + "router_z_loss_mlp": 0.19726562, + "step": 1517, + "time_per_iteration": 2.7791764736175537 + }, + { + "auxiliary_loss_clip": 0.0118992, + "auxiliary_loss_mlp": 0.01066472, + "balance_loss_clip": 1.05970323, + "balance_loss_mlp": 1.04644537, + "epoch": 0.09126709754997746, + "flos": 22453623390240.0, + "grad_norm": 1.6318697760842586, + "language_loss": 0.7766062, + "learning_rate": 3.960754274845642e-06, + "loss": 0.79917014, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.20031738, + "step": 1518, + "time_per_iteration": 5.621671438217163 + }, + { + "auxiliary_loss_clip": 0.01185946, + "auxiliary_loss_mlp": 0.01058876, + "balance_loss_clip": 1.0587858, + "balance_loss_mlp": 1.03896773, + "epoch": 0.09132722080264542, + "flos": 26956639740960.0, + "grad_norm": 1.9669371972801974, + "language_loss": 0.86399174, + "learning_rate": 3.960677462662594e-06, + "loss": 0.88643998, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.19921875, + "step": 1519, + "time_per_iteration": 2.7211883068084717 + }, + { + "auxiliary_loss_clip": 0.01185872, + "auxiliary_loss_mlp": 0.01051852, + "balance_loss_clip": 1.05686915, + "balance_loss_mlp": 1.03088307, + "epoch": 0.09138734405531339, + "flos": 25664646808800.0, + "grad_norm": 2.4813798055955374, + "language_loss": 0.73238301, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75476027, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.20983887, + "step": 1520, + "time_per_iteration": 2.8434102535247803 + }, + { + "auxiliary_loss_clip": 0.01189741, + "auxiliary_loss_mlp": 0.01053801, + "balance_loss_clip": 1.0595336, + "balance_loss_mlp": 1.03348804, + "epoch": 0.09144746730798137, + "flos": 24506422884000.0, + "grad_norm": 2.8856980497662224, + "language_loss": 0.85248953, + "learning_rate": 3.960523615252156e-06, + "loss": 0.8749249, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 1.30371094, + "router_z_loss_mlp": 0.203125, + "step": 1521, + "time_per_iteration": 2.7152702808380127 + }, + { + "auxiliary_loss_clip": 0.0119011, + "auxiliary_loss_mlp": 0.01065071, + "balance_loss_clip": 1.0598712, + "balance_loss_mlp": 1.04462695, + "epoch": 0.09150759056064933, + "flos": 27794700054720.0, + "grad_norm": 1.877771423373552, + "language_loss": 0.84044617, + "learning_rate": 3.960446580030599e-06, + "loss": 0.86299801, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.20458984, + "step": 1522, + "time_per_iteration": 2.6755177974700928 + }, + { + "auxiliary_loss_clip": 0.01182105, + "auxiliary_loss_mlp": 0.01058972, + "balance_loss_clip": 1.05897093, + "balance_loss_mlp": 1.03876662, + "epoch": 0.0915677138133173, + "flos": 33634353029760.0, + "grad_norm": 3.153359127004755, + "language_loss": 0.8097946, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83220541, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.20227051, + "step": 1523, + "time_per_iteration": 2.763482093811035 + }, + { + "auxiliary_loss_clip": 0.01190594, + "auxiliary_loss_mlp": 0.01061662, + "balance_loss_clip": 1.0619024, + "balance_loss_mlp": 1.04170632, + "epoch": 0.09162783706598528, + "flos": 21566665069920.0, + "grad_norm": 2.130909438891235, + "language_loss": 0.74340725, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76592982, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.19958496, + "step": 1524, + "time_per_iteration": 2.6425726413726807 + }, + { + "auxiliary_loss_clip": 0.01188328, + "auxiliary_loss_mlp": 0.01052137, + "balance_loss_clip": 1.06081629, + "balance_loss_mlp": 1.03154993, + "epoch": 0.09168796031865324, + "flos": 22547003019840.0, + "grad_norm": 2.16967446124899, + "language_loss": 0.86454785, + "learning_rate": 3.960215028335644e-06, + "loss": 0.88695246, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.20581055, + "step": 1525, + "time_per_iteration": 2.662445545196533 + }, + { + "auxiliary_loss_clip": 0.01190893, + "auxiliary_loss_mlp": 0.01046114, + "balance_loss_clip": 1.06248069, + "balance_loss_mlp": 1.02537155, + "epoch": 0.0917480835713212, + "flos": 35859487631040.0, + "grad_norm": 2.1170367484546464, + "language_loss": 0.75000632, + "learning_rate": 3.96013769577032e-06, + "loss": 0.77237642, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.20727539, + "step": 1526, + "time_per_iteration": 2.716402292251587 + }, + { + "auxiliary_loss_clip": 0.01188796, + "auxiliary_loss_mlp": 0.01057044, + "balance_loss_clip": 1.06211197, + "balance_loss_mlp": 1.03631401, + "epoch": 0.09180820682398917, + "flos": 24195578247360.0, + "grad_norm": 2.206548522611203, + "language_loss": 0.76857567, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79103398, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.20715332, + "step": 1527, + "time_per_iteration": 2.774045467376709 + }, + { + "auxiliary_loss_clip": 0.01191973, + "auxiliary_loss_mlp": 0.0105387, + "balance_loss_clip": 1.06227756, + "balance_loss_mlp": 1.03168511, + "epoch": 0.09186833007665715, + "flos": 29092122302400.0, + "grad_norm": 3.3778325864588097, + "language_loss": 0.78925776, + "learning_rate": 3.959982807656753e-06, + "loss": 0.8117162, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 1.29785156, + "router_z_loss_mlp": 0.22167969, + "step": 1528, + "time_per_iteration": 2.717787504196167 + }, + { + "auxiliary_loss_clip": 0.011892, + "auxiliary_loss_mlp": 0.01046248, + "balance_loss_clip": 1.05994856, + "balance_loss_mlp": 1.02549386, + "epoch": 0.09192845332932512, + "flos": 15779961829440.0, + "grad_norm": 2.9744693583251673, + "language_loss": 0.76414758, + "learning_rate": 3.959905252114384e-06, + "loss": 0.786502, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 1.29394531, + "router_z_loss_mlp": 0.2076416, + "step": 1529, + "time_per_iteration": 2.620492458343506 + }, + { + "auxiliary_loss_clip": 0.01188555, + "auxiliary_loss_mlp": 0.01052016, + "balance_loss_clip": 1.0575211, + "balance_loss_mlp": 1.02978325, + "epoch": 0.09198857658199308, + "flos": 29978756484480.0, + "grad_norm": 1.8274974975094735, + "language_loss": 0.82882214, + "learning_rate": 3.959827622252211e-06, + "loss": 0.85122776, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.22253418, + "step": 1530, + "time_per_iteration": 2.707686185836792 + }, + { + "auxiliary_loss_clip": 0.01185471, + "auxiliary_loss_mlp": 0.01060486, + "balance_loss_clip": 1.06049514, + "balance_loss_mlp": 1.03912401, + "epoch": 0.09204869983466106, + "flos": 24729681862080.0, + "grad_norm": 3.593651751264138, + "language_loss": 0.83662784, + "learning_rate": 3.959749918073179e-06, + "loss": 0.85908741, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 1.24902344, + "router_z_loss_mlp": 0.21386719, + "step": 1531, + "time_per_iteration": 2.66058087348938 + }, + { + "auxiliary_loss_clip": 0.01190136, + "auxiliary_loss_mlp": 0.01050009, + "balance_loss_clip": 1.06210685, + "balance_loss_mlp": 1.02898049, + "epoch": 0.09210882308732903, + "flos": 25484816004480.0, + "grad_norm": 2.444420518631677, + "language_loss": 0.81285495, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83525646, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.21020508, + "step": 1532, + "time_per_iteration": 2.670379638671875 + }, + { + "auxiliary_loss_clip": 0.01192207, + "auxiliary_loss_mlp": 0.01053345, + "balance_loss_clip": 1.06345677, + "balance_loss_mlp": 1.03238761, + "epoch": 0.09216894633999699, + "flos": 37773777595680.0, + "grad_norm": 2.028708096044901, + "language_loss": 0.83572626, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.85818177, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 1.28808594, + "router_z_loss_mlp": 0.20947266, + "step": 1533, + "time_per_iteration": 2.97151255607605 + }, + { + "auxiliary_loss_clip": 0.01192663, + "auxiliary_loss_mlp": 0.01055805, + "balance_loss_clip": 1.06328046, + "balance_loss_mlp": 1.03435946, + "epoch": 0.09222906959266497, + "flos": 16047013636800.0, + "grad_norm": 2.180351695130475, + "language_loss": 0.8991465, + "learning_rate": 3.959516359664402e-06, + "loss": 0.92163122, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 1.29296875, + "router_z_loss_mlp": 0.21435547, + "step": 1534, + "time_per_iteration": 2.644423007965088 + }, + { + "auxiliary_loss_clip": 0.01189708, + "auxiliary_loss_mlp": 0.01064477, + "balance_loss_clip": 1.06010997, + "balance_loss_mlp": 1.04034889, + "epoch": 0.09228919284533293, + "flos": 31718928581280.0, + "grad_norm": 2.2972999371890266, + "language_loss": 0.75988668, + "learning_rate": 3.959438358247424e-06, + "loss": 0.7824285, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 1.296875, + "router_z_loss_mlp": 0.24133301, + "step": 1535, + "time_per_iteration": 2.6772446632385254 + }, + { + "auxiliary_loss_clip": 0.0118478, + "auxiliary_loss_mlp": 0.01047903, + "balance_loss_clip": 1.06130552, + "balance_loss_mlp": 1.02798295, + "epoch": 0.0923493160980009, + "flos": 22769816307840.0, + "grad_norm": 1.765374103892894, + "language_loss": 0.81291664, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83524346, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.19909668, + "step": 1536, + "time_per_iteration": 2.6440210342407227 + }, + { + "auxiliary_loss_clip": 0.01184742, + "auxiliary_loss_mlp": 0.01051913, + "balance_loss_clip": 1.05917132, + "balance_loss_mlp": 1.03181434, + "epoch": 0.09240943935066886, + "flos": 25795944262080.0, + "grad_norm": 1.9958926563821981, + "language_loss": 0.89132535, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91369194, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.20092773, + "step": 1537, + "time_per_iteration": 2.6353816986083984 + }, + { + "auxiliary_loss_clip": 0.0118666, + "auxiliary_loss_mlp": 0.01060727, + "balance_loss_clip": 1.05921519, + "balance_loss_mlp": 1.03866148, + "epoch": 0.09246956260333684, + "flos": 25127139742560.0, + "grad_norm": 2.3867014819811754, + "language_loss": 0.80690235, + "learning_rate": 3.959203908195741e-06, + "loss": 0.82937622, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 1.27734375, + "router_z_loss_mlp": 0.22058105, + "step": 1538, + "time_per_iteration": 2.6547069549560547 + }, + { + "auxiliary_loss_clip": 0.01083342, + "auxiliary_loss_mlp": 0.01001779, + "balance_loss_clip": 1.0346396, + "balance_loss_mlp": 0.99851, + "epoch": 0.09252968585600481, + "flos": 82434798450720.0, + "grad_norm": 0.7319557503593132, + "language_loss": 0.57427931, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59513044, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 0.48608398, + "router_z_loss_mlp": 0.03274536, + "step": 1539, + "time_per_iteration": 3.5327959060668945 + }, + { + "auxiliary_loss_clip": 0.01195216, + "auxiliary_loss_mlp": 0.01051278, + "balance_loss_clip": 1.06538141, + "balance_loss_mlp": 1.02962983, + "epoch": 0.09258980910867277, + "flos": 21211581913920.0, + "grad_norm": 2.4333525071382462, + "language_loss": 0.67147517, + "learning_rate": 3.959047236690304e-06, + "loss": 0.6939401, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 1.29882812, + "router_z_loss_mlp": 0.21655273, + "step": 1540, + "time_per_iteration": 2.734057903289795 + }, + { + "auxiliary_loss_clip": 0.0119434, + "auxiliary_loss_mlp": 0.01046579, + "balance_loss_clip": 1.06689227, + "balance_loss_mlp": 1.02496624, + "epoch": 0.09264993236134075, + "flos": 24240302974080.0, + "grad_norm": 1.7088921981235872, + "language_loss": 0.83712077, + "learning_rate": 3.958968789505198e-06, + "loss": 0.85952991, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.21618652, + "step": 1541, + "time_per_iteration": 2.685336112976074 + }, + { + "auxiliary_loss_clip": 0.01082052, + "auxiliary_loss_mlp": 0.01003073, + "balance_loss_clip": 1.0333432, + "balance_loss_mlp": 0.99983019, + "epoch": 0.09271005561400872, + "flos": 76000555912320.0, + "grad_norm": 0.8839578925989103, + "language_loss": 0.61857307, + "learning_rate": 3.9588902680358e-06, + "loss": 0.63942432, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 0.48706055, + "router_z_loss_mlp": 0.0324707, + "step": 1542, + "time_per_iteration": 3.2493155002593994 + }, + { + "auxiliary_loss_clip": 0.01191982, + "auxiliary_loss_mlp": 0.01061347, + "balance_loss_clip": 1.06360769, + "balance_loss_mlp": 1.04128385, + "epoch": 0.09277017886667668, + "flos": 28467475267680.0, + "grad_norm": 1.9574465299871797, + "language_loss": 0.82366168, + "learning_rate": 3.958811672285086e-06, + "loss": 0.84619498, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 1.28417969, + "router_z_loss_mlp": 0.20068359, + "step": 1543, + "time_per_iteration": 2.6929843425750732 + }, + { + "auxiliary_loss_clip": 0.01186642, + "auxiliary_loss_mlp": 0.01060646, + "balance_loss_clip": 1.06183624, + "balance_loss_mlp": 1.04017806, + "epoch": 0.09283030211934466, + "flos": 66802739808960.0, + "grad_norm": 3.4291911448464907, + "language_loss": 0.72226834, + "learning_rate": 3.958733002256038e-06, + "loss": 0.7447412, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.20458984, + "step": 1544, + "time_per_iteration": 2.982628345489502 + }, + { + "auxiliary_loss_clip": 0.01188013, + "auxiliary_loss_mlp": 0.01053951, + "balance_loss_clip": 1.059497, + "balance_loss_mlp": 1.03262424, + "epoch": 0.09289042537201263, + "flos": 37015442588160.0, + "grad_norm": 2.6737050892277776, + "language_loss": 0.77491188, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79733151, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.2130127, + "step": 1545, + "time_per_iteration": 2.9308676719665527 + }, + { + "auxiliary_loss_clip": 0.01184799, + "auxiliary_loss_mlp": 0.0105264, + "balance_loss_clip": 1.06206632, + "balance_loss_mlp": 1.03180218, + "epoch": 0.09295054862468059, + "flos": 21654696418560.0, + "grad_norm": 3.1802502811926616, + "language_loss": 0.74768627, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.77006066, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.20825195, + "step": 1546, + "time_per_iteration": 2.8269057273864746 + }, + { + "auxiliary_loss_clip": 0.01192348, + "auxiliary_loss_mlp": 0.01054381, + "balance_loss_clip": 1.06366765, + "balance_loss_mlp": 1.03297067, + "epoch": 0.09301067187734856, + "flos": 28867931426880.0, + "grad_norm": 1.8360481734015535, + "language_loss": 0.84213358, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86460084, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.21386719, + "step": 1547, + "time_per_iteration": 2.714641571044922 + }, + { + "auxiliary_loss_clip": 0.01188539, + "auxiliary_loss_mlp": 0.010604, + "balance_loss_clip": 1.06127656, + "balance_loss_mlp": 1.03958654, + "epoch": 0.09307079513001654, + "flos": 33589830889440.0, + "grad_norm": 2.6709769630374636, + "language_loss": 0.67281508, + "learning_rate": 3.958417579416199e-06, + "loss": 0.69530451, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.20812988, + "step": 1548, + "time_per_iteration": 2.716919183731079 + }, + { + "auxiliary_loss_clip": 0.01189402, + "auxiliary_loss_mlp": 0.01058408, + "balance_loss_clip": 1.06066012, + "balance_loss_mlp": 1.03774929, + "epoch": 0.0931309183826845, + "flos": 25169879122560.0, + "grad_norm": 1.942181616580252, + "language_loss": 0.83565503, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.8581332, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 1.28710938, + "router_z_loss_mlp": 0.20654297, + "step": 1549, + "time_per_iteration": 2.6495413780212402 + }, + { + "auxiliary_loss_clip": 0.01192828, + "auxiliary_loss_mlp": 0.0105366, + "balance_loss_clip": 1.06570816, + "balance_loss_mlp": 1.03349018, + "epoch": 0.09319104163535247, + "flos": 35412199846560.0, + "grad_norm": 1.7129720861066422, + "language_loss": 0.75579524, + "learning_rate": 3.958259422403966e-06, + "loss": 0.77826011, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.20153809, + "step": 1550, + "time_per_iteration": 2.7189278602600098 + }, + { + "auxiliary_loss_clip": 0.01192683, + "auxiliary_loss_mlp": 0.01071726, + "balance_loss_clip": 1.06192338, + "balance_loss_mlp": 1.0491004, + "epoch": 0.09325116488802045, + "flos": 30873332053440.0, + "grad_norm": 2.128097993582182, + "language_loss": 0.83305311, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85569715, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.22607422, + "step": 1551, + "time_per_iteration": 2.687788963317871 + }, + { + "auxiliary_loss_clip": 0.01076054, + "auxiliary_loss_mlp": 0.01019652, + "balance_loss_clip": 1.02819681, + "balance_loss_mlp": 1.01654339, + "epoch": 0.09331128814068841, + "flos": 75085890122880.0, + "grad_norm": 0.7722882266261619, + "language_loss": 0.61811817, + "learning_rate": 3.958100968362163e-06, + "loss": 0.63907528, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 0.03106689, + "step": 1552, + "time_per_iteration": 3.3816940784454346 + }, + { + "auxiliary_loss_clip": 0.01074458, + "auxiliary_loss_mlp": 0.01009138, + "balance_loss_clip": 1.02604985, + "balance_loss_mlp": 1.00615144, + "epoch": 0.09337141139335638, + "flos": 65027451931200.0, + "grad_norm": 0.8247251792020416, + "language_loss": 0.58864057, + "learning_rate": 3.958021629962681e-06, + "loss": 0.60947657, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 0.48364258, + "router_z_loss_mlp": 0.02984619, + "step": 1553, + "time_per_iteration": 3.377878189086914 + }, + { + "auxiliary_loss_clip": 0.01190043, + "auxiliary_loss_mlp": 0.01060377, + "balance_loss_clip": 1.05946028, + "balance_loss_mlp": 1.03894353, + "epoch": 0.09343153464602436, + "flos": 28646738830080.0, + "grad_norm": 1.768193497110936, + "language_loss": 0.87716305, + "learning_rate": 3.957942217314823e-06, + "loss": 0.8996672, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 1.30664062, + "router_z_loss_mlp": 0.21435547, + "step": 1554, + "time_per_iteration": 2.6801598072052 + }, + { + "auxiliary_loss_clip": 0.01187962, + "auxiliary_loss_mlp": 0.0106158, + "balance_loss_clip": 1.06383467, + "balance_loss_mlp": 1.0405035, + "epoch": 0.09349165789869232, + "flos": 23615129214720.0, + "grad_norm": 2.3817232310348246, + "language_loss": 0.81411028, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83660573, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.21081543, + "step": 1555, + "time_per_iteration": 4.137402057647705 + }, + { + "auxiliary_loss_clip": 0.01075631, + "auxiliary_loss_mlp": 0.01002005, + "balance_loss_clip": 1.02704239, + "balance_loss_mlp": 0.99903852, + "epoch": 0.09355178115136029, + "flos": 82368113358240.0, + "grad_norm": 0.8701281080457403, + "language_loss": 0.59673977, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61751616, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 0.4855957, + "router_z_loss_mlp": 0.02964783, + "step": 1556, + "time_per_iteration": 4.875341176986694 + }, + { + "auxiliary_loss_clip": 0.01187664, + "auxiliary_loss_mlp": 0.01057091, + "balance_loss_clip": 1.06136334, + "balance_loss_mlp": 1.03661108, + "epoch": 0.09361190440402825, + "flos": 45565597841760.0, + "grad_norm": 1.6357324557712105, + "language_loss": 0.84341401, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86586159, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.20483398, + "step": 1557, + "time_per_iteration": 5.867675542831421 + }, + { + "auxiliary_loss_clip": 0.01187787, + "auxiliary_loss_mlp": 0.01059696, + "balance_loss_clip": 1.06115341, + "balance_loss_mlp": 1.03676033, + "epoch": 0.09367202765669623, + "flos": 30383021268000.0, + "grad_norm": 1.7571464463601236, + "language_loss": 0.7809543, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80342913, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.22949219, + "step": 1558, + "time_per_iteration": 2.729322910308838 + }, + { + "auxiliary_loss_clip": 0.01191158, + "auxiliary_loss_mlp": 0.01055031, + "balance_loss_clip": 1.06076896, + "balance_loss_mlp": 1.03359723, + "epoch": 0.0937321509093642, + "flos": 19163603976480.0, + "grad_norm": 2.2122448487085515, + "language_loss": 0.80102265, + "learning_rate": 3.957544040455379e-06, + "loss": 0.82348454, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 1.30273438, + "router_z_loss_mlp": 0.21435547, + "step": 1559, + "time_per_iteration": 2.641669511795044 + }, + { + "auxiliary_loss_clip": 0.01184632, + "auxiliary_loss_mlp": 0.01059855, + "balance_loss_clip": 1.06037843, + "balance_loss_mlp": 1.03943431, + "epoch": 0.09379227416203216, + "flos": 24994383667200.0, + "grad_norm": 2.4054965834770106, + "language_loss": 0.76530683, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78775167, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.20422363, + "step": 1560, + "time_per_iteration": 2.7246360778808594 + }, + { + "auxiliary_loss_clip": 0.01191808, + "auxiliary_loss_mlp": 0.0106051, + "balance_loss_clip": 1.06155598, + "balance_loss_mlp": 1.0392673, + "epoch": 0.09385239741470014, + "flos": 29715270197760.0, + "grad_norm": 1.7438136924003176, + "language_loss": 0.80835587, + "learning_rate": 3.95738425007858e-06, + "loss": 0.83087903, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 1.30078125, + "router_z_loss_mlp": 0.21228027, + "step": 1561, + "time_per_iteration": 2.6971030235290527 + }, + { + "auxiliary_loss_clip": 0.01186007, + "auxiliary_loss_mlp": 0.01045157, + "balance_loss_clip": 1.05694938, + "balance_loss_mlp": 1.02459335, + "epoch": 0.0939125206673681, + "flos": 40623316128000.0, + "grad_norm": 2.777453976068992, + "language_loss": 0.61259353, + "learning_rate": 3.957304243552354e-06, + "loss": 0.63490522, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 1.29003906, + "router_z_loss_mlp": 0.20556641, + "step": 1562, + "time_per_iteration": 2.764648675918579 + }, + { + "auxiliary_loss_clip": 0.01188852, + "auxiliary_loss_mlp": 0.01055865, + "balance_loss_clip": 1.06481922, + "balance_loss_mlp": 1.03580213, + "epoch": 0.09397264392003607, + "flos": 23482575725760.0, + "grad_norm": 2.0204001944108287, + "language_loss": 0.84845865, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87090588, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.20031738, + "step": 1563, + "time_per_iteration": 2.763343334197998 + }, + { + "auxiliary_loss_clip": 0.01182851, + "auxiliary_loss_mlp": 0.01057004, + "balance_loss_clip": 1.05915046, + "balance_loss_mlp": 1.03745341, + "epoch": 0.09403276717270405, + "flos": 23571660523680.0, + "grad_norm": 1.9727830471652394, + "language_loss": 0.75768828, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78008676, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.19543457, + "step": 1564, + "time_per_iteration": 2.7128915786743164 + }, + { + "auxiliary_loss_clip": 0.01189419, + "auxiliary_loss_mlp": 0.01050438, + "balance_loss_clip": 1.06494212, + "balance_loss_mlp": 1.03103065, + "epoch": 0.09409289042537201, + "flos": 28777104385920.0, + "grad_norm": 1.9104514181329164, + "language_loss": 0.79552817, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.81792676, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.1940918, + "step": 1565, + "time_per_iteration": 2.776141405105591 + }, + { + "auxiliary_loss_clip": 0.01185198, + "auxiliary_loss_mlp": 0.01062245, + "balance_loss_clip": 1.05954528, + "balance_loss_mlp": 1.04206324, + "epoch": 0.09415301367803998, + "flos": 24500223740160.0, + "grad_norm": 1.8474098774452719, + "language_loss": 0.75250292, + "learning_rate": 3.956983475266103e-06, + "loss": 0.77497733, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 1.25683594, + "router_z_loss_mlp": 0.2019043, + "step": 1566, + "time_per_iteration": 2.6933414936065674 + }, + { + "auxiliary_loss_clip": 0.0118645, + "auxiliary_loss_mlp": 0.01058629, + "balance_loss_clip": 1.06000721, + "balance_loss_mlp": 1.03795803, + "epoch": 0.09421313693070796, + "flos": 25700660320320.0, + "grad_norm": 1.8083333033890905, + "language_loss": 0.78077215, + "learning_rate": 3.956903097664407e-06, + "loss": 0.80322289, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 1.26464844, + "router_z_loss_mlp": 0.20678711, + "step": 1567, + "time_per_iteration": 2.6946749687194824 + }, + { + "auxiliary_loss_clip": 0.01189473, + "auxiliary_loss_mlp": 0.01052538, + "balance_loss_clip": 1.06250501, + "balance_loss_mlp": 1.03369117, + "epoch": 0.09427326018337592, + "flos": 29670910126560.0, + "grad_norm": 2.430927143499994, + "language_loss": 0.82608378, + "learning_rate": 3.956822645856749e-06, + "loss": 0.84850383, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.18847656, + "step": 1568, + "time_per_iteration": 2.9056479930877686 + }, + { + "auxiliary_loss_clip": 0.01191574, + "auxiliary_loss_mlp": 0.01053691, + "balance_loss_clip": 1.06313992, + "balance_loss_mlp": 1.03193569, + "epoch": 0.09433338343604389, + "flos": 24726237893280.0, + "grad_norm": 2.2872667668903044, + "language_loss": 0.76893246, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.79138505, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 1.28515625, + "router_z_loss_mlp": 0.21740723, + "step": 1569, + "time_per_iteration": 2.6686840057373047 + }, + { + "auxiliary_loss_clip": 0.01181279, + "auxiliary_loss_mlp": 0.01050226, + "balance_loss_clip": 1.05979633, + "balance_loss_mlp": 1.02955544, + "epoch": 0.09439350668871185, + "flos": 15549085602720.0, + "grad_norm": 2.266343776150997, + "language_loss": 0.85791707, + "learning_rate": 3.956661519635756e-06, + "loss": 0.8802321, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.20678711, + "step": 1570, + "time_per_iteration": 2.6113154888153076 + }, + { + "auxiliary_loss_clip": 0.01186797, + "auxiliary_loss_mlp": 0.01050138, + "balance_loss_clip": 1.06141758, + "balance_loss_mlp": 1.0290978, + "epoch": 0.09445362994137983, + "flos": 31679227997280.0, + "grad_norm": 1.950379490265598, + "language_loss": 0.76400554, + "learning_rate": 3.95658084522853e-06, + "loss": 0.78637481, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.21044922, + "step": 1571, + "time_per_iteration": 2.72741436958313 + }, + { + "auxiliary_loss_clip": 0.0118244, + "auxiliary_loss_mlp": 0.01054682, + "balance_loss_clip": 1.06160021, + "balance_loss_mlp": 1.03430927, + "epoch": 0.0945137531940478, + "flos": 24056987683680.0, + "grad_norm": 1.6366973303324839, + "language_loss": 0.79140151, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81377268, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.20385742, + "step": 1572, + "time_per_iteration": 2.641569137573242 + }, + { + "auxiliary_loss_clip": 0.01182991, + "auxiliary_loss_mlp": 0.0105258, + "balance_loss_clip": 1.05994487, + "balance_loss_mlp": 1.03242183, + "epoch": 0.09457387644671576, + "flos": 28818344626560.0, + "grad_norm": 2.0882652802604302, + "language_loss": 0.876679, + "learning_rate": 3.956419273835913e-06, + "loss": 0.89903474, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.20153809, + "step": 1573, + "time_per_iteration": 2.690671443939209 + }, + { + "auxiliary_loss_clip": 0.0118697, + "auxiliary_loss_mlp": 0.01067913, + "balance_loss_clip": 1.06064248, + "balance_loss_mlp": 1.04422641, + "epoch": 0.09463399969938374, + "flos": 32832751917600.0, + "grad_norm": 2.3217355596086744, + "language_loss": 0.8173719, + "learning_rate": 3.95633837685665e-06, + "loss": 0.83992076, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.23693848, + "step": 1574, + "time_per_iteration": 2.702552080154419 + }, + { + "auxiliary_loss_clip": 0.01188578, + "auxiliary_loss_mlp": 0.01051896, + "balance_loss_clip": 1.06254458, + "balance_loss_mlp": 1.03203535, + "epoch": 0.0946941229520517, + "flos": 28869673669920.0, + "grad_norm": 1.891067568947756, + "language_loss": 0.81063825, + "learning_rate": 3.95625740569284e-06, + "loss": 0.83304298, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 1.25976562, + "router_z_loss_mlp": 0.1986084, + "step": 1575, + "time_per_iteration": 2.6910994052886963 + }, + { + "auxiliary_loss_clip": 0.01184071, + "auxiliary_loss_mlp": 0.01063711, + "balance_loss_clip": 1.06057262, + "balance_loss_mlp": 1.0423131, + "epoch": 0.09475424620471967, + "flos": 29448664080480.0, + "grad_norm": 1.990404786313936, + "language_loss": 0.86616176, + "learning_rate": 3.956176360347553e-06, + "loss": 0.88863957, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.21374512, + "step": 1576, + "time_per_iteration": 2.8218860626220703 + }, + { + "auxiliary_loss_clip": 0.01070129, + "auxiliary_loss_mlp": 0.0100633, + "balance_loss_clip": 1.02345419, + "balance_loss_mlp": 1.00360954, + "epoch": 0.09481436945738765, + "flos": 83494091878560.0, + "grad_norm": 0.9748816124881781, + "language_loss": 0.65752172, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67828631, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 0.46630859, + "router_z_loss_mlp": 0.02722168, + "step": 1577, + "time_per_iteration": 3.238412380218506 + }, + { + "auxiliary_loss_clip": 0.01185071, + "auxiliary_loss_mlp": 0.01043983, + "balance_loss_clip": 1.05963826, + "balance_loss_mlp": 1.02475417, + "epoch": 0.09487449271005562, + "flos": 20321058072960.0, + "grad_norm": 1.9472534901886442, + "language_loss": 0.79298598, + "learning_rate": 3.956014047124844e-06, + "loss": 0.8152765, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.19238281, + "step": 1578, + "time_per_iteration": 2.703687906265259 + }, + { + "auxiliary_loss_clip": 0.01184358, + "auxiliary_loss_mlp": 0.01061027, + "balance_loss_clip": 1.05924368, + "balance_loss_mlp": 1.03997421, + "epoch": 0.09493461596272358, + "flos": 29621890568160.0, + "grad_norm": 2.0389223697336245, + "language_loss": 0.78034836, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80280221, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.21057129, + "step": 1579, + "time_per_iteration": 2.7036333084106445 + }, + { + "auxiliary_loss_clip": 0.01184519, + "auxiliary_loss_mlp": 0.01058468, + "balance_loss_clip": 1.0597682, + "balance_loss_mlp": 1.0373323, + "epoch": 0.09499473921539155, + "flos": 26686265516640.0, + "grad_norm": 1.9395363154864718, + "language_loss": 0.73557985, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75800967, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.21118164, + "step": 1580, + "time_per_iteration": 2.7082815170288086 + }, + { + "auxiliary_loss_clip": 0.01178141, + "auxiliary_loss_mlp": 0.01053579, + "balance_loss_clip": 1.05699372, + "balance_loss_mlp": 1.0328846, + "epoch": 0.09505486246805953, + "flos": 40935254731200.0, + "grad_norm": 8.079171865456113, + "language_loss": 0.7767117, + "learning_rate": 3.955770021006627e-06, + "loss": 0.79902887, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.20690918, + "step": 1581, + "time_per_iteration": 2.931508779525757 + }, + { + "auxiliary_loss_clip": 0.01184202, + "auxiliary_loss_mlp": 0.01053386, + "balance_loss_clip": 1.05922377, + "balance_loss_mlp": 1.0332042, + "epoch": 0.09511498572072749, + "flos": 25887014406720.0, + "grad_norm": 2.0779647177297695, + "language_loss": 0.87104368, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89341956, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.2019043, + "step": 1582, + "time_per_iteration": 2.674546957015991 + }, + { + "auxiliary_loss_clip": 0.0118397, + "auxiliary_loss_mlp": 0.01061771, + "balance_loss_clip": 1.05846059, + "balance_loss_mlp": 1.0400629, + "epoch": 0.09517510897339546, + "flos": 17828223387840.0, + "grad_norm": 5.987999633060573, + "language_loss": 0.66575575, + "learning_rate": 3.955606966107699e-06, + "loss": 0.68821323, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.21704102, + "step": 1583, + "time_per_iteration": 2.6536402702331543 + }, + { + "auxiliary_loss_clip": 0.01186546, + "auxiliary_loss_mlp": 0.01053175, + "balance_loss_clip": 1.06117833, + "balance_loss_mlp": 1.03164613, + "epoch": 0.09523523222606343, + "flos": 33943455423360.0, + "grad_norm": 2.1730270848194704, + "language_loss": 0.70525193, + "learning_rate": 3.95552532742147e-06, + "loss": 0.72764909, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 1.25488281, + "router_z_loss_mlp": 0.21533203, + "step": 1584, + "time_per_iteration": 2.715977907180786 + }, + { + "auxiliary_loss_clip": 0.01183924, + "auxiliary_loss_mlp": 0.01057348, + "balance_loss_clip": 1.05941844, + "balance_loss_mlp": 1.03807187, + "epoch": 0.0952953554787314, + "flos": 25265649271680.0, + "grad_norm": 2.7325789661832904, + "language_loss": 0.81011903, + "learning_rate": 3.955443614581525e-06, + "loss": 0.83253181, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.19287109, + "step": 1585, + "time_per_iteration": 2.7039427757263184 + }, + { + "auxiliary_loss_clip": 0.0118697, + "auxiliary_loss_mlp": 0.01057279, + "balance_loss_clip": 1.05958724, + "balance_loss_mlp": 1.03476048, + "epoch": 0.09535547873139937, + "flos": 30246497085600.0, + "grad_norm": 1.8027303086566402, + "language_loss": 0.7167086, + "learning_rate": 3.955361827590961e-06, + "loss": 0.73915112, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 1.27441406, + "router_z_loss_mlp": 0.22546387, + "step": 1586, + "time_per_iteration": 2.709216356277466 + }, + { + "auxiliary_loss_clip": 0.01070653, + "auxiliary_loss_mlp": 0.01012457, + "balance_loss_clip": 1.02397275, + "balance_loss_mlp": 1.0097239, + "epoch": 0.09541560198406734, + "flos": 87749538883200.0, + "grad_norm": 0.8165387382829008, + "language_loss": 0.55455464, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57538569, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 0.02737427, + "step": 1587, + "time_per_iteration": 3.130997657775879 + }, + { + "auxiliary_loss_clip": 0.01187192, + "auxiliary_loss_mlp": 0.01061, + "balance_loss_clip": 1.05985808, + "balance_loss_mlp": 1.04060292, + "epoch": 0.09547572523673531, + "flos": 35365611324960.0, + "grad_norm": 1.6668701796395937, + "language_loss": 0.80930865, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83179057, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 1.2734375, + "router_z_loss_mlp": 0.20373535, + "step": 1588, + "time_per_iteration": 2.750074863433838 + }, + { + "auxiliary_loss_clip": 0.01182328, + "auxiliary_loss_mlp": 0.01055799, + "balance_loss_clip": 1.05900979, + "balance_loss_mlp": 1.03567648, + "epoch": 0.09553584848940327, + "flos": 29448502011360.0, + "grad_norm": 1.7158477295537433, + "language_loss": 0.81791091, + "learning_rate": 3.955116021746594e-06, + "loss": 0.84029222, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.20129395, + "step": 1589, + "time_per_iteration": 2.7361888885498047 + }, + { + "auxiliary_loss_clip": 0.0118319, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.06006289, + "balance_loss_mlp": 1.03625691, + "epoch": 0.09559597174207124, + "flos": 52288967754720.0, + "grad_norm": 1.6117580164623841, + "language_loss": 0.64654732, + "learning_rate": 3.955033938184601e-06, + "loss": 0.66895819, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.21630859, + "step": 1590, + "time_per_iteration": 2.9419641494750977 + }, + { + "auxiliary_loss_clip": 0.01180739, + "auxiliary_loss_mlp": 0.01059023, + "balance_loss_clip": 1.05849862, + "balance_loss_mlp": 1.03786349, + "epoch": 0.09565609499473922, + "flos": 39865994052480.0, + "grad_norm": 1.6438093028788718, + "language_loss": 0.82781804, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85021567, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.21154785, + "step": 1591, + "time_per_iteration": 2.846815824508667 + }, + { + "auxiliary_loss_clip": 0.01185209, + "auxiliary_loss_mlp": 0.01052033, + "balance_loss_clip": 1.05729651, + "balance_loss_mlp": 1.03146911, + "epoch": 0.09571621824740718, + "flos": 22547246123520.0, + "grad_norm": 3.091187357032626, + "language_loss": 0.74336851, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76574099, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.20568848, + "step": 1592, + "time_per_iteration": 2.666938543319702 + }, + { + "auxiliary_loss_clip": 0.0118136, + "auxiliary_loss_mlp": 0.01048082, + "balance_loss_clip": 1.05727601, + "balance_loss_mlp": 1.02779281, + "epoch": 0.09577634150007515, + "flos": 35859366079200.0, + "grad_norm": 2.485622407652198, + "language_loss": 0.73738647, + "learning_rate": 3.954787242700592e-06, + "loss": 0.75968099, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 1.24121094, + "router_z_loss_mlp": 0.20288086, + "step": 1593, + "time_per_iteration": 2.9456303119659424 + }, + { + "auxiliary_loss_clip": 0.01182686, + "auxiliary_loss_mlp": 0.01053777, + "balance_loss_clip": 1.06002498, + "balance_loss_mlp": 1.0337857, + "epoch": 0.09583646475274313, + "flos": 27757065852000.0, + "grad_norm": 2.0677401388309926, + "language_loss": 0.70182574, + "learning_rate": 3.954704862616971e-06, + "loss": 0.72419029, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.19970703, + "step": 1594, + "time_per_iteration": 2.6963930130004883 + }, + { + "auxiliary_loss_clip": 0.01182097, + "auxiliary_loss_mlp": 0.01051095, + "balance_loss_clip": 1.05723286, + "balance_loss_mlp": 1.03158081, + "epoch": 0.0958965880054111, + "flos": 28331599361760.0, + "grad_norm": 2.268146591601013, + "language_loss": 0.82279837, + "learning_rate": 3.954622408410747e-06, + "loss": 0.84513032, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.19519043, + "step": 1595, + "time_per_iteration": 5.652805805206299 + }, + { + "auxiliary_loss_clip": 0.0118384, + "auxiliary_loss_mlp": 0.01055561, + "balance_loss_clip": 1.05869555, + "balance_loss_mlp": 1.03387737, + "epoch": 0.09595671125807906, + "flos": 26020175654880.0, + "grad_norm": 1.8999432816357757, + "language_loss": 0.84554678, + "learning_rate": 3.954539880085045e-06, + "loss": 0.86794078, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.21691895, + "step": 1596, + "time_per_iteration": 2.6600186824798584 + }, + { + "auxiliary_loss_clip": 0.01188325, + "auxiliary_loss_mlp": 0.01052855, + "balance_loss_clip": 1.06206155, + "balance_loss_mlp": 1.03126597, + "epoch": 0.09601683451074704, + "flos": 48325686920640.0, + "grad_norm": 2.189645317020489, + "language_loss": 0.69103885, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71345055, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 1.26171875, + "router_z_loss_mlp": 0.21594238, + "step": 1597, + "time_per_iteration": 5.719931125640869 + }, + { + "auxiliary_loss_clip": 0.01184055, + "auxiliary_loss_mlp": 0.01043518, + "balance_loss_clip": 1.056517, + "balance_loss_mlp": 1.02356195, + "epoch": 0.096076957763415, + "flos": 28958920536960.0, + "grad_norm": 2.230704205429904, + "language_loss": 0.74833298, + "learning_rate": 3.954374601087729e-06, + "loss": 0.77060872, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.19946289, + "step": 1598, + "time_per_iteration": 2.6843316555023193 + }, + { + "auxiliary_loss_clip": 0.01188654, + "auxiliary_loss_mlp": 0.01052107, + "balance_loss_clip": 1.06136966, + "balance_loss_mlp": 1.03053045, + "epoch": 0.09613708101608297, + "flos": 42315360046560.0, + "grad_norm": 1.869665425120449, + "language_loss": 0.68706411, + "learning_rate": 3.954291850422382e-06, + "loss": 0.7094717, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.21569824, + "step": 1599, + "time_per_iteration": 2.8035826683044434 + }, + { + "auxiliary_loss_clip": 0.01184977, + "auxiliary_loss_mlp": 0.01057107, + "balance_loss_clip": 1.05995941, + "balance_loss_mlp": 1.03718686, + "epoch": 0.09619720426875093, + "flos": 25307092098720.0, + "grad_norm": 2.268709853752678, + "language_loss": 0.84262776, + "learning_rate": 3.954209025650093e-06, + "loss": 0.86504865, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.19921875, + "step": 1600, + "time_per_iteration": 2.6560118198394775 + }, + { + "auxiliary_loss_clip": 0.01182779, + "auxiliary_loss_mlp": 0.01052191, + "balance_loss_clip": 1.05723238, + "balance_loss_mlp": 1.03207994, + "epoch": 0.09625732752141891, + "flos": 15914136009600.0, + "grad_norm": 3.48703017901921, + "language_loss": 0.80430192, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82665157, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.20117188, + "step": 1601, + "time_per_iteration": 2.6673083305358887 + }, + { + "auxiliary_loss_clip": 0.01188451, + "auxiliary_loss_mlp": 0.01053848, + "balance_loss_clip": 1.06083298, + "balance_loss_mlp": 1.0328908, + "epoch": 0.09631745077408688, + "flos": 27178399579680.0, + "grad_norm": 2.342582440770056, + "language_loss": 0.82230431, + "learning_rate": 3.954043153797251e-06, + "loss": 0.84472728, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.20959473, + "step": 1602, + "time_per_iteration": 2.6659348011016846 + }, + { + "auxiliary_loss_clip": 0.01181239, + "auxiliary_loss_mlp": 0.01049622, + "balance_loss_clip": 1.05875623, + "balance_loss_mlp": 1.02828372, + "epoch": 0.09637757402675484, + "flos": 30205499948640.0, + "grad_norm": 1.9744035950121108, + "language_loss": 0.62918031, + "learning_rate": 3.953960106722989e-06, + "loss": 0.6514889, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.21337891, + "step": 1603, + "time_per_iteration": 2.7346067428588867 + }, + { + "auxiliary_loss_clip": 0.01188514, + "auxiliary_loss_mlp": 0.01051024, + "balance_loss_clip": 1.06054866, + "balance_loss_mlp": 1.02887464, + "epoch": 0.09643769727942282, + "flos": 27486367489440.0, + "grad_norm": 2.5196650934681224, + "language_loss": 0.71773958, + "learning_rate": 3.953876985554364e-06, + "loss": 0.74013495, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 1.27832031, + "router_z_loss_mlp": 0.22143555, + "step": 1604, + "time_per_iteration": 2.89125919342041 + }, + { + "auxiliary_loss_clip": 0.01182037, + "auxiliary_loss_mlp": 0.01052543, + "balance_loss_clip": 1.05899906, + "balance_loss_mlp": 1.03332663, + "epoch": 0.09649782053209079, + "flos": 37730592525600.0, + "grad_norm": 3.1427044583484727, + "language_loss": 0.80046982, + "learning_rate": 3.953793790294527e-06, + "loss": 0.82281554, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.19226074, + "step": 1605, + "time_per_iteration": 2.769420623779297 + }, + { + "auxiliary_loss_clip": 0.01185528, + "auxiliary_loss_mlp": 0.01048619, + "balance_loss_clip": 1.05711102, + "balance_loss_mlp": 1.02850842, + "epoch": 0.09655794378475875, + "flos": 30917408503680.0, + "grad_norm": 1.9814764372565805, + "language_loss": 0.7428357, + "learning_rate": 3.953710520946634e-06, + "loss": 0.76517719, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 1.28320312, + "router_z_loss_mlp": 0.20117188, + "step": 1606, + "time_per_iteration": 2.7141284942626953 + }, + { + "auxiliary_loss_clip": 0.01186605, + "auxiliary_loss_mlp": 0.01051617, + "balance_loss_clip": 1.06073821, + "balance_loss_mlp": 1.03204298, + "epoch": 0.09661806703742673, + "flos": 27266836101120.0, + "grad_norm": 2.07187382832695, + "language_loss": 0.75960588, + "learning_rate": 3.953627177513843e-06, + "loss": 0.78198814, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 0.19580078, + "step": 1607, + "time_per_iteration": 2.7448058128356934 + }, + { + "auxiliary_loss_clip": 0.01186395, + "auxiliary_loss_mlp": 0.01050493, + "balance_loss_clip": 1.05930901, + "balance_loss_mlp": 1.03069186, + "epoch": 0.0966781902900947, + "flos": 21301477057440.0, + "grad_norm": 2.346475771560718, + "language_loss": 0.86936432, + "learning_rate": 3.953543759999312e-06, + "loss": 0.89173317, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.19799805, + "step": 1608, + "time_per_iteration": 2.8558642864227295 + }, + { + "auxiliary_loss_clip": 0.01193559, + "auxiliary_loss_mlp": 0.01054757, + "balance_loss_clip": 1.06244707, + "balance_loss_mlp": 1.03401494, + "epoch": 0.09673831354276266, + "flos": 45031534744320.0, + "grad_norm": 2.41688784071654, + "language_loss": 0.71486646, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73734963, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 1.31054688, + "router_z_loss_mlp": 0.20751953, + "step": 1609, + "time_per_iteration": 2.795440435409546 + }, + { + "auxiliary_loss_clip": 0.01183627, + "auxiliary_loss_mlp": 0.01060752, + "balance_loss_clip": 1.05787277, + "balance_loss_mlp": 1.04116547, + "epoch": 0.09679843679543064, + "flos": 25260138921600.0, + "grad_norm": 1.9826139641320752, + "language_loss": 0.84380919, + "learning_rate": 3.953376702737693e-06, + "loss": 0.86625296, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.19580078, + "step": 1610, + "time_per_iteration": 2.6786205768585205 + }, + { + "auxiliary_loss_clip": 0.01185813, + "auxiliary_loss_mlp": 0.01050947, + "balance_loss_clip": 1.06160712, + "balance_loss_mlp": 1.03009689, + "epoch": 0.0968585600480986, + "flos": 28692476488800.0, + "grad_norm": 2.5969577917153597, + "language_loss": 0.66995513, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69232273, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.20861816, + "step": 1611, + "time_per_iteration": 2.662470817565918 + }, + { + "auxiliary_loss_clip": 0.01184195, + "auxiliary_loss_mlp": 0.01049298, + "balance_loss_clip": 1.0592773, + "balance_loss_mlp": 1.02927053, + "epoch": 0.09691868330076657, + "flos": 24551674335360.0, + "grad_norm": 1.7835227153914728, + "language_loss": 0.81109369, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83342862, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.20019531, + "step": 1612, + "time_per_iteration": 2.690871000289917 + }, + { + "auxiliary_loss_clip": 0.01187955, + "auxiliary_loss_mlp": 0.01063173, + "balance_loss_clip": 1.06105542, + "balance_loss_mlp": 1.04281187, + "epoch": 0.09697880655343454, + "flos": 20187572686560.0, + "grad_norm": 2.3173995826251335, + "language_loss": 0.80524504, + "learning_rate": 3.953125561311398e-06, + "loss": 0.82775629, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.20361328, + "step": 1613, + "time_per_iteration": 2.6210384368896484 + }, + { + "auxiliary_loss_clip": 0.01182245, + "auxiliary_loss_mlp": 0.01055399, + "balance_loss_clip": 1.05764723, + "balance_loss_mlp": 1.03460896, + "epoch": 0.09703892980610251, + "flos": 31852778623200.0, + "grad_norm": 1.8595001944589395, + "language_loss": 0.84537935, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86775583, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.20788574, + "step": 1614, + "time_per_iteration": 2.69823956489563 + }, + { + "auxiliary_loss_clip": 0.01080127, + "auxiliary_loss_mlp": 0.01006636, + "balance_loss_clip": 1.03231716, + "balance_loss_mlp": 1.00416553, + "epoch": 0.09709905305877048, + "flos": 73752454363680.0, + "grad_norm": 0.7007645967228001, + "language_loss": 0.54621553, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56708312, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 0.4777832, + "router_z_loss_mlp": 0.02468872, + "step": 1615, + "time_per_iteration": 3.246814727783203 + }, + { + "auxiliary_loss_clip": 0.01079324, + "auxiliary_loss_mlp": 0.01003299, + "balance_loss_clip": 1.03143167, + "balance_loss_mlp": 1.00071204, + "epoch": 0.09715917631143844, + "flos": 70330651289280.0, + "grad_norm": 0.7699795727841201, + "language_loss": 0.58221012, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60303628, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 0.47875977, + "router_z_loss_mlp": 0.02587891, + "step": 1616, + "time_per_iteration": 3.484632730484009 + }, + { + "auxiliary_loss_clip": 0.01185333, + "auxiliary_loss_mlp": 0.01062873, + "balance_loss_clip": 1.05955708, + "balance_loss_mlp": 1.04103422, + "epoch": 0.09721929956410642, + "flos": 25085170190880.0, + "grad_norm": 1.8714794993727881, + "language_loss": 0.68815875, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71064085, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.21838379, + "step": 1617, + "time_per_iteration": 2.6642696857452393 + }, + { + "auxiliary_loss_clip": 0.01187796, + "auxiliary_loss_mlp": 0.01055627, + "balance_loss_clip": 1.06010246, + "balance_loss_mlp": 1.03290606, + "epoch": 0.09727942281677439, + "flos": 33366166738560.0, + "grad_norm": 2.7843184232604483, + "language_loss": 0.80472577, + "learning_rate": 3.952705511055698e-06, + "loss": 0.82716, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.22705078, + "step": 1618, + "time_per_iteration": 2.7716221809387207 + }, + { + "auxiliary_loss_clip": 0.01179428, + "auxiliary_loss_mlp": 0.01048857, + "balance_loss_clip": 1.05692136, + "balance_loss_mlp": 1.02992654, + "epoch": 0.09733954606944235, + "flos": 30383385923520.0, + "grad_norm": 1.6282321992046558, + "language_loss": 0.92949295, + "learning_rate": 3.952621278851435e-06, + "loss": 0.95177579, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.18933105, + "step": 1619, + "time_per_iteration": 2.7282915115356445 + }, + { + "auxiliary_loss_clip": 0.01182323, + "auxiliary_loss_mlp": 0.0104734, + "balance_loss_clip": 1.06118309, + "balance_loss_mlp": 1.02755117, + "epoch": 0.09739966932211033, + "flos": 38441974356000.0, + "grad_norm": 2.3126389111188006, + "language_loss": 0.88497162, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.90726829, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.19787598, + "step": 1620, + "time_per_iteration": 2.764599084854126 + }, + { + "auxiliary_loss_clip": 0.01183343, + "auxiliary_loss_mlp": 0.01061526, + "balance_loss_clip": 1.05821753, + "balance_loss_mlp": 1.03960323, + "epoch": 0.0974597925747783, + "flos": 29137130650080.0, + "grad_norm": 2.0967717171989113, + "language_loss": 0.77586979, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79831851, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.21936035, + "step": 1621, + "time_per_iteration": 2.708004951477051 + }, + { + "auxiliary_loss_clip": 0.01180499, + "auxiliary_loss_mlp": 0.01059708, + "balance_loss_clip": 1.05535805, + "balance_loss_mlp": 1.03889441, + "epoch": 0.09751991582744626, + "flos": 20767008787200.0, + "grad_norm": 2.3155877536800173, + "language_loss": 0.77899051, + "learning_rate": 3.952368137989871e-06, + "loss": 0.80139261, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.20800781, + "step": 1622, + "time_per_iteration": 2.6425528526306152 + }, + { + "auxiliary_loss_clip": 0.01185013, + "auxiliary_loss_mlp": 0.0105505, + "balance_loss_clip": 1.05805957, + "balance_loss_mlp": 1.03447425, + "epoch": 0.09758003908011423, + "flos": 34657754497920.0, + "grad_norm": 2.6767612405710643, + "language_loss": 0.85442269, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.87682337, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.20568848, + "step": 1623, + "time_per_iteration": 2.70896053314209 + }, + { + "auxiliary_loss_clip": 0.01183085, + "auxiliary_loss_mlp": 0.01057704, + "balance_loss_clip": 1.05935884, + "balance_loss_mlp": 1.03686619, + "epoch": 0.09764016233278221, + "flos": 22140064095840.0, + "grad_norm": 8.06867151791644, + "language_loss": 0.80392766, + "learning_rate": 3.952199007240184e-06, + "loss": 0.82633555, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.20849609, + "step": 1624, + "time_per_iteration": 2.6263339519500732 + }, + { + "auxiliary_loss_clip": 0.01178851, + "auxiliary_loss_mlp": 0.01048968, + "balance_loss_clip": 1.05441332, + "balance_loss_mlp": 1.02932262, + "epoch": 0.09770028558545017, + "flos": 18627190876800.0, + "grad_norm": 2.5106510443342223, + "language_loss": 0.86088997, + "learning_rate": 3.952114330822364e-06, + "loss": 0.88316816, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.19641113, + "step": 1625, + "time_per_iteration": 2.6453840732574463 + }, + { + "auxiliary_loss_clip": 0.01184565, + "auxiliary_loss_mlp": 0.01056774, + "balance_loss_clip": 1.05805683, + "balance_loss_mlp": 1.03663945, + "epoch": 0.09776040883811814, + "flos": 28641755204640.0, + "grad_norm": 2.2310013774064523, + "language_loss": 0.85360914, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87602258, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 1.265625, + "router_z_loss_mlp": 0.20141602, + "step": 1626, + "time_per_iteration": 2.681434154510498 + }, + { + "auxiliary_loss_clip": 0.01187045, + "auxiliary_loss_mlp": 0.01059976, + "balance_loss_clip": 1.05988586, + "balance_loss_mlp": 1.03844714, + "epoch": 0.09782053209078612, + "flos": 29894533760160.0, + "grad_norm": 2.317289187363774, + "language_loss": 0.8331244, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85559464, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 1.27050781, + "router_z_loss_mlp": 0.21533203, + "step": 1627, + "time_per_iteration": 2.7120375633239746 + }, + { + "auxiliary_loss_clip": 0.01178389, + "auxiliary_loss_mlp": 0.01050739, + "balance_loss_clip": 1.05505085, + "balance_loss_mlp": 1.03054523, + "epoch": 0.09788065534345408, + "flos": 26332195292640.0, + "grad_norm": 1.7344569975217903, + "language_loss": 0.83916837, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86145961, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.20202637, + "step": 1628, + "time_per_iteration": 2.6915359497070312 + }, + { + "auxiliary_loss_clip": 0.01179139, + "auxiliary_loss_mlp": 0.01052508, + "balance_loss_clip": 1.05568171, + "balance_loss_mlp": 1.03242171, + "epoch": 0.09794077859612205, + "flos": 29092811096160.0, + "grad_norm": 1.7128369739980038, + "language_loss": 0.75546849, + "learning_rate": 3.951774884939523e-06, + "loss": 0.77778494, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.2010498, + "step": 1629, + "time_per_iteration": 2.918032169342041 + }, + { + "auxiliary_loss_clip": 0.01184001, + "auxiliary_loss_mlp": 0.01051026, + "balance_loss_clip": 1.06082273, + "balance_loss_mlp": 1.03002191, + "epoch": 0.09800090184879003, + "flos": 28870038325440.0, + "grad_norm": 1.8875198999721738, + "language_loss": 0.78290856, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80525881, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.21020508, + "step": 1630, + "time_per_iteration": 2.755937099456787 + }, + { + "auxiliary_loss_clip": 0.011859, + "auxiliary_loss_mlp": 0.01052504, + "balance_loss_clip": 1.06153333, + "balance_loss_mlp": 1.0306654, + "epoch": 0.09806102510145799, + "flos": 20811206789280.0, + "grad_norm": 1.9583563404254718, + "language_loss": 0.86432314, + "learning_rate": 3.951604717916228e-06, + "loss": 0.88670719, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.21850586, + "step": 1631, + "time_per_iteration": 2.6438913345336914 + }, + { + "auxiliary_loss_clip": 0.01181358, + "auxiliary_loss_mlp": 0.01055736, + "balance_loss_clip": 1.05864644, + "balance_loss_mlp": 1.03629351, + "epoch": 0.09812114835412596, + "flos": 29136806511840.0, + "grad_norm": 2.302411972229412, + "language_loss": 0.83155704, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85392803, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.19445801, + "step": 1632, + "time_per_iteration": 2.7007648944854736 + }, + { + "auxiliary_loss_clip": 0.01183541, + "auxiliary_loss_mlp": 0.0105427, + "balance_loss_clip": 1.05906296, + "balance_loss_mlp": 1.03486288, + "epoch": 0.09818127160679392, + "flos": 25130583711360.0, + "grad_norm": 1.5697990846570584, + "language_loss": 0.78668725, + "learning_rate": 3.951434254872751e-06, + "loss": 0.80906534, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.1940918, + "step": 1633, + "time_per_iteration": 2.691593885421753 + }, + { + "auxiliary_loss_clip": 0.01177898, + "auxiliary_loss_mlp": 0.01055782, + "balance_loss_clip": 1.05681551, + "balance_loss_mlp": 1.03506327, + "epoch": 0.0982413948594619, + "flos": 18897240962880.0, + "grad_norm": 2.1474207994915813, + "language_loss": 0.73290169, + "learning_rate": 3.951348912351521e-06, + "loss": 0.75523847, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.20727539, + "step": 1634, + "time_per_iteration": 5.553609609603882 + }, + { + "auxiliary_loss_clip": 0.01186587, + "auxiliary_loss_mlp": 0.01063844, + "balance_loss_clip": 1.05645192, + "balance_loss_mlp": 1.04238617, + "epoch": 0.09830151811212987, + "flos": 29538761810400.0, + "grad_norm": 2.907050759437782, + "language_loss": 0.72908545, + "learning_rate": 3.951263495834947e-06, + "loss": 0.75158978, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 1.30175781, + "router_z_loss_mlp": 0.21472168, + "step": 1635, + "time_per_iteration": 2.735189437866211 + }, + { + "auxiliary_loss_clip": 0.01185279, + "auxiliary_loss_mlp": 0.01052032, + "balance_loss_clip": 1.0577966, + "balance_loss_mlp": 1.03127754, + "epoch": 0.09836164136479783, + "flos": 25129570779360.0, + "grad_norm": 1.9074856393491286, + "language_loss": 0.78361791, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80599105, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 1.27539062, + "router_z_loss_mlp": 0.2076416, + "step": 1636, + "time_per_iteration": 4.163293123245239 + }, + { + "auxiliary_loss_clip": 0.0117845, + "auxiliary_loss_mlp": 0.01057292, + "balance_loss_clip": 1.05541229, + "balance_loss_mlp": 1.03718114, + "epoch": 0.09842176461746581, + "flos": 24323998973760.0, + "grad_norm": 1.9866433467198112, + "language_loss": 0.70114827, + "learning_rate": 3.951092440828715e-06, + "loss": 0.72350574, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.2010498, + "step": 1637, + "time_per_iteration": 4.0930516719818115 + }, + { + "auxiliary_loss_clip": 0.01180317, + "auxiliary_loss_mlp": 0.01057905, + "balance_loss_clip": 1.05590892, + "balance_loss_mlp": 1.03710318, + "epoch": 0.09848188787013377, + "flos": 25886244578400.0, + "grad_norm": 2.566559072515067, + "language_loss": 0.77697796, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79936022, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 1.24414062, + "router_z_loss_mlp": 0.20812988, + "step": 1638, + "time_per_iteration": 2.6483969688415527 + }, + { + "auxiliary_loss_clip": 0.01177294, + "auxiliary_loss_mlp": 0.01048789, + "balance_loss_clip": 1.05556071, + "balance_loss_mlp": 1.02902389, + "epoch": 0.09854201112280174, + "flos": 36794938785120.0, + "grad_norm": 1.6775225706749466, + "language_loss": 0.7243799, + "learning_rate": 3.950921089880003e-06, + "loss": 0.74664068, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.19763184, + "step": 1639, + "time_per_iteration": 2.84572172164917 + }, + { + "auxiliary_loss_clip": 0.01179563, + "auxiliary_loss_mlp": 0.01039224, + "balance_loss_clip": 1.05519021, + "balance_loss_mlp": 1.0192802, + "epoch": 0.09860213437546972, + "flos": 26595560027520.0, + "grad_norm": 23.596336623232233, + "language_loss": 0.8859539, + "learning_rate": 3.950835303435337e-06, + "loss": 0.90814173, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.19958496, + "step": 1640, + "time_per_iteration": 2.8427820205688477 + }, + { + "auxiliary_loss_clip": 0.01182277, + "auxiliary_loss_mlp": 0.01039254, + "balance_loss_clip": 1.0578934, + "balance_loss_mlp": 1.01984644, + "epoch": 0.09866225762813768, + "flos": 26643850274880.0, + "grad_norm": 2.2864888777607666, + "language_loss": 0.8045733, + "learning_rate": 3.950749443014801e-06, + "loss": 0.82678866, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.1940918, + "step": 1641, + "time_per_iteration": 2.686751365661621 + }, + { + "auxiliary_loss_clip": 0.01179116, + "auxiliary_loss_mlp": 0.01057675, + "balance_loss_clip": 1.05565691, + "balance_loss_mlp": 1.03640795, + "epoch": 0.09872238088080565, + "flos": 21475432856160.0, + "grad_norm": 5.924785107138918, + "language_loss": 0.85933757, + "learning_rate": 3.95066350862165e-06, + "loss": 0.88170546, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.21276855, + "step": 1642, + "time_per_iteration": 2.6065478324890137 + }, + { + "auxiliary_loss_clip": 0.01181869, + "auxiliary_loss_mlp": 0.01052196, + "balance_loss_clip": 1.05875063, + "balance_loss_mlp": 1.03328919, + "epoch": 0.09878250413347361, + "flos": 33722222309280.0, + "grad_norm": 1.793786056174796, + "language_loss": 0.81143129, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83377194, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 1.23144531, + "router_z_loss_mlp": 0.18908691, + "step": 1643, + "time_per_iteration": 2.701798915863037 + }, + { + "auxiliary_loss_clip": 0.01178992, + "auxiliary_loss_mlp": 0.01064477, + "balance_loss_clip": 1.05601346, + "balance_loss_mlp": 1.04396129, + "epoch": 0.0988426273861416, + "flos": 20187613203840.0, + "grad_norm": 4.114565565108333, + "language_loss": 0.82746816, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84990287, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.20507812, + "step": 1644, + "time_per_iteration": 2.62288498878479 + }, + { + "auxiliary_loss_clip": 0.01174976, + "auxiliary_loss_mlp": 0.01050907, + "balance_loss_clip": 1.05572033, + "balance_loss_mlp": 1.03118944, + "epoch": 0.09890275063880956, + "flos": 25886933372160.0, + "grad_norm": 1.9636643607233495, + "language_loss": 0.68680215, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70906103, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.1973877, + "step": 1645, + "time_per_iteration": 2.6641323566436768 + }, + { + "auxiliary_loss_clip": 0.01076795, + "auxiliary_loss_mlp": 0.01002163, + "balance_loss_clip": 1.02884555, + "balance_loss_mlp": 0.99940908, + "epoch": 0.09896287389147752, + "flos": 72454343322240.0, + "grad_norm": 0.8632426559069002, + "language_loss": 0.60810274, + "learning_rate": 3.950319031388119e-06, + "loss": 0.6288923, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 0.47949219, + "router_z_loss_mlp": 0.02755737, + "step": 1646, + "time_per_iteration": 3.2105209827423096 + }, + { + "auxiliary_loss_clip": 0.01176281, + "auxiliary_loss_mlp": 0.01053947, + "balance_loss_clip": 1.05377316, + "balance_loss_mlp": 1.03276384, + "epoch": 0.0990229971441455, + "flos": 36170777957760.0, + "grad_norm": 1.8856954155156644, + "language_loss": 0.73158824, + "learning_rate": 3.950232727180833e-06, + "loss": 0.75389057, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.21179199, + "step": 1647, + "time_per_iteration": 2.7856605052948 + }, + { + "auxiliary_loss_clip": 0.01178244, + "auxiliary_loss_mlp": 0.01060368, + "balance_loss_clip": 1.05538487, + "balance_loss_mlp": 1.04190242, + "epoch": 0.09908312039681347, + "flos": 26643080446560.0, + "grad_norm": 2.0749859927961327, + "language_loss": 0.84348238, + "learning_rate": 3.950146349020525e-06, + "loss": 0.86586839, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.18457031, + "step": 1648, + "time_per_iteration": 2.7206809520721436 + }, + { + "auxiliary_loss_clip": 0.01074847, + "auxiliary_loss_mlp": 0.01001895, + "balance_loss_clip": 1.02634203, + "balance_loss_mlp": 0.99899572, + "epoch": 0.09914324364948143, + "flos": 70238608729920.0, + "grad_norm": 0.7300711117569696, + "language_loss": 0.55678809, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57755554, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 0.48535156, + "router_z_loss_mlp": 0.02897644, + "step": 1649, + "time_per_iteration": 3.204497814178467 + }, + { + "auxiliary_loss_clip": 0.01173943, + "auxiliary_loss_mlp": 0.01044065, + "balance_loss_clip": 1.05286205, + "balance_loss_mlp": 1.02491963, + "epoch": 0.09920336690214941, + "flos": 41639748624000.0, + "grad_norm": 2.5988971972022634, + "language_loss": 0.89734221, + "learning_rate": 3.949973370853954e-06, + "loss": 0.91952229, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.19152832, + "step": 1650, + "time_per_iteration": 2.772557497024536 + }, + { + "auxiliary_loss_clip": 0.01073872, + "auxiliary_loss_mlp": 0.01003937, + "balance_loss_clip": 1.02575803, + "balance_loss_mlp": 1.00121045, + "epoch": 0.09926349015481738, + "flos": 86897384906400.0, + "grad_norm": 0.796242479135754, + "language_loss": 0.63733006, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65810817, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 0.48120117, + "router_z_loss_mlp": 0.02731323, + "step": 1651, + "time_per_iteration": 3.4181313514709473 + }, + { + "auxiliary_loss_clip": 0.01174179, + "auxiliary_loss_mlp": 0.01055548, + "balance_loss_clip": 1.05372238, + "balance_loss_mlp": 1.0356164, + "epoch": 0.09932361340748534, + "flos": 28246282670880.0, + "grad_norm": 1.9169805278010663, + "language_loss": 0.87616074, + "learning_rate": 3.949800096914643e-06, + "loss": 0.898458, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.19934082, + "step": 1652, + "time_per_iteration": 2.9110026359558105 + }, + { + "auxiliary_loss_clip": 0.01182915, + "auxiliary_loss_mlp": 0.01051754, + "balance_loss_clip": 1.05921412, + "balance_loss_mlp": 1.03243065, + "epoch": 0.09938373666015332, + "flos": 24194889453600.0, + "grad_norm": 1.932311803114, + "language_loss": 0.82278746, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84513414, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.19311523, + "step": 1653, + "time_per_iteration": 2.7915267944335938 + }, + { + "auxiliary_loss_clip": 0.01178716, + "auxiliary_loss_mlp": 0.01049566, + "balance_loss_clip": 1.05549121, + "balance_loss_mlp": 1.03034997, + "epoch": 0.09944385991282129, + "flos": 26955181118880.0, + "grad_norm": 1.951092752562684, + "language_loss": 0.79424399, + "learning_rate": 3.949626527228875e-06, + "loss": 0.81652677, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.19226074, + "step": 1654, + "time_per_iteration": 2.7533645629882812 + }, + { + "auxiliary_loss_clip": 0.01177319, + "auxiliary_loss_mlp": 0.01050651, + "balance_loss_clip": 1.05826163, + "balance_loss_mlp": 1.03237629, + "epoch": 0.09950398316548925, + "flos": 24195132557280.0, + "grad_norm": 1.7982492287521727, + "language_loss": 0.80869448, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83097416, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.18273926, + "step": 1655, + "time_per_iteration": 2.915579319000244 + }, + { + "auxiliary_loss_clip": 0.01175923, + "auxiliary_loss_mlp": 0.01056669, + "balance_loss_clip": 1.05530715, + "balance_loss_mlp": 1.03752351, + "epoch": 0.09956410641815722, + "flos": 30510429062400.0, + "grad_norm": 2.423533433641765, + "language_loss": 0.806602, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.82892787, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.19152832, + "step": 1656, + "time_per_iteration": 2.696552038192749 + }, + { + "auxiliary_loss_clip": 0.01177513, + "auxiliary_loss_mlp": 0.01056719, + "balance_loss_clip": 1.05749083, + "balance_loss_mlp": 1.03743088, + "epoch": 0.0996242296708252, + "flos": 23571417420000.0, + "grad_norm": 1.6058745268799213, + "language_loss": 0.88956934, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91191173, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 1.20117188, + "router_z_loss_mlp": 0.19287109, + "step": 1657, + "time_per_iteration": 2.6772310733795166 + }, + { + "auxiliary_loss_clip": 0.0118197, + "auxiliary_loss_mlp": 0.01054959, + "balance_loss_clip": 1.05562687, + "balance_loss_mlp": 1.03439534, + "epoch": 0.09968435292349316, + "flos": 26687886207840.0, + "grad_norm": 4.490220434435375, + "language_loss": 0.84885454, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.87122381, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.20556641, + "step": 1658, + "time_per_iteration": 2.683187484741211 + }, + { + "auxiliary_loss_clip": 0.01069782, + "auxiliary_loss_mlp": 0.01012349, + "balance_loss_clip": 1.02261019, + "balance_loss_mlp": 1.00985277, + "epoch": 0.09974447617616113, + "flos": 79783681803840.0, + "grad_norm": 0.899343769631864, + "language_loss": 0.6080358, + "learning_rate": 3.949191309296585e-06, + "loss": 0.62885714, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 0.47143555, + "router_z_loss_mlp": 0.02494812, + "step": 1659, + "time_per_iteration": 3.3144118785858154 + }, + { + "auxiliary_loss_clip": 0.01177889, + "auxiliary_loss_mlp": 0.01053628, + "balance_loss_clip": 1.05584085, + "balance_loss_mlp": 1.03331506, + "epoch": 0.0998045994288291, + "flos": 28869957290880.0, + "grad_norm": 3.0414225505045365, + "language_loss": 0.85334492, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87566006, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.203125, + "step": 1660, + "time_per_iteration": 2.7492806911468506 + }, + { + "auxiliary_loss_clip": 0.01179582, + "auxiliary_loss_mlp": 0.0105709, + "balance_loss_clip": 1.05826557, + "balance_loss_mlp": 1.03615642, + "epoch": 0.09986472268149707, + "flos": 23927716094400.0, + "grad_norm": 2.0739728892800167, + "language_loss": 0.79701877, + "learning_rate": 3.949016704705836e-06, + "loss": 0.81938547, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.20922852, + "step": 1661, + "time_per_iteration": 2.803354024887085 + }, + { + "auxiliary_loss_clip": 0.01179946, + "auxiliary_loss_mlp": 0.01051868, + "balance_loss_clip": 1.05291843, + "balance_loss_mlp": 1.03149486, + "epoch": 0.09992484593416504, + "flos": 31985980388640.0, + "grad_norm": 2.324457718400137, + "language_loss": 0.83634776, + "learning_rate": 3.948929291548443e-06, + "loss": 0.85866588, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 1.27148438, + "router_z_loss_mlp": 0.20397949, + "step": 1662, + "time_per_iteration": 2.723898410797119 + }, + { + "auxiliary_loss_clip": 0.01179807, + "auxiliary_loss_mlp": 0.01058431, + "balance_loss_clip": 1.05697632, + "balance_loss_mlp": 1.0367353, + "epoch": 0.09998496918683301, + "flos": 21345512990400.0, + "grad_norm": 2.244557760388014, + "language_loss": 0.89483094, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.91721332, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.21716309, + "step": 1663, + "time_per_iteration": 2.6353976726531982 + }, + { + "auxiliary_loss_clip": 0.01180844, + "auxiliary_loss_mlp": 0.01048953, + "balance_loss_clip": 1.056952, + "balance_loss_mlp": 1.02890205, + "epoch": 0.10004509243950098, + "flos": 27802276786080.0, + "grad_norm": 1.9750462995392517, + "language_loss": 0.70055711, + "learning_rate": 3.948754243526191e-06, + "loss": 0.72285509, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.20043945, + "step": 1664, + "time_per_iteration": 2.9853713512420654 + }, + { + "auxiliary_loss_clip": 0.01180758, + "auxiliary_loss_mlp": 0.01051725, + "balance_loss_clip": 1.05746686, + "balance_loss_mlp": 1.0313518, + "epoch": 0.10010521569216894, + "flos": 19832043840480.0, + "grad_norm": 2.4076786707105673, + "language_loss": 0.78378868, + "learning_rate": 3.94866660866797e-06, + "loss": 0.80611348, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.20373535, + "step": 1665, + "time_per_iteration": 2.635312557220459 + }, + { + "auxiliary_loss_clip": 0.01185247, + "auxiliary_loss_mlp": 0.01058913, + "balance_loss_clip": 1.06200242, + "balance_loss_mlp": 1.03947031, + "epoch": 0.10016533894483691, + "flos": 28556235927360.0, + "grad_norm": 1.828701671444371, + "language_loss": 0.69448102, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.71692264, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 1.23339844, + "router_z_loss_mlp": 0.19458008, + "step": 1666, + "time_per_iteration": 2.7261605262756348 + }, + { + "auxiliary_loss_clip": 0.01182224, + "auxiliary_loss_mlp": 0.01056615, + "balance_loss_clip": 1.05728137, + "balance_loss_mlp": 1.03506243, + "epoch": 0.10022546219750489, + "flos": 23615899043040.0, + "grad_norm": 3.461283580499216, + "language_loss": 0.78861815, + "learning_rate": 3.948491117273956e-06, + "loss": 0.81100655, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.21520996, + "step": 1667, + "time_per_iteration": 2.6891539096832275 + }, + { + "auxiliary_loss_clip": 0.01180126, + "auxiliary_loss_mlp": 0.01048025, + "balance_loss_clip": 1.05667579, + "balance_loss_mlp": 1.02743721, + "epoch": 0.10028558545017285, + "flos": 33050905718400.0, + "grad_norm": 2.4363724331139163, + "language_loss": 0.77541178, + "learning_rate": 3.948403260744817e-06, + "loss": 0.79769325, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.20581055, + "step": 1668, + "time_per_iteration": 2.6831467151641846 + }, + { + "auxiliary_loss_clip": 0.01177556, + "auxiliary_loss_mlp": 0.01052938, + "balance_loss_clip": 1.05559671, + "balance_loss_mlp": 1.0324223, + "epoch": 0.10034570870284082, + "flos": 31539259846080.0, + "grad_norm": 1.9466765425845138, + "language_loss": 0.7801035, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80240846, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 1.21972656, + "router_z_loss_mlp": 0.20507812, + "step": 1669, + "time_per_iteration": 2.7076170444488525 + }, + { + "auxiliary_loss_clip": 0.01183859, + "auxiliary_loss_mlp": 0.01058383, + "balance_loss_clip": 1.05817175, + "balance_loss_mlp": 1.03756857, + "epoch": 0.1004058319555088, + "flos": 32030259425280.0, + "grad_norm": 2.527842888987261, + "language_loss": 0.85325849, + "learning_rate": 3.948227326038933e-06, + "loss": 0.87568092, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 1.25585938, + "router_z_loss_mlp": 0.20825195, + "step": 1670, + "time_per_iteration": 2.7056596279144287 + }, + { + "auxiliary_loss_clip": 0.01173347, + "auxiliary_loss_mlp": 0.01055178, + "balance_loss_clip": 1.0547173, + "balance_loss_mlp": 1.03560424, + "epoch": 0.10046595520817676, + "flos": 30962335816800.0, + "grad_norm": 1.907647232435057, + "language_loss": 0.76915658, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.7914418, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.19580078, + "step": 1671, + "time_per_iteration": 2.749009847640991 + }, + { + "auxiliary_loss_clip": 0.01070433, + "auxiliary_loss_mlp": 0.01004752, + "balance_loss_clip": 1.02353525, + "balance_loss_mlp": 1.00224376, + "epoch": 0.10052607846084473, + "flos": 82318769661600.0, + "grad_norm": 0.7708067431476702, + "language_loss": 0.60739696, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62814873, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 0.46850586, + "router_z_loss_mlp": 0.02507019, + "step": 1672, + "time_per_iteration": 3.2802422046661377 + }, + { + "auxiliary_loss_clip": 0.01179025, + "auxiliary_loss_mlp": 0.01057505, + "balance_loss_clip": 1.05505276, + "balance_loss_mlp": 1.03749013, + "epoch": 0.10058620171351271, + "flos": 26064211587840.0, + "grad_norm": 2.178564295416203, + "language_loss": 0.77394557, + "learning_rate": 3.947962869911147e-06, + "loss": 0.7963109, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.20007324, + "step": 1673, + "time_per_iteration": 2.6921398639678955 + }, + { + "auxiliary_loss_clip": 0.01176651, + "auxiliary_loss_mlp": 0.01043698, + "balance_loss_clip": 1.05374849, + "balance_loss_mlp": 1.02531612, + "epoch": 0.10064632496618067, + "flos": 20499268186080.0, + "grad_norm": 6.411879008721456, + "language_loss": 0.73096675, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75317025, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 1.22949219, + "router_z_loss_mlp": 0.18371582, + "step": 1674, + "time_per_iteration": 5.494474411010742 + }, + { + "auxiliary_loss_clip": 0.01180959, + "auxiliary_loss_mlp": 0.01053528, + "balance_loss_clip": 1.05635238, + "balance_loss_mlp": 1.03526545, + "epoch": 0.10070644821884864, + "flos": 28826691186240.0, + "grad_norm": 2.1676989178158634, + "language_loss": 0.79451066, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81685555, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.18273926, + "step": 1675, + "time_per_iteration": 2.7662594318389893 + }, + { + "auxiliary_loss_clip": 0.01175051, + "auxiliary_loss_mlp": 0.01063026, + "balance_loss_clip": 1.05488205, + "balance_loss_mlp": 1.04578829, + "epoch": 0.1007665714715166, + "flos": 29849525412480.0, + "grad_norm": 2.2595567731608286, + "language_loss": 0.81256282, + "learning_rate": 3.947697748980853e-06, + "loss": 0.83494359, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.17236328, + "step": 1676, + "time_per_iteration": 4.22042989730835 + }, + { + "auxiliary_loss_clip": 0.01180834, + "auxiliary_loss_mlp": 0.01051517, + "balance_loss_clip": 1.05749989, + "balance_loss_mlp": 1.03320694, + "epoch": 0.10082669472418458, + "flos": 20498133702240.0, + "grad_norm": 2.143918698855601, + "language_loss": 0.86200893, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88433248, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.18310547, + "step": 1677, + "time_per_iteration": 4.05973744392395 + }, + { + "auxiliary_loss_clip": 0.01178693, + "auxiliary_loss_mlp": 0.0104941, + "balance_loss_clip": 1.05695653, + "balance_loss_mlp": 1.03065825, + "epoch": 0.10088681797685255, + "flos": 16537445974080.0, + "grad_norm": 1.998893027300351, + "language_loss": 0.85818291, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88046396, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.18737793, + "step": 1678, + "time_per_iteration": 2.720036029815674 + }, + { + "auxiliary_loss_clip": 0.01180382, + "auxiliary_loss_mlp": 0.01052086, + "balance_loss_clip": 1.0578866, + "balance_loss_mlp": 1.03301251, + "epoch": 0.10094694122952051, + "flos": 30829539224160.0, + "grad_norm": 2.0309322640809286, + "language_loss": 0.89994276, + "learning_rate": 3.947431963338532e-06, + "loss": 0.92226744, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.19067383, + "step": 1679, + "time_per_iteration": 2.7334625720977783 + }, + { + "auxiliary_loss_clip": 0.01066098, + "auxiliary_loss_mlp": 0.01007733, + "balance_loss_clip": 1.02031112, + "balance_loss_mlp": 1.00510418, + "epoch": 0.10100706448218849, + "flos": 85236330356640.0, + "grad_norm": 0.7810685206324108, + "language_loss": 0.53008723, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55082548, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 0.45727539, + "router_z_loss_mlp": 0.02630615, + "step": 1680, + "time_per_iteration": 3.331524133682251 + }, + { + "auxiliary_loss_clip": 0.01176693, + "auxiliary_loss_mlp": 0.01049352, + "balance_loss_clip": 1.05656934, + "balance_loss_mlp": 1.03089881, + "epoch": 0.10106718773485646, + "flos": 24413083771680.0, + "grad_norm": 1.6804521227410079, + "language_loss": 0.77130353, + "learning_rate": 3.947254403670641e-06, + "loss": 0.79356402, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.18457031, + "step": 1681, + "time_per_iteration": 2.6826770305633545 + }, + { + "auxiliary_loss_clip": 0.01183157, + "auxiliary_loss_mlp": 0.01058286, + "balance_loss_clip": 1.05453074, + "balance_loss_mlp": 1.03674531, + "epoch": 0.10112731098752442, + "flos": 16447591347840.0, + "grad_norm": 3.7455841629037003, + "language_loss": 0.93702686, + "learning_rate": 3.947165513074889e-06, + "loss": 0.9594413, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 1.28613281, + "router_z_loss_mlp": 0.2154541, + "step": 1682, + "time_per_iteration": 2.62086820602417 + }, + { + "auxiliary_loss_clip": 0.01180062, + "auxiliary_loss_mlp": 0.01047163, + "balance_loss_clip": 1.0564115, + "balance_loss_mlp": 1.02848327, + "epoch": 0.1011874342401924, + "flos": 22592902747680.0, + "grad_norm": 2.000808734063654, + "language_loss": 0.87672269, + "learning_rate": 3.947076548642425e-06, + "loss": 0.89899492, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.18688965, + "step": 1683, + "time_per_iteration": 2.7166249752044678 + }, + { + "auxiliary_loss_clip": 0.01176403, + "auxiliary_loss_mlp": 0.01056202, + "balance_loss_clip": 1.05690932, + "balance_loss_mlp": 1.03745055, + "epoch": 0.10124755749286037, + "flos": 25261800130080.0, + "grad_norm": 1.8903743936528032, + "language_loss": 0.74742448, + "learning_rate": 3.946987510376624e-06, + "loss": 0.76975054, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 1.19628906, + "router_z_loss_mlp": 0.18762207, + "step": 1684, + "time_per_iteration": 2.6556921005249023 + }, + { + "auxiliary_loss_clip": 0.01064315, + "auxiliary_loss_mlp": 0.010037, + "balance_loss_clip": 1.01831865, + "balance_loss_mlp": 1.00116801, + "epoch": 0.10130768074552833, + "flos": 68462868811680.0, + "grad_norm": 0.7535482033701871, + "language_loss": 0.6108104, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.63149053, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 0.46020508, + "router_z_loss_mlp": 0.02531433, + "step": 1685, + "time_per_iteration": 3.302058219909668 + }, + { + "auxiliary_loss_clip": 0.01176216, + "auxiliary_loss_mlp": 0.0105221, + "balance_loss_clip": 1.05498624, + "balance_loss_mlp": 1.03294635, + "epoch": 0.1013678039981963, + "flos": 40758462722880.0, + "grad_norm": 2.445530774548432, + "language_loss": 0.61642557, + "learning_rate": 3.946809212358516e-06, + "loss": 0.63870978, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.19262695, + "step": 1686, + "time_per_iteration": 2.7614715099334717 + }, + { + "auxiliary_loss_clip": 0.01177923, + "auxiliary_loss_mlp": 0.01048315, + "balance_loss_clip": 1.0587914, + "balance_loss_mlp": 1.02883661, + "epoch": 0.10142792725086427, + "flos": 38931798934080.0, + "grad_norm": 2.502439118161342, + "language_loss": 0.80978382, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83204627, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.19482422, + "step": 1687, + "time_per_iteration": 3.0267603397369385 + }, + { + "auxiliary_loss_clip": 0.01179528, + "auxiliary_loss_mlp": 0.01052109, + "balance_loss_clip": 1.05583525, + "balance_loss_mlp": 1.03329754, + "epoch": 0.10148805050353224, + "flos": 34746920330400.0, + "grad_norm": 2.3139159079482727, + "language_loss": 0.72307444, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74539077, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.18811035, + "step": 1688, + "time_per_iteration": 2.694920301437378 + }, + { + "auxiliary_loss_clip": 0.01177234, + "auxiliary_loss_mlp": 0.01055215, + "balance_loss_clip": 1.05608797, + "balance_loss_mlp": 1.0352838, + "epoch": 0.1015481737562002, + "flos": 30687666760800.0, + "grad_norm": 2.0732449041857266, + "language_loss": 0.86462331, + "learning_rate": 3.94654121166582e-06, + "loss": 0.88694781, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.19946289, + "step": 1689, + "time_per_iteration": 2.7020978927612305 + }, + { + "auxiliary_loss_clip": 0.01172649, + "auxiliary_loss_mlp": 0.01048333, + "balance_loss_clip": 1.05111563, + "balance_loss_mlp": 1.03120232, + "epoch": 0.10160829700886818, + "flos": 37684085038560.0, + "grad_norm": 1.7671101701623593, + "language_loss": 0.88100523, + "learning_rate": 3.946451730470993e-06, + "loss": 0.90321505, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.17138672, + "step": 1690, + "time_per_iteration": 2.760000705718994 + }, + { + "auxiliary_loss_clip": 0.01176396, + "auxiliary_loss_mlp": 0.01052869, + "balance_loss_clip": 1.05365396, + "balance_loss_mlp": 1.03298473, + "epoch": 0.10166842026153615, + "flos": 24906798008640.0, + "grad_norm": 2.515893151178919, + "language_loss": 0.83778155, + "learning_rate": 3.946362175466521e-06, + "loss": 0.86007416, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.19873047, + "step": 1691, + "time_per_iteration": 2.704219102859497 + }, + { + "auxiliary_loss_clip": 0.0117727, + "auxiliary_loss_mlp": 0.01049612, + "balance_loss_clip": 1.05479503, + "balance_loss_mlp": 1.03066993, + "epoch": 0.10172854351420411, + "flos": 40849370798400.0, + "grad_norm": 1.6711870362740595, + "language_loss": 0.66789949, + "learning_rate": 3.946272546655801e-06, + "loss": 0.69016826, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.18945312, + "step": 1692, + "time_per_iteration": 2.802448034286499 + }, + { + "auxiliary_loss_clip": 0.01175355, + "auxiliary_loss_mlp": 0.01060038, + "balance_loss_clip": 1.05310023, + "balance_loss_mlp": 1.04103565, + "epoch": 0.1017886667668721, + "flos": 28736350352640.0, + "grad_norm": 1.7716024839974367, + "language_loss": 0.75673956, + "learning_rate": 3.94618284404223e-06, + "loss": 0.7790935, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.18994141, + "step": 1693, + "time_per_iteration": 2.665858030319214 + }, + { + "auxiliary_loss_clip": 0.01178526, + "auxiliary_loss_mlp": 0.01049687, + "balance_loss_clip": 1.05371141, + "balance_loss_mlp": 1.02851558, + "epoch": 0.10184879001954006, + "flos": 28425991923360.0, + "grad_norm": 1.771704151463453, + "language_loss": 0.875875, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89815712, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.21166992, + "step": 1694, + "time_per_iteration": 2.681555986404419 + }, + { + "auxiliary_loss_clip": 0.01182186, + "auxiliary_loss_mlp": 0.01052827, + "balance_loss_clip": 1.05420125, + "balance_loss_mlp": 1.03272867, + "epoch": 0.10190891327220802, + "flos": 22369886873280.0, + "grad_norm": 2.2552707789700075, + "language_loss": 0.79396617, + "learning_rate": 3.946003217420147e-06, + "loss": 0.81631637, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 1.28027344, + "router_z_loss_mlp": 0.20092773, + "step": 1695, + "time_per_iteration": 2.6393377780914307 + }, + { + "auxiliary_loss_clip": 0.01177601, + "auxiliary_loss_mlp": 0.01058041, + "balance_loss_clip": 1.0528543, + "balance_loss_mlp": 1.03765678, + "epoch": 0.10196903652487599, + "flos": 32293867263840.0, + "grad_norm": 3.876948124971975, + "language_loss": 0.86417753, + "learning_rate": 3.945913293418447e-06, + "loss": 0.88653398, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.20385742, + "step": 1696, + "time_per_iteration": 2.720505714416504 + }, + { + "auxiliary_loss_clip": 0.01175387, + "auxiliary_loss_mlp": 0.01054527, + "balance_loss_clip": 1.05504847, + "balance_loss_mlp": 1.03562033, + "epoch": 0.10202915977754397, + "flos": 26685536205600.0, + "grad_norm": 2.026032162381558, + "language_loss": 0.81602502, + "learning_rate": 3.945823295627519e-06, + "loss": 0.83832413, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.18920898, + "step": 1697, + "time_per_iteration": 2.6938045024871826 + }, + { + "auxiliary_loss_clip": 0.01179027, + "auxiliary_loss_mlp": 0.01049502, + "balance_loss_clip": 1.05463862, + "balance_loss_mlp": 1.02982092, + "epoch": 0.10208928303021193, + "flos": 27222516547200.0, + "grad_norm": 2.141095244059714, + "language_loss": 0.81431031, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.83659554, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 1.24316406, + "router_z_loss_mlp": 0.19677734, + "step": 1698, + "time_per_iteration": 2.735067844390869 + }, + { + "auxiliary_loss_clip": 0.01177345, + "auxiliary_loss_mlp": 0.01050464, + "balance_loss_clip": 1.05472732, + "balance_loss_mlp": 1.03127098, + "epoch": 0.1021494062828799, + "flos": 26997920498880.0, + "grad_norm": 3.3225665731578817, + "language_loss": 0.76167595, + "learning_rate": 3.945643078691637e-06, + "loss": 0.78395408, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 1.22753906, + "router_z_loss_mlp": 0.19189453, + "step": 1699, + "time_per_iteration": 2.682978630065918 + }, + { + "auxiliary_loss_clip": 0.01178718, + "auxiliary_loss_mlp": 0.01048301, + "balance_loss_clip": 1.05634856, + "balance_loss_mlp": 1.02915549, + "epoch": 0.10220952953554788, + "flos": 23972845993920.0, + "grad_norm": 1.7399193492645968, + "language_loss": 0.8009789, + "learning_rate": 3.945552859553516e-06, + "loss": 0.8232491, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 1.22265625, + "router_z_loss_mlp": 0.19140625, + "step": 1700, + "time_per_iteration": 2.894258499145508 + }, + { + "auxiliary_loss_clip": 0.01177987, + "auxiliary_loss_mlp": 0.01046375, + "balance_loss_clip": 1.05432713, + "balance_loss_mlp": 1.02682459, + "epoch": 0.10226965278821584, + "flos": 36347367379680.0, + "grad_norm": 1.7682136835886586, + "language_loss": 0.77016896, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79241258, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 1.23535156, + "router_z_loss_mlp": 0.19543457, + "step": 1701, + "time_per_iteration": 2.714297294616699 + }, + { + "auxiliary_loss_clip": 0.01182847, + "auxiliary_loss_mlp": 0.01047834, + "balance_loss_clip": 1.05594921, + "balance_loss_mlp": 1.02851057, + "epoch": 0.10232977604088381, + "flos": 32966399373120.0, + "grad_norm": 2.2251850642492528, + "language_loss": 0.77962416, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80193096, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 1.26855469, + "router_z_loss_mlp": 0.1932373, + "step": 1702, + "time_per_iteration": 2.700465679168701 + }, + { + "auxiliary_loss_clip": 0.01176258, + "auxiliary_loss_mlp": 0.01047296, + "balance_loss_clip": 1.05464458, + "balance_loss_mlp": 1.02883053, + "epoch": 0.10238989929355179, + "flos": 25352748722880.0, + "grad_norm": 2.050667863755044, + "language_loss": 0.94126356, + "learning_rate": 3.945281759499494e-06, + "loss": 0.96349907, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.18469238, + "step": 1703, + "time_per_iteration": 2.7180678844451904 + }, + { + "auxiliary_loss_clip": 0.01064994, + "auxiliary_loss_mlp": 0.01002137, + "balance_loss_clip": 1.01924515, + "balance_loss_mlp": 0.99965429, + "epoch": 0.10245002254621975, + "flos": 70403245554240.0, + "grad_norm": 0.8728318994685011, + "language_loss": 0.55009949, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57077074, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 0.45727539, + "router_z_loss_mlp": 0.02481079, + "step": 1704, + "time_per_iteration": 3.178373098373413 + }, + { + "auxiliary_loss_clip": 0.0117712, + "auxiliary_loss_mlp": 0.01050975, + "balance_loss_clip": 1.05395031, + "balance_loss_mlp": 1.03101945, + "epoch": 0.10251014579888772, + "flos": 20499268186080.0, + "grad_norm": 2.5951365434652907, + "language_loss": 0.84132391, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86360484, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.19958496, + "step": 1705, + "time_per_iteration": 2.6718077659606934 + }, + { + "auxiliary_loss_clip": 0.01064849, + "auxiliary_loss_mlp": 0.01000982, + "balance_loss_clip": 1.01885569, + "balance_loss_mlp": 0.99855876, + "epoch": 0.1025702690515557, + "flos": 83664725260320.0, + "grad_norm": 0.7635663101026284, + "language_loss": 0.60437799, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62503624, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 0.45996094, + "router_z_loss_mlp": 0.02420044, + "step": 1706, + "time_per_iteration": 3.273648738861084 + }, + { + "auxiliary_loss_clip": 0.01182425, + "auxiliary_loss_mlp": 0.01044437, + "balance_loss_clip": 1.05862355, + "balance_loss_mlp": 1.024863, + "epoch": 0.10263039230422366, + "flos": 18139837852800.0, + "grad_norm": 2.312509388475746, + "language_loss": 0.8651107, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88737935, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.19567871, + "step": 1707, + "time_per_iteration": 2.666731834411621 + }, + { + "auxiliary_loss_clip": 0.0117694, + "auxiliary_loss_mlp": 0.01047903, + "balance_loss_clip": 1.05430174, + "balance_loss_mlp": 1.02894866, + "epoch": 0.10269051555689163, + "flos": 30917367986400.0, + "grad_norm": 1.4628136760057489, + "language_loss": 0.72572947, + "learning_rate": 3.944828450816369e-06, + "loss": 0.74797785, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.18945312, + "step": 1708, + "time_per_iteration": 2.6854240894317627 + }, + { + "auxiliary_loss_clip": 0.01177981, + "auxiliary_loss_mlp": 0.01051746, + "balance_loss_clip": 1.05558443, + "balance_loss_mlp": 1.03171873, + "epoch": 0.10275063880955959, + "flos": 25708277568960.0, + "grad_norm": 1.7064230516965824, + "language_loss": 0.91081387, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93311113, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.20031738, + "step": 1709, + "time_per_iteration": 2.646047353744507 + }, + { + "auxiliary_loss_clip": 0.01180779, + "auxiliary_loss_mlp": 0.0104795, + "balance_loss_clip": 1.05935884, + "balance_loss_mlp": 1.02853119, + "epoch": 0.10281076206222757, + "flos": 37054900068480.0, + "grad_norm": 1.8254909433699742, + "language_loss": 0.87837994, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90066731, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.19421387, + "step": 1710, + "time_per_iteration": 2.7063167095184326 + }, + { + "auxiliary_loss_clip": 0.01177386, + "auxiliary_loss_mlp": 0.01055223, + "balance_loss_clip": 1.05629516, + "balance_loss_mlp": 1.03591108, + "epoch": 0.10287088531489554, + "flos": 27355070036160.0, + "grad_norm": 2.3972514494241697, + "language_loss": 0.79497778, + "learning_rate": 3.944555580601908e-06, + "loss": 0.8173039, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 1.20996094, + "router_z_loss_mlp": 0.1932373, + "step": 1711, + "time_per_iteration": 2.6736366748809814 + }, + { + "auxiliary_loss_clip": 0.01180135, + "auxiliary_loss_mlp": 0.01054994, + "balance_loss_clip": 1.05521393, + "balance_loss_mlp": 1.03449035, + "epoch": 0.1029310085675635, + "flos": 30647196348480.0, + "grad_norm": 2.488036056685552, + "language_loss": 0.73650408, + "learning_rate": 3.944464476383668e-06, + "loss": 0.75885546, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 1.25, + "router_z_loss_mlp": 0.20507812, + "step": 1712, + "time_per_iteration": 2.9256575107574463 + }, + { + "auxiliary_loss_clip": 0.01175069, + "auxiliary_loss_mlp": 0.01053153, + "balance_loss_clip": 1.05722916, + "balance_loss_mlp": 1.03528368, + "epoch": 0.10299113182023148, + "flos": 24239978835840.0, + "grad_norm": 1.747389340028162, + "language_loss": 0.86948562, + "learning_rate": 3.94437329843114e-06, + "loss": 0.89176792, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.17871094, + "step": 1713, + "time_per_iteration": 5.433117628097534 + }, + { + "auxiliary_loss_clip": 0.01174884, + "auxiliary_loss_mlp": 0.010542, + "balance_loss_clip": 1.0541445, + "balance_loss_mlp": 1.03603244, + "epoch": 0.10305125507289944, + "flos": 24950307216960.0, + "grad_norm": 1.7744960558295846, + "language_loss": 0.72775424, + "learning_rate": 3.944282046747782e-06, + "loss": 0.75004506, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.18188477, + "step": 1714, + "time_per_iteration": 2.643122434616089 + }, + { + "auxiliary_loss_clip": 0.01183881, + "auxiliary_loss_mlp": 0.01052707, + "balance_loss_clip": 1.05748761, + "balance_loss_mlp": 1.03263164, + "epoch": 0.10311137832556741, + "flos": 32031596495520.0, + "grad_norm": 2.0395013676267633, + "language_loss": 0.9116382, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93400407, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 1.26367188, + "router_z_loss_mlp": 0.20080566, + "step": 1715, + "time_per_iteration": 4.196561574935913 + }, + { + "auxiliary_loss_clip": 0.01180619, + "auxiliary_loss_mlp": 0.010563, + "balance_loss_clip": 1.05798626, + "balance_loss_mlp": 1.03725004, + "epoch": 0.10317150157823539, + "flos": 43072884708480.0, + "grad_norm": 2.1635178486148496, + "language_loss": 0.75460827, + "learning_rate": 3.944099322202418e-06, + "loss": 0.77697748, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.19055176, + "step": 1716, + "time_per_iteration": 4.223790884017944 + }, + { + "auxiliary_loss_clip": 0.0118147, + "auxiliary_loss_mlp": 0.01061655, + "balance_loss_clip": 1.05663502, + "balance_loss_mlp": 1.0416398, + "epoch": 0.10323162483090335, + "flos": 31408246013760.0, + "grad_norm": 1.9140239414461273, + "language_loss": 0.85628355, + "learning_rate": 3.944007849347342e-06, + "loss": 0.8787148, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 1.24804688, + "router_z_loss_mlp": 0.20019531, + "step": 1717, + "time_per_iteration": 2.716118097305298 + }, + { + "auxiliary_loss_clip": 0.0117876, + "auxiliary_loss_mlp": 0.01057627, + "balance_loss_clip": 1.05853033, + "balance_loss_mlp": 1.03970945, + "epoch": 0.10329174808357132, + "flos": 19876565980800.0, + "grad_norm": 1.9713652440929588, + "language_loss": 0.82813323, + "learning_rate": 3.943916302775292e-06, + "loss": 0.85049701, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.17919922, + "step": 1718, + "time_per_iteration": 2.633472204208374 + }, + { + "auxiliary_loss_clip": 0.01180564, + "auxiliary_loss_mlp": 0.01048019, + "balance_loss_clip": 1.05996847, + "balance_loss_mlp": 1.0286479, + "epoch": 0.10335187133623928, + "flos": 44766751904640.0, + "grad_norm": 12.545220924590216, + "language_loss": 0.73341584, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75570166, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.19372559, + "step": 1719, + "time_per_iteration": 2.783822536468506 + }, + { + "auxiliary_loss_clip": 0.01180106, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_clip": 1.05905843, + "balance_loss_mlp": 1.0287528, + "epoch": 0.10341199458890726, + "flos": 18272918066400.0, + "grad_norm": 1.8125043632273579, + "language_loss": 0.9273333, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.94960093, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.17907715, + "step": 1720, + "time_per_iteration": 2.618877410888672 + }, + { + "auxiliary_loss_clip": 0.01178629, + "auxiliary_loss_mlp": 0.01051431, + "balance_loss_clip": 1.05565143, + "balance_loss_mlp": 1.03126049, + "epoch": 0.10347211784157523, + "flos": 25662904565760.0, + "grad_norm": 1.718337067067889, + "language_loss": 0.79381526, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81611586, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 1.22851562, + "router_z_loss_mlp": 0.20166016, + "step": 1721, + "time_per_iteration": 2.6493330001831055 + }, + { + "auxiliary_loss_clip": 0.01184918, + "auxiliary_loss_mlp": 0.01055118, + "balance_loss_clip": 1.05889821, + "balance_loss_mlp": 1.03454232, + "epoch": 0.1035322410942432, + "flos": 24150286278720.0, + "grad_norm": 2.133546108727896, + "language_loss": 0.8102622, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.83266258, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 1.26074219, + "router_z_loss_mlp": 0.20568848, + "step": 1722, + "time_per_iteration": 2.6352694034576416 + }, + { + "auxiliary_loss_clip": 0.01063826, + "auxiliary_loss_mlp": 0.01021486, + "balance_loss_clip": 1.01908195, + "balance_loss_mlp": 1.01921368, + "epoch": 0.10359236434691117, + "flos": 64302618363840.0, + "grad_norm": 0.9460085547851649, + "language_loss": 0.67108434, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69193745, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 0.44750977, + "router_z_loss_mlp": 0.0227356, + "step": 1723, + "time_per_iteration": 3.271275281906128 + }, + { + "auxiliary_loss_clip": 0.01178838, + "auxiliary_loss_mlp": 0.01050413, + "balance_loss_clip": 1.05498683, + "balance_loss_mlp": 1.03223372, + "epoch": 0.10365248759957914, + "flos": 22458161325600.0, + "grad_norm": 3.088094419999352, + "language_loss": 0.77520835, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.79750091, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.1817627, + "step": 1724, + "time_per_iteration": 2.6613779067993164 + }, + { + "auxiliary_loss_clip": 0.01184862, + "auxiliary_loss_mlp": 0.01052725, + "balance_loss_clip": 1.05926824, + "balance_loss_mlp": 1.03490353, + "epoch": 0.1037126108522471, + "flos": 58027340230560.0, + "grad_norm": 2.0131594601062113, + "language_loss": 0.74837989, + "learning_rate": 3.943273412987676e-06, + "loss": 0.77075577, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 1.25878906, + "router_z_loss_mlp": 0.17822266, + "step": 1725, + "time_per_iteration": 2.894009590148926 + }, + { + "auxiliary_loss_clip": 0.01179345, + "auxiliary_loss_mlp": 0.01044203, + "balance_loss_clip": 1.05962169, + "balance_loss_mlp": 1.02632177, + "epoch": 0.10377273410491508, + "flos": 27841450645440.0, + "grad_norm": 1.952531475940922, + "language_loss": 0.74627197, + "learning_rate": 3.943181276805054e-06, + "loss": 0.76850748, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.17883301, + "step": 1726, + "time_per_iteration": 2.6872875690460205 + }, + { + "auxiliary_loss_clip": 0.01182321, + "auxiliary_loss_mlp": 0.01054817, + "balance_loss_clip": 1.05937767, + "balance_loss_mlp": 1.03538609, + "epoch": 0.10383285735758305, + "flos": 31895072313120.0, + "grad_norm": 2.995991670690617, + "language_loss": 0.73618001, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.75855136, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.19421387, + "step": 1727, + "time_per_iteration": 2.6955928802490234 + }, + { + "auxiliary_loss_clip": 0.01179726, + "auxiliary_loss_mlp": 0.01048369, + "balance_loss_clip": 1.05759048, + "balance_loss_mlp": 1.02968919, + "epoch": 0.10389298061025101, + "flos": 20855202204960.0, + "grad_norm": 2.413964648198625, + "language_loss": 0.85163331, + "learning_rate": 3.942996783386422e-06, + "loss": 0.87391424, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 1.22167969, + "router_z_loss_mlp": 0.18676758, + "step": 1728, + "time_per_iteration": 2.648975372314453 + }, + { + "auxiliary_loss_clip": 0.01181414, + "auxiliary_loss_mlp": 0.01045201, + "balance_loss_clip": 1.05968714, + "balance_loss_mlp": 1.02627039, + "epoch": 0.10395310386291898, + "flos": 25351614239040.0, + "grad_norm": 2.0485693136439926, + "language_loss": 0.70591342, + "learning_rate": 3.942904426157406e-06, + "loss": 0.72817957, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.18933105, + "step": 1729, + "time_per_iteration": 2.663821220397949 + }, + { + "auxiliary_loss_clip": 0.01180521, + "auxiliary_loss_mlp": 0.0105163, + "balance_loss_clip": 1.05895972, + "balance_loss_mlp": 1.03123355, + "epoch": 0.10401322711558696, + "flos": 15643316095200.0, + "grad_norm": 2.228178226765417, + "language_loss": 0.81072056, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.83304203, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.20397949, + "step": 1730, + "time_per_iteration": 2.6581289768218994 + }, + { + "auxiliary_loss_clip": 0.01177065, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.05673027, + "balance_loss_mlp": 1.02851593, + "epoch": 0.10407335036825492, + "flos": 28290237569280.0, + "grad_norm": 2.652666199929168, + "language_loss": 0.76063389, + "learning_rate": 3.942719490677489e-06, + "loss": 0.78286612, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.17626953, + "step": 1731, + "time_per_iteration": 2.685479164123535 + }, + { + "auxiliary_loss_clip": 0.01174425, + "auxiliary_loss_mlp": 0.01044953, + "balance_loss_clip": 1.05807042, + "balance_loss_mlp": 1.02746499, + "epoch": 0.10413347362092289, + "flos": 31852778623200.0, + "grad_norm": 1.7595682296660915, + "language_loss": 0.82514369, + "learning_rate": 3.9426269124336e-06, + "loss": 0.84733748, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.17504883, + "step": 1732, + "time_per_iteration": 2.7539825439453125 + }, + { + "auxiliary_loss_clip": 0.01182427, + "auxiliary_loss_mlp": 0.01044748, + "balance_loss_clip": 1.06289923, + "balance_loss_mlp": 1.02731955, + "epoch": 0.10419359687359087, + "flos": 15424027810560.0, + "grad_norm": 2.219127891341431, + "language_loss": 0.83750916, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85978091, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.17431641, + "step": 1733, + "time_per_iteration": 2.6180806159973145 + }, + { + "auxiliary_loss_clip": 0.01184618, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_clip": 1.06139302, + "balance_loss_mlp": 1.02599525, + "epoch": 0.10425372012625883, + "flos": 14794883357760.0, + "grad_norm": 2.283946296148232, + "language_loss": 0.76231551, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78460222, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 1.23242188, + "router_z_loss_mlp": 0.18078613, + "step": 1734, + "time_per_iteration": 2.7100155353546143 + }, + { + "auxiliary_loss_clip": 0.0118006, + "auxiliary_loss_mlp": 0.01045878, + "balance_loss_clip": 1.06064439, + "balance_loss_mlp": 1.02805662, + "epoch": 0.1043138433789268, + "flos": 30917327469120.0, + "grad_norm": 1.7279635407632408, + "language_loss": 0.74802709, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.77028644, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.17834473, + "step": 1735, + "time_per_iteration": 2.736750602722168 + }, + { + "auxiliary_loss_clip": 0.01185051, + "auxiliary_loss_mlp": 0.01046942, + "balance_loss_clip": 1.06144369, + "balance_loss_mlp": 1.02864361, + "epoch": 0.10437396663159478, + "flos": 35590612546080.0, + "grad_norm": 1.729418553054735, + "language_loss": 0.78654957, + "learning_rate": 3.94225586284712e-06, + "loss": 0.80886954, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 1.23632812, + "router_z_loss_mlp": 0.1829834, + "step": 1736, + "time_per_iteration": 2.9040722846984863 + }, + { + "auxiliary_loss_clip": 0.01179116, + "auxiliary_loss_mlp": 0.01055507, + "balance_loss_clip": 1.06009173, + "balance_loss_mlp": 1.03730333, + "epoch": 0.10443408988426274, + "flos": 30604294899360.0, + "grad_norm": 1.8221347770388148, + "language_loss": 0.70745301, + "learning_rate": 3.942162916315356e-06, + "loss": 0.72979927, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.18188477, + "step": 1737, + "time_per_iteration": 2.6982996463775635 + }, + { + "auxiliary_loss_clip": 0.01183509, + "auxiliary_loss_mlp": 0.01057996, + "balance_loss_clip": 1.05806279, + "balance_loss_mlp": 1.0361445, + "epoch": 0.1044942131369307, + "flos": 32652759044160.0, + "grad_norm": 2.1690580674319335, + "language_loss": 0.81407696, + "learning_rate": 3.942069896136581e-06, + "loss": 0.836492, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.21838379, + "step": 1738, + "time_per_iteration": 2.727046012878418 + }, + { + "auxiliary_loss_clip": 0.0118048, + "auxiliary_loss_mlp": 0.01058136, + "balance_loss_clip": 1.05590653, + "balance_loss_mlp": 1.03757262, + "epoch": 0.10455433638959867, + "flos": 22503453294240.0, + "grad_norm": 2.3854402541859687, + "language_loss": 0.74581665, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.76820278, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 1.24511719, + "router_z_loss_mlp": 0.20568848, + "step": 1739, + "time_per_iteration": 2.6559243202209473 + }, + { + "auxiliary_loss_clip": 0.0117905, + "auxiliary_loss_mlp": 0.01054332, + "balance_loss_clip": 1.05841184, + "balance_loss_mlp": 1.03573573, + "epoch": 0.10461445964226665, + "flos": 28332288155520.0, + "grad_norm": 2.109860157022448, + "language_loss": 0.77289522, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79522908, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 1.20703125, + "router_z_loss_mlp": 0.18579102, + "step": 1740, + "time_per_iteration": 2.6585605144500732 + }, + { + "auxiliary_loss_clip": 0.01184511, + "auxiliary_loss_mlp": 0.01050735, + "balance_loss_clip": 1.06484103, + "balance_loss_mlp": 1.03225732, + "epoch": 0.10467458289493461, + "flos": 29707085707200.0, + "grad_norm": 2.2624829433217983, + "language_loss": 0.86233044, + "learning_rate": 3.941790393753467e-06, + "loss": 0.88468289, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.18469238, + "step": 1741, + "time_per_iteration": 2.6922414302825928 + }, + { + "auxiliary_loss_clip": 0.01183276, + "auxiliary_loss_mlp": 0.01053419, + "balance_loss_clip": 1.05943167, + "balance_loss_mlp": 1.03291512, + "epoch": 0.10473470614760258, + "flos": 25975653514560.0, + "grad_norm": 3.0727282201625634, + "language_loss": 0.75413281, + "learning_rate": 3.941697079021942e-06, + "loss": 0.77649975, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 1.23925781, + "router_z_loss_mlp": 0.20507812, + "step": 1742, + "time_per_iteration": 2.6922061443328857 + }, + { + "auxiliary_loss_clip": 0.01184361, + "auxiliary_loss_mlp": 0.01062087, + "balance_loss_clip": 1.06372607, + "balance_loss_mlp": 1.04365766, + "epoch": 0.10479482940027056, + "flos": 26463047055840.0, + "grad_norm": 2.064317190361714, + "language_loss": 0.87147284, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89393735, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.1842041, + "step": 1743, + "time_per_iteration": 2.680410861968994 + }, + { + "auxiliary_loss_clip": 0.01180007, + "auxiliary_loss_mlp": 0.01048624, + "balance_loss_clip": 1.05974114, + "balance_loss_mlp": 1.02868044, + "epoch": 0.10485495265293852, + "flos": 26324902182240.0, + "grad_norm": 1.952239447449477, + "language_loss": 0.7559278, + "learning_rate": 3.941510228674391e-06, + "loss": 0.7782141, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.19934082, + "step": 1744, + "time_per_iteration": 2.6652026176452637 + }, + { + "auxiliary_loss_clip": 0.01180866, + "auxiliary_loss_mlp": 0.01059459, + "balance_loss_clip": 1.06047177, + "balance_loss_mlp": 1.04143465, + "epoch": 0.10491507590560649, + "flos": 46322514744480.0, + "grad_norm": 2.0955571187332414, + "language_loss": 0.79336941, + "learning_rate": 3.941416693065451e-06, + "loss": 0.81577265, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.18029785, + "step": 1745, + "time_per_iteration": 2.87904691696167 + }, + { + "auxiliary_loss_clip": 0.01178343, + "auxiliary_loss_mlp": 0.01063044, + "balance_loss_clip": 1.05668342, + "balance_loss_mlp": 1.04404175, + "epoch": 0.10497519915827447, + "flos": 32209198849440.0, + "grad_norm": 2.0834657949877102, + "language_loss": 0.82867539, + "learning_rate": 3.941323083837794e-06, + "loss": 0.8510893, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.18994141, + "step": 1746, + "time_per_iteration": 2.735029935836792 + }, + { + "auxiliary_loss_clip": 0.01180957, + "auxiliary_loss_mlp": 0.01060923, + "balance_loss_clip": 1.06118798, + "balance_loss_mlp": 1.04231429, + "epoch": 0.10503532241094243, + "flos": 49617558300960.0, + "grad_norm": 2.0521635872822857, + "language_loss": 0.7021879, + "learning_rate": 3.941229400994971e-06, + "loss": 0.72460663, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.18615723, + "step": 1747, + "time_per_iteration": 2.833019733428955 + }, + { + "auxiliary_loss_clip": 0.01186829, + "auxiliary_loss_mlp": 0.01065373, + "balance_loss_clip": 1.06009626, + "balance_loss_mlp": 1.04592991, + "epoch": 0.1050954456636104, + "flos": 36349757899200.0, + "grad_norm": 2.1487886569575303, + "language_loss": 0.84729481, + "learning_rate": 3.941135644540535e-06, + "loss": 0.86981684, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 1.26660156, + "router_z_loss_mlp": 0.19433594, + "step": 1748, + "time_per_iteration": 2.9469985961914062 + }, + { + "auxiliary_loss_clip": 0.01175039, + "auxiliary_loss_mlp": 0.01045728, + "balance_loss_clip": 1.0558188, + "balance_loss_mlp": 1.02572465, + "epoch": 0.10515556891627838, + "flos": 29222811996480.0, + "grad_norm": 1.9765459364466444, + "language_loss": 0.71613055, + "learning_rate": 3.941041814478041e-06, + "loss": 0.73833823, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.19995117, + "step": 1749, + "time_per_iteration": 2.7162926197052 + }, + { + "auxiliary_loss_clip": 0.01174129, + "auxiliary_loss_mlp": 0.01057827, + "balance_loss_clip": 1.05626392, + "balance_loss_mlp": 1.03907526, + "epoch": 0.10521569216894634, + "flos": 22280234833440.0, + "grad_norm": 2.140823092332965, + "language_loss": 0.81331658, + "learning_rate": 3.940947910811047e-06, + "loss": 0.83563614, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.1875, + "step": 1750, + "time_per_iteration": 2.6576175689697266 + }, + { + "auxiliary_loss_clip": 0.01183541, + "auxiliary_loss_mlp": 0.01059251, + "balance_loss_clip": 1.06097651, + "balance_loss_mlp": 1.03954613, + "epoch": 0.10527581542161431, + "flos": 19072655383680.0, + "grad_norm": 2.9501804083130234, + "language_loss": 0.92612118, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94854903, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 1.22558594, + "router_z_loss_mlp": 0.19702148, + "step": 1751, + "time_per_iteration": 2.619723081588745 + }, + { + "auxiliary_loss_clip": 0.01176646, + "auxiliary_loss_mlp": 0.01046616, + "balance_loss_clip": 1.05753052, + "balance_loss_mlp": 1.02801943, + "epoch": 0.10533593867428227, + "flos": 22325283698400.0, + "grad_norm": 2.527385530367795, + "language_loss": 0.79132342, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81355608, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.18591309, + "step": 1752, + "time_per_iteration": 4.145538330078125 + }, + { + "auxiliary_loss_clip": 0.01178374, + "auxiliary_loss_mlp": 0.01049172, + "balance_loss_clip": 1.05940628, + "balance_loss_mlp": 1.02987194, + "epoch": 0.10539606192695025, + "flos": 35414792952480.0, + "grad_norm": 2.024808210808717, + "language_loss": 0.75829643, + "learning_rate": 3.940665758218686e-06, + "loss": 0.78057194, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.19311523, + "step": 1753, + "time_per_iteration": 4.257465362548828 + }, + { + "auxiliary_loss_clip": 0.01183928, + "auxiliary_loss_mlp": 0.01053709, + "balance_loss_clip": 1.05854666, + "balance_loss_mlp": 1.0325731, + "epoch": 0.10545618517961822, + "flos": 24367021974720.0, + "grad_norm": 1.8851019693842672, + "language_loss": 0.84093082, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86330724, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 1.25292969, + "router_z_loss_mlp": 0.21130371, + "step": 1754, + "time_per_iteration": 2.6723334789276123 + }, + { + "auxiliary_loss_clip": 0.01183922, + "auxiliary_loss_mlp": 0.01047317, + "balance_loss_clip": 1.0617249, + "balance_loss_mlp": 1.02703953, + "epoch": 0.10551630843228618, + "flos": 19697910177600.0, + "grad_norm": 2.4115320662925033, + "language_loss": 0.68857288, + "learning_rate": 3.940477288533302e-06, + "loss": 0.71088529, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 1.22363281, + "router_z_loss_mlp": 0.20300293, + "step": 1755, + "time_per_iteration": 5.570379257202148 + }, + { + "auxiliary_loss_clip": 0.0118257, + "auxiliary_loss_mlp": 0.01060231, + "balance_loss_clip": 1.05896616, + "balance_loss_mlp": 1.04003668, + "epoch": 0.10557643168495416, + "flos": 28601933068800.0, + "grad_norm": 2.2966512613143286, + "language_loss": 0.77056009, + "learning_rate": 3.940382943314182e-06, + "loss": 0.79298806, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 1.234375, + "router_z_loss_mlp": 0.20202637, + "step": 1756, + "time_per_iteration": 2.681612253189087 + }, + { + "auxiliary_loss_clip": 0.01182413, + "auxiliary_loss_mlp": 0.01059201, + "balance_loss_clip": 1.05854964, + "balance_loss_mlp": 1.03999686, + "epoch": 0.10563655493762213, + "flos": 26599287617280.0, + "grad_norm": 3.3919573249940638, + "language_loss": 0.79743141, + "learning_rate": 3.940288524515547e-06, + "loss": 0.81984758, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 1.24023438, + "router_z_loss_mlp": 0.19213867, + "step": 1757, + "time_per_iteration": 2.667940139770508 + }, + { + "auxiliary_loss_clip": 0.01179467, + "auxiliary_loss_mlp": 0.01053339, + "balance_loss_clip": 1.05748987, + "balance_loss_mlp": 1.03344309, + "epoch": 0.10569667819029009, + "flos": 65650026234240.0, + "grad_norm": 2.036936937659444, + "language_loss": 0.78873205, + "learning_rate": 3.940194032140976e-06, + "loss": 0.81106007, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.19873047, + "step": 1758, + "time_per_iteration": 2.9568583965301514 + }, + { + "auxiliary_loss_clip": 0.0118655, + "auxiliary_loss_mlp": 0.01050902, + "balance_loss_clip": 1.06139314, + "balance_loss_mlp": 1.03125644, + "epoch": 0.10575680144295807, + "flos": 27974328272640.0, + "grad_norm": 1.8410917649583216, + "language_loss": 0.91927004, + "learning_rate": 3.940099466194054e-06, + "loss": 0.94164455, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 1.25097656, + "router_z_loss_mlp": 0.19641113, + "step": 1759, + "time_per_iteration": 2.911688804626465 + }, + { + "auxiliary_loss_clip": 0.01181757, + "auxiliary_loss_mlp": 0.01052013, + "balance_loss_clip": 1.05640674, + "balance_loss_mlp": 1.03116357, + "epoch": 0.10581692469562604, + "flos": 17249111425440.0, + "grad_norm": 2.3892358071881965, + "language_loss": 0.77410811, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79644573, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 1.25390625, + "router_z_loss_mlp": 0.20861816, + "step": 1760, + "time_per_iteration": 2.624453544616699 + }, + { + "auxiliary_loss_clip": 0.0118036, + "auxiliary_loss_mlp": 0.01059825, + "balance_loss_clip": 1.05620074, + "balance_loss_mlp": 1.03830731, + "epoch": 0.105877047948294, + "flos": 31675176269280.0, + "grad_norm": 2.68508762074009, + "language_loss": 0.89371306, + "learning_rate": 3.939910113597498e-06, + "loss": 0.91611493, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 0.21496582, + "step": 1761, + "time_per_iteration": 2.710071563720703 + }, + { + "auxiliary_loss_clip": 0.01178932, + "auxiliary_loss_mlp": 0.01059663, + "balance_loss_clip": 1.05744934, + "balance_loss_mlp": 1.0399574, + "epoch": 0.10593717120096197, + "flos": 37417965128640.0, + "grad_norm": 2.2992026219546817, + "language_loss": 0.78255117, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.80493706, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 1.21386719, + "router_z_loss_mlp": 0.19702148, + "step": 1762, + "time_per_iteration": 2.732642650604248 + }, + { + "auxiliary_loss_clip": 0.01067543, + "auxiliary_loss_mlp": 0.01019351, + "balance_loss_clip": 1.02176285, + "balance_loss_mlp": 1.01652467, + "epoch": 0.10599729445362994, + "flos": 81064781938080.0, + "grad_norm": 0.7591310154239933, + "language_loss": 0.60488129, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62575018, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 0.45776367, + "router_z_loss_mlp": 0.02825928, + "step": 1763, + "time_per_iteration": 3.4285402297973633 + }, + { + "auxiliary_loss_clip": 0.0117581, + "auxiliary_loss_mlp": 0.01044918, + "balance_loss_clip": 1.05421734, + "balance_loss_mlp": 1.02599907, + "epoch": 0.10605741770629791, + "flos": 29222163720000.0, + "grad_norm": 1.5937044070856428, + "language_loss": 0.79918551, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82139283, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 1.21679688, + "router_z_loss_mlp": 0.18896484, + "step": 1764, + "time_per_iteration": 2.6845645904541016 + }, + { + "auxiliary_loss_clip": 0.01176305, + "auxiliary_loss_mlp": 0.01051421, + "balance_loss_clip": 1.05672598, + "balance_loss_mlp": 1.03158498, + "epoch": 0.10611754095896588, + "flos": 23656896180000.0, + "grad_norm": 1.593660851106388, + "language_loss": 0.79951298, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.82179034, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.19836426, + "step": 1765, + "time_per_iteration": 2.638700008392334 + }, + { + "auxiliary_loss_clip": 0.01173514, + "auxiliary_loss_mlp": 0.0105834, + "balance_loss_clip": 1.05362523, + "balance_loss_mlp": 1.03790808, + "epoch": 0.10617766421163385, + "flos": 27134566233120.0, + "grad_norm": 1.6721850609807105, + "language_loss": 0.76521158, + "learning_rate": 3.939435444841306e-06, + "loss": 0.78753012, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.20446777, + "step": 1766, + "time_per_iteration": 2.7096962928771973 + }, + { + "auxiliary_loss_clip": 0.01177663, + "auxiliary_loss_mlp": 0.0106437, + "balance_loss_clip": 1.05703759, + "balance_loss_mlp": 1.04456902, + "epoch": 0.10623778746430182, + "flos": 34659537258240.0, + "grad_norm": 1.8616972984053841, + "language_loss": 0.77612948, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79854977, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.19787598, + "step": 1767, + "time_per_iteration": 2.7048370838165283 + }, + { + "auxiliary_loss_clip": 0.01061329, + "auxiliary_loss_mlp": 0.01006078, + "balance_loss_clip": 1.01631248, + "balance_loss_mlp": 1.00360775, + "epoch": 0.10629791071696978, + "flos": 78378867298080.0, + "grad_norm": 0.6735462847630931, + "language_loss": 0.57888508, + "learning_rate": 3.939245062508506e-06, + "loss": 0.59955919, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 0.45043945, + "router_z_loss_mlp": 0.02468872, + "step": 1768, + "time_per_iteration": 3.357118606567383 + }, + { + "auxiliary_loss_clip": 0.01176521, + "auxiliary_loss_mlp": 0.01045832, + "balance_loss_clip": 1.05583429, + "balance_loss_mlp": 1.02722394, + "epoch": 0.10635803396963776, + "flos": 27756782231040.0, + "grad_norm": 1.3633672261362153, + "language_loss": 0.86242032, + "learning_rate": 3.939149761035749e-06, + "loss": 0.88464391, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.18615723, + "step": 1769, + "time_per_iteration": 2.6972289085388184 + }, + { + "auxiliary_loss_clip": 0.01178523, + "auxiliary_loss_mlp": 0.01051268, + "balance_loss_clip": 1.05476737, + "balance_loss_mlp": 1.03100264, + "epoch": 0.10641815722230573, + "flos": 38309663970720.0, + "grad_norm": 1.8892185205395198, + "language_loss": 0.6140669, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.63636482, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 1.23828125, + "router_z_loss_mlp": 0.20263672, + "step": 1770, + "time_per_iteration": 2.756849765777588 + }, + { + "auxiliary_loss_clip": 0.01060419, + "auxiliary_loss_mlp": 0.01009212, + "balance_loss_clip": 1.01544154, + "balance_loss_mlp": 1.00672626, + "epoch": 0.1064782804749737, + "flos": 71447588105760.0, + "grad_norm": 0.9853542136871597, + "language_loss": 0.5705393, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59123558, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 0.44995117, + "router_z_loss_mlp": 0.02484131, + "step": 1771, + "time_per_iteration": 3.2265474796295166 + }, + { + "auxiliary_loss_clip": 0.01178796, + "auxiliary_loss_mlp": 0.01062717, + "balance_loss_clip": 1.05855775, + "balance_loss_mlp": 1.04394174, + "epoch": 0.10653840372764166, + "flos": 29270251380960.0, + "grad_norm": 1.6894051847541234, + "language_loss": 0.88398993, + "learning_rate": 3.938863415435429e-06, + "loss": 0.90640503, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 1.20019531, + "router_z_loss_mlp": 0.18774414, + "step": 1772, + "time_per_iteration": 2.7154922485351562 + }, + { + "auxiliary_loss_clip": 0.01179805, + "auxiliary_loss_mlp": 0.01051709, + "balance_loss_clip": 1.05495, + "balance_loss_mlp": 1.03090727, + "epoch": 0.10659852698030964, + "flos": 22323136282560.0, + "grad_norm": 2.6605103651635464, + "language_loss": 0.76543176, + "learning_rate": 3.93876781985337e-06, + "loss": 0.78774691, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 1.24707031, + "router_z_loss_mlp": 0.20800781, + "step": 1773, + "time_per_iteration": 2.6581926345825195 + }, + { + "auxiliary_loss_clip": 0.01175249, + "auxiliary_loss_mlp": 0.01061554, + "balance_loss_clip": 1.05601311, + "balance_loss_mlp": 1.0410856, + "epoch": 0.1066586502329776, + "flos": 39243291847200.0, + "grad_norm": 3.71544553565784, + "language_loss": 0.82794213, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85031015, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.20458984, + "step": 1774, + "time_per_iteration": 2.732055902481079 + }, + { + "auxiliary_loss_clip": 0.01183144, + "auxiliary_loss_mlp": 0.01054529, + "balance_loss_clip": 1.05783939, + "balance_loss_mlp": 1.03432333, + "epoch": 0.10671877348564557, + "flos": 21701284940160.0, + "grad_norm": 2.4646273829832737, + "language_loss": 0.76411748, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78649426, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 1.25195312, + "router_z_loss_mlp": 0.20214844, + "step": 1775, + "time_per_iteration": 2.645233631134033 + }, + { + "auxiliary_loss_clip": 0.01058991, + "auxiliary_loss_mlp": 0.01003355, + "balance_loss_clip": 1.01463366, + "balance_loss_mlp": 1.00091839, + "epoch": 0.10677889673831355, + "flos": 77494340014560.0, + "grad_norm": 0.9198818773294454, + "language_loss": 0.57433695, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59496045, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 0.44360352, + "router_z_loss_mlp": 0.02433777, + "step": 1776, + "time_per_iteration": 3.2721991539001465 + }, + { + "auxiliary_loss_clip": 0.01180294, + "auxiliary_loss_mlp": 0.01059339, + "balance_loss_clip": 1.05823898, + "balance_loss_mlp": 1.0374645, + "epoch": 0.10683901999098151, + "flos": 26866136838240.0, + "grad_norm": 1.547246231396573, + "language_loss": 0.83440721, + "learning_rate": 3.938384702378727e-06, + "loss": 0.8568036, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.21887207, + "step": 1777, + "time_per_iteration": 2.7134053707122803 + }, + { + "auxiliary_loss_clip": 0.01173221, + "auxiliary_loss_mlp": 0.01050322, + "balance_loss_clip": 1.05648899, + "balance_loss_mlp": 1.03132021, + "epoch": 0.10689914324364948, + "flos": 30558071033280.0, + "grad_norm": 2.155427161211277, + "language_loss": 0.87468588, + "learning_rate": 3.938288739241625e-06, + "loss": 0.89692128, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.19018555, + "step": 1778, + "time_per_iteration": 2.6750566959381104 + }, + { + "auxiliary_loss_clip": 0.01178122, + "auxiliary_loss_mlp": 0.01049659, + "balance_loss_clip": 1.0580368, + "balance_loss_mlp": 1.02982247, + "epoch": 0.10695926649631746, + "flos": 20053884713760.0, + "grad_norm": 1.89163154526747, + "language_loss": 0.84219587, + "learning_rate": 3.938192702604417e-06, + "loss": 0.8644737, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 1.19921875, + "router_z_loss_mlp": 0.19836426, + "step": 1779, + "time_per_iteration": 2.63407039642334 + }, + { + "auxiliary_loss_clip": 0.01173528, + "auxiliary_loss_mlp": 0.01053095, + "balance_loss_clip": 1.05416703, + "balance_loss_mlp": 1.03442717, + "epoch": 0.10701938974898542, + "flos": 20718110780640.0, + "grad_norm": 1.8917015155545003, + "language_loss": 0.67505234, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.69731855, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 1.19335938, + "router_z_loss_mlp": 0.18676758, + "step": 1780, + "time_per_iteration": 2.6376004219055176 + }, + { + "auxiliary_loss_clip": 0.01176829, + "auxiliary_loss_mlp": 0.0104792, + "balance_loss_clip": 1.05708361, + "balance_loss_mlp": 1.02833402, + "epoch": 0.10707951300165339, + "flos": 19386336229920.0, + "grad_norm": 5.977854303743617, + "language_loss": 0.91585004, + "learning_rate": 3.938000408844265e-06, + "loss": 0.93809748, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.19592285, + "step": 1781, + "time_per_iteration": 2.6222550868988037 + }, + { + "auxiliary_loss_clip": 0.01177569, + "auxiliary_loss_mlp": 0.01048928, + "balance_loss_clip": 1.05769467, + "balance_loss_mlp": 1.02986634, + "epoch": 0.10713963625432135, + "flos": 17383852847520.0, + "grad_norm": 1.9694060871670769, + "language_loss": 0.79029614, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81256109, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.19055176, + "step": 1782, + "time_per_iteration": 2.626864194869995 + }, + { + "auxiliary_loss_clip": 0.01178772, + "auxiliary_loss_mlp": 0.01050867, + "balance_loss_clip": 1.05692983, + "balance_loss_mlp": 1.03086329, + "epoch": 0.10719975950698933, + "flos": 20447007245280.0, + "grad_norm": 2.157432727743675, + "language_loss": 0.78990042, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81219685, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.1998291, + "step": 1783, + "time_per_iteration": 2.644176959991455 + }, + { + "auxiliary_loss_clip": 0.01177641, + "auxiliary_loss_mlp": 0.01056546, + "balance_loss_clip": 1.05630493, + "balance_loss_mlp": 1.03648353, + "epoch": 0.1072598827596573, + "flos": 27711733366080.0, + "grad_norm": 2.283284519750956, + "language_loss": 0.86456335, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88690531, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.20056152, + "step": 1784, + "time_per_iteration": 2.9521636962890625 + }, + { + "auxiliary_loss_clip": 0.01178392, + "auxiliary_loss_mlp": 0.01053741, + "balance_loss_clip": 1.05621672, + "balance_loss_mlp": 1.03329623, + "epoch": 0.10732000601232526, + "flos": 28379970643680.0, + "grad_norm": 2.289382837772116, + "language_loss": 1.01035082, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03267205, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 1.22070312, + "router_z_loss_mlp": 0.20446777, + "step": 1785, + "time_per_iteration": 2.6653048992156982 + }, + { + "auxiliary_loss_clip": 0.0117322, + "auxiliary_loss_mlp": 0.01053955, + "balance_loss_clip": 1.05758107, + "balance_loss_mlp": 1.03488159, + "epoch": 0.10738012926499324, + "flos": 29537748878400.0, + "grad_norm": 1.3399050373942862, + "language_loss": 0.84883797, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87110972, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.1907959, + "step": 1786, + "time_per_iteration": 2.6723456382751465 + }, + { + "auxiliary_loss_clip": 0.01175002, + "auxiliary_loss_mlp": 0.0105332, + "balance_loss_clip": 1.05395246, + "balance_loss_mlp": 1.03221965, + "epoch": 0.1074402525176612, + "flos": 25530391594080.0, + "grad_norm": 1.7572571658238527, + "language_loss": 0.78789127, + "learning_rate": 3.937421763940642e-06, + "loss": 0.81017447, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 1.2109375, + "router_z_loss_mlp": 0.21118164, + "step": 1787, + "time_per_iteration": 2.611483335494995 + }, + { + "auxiliary_loss_clip": 0.0117749, + "auxiliary_loss_mlp": 0.01050757, + "balance_loss_clip": 1.05463982, + "balance_loss_mlp": 1.03062224, + "epoch": 0.10750037577032917, + "flos": 20678329162080.0, + "grad_norm": 1.9597486013741676, + "language_loss": 0.82892138, + "learning_rate": 3.937325065966719e-06, + "loss": 0.8512038, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 1.23046875, + "router_z_loss_mlp": 0.20129395, + "step": 1788, + "time_per_iteration": 2.6779744625091553 + }, + { + "auxiliary_loss_clip": 0.01173202, + "auxiliary_loss_mlp": 0.01056532, + "balance_loss_clip": 1.05523455, + "balance_loss_mlp": 1.03762555, + "epoch": 0.10756049902299715, + "flos": 24729236172000.0, + "grad_norm": 1.9023326043065472, + "language_loss": 0.78072917, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80302656, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.18884277, + "step": 1789, + "time_per_iteration": 2.654904365539551 + }, + { + "auxiliary_loss_clip": 0.01174735, + "auxiliary_loss_mlp": 0.0105806, + "balance_loss_clip": 1.05500603, + "balance_loss_mlp": 1.03599489, + "epoch": 0.10762062227566511, + "flos": 28780548354720.0, + "grad_norm": 2.7976944626788387, + "language_loss": 0.7466507, + "learning_rate": 3.937131449631859e-06, + "loss": 0.76897871, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.22070312, + "step": 1790, + "time_per_iteration": 2.7087900638580322 + }, + { + "auxiliary_loss_clip": 0.01179952, + "auxiliary_loss_mlp": 0.01061558, + "balance_loss_clip": 1.05931783, + "balance_loss_mlp": 1.0401957, + "epoch": 0.10768074552833308, + "flos": 29664670465440.0, + "grad_norm": 4.631464907295518, + "language_loss": 0.78236353, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.8047787, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 1.20605469, + "router_z_loss_mlp": 0.21362305, + "step": 1791, + "time_per_iteration": 2.6799776554107666 + }, + { + "auxiliary_loss_clip": 0.01169683, + "auxiliary_loss_mlp": 0.0105248, + "balance_loss_clip": 1.0549624, + "balance_loss_mlp": 1.03316855, + "epoch": 0.10774086878100106, + "flos": 30871549293120.0, + "grad_norm": 4.149828480874808, + "language_loss": 0.70678878, + "learning_rate": 3.936937539472126e-06, + "loss": 0.7290104, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.19311523, + "step": 1792, + "time_per_iteration": 5.621631860733032 + }, + { + "auxiliary_loss_clip": 0.01177275, + "auxiliary_loss_mlp": 0.01043332, + "balance_loss_clip": 1.05476356, + "balance_loss_mlp": 1.02238703, + "epoch": 0.10780099203366902, + "flos": 26910496909440.0, + "grad_norm": 1.738828386265441, + "language_loss": 0.760993, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78319907, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 1.22460938, + "router_z_loss_mlp": 0.20947266, + "step": 1793, + "time_per_iteration": 2.6587512493133545 + }, + { + "auxiliary_loss_clip": 0.01173193, + "auxiliary_loss_mlp": 0.01055333, + "balance_loss_clip": 1.0578655, + "balance_loss_mlp": 1.03543723, + "epoch": 0.10786111528633699, + "flos": 27756660679200.0, + "grad_norm": 1.7530596705350883, + "language_loss": 0.85006547, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87235075, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.19885254, + "step": 1794, + "time_per_iteration": 2.6917855739593506 + }, + { + "auxiliary_loss_clip": 0.01181789, + "auxiliary_loss_mlp": 0.01051363, + "balance_loss_clip": 1.05800092, + "balance_loss_mlp": 1.03085876, + "epoch": 0.10792123853900495, + "flos": 25442441280000.0, + "grad_norm": 2.1213858277672992, + "language_loss": 0.74717575, + "learning_rate": 3.936646123375246e-06, + "loss": 0.76950729, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 1.23730469, + "router_z_loss_mlp": 0.20532227, + "step": 1795, + "time_per_iteration": 5.76621675491333 + }, + { + "auxiliary_loss_clip": 0.01177077, + "auxiliary_loss_mlp": 0.01058902, + "balance_loss_clip": 1.05574512, + "balance_loss_mlp": 1.03826737, + "epoch": 0.10798136179167293, + "flos": 21656803317120.0, + "grad_norm": 3.639256372810956, + "language_loss": 0.8166495, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83900928, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.2064209, + "step": 1796, + "time_per_iteration": 2.602994680404663 + }, + { + "auxiliary_loss_clip": 0.01179352, + "auxiliary_loss_mlp": 0.01072495, + "balance_loss_clip": 1.05815852, + "balance_loss_mlp": 1.05209827, + "epoch": 0.1080414850443409, + "flos": 16581562941600.0, + "grad_norm": 2.202524544921888, + "language_loss": 0.73935848, + "learning_rate": 3.936451478782111e-06, + "loss": 0.76187694, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.20385742, + "step": 1797, + "time_per_iteration": 2.6426193714141846 + }, + { + "auxiliary_loss_clip": 0.01173556, + "auxiliary_loss_mlp": 0.01047027, + "balance_loss_clip": 1.05485439, + "balance_loss_mlp": 1.02927709, + "epoch": 0.10810160829700886, + "flos": 19832084357760.0, + "grad_norm": 1.9636604724906472, + "language_loss": 0.81383973, + "learning_rate": 3.936354046338046e-06, + "loss": 0.83604562, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.17749023, + "step": 1798, + "time_per_iteration": 2.6179587841033936 + }, + { + "auxiliary_loss_clip": 0.01176826, + "auxiliary_loss_mlp": 0.01055324, + "balance_loss_clip": 1.05702853, + "balance_loss_mlp": 1.03495073, + "epoch": 0.10816173154967684, + "flos": 18495447733440.0, + "grad_norm": 2.5550889890086212, + "language_loss": 0.85670501, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87902653, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.20373535, + "step": 1799, + "time_per_iteration": 2.613619089126587 + }, + { + "auxiliary_loss_clip": 0.01173895, + "auxiliary_loss_mlp": 0.01054776, + "balance_loss_clip": 1.05836773, + "balance_loss_mlp": 1.03628635, + "epoch": 0.10822185480234481, + "flos": 21701487526560.0, + "grad_norm": 2.1364202194360358, + "language_loss": 0.7720862, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79437292, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.18469238, + "step": 1800, + "time_per_iteration": 2.637120485305786 + }, + { + "auxiliary_loss_clip": 0.01172776, + "auxiliary_loss_mlp": 0.01047484, + "balance_loss_clip": 1.05689955, + "balance_loss_mlp": 1.02944803, + "epoch": 0.10828197805501277, + "flos": 31184662897440.0, + "grad_norm": 1.9600083621338493, + "language_loss": 0.73081452, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.75301713, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.18029785, + "step": 1801, + "time_per_iteration": 2.6699254512786865 + }, + { + "auxiliary_loss_clip": 0.01182923, + "auxiliary_loss_mlp": 0.01047762, + "balance_loss_clip": 1.0584054, + "balance_loss_mlp": 1.02891564, + "epoch": 0.10834210130768075, + "flos": 35367677706240.0, + "grad_norm": 1.9608545507775208, + "language_loss": 0.66211951, + "learning_rate": 3.935963582331381e-06, + "loss": 0.68442637, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 1.24609375, + "router_z_loss_mlp": 0.18847656, + "step": 1802, + "time_per_iteration": 2.784799337387085 + }, + { + "auxiliary_loss_clip": 0.01173152, + "auxiliary_loss_mlp": 0.01060214, + "balance_loss_clip": 1.0559833, + "balance_loss_mlp": 1.04085398, + "epoch": 0.10840222456034872, + "flos": 24726602548800.0, + "grad_norm": 2.13391098364739, + "language_loss": 0.81774706, + "learning_rate": 3.935865782790621e-06, + "loss": 0.84008074, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.19360352, + "step": 1803, + "time_per_iteration": 2.663328170776367 + }, + { + "auxiliary_loss_clip": 0.01171442, + "auxiliary_loss_mlp": 0.01057423, + "balance_loss_clip": 1.05514812, + "balance_loss_mlp": 1.0371809, + "epoch": 0.10846234781301668, + "flos": 24237061591680.0, + "grad_norm": 1.5789195717943045, + "language_loss": 0.91072154, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93301016, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.20251465, + "step": 1804, + "time_per_iteration": 2.6608026027679443 + }, + { + "auxiliary_loss_clip": 0.01174549, + "auxiliary_loss_mlp": 0.01050162, + "balance_loss_clip": 1.0556165, + "balance_loss_mlp": 1.02968192, + "epoch": 0.10852247106568465, + "flos": 32298000026400.0, + "grad_norm": 1.8498793174125192, + "language_loss": 0.75950658, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78175378, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.20495605, + "step": 1805, + "time_per_iteration": 2.680229902267456 + }, + { + "auxiliary_loss_clip": 0.01172954, + "auxiliary_loss_mlp": 0.01044109, + "balance_loss_clip": 1.05739188, + "balance_loss_mlp": 1.02597725, + "epoch": 0.10858259431835263, + "flos": 36706178125440.0, + "grad_norm": 1.7142852322699556, + "language_loss": 0.86242962, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88460028, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.18127441, + "step": 1806, + "time_per_iteration": 2.7746636867523193 + }, + { + "auxiliary_loss_clip": 0.0117283, + "auxiliary_loss_mlp": 0.0104849, + "balance_loss_clip": 1.05570793, + "balance_loss_mlp": 1.02995253, + "epoch": 0.10864271757102059, + "flos": 23260734852480.0, + "grad_norm": 2.451661413133685, + "language_loss": 0.81205738, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.83427054, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.18505859, + "step": 1807, + "time_per_iteration": 2.695751428604126 + }, + { + "auxiliary_loss_clip": 0.01174822, + "auxiliary_loss_mlp": 0.01054106, + "balance_loss_clip": 1.05928934, + "balance_loss_mlp": 1.03654659, + "epoch": 0.10870284082368856, + "flos": 30157898495040.0, + "grad_norm": 1.8166152390761745, + "language_loss": 0.79313421, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.81542349, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.17553711, + "step": 1808, + "time_per_iteration": 2.8406643867492676 + }, + { + "auxiliary_loss_clip": 0.01175953, + "auxiliary_loss_mlp": 0.01046881, + "balance_loss_clip": 1.05851531, + "balance_loss_mlp": 1.02781916, + "epoch": 0.10876296407635654, + "flos": 25170243778080.0, + "grad_norm": 1.6176002139107568, + "language_loss": 0.79039109, + "learning_rate": 3.935277444103342e-06, + "loss": 0.81261939, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.19055176, + "step": 1809, + "time_per_iteration": 2.6584436893463135 + }, + { + "auxiliary_loss_clip": 0.01172392, + "auxiliary_loss_mlp": 0.0105659, + "balance_loss_clip": 1.05488586, + "balance_loss_mlp": 1.03767145, + "epoch": 0.1088230873290245, + "flos": 26331830637120.0, + "grad_norm": 2.093348405386107, + "language_loss": 0.84665203, + "learning_rate": 3.935179130783046e-06, + "loss": 0.8689419, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.18933105, + "step": 1810, + "time_per_iteration": 2.6563799381256104 + }, + { + "auxiliary_loss_clip": 0.01180534, + "auxiliary_loss_mlp": 0.01048583, + "balance_loss_clip": 1.05921793, + "balance_loss_mlp": 1.02773356, + "epoch": 0.10888321058169247, + "flos": 32298000026400.0, + "grad_norm": 2.0802685720112915, + "language_loss": 0.63420057, + "learning_rate": 3.935080744080564e-06, + "loss": 0.65649176, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.20849609, + "step": 1811, + "time_per_iteration": 2.6997737884521484 + }, + { + "auxiliary_loss_clip": 0.01172944, + "auxiliary_loss_mlp": 0.01046295, + "balance_loss_clip": 1.0563935, + "balance_loss_mlp": 1.02741182, + "epoch": 0.10894333383436045, + "flos": 31540921054560.0, + "grad_norm": 1.899663137492622, + "language_loss": 0.73582095, + "learning_rate": 3.934982283999626e-06, + "loss": 0.75801325, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.18884277, + "step": 1812, + "time_per_iteration": 2.695138454437256 + }, + { + "auxiliary_loss_clip": 0.01175876, + "auxiliary_loss_mlp": 0.0104503, + "balance_loss_clip": 1.05943906, + "balance_loss_mlp": 1.02625453, + "epoch": 0.10900345708702841, + "flos": 23839076986560.0, + "grad_norm": 1.6983655724355666, + "language_loss": 0.72738469, + "learning_rate": 3.934883750543966e-06, + "loss": 0.74959373, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.18774414, + "step": 1813, + "time_per_iteration": 2.659183979034424 + }, + { + "auxiliary_loss_clip": 0.01176369, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_clip": 1.06220841, + "balance_loss_mlp": 1.02616465, + "epoch": 0.10906358033969638, + "flos": 28825394633280.0, + "grad_norm": 1.7962452160792424, + "language_loss": 0.82681644, + "learning_rate": 3.93478514371732e-06, + "loss": 0.84902608, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.18432617, + "step": 1814, + "time_per_iteration": 2.690380334854126 + }, + { + "auxiliary_loss_clip": 0.01179433, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.06028581, + "balance_loss_mlp": 1.03359914, + "epoch": 0.10912370359236434, + "flos": 25886325612960.0, + "grad_norm": 2.111039521660595, + "language_loss": 0.84357321, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86589015, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.18652344, + "step": 1815, + "time_per_iteration": 2.6738972663879395 + }, + { + "auxiliary_loss_clip": 0.01178075, + "auxiliary_loss_mlp": 0.01049604, + "balance_loss_clip": 1.06363082, + "balance_loss_mlp": 1.02968383, + "epoch": 0.10918382684503232, + "flos": 16537081318560.0, + "grad_norm": 2.3445906978846836, + "language_loss": 0.72111356, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.74339032, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.19921875, + "step": 1816, + "time_per_iteration": 2.6235761642456055 + }, + { + "auxiliary_loss_clip": 0.01177995, + "auxiliary_loss_mlp": 0.0105704, + "balance_loss_clip": 1.05882716, + "balance_loss_mlp": 1.03691697, + "epoch": 0.10924395009770028, + "flos": 34122718985760.0, + "grad_norm": 2.2329615189312024, + "language_loss": 0.72709256, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.74944293, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.2010498, + "step": 1817, + "time_per_iteration": 2.747723340988159 + }, + { + "auxiliary_loss_clip": 0.01176661, + "auxiliary_loss_mlp": 0.01053681, + "balance_loss_clip": 1.05955791, + "balance_loss_mlp": 1.03414297, + "epoch": 0.10930407335036825, + "flos": 31229873831520.0, + "grad_norm": 1.7139503971939498, + "language_loss": 0.67254686, + "learning_rate": 3.934389982775706e-06, + "loss": 0.69485027, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.19519043, + "step": 1818, + "time_per_iteration": 2.69962739944458 + }, + { + "auxiliary_loss_clip": 0.01176309, + "auxiliary_loss_mlp": 0.0106045, + "balance_loss_clip": 1.05824995, + "balance_loss_mlp": 1.0412811, + "epoch": 0.10936419660303623, + "flos": 22459052705760.0, + "grad_norm": 2.222650598607074, + "language_loss": 0.7365247, + "learning_rate": 3.934291009150275e-06, + "loss": 0.7588923, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.19177246, + "step": 1819, + "time_per_iteration": 2.659717082977295 + }, + { + "auxiliary_loss_clip": 0.01178649, + "auxiliary_loss_mlp": 0.01048345, + "balance_loss_clip": 1.0621978, + "balance_loss_mlp": 1.02967668, + "epoch": 0.1094243198557042, + "flos": 29090096438400.0, + "grad_norm": 2.671784407449809, + "language_loss": 0.73416024, + "learning_rate": 3.934191962176335e-06, + "loss": 0.75643015, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.18664551, + "step": 1820, + "time_per_iteration": 2.785038471221924 + }, + { + "auxiliary_loss_clip": 0.01175558, + "auxiliary_loss_mlp": 0.01049644, + "balance_loss_clip": 1.05934441, + "balance_loss_mlp": 1.02931845, + "epoch": 0.10948444310837216, + "flos": 17868450696480.0, + "grad_norm": 3.0083547758353513, + "language_loss": 0.82889444, + "learning_rate": 3.934092841857642e-06, + "loss": 0.85114646, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.20336914, + "step": 1821, + "time_per_iteration": 2.7623205184936523 + }, + { + "auxiliary_loss_clip": 0.01170939, + "auxiliary_loss_mlp": 0.01047724, + "balance_loss_clip": 1.0553869, + "balance_loss_mlp": 1.02985489, + "epoch": 0.10954456636104014, + "flos": 33945116631840.0, + "grad_norm": 2.1121937863819324, + "language_loss": 0.7604329, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78261954, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.17883301, + "step": 1822, + "time_per_iteration": 2.6799333095550537 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01046964, + "balance_loss_clip": 1.0578959, + "balance_loss_mlp": 1.02877235, + "epoch": 0.1096046896137081, + "flos": 41025230909280.0, + "grad_norm": 1.6519362390485086, + "language_loss": 0.7956152, + "learning_rate": 3.933894381201034e-06, + "loss": 0.81780326, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.18212891, + "step": 1823, + "time_per_iteration": 2.7517216205596924 + }, + { + "auxiliary_loss_clip": 0.01173938, + "auxiliary_loss_mlp": 0.01041327, + "balance_loss_clip": 1.05912161, + "balance_loss_mlp": 1.02318323, + "epoch": 0.10966481286637607, + "flos": 32921512577280.0, + "grad_norm": 1.5280531107499369, + "language_loss": 0.79366803, + "learning_rate": 3.933795040870645e-06, + "loss": 0.81582075, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.18127441, + "step": 1824, + "time_per_iteration": 2.683046340942383 + }, + { + "auxiliary_loss_clip": 0.01172477, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_clip": 1.05823219, + "balance_loss_mlp": 1.03160357, + "epoch": 0.10972493611904403, + "flos": 28109677453920.0, + "grad_norm": 1.8385832409786087, + "language_loss": 0.88061261, + "learning_rate": 3.933695627210554e-06, + "loss": 0.90284204, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.18847656, + "step": 1825, + "time_per_iteration": 2.6507537364959717 + }, + { + "auxiliary_loss_clip": 0.01172277, + "auxiliary_loss_mlp": 0.01054983, + "balance_loss_clip": 1.05744946, + "balance_loss_mlp": 1.0359571, + "epoch": 0.10978505937171201, + "flos": 46496956750560.0, + "grad_norm": 1.7814661004348948, + "language_loss": 0.7653448, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78761744, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.19018555, + "step": 1826, + "time_per_iteration": 2.8472020626068115 + }, + { + "auxiliary_loss_clip": 0.01069101, + "auxiliary_loss_mlp": 0.01012136, + "balance_loss_clip": 1.02700448, + "balance_loss_mlp": 1.00941086, + "epoch": 0.10984518262437998, + "flos": 73028633728320.0, + "grad_norm": 0.8253032153116697, + "language_loss": 0.54965919, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57047158, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 0.42138672, + "router_z_loss_mlp": 0.02726746, + "step": 1827, + "time_per_iteration": 3.265742540359497 + }, + { + "auxiliary_loss_clip": 0.01068722, + "auxiliary_loss_mlp": 0.01008878, + "balance_loss_clip": 1.02655649, + "balance_loss_mlp": 1.00621557, + "epoch": 0.10990530587704794, + "flos": 81411113361600.0, + "grad_norm": 0.7312222354514797, + "language_loss": 0.55324399, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57402003, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 0.42163086, + "router_z_loss_mlp": 0.0266571, + "step": 1828, + "time_per_iteration": 3.290682554244995 + }, + { + "auxiliary_loss_clip": 0.01176536, + "auxiliary_loss_mlp": 0.01053104, + "balance_loss_clip": 1.05735183, + "balance_loss_mlp": 1.03289795, + "epoch": 0.10996542912971592, + "flos": 31051096476480.0, + "grad_norm": 2.2651690672597833, + "language_loss": 0.8418107, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86410713, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.20202637, + "step": 1829, + "time_per_iteration": 2.715101480484009 + }, + { + "auxiliary_loss_clip": 0.01176585, + "auxiliary_loss_mlp": 0.01051013, + "balance_loss_clip": 1.05853224, + "balance_loss_mlp": 1.03056836, + "epoch": 0.11002555238238389, + "flos": 53712865899360.0, + "grad_norm": 1.959059657763882, + "language_loss": 0.88914728, + "learning_rate": 3.933197459096614e-06, + "loss": 0.91142333, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.2043457, + "step": 1830, + "time_per_iteration": 2.8411290645599365 + }, + { + "auxiliary_loss_clip": 0.01067685, + "auxiliary_loss_mlp": 0.01002655, + "balance_loss_clip": 1.02543008, + "balance_loss_mlp": 1.00010884, + "epoch": 0.11008567563505185, + "flos": 65968332400800.0, + "grad_norm": 0.6837208513549795, + "language_loss": 0.5553863, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57608974, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 0.42260742, + "router_z_loss_mlp": 0.02546692, + "step": 1831, + "time_per_iteration": 6.272849082946777 + }, + { + "auxiliary_loss_clip": 0.01180132, + "auxiliary_loss_mlp": 0.01060651, + "balance_loss_clip": 1.05880964, + "balance_loss_mlp": 1.03913379, + "epoch": 0.11014579888771983, + "flos": 29581825328640.0, + "grad_norm": 2.4552546576662664, + "language_loss": 0.90785974, + "learning_rate": 3.932997678675282e-06, + "loss": 0.93026757, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.21508789, + "step": 1832, + "time_per_iteration": 2.8280205726623535 + }, + { + "auxiliary_loss_clip": 0.01066699, + "auxiliary_loss_mlp": 0.01003445, + "balance_loss_clip": 1.0244894, + "balance_loss_mlp": 1.00079525, + "epoch": 0.1102059221403878, + "flos": 70458099600960.0, + "grad_norm": 0.7375466141811682, + "language_loss": 0.59877038, + "learning_rate": 3.932897678513523e-06, + "loss": 0.61947185, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.02651978, + "step": 1833, + "time_per_iteration": 3.276287794113159 + }, + { + "auxiliary_loss_clip": 0.01173641, + "auxiliary_loss_mlp": 0.01052499, + "balance_loss_clip": 1.05480754, + "balance_loss_mlp": 1.03232932, + "epoch": 0.11026604539305576, + "flos": 20494527664320.0, + "grad_norm": 6.305921954914103, + "language_loss": 0.80611563, + "learning_rate": 3.93279760505609e-06, + "loss": 0.82837701, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.20178223, + "step": 1834, + "time_per_iteration": 4.152186393737793 + }, + { + "auxiliary_loss_clip": 0.01177495, + "auxiliary_loss_mlp": 0.01052441, + "balance_loss_clip": 1.05955005, + "balance_loss_mlp": 1.03154373, + "epoch": 0.11032616864572373, + "flos": 29270535001920.0, + "grad_norm": 3.0285060940790296, + "language_loss": 0.90795124, + "learning_rate": 3.932697458306779e-06, + "loss": 0.93025059, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.2088623, + "step": 1835, + "time_per_iteration": 4.085704326629639 + }, + { + "auxiliary_loss_clip": 0.01175063, + "auxiliary_loss_mlp": 0.01053541, + "balance_loss_clip": 1.05685556, + "balance_loss_mlp": 1.0324409, + "epoch": 0.1103862918983917, + "flos": 24017287099680.0, + "grad_norm": 2.7540414072545967, + "language_loss": 0.63584018, + "learning_rate": 3.932597238269386e-06, + "loss": 0.65812624, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 1.18164062, + "router_z_loss_mlp": 0.21105957, + "step": 1836, + "time_per_iteration": 2.7069456577301025 + }, + { + "auxiliary_loss_clip": 0.01173326, + "auxiliary_loss_mlp": 0.01060226, + "balance_loss_clip": 1.0554049, + "balance_loss_mlp": 1.04135537, + "epoch": 0.11044641515105967, + "flos": 39243615985440.0, + "grad_norm": 3.261115158267675, + "language_loss": 0.7291342, + "learning_rate": 3.932496944947711e-06, + "loss": 0.75146973, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 1.17773438, + "router_z_loss_mlp": 0.1887207, + "step": 1837, + "time_per_iteration": 2.733210802078247 + }, + { + "auxiliary_loss_clip": 0.01177201, + "auxiliary_loss_mlp": 0.01051371, + "balance_loss_clip": 1.05922878, + "balance_loss_mlp": 1.03223848, + "epoch": 0.11050653840372764, + "flos": 20364081073920.0, + "grad_norm": 2.168834172142915, + "language_loss": 0.785748, + "learning_rate": 3.93239657834556e-06, + "loss": 0.80803382, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.19140625, + "step": 1838, + "time_per_iteration": 2.6536407470703125 + }, + { + "auxiliary_loss_clip": 0.01175484, + "auxiliary_loss_mlp": 0.01063036, + "balance_loss_clip": 1.05927753, + "balance_loss_mlp": 1.0429728, + "epoch": 0.11056666165639562, + "flos": 25879721296320.0, + "grad_norm": 2.1210831598736646, + "language_loss": 0.71782482, + "learning_rate": 3.932296138466736e-06, + "loss": 0.74021, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 1.16162109, + "router_z_loss_mlp": 0.20056152, + "step": 1839, + "time_per_iteration": 2.676433563232422 + }, + { + "auxiliary_loss_clip": 0.01181704, + "auxiliary_loss_mlp": 0.01052164, + "balance_loss_clip": 1.06128049, + "balance_loss_mlp": 1.03174365, + "epoch": 0.11062678490906358, + "flos": 23386238334720.0, + "grad_norm": 8.088214720288287, + "language_loss": 0.78608805, + "learning_rate": 3.93219562531505e-06, + "loss": 0.80842674, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.20422363, + "step": 1840, + "time_per_iteration": 2.665569543838501 + }, + { + "auxiliary_loss_clip": 0.01171799, + "auxiliary_loss_mlp": 0.01046852, + "balance_loss_clip": 1.05641055, + "balance_loss_mlp": 1.02779078, + "epoch": 0.11068690816173155, + "flos": 30378037642560.0, + "grad_norm": 1.6643545648603162, + "language_loss": 0.88135278, + "learning_rate": 3.932095038894311e-06, + "loss": 0.9035393, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.19067383, + "step": 1841, + "time_per_iteration": 2.689549446105957 + }, + { + "auxiliary_loss_clip": 0.01169963, + "auxiliary_loss_mlp": 0.01049307, + "balance_loss_clip": 1.05498052, + "balance_loss_mlp": 1.03020978, + "epoch": 0.11074703141439952, + "flos": 20099257716960.0, + "grad_norm": 1.996681836632594, + "language_loss": 0.9072389, + "learning_rate": 3.931994379208334e-06, + "loss": 0.92943156, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.19104004, + "step": 1842, + "time_per_iteration": 2.630075454711914 + }, + { + "auxiliary_loss_clip": 0.01171414, + "auxiliary_loss_mlp": 0.01053769, + "balance_loss_clip": 1.05382395, + "balance_loss_mlp": 1.03547037, + "epoch": 0.11080715466706749, + "flos": 23394058169760.0, + "grad_norm": 2.1206968876472705, + "language_loss": 0.85611117, + "learning_rate": 3.931893646260937e-06, + "loss": 0.87836295, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.18286133, + "step": 1843, + "time_per_iteration": 2.857192277908325 + }, + { + "auxiliary_loss_clip": 0.01173611, + "auxiliary_loss_mlp": 0.01053362, + "balance_loss_clip": 1.05717587, + "balance_loss_mlp": 1.03294146, + "epoch": 0.11086727791973545, + "flos": 33806282964480.0, + "grad_norm": 1.5663940621241952, + "language_loss": 0.74886453, + "learning_rate": 3.931792840055941e-06, + "loss": 0.77113426, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.20410156, + "step": 1844, + "time_per_iteration": 2.698136568069458 + }, + { + "auxiliary_loss_clip": 0.01176513, + "auxiliary_loss_mlp": 0.01051408, + "balance_loss_clip": 1.05703628, + "balance_loss_mlp": 1.03055835, + "epoch": 0.11092740117240343, + "flos": 22591849298400.0, + "grad_norm": 2.586134039317051, + "language_loss": 0.75804257, + "learning_rate": 3.931691960597165e-06, + "loss": 0.78032178, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.20849609, + "step": 1845, + "time_per_iteration": 2.6158998012542725 + }, + { + "auxiliary_loss_clip": 0.01172795, + "auxiliary_loss_mlp": 0.01050986, + "balance_loss_clip": 1.05599964, + "balance_loss_mlp": 1.03130436, + "epoch": 0.1109875244250714, + "flos": 25041458396160.0, + "grad_norm": 1.533840517927003, + "language_loss": 0.76063335, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.78287125, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.19677734, + "step": 1846, + "time_per_iteration": 2.6464662551879883 + }, + { + "auxiliary_loss_clip": 0.01178693, + "auxiliary_loss_mlp": 0.01048086, + "balance_loss_clip": 1.05736423, + "balance_loss_mlp": 1.02922654, + "epoch": 0.11104764767773936, + "flos": 17202806524800.0, + "grad_norm": 3.6039035402050947, + "language_loss": 0.86041427, + "learning_rate": 3.931489981933584e-06, + "loss": 0.88268209, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 1.21191406, + "router_z_loss_mlp": 0.1887207, + "step": 1847, + "time_per_iteration": 2.6312685012817383 + }, + { + "auxiliary_loss_clip": 0.0117656, + "auxiliary_loss_mlp": 0.01051141, + "balance_loss_clip": 1.05551934, + "balance_loss_mlp": 1.03112531, + "epoch": 0.11110777093040733, + "flos": 25129489744800.0, + "grad_norm": 2.0454631397284335, + "language_loss": 0.76732266, + "learning_rate": 3.931388882736438e-06, + "loss": 0.78959966, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 1.20800781, + "router_z_loss_mlp": 0.20019531, + "step": 1848, + "time_per_iteration": 2.6704723834991455 + }, + { + "auxiliary_loss_clip": 0.01176103, + "auxiliary_loss_mlp": 0.01049825, + "balance_loss_clip": 1.06130898, + "balance_loss_mlp": 1.0307039, + "epoch": 0.11116789418307531, + "flos": 26686143964800.0, + "grad_norm": 1.9927347487951843, + "language_loss": 0.77426893, + "learning_rate": 3.931287710300832e-06, + "loss": 0.79652822, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.19128418, + "step": 1849, + "time_per_iteration": 2.7016708850860596 + }, + { + "auxiliary_loss_clip": 0.01176446, + "auxiliary_loss_mlp": 0.01054654, + "balance_loss_clip": 1.05469561, + "balance_loss_mlp": 1.03456712, + "epoch": 0.11122801743574327, + "flos": 18940952757600.0, + "grad_norm": 2.597033351712897, + "language_loss": 0.71132731, + "learning_rate": 3.931186464630601e-06, + "loss": 0.73363835, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.20080566, + "step": 1850, + "time_per_iteration": 2.651764154434204 + }, + { + "auxiliary_loss_clip": 0.01177625, + "auxiliary_loss_mlp": 0.0105068, + "balance_loss_clip": 1.05841208, + "balance_loss_mlp": 1.03050983, + "epoch": 0.11128814068841124, + "flos": 17561009511360.0, + "grad_norm": 3.0813119502841553, + "language_loss": 0.81550598, + "learning_rate": 3.931085145729588e-06, + "loss": 0.837789, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.20166016, + "step": 1851, + "time_per_iteration": 2.647956609725952 + }, + { + "auxiliary_loss_clip": 0.01174215, + "auxiliary_loss_mlp": 0.01055791, + "balance_loss_clip": 1.05668116, + "balance_loss_mlp": 1.03733766, + "epoch": 0.11134826394107922, + "flos": 20320896003840.0, + "grad_norm": 2.4507222703932126, + "language_loss": 0.88155627, + "learning_rate": 3.930983753601631e-06, + "loss": 0.9038564, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.18457031, + "step": 1852, + "time_per_iteration": 2.589634656906128 + }, + { + "auxiliary_loss_clip": 0.0117673, + "auxiliary_loss_mlp": 0.01059632, + "balance_loss_clip": 1.05808783, + "balance_loss_mlp": 1.03840089, + "epoch": 0.11140838719374718, + "flos": 20366066420640.0, + "grad_norm": 2.399951604005805, + "language_loss": 0.72227383, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74463743, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.21228027, + "step": 1853, + "time_per_iteration": 2.6213321685791016 + }, + { + "auxiliary_loss_clip": 0.01068474, + "auxiliary_loss_mlp": 0.0101839, + "balance_loss_clip": 1.02548552, + "balance_loss_mlp": 1.01547575, + "epoch": 0.11146851044641515, + "flos": 74403876970080.0, + "grad_norm": 0.8049852443126639, + "language_loss": 0.53704488, + "learning_rate": 3.930780749680273e-06, + "loss": 0.55791354, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 0.42944336, + "router_z_loss_mlp": 0.02911377, + "step": 1854, + "time_per_iteration": 3.2282774448394775 + }, + { + "auxiliary_loss_clip": 0.01184815, + "auxiliary_loss_mlp": 0.01055826, + "balance_loss_clip": 1.05812752, + "balance_loss_mlp": 1.03517938, + "epoch": 0.11152863369908313, + "flos": 27081657015840.0, + "grad_norm": 2.024467701918069, + "language_loss": 0.84438455, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.86679095, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 1.26953125, + "router_z_loss_mlp": 0.20654297, + "step": 1855, + "time_per_iteration": 2.677640199661255 + }, + { + "auxiliary_loss_clip": 0.01178777, + "auxiliary_loss_mlp": 0.01067311, + "balance_loss_clip": 1.05845261, + "balance_loss_mlp": 1.04865432, + "epoch": 0.11158875695175109, + "flos": 23839563193920.0, + "grad_norm": 4.178439082221603, + "language_loss": 0.81894261, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.84140348, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.18652344, + "step": 1856, + "time_per_iteration": 2.8970448970794678 + }, + { + "auxiliary_loss_clip": 0.01175446, + "auxiliary_loss_mlp": 0.01050804, + "balance_loss_clip": 1.05951416, + "balance_loss_mlp": 1.03074133, + "epoch": 0.11164888020441906, + "flos": 31046031816480.0, + "grad_norm": 2.54608154415125, + "language_loss": 0.83087373, + "learning_rate": 3.93047569469238e-06, + "loss": 0.8531363, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.20068359, + "step": 1857, + "time_per_iteration": 2.6844964027404785 + }, + { + "auxiliary_loss_clip": 0.01177833, + "auxiliary_loss_mlp": 0.01042525, + "balance_loss_clip": 1.05693936, + "balance_loss_mlp": 1.02367759, + "epoch": 0.11170900345708702, + "flos": 19074721764960.0, + "grad_norm": 2.2168583388524765, + "language_loss": 0.82953608, + "learning_rate": 3.930373863283608e-06, + "loss": 0.85173965, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 1.20898438, + "router_z_loss_mlp": 0.18847656, + "step": 1858, + "time_per_iteration": 2.6189463138580322 + }, + { + "auxiliary_loss_clip": 0.01179789, + "auxiliary_loss_mlp": 0.0105148, + "balance_loss_clip": 1.0602051, + "balance_loss_mlp": 1.03215647, + "epoch": 0.111769126709755, + "flos": 28112918836320.0, + "grad_norm": 2.215861035443126, + "language_loss": 0.91732001, + "learning_rate": 3.930271958674866e-06, + "loss": 0.93963265, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.19311523, + "step": 1859, + "time_per_iteration": 2.6887619495391846 + }, + { + "auxiliary_loss_clip": 0.01178608, + "auxiliary_loss_mlp": 0.01046282, + "balance_loss_clip": 1.05720878, + "balance_loss_mlp": 1.02691054, + "epoch": 0.11182924996242297, + "flos": 25442036107200.0, + "grad_norm": 2.3496395039193496, + "language_loss": 0.81607759, + "learning_rate": 3.930169980870018e-06, + "loss": 0.83832645, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.19360352, + "step": 1860, + "time_per_iteration": 2.7004687786102295 + }, + { + "auxiliary_loss_clip": 0.01174969, + "auxiliary_loss_mlp": 0.01055046, + "balance_loss_clip": 1.05911779, + "balance_loss_mlp": 1.03571033, + "epoch": 0.11188937321509093, + "flos": 21298924468800.0, + "grad_norm": 2.310438207553572, + "language_loss": 0.74915332, + "learning_rate": 3.930067929872931e-06, + "loss": 0.7714535, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.19348145, + "step": 1861, + "time_per_iteration": 2.6319777965545654 + }, + { + "auxiliary_loss_clip": 0.01172385, + "auxiliary_loss_mlp": 0.010471, + "balance_loss_clip": 1.05642676, + "balance_loss_mlp": 1.02886105, + "epoch": 0.11194949646775891, + "flos": 29402278145280.0, + "grad_norm": 1.90903956678247, + "language_loss": 0.88627946, + "learning_rate": 3.929965805687474e-06, + "loss": 0.90847433, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 1.15917969, + "router_z_loss_mlp": 0.18225098, + "step": 1862, + "time_per_iteration": 2.6806211471557617 + }, + { + "auxiliary_loss_clip": 0.01178468, + "auxiliary_loss_mlp": 0.01056211, + "balance_loss_clip": 1.05961013, + "balance_loss_mlp": 1.03720891, + "epoch": 0.11200961972042688, + "flos": 30693298662720.0, + "grad_norm": 2.0405342516539955, + "language_loss": 0.87007225, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89241904, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.19018555, + "step": 1863, + "time_per_iteration": 2.6683459281921387 + }, + { + "auxiliary_loss_clip": 0.01175733, + "auxiliary_loss_mlp": 0.010498, + "balance_loss_clip": 1.05718923, + "balance_loss_mlp": 1.02834225, + "epoch": 0.11206974297309484, + "flos": 25976058687360.0, + "grad_norm": 3.197303089951924, + "language_loss": 0.64210641, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66436172, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.21447754, + "step": 1864, + "time_per_iteration": 2.6502790451049805 + }, + { + "auxiliary_loss_clip": 0.01178769, + "auxiliary_loss_mlp": 0.01044702, + "balance_loss_clip": 1.06165898, + "balance_loss_mlp": 1.02733326, + "epoch": 0.11212986622576282, + "flos": 23081228186400.0, + "grad_norm": 3.930143605030455, + "language_loss": 0.73668331, + "learning_rate": 3.929658994039627e-06, + "loss": 0.75891805, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.17358398, + "step": 1865, + "time_per_iteration": 2.6247005462646484 + }, + { + "auxiliary_loss_clip": 0.01176211, + "auxiliary_loss_mlp": 0.01054424, + "balance_loss_clip": 1.05823088, + "balance_loss_mlp": 1.03344274, + "epoch": 0.11218998947843078, + "flos": 26911307255040.0, + "grad_norm": 2.55268868250889, + "language_loss": 0.84555131, + "learning_rate": 3.929556577139446e-06, + "loss": 0.86785769, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.20983887, + "step": 1866, + "time_per_iteration": 2.66129732131958 + }, + { + "auxiliary_loss_clip": 0.01175711, + "auxiliary_loss_mlp": 0.01047431, + "balance_loss_clip": 1.0582819, + "balance_loss_mlp": 1.02759433, + "epoch": 0.11225011273109875, + "flos": 29982727177920.0, + "grad_norm": 1.5938706455729739, + "language_loss": 0.81397498, + "learning_rate": 3.929454087070286e-06, + "loss": 0.83620632, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 1.17285156, + "router_z_loss_mlp": 0.19836426, + "step": 1867, + "time_per_iteration": 2.6960713863372803 + }, + { + "auxiliary_loss_clip": 0.01175665, + "auxiliary_loss_mlp": 0.01054471, + "balance_loss_clip": 1.05901694, + "balance_loss_mlp": 1.0356003, + "epoch": 0.11231023598376672, + "flos": 34701547327200.0, + "grad_norm": 2.246955499831662, + "language_loss": 0.86616403, + "learning_rate": 3.929351523836035e-06, + "loss": 0.88846546, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.1887207, + "step": 1868, + "time_per_iteration": 2.7057101726531982 + }, + { + "auxiliary_loss_clip": 0.0117778, + "auxiliary_loss_mlp": 0.01049613, + "balance_loss_clip": 1.06213117, + "balance_loss_mlp": 1.03121877, + "epoch": 0.1123703592364347, + "flos": 17602938545760.0, + "grad_norm": 2.512378933201815, + "language_loss": 0.68397933, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.70625317, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.18408203, + "step": 1869, + "time_per_iteration": 2.890383005142212 + }, + { + "auxiliary_loss_clip": 0.0118094, + "auxiliary_loss_mlp": 0.01058181, + "balance_loss_clip": 1.05946147, + "balance_loss_mlp": 1.03829718, + "epoch": 0.11243048248910266, + "flos": 27132905024640.0, + "grad_norm": 1.6020747644513258, + "language_loss": 0.77276725, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79515851, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 1.21582031, + "router_z_loss_mlp": 0.19885254, + "step": 1870, + "time_per_iteration": 2.678875684738159 + }, + { + "auxiliary_loss_clip": 0.01180451, + "auxiliary_loss_mlp": 0.01049545, + "balance_loss_clip": 1.05849111, + "balance_loss_mlp": 1.02978003, + "epoch": 0.11249060574177062, + "flos": 22676639264640.0, + "grad_norm": 2.5907536214157116, + "language_loss": 0.76133209, + "learning_rate": 3.929043395181631e-06, + "loss": 0.78363204, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 1.21777344, + "router_z_loss_mlp": 0.19750977, + "step": 1871, + "time_per_iteration": 5.562704563140869 + }, + { + "auxiliary_loss_clip": 0.01178988, + "auxiliary_loss_mlp": 0.01048386, + "balance_loss_clip": 1.06120145, + "balance_loss_mlp": 1.03002739, + "epoch": 0.1125507289944386, + "flos": 27889821927360.0, + "grad_norm": 2.869695722152029, + "language_loss": 0.81675136, + "learning_rate": 3.928940539325929e-06, + "loss": 0.83902514, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.18371582, + "step": 1872, + "time_per_iteration": 2.7159221172332764 + }, + { + "auxiliary_loss_clip": 0.01177584, + "auxiliary_loss_mlp": 0.01049793, + "balance_loss_clip": 1.05908811, + "balance_loss_mlp": 1.03111267, + "epoch": 0.11261085224710657, + "flos": 24009669851040.0, + "grad_norm": 2.2642591752837897, + "language_loss": 0.83504343, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85731715, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 1.18457031, + "router_z_loss_mlp": 0.18688965, + "step": 1873, + "time_per_iteration": 2.6310200691223145 + }, + { + "auxiliary_loss_clip": 0.01181737, + "auxiliary_loss_mlp": 0.01051205, + "balance_loss_clip": 1.05977094, + "balance_loss_mlp": 1.03159535, + "epoch": 0.11267097549977453, + "flos": 31803232340160.0, + "grad_norm": 1.838823409186898, + "language_loss": 0.92117238, + "learning_rate": 3.928734608181575e-06, + "loss": 0.94350177, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 1.21875, + "router_z_loss_mlp": 0.19592285, + "step": 1874, + "time_per_iteration": 4.239372968673706 + }, + { + "auxiliary_loss_clip": 0.01174833, + "auxiliary_loss_mlp": 0.01059373, + "balance_loss_clip": 1.05935049, + "balance_loss_mlp": 1.04105043, + "epoch": 0.11273109875244251, + "flos": 25707386188800.0, + "grad_norm": 1.5223839092105518, + "language_loss": 0.75274432, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77508628, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.18322754, + "step": 1875, + "time_per_iteration": 2.6554837226867676 + }, + { + "auxiliary_loss_clip": 0.01173423, + "auxiliary_loss_mlp": 0.01054398, + "balance_loss_clip": 1.06112695, + "balance_loss_mlp": 1.03695798, + "epoch": 0.11279122200511048, + "flos": 33054552273600.0, + "grad_norm": 3.9488693571410387, + "language_loss": 0.72156858, + "learning_rate": 3.928528384485984e-06, + "loss": 0.74384677, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.17456055, + "step": 1876, + "time_per_iteration": 2.6919562816619873 + }, + { + "auxiliary_loss_clip": 0.01174963, + "auxiliary_loss_mlp": 0.01044005, + "balance_loss_clip": 1.06168723, + "balance_loss_mlp": 1.02583718, + "epoch": 0.11285134525777844, + "flos": 24633790161120.0, + "grad_norm": 1.9641376968088644, + "language_loss": 0.76939523, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.79158485, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.18164062, + "step": 1877, + "time_per_iteration": 2.672461986541748 + }, + { + "auxiliary_loss_clip": 0.01178883, + "auxiliary_loss_mlp": 0.01055875, + "balance_loss_clip": 1.06090975, + "balance_loss_mlp": 1.03634846, + "epoch": 0.11291146851044641, + "flos": 15201214522560.0, + "grad_norm": 2.318497119076509, + "language_loss": 0.88149405, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90384161, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.1953125, + "step": 1878, + "time_per_iteration": 2.6166136264801025 + }, + { + "auxiliary_loss_clip": 0.01176202, + "auxiliary_loss_mlp": 0.01043988, + "balance_loss_clip": 1.05886281, + "balance_loss_mlp": 1.02567744, + "epoch": 0.11297159176311439, + "flos": 29092649027040.0, + "grad_norm": 2.630049649643064, + "language_loss": 0.8154496, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83765143, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.18310547, + "step": 1879, + "time_per_iteration": 2.682828903198242 + }, + { + "auxiliary_loss_clip": 0.01177297, + "auxiliary_loss_mlp": 0.01056355, + "balance_loss_clip": 1.05931067, + "balance_loss_mlp": 1.03697145, + "epoch": 0.11303171501578235, + "flos": 36522417144960.0, + "grad_norm": 2.125823965188559, + "language_loss": 0.70419228, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72652882, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.19384766, + "step": 1880, + "time_per_iteration": 3.0198397636413574 + }, + { + "auxiliary_loss_clip": 0.01171947, + "auxiliary_loss_mlp": 0.01043399, + "balance_loss_clip": 1.05783546, + "balance_loss_mlp": 1.02500486, + "epoch": 0.11309183826845032, + "flos": 19743161628960.0, + "grad_norm": 1.6158273345563705, + "language_loss": 0.7255609, + "learning_rate": 3.928011545540734e-06, + "loss": 0.7477144, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.18395996, + "step": 1881, + "time_per_iteration": 2.68086576461792 + }, + { + "auxiliary_loss_clip": 0.01175722, + "auxiliary_loss_mlp": 0.01054979, + "balance_loss_clip": 1.05729985, + "balance_loss_mlp": 1.03549981, + "epoch": 0.1131519615211183, + "flos": 14667029873280.0, + "grad_norm": 2.656846575652515, + "language_loss": 0.74128187, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76358885, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.19470215, + "step": 1882, + "time_per_iteration": 2.617060661315918 + }, + { + "auxiliary_loss_clip": 0.01173757, + "auxiliary_loss_mlp": 0.01052369, + "balance_loss_clip": 1.05835176, + "balance_loss_mlp": 1.03292584, + "epoch": 0.11321208477378626, + "flos": 32030867184480.0, + "grad_norm": 1.996000970144399, + "language_loss": 0.7941494, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81641066, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.19458008, + "step": 1883, + "time_per_iteration": 2.714326858520508 + }, + { + "auxiliary_loss_clip": 0.01175194, + "auxiliary_loss_mlp": 0.01048392, + "balance_loss_clip": 1.05705833, + "balance_loss_mlp": 1.02906823, + "epoch": 0.11327220802645423, + "flos": 16626368702880.0, + "grad_norm": 2.5341492861526476, + "language_loss": 0.77304083, + "learning_rate": 3.927700564817529e-06, + "loss": 0.7952767, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.1932373, + "step": 1884, + "time_per_iteration": 2.640132188796997 + }, + { + "auxiliary_loss_clip": 0.01078701, + "auxiliary_loss_mlp": 0.01014186, + "balance_loss_clip": 1.03509462, + "balance_loss_mlp": 1.01194668, + "epoch": 0.1133323312791222, + "flos": 69784757146080.0, + "grad_norm": 0.7971777808964758, + "language_loss": 0.55195194, + "learning_rate": 3.927596758374019e-06, + "loss": 0.5728808, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 0.4362793, + "router_z_loss_mlp": 0.02243042, + "step": 1885, + "time_per_iteration": 3.1950860023498535 + }, + { + "auxiliary_loss_clip": 0.01168331, + "auxiliary_loss_mlp": 0.010425, + "balance_loss_clip": 1.05744767, + "balance_loss_mlp": 1.02484488, + "epoch": 0.11339245453179017, + "flos": 29714378817600.0, + "grad_norm": 2.052699522346995, + "language_loss": 0.90598994, + "learning_rate": 3.927492878835848e-06, + "loss": 0.9280982, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.17663574, + "step": 1886, + "time_per_iteration": 2.7022626399993896 + }, + { + "auxiliary_loss_clip": 0.0117231, + "auxiliary_loss_mlp": 0.01044566, + "balance_loss_clip": 1.05739868, + "balance_loss_mlp": 1.02705407, + "epoch": 0.11345257778445814, + "flos": 27663321566880.0, + "grad_norm": 2.2860665577650807, + "language_loss": 0.84951246, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87168121, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.17504883, + "step": 1887, + "time_per_iteration": 2.6793906688690186 + }, + { + "auxiliary_loss_clip": 0.01173792, + "auxiliary_loss_mlp": 0.01052891, + "balance_loss_clip": 1.05767608, + "balance_loss_mlp": 1.03547454, + "epoch": 0.11351270103712612, + "flos": 25610643624960.0, + "grad_norm": 2.6494206494979515, + "language_loss": 0.75284851, + "learning_rate": 3.927284900491277e-06, + "loss": 0.77511531, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.17419434, + "step": 1888, + "time_per_iteration": 2.734715223312378 + }, + { + "auxiliary_loss_clip": 0.01181652, + "auxiliary_loss_mlp": 0.01054729, + "balance_loss_clip": 1.06233358, + "balance_loss_mlp": 1.03496444, + "epoch": 0.11357282428979408, + "flos": 45572688365760.0, + "grad_norm": 1.874584676800314, + "language_loss": 0.68483698, + "learning_rate": 3.927180801692764e-06, + "loss": 0.70720088, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.19787598, + "step": 1889, + "time_per_iteration": 2.8403103351593018 + }, + { + "auxiliary_loss_clip": 0.01174643, + "auxiliary_loss_mlp": 0.01043873, + "balance_loss_clip": 1.06004858, + "balance_loss_mlp": 1.02580106, + "epoch": 0.11363294754246205, + "flos": 26548525815840.0, + "grad_norm": 1.855390852964585, + "language_loss": 0.84288299, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86506814, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.18078613, + "step": 1890, + "time_per_iteration": 2.676318645477295 + }, + { + "auxiliary_loss_clip": 0.0117216, + "auxiliary_loss_mlp": 0.01048127, + "balance_loss_clip": 1.0582515, + "balance_loss_mlp": 1.03023422, + "epoch": 0.11369307079513001, + "flos": 27578693669760.0, + "grad_norm": 2.5469545902827653, + "language_loss": 0.64869189, + "learning_rate": 3.926972384863022e-06, + "loss": 0.67089486, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.17895508, + "step": 1891, + "time_per_iteration": 2.669962167739868 + }, + { + "auxiliary_loss_clip": 0.0117807, + "auxiliary_loss_mlp": 0.01040704, + "balance_loss_clip": 1.06005502, + "balance_loss_mlp": 1.02319264, + "epoch": 0.11375319404779799, + "flos": 26776565832960.0, + "grad_norm": 2.646420644225765, + "language_loss": 0.88787252, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.91006029, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 1.18066406, + "router_z_loss_mlp": 0.1751709, + "step": 1892, + "time_per_iteration": 2.769043207168579 + }, + { + "auxiliary_loss_clip": 0.01177251, + "auxiliary_loss_mlp": 0.01058228, + "balance_loss_clip": 1.05934179, + "balance_loss_mlp": 1.03983378, + "epoch": 0.11381331730046595, + "flos": 32208266952000.0, + "grad_norm": 3.615732177794396, + "language_loss": 0.72569942, + "learning_rate": 3.926763675749339e-06, + "loss": 0.74805427, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.18395996, + "step": 1893, + "time_per_iteration": 2.8983561992645264 + }, + { + "auxiliary_loss_clip": 0.01171766, + "auxiliary_loss_mlp": 0.0105786, + "balance_loss_clip": 1.05688953, + "balance_loss_mlp": 1.03831005, + "epoch": 0.11387344055313392, + "flos": 29046870851040.0, + "grad_norm": 2.076223182244245, + "language_loss": 0.79395223, + "learning_rate": 3.92665921159591e-06, + "loss": 0.81624854, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.19567871, + "step": 1894, + "time_per_iteration": 2.771216630935669 + }, + { + "auxiliary_loss_clip": 0.01179701, + "auxiliary_loss_mlp": 0.01050098, + "balance_loss_clip": 1.05941057, + "balance_loss_mlp": 1.03114402, + "epoch": 0.1139335638058019, + "flos": 42126777509760.0, + "grad_norm": 2.6818709755757824, + "language_loss": 0.78966898, + "learning_rate": 3.926554674383371e-06, + "loss": 0.81196702, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 1.20410156, + "router_z_loss_mlp": 0.1895752, + "step": 1895, + "time_per_iteration": 2.7633185386657715 + }, + { + "auxiliary_loss_clip": 0.01072377, + "auxiliary_loss_mlp": 0.01017665, + "balance_loss_clip": 1.03025115, + "balance_loss_mlp": 1.01553106, + "epoch": 0.11399368705846986, + "flos": 86131067994720.0, + "grad_norm": 0.8078716439244048, + "language_loss": 0.63378775, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65468812, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 0.42089844, + "router_z_loss_mlp": 0.0213623, + "step": 1896, + "time_per_iteration": 3.357787609100342 + }, + { + "auxiliary_loss_clip": 0.01174197, + "auxiliary_loss_mlp": 0.0105309, + "balance_loss_clip": 1.06065857, + "balance_loss_mlp": 1.03306258, + "epoch": 0.11405381031113783, + "flos": 26019973068480.0, + "grad_norm": 1.5582486570530227, + "language_loss": 0.84861076, + "learning_rate": 3.926345380796821e-06, + "loss": 0.87088358, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 1.13525391, + "router_z_loss_mlp": 0.20007324, + "step": 1897, + "time_per_iteration": 2.7205729484558105 + }, + { + "auxiliary_loss_clip": 0.01176695, + "auxiliary_loss_mlp": 0.01048302, + "balance_loss_clip": 1.05926704, + "balance_loss_mlp": 1.02940702, + "epoch": 0.11411393356380581, + "flos": 24061809240000.0, + "grad_norm": 2.2457866988118447, + "language_loss": 0.7981801, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.82043004, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 1.17480469, + "router_z_loss_mlp": 0.18896484, + "step": 1898, + "time_per_iteration": 2.694175958633423 + }, + { + "auxiliary_loss_clip": 0.01176621, + "auxiliary_loss_mlp": 0.01048167, + "balance_loss_clip": 1.05749953, + "balance_loss_mlp": 1.0280447, + "epoch": 0.11417405681647377, + "flos": 21390399786240.0, + "grad_norm": 9.033774085167966, + "language_loss": 0.73213649, + "learning_rate": 3.926135795021435e-06, + "loss": 0.7543844, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 1.19238281, + "router_z_loss_mlp": 0.20141602, + "step": 1899, + "time_per_iteration": 2.669802188873291 + }, + { + "auxiliary_loss_clip": 0.01069652, + "auxiliary_loss_mlp": 0.01001188, + "balance_loss_clip": 1.02747083, + "balance_loss_mlp": 0.99912268, + "epoch": 0.11423418006914174, + "flos": 72814369586400.0, + "grad_norm": 0.9056404424193075, + "language_loss": 0.63422751, + "learning_rate": 3.92603089257286e-06, + "loss": 0.6549359, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 0.0206604, + "step": 1900, + "time_per_iteration": 3.187979221343994 + }, + { + "auxiliary_loss_clip": 0.01174357, + "auxiliary_loss_mlp": 0.01049277, + "balance_loss_clip": 1.05775404, + "balance_loss_mlp": 1.03057337, + "epoch": 0.1142943033218097, + "flos": 28020106448640.0, + "grad_norm": 1.5767806012317216, + "language_loss": 0.78517616, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80741251, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.18701172, + "step": 1901, + "time_per_iteration": 2.682116985321045 + }, + { + "auxiliary_loss_clip": 0.01174199, + "auxiliary_loss_mlp": 0.01050673, + "balance_loss_clip": 1.05912805, + "balance_loss_mlp": 1.03307748, + "epoch": 0.11435442657447768, + "flos": 22276466726400.0, + "grad_norm": 2.1081419977561584, + "language_loss": 0.84303248, + "learning_rate": 3.925820868573839e-06, + "loss": 0.86528122, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.17578125, + "step": 1902, + "time_per_iteration": 2.647860050201416 + }, + { + "auxiliary_loss_clip": 0.01174219, + "auxiliary_loss_mlp": 0.01042524, + "balance_loss_clip": 1.05722058, + "balance_loss_mlp": 1.02354622, + "epoch": 0.11441454982714565, + "flos": 29359741351680.0, + "grad_norm": 1.9494033322988693, + "language_loss": 0.77886736, + "learning_rate": 3.925715747031356e-06, + "loss": 0.80103481, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.18981934, + "step": 1903, + "time_per_iteration": 2.680069923400879 + }, + { + "auxiliary_loss_clip": 0.01174578, + "auxiliary_loss_mlp": 0.01040846, + "balance_loss_clip": 1.05921853, + "balance_loss_mlp": 1.02435935, + "epoch": 0.11447467307981361, + "flos": 30917489538240.0, + "grad_norm": 1.8866072473747004, + "language_loss": 0.75668472, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77883899, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.16491699, + "step": 1904, + "time_per_iteration": 2.7528982162475586 + }, + { + "auxiliary_loss_clip": 0.01173793, + "auxiliary_loss_mlp": 0.01050062, + "balance_loss_clip": 1.05998731, + "balance_loss_mlp": 1.0308814, + "epoch": 0.11453479633248159, + "flos": 26510243336640.0, + "grad_norm": 2.0386034314193693, + "language_loss": 0.92306876, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94530737, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.19189453, + "step": 1905, + "time_per_iteration": 2.927767515182495 + }, + { + "auxiliary_loss_clip": 0.01176526, + "auxiliary_loss_mlp": 0.01041323, + "balance_loss_clip": 1.05376494, + "balance_loss_mlp": 1.02242851, + "epoch": 0.11459491958514956, + "flos": 15825051211680.0, + "grad_norm": 2.7652000721607104, + "language_loss": 0.77688289, + "learning_rate": 3.925399944279861e-06, + "loss": 0.79906136, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 1.2265625, + "router_z_loss_mlp": 0.18884277, + "step": 1906, + "time_per_iteration": 2.743415117263794 + }, + { + "auxiliary_loss_clip": 0.01171739, + "auxiliary_loss_mlp": 0.01052173, + "balance_loss_clip": 1.0556922, + "balance_loss_mlp": 1.0328846, + "epoch": 0.11465504283781752, + "flos": 27712462677120.0, + "grad_norm": 2.6479068333713447, + "language_loss": 0.81928396, + "learning_rate": 3.925294530667986e-06, + "loss": 0.84152305, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.19274902, + "step": 1907, + "time_per_iteration": 2.686065912246704 + }, + { + "auxiliary_loss_clip": 0.01172092, + "auxiliary_loss_mlp": 0.01052519, + "balance_loss_clip": 1.05810845, + "balance_loss_mlp": 1.03455412, + "epoch": 0.1147151660904855, + "flos": 28551495405600.0, + "grad_norm": 2.7917847182373183, + "language_loss": 0.84610862, + "learning_rate": 3.92518904404875e-06, + "loss": 0.86835468, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.1796875, + "step": 1908, + "time_per_iteration": 2.6851043701171875 + }, + { + "auxiliary_loss_clip": 0.01066505, + "auxiliary_loss_mlp": 0.01004788, + "balance_loss_clip": 1.02484512, + "balance_loss_mlp": 1.00277483, + "epoch": 0.11477528934315347, + "flos": 76888446130080.0, + "grad_norm": 0.9241853502451386, + "language_loss": 0.61079818, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63151109, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 0.41674805, + "router_z_loss_mlp": 0.02012634, + "step": 1909, + "time_per_iteration": 3.0075783729553223 + }, + { + "auxiliary_loss_clip": 0.01174869, + "auxiliary_loss_mlp": 0.01041191, + "balance_loss_clip": 1.05940247, + "balance_loss_mlp": 1.02336955, + "epoch": 0.11483541259582143, + "flos": 19920237258240.0, + "grad_norm": 1.9619082130286964, + "language_loss": 0.79476571, + "learning_rate": 3.924977851804197e-06, + "loss": 0.81692624, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 0.17834473, + "step": 1910, + "time_per_iteration": 5.584706544876099 + }, + { + "auxiliary_loss_clip": 0.0117642, + "auxiliary_loss_mlp": 0.01047878, + "balance_loss_clip": 1.06013167, + "balance_loss_mlp": 1.02965128, + "epoch": 0.1148955358484894, + "flos": 26332478913600.0, + "grad_norm": 2.8721474247086887, + "language_loss": 0.76928943, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.79153246, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.18249512, + "step": 1911, + "time_per_iteration": 2.65714955329895 + }, + { + "auxiliary_loss_clip": 0.0116778, + "auxiliary_loss_mlp": 0.01045253, + "balance_loss_clip": 1.05602932, + "balance_loss_mlp": 1.0267998, + "epoch": 0.11495565910115738, + "flos": 33767959968000.0, + "grad_norm": 1.7924224890714286, + "language_loss": 0.78845561, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.81058598, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.18444824, + "step": 1912, + "time_per_iteration": 2.731820583343506 + }, + { + "auxiliary_loss_clip": 0.01172355, + "auxiliary_loss_mlp": 0.0105949, + "balance_loss_clip": 1.05805469, + "balance_loss_mlp": 1.04039299, + "epoch": 0.11501578235382534, + "flos": 25175551541760.0, + "grad_norm": 1.8581570526324058, + "language_loss": 0.78136057, + "learning_rate": 3.924660515982246e-06, + "loss": 0.80367899, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.19091797, + "step": 1913, + "time_per_iteration": 4.15751314163208 + }, + { + "auxiliary_loss_clip": 0.01171179, + "auxiliary_loss_mlp": 0.01050062, + "balance_loss_clip": 1.05511332, + "balance_loss_mlp": 1.03097701, + "epoch": 0.1150759056064933, + "flos": 24195335143680.0, + "grad_norm": 2.331515143141954, + "language_loss": 0.7035014, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72571385, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.1907959, + "step": 1914, + "time_per_iteration": 4.0709311962127686 + }, + { + "auxiliary_loss_clip": 0.01063448, + "auxiliary_loss_mlp": 0.01016281, + "balance_loss_clip": 1.02221465, + "balance_loss_mlp": 1.01417589, + "epoch": 0.11513602885916129, + "flos": 85469151412800.0, + "grad_norm": 0.7668067677176355, + "language_loss": 0.61072707, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63152432, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 0.41235352, + "router_z_loss_mlp": 0.02107239, + "step": 1915, + "time_per_iteration": 3.465924024581909 + }, + { + "auxiliary_loss_clip": 0.01175094, + "auxiliary_loss_mlp": 0.0105521, + "balance_loss_clip": 1.0605669, + "balance_loss_mlp": 1.03601742, + "epoch": 0.11519615211182925, + "flos": 19207761461280.0, + "grad_norm": 2.0355066668581716, + "language_loss": 0.93587613, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95817918, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.1920166, + "step": 1916, + "time_per_iteration": 2.733931064605713 + }, + { + "auxiliary_loss_clip": 0.01173187, + "auxiliary_loss_mlp": 0.01056062, + "balance_loss_clip": 1.05705321, + "balance_loss_mlp": 1.03586817, + "epoch": 0.11525627536449722, + "flos": 25218696094560.0, + "grad_norm": 1.9449813511062641, + "language_loss": 0.72325099, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.74554348, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.20202637, + "step": 1917, + "time_per_iteration": 2.6517693996429443 + }, + { + "auxiliary_loss_clip": 0.01171953, + "auxiliary_loss_mlp": 0.01045837, + "balance_loss_clip": 1.0576539, + "balance_loss_mlp": 1.026752, + "epoch": 0.1153163986171652, + "flos": 24774041933280.0, + "grad_norm": 4.286433963359438, + "language_loss": 0.74872696, + "learning_rate": 3.92413016333289e-06, + "loss": 0.7709049, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.1907959, + "step": 1918, + "time_per_iteration": 2.6888234615325928 + }, + { + "auxiliary_loss_clip": 0.0117615, + "auxiliary_loss_mlp": 0.01046107, + "balance_loss_clip": 1.05701947, + "balance_loss_mlp": 1.02740324, + "epoch": 0.11537652186983316, + "flos": 21523317930720.0, + "grad_norm": 1.9167697754653739, + "language_loss": 0.86378169, + "learning_rate": 3.92402387389729e-06, + "loss": 0.88600427, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 1.19140625, + "router_z_loss_mlp": 0.18713379, + "step": 1919, + "time_per_iteration": 2.6313788890838623 + }, + { + "auxiliary_loss_clip": 0.01171658, + "auxiliary_loss_mlp": 0.01054761, + "balance_loss_clip": 1.05735075, + "balance_loss_mlp": 1.03567553, + "epoch": 0.11543664512250112, + "flos": 25708925845440.0, + "grad_norm": 2.0453862419084405, + "language_loss": 0.86732554, + "learning_rate": 3.923917511502512e-06, + "loss": 0.88958979, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.19091797, + "step": 1920, + "time_per_iteration": 2.667117118835449 + }, + { + "auxiliary_loss_clip": 0.0117037, + "auxiliary_loss_mlp": 0.01047068, + "balance_loss_clip": 1.05736685, + "balance_loss_mlp": 1.02834022, + "epoch": 0.11549676837516909, + "flos": 27756741713760.0, + "grad_norm": 2.0454452832546326, + "language_loss": 0.79618371, + "learning_rate": 3.923811076152589e-06, + "loss": 0.81835806, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.18725586, + "step": 1921, + "time_per_iteration": 2.6366848945617676 + }, + { + "auxiliary_loss_clip": 0.01176109, + "auxiliary_loss_mlp": 0.0105567, + "balance_loss_clip": 1.0574944, + "balance_loss_mlp": 1.03609562, + "epoch": 0.11555689162783707, + "flos": 23390168510880.0, + "grad_norm": 1.9392808491454623, + "language_loss": 0.78977907, + "learning_rate": 3.923704567851557e-06, + "loss": 0.81209689, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.19592285, + "step": 1922, + "time_per_iteration": 2.638003349304199 + }, + { + "auxiliary_loss_clip": 0.01178048, + "auxiliary_loss_mlp": 0.01062651, + "balance_loss_clip": 1.05944467, + "balance_loss_mlp": 1.04382777, + "epoch": 0.11561701488050503, + "flos": 29983132350720.0, + "grad_norm": 1.824218824913205, + "language_loss": 0.84322214, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86562914, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 1.18652344, + "router_z_loss_mlp": 0.18823242, + "step": 1923, + "time_per_iteration": 2.6774115562438965 + }, + { + "auxiliary_loss_clip": 0.01177602, + "auxiliary_loss_mlp": 0.0104776, + "balance_loss_clip": 1.06112075, + "balance_loss_mlp": 1.0278163, + "epoch": 0.115677138133173, + "flos": 20856053067840.0, + "grad_norm": 2.068339765896576, + "language_loss": 0.80803192, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.83028555, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.19946289, + "step": 1924, + "time_per_iteration": 2.6707777976989746 + }, + { + "auxiliary_loss_clip": 0.01063675, + "auxiliary_loss_mlp": 0.01003756, + "balance_loss_clip": 1.0221231, + "balance_loss_mlp": 1.00161803, + "epoch": 0.11573726138584098, + "flos": 76510760682240.0, + "grad_norm": 0.813170261835139, + "language_loss": 0.61225563, + "learning_rate": 3.923384605282212e-06, + "loss": 0.63292992, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 0.4152832, + "router_z_loss_mlp": 0.02140808, + "step": 1925, + "time_per_iteration": 3.331524133682251 + }, + { + "auxiliary_loss_clip": 0.01174596, + "auxiliary_loss_mlp": 0.01065325, + "balance_loss_clip": 1.05839384, + "balance_loss_mlp": 1.04576254, + "epoch": 0.11579738463850894, + "flos": 27578126427840.0, + "grad_norm": 1.8113739620608773, + "language_loss": 0.75068837, + "learning_rate": 3.923277805217161e-06, + "loss": 0.77308756, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.19555664, + "step": 1926, + "time_per_iteration": 2.721604824066162 + }, + { + "auxiliary_loss_clip": 0.01178849, + "auxiliary_loss_mlp": 0.01059716, + "balance_loss_clip": 1.05897105, + "balance_loss_mlp": 1.03745914, + "epoch": 0.11585750789117691, + "flos": 26510324371200.0, + "grad_norm": 2.6874379008696305, + "language_loss": 0.71897137, + "learning_rate": 3.923170932221222e-06, + "loss": 0.74135709, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.22229004, + "step": 1927, + "time_per_iteration": 2.675550937652588 + }, + { + "auxiliary_loss_clip": 0.01172221, + "auxiliary_loss_mlp": 0.01050331, + "balance_loss_clip": 1.05681241, + "balance_loss_mlp": 1.03064919, + "epoch": 0.11591763114384489, + "flos": 32076442774080.0, + "grad_norm": 1.826145385095708, + "language_loss": 0.86851794, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89074349, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.19677734, + "step": 1928, + "time_per_iteration": 2.9139034748077393 + }, + { + "auxiliary_loss_clip": 0.01177117, + "auxiliary_loss_mlp": 0.01052775, + "balance_loss_clip": 1.06001472, + "balance_loss_mlp": 1.03328443, + "epoch": 0.11597775439651285, + "flos": 28065601003680.0, + "grad_norm": 3.098405003516829, + "language_loss": 0.77504206, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79734099, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 1.171875, + "router_z_loss_mlp": 0.19482422, + "step": 1929, + "time_per_iteration": 2.6805431842803955 + }, + { + "auxiliary_loss_clip": 0.01172356, + "auxiliary_loss_mlp": 0.01056554, + "balance_loss_clip": 1.0584451, + "balance_loss_mlp": 1.03844666, + "epoch": 0.11603787764918082, + "flos": 38976199522560.0, + "grad_norm": 2.4075643462280665, + "language_loss": 0.76902115, + "learning_rate": 3.922849875688626e-06, + "loss": 0.79131025, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.18103027, + "step": 1930, + "time_per_iteration": 2.7424910068511963 + }, + { + "auxiliary_loss_clip": 0.01174899, + "auxiliary_loss_mlp": 0.01047843, + "balance_loss_clip": 1.05893612, + "balance_loss_mlp": 1.02807772, + "epoch": 0.1160980009018488, + "flos": 27176171129280.0, + "grad_norm": 1.7949491856131243, + "language_loss": 0.72134894, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74357641, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.19775391, + "step": 1931, + "time_per_iteration": 2.7505545616149902 + }, + { + "auxiliary_loss_clip": 0.01179287, + "auxiliary_loss_mlp": 0.01053245, + "balance_loss_clip": 1.06067324, + "balance_loss_mlp": 1.03348041, + "epoch": 0.11615812415451676, + "flos": 27800818164000.0, + "grad_norm": 1.5767253435739217, + "language_loss": 0.82001698, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84234238, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 1.1875, + "router_z_loss_mlp": 0.19763184, + "step": 1932, + "time_per_iteration": 2.7448837757110596 + }, + { + "auxiliary_loss_clip": 0.01062644, + "auxiliary_loss_mlp": 0.01001683, + "balance_loss_clip": 1.02139401, + "balance_loss_mlp": 0.99956989, + "epoch": 0.11621824740718473, + "flos": 81932332481280.0, + "grad_norm": 0.7668955661475862, + "language_loss": 0.61067927, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63132256, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 0.41235352, + "router_z_loss_mlp": 0.02114868, + "step": 1933, + "time_per_iteration": 3.2086355686187744 + }, + { + "auxiliary_loss_clip": 0.01176493, + "auxiliary_loss_mlp": 0.01046903, + "balance_loss_clip": 1.05678082, + "balance_loss_mlp": 1.02800846, + "epoch": 0.11627837065985269, + "flos": 24862721558400.0, + "grad_norm": 2.5382059130818115, + "language_loss": 0.86020768, + "learning_rate": 3.922420779525586e-06, + "loss": 0.88244164, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.18896484, + "step": 1934, + "time_per_iteration": 2.6861140727996826 + }, + { + "auxiliary_loss_clip": 0.01180881, + "auxiliary_loss_mlp": 0.01052935, + "balance_loss_clip": 1.06066847, + "balance_loss_mlp": 1.03241932, + "epoch": 0.11633849391252067, + "flos": 26509959715680.0, + "grad_norm": 2.3151239158213213, + "language_loss": 0.65756303, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.67990112, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 1.20214844, + "router_z_loss_mlp": 0.2052002, + "step": 1935, + "time_per_iteration": 2.67742657661438 + }, + { + "auxiliary_loss_clip": 0.01179304, + "auxiliary_loss_mlp": 0.01045848, + "balance_loss_clip": 1.05950022, + "balance_loss_mlp": 1.028265, + "epoch": 0.11639861716518864, + "flos": 22946324695200.0, + "grad_norm": 1.7658448626834493, + "language_loss": 0.75640094, + "learning_rate": 3.922205794037456e-06, + "loss": 0.77865243, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.17578125, + "step": 1936, + "time_per_iteration": 2.7128746509552 + }, + { + "auxiliary_loss_clip": 0.01176515, + "auxiliary_loss_mlp": 0.01049383, + "balance_loss_clip": 1.05760455, + "balance_loss_mlp": 1.02966607, + "epoch": 0.1164587404178566, + "flos": 25886609233920.0, + "grad_norm": 2.281599431100698, + "language_loss": 0.84673786, + "learning_rate": 3.922098191955998e-06, + "loss": 0.8689968, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.19714355, + "step": 1937, + "time_per_iteration": 2.7062835693359375 + }, + { + "auxiliary_loss_clip": 0.01169895, + "auxiliary_loss_mlp": 0.01044374, + "balance_loss_clip": 1.05648518, + "balance_loss_mlp": 1.02587283, + "epoch": 0.11651886367052458, + "flos": 33944914045440.0, + "grad_norm": 1.8865377901791773, + "language_loss": 0.76069224, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78283489, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.18505859, + "step": 1938, + "time_per_iteration": 2.7132599353790283 + }, + { + "auxiliary_loss_clip": 0.01178623, + "auxiliary_loss_mlp": 0.01052942, + "balance_loss_clip": 1.05857539, + "balance_loss_mlp": 1.0337379, + "epoch": 0.11657898692319255, + "flos": 27930413891520.0, + "grad_norm": 2.086220420744803, + "language_loss": 0.79196256, + "learning_rate": 3.921882769138696e-06, + "loss": 0.81427824, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.19213867, + "step": 1939, + "time_per_iteration": 2.677170991897583 + }, + { + "auxiliary_loss_clip": 0.01176729, + "auxiliary_loss_mlp": 0.01054164, + "balance_loss_clip": 1.0592227, + "balance_loss_mlp": 1.03399432, + "epoch": 0.11663911017586051, + "flos": 29670626505600.0, + "grad_norm": 2.444460789578623, + "language_loss": 0.86382008, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88612902, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 1.17382812, + "router_z_loss_mlp": 0.20178223, + "step": 1940, + "time_per_iteration": 2.70872163772583 + }, + { + "auxiliary_loss_clip": 0.01174115, + "auxiliary_loss_mlp": 0.01050183, + "balance_loss_clip": 1.06030905, + "balance_loss_mlp": 1.03242123, + "epoch": 0.11669923342852849, + "flos": 51665171582880.0, + "grad_norm": 1.3946733630959338, + "language_loss": 0.75727522, + "learning_rate": 3.921667054809449e-06, + "loss": 0.77951825, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.17773438, + "step": 1941, + "time_per_iteration": 2.998112916946411 + }, + { + "auxiliary_loss_clip": 0.01168684, + "auxiliary_loss_mlp": 0.01056182, + "balance_loss_clip": 1.05414057, + "balance_loss_mlp": 1.03701282, + "epoch": 0.11675935668119646, + "flos": 17866586901600.0, + "grad_norm": 2.20000204990048, + "language_loss": 0.88743818, + "learning_rate": 3.921559088338068e-06, + "loss": 0.90968686, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.19177246, + "step": 1942, + "time_per_iteration": 2.73551607131958 + }, + { + "auxiliary_loss_clip": 0.01170283, + "auxiliary_loss_mlp": 0.01046856, + "balance_loss_clip": 1.05643296, + "balance_loss_mlp": 1.02932072, + "epoch": 0.11681947993386442, + "flos": 42850233489600.0, + "grad_norm": 2.1930919822538035, + "language_loss": 0.6840204, + "learning_rate": 3.921451049000975e-06, + "loss": 0.70619184, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.17529297, + "step": 1943, + "time_per_iteration": 2.7659873962402344 + }, + { + "auxiliary_loss_clip": 0.01171292, + "auxiliary_loss_mlp": 0.01050773, + "balance_loss_clip": 1.05715346, + "balance_loss_mlp": 1.03216434, + "epoch": 0.11687960318653239, + "flos": 47568851052480.0, + "grad_norm": 1.8195761900031528, + "language_loss": 0.69382125, + "learning_rate": 3.921342936802265e-06, + "loss": 0.71604186, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.18603516, + "step": 1944, + "time_per_iteration": 2.82501220703125 + }, + { + "auxiliary_loss_clip": 0.01169435, + "auxiliary_loss_mlp": 0.01046373, + "balance_loss_clip": 1.05476499, + "balance_loss_mlp": 1.02864635, + "epoch": 0.11693972643920036, + "flos": 31719495823200.0, + "grad_norm": 1.641501724408868, + "language_loss": 0.82467651, + "learning_rate": 3.921234751746038e-06, + "loss": 0.8468346, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.17749023, + "step": 1945, + "time_per_iteration": 2.707690954208374 + }, + { + "auxiliary_loss_clip": 0.01169353, + "auxiliary_loss_mlp": 0.01054647, + "balance_loss_clip": 1.05425501, + "balance_loss_mlp": 1.0359906, + "epoch": 0.11699984969186833, + "flos": 33277932803520.0, + "grad_norm": 1.9127769816199647, + "language_loss": 0.7638936, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.78613359, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.18664551, + "step": 1946, + "time_per_iteration": 2.693601369857788 + }, + { + "auxiliary_loss_clip": 0.01170629, + "auxiliary_loss_mlp": 0.01047995, + "balance_loss_clip": 1.05800271, + "balance_loss_mlp": 1.03075695, + "epoch": 0.1170599729445363, + "flos": 18629459844480.0, + "grad_norm": 2.1376281228316065, + "language_loss": 0.68747318, + "learning_rate": 3.921018163077448e-06, + "loss": 0.70965934, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.17224121, + "step": 1947, + "time_per_iteration": 2.6752376556396484 + }, + { + "auxiliary_loss_clip": 0.01175212, + "auxiliary_loss_mlp": 0.01063628, + "balance_loss_clip": 1.0606575, + "balance_loss_mlp": 1.04474497, + "epoch": 0.11712009619720427, + "flos": 20943881830080.0, + "grad_norm": 1.8957500904086477, + "language_loss": 0.85306931, + "learning_rate": 3.920909759473295e-06, + "loss": 0.8754577, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.18884277, + "step": 1948, + "time_per_iteration": 2.6327340602874756 + }, + { + "auxiliary_loss_clip": 0.01060752, + "auxiliary_loss_mlp": 0.01007005, + "balance_loss_clip": 1.01950717, + "balance_loss_mlp": 1.00489855, + "epoch": 0.11718021944987224, + "flos": 86563121281920.0, + "grad_norm": 0.8152606787819839, + "language_loss": 0.65063989, + "learning_rate": 3.920801283028054e-06, + "loss": 0.67131746, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 0.41186523, + "router_z_loss_mlp": 0.02108765, + "step": 1949, + "time_per_iteration": 3.3110365867614746 + }, + { + "auxiliary_loss_clip": 0.0117088, + "auxiliary_loss_mlp": 0.01053652, + "balance_loss_clip": 1.05774045, + "balance_loss_mlp": 1.0353657, + "epoch": 0.1172403427025402, + "flos": 33499327986720.0, + "grad_norm": 1.5096438970126054, + "language_loss": 0.71596014, + "learning_rate": 3.920692733745835e-06, + "loss": 0.73820543, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.18273926, + "step": 1950, + "time_per_iteration": 5.624756813049316 + }, + { + "auxiliary_loss_clip": 0.01175847, + "auxiliary_loss_mlp": 0.01054975, + "balance_loss_clip": 1.05815005, + "balance_loss_mlp": 1.03609228, + "epoch": 0.11730046595520818, + "flos": 19118676663360.0, + "grad_norm": 2.629157862226031, + "language_loss": 0.76677859, + "learning_rate": 3.920584111630755e-06, + "loss": 0.78908682, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.18884277, + "step": 1951, + "time_per_iteration": 2.66506028175354 + }, + { + "auxiliary_loss_clip": 0.01173259, + "auxiliary_loss_mlp": 0.01057947, + "balance_loss_clip": 1.05820286, + "balance_loss_mlp": 1.03895736, + "epoch": 0.11736058920787615, + "flos": 31274314937280.0, + "grad_norm": 1.7295410250351486, + "language_loss": 0.76414752, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78645957, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.19006348, + "step": 1952, + "time_per_iteration": 2.9780285358428955 + }, + { + "auxiliary_loss_clip": 0.01172306, + "auxiliary_loss_mlp": 0.01059024, + "balance_loss_clip": 1.05625844, + "balance_loss_mlp": 1.04066551, + "epoch": 0.11742071246054411, + "flos": 26154187765920.0, + "grad_norm": 1.8645070042630805, + "language_loss": 0.72166967, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74398291, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.18359375, + "step": 1953, + "time_per_iteration": 5.489282131195068 + }, + { + "auxiliary_loss_clip": 0.01176537, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.05660701, + "balance_loss_mlp": 1.03261805, + "epoch": 0.11748083571321208, + "flos": 19520226789120.0, + "grad_norm": 2.21995659251913, + "language_loss": 0.79954505, + "learning_rate": 3.920257808329552e-06, + "loss": 0.82183743, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 1.19824219, + "router_z_loss_mlp": 0.20080566, + "step": 1954, + "time_per_iteration": 2.7587430477142334 + }, + { + "auxiliary_loss_clip": 0.01171879, + "auxiliary_loss_mlp": 0.01054566, + "balance_loss_clip": 1.05470049, + "balance_loss_mlp": 1.03480124, + "epoch": 0.11754095896588006, + "flos": 19742594387040.0, + "grad_norm": 1.945421410727672, + "language_loss": 0.85913527, + "learning_rate": 3.920148894924246e-06, + "loss": 0.88139969, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 1.16992188, + "router_z_loss_mlp": 0.19750977, + "step": 1955, + "time_per_iteration": 2.6092655658721924 + }, + { + "auxiliary_loss_clip": 0.01171198, + "auxiliary_loss_mlp": 0.01045122, + "balance_loss_clip": 1.05368567, + "balance_loss_mlp": 1.02668023, + "epoch": 0.11760108221854802, + "flos": 16181228334240.0, + "grad_norm": 2.2855667215570934, + "language_loss": 0.77774763, + "learning_rate": 3.920039908706701e-06, + "loss": 0.79991084, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.18432617, + "step": 1956, + "time_per_iteration": 2.6679553985595703 + }, + { + "auxiliary_loss_clip": 0.01168017, + "auxiliary_loss_mlp": 0.01051743, + "balance_loss_clip": 1.05516911, + "balance_loss_mlp": 1.03218126, + "epoch": 0.11766120547121599, + "flos": 29893358759040.0, + "grad_norm": 2.166059932254052, + "language_loss": 0.80086643, + "learning_rate": 3.91993084968105e-06, + "loss": 0.82306403, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.19567871, + "step": 1957, + "time_per_iteration": 2.6747963428497314 + }, + { + "auxiliary_loss_clip": 0.01175781, + "auxiliary_loss_mlp": 0.01047918, + "balance_loss_clip": 1.05774355, + "balance_loss_mlp": 1.02916682, + "epoch": 0.11772132872388397, + "flos": 21699988387200.0, + "grad_norm": 2.7049180168201628, + "language_loss": 0.77679706, + "learning_rate": 3.919821717851428e-06, + "loss": 0.79903406, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.18774414, + "step": 1958, + "time_per_iteration": 2.632571220397949 + }, + { + "auxiliary_loss_clip": 0.0117254, + "auxiliary_loss_mlp": 0.0104505, + "balance_loss_clip": 1.05622435, + "balance_loss_mlp": 1.02559531, + "epoch": 0.11778145197655193, + "flos": 16127589805920.0, + "grad_norm": 1.9373089301579893, + "language_loss": 0.76860964, + "learning_rate": 3.919712513221976e-06, + "loss": 0.79078555, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.19458008, + "step": 1959, + "time_per_iteration": 2.6025731563568115 + }, + { + "auxiliary_loss_clip": 0.01172872, + "auxiliary_loss_mlp": 0.01050515, + "balance_loss_clip": 1.05571878, + "balance_loss_mlp": 1.0319066, + "epoch": 0.1178415752292199, + "flos": 24684876100800.0, + "grad_norm": 2.3972456291321635, + "language_loss": 0.70286679, + "learning_rate": 3.919603235796832e-06, + "loss": 0.7251007, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.18603516, + "step": 1960, + "time_per_iteration": 2.6644225120544434 + }, + { + "auxiliary_loss_clip": 0.01177104, + "auxiliary_loss_mlp": 0.01049717, + "balance_loss_clip": 1.05771649, + "balance_loss_mlp": 1.02968991, + "epoch": 0.11790169848188788, + "flos": 15911218765440.0, + "grad_norm": 2.623510342645647, + "language_loss": 0.81460869, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83687699, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 1.19433594, + "router_z_loss_mlp": 0.20031738, + "step": 1961, + "time_per_iteration": 2.664818286895752 + }, + { + "auxiliary_loss_clip": 0.01166933, + "auxiliary_loss_mlp": 0.01053274, + "balance_loss_clip": 1.05621064, + "balance_loss_mlp": 1.03503489, + "epoch": 0.11796182173455584, + "flos": 27169485778080.0, + "grad_norm": 1.888302586420116, + "language_loss": 0.92238855, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94459069, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.18249512, + "step": 1962, + "time_per_iteration": 2.6571578979492188 + }, + { + "auxiliary_loss_clip": 0.01173074, + "auxiliary_loss_mlp": 0.01055514, + "balance_loss_clip": 1.05663884, + "balance_loss_mlp": 1.03626215, + "epoch": 0.1180219449872238, + "flos": 12976039404000.0, + "grad_norm": 7.509933712122334, + "language_loss": 0.87060142, + "learning_rate": 3.919274966788707e-06, + "loss": 0.89288729, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.19262695, + "step": 1963, + "time_per_iteration": 2.6121890544891357 + }, + { + "auxiliary_loss_clip": 0.01173547, + "auxiliary_loss_mlp": 0.01048609, + "balance_loss_clip": 1.05537593, + "balance_loss_mlp": 1.03004766, + "epoch": 0.11808206823989177, + "flos": 25530918318720.0, + "grad_norm": 1.935014261534538, + "language_loss": 0.84303588, + "learning_rate": 3.919165398222265e-06, + "loss": 0.86525744, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.18566895, + "step": 1964, + "time_per_iteration": 2.9125843048095703 + }, + { + "auxiliary_loss_clip": 0.01178344, + "auxiliary_loss_mlp": 0.01066296, + "balance_loss_clip": 1.06196332, + "balance_loss_mlp": 1.04743731, + "epoch": 0.11814219149255975, + "flos": 25353032343840.0, + "grad_norm": 2.4106464029231147, + "language_loss": 0.83077598, + "learning_rate": 3.919055756880879e-06, + "loss": 0.85322237, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.18847656, + "step": 1965, + "time_per_iteration": 2.6686556339263916 + }, + { + "auxiliary_loss_clip": 0.01173512, + "auxiliary_loss_mlp": 0.01051351, + "balance_loss_clip": 1.05511832, + "balance_loss_mlp": 1.03228927, + "epoch": 0.11820231474522772, + "flos": 59499245001600.0, + "grad_norm": 1.6714637925162117, + "language_loss": 0.74399292, + "learning_rate": 3.918946042768707e-06, + "loss": 0.76624155, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 1.18554688, + "router_z_loss_mlp": 0.1907959, + "step": 1966, + "time_per_iteration": 2.875913619995117 + }, + { + "auxiliary_loss_clip": 0.01182935, + "auxiliary_loss_mlp": 0.01059247, + "balance_loss_clip": 1.06159937, + "balance_loss_mlp": 1.03985167, + "epoch": 0.11826243799789568, + "flos": 20365620730560.0, + "grad_norm": 4.238624680096797, + "language_loss": 0.7269125, + "learning_rate": 3.918836255889908e-06, + "loss": 0.74933434, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 1.21289062, + "router_z_loss_mlp": 0.19384766, + "step": 1967, + "time_per_iteration": 2.6381547451019287 + }, + { + "auxiliary_loss_clip": 0.01173401, + "auxiliary_loss_mlp": 0.01046587, + "balance_loss_clip": 1.05685496, + "balance_loss_mlp": 1.02737045, + "epoch": 0.11832256125056366, + "flos": 20633158745280.0, + "grad_norm": 2.3228128875424834, + "language_loss": 0.8842929, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90649283, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.19213867, + "step": 1968, + "time_per_iteration": 2.644057512283325 + }, + { + "auxiliary_loss_clip": 0.01174025, + "auxiliary_loss_mlp": 0.01046957, + "balance_loss_clip": 1.05774546, + "balance_loss_mlp": 1.02733469, + "epoch": 0.11838268450323162, + "flos": 27845664442560.0, + "grad_norm": 2.1071701395160765, + "language_loss": 0.66711688, + "learning_rate": 3.918616463849087e-06, + "loss": 0.68932664, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.19616699, + "step": 1969, + "time_per_iteration": 2.68463134765625 + }, + { + "auxiliary_loss_clip": 0.01176274, + "auxiliary_loss_mlp": 0.01053802, + "balance_loss_clip": 1.06046057, + "balance_loss_mlp": 1.03350103, + "epoch": 0.11844280775589959, + "flos": 40933796109120.0, + "grad_norm": 2.171442532874965, + "language_loss": 0.8100614, + "learning_rate": 3.918506458695399e-06, + "loss": 0.83236217, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.203125, + "step": 1970, + "time_per_iteration": 2.7788772583007812 + }, + { + "auxiliary_loss_clip": 0.01061824, + "auxiliary_loss_mlp": 0.01026585, + "balance_loss_clip": 1.01994228, + "balance_loss_mlp": 1.02429891, + "epoch": 0.11850293100856757, + "flos": 80961394540320.0, + "grad_norm": 0.8064052800584802, + "language_loss": 0.66218048, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68306464, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 0.41870117, + "router_z_loss_mlp": 0.02287292, + "step": 1971, + "time_per_iteration": 3.294957160949707 + }, + { + "auxiliary_loss_clip": 0.01172304, + "auxiliary_loss_mlp": 0.01051438, + "balance_loss_clip": 1.05665421, + "balance_loss_mlp": 1.03300786, + "epoch": 0.11856305426123553, + "flos": 30116172047040.0, + "grad_norm": 1.9430550555697415, + "language_loss": 0.79699492, + "learning_rate": 3.918286230142327e-06, + "loss": 0.81923234, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.18432617, + "step": 1972, + "time_per_iteration": 2.681298017501831 + }, + { + "auxiliary_loss_clip": 0.01175195, + "auxiliary_loss_mlp": 0.01054874, + "balance_loss_clip": 1.05897951, + "balance_loss_mlp": 1.0353241, + "epoch": 0.1186231775139035, + "flos": 29626874193600.0, + "grad_norm": 2.1648054163873796, + "language_loss": 0.72710466, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74940538, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.19555664, + "step": 1973, + "time_per_iteration": 2.7041139602661133 + }, + { + "auxiliary_loss_clip": 0.01172829, + "auxiliary_loss_mlp": 0.01045846, + "balance_loss_clip": 1.05861831, + "balance_loss_mlp": 1.02717805, + "epoch": 0.11868330076657148, + "flos": 26548971505920.0, + "grad_norm": 1.8812626124175753, + "language_loss": 0.72066736, + "learning_rate": 3.918065710622832e-06, + "loss": 0.74285412, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.18652344, + "step": 1974, + "time_per_iteration": 2.7000715732574463 + }, + { + "auxiliary_loss_clip": 0.01173574, + "auxiliary_loss_mlp": 0.01040737, + "balance_loss_clip": 1.05867159, + "balance_loss_mlp": 1.02140093, + "epoch": 0.11874342401923944, + "flos": 20979773789760.0, + "grad_norm": 2.165251958049545, + "language_loss": 0.77938175, + "learning_rate": 3.917955341761128e-06, + "loss": 0.80152482, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.19348145, + "step": 1975, + "time_per_iteration": 2.6186351776123047 + }, + { + "auxiliary_loss_clip": 0.01175076, + "auxiliary_loss_mlp": 0.01050755, + "balance_loss_clip": 1.06120753, + "balance_loss_mlp": 1.03209889, + "epoch": 0.11880354727190741, + "flos": 18582911840160.0, + "grad_norm": 2.931762222713374, + "language_loss": 0.75442451, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77668279, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.18652344, + "step": 1976, + "time_per_iteration": 2.6720223426818848 + }, + { + "auxiliary_loss_clip": 0.01173369, + "auxiliary_loss_mlp": 0.01042896, + "balance_loss_clip": 1.05912328, + "balance_loss_mlp": 1.02382219, + "epoch": 0.11886367052457537, + "flos": 33323710979520.0, + "grad_norm": 1.9178804994899186, + "language_loss": 0.75283903, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77500165, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.19091797, + "step": 1977, + "time_per_iteration": 2.9173080921173096 + }, + { + "auxiliary_loss_clip": 0.01173914, + "auxiliary_loss_mlp": 0.01052518, + "balance_loss_clip": 1.05682886, + "balance_loss_mlp": 1.03345633, + "epoch": 0.11892379377724335, + "flos": 26598436754400.0, + "grad_norm": 2.934158164196827, + "language_loss": 0.74015671, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76242107, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.19067383, + "step": 1978, + "time_per_iteration": 2.6729583740234375 + }, + { + "auxiliary_loss_clip": 0.01175014, + "auxiliary_loss_mlp": 0.01048316, + "balance_loss_clip": 1.06156397, + "balance_loss_mlp": 1.02998209, + "epoch": 0.11898391702991132, + "flos": 17071347002400.0, + "grad_norm": 2.0497908611645808, + "language_loss": 0.73523951, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75747287, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.18347168, + "step": 1979, + "time_per_iteration": 2.6659538745880127 + }, + { + "auxiliary_loss_clip": 0.01177146, + "auxiliary_loss_mlp": 0.01049907, + "balance_loss_clip": 1.06031132, + "balance_loss_mlp": 1.03147721, + "epoch": 0.11904404028257928, + "flos": 39333470611680.0, + "grad_norm": 1.7583147260076717, + "language_loss": 0.98619807, + "learning_rate": 3.917402406600525e-06, + "loss": 1.00846863, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.18432617, + "step": 1980, + "time_per_iteration": 2.7774605751037598 + }, + { + "auxiliary_loss_clip": 0.01177475, + "auxiliary_loss_mlp": 0.01050786, + "balance_loss_clip": 1.05960679, + "balance_loss_mlp": 1.03100944, + "epoch": 0.11910416353524726, + "flos": 28780264733760.0, + "grad_norm": 4.767521483870094, + "language_loss": 0.86013293, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88241553, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 1.17871094, + "router_z_loss_mlp": 0.19775391, + "step": 1981, + "time_per_iteration": 2.651977300643921 + }, + { + "auxiliary_loss_clip": 0.01176826, + "auxiliary_loss_mlp": 0.0105516, + "balance_loss_clip": 1.0614748, + "balance_loss_mlp": 1.03577662, + "epoch": 0.11916428678791523, + "flos": 30912222291840.0, + "grad_norm": 2.0648348479286844, + "language_loss": 0.85410392, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87642372, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.19396973, + "step": 1982, + "time_per_iteration": 2.696668863296509 + }, + { + "auxiliary_loss_clip": 0.01173962, + "auxiliary_loss_mlp": 0.01046324, + "balance_loss_clip": 1.05970502, + "balance_loss_mlp": 1.02833521, + "epoch": 0.11922441004058319, + "flos": 24146153516160.0, + "grad_norm": 2.104264026274747, + "language_loss": 0.85044032, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87264317, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.18005371, + "step": 1983, + "time_per_iteration": 2.6324384212493896 + }, + { + "auxiliary_loss_clip": 0.01177817, + "auxiliary_loss_mlp": 0.01055393, + "balance_loss_clip": 1.06021714, + "balance_loss_mlp": 1.03630793, + "epoch": 0.11928453329325117, + "flos": 26279407627200.0, + "grad_norm": 2.888493072375824, + "language_loss": 0.76599717, + "learning_rate": 3.916958749701277e-06, + "loss": 0.78832924, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 1.17578125, + "router_z_loss_mlp": 0.19091797, + "step": 1984, + "time_per_iteration": 2.6366584300994873 + }, + { + "auxiliary_loss_clip": 0.01174176, + "auxiliary_loss_mlp": 0.01051833, + "balance_loss_clip": 1.05894732, + "balance_loss_mlp": 1.03309345, + "epoch": 0.11934465654591914, + "flos": 25398405347040.0, + "grad_norm": 2.0070822935736734, + "language_loss": 0.83238846, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85464859, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.18737793, + "step": 1985, + "time_per_iteration": 2.6523096561431885 + }, + { + "auxiliary_loss_clip": 0.01170293, + "auxiliary_loss_mlp": 0.01051066, + "balance_loss_clip": 1.05760753, + "balance_loss_mlp": 1.03257692, + "epoch": 0.1194047797985871, + "flos": 23257696056480.0, + "grad_norm": 1.7795837564008832, + "language_loss": 0.74249923, + "learning_rate": 3.916736485087216e-06, + "loss": 0.76471281, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.18481445, + "step": 1986, + "time_per_iteration": 2.6382532119750977 + }, + { + "auxiliary_loss_clip": 0.01177027, + "auxiliary_loss_mlp": 0.0105314, + "balance_loss_clip": 1.06106257, + "balance_loss_mlp": 1.03480542, + "epoch": 0.11946490305125507, + "flos": 33179488513920.0, + "grad_norm": 5.9710198486188535, + "language_loss": 0.72492152, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74722326, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.18322754, + "step": 1987, + "time_per_iteration": 2.7347829341888428 + }, + { + "auxiliary_loss_clip": 0.01176462, + "auxiliary_loss_mlp": 0.01053742, + "balance_loss_clip": 1.05995142, + "balance_loss_mlp": 1.03409648, + "epoch": 0.11952502630392305, + "flos": 25796187365760.0, + "grad_norm": 2.009237606406184, + "language_loss": 0.71969533, + "learning_rate": 3.916513929741799e-06, + "loss": 0.74199736, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.19665527, + "step": 1988, + "time_per_iteration": 2.657158136367798 + }, + { + "auxiliary_loss_clip": 0.0117523, + "auxiliary_loss_mlp": 0.01060148, + "balance_loss_clip": 1.059672, + "balance_loss_mlp": 1.03973985, + "epoch": 0.11958514955659101, + "flos": 26996421359520.0, + "grad_norm": 1.8620075915690397, + "language_loss": 0.8118093, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83416307, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.20410156, + "step": 1989, + "time_per_iteration": 5.843271732330322 + }, + { + "auxiliary_loss_clip": 0.01179898, + "auxiliary_loss_mlp": 0.0104943, + "balance_loss_clip": 1.06332278, + "balance_loss_mlp": 1.03033304, + "epoch": 0.11964527280925898, + "flos": 21257360089920.0, + "grad_norm": 2.771027032534343, + "language_loss": 0.75428075, + "learning_rate": 3.916291083698784e-06, + "loss": 0.77657402, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.19116211, + "step": 1990, + "time_per_iteration": 2.6891064643859863 + }, + { + "auxiliary_loss_clip": 0.01068945, + "auxiliary_loss_mlp": 0.01018263, + "balance_loss_clip": 1.02767265, + "balance_loss_mlp": 1.01582944, + "epoch": 0.11970539606192696, + "flos": 86244861983040.0, + "grad_norm": 0.9171434084251402, + "language_loss": 0.55195534, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57282746, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 0.4128418, + "router_z_loss_mlp": 0.02430725, + "step": 1991, + "time_per_iteration": 3.2676053047180176 + }, + { + "auxiliary_loss_clip": 0.01173957, + "auxiliary_loss_mlp": 0.0105356, + "balance_loss_clip": 1.06044316, + "balance_loss_mlp": 1.03502274, + "epoch": 0.11976551931459492, + "flos": 25886649751200.0, + "grad_norm": 2.40792066093157, + "language_loss": 0.78499985, + "learning_rate": 3.916067946991971e-06, + "loss": 0.807275, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.18530273, + "step": 1992, + "time_per_iteration": 5.545731782913208 + }, + { + "auxiliary_loss_clip": 0.01177108, + "auxiliary_loss_mlp": 0.01044655, + "balance_loss_clip": 1.05824471, + "balance_loss_mlp": 1.02526021, + "epoch": 0.11982564256726289, + "flos": 31713094092960.0, + "grad_norm": 1.6740890553569834, + "language_loss": 0.78697944, + "learning_rate": 3.915956269650216e-06, + "loss": 0.80919707, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 1.18847656, + "router_z_loss_mlp": 0.19396973, + "step": 1993, + "time_per_iteration": 2.725550651550293 + }, + { + "auxiliary_loss_clip": 0.01173008, + "auxiliary_loss_mlp": 0.01051998, + "balance_loss_clip": 1.05759919, + "balance_loss_mlp": 1.03344893, + "epoch": 0.11988576581993086, + "flos": 26418686984640.0, + "grad_norm": 1.7677454371536594, + "language_loss": 0.82064199, + "learning_rate": 3.915844519655208e-06, + "loss": 0.84289205, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.18554688, + "step": 1994, + "time_per_iteration": 2.700190544128418 + }, + { + "auxiliary_loss_clip": 0.01172698, + "auxiliary_loss_mlp": 0.01053699, + "balance_loss_clip": 1.05971789, + "balance_loss_mlp": 1.03616381, + "epoch": 0.11994588907259883, + "flos": 21789518875200.0, + "grad_norm": 2.079604693503057, + "language_loss": 0.88271129, + "learning_rate": 3.915732697011183e-06, + "loss": 0.9049753, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.17529297, + "step": 1995, + "time_per_iteration": 2.6664557456970215 + }, + { + "auxiliary_loss_clip": 0.01175316, + "auxiliary_loss_mlp": 0.01055629, + "balance_loss_clip": 1.05933845, + "balance_loss_mlp": 1.03675854, + "epoch": 0.1200060123252668, + "flos": 29849727998880.0, + "grad_norm": 1.7625399038373155, + "language_loss": 0.74191189, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.76422143, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.1887207, + "step": 1996, + "time_per_iteration": 2.741199493408203 + }, + { + "auxiliary_loss_clip": 0.01172436, + "auxiliary_loss_mlp": 0.01049508, + "balance_loss_clip": 1.05735064, + "balance_loss_mlp": 1.03092337, + "epoch": 0.12006613557793476, + "flos": 22857077828160.0, + "grad_norm": 2.4217033023092425, + "language_loss": 0.87938762, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90160704, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.18591309, + "step": 1997, + "time_per_iteration": 2.634596824645996 + }, + { + "auxiliary_loss_clip": 0.01172381, + "auxiliary_loss_mlp": 0.01064923, + "balance_loss_clip": 1.05760026, + "balance_loss_mlp": 1.04542089, + "epoch": 0.12012625883060274, + "flos": 27171754745760.0, + "grad_norm": 1.9865633682628339, + "language_loss": 0.78674901, + "learning_rate": 3.915396793227428e-06, + "loss": 0.80912209, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.19519043, + "step": 1998, + "time_per_iteration": 2.7022554874420166 + }, + { + "auxiliary_loss_clip": 0.01175559, + "auxiliary_loss_mlp": 0.01049544, + "balance_loss_clip": 1.06143093, + "balance_loss_mlp": 1.03011298, + "epoch": 0.1201863820832707, + "flos": 26550632714400.0, + "grad_norm": 2.191090085989948, + "language_loss": 0.73285139, + "learning_rate": 3.915284680029769e-06, + "loss": 0.7551024, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.19433594, + "step": 1999, + "time_per_iteration": 2.681018590927124 + }, + { + "auxiliary_loss_clip": 0.01175921, + "auxiliary_loss_mlp": 0.0105887, + "balance_loss_clip": 1.05973697, + "balance_loss_mlp": 1.03993988, + "epoch": 0.12024650533593867, + "flos": 26732286796320.0, + "grad_norm": 2.276235508088788, + "language_loss": 0.74710774, + "learning_rate": 3.915172494204323e-06, + "loss": 0.76945567, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.18945312, + "step": 2000, + "time_per_iteration": 2.9404964447021484 + }, + { + "auxiliary_loss_clip": 0.01171884, + "auxiliary_loss_mlp": 0.01048796, + "balance_loss_clip": 1.05651391, + "balance_loss_mlp": 1.02992558, + "epoch": 0.12030662858860665, + "flos": 26465316023520.0, + "grad_norm": 1.700658928930325, + "language_loss": 0.84769762, + "learning_rate": 3.915060235755344e-06, + "loss": 0.8699044, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.18884277, + "step": 2001, + "time_per_iteration": 2.6410679817199707 + }, + { + "auxiliary_loss_clip": 0.01175337, + "auxiliary_loss_mlp": 0.01050971, + "balance_loss_clip": 1.05976319, + "balance_loss_mlp": 1.03305411, + "epoch": 0.12036675184127461, + "flos": 15780529071360.0, + "grad_norm": 2.6715212997303928, + "language_loss": 0.73946905, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76173216, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.17919922, + "step": 2002, + "time_per_iteration": 2.634549617767334 + }, + { + "auxiliary_loss_clip": 0.01180265, + "auxiliary_loss_mlp": 0.01052372, + "balance_loss_clip": 1.05977392, + "balance_loss_mlp": 1.03282118, + "epoch": 0.12042687509394258, + "flos": 25442400762720.0, + "grad_norm": 2.017060234268761, + "language_loss": 0.78122836, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80355477, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 1.20507812, + "router_z_loss_mlp": 0.19543457, + "step": 2003, + "time_per_iteration": 2.690065383911133 + }, + { + "auxiliary_loss_clip": 0.0117123, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_clip": 1.05815208, + "balance_loss_mlp": 1.02899456, + "epoch": 0.12048699834661056, + "flos": 29136847029120.0, + "grad_norm": 1.8422433907827465, + "language_loss": 0.7205447, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74273551, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.18859863, + "step": 2004, + "time_per_iteration": 2.678133249282837 + }, + { + "auxiliary_loss_clip": 0.01180667, + "auxiliary_loss_mlp": 0.01058346, + "balance_loss_clip": 1.06068182, + "balance_loss_mlp": 1.03876042, + "epoch": 0.12054712159927852, + "flos": 24106615001280.0, + "grad_norm": 1.7278483174151842, + "language_loss": 0.78574574, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80813587, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 1.19726562, + "router_z_loss_mlp": 0.19592285, + "step": 2005, + "time_per_iteration": 2.700021743774414 + }, + { + "auxiliary_loss_clip": 0.01063527, + "auxiliary_loss_mlp": 0.01001964, + "balance_loss_clip": 1.02298689, + "balance_loss_mlp": 0.99982613, + "epoch": 0.12060724485194649, + "flos": 63050893257600.0, + "grad_norm": 0.9269307324776079, + "language_loss": 0.58074868, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60140365, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 0.40527344, + "router_z_loss_mlp": 0.02140808, + "step": 2006, + "time_per_iteration": 3.072404384613037 + }, + { + "auxiliary_loss_clip": 0.01172641, + "auxiliary_loss_mlp": 0.01049442, + "balance_loss_clip": 1.06068015, + "balance_loss_mlp": 1.03128672, + "epoch": 0.12066736810461445, + "flos": 23171933675520.0, + "grad_norm": 1.7343022756356885, + "language_loss": 0.76589602, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78811681, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 1.11865234, + "router_z_loss_mlp": 0.1817627, + "step": 2007, + "time_per_iteration": 2.7307677268981934 + }, + { + "auxiliary_loss_clip": 0.01173432, + "auxiliary_loss_mlp": 0.01054863, + "balance_loss_clip": 1.05792463, + "balance_loss_mlp": 1.03534818, + "epoch": 0.12072749135728243, + "flos": 20098812026880.0, + "grad_norm": 2.7068883954417604, + "language_loss": 0.83044171, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85272467, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 1.15527344, + "router_z_loss_mlp": 0.19519043, + "step": 2008, + "time_per_iteration": 2.640188694000244 + }, + { + "auxiliary_loss_clip": 0.01172115, + "auxiliary_loss_mlp": 0.01045648, + "balance_loss_clip": 1.05695009, + "balance_loss_mlp": 1.0272541, + "epoch": 0.1207876146099504, + "flos": 22057543097280.0, + "grad_norm": 2.242080154412139, + "language_loss": 0.84627402, + "learning_rate": 3.91415955422773e-06, + "loss": 0.8684516, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.18383789, + "step": 2009, + "time_per_iteration": 2.6985039710998535 + }, + { + "auxiliary_loss_clip": 0.01173426, + "auxiliary_loss_mlp": 0.01049418, + "balance_loss_clip": 1.06019557, + "balance_loss_mlp": 1.02890158, + "epoch": 0.12084773786261836, + "flos": 26687845690560.0, + "grad_norm": 1.85793247365175, + "language_loss": 0.83582407, + "learning_rate": 3.914046642358844e-06, + "loss": 0.85805249, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.20507812, + "step": 2010, + "time_per_iteration": 2.64894700050354 + }, + { + "auxiliary_loss_clip": 0.01179055, + "auxiliary_loss_mlp": 0.0105886, + "balance_loss_clip": 1.06395233, + "balance_loss_mlp": 1.04015589, + "epoch": 0.12090786111528634, + "flos": 22369805838720.0, + "grad_norm": 1.6329761315504785, + "language_loss": 0.84101629, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.86339545, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.18701172, + "step": 2011, + "time_per_iteration": 2.692593574523926 + }, + { + "auxiliary_loss_clip": 0.01178516, + "auxiliary_loss_mlp": 0.01052948, + "balance_loss_clip": 1.06217933, + "balance_loss_mlp": 1.03407681, + "epoch": 0.1209679843679543, + "flos": 25753407468480.0, + "grad_norm": 2.091537013158773, + "language_loss": 0.96197569, + "learning_rate": 3.913820600882834e-06, + "loss": 0.98429024, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 1.16308594, + "router_z_loss_mlp": 0.18847656, + "step": 2012, + "time_per_iteration": 2.651033401489258 + }, + { + "auxiliary_loss_clip": 0.01169831, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.05744326, + "balance_loss_mlp": 1.02411246, + "epoch": 0.12102810762062227, + "flos": 35680710276000.0, + "grad_norm": 1.988459669222739, + "language_loss": 0.80674762, + "learning_rate": 3.913707471284283e-06, + "loss": 0.82887936, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.19226074, + "step": 2013, + "time_per_iteration": 2.9745161533355713 + }, + { + "auxiliary_loss_clip": 0.01180025, + "auxiliary_loss_mlp": 0.01043879, + "balance_loss_clip": 1.06181073, + "balance_loss_mlp": 1.02387583, + "epoch": 0.12108823087329025, + "flos": 21878927811360.0, + "grad_norm": 2.47480416706027, + "language_loss": 0.7677477, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.78998673, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 1.18261719, + "router_z_loss_mlp": 0.20007324, + "step": 2014, + "time_per_iteration": 2.649998664855957 + }, + { + "auxiliary_loss_clip": 0.01175036, + "auxiliary_loss_mlp": 0.01045764, + "balance_loss_clip": 1.0612154, + "balance_loss_mlp": 1.02669048, + "epoch": 0.12114835412595822, + "flos": 26956518189120.0, + "grad_norm": 2.0803618244370634, + "language_loss": 0.86741376, + "learning_rate": 3.913480994387535e-06, + "loss": 0.88962185, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.1907959, + "step": 2015, + "time_per_iteration": 2.6662449836730957 + }, + { + "auxiliary_loss_clip": 0.01166268, + "auxiliary_loss_mlp": 0.01044677, + "balance_loss_clip": 1.05496526, + "balance_loss_mlp": 1.02602112, + "epoch": 0.12120847737862618, + "flos": 24907162664160.0, + "grad_norm": 1.8809000212104143, + "language_loss": 0.69145823, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71356767, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.18664551, + "step": 2016, + "time_per_iteration": 2.727698802947998 + }, + { + "auxiliary_loss_clip": 0.01175195, + "auxiliary_loss_mlp": 0.01043858, + "balance_loss_clip": 1.06023693, + "balance_loss_mlp": 1.02343738, + "epoch": 0.12126860063129415, + "flos": 27221706201600.0, + "grad_norm": 2.568798757674108, + "language_loss": 0.80563247, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82782298, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.20410156, + "step": 2017, + "time_per_iteration": 2.7768514156341553 + }, + { + "auxiliary_loss_clip": 0.01172232, + "auxiliary_loss_mlp": 0.01049143, + "balance_loss_clip": 1.05782723, + "balance_loss_mlp": 1.02882957, + "epoch": 0.12132872388396213, + "flos": 16715331948960.0, + "grad_norm": 3.1818868980505584, + "language_loss": 0.69028521, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71249896, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 1.14550781, + "router_z_loss_mlp": 0.203125, + "step": 2018, + "time_per_iteration": 2.6480801105499268 + }, + { + "auxiliary_loss_clip": 0.01173675, + "auxiliary_loss_mlp": 0.0104951, + "balance_loss_clip": 1.06143129, + "balance_loss_mlp": 1.03145003, + "epoch": 0.12138884713663009, + "flos": 32295650024160.0, + "grad_norm": 1.7398028721242145, + "language_loss": 0.72597432, + "learning_rate": 3.91302716991575e-06, + "loss": 0.7482062, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.18054199, + "step": 2019, + "time_per_iteration": 2.725452423095703 + }, + { + "auxiliary_loss_clip": 0.01172297, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.05698681, + "balance_loss_mlp": 1.03437376, + "epoch": 0.12144897038929806, + "flos": 31898354212800.0, + "grad_norm": 2.989320498121431, + "language_loss": 0.91794705, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94021249, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.19885254, + "step": 2020, + "time_per_iteration": 2.8010499477386475 + }, + { + "auxiliary_loss_clip": 0.01174371, + "auxiliary_loss_mlp": 0.01049833, + "balance_loss_clip": 1.06071305, + "balance_loss_mlp": 1.0313077, + "epoch": 0.12150909364196603, + "flos": 30160694187360.0, + "grad_norm": 1.9572070058282243, + "language_loss": 0.77268171, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79492378, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.18518066, + "step": 2021, + "time_per_iteration": 2.70477294921875 + }, + { + "auxiliary_loss_clip": 0.01170837, + "auxiliary_loss_mlp": 0.01045266, + "balance_loss_clip": 1.05982733, + "balance_loss_mlp": 1.02662182, + "epoch": 0.121569216894634, + "flos": 30734012178720.0, + "grad_norm": 1.970969603029439, + "language_loss": 0.8043133, + "learning_rate": 3.912686039853952e-06, + "loss": 0.82647431, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.18640137, + "step": 2022, + "time_per_iteration": 2.72761869430542 + }, + { + "auxiliary_loss_clip": 0.01177459, + "auxiliary_loss_mlp": 0.01050977, + "balance_loss_clip": 1.06259632, + "balance_loss_mlp": 1.03199911, + "epoch": 0.12162934014730196, + "flos": 16403920070400.0, + "grad_norm": 1.9605281200188298, + "language_loss": 0.84926277, + "learning_rate": 3.912572184769108e-06, + "loss": 0.8715471, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.18981934, + "step": 2023, + "time_per_iteration": 2.6534671783447266 + }, + { + "auxiliary_loss_clip": 0.01174824, + "auxiliary_loss_mlp": 0.01049888, + "balance_loss_clip": 1.05944335, + "balance_loss_mlp": 1.03027844, + "epoch": 0.12168946339996994, + "flos": 20677437781920.0, + "grad_norm": 2.1083591284706564, + "language_loss": 0.85522664, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87747383, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.19604492, + "step": 2024, + "time_per_iteration": 2.7188098430633545 + }, + { + "auxiliary_loss_clip": 0.01170235, + "auxiliary_loss_mlp": 0.01054932, + "balance_loss_clip": 1.05502152, + "balance_loss_mlp": 1.03615713, + "epoch": 0.12174958665263791, + "flos": 36389417965920.0, + "grad_norm": 2.335070168776512, + "language_loss": 0.71956635, + "learning_rate": 3.912344257028954e-06, + "loss": 0.74181801, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.18774414, + "step": 2025, + "time_per_iteration": 2.8736207485198975 + }, + { + "auxiliary_loss_clip": 0.01174071, + "auxiliary_loss_mlp": 0.0104231, + "balance_loss_clip": 1.05871677, + "balance_loss_mlp": 1.02413082, + "epoch": 0.12180970990530587, + "flos": 30068651628000.0, + "grad_norm": 1.9107951123829912, + "language_loss": 0.76072812, + "learning_rate": 3.912230184382286e-06, + "loss": 0.78289199, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.18200684, + "step": 2026, + "time_per_iteration": 2.842562675476074 + }, + { + "auxiliary_loss_clip": 0.01173042, + "auxiliary_loss_mlp": 0.01045559, + "balance_loss_clip": 1.05949068, + "balance_loss_mlp": 1.02753496, + "epoch": 0.12186983315797385, + "flos": 25040607533280.0, + "grad_norm": 3.3971895349203245, + "language_loss": 0.89002717, + "learning_rate": 3.912116039223659e-06, + "loss": 0.91221321, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.18029785, + "step": 2027, + "time_per_iteration": 2.6808524131774902 + }, + { + "auxiliary_loss_clip": 0.01171439, + "auxiliary_loss_mlp": 0.01050646, + "balance_loss_clip": 1.05961204, + "balance_loss_mlp": 1.03352714, + "epoch": 0.12192995641064182, + "flos": 33944589907200.0, + "grad_norm": 1.8803044634079242, + "language_loss": 0.75682533, + "learning_rate": 3.912001821557399e-06, + "loss": 0.77904618, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.17114258, + "step": 2028, + "time_per_iteration": 2.7325587272644043 + }, + { + "auxiliary_loss_clip": 0.01171116, + "auxiliary_loss_mlp": 0.01049859, + "balance_loss_clip": 1.05859423, + "balance_loss_mlp": 1.03104818, + "epoch": 0.12199007966330978, + "flos": 26865488561760.0, + "grad_norm": 1.988057043498455, + "language_loss": 0.76900655, + "learning_rate": 3.911887531387839e-06, + "loss": 0.79121631, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.18823242, + "step": 2029, + "time_per_iteration": 4.161135911941528 + }, + { + "auxiliary_loss_clip": 0.01171028, + "auxiliary_loss_mlp": 0.01050285, + "balance_loss_clip": 1.05657625, + "balance_loss_mlp": 1.03166437, + "epoch": 0.12205020291597775, + "flos": 28425019508640.0, + "grad_norm": 1.8568511555213958, + "language_loss": 0.79356486, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81577802, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.18615723, + "step": 2030, + "time_per_iteration": 2.705113410949707 + }, + { + "auxiliary_loss_clip": 0.01170368, + "auxiliary_loss_mlp": 0.01052878, + "balance_loss_clip": 1.05854583, + "balance_loss_mlp": 1.03344703, + "epoch": 0.12211032616864573, + "flos": 31764180032640.0, + "grad_norm": 2.2338769530526275, + "language_loss": 0.75179923, + "learning_rate": 3.911658733556155e-06, + "loss": 0.7740317, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.19421387, + "step": 2031, + "time_per_iteration": 4.112341642379761 + }, + { + "auxiliary_loss_clip": 0.01173636, + "auxiliary_loss_mlp": 0.01041774, + "balance_loss_clip": 1.06150854, + "balance_loss_mlp": 1.02487016, + "epoch": 0.12217044942131369, + "flos": 24905582490240.0, + "grad_norm": 1.692301153053859, + "language_loss": 0.7527054, + "learning_rate": 3.911544225902707e-06, + "loss": 0.77485955, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.16894531, + "step": 2032, + "time_per_iteration": 4.155579566955566 + }, + { + "auxiliary_loss_clip": 0.01164986, + "auxiliary_loss_mlp": 0.01044155, + "balance_loss_clip": 1.05594397, + "balance_loss_mlp": 1.02683449, + "epoch": 0.12223057267398166, + "flos": 27890470203840.0, + "grad_norm": 1.5744673367129578, + "language_loss": 0.89009994, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91219139, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.1730957, + "step": 2033, + "time_per_iteration": 2.7515270709991455 + }, + { + "auxiliary_loss_clip": 0.01178066, + "auxiliary_loss_mlp": 0.01043727, + "balance_loss_clip": 1.06280446, + "balance_loss_mlp": 1.02558327, + "epoch": 0.12229069592664964, + "flos": 24462022295520.0, + "grad_norm": 3.006993658276106, + "language_loss": 0.65806198, + "learning_rate": 3.911314993142311e-06, + "loss": 0.68027997, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.18139648, + "step": 2034, + "time_per_iteration": 2.6567564010620117 + }, + { + "auxiliary_loss_clip": 0.01170785, + "auxiliary_loss_mlp": 0.01046944, + "balance_loss_clip": 1.05907738, + "balance_loss_mlp": 1.02798939, + "epoch": 0.1223508191793176, + "flos": 27179088373440.0, + "grad_norm": 1.5736329799916147, + "language_loss": 0.76666445, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78884172, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.18969727, + "step": 2035, + "time_per_iteration": 2.6958861351013184 + }, + { + "auxiliary_loss_clip": 0.01175833, + "auxiliary_loss_mlp": 0.01040714, + "balance_loss_clip": 1.06045079, + "balance_loss_mlp": 1.0224154, + "epoch": 0.12241094243198557, + "flos": 25973425064160.0, + "grad_norm": 1.748829621167028, + "language_loss": 0.71743679, + "learning_rate": 3.911085470472892e-06, + "loss": 0.73960233, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.1829834, + "step": 2036, + "time_per_iteration": 2.6985762119293213 + }, + { + "auxiliary_loss_clip": 0.01174966, + "auxiliary_loss_mlp": 0.01051893, + "balance_loss_clip": 1.06226921, + "balance_loss_mlp": 1.03234243, + "epoch": 0.12247106568465355, + "flos": 21209920705440.0, + "grad_norm": 1.7776340342324033, + "language_loss": 0.83133471, + "learning_rate": 3.910970600433178e-06, + "loss": 0.8536033, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.1953125, + "step": 2037, + "time_per_iteration": 2.9619710445404053 + }, + { + "auxiliary_loss_clip": 0.01175167, + "auxiliary_loss_mlp": 0.0105475, + "balance_loss_clip": 1.06099677, + "balance_loss_mlp": 1.03559351, + "epoch": 0.12253118893732151, + "flos": 33002048229120.0, + "grad_norm": 3.9071866370587336, + "language_loss": 0.79916793, + "learning_rate": 3.910855657929267e-06, + "loss": 0.8214671, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.19152832, + "step": 2038, + "time_per_iteration": 2.7733047008514404 + }, + { + "auxiliary_loss_clip": 0.01075889, + "auxiliary_loss_mlp": 0.01011336, + "balance_loss_clip": 1.03450847, + "balance_loss_mlp": 1.00931978, + "epoch": 0.12259131218998948, + "flos": 65721816504000.0, + "grad_norm": 0.8184285951032975, + "language_loss": 0.58654767, + "learning_rate": 3.910740642965518e-06, + "loss": 0.60741997, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 0.41430664, + "router_z_loss_mlp": 0.02015686, + "step": 2039, + "time_per_iteration": 3.1227564811706543 + }, + { + "auxiliary_loss_clip": 0.01176285, + "auxiliary_loss_mlp": 0.01052717, + "balance_loss_clip": 1.06103206, + "balance_loss_mlp": 1.03296423, + "epoch": 0.12265143544265744, + "flos": 21831812565120.0, + "grad_norm": 2.245219908026882, + "language_loss": 0.80608791, + "learning_rate": 3.910625555546292e-06, + "loss": 0.82837796, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.19763184, + "step": 2040, + "time_per_iteration": 2.6618659496307373 + }, + { + "auxiliary_loss_clip": 0.0116921, + "auxiliary_loss_mlp": 0.01043033, + "balance_loss_clip": 1.05802357, + "balance_loss_mlp": 1.02442479, + "epoch": 0.12271155869532542, + "flos": 26600300549280.0, + "grad_norm": 1.78718418015588, + "language_loss": 0.82998371, + "learning_rate": 3.910510395675953e-06, + "loss": 0.85210621, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.18615723, + "step": 2041, + "time_per_iteration": 2.7678959369659424 + }, + { + "auxiliary_loss_clip": 0.01174827, + "auxiliary_loss_mlp": 0.01050413, + "balance_loss_clip": 1.05897427, + "balance_loss_mlp": 1.03057647, + "epoch": 0.12277168194799339, + "flos": 24195011005440.0, + "grad_norm": 1.6393104968968, + "language_loss": 0.67225206, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.69450444, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.19836426, + "step": 2042, + "time_per_iteration": 2.840263843536377 + }, + { + "auxiliary_loss_clip": 0.01170472, + "auxiliary_loss_mlp": 0.01047615, + "balance_loss_clip": 1.0577302, + "balance_loss_mlp": 1.02970982, + "epoch": 0.12283180520066135, + "flos": 28334597640480.0, + "grad_norm": 1.7084209456061585, + "language_loss": 0.81736135, + "learning_rate": 3.910279858599409e-06, + "loss": 0.83954215, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.17907715, + "step": 2043, + "time_per_iteration": 2.704580783843994 + }, + { + "auxiliary_loss_clip": 0.01169376, + "auxiliary_loss_mlp": 0.0104351, + "balance_loss_clip": 1.05574989, + "balance_loss_mlp": 1.02450812, + "epoch": 0.12289192845332933, + "flos": 22681501338240.0, + "grad_norm": 2.001297375037454, + "language_loss": 0.80273712, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82486594, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.19006348, + "step": 2044, + "time_per_iteration": 2.672830104827881 + }, + { + "auxiliary_loss_clip": 0.0116858, + "auxiliary_loss_mlp": 0.01046087, + "balance_loss_clip": 1.059183, + "balance_loss_mlp": 1.02751458, + "epoch": 0.1229520517059973, + "flos": 31444988836320.0, + "grad_norm": 1.8079960442570853, + "language_loss": 0.78500992, + "learning_rate": 3.910049031770853e-06, + "loss": 0.80715662, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.18566895, + "step": 2045, + "time_per_iteration": 2.720757484436035 + }, + { + "auxiliary_loss_clip": 0.01177347, + "auxiliary_loss_mlp": 0.01055184, + "balance_loss_clip": 1.06022763, + "balance_loss_mlp": 1.03569281, + "epoch": 0.12301217495866526, + "flos": 25486517730240.0, + "grad_norm": 1.9295131110013597, + "language_loss": 0.67609811, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69842333, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 1.17089844, + "router_z_loss_mlp": 0.19494629, + "step": 2046, + "time_per_iteration": 2.726041555404663 + }, + { + "auxiliary_loss_clip": 0.01173612, + "auxiliary_loss_mlp": 0.01047748, + "balance_loss_clip": 1.06130075, + "balance_loss_mlp": 1.02924705, + "epoch": 0.12307229821133324, + "flos": 27307144444320.0, + "grad_norm": 2.038261767623374, + "language_loss": 0.72691071, + "learning_rate": 3.909817915225297e-06, + "loss": 0.74912435, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.18493652, + "step": 2047, + "time_per_iteration": 2.717780113220215 + }, + { + "auxiliary_loss_clip": 0.01169601, + "auxiliary_loss_mlp": 0.01054716, + "balance_loss_clip": 1.05791378, + "balance_loss_mlp": 1.03554714, + "epoch": 0.1231324214640012, + "flos": 28513172409120.0, + "grad_norm": 1.802035308562596, + "language_loss": 0.76668131, + "learning_rate": 3.909702248319597e-06, + "loss": 0.78892446, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.19177246, + "step": 2048, + "time_per_iteration": 2.720522880554199 + }, + { + "auxiliary_loss_clip": 0.01168889, + "auxiliary_loss_mlp": 0.01044491, + "balance_loss_clip": 1.05928898, + "balance_loss_mlp": 1.02786171, + "epoch": 0.12319254471666917, + "flos": 29001133192320.0, + "grad_norm": 2.9496642501005454, + "language_loss": 0.8463136, + "learning_rate": 3.909586508997797e-06, + "loss": 0.86844742, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.16625977, + "step": 2049, + "time_per_iteration": 2.9492838382720947 + }, + { + "auxiliary_loss_clip": 0.01173662, + "auxiliary_loss_mlp": 0.01044622, + "balance_loss_clip": 1.05944097, + "balance_loss_mlp": 1.02597785, + "epoch": 0.12325266796933713, + "flos": 28736147766240.0, + "grad_norm": 1.804059917932264, + "language_loss": 0.75654083, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77872366, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.18640137, + "step": 2050, + "time_per_iteration": 2.701793670654297 + }, + { + "auxiliary_loss_clip": 0.01173315, + "auxiliary_loss_mlp": 0.01043406, + "balance_loss_clip": 1.05938363, + "balance_loss_mlp": 1.02497661, + "epoch": 0.12331279122200511, + "flos": 29802653269920.0, + "grad_norm": 2.350401376651227, + "language_loss": 0.80881375, + "learning_rate": 3.909354813123452e-06, + "loss": 0.83098102, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.18432617, + "step": 2051, + "time_per_iteration": 2.7317211627960205 + }, + { + "auxiliary_loss_clip": 0.01170989, + "auxiliary_loss_mlp": 0.01045334, + "balance_loss_clip": 1.06071627, + "balance_loss_mlp": 1.02749991, + "epoch": 0.12337291447467308, + "flos": 31096307410560.0, + "grad_norm": 2.752938577263642, + "language_loss": 0.80148053, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82364374, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.1784668, + "step": 2052, + "time_per_iteration": 2.718263626098633 + }, + { + "auxiliary_loss_clip": 0.01176163, + "auxiliary_loss_mlp": 0.01049581, + "balance_loss_clip": 1.06028306, + "balance_loss_mlp": 1.03000665, + "epoch": 0.12343303772734104, + "flos": 28735985697120.0, + "grad_norm": 2.632834701769225, + "language_loss": 0.74438137, + "learning_rate": 3.909122827637406e-06, + "loss": 0.76663882, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.19592285, + "step": 2053, + "time_per_iteration": 2.702644109725952 + }, + { + "auxiliary_loss_clip": 0.0117276, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_clip": 1.05694842, + "balance_loss_mlp": 1.0308305, + "epoch": 0.12349316098000902, + "flos": 58031310924000.0, + "grad_norm": 1.5973667387261234, + "language_loss": 0.74226105, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76448458, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.18762207, + "step": 2054, + "time_per_iteration": 2.8953335285186768 + }, + { + "auxiliary_loss_clip": 0.01169269, + "auxiliary_loss_mlp": 0.01042205, + "balance_loss_clip": 1.05792725, + "balance_loss_mlp": 1.02525342, + "epoch": 0.12355328423267699, + "flos": 30561474484800.0, + "grad_norm": 1.8833551112528344, + "language_loss": 0.85317373, + "learning_rate": 3.908890552574849e-06, + "loss": 0.87528849, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.16955566, + "step": 2055, + "time_per_iteration": 2.715590238571167 + }, + { + "auxiliary_loss_clip": 0.01172075, + "auxiliary_loss_mlp": 0.0104795, + "balance_loss_clip": 1.05828011, + "balance_loss_mlp": 1.03096294, + "epoch": 0.12361340748534495, + "flos": 33811307107200.0, + "grad_norm": 2.0249991943035965, + "language_loss": 0.7790308, + "learning_rate": 3.908774306463384e-06, + "loss": 0.80123109, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 1.13867188, + "router_z_loss_mlp": 0.17004395, + "step": 2056, + "time_per_iteration": 2.765516996383667 + }, + { + "auxiliary_loss_clip": 0.01171595, + "auxiliary_loss_mlp": 0.0105424, + "balance_loss_clip": 1.05739701, + "balance_loss_mlp": 1.03564334, + "epoch": 0.12367353073801293, + "flos": 31897219728960.0, + "grad_norm": 1.8678349248346997, + "language_loss": 0.82883382, + "learning_rate": 3.908657987971009e-06, + "loss": 0.8510921, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.18591309, + "step": 2057, + "time_per_iteration": 2.6968016624450684 + }, + { + "auxiliary_loss_clip": 0.01174062, + "auxiliary_loss_mlp": 0.01052492, + "balance_loss_clip": 1.05872464, + "balance_loss_mlp": 1.03364539, + "epoch": 0.1237336539906809, + "flos": 30695729699520.0, + "grad_norm": 1.5851145600253251, + "language_loss": 0.77822876, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80049425, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.18847656, + "step": 2058, + "time_per_iteration": 2.7195167541503906 + }, + { + "auxiliary_loss_clip": 0.01174577, + "auxiliary_loss_mlp": 0.01050661, + "balance_loss_clip": 1.05827546, + "balance_loss_mlp": 1.03161168, + "epoch": 0.12379377724334886, + "flos": 18674265605760.0, + "grad_norm": 2.130820038911758, + "language_loss": 0.83431721, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85656953, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 1.16210938, + "router_z_loss_mlp": 0.19055176, + "step": 2059, + "time_per_iteration": 2.641227960586548 + }, + { + "auxiliary_loss_clip": 0.01177417, + "auxiliary_loss_mlp": 0.01058445, + "balance_loss_clip": 1.0605042, + "balance_loss_mlp": 1.03805995, + "epoch": 0.12385390049601683, + "flos": 26011464439680.0, + "grad_norm": 2.851140361760446, + "language_loss": 0.80777085, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83012944, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.20397949, + "step": 2060, + "time_per_iteration": 2.6988370418548584 + }, + { + "auxiliary_loss_clip": 0.01172446, + "auxiliary_loss_mlp": 0.01050691, + "balance_loss_clip": 1.05767512, + "balance_loss_mlp": 1.03160524, + "epoch": 0.1239140237486848, + "flos": 18445050587520.0, + "grad_norm": 2.2318962470145056, + "language_loss": 0.86540627, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88763762, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.19091797, + "step": 2061, + "time_per_iteration": 2.639578342437744 + }, + { + "auxiliary_loss_clip": 0.01168345, + "auxiliary_loss_mlp": 0.01040938, + "balance_loss_clip": 1.05837512, + "balance_loss_mlp": 1.02364099, + "epoch": 0.12397414700135277, + "flos": 26816631072480.0, + "grad_norm": 1.8177348613710334, + "language_loss": 0.84939837, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87149119, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.1730957, + "step": 2062, + "time_per_iteration": 2.981971502304077 + }, + { + "auxiliary_loss_clip": 0.01173939, + "auxiliary_loss_mlp": 0.01044949, + "balance_loss_clip": 1.06137919, + "balance_loss_mlp": 1.0261023, + "epoch": 0.12403427025402074, + "flos": 16351821198720.0, + "grad_norm": 2.177242337765328, + "language_loss": 0.78527665, + "learning_rate": 3.907958557264774e-06, + "loss": 0.80746561, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.18847656, + "step": 2063, + "time_per_iteration": 2.7190301418304443 + }, + { + "auxiliary_loss_clip": 0.01172156, + "auxiliary_loss_mlp": 0.01051576, + "balance_loss_clip": 1.05964661, + "balance_loss_mlp": 1.03178763, + "epoch": 0.12409439350668872, + "flos": 18674022502080.0, + "grad_norm": 2.500856276270775, + "language_loss": 0.79267514, + "learning_rate": 3.907841732229663e-06, + "loss": 0.81491244, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.19775391, + "step": 2064, + "time_per_iteration": 2.697993278503418 + }, + { + "auxiliary_loss_clip": 0.01169913, + "auxiliary_loss_mlp": 0.01052067, + "balance_loss_clip": 1.05743074, + "balance_loss_mlp": 1.0335536, + "epoch": 0.12415451675935668, + "flos": 30516830792640.0, + "grad_norm": 2.8148573670312773, + "language_loss": 0.92492777, + "learning_rate": 3.907724834849002e-06, + "loss": 0.94714761, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.18505859, + "step": 2065, + "time_per_iteration": 2.7834742069244385 + }, + { + "auxiliary_loss_clip": 0.011767, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_clip": 1.05987668, + "balance_loss_mlp": 1.02742577, + "epoch": 0.12421464001202465, + "flos": 28869795221760.0, + "grad_norm": 1.6980522585268465, + "language_loss": 0.80757594, + "learning_rate": 3.907607865127225e-06, + "loss": 0.82981145, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.19421387, + "step": 2066, + "time_per_iteration": 2.722198724746704 + }, + { + "auxiliary_loss_clip": 0.01073866, + "auxiliary_loss_mlp": 0.01001707, + "balance_loss_clip": 1.03343964, + "balance_loss_mlp": 0.99953097, + "epoch": 0.12427476326469263, + "flos": 80206949191680.0, + "grad_norm": 0.9195917739553797, + "language_loss": 0.63268065, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65343642, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 0.40380859, + "router_z_loss_mlp": 0.02178955, + "step": 2067, + "time_per_iteration": 3.238013982772827 + }, + { + "auxiliary_loss_clip": 0.01173569, + "auxiliary_loss_mlp": 0.01050062, + "balance_loss_clip": 1.0593636, + "balance_loss_mlp": 1.03154886, + "epoch": 0.12433488651736059, + "flos": 29938448141280.0, + "grad_norm": 2.1775438094006176, + "language_loss": 0.93251407, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95475042, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.18505859, + "step": 2068, + "time_per_iteration": 4.30002236366272 + }, + { + "auxiliary_loss_clip": 0.01175625, + "auxiliary_loss_mlp": 0.0104789, + "balance_loss_clip": 1.06160522, + "balance_loss_mlp": 1.03096187, + "epoch": 0.12439500977002856, + "flos": 25662985600320.0, + "grad_norm": 2.5270099685493634, + "language_loss": 0.81025362, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83248878, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.16918945, + "step": 2069, + "time_per_iteration": 2.6799979209899902 + }, + { + "auxiliary_loss_clip": 0.01178138, + "auxiliary_loss_mlp": 0.01054816, + "balance_loss_clip": 1.0613997, + "balance_loss_mlp": 1.03557575, + "epoch": 0.12445513302269653, + "flos": 32739534357120.0, + "grad_norm": 1.57645797834845, + "language_loss": 0.77363729, + "learning_rate": 3.907139262917696e-06, + "loss": 0.7959668, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 1.16699219, + "router_z_loss_mlp": 0.19226074, + "step": 2070, + "time_per_iteration": 2.743494749069214 + }, + { + "auxiliary_loss_clip": 0.01176215, + "auxiliary_loss_mlp": 0.01046629, + "balance_loss_clip": 1.06130528, + "balance_loss_mlp": 1.02858102, + "epoch": 0.1245152562753645, + "flos": 22413639185280.0, + "grad_norm": 2.193420364798396, + "language_loss": 0.81012696, + "learning_rate": 3.907021931556922e-06, + "loss": 0.83235538, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.18054199, + "step": 2071, + "time_per_iteration": 5.559623718261719 + }, + { + "auxiliary_loss_clip": 0.01170676, + "auxiliary_loss_mlp": 0.01049111, + "balance_loss_clip": 1.05945826, + "balance_loss_mlp": 1.03006136, + "epoch": 0.12457537952803246, + "flos": 40399489908000.0, + "grad_norm": 2.1405002891336196, + "language_loss": 0.7823891, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80458695, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.19042969, + "step": 2072, + "time_per_iteration": 2.7654881477355957 + }, + { + "auxiliary_loss_clip": 0.01171601, + "auxiliary_loss_mlp": 0.01048408, + "balance_loss_clip": 1.05924606, + "balance_loss_mlp": 1.02982283, + "epoch": 0.12463550278070043, + "flos": 27174550438080.0, + "grad_norm": 2.0535971905840076, + "language_loss": 0.75305092, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77525103, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.18591309, + "step": 2073, + "time_per_iteration": 2.9132204055786133 + }, + { + "auxiliary_loss_clip": 0.01169023, + "auxiliary_loss_mlp": 0.01047515, + "balance_loss_clip": 1.05583167, + "balance_loss_mlp": 1.02859676, + "epoch": 0.12469562603336841, + "flos": 17911392662880.0, + "grad_norm": 1.8855313421905726, + "language_loss": 0.9104144, + "learning_rate": 3.906669503605631e-06, + "loss": 0.93257976, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.18920898, + "step": 2074, + "time_per_iteration": 2.6392037868499756 + }, + { + "auxiliary_loss_clip": 0.01174733, + "auxiliary_loss_mlp": 0.01050988, + "balance_loss_clip": 1.05796432, + "balance_loss_mlp": 1.03061485, + "epoch": 0.12475574928603637, + "flos": 30071893010400.0, + "grad_norm": 2.1608167547869455, + "language_loss": 0.84521937, + "learning_rate": 3.906551883013728e-06, + "loss": 0.86747652, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.20361328, + "step": 2075, + "time_per_iteration": 2.679715156555176 + }, + { + "auxiliary_loss_clip": 0.01171385, + "auxiliary_loss_mlp": 0.01051479, + "balance_loss_clip": 1.05775118, + "balance_loss_mlp": 1.03142774, + "epoch": 0.12481587253870434, + "flos": 26555251684320.0, + "grad_norm": 1.926002939541869, + "language_loss": 0.74082565, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.76305425, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.20056152, + "step": 2076, + "time_per_iteration": 2.681137800216675 + }, + { + "auxiliary_loss_clip": 0.01167422, + "auxiliary_loss_mlp": 0.01039047, + "balance_loss_clip": 1.05833292, + "balance_loss_mlp": 1.02155936, + "epoch": 0.12487599579137232, + "flos": 26152202419200.0, + "grad_norm": 1.809849707356836, + "language_loss": 0.75576478, + "learning_rate": 3.906316424944469e-06, + "loss": 0.77782947, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.17504883, + "step": 2077, + "time_per_iteration": 2.6800107955932617 + }, + { + "auxiliary_loss_clip": 0.0116991, + "auxiliary_loss_mlp": 0.01049657, + "balance_loss_clip": 1.05734587, + "balance_loss_mlp": 1.03034556, + "epoch": 0.12493611904404028, + "flos": 19654157865600.0, + "grad_norm": 2.6240077400068698, + "language_loss": 0.8251766, + "learning_rate": 3.906198587476043e-06, + "loss": 0.84737229, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.19299316, + "step": 2078, + "time_per_iteration": 2.6826138496398926 + }, + { + "auxiliary_loss_clip": 0.01173938, + "auxiliary_loss_mlp": 0.01044996, + "balance_loss_clip": 1.05954099, + "balance_loss_mlp": 1.02577972, + "epoch": 0.12499624229670825, + "flos": 26332316844480.0, + "grad_norm": 1.814211273402256, + "language_loss": 0.75024211, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77243149, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.19213867, + "step": 2079, + "time_per_iteration": 2.705230951309204 + }, + { + "auxiliary_loss_clip": 0.01181988, + "auxiliary_loss_mlp": 0.01051099, + "balance_loss_clip": 1.06598711, + "balance_loss_mlp": 1.03250265, + "epoch": 0.1250563655493762, + "flos": 31358740248000.0, + "grad_norm": 2.098371373557715, + "language_loss": 0.83481526, + "learning_rate": 3.905962695693935e-06, + "loss": 0.85714614, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.18591309, + "step": 2080, + "time_per_iteration": 2.7532832622528076 + }, + { + "auxiliary_loss_clip": 0.01171682, + "auxiliary_loss_mlp": 0.01054905, + "balance_loss_clip": 1.05907726, + "balance_loss_mlp": 1.03580737, + "epoch": 0.12511648880204418, + "flos": 20632753572480.0, + "grad_norm": 2.2028824865275274, + "language_loss": 0.85036051, + "learning_rate": 3.9058446413892e-06, + "loss": 0.8726263, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.19091797, + "step": 2081, + "time_per_iteration": 2.6617891788482666 + }, + { + "auxiliary_loss_clip": 0.0117146, + "auxiliary_loss_mlp": 0.01043673, + "balance_loss_clip": 1.0592165, + "balance_loss_mlp": 1.02580357, + "epoch": 0.12517661205471217, + "flos": 21435570203040.0, + "grad_norm": 1.6365275950437423, + "language_loss": 0.7690258, + "learning_rate": 3.905726514814646e-06, + "loss": 0.79117715, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.17858887, + "step": 2082, + "time_per_iteration": 2.713789939880371 + }, + { + "auxiliary_loss_clip": 0.01186265, + "auxiliary_loss_mlp": 0.01051697, + "balance_loss_clip": 1.06472433, + "balance_loss_mlp": 1.03219426, + "epoch": 0.12523673530738014, + "flos": 19564141170240.0, + "grad_norm": 2.4053134316171847, + "language_loss": 0.78919506, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.8115747, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 1.21484375, + "router_z_loss_mlp": 0.19506836, + "step": 2083, + "time_per_iteration": 2.665513515472412 + }, + { + "auxiliary_loss_clip": 0.01173852, + "auxiliary_loss_mlp": 0.01046849, + "balance_loss_clip": 1.05773747, + "balance_loss_mlp": 1.02675056, + "epoch": 0.1252968585600481, + "flos": 22949039352960.0, + "grad_norm": 2.3189627803066677, + "language_loss": 0.90285552, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.92506242, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.2010498, + "step": 2084, + "time_per_iteration": 2.7211263179779053 + }, + { + "auxiliary_loss_clip": 0.0117379, + "auxiliary_loss_mlp": 0.01051595, + "balance_loss_clip": 1.06083965, + "balance_loss_mlp": 1.03328419, + "epoch": 0.12535698181271607, + "flos": 33277325044320.0, + "grad_norm": 2.219950515572111, + "language_loss": 0.80003583, + "learning_rate": 3.905371701516869e-06, + "loss": 0.82228965, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.18322754, + "step": 2085, + "time_per_iteration": 2.775491952896118 + }, + { + "auxiliary_loss_clip": 0.01171269, + "auxiliary_loss_mlp": 0.01050418, + "balance_loss_clip": 1.05811751, + "balance_loss_mlp": 1.03104615, + "epoch": 0.12541710506538403, + "flos": 26910577944000.0, + "grad_norm": 2.008844994151475, + "language_loss": 0.88071418, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90293097, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.19372559, + "step": 2086, + "time_per_iteration": 2.9134459495544434 + }, + { + "auxiliary_loss_clip": 0.01168321, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.0599072, + "balance_loss_mlp": 1.02528083, + "epoch": 0.125477228318052, + "flos": 15377925496320.0, + "grad_norm": 2.422137816508235, + "language_loss": 0.86848289, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89059412, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.1751709, + "step": 2087, + "time_per_iteration": 2.6545956134796143 + }, + { + "auxiliary_loss_clip": 0.01171731, + "auxiliary_loss_mlp": 0.01049933, + "balance_loss_clip": 1.05881095, + "balance_loss_mlp": 1.02996528, + "epoch": 0.12553735157071996, + "flos": 29136887546400.0, + "grad_norm": 2.7448264357600207, + "language_loss": 0.73479033, + "learning_rate": 3.905016237952136e-06, + "loss": 0.757007, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 1.12890625, + "router_z_loss_mlp": 0.19970703, + "step": 2088, + "time_per_iteration": 2.748185634613037 + }, + { + "auxiliary_loss_clip": 0.01075138, + "auxiliary_loss_mlp": 0.01004735, + "balance_loss_clip": 1.03373384, + "balance_loss_mlp": 1.0024302, + "epoch": 0.12559747482338796, + "flos": 85318122044160.0, + "grad_norm": 0.773039414182871, + "language_loss": 0.61719763, + "learning_rate": 3.904897605614418e-06, + "loss": 0.63799644, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 0.41430664, + "router_z_loss_mlp": 0.02302551, + "step": 2089, + "time_per_iteration": 3.288240909576416 + }, + { + "auxiliary_loss_clip": 0.01167621, + "auxiliary_loss_mlp": 0.0105249, + "balance_loss_clip": 1.05773842, + "balance_loss_mlp": 1.03372645, + "epoch": 0.12565759807605592, + "flos": 29626266434400.0, + "grad_norm": 3.16034941521357, + "language_loss": 0.7766124, + "learning_rate": 3.904778901042793e-06, + "loss": 0.79881358, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.18786621, + "step": 2090, + "time_per_iteration": 2.6935579776763916 + }, + { + "auxiliary_loss_clip": 0.01072603, + "auxiliary_loss_mlp": 0.01005359, + "balance_loss_clip": 1.03105605, + "balance_loss_mlp": 1.00315464, + "epoch": 0.12571772132872389, + "flos": 68881997086560.0, + "grad_norm": 1.0893623149516103, + "language_loss": 0.59374231, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61452192, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 0.4152832, + "router_z_loss_mlp": 0.02204895, + "step": 2091, + "time_per_iteration": 3.1473169326782227 + }, + { + "auxiliary_loss_clip": 0.01171774, + "auxiliary_loss_mlp": 0.01042352, + "balance_loss_clip": 1.0597198, + "balance_loss_mlp": 1.02438688, + "epoch": 0.12577784458139185, + "flos": 50329790994240.0, + "grad_norm": 1.6861222008185095, + "language_loss": 0.63249409, + "learning_rate": 3.904541275215825e-06, + "loss": 0.65463531, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.17956543, + "step": 2092, + "time_per_iteration": 2.829087018966675 + }, + { + "auxiliary_loss_clip": 0.01175602, + "auxiliary_loss_mlp": 0.01059709, + "balance_loss_clip": 1.05785298, + "balance_loss_mlp": 1.04019475, + "epoch": 0.12583796783405982, + "flos": 24105642586560.0, + "grad_norm": 2.097506112213218, + "language_loss": 0.80685335, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82920647, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.19506836, + "step": 2093, + "time_per_iteration": 2.6917989253997803 + }, + { + "auxiliary_loss_clip": 0.01169264, + "auxiliary_loss_mlp": 0.0105081, + "balance_loss_clip": 1.05677938, + "balance_loss_mlp": 1.0320344, + "epoch": 0.12589809108672778, + "flos": 27579220394400.0, + "grad_norm": 1.778076572094949, + "language_loss": 0.75857127, + "learning_rate": 3.904303360507276e-06, + "loss": 0.78077209, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.18774414, + "step": 2094, + "time_per_iteration": 2.714392900466919 + }, + { + "auxiliary_loss_clip": 0.01164746, + "auxiliary_loss_mlp": 0.01052256, + "balance_loss_clip": 1.05406654, + "balance_loss_mlp": 1.03386199, + "epoch": 0.12595821433939577, + "flos": 55182825840960.0, + "grad_norm": 1.5828273349466566, + "language_loss": 0.76849383, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.79066384, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.18395996, + "step": 2095, + "time_per_iteration": 2.9889140129089355 + }, + { + "auxiliary_loss_clip": 0.0117077, + "auxiliary_loss_mlp": 0.01052038, + "balance_loss_clip": 1.05565751, + "balance_loss_mlp": 1.03288114, + "epoch": 0.12601833759206374, + "flos": 17472532472640.0, + "grad_norm": 3.25370623198741, + "language_loss": 0.82570922, + "learning_rate": 3.904065156953232e-06, + "loss": 0.84793735, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.19152832, + "step": 2096, + "time_per_iteration": 2.7052571773529053 + }, + { + "auxiliary_loss_clip": 0.01171556, + "auxiliary_loss_mlp": 0.01048842, + "balance_loss_clip": 1.05695748, + "balance_loss_mlp": 1.03005493, + "epoch": 0.1260784608447317, + "flos": 25975977652800.0, + "grad_norm": 3.3127304764715784, + "language_loss": 0.75622439, + "learning_rate": 3.903945946870439e-06, + "loss": 0.77842844, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.18798828, + "step": 2097, + "time_per_iteration": 2.684652090072632 + }, + { + "auxiliary_loss_clip": 0.01171081, + "auxiliary_loss_mlp": 0.01060457, + "balance_loss_clip": 1.05816865, + "balance_loss_mlp": 1.04334998, + "epoch": 0.12613858409739967, + "flos": 32032204254720.0, + "grad_norm": 2.1325930499941395, + "language_loss": 0.87674105, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89905643, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.17102051, + "step": 2098, + "time_per_iteration": 2.9786839485168457 + }, + { + "auxiliary_loss_clip": 0.01175587, + "auxiliary_loss_mlp": 0.01057323, + "balance_loss_clip": 1.057567, + "balance_loss_mlp": 1.03618717, + "epoch": 0.12619870735006763, + "flos": 26331992706240.0, + "grad_norm": 2.646110675192514, + "language_loss": 0.69424224, + "learning_rate": 3.903707310115912e-06, + "loss": 0.71657133, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 1.1796875, + "router_z_loss_mlp": 0.21118164, + "step": 2099, + "time_per_iteration": 2.652923107147217 + }, + { + "auxiliary_loss_clip": 0.01170305, + "auxiliary_loss_mlp": 0.01057807, + "balance_loss_clip": 1.05623984, + "balance_loss_mlp": 1.0378871, + "epoch": 0.1262588306027356, + "flos": 28512767236320.0, + "grad_norm": 2.104846745858824, + "language_loss": 0.81362641, + "learning_rate": 3.903587883453228e-06, + "loss": 0.83590752, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.19909668, + "step": 2100, + "time_per_iteration": 2.7719733715057373 + }, + { + "auxiliary_loss_clip": 0.0117379, + "auxiliary_loss_mlp": 0.01051675, + "balance_loss_clip": 1.05906594, + "balance_loss_mlp": 1.0331378, + "epoch": 0.12631895385540357, + "flos": 29223298203840.0, + "grad_norm": 1.8324549140574578, + "language_loss": 0.80744022, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82969487, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.1854248, + "step": 2101, + "time_per_iteration": 2.672137975692749 + }, + { + "auxiliary_loss_clip": 0.01068224, + "auxiliary_loss_mlp": 0.01019173, + "balance_loss_clip": 1.02773547, + "balance_loss_mlp": 1.01688981, + "epoch": 0.12637907710807156, + "flos": 85759332236640.0, + "grad_norm": 0.7031004300230402, + "language_loss": 0.57096994, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59184384, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 0.40405273, + "router_z_loss_mlp": 0.02284241, + "step": 2102, + "time_per_iteration": 3.334517002105713 + }, + { + "auxiliary_loss_clip": 0.01175026, + "auxiliary_loss_mlp": 0.01054979, + "balance_loss_clip": 1.05899501, + "balance_loss_mlp": 1.03615594, + "epoch": 0.12643920036073952, + "flos": 23080215254400.0, + "grad_norm": 1.9732024616513808, + "language_loss": 0.93020886, + "learning_rate": 3.903229170377845e-06, + "loss": 0.95250893, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 1.15966797, + "router_z_loss_mlp": 0.18823242, + "step": 2103, + "time_per_iteration": 2.6624808311462402 + }, + { + "auxiliary_loss_clip": 0.01163313, + "auxiliary_loss_mlp": 0.01039578, + "balance_loss_clip": 1.055879, + "balance_loss_mlp": 1.02286446, + "epoch": 0.1264993236134075, + "flos": 33900756560640.0, + "grad_norm": 1.7792139580212316, + "language_loss": 0.77973801, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80176693, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.16711426, + "step": 2104, + "time_per_iteration": 2.729719877243042 + }, + { + "auxiliary_loss_clip": 0.0117539, + "auxiliary_loss_mlp": 0.01052406, + "balance_loss_clip": 1.06061006, + "balance_loss_mlp": 1.03501308, + "epoch": 0.12655944686607545, + "flos": 30205986156000.0, + "grad_norm": 2.2750740766064954, + "language_loss": 0.81179082, + "learning_rate": 3.902989667466828e-06, + "loss": 0.83406872, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.1739502, + "step": 2105, + "time_per_iteration": 2.703857421875 + }, + { + "auxiliary_loss_clip": 0.01177457, + "auxiliary_loss_mlp": 0.01055679, + "balance_loss_clip": 1.05974698, + "balance_loss_mlp": 1.03624761, + "epoch": 0.12661957011874342, + "flos": 29447529596640.0, + "grad_norm": 1.9399027746738342, + "language_loss": 0.83052891, + "learning_rate": 3.90286980776671e-06, + "loss": 0.85286021, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 1.17675781, + "router_z_loss_mlp": 0.19433594, + "step": 2106, + "time_per_iteration": 2.719785690307617 + }, + { + "auxiliary_loss_clip": 0.01171573, + "auxiliary_loss_mlp": 0.01044244, + "balance_loss_clip": 1.05986571, + "balance_loss_mlp": 1.02583778, + "epoch": 0.12667969337141138, + "flos": 29980377175680.0, + "grad_norm": 1.6870036659253949, + "language_loss": 0.73582143, + "learning_rate": 3.902749875909578e-06, + "loss": 0.75797963, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.18383789, + "step": 2107, + "time_per_iteration": 5.653555154800415 + }, + { + "auxiliary_loss_clip": 0.01167438, + "auxiliary_loss_mlp": 0.01045418, + "balance_loss_clip": 1.05745888, + "balance_loss_mlp": 1.02771568, + "epoch": 0.12673981662407935, + "flos": 28018485757440.0, + "grad_norm": 1.8358828956458808, + "language_loss": 0.79269648, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81482506, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.17712402, + "step": 2108, + "time_per_iteration": 2.7631852626800537 + }, + { + "auxiliary_loss_clip": 0.01167687, + "auxiliary_loss_mlp": 0.01047098, + "balance_loss_clip": 1.0536319, + "balance_loss_mlp": 1.0291574, + "epoch": 0.12679993987674734, + "flos": 21078542217600.0, + "grad_norm": 2.15088046363469, + "language_loss": 0.75919282, + "learning_rate": 3.902509795742467e-06, + "loss": 0.78134072, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.17932129, + "step": 2109, + "time_per_iteration": 2.817492961883545 + }, + { + "auxiliary_loss_clip": 0.01168668, + "auxiliary_loss_mlp": 0.01048516, + "balance_loss_clip": 1.05830133, + "balance_loss_mlp": 1.03036094, + "epoch": 0.1268600631294153, + "flos": 21078785321280.0, + "grad_norm": 1.8297037788369588, + "language_loss": 0.82856703, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85073882, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.18164062, + "step": 2110, + "time_per_iteration": 4.118005752563477 + }, + { + "auxiliary_loss_clip": 0.01168586, + "auxiliary_loss_mlp": 0.01051738, + "balance_loss_clip": 1.05712855, + "balance_loss_mlp": 1.03293896, + "epoch": 0.12692018638208327, + "flos": 29359579282560.0, + "grad_norm": 1.5787329884398558, + "language_loss": 0.78192383, + "learning_rate": 3.90226942700191e-06, + "loss": 0.8041271, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.18798828, + "step": 2111, + "time_per_iteration": 4.099352598190308 + }, + { + "auxiliary_loss_clip": 0.01178297, + "auxiliary_loss_mlp": 0.01066032, + "balance_loss_clip": 1.05806279, + "balance_loss_mlp": 1.04574239, + "epoch": 0.12698030963475124, + "flos": 38842227928800.0, + "grad_norm": 2.290955483589265, + "language_loss": 0.76938993, + "learning_rate": 3.902149134427982e-06, + "loss": 0.79183316, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 1.203125, + "router_z_loss_mlp": 0.20288086, + "step": 2112, + "time_per_iteration": 2.760340929031372 + }, + { + "auxiliary_loss_clip": 0.01167994, + "auxiliary_loss_mlp": 0.0105419, + "balance_loss_clip": 1.05578792, + "balance_loss_mlp": 1.03583181, + "epoch": 0.1270404328874192, + "flos": 30734863041600.0, + "grad_norm": 2.1936849783113264, + "language_loss": 0.85454142, + "learning_rate": 3.902028769724367e-06, + "loss": 0.87676328, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.18371582, + "step": 2113, + "time_per_iteration": 2.666647434234619 + }, + { + "auxiliary_loss_clip": 0.0116936, + "auxiliary_loss_mlp": 0.01060403, + "balance_loss_clip": 1.05701387, + "balance_loss_mlp": 1.04113913, + "epoch": 0.12710055614008717, + "flos": 19519659547200.0, + "grad_norm": 2.094434127009472, + "language_loss": 0.73873669, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.76103431, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.19262695, + "step": 2114, + "time_per_iteration": 2.625844717025757 + }, + { + "auxiliary_loss_clip": 0.01171658, + "auxiliary_loss_mlp": 0.01054881, + "balance_loss_clip": 1.05972075, + "balance_loss_mlp": 1.03592622, + "epoch": 0.12716067939275516, + "flos": 18407092246560.0, + "grad_norm": 2.1675702392519165, + "language_loss": 0.83468521, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85695058, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.18945312, + "step": 2115, + "time_per_iteration": 2.60237979888916 + }, + { + "auxiliary_loss_clip": 0.01170761, + "auxiliary_loss_mlp": 0.01054537, + "balance_loss_clip": 1.05857253, + "balance_loss_mlp": 1.03638184, + "epoch": 0.12722080264542313, + "flos": 34613678047680.0, + "grad_norm": 10.543803587566169, + "language_loss": 0.86815727, + "learning_rate": 3.901667242881065e-06, + "loss": 0.89041018, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.18151855, + "step": 2116, + "time_per_iteration": 2.7481601238250732 + }, + { + "auxiliary_loss_clip": 0.01166109, + "auxiliary_loss_mlp": 0.01044585, + "balance_loss_clip": 1.05595851, + "balance_loss_mlp": 1.0272398, + "epoch": 0.1272809258980911, + "flos": 39510870379200.0, + "grad_norm": 1.65909620446031, + "language_loss": 0.70488298, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72698987, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.17333984, + "step": 2117, + "time_per_iteration": 2.731431722640991 + }, + { + "auxiliary_loss_clip": 0.01171116, + "auxiliary_loss_mlp": 0.01049189, + "balance_loss_clip": 1.05854666, + "balance_loss_mlp": 1.03066373, + "epoch": 0.12734104915075906, + "flos": 19564829964000.0, + "grad_norm": 2.1903998087710246, + "language_loss": 0.86590624, + "learning_rate": 3.901425864420852e-06, + "loss": 0.88810933, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.18530273, + "step": 2118, + "time_per_iteration": 2.634533405303955 + }, + { + "auxiliary_loss_clip": 0.01168202, + "auxiliary_loss_mlp": 0.01046412, + "balance_loss_clip": 1.05718529, + "balance_loss_mlp": 1.02900696, + "epoch": 0.12740117240342702, + "flos": 22281045179040.0, + "grad_norm": 1.8588019768408273, + "language_loss": 0.87336743, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89551353, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.17419434, + "step": 2119, + "time_per_iteration": 2.632723569869995 + }, + { + "auxiliary_loss_clip": 0.01170473, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.05719495, + "balance_loss_mlp": 1.02464128, + "epoch": 0.127461295656095, + "flos": 14791155768000.0, + "grad_norm": 6.18704334632544, + "language_loss": 0.87700522, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89914048, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.18432617, + "step": 2120, + "time_per_iteration": 2.634831666946411 + }, + { + "auxiliary_loss_clip": 0.0116976, + "auxiliary_loss_mlp": 0.01042325, + "balance_loss_clip": 1.05727339, + "balance_loss_mlp": 1.02441978, + "epoch": 0.12752141890876295, + "flos": 28246728360960.0, + "grad_norm": 2.161467214803117, + "language_loss": 0.7619735, + "learning_rate": 3.901063255975046e-06, + "loss": 0.78409439, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.17907715, + "step": 2121, + "time_per_iteration": 2.9226441383361816 + }, + { + "auxiliary_loss_clip": 0.01170299, + "auxiliary_loss_mlp": 0.01043934, + "balance_loss_clip": 1.05900311, + "balance_loss_mlp": 1.02527785, + "epoch": 0.12758154216143094, + "flos": 26375988121920.0, + "grad_norm": 3.8317346823042002, + "language_loss": 0.82997906, + "learning_rate": 3.900942242309978e-06, + "loss": 0.85212147, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.18640137, + "step": 2122, + "time_per_iteration": 2.6628618240356445 + }, + { + "auxiliary_loss_clip": 0.0117365, + "auxiliary_loss_mlp": 0.01049029, + "balance_loss_clip": 1.06114078, + "balance_loss_mlp": 1.0306704, + "epoch": 0.1276416654140989, + "flos": 19431182508480.0, + "grad_norm": 1.8818031490206044, + "language_loss": 0.79309011, + "learning_rate": 3.90082115656099e-06, + "loss": 0.81531692, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.18359375, + "step": 2123, + "time_per_iteration": 2.668344736099243 + }, + { + "auxiliary_loss_clip": 0.01173352, + "auxiliary_loss_mlp": 0.01048336, + "balance_loss_clip": 1.06013525, + "balance_loss_mlp": 1.03035939, + "epoch": 0.12770178866676687, + "flos": 27311520310560.0, + "grad_norm": 1.8399576540218021, + "language_loss": 0.79320323, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81542015, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.17980957, + "step": 2124, + "time_per_iteration": 2.6986031532287598 + }, + { + "auxiliary_loss_clip": 0.0117352, + "auxiliary_loss_mlp": 0.01050385, + "balance_loss_clip": 1.05836606, + "balance_loss_mlp": 1.03243268, + "epoch": 0.12776191191943484, + "flos": 26420510262240.0, + "grad_norm": 2.44822287493271, + "language_loss": 0.75942653, + "learning_rate": 3.900578768829623e-06, + "loss": 0.78166556, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.17938232, + "step": 2125, + "time_per_iteration": 2.673955202102661 + }, + { + "auxiliary_loss_clip": 0.011708, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.05802107, + "balance_loss_mlp": 1.0244596, + "epoch": 0.1278220351721028, + "flos": 31403302905600.0, + "grad_norm": 2.9946947526625474, + "language_loss": 0.77446461, + "learning_rate": 3.900457466856434e-06, + "loss": 0.79660523, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.18798828, + "step": 2126, + "time_per_iteration": 2.685495376586914 + }, + { + "auxiliary_loss_clip": 0.0117376, + "auxiliary_loss_mlp": 0.01047651, + "balance_loss_clip": 1.06298757, + "balance_loss_mlp": 1.03062773, + "epoch": 0.12788215842477077, + "flos": 50326833232800.0, + "grad_norm": 1.787324096309065, + "language_loss": 0.6947819, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71699595, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.17028809, + "step": 2127, + "time_per_iteration": 2.8556346893310547 + }, + { + "auxiliary_loss_clip": 0.01068617, + "auxiliary_loss_mlp": 0.0101162, + "balance_loss_clip": 1.02800035, + "balance_loss_mlp": 1.0091747, + "epoch": 0.12794228167743876, + "flos": 86484692528640.0, + "grad_norm": 0.8572334411721868, + "language_loss": 0.62810171, + "learning_rate": 3.900214646718047e-06, + "loss": 0.64890409, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 0.40576172, + "router_z_loss_mlp": 0.02442932, + "step": 2128, + "time_per_iteration": 3.3322231769561768 + }, + { + "auxiliary_loss_clip": 0.0117221, + "auxiliary_loss_mlp": 0.01041818, + "balance_loss_clip": 1.0575788, + "balance_loss_mlp": 1.02295899, + "epoch": 0.12800240493010673, + "flos": 19876403911680.0, + "grad_norm": 2.0353771640683638, + "language_loss": 0.77329713, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79543746, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 1.14697266, + "router_z_loss_mlp": 0.18835449, + "step": 2129, + "time_per_iteration": 2.7431461811065674 + }, + { + "auxiliary_loss_clip": 0.01181526, + "auxiliary_loss_mlp": 0.01050707, + "balance_loss_clip": 1.06185031, + "balance_loss_mlp": 1.02988148, + "epoch": 0.1280625281827747, + "flos": 25174619644320.0, + "grad_norm": 1.98531499440225, + "language_loss": 0.79295754, + "learning_rate": 3.899971538354343e-06, + "loss": 0.81527996, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 1.1953125, + "router_z_loss_mlp": 0.20837402, + "step": 2130, + "time_per_iteration": 2.64399790763855 + }, + { + "auxiliary_loss_clip": 0.01174336, + "auxiliary_loss_mlp": 0.01045981, + "balance_loss_clip": 1.05959511, + "balance_loss_mlp": 1.02809989, + "epoch": 0.12812265143544266, + "flos": 27623337361920.0, + "grad_norm": 2.100261874798925, + "language_loss": 0.70255083, + "learning_rate": 3.899849876099518e-06, + "loss": 0.72475398, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 1.14648438, + "router_z_loss_mlp": 0.17871094, + "step": 2131, + "time_per_iteration": 2.7001285552978516 + }, + { + "auxiliary_loss_clip": 0.01169934, + "auxiliary_loss_mlp": 0.01048878, + "balance_loss_clip": 1.05806971, + "balance_loss_mlp": 1.03053188, + "epoch": 0.12818277468811062, + "flos": 42360003738720.0, + "grad_norm": 2.6011871849266495, + "language_loss": 0.7233026, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74549073, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.18359375, + "step": 2132, + "time_per_iteration": 2.815157413482666 + }, + { + "auxiliary_loss_clip": 0.01165689, + "auxiliary_loss_mlp": 0.01051784, + "balance_loss_clip": 1.05689502, + "balance_loss_mlp": 1.03367627, + "epoch": 0.1282428979407786, + "flos": 28201638978720.0, + "grad_norm": 1.770048483272667, + "language_loss": 0.81698072, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.83915544, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.18103027, + "step": 2133, + "time_per_iteration": 2.7169036865234375 + }, + { + "auxiliary_loss_clip": 0.01177601, + "auxiliary_loss_mlp": 0.01059675, + "balance_loss_clip": 1.05866051, + "balance_loss_mlp": 1.03991032, + "epoch": 0.12830302119344655, + "flos": 25485788419200.0, + "grad_norm": 2.301154932455482, + "language_loss": 0.79764843, + "learning_rate": 3.899484457098528e-06, + "loss": 0.82002121, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 1.18945312, + "router_z_loss_mlp": 0.19775391, + "step": 2134, + "time_per_iteration": 2.8838982582092285 + }, + { + "auxiliary_loss_clip": 0.01172915, + "auxiliary_loss_mlp": 0.01047282, + "balance_loss_clip": 1.05801249, + "balance_loss_mlp": 1.02876925, + "epoch": 0.12836314444611455, + "flos": 26109625108320.0, + "grad_norm": 1.6905001470628298, + "language_loss": 0.82871127, + "learning_rate": 3.899362506701421e-06, + "loss": 0.85091329, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 1.14941406, + "router_z_loss_mlp": 0.18530273, + "step": 2135, + "time_per_iteration": 2.696131944656372 + }, + { + "auxiliary_loss_clip": 0.01168971, + "auxiliary_loss_mlp": 0.01055126, + "balance_loss_clip": 1.05668211, + "balance_loss_mlp": 1.03639781, + "epoch": 0.1284232676987825, + "flos": 16670445153120.0, + "grad_norm": 2.2365819677138448, + "language_loss": 0.77270997, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79495096, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.18713379, + "step": 2136, + "time_per_iteration": 2.6581692695617676 + }, + { + "auxiliary_loss_clip": 0.01069666, + "auxiliary_loss_mlp": 0.01021328, + "balance_loss_clip": 1.0284946, + "balance_loss_mlp": 1.01920772, + "epoch": 0.12848339095145048, + "flos": 73205709357600.0, + "grad_norm": 0.916661551544666, + "language_loss": 0.59179336, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61270332, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 0.41113281, + "router_z_loss_mlp": 0.02122498, + "step": 2137, + "time_per_iteration": 3.4140079021453857 + }, + { + "auxiliary_loss_clip": 0.01166257, + "auxiliary_loss_mlp": 0.01050671, + "balance_loss_clip": 1.05344677, + "balance_loss_mlp": 1.03324246, + "epoch": 0.12854351420411844, + "flos": 16937577995040.0, + "grad_norm": 3.105975595642218, + "language_loss": 0.82056904, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84273839, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.17431641, + "step": 2138, + "time_per_iteration": 2.631930112838745 + }, + { + "auxiliary_loss_clip": 0.01171764, + "auxiliary_loss_mlp": 0.01046472, + "balance_loss_clip": 1.05653834, + "balance_loss_mlp": 1.02712393, + "epoch": 0.1286036374567864, + "flos": 27623013223680.0, + "grad_norm": 2.243472456451634, + "language_loss": 0.78969979, + "learning_rate": 3.898873984919113e-06, + "loss": 0.81188214, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.19335938, + "step": 2139, + "time_per_iteration": 2.70849609375 + }, + { + "auxiliary_loss_clip": 0.01171346, + "auxiliary_loss_mlp": 0.01040849, + "balance_loss_clip": 1.05595589, + "balance_loss_mlp": 1.02342105, + "epoch": 0.12866376070945437, + "flos": 19919305360800.0, + "grad_norm": 7.33462572269296, + "language_loss": 0.84986913, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.8719911, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.17431641, + "step": 2140, + "time_per_iteration": 2.6258628368377686 + }, + { + "auxiliary_loss_clip": 0.01163252, + "auxiliary_loss_mlp": 0.01041126, + "balance_loss_clip": 1.05290067, + "balance_loss_mlp": 1.02415061, + "epoch": 0.12872388396212234, + "flos": 14488698208320.0, + "grad_norm": 1.9803140572340874, + "language_loss": 0.8570739, + "learning_rate": 3.898629291976476e-06, + "loss": 0.87911773, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.1697998, + "step": 2141, + "time_per_iteration": 2.655424118041992 + }, + { + "auxiliary_loss_clip": 0.0116945, + "auxiliary_loss_mlp": 0.0104093, + "balance_loss_clip": 1.05364585, + "balance_loss_mlp": 1.02286959, + "epoch": 0.12878400721479033, + "flos": 34613637530400.0, + "grad_norm": 1.9869265963608713, + "language_loss": 0.68390632, + "learning_rate": 3.898506837508518e-06, + "loss": 0.7060101, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.18066406, + "step": 2142, + "time_per_iteration": 2.702840805053711 + }, + { + "auxiliary_loss_clip": 0.01175704, + "auxiliary_loss_mlp": 0.01046716, + "balance_loss_clip": 1.05939996, + "balance_loss_mlp": 1.02740431, + "epoch": 0.1288441304674583, + "flos": 31586050954080.0, + "grad_norm": 2.02084997415417, + "language_loss": 0.83167243, + "learning_rate": 3.89838431104899e-06, + "loss": 0.85389662, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 1.16503906, + "router_z_loss_mlp": 0.19299316, + "step": 2143, + "time_per_iteration": 2.7199835777282715 + }, + { + "auxiliary_loss_clip": 0.01173667, + "auxiliary_loss_mlp": 0.01050338, + "balance_loss_clip": 1.05946255, + "balance_loss_mlp": 1.03172958, + "epoch": 0.12890425372012626, + "flos": 25397027759520.0, + "grad_norm": 1.5798738152413219, + "language_loss": 0.81711298, + "learning_rate": 3.898261712602539e-06, + "loss": 0.83935308, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.18603516, + "step": 2144, + "time_per_iteration": 2.654782295227051 + }, + { + "auxiliary_loss_clip": 0.01166535, + "auxiliary_loss_mlp": 0.01051256, + "balance_loss_clip": 1.0523895, + "balance_loss_mlp": 1.03187299, + "epoch": 0.12896437697279423, + "flos": 27534981875040.0, + "grad_norm": 2.1647417122014683, + "language_loss": 0.78650361, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80868155, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.19396973, + "step": 2145, + "time_per_iteration": 2.686582565307617 + }, + { + "auxiliary_loss_clip": 0.01170632, + "auxiliary_loss_mlp": 0.01044288, + "balance_loss_clip": 1.05575407, + "balance_loss_mlp": 1.02563202, + "epoch": 0.1290245002254622, + "flos": 21345675059520.0, + "grad_norm": 2.367055146397851, + "language_loss": 0.82216549, + "learning_rate": 3.898016299767465e-06, + "loss": 0.84431469, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.18652344, + "step": 2146, + "time_per_iteration": 2.803399085998535 + }, + { + "auxiliary_loss_clip": 0.01169903, + "auxiliary_loss_mlp": 0.01043761, + "balance_loss_clip": 1.05648589, + "balance_loss_mlp": 1.0247829, + "epoch": 0.12908462347813016, + "flos": 44314358942880.0, + "grad_norm": 2.5541852392183193, + "language_loss": 0.70866841, + "learning_rate": 3.897893485388149e-06, + "loss": 0.73080504, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.18969727, + "step": 2147, + "time_per_iteration": 5.796689033508301 + }, + { + "auxiliary_loss_clip": 0.01169175, + "auxiliary_loss_mlp": 0.01049725, + "balance_loss_clip": 1.05559826, + "balance_loss_mlp": 1.03252292, + "epoch": 0.12914474673079815, + "flos": 27489689906400.0, + "grad_norm": 2.7555497452387088, + "language_loss": 0.71397561, + "learning_rate": 3.897770599040521e-06, + "loss": 0.73616457, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.171875, + "step": 2148, + "time_per_iteration": 2.6789963245391846 + }, + { + "auxiliary_loss_clip": 0.01168731, + "auxiliary_loss_mlp": 0.01044225, + "balance_loss_clip": 1.05973697, + "balance_loss_mlp": 1.02704716, + "epoch": 0.12920486998346611, + "flos": 26199965941920.0, + "grad_norm": 1.648528135741617, + "language_loss": 0.7885378, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81066734, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.171875, + "step": 2149, + "time_per_iteration": 2.7061901092529297 + }, + { + "auxiliary_loss_clip": 0.01171144, + "auxiliary_loss_mlp": 0.01041181, + "balance_loss_clip": 1.05802369, + "balance_loss_mlp": 1.02247715, + "epoch": 0.12926499323613408, + "flos": 33322414426560.0, + "grad_norm": 9.986641124174497, + "language_loss": 0.75960737, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78173059, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.18688965, + "step": 2150, + "time_per_iteration": 4.153578758239746 + }, + { + "auxiliary_loss_clip": 0.01170115, + "auxiliary_loss_mlp": 0.01049138, + "balance_loss_clip": 1.05592275, + "balance_loss_mlp": 1.03014827, + "epoch": 0.12932511648880204, + "flos": 26955707843520.0, + "grad_norm": 2.35204725872606, + "language_loss": 0.70709634, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.72928894, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.18969727, + "step": 2151, + "time_per_iteration": 4.050188302993774 + }, + { + "auxiliary_loss_clip": 0.01170236, + "auxiliary_loss_mlp": 0.01043146, + "balance_loss_clip": 1.0594635, + "balance_loss_mlp": 1.02552688, + "epoch": 0.12938523974147, + "flos": 24773474691360.0, + "grad_norm": 1.9218693005282035, + "language_loss": 0.84029615, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86242986, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.17614746, + "step": 2152, + "time_per_iteration": 2.702991485595703 + }, + { + "auxiliary_loss_clip": 0.01173516, + "auxiliary_loss_mlp": 0.01058639, + "balance_loss_clip": 1.0592289, + "balance_loss_mlp": 1.04067469, + "epoch": 0.12944536299413797, + "flos": 23793825535200.0, + "grad_norm": 1.677365013229113, + "language_loss": 0.78358746, + "learning_rate": 3.897155087940906e-06, + "loss": 0.80590898, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.17980957, + "step": 2153, + "time_per_iteration": 2.650998830795288 + }, + { + "auxiliary_loss_clip": 0.01168395, + "auxiliary_loss_mlp": 0.01049454, + "balance_loss_clip": 1.05583715, + "balance_loss_mlp": 1.03170347, + "epoch": 0.12950548624680594, + "flos": 33809929519680.0, + "grad_norm": 1.6616634479237196, + "language_loss": 0.80229509, + "learning_rate": 3.897031769881364e-06, + "loss": 0.8244735, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.17749023, + "step": 2154, + "time_per_iteration": 2.7124693393707275 + }, + { + "auxiliary_loss_clip": 0.01174121, + "auxiliary_loss_mlp": 0.01043996, + "balance_loss_clip": 1.06043637, + "balance_loss_mlp": 1.0260675, + "epoch": 0.12956560949947393, + "flos": 21433584856320.0, + "grad_norm": 2.179083594042958, + "language_loss": 0.83344722, + "learning_rate": 3.896908379886188e-06, + "loss": 0.85562843, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.17919922, + "step": 2155, + "time_per_iteration": 2.65053129196167 + }, + { + "auxiliary_loss_clip": 0.01172376, + "auxiliary_loss_mlp": 0.01049962, + "balance_loss_clip": 1.05723047, + "balance_loss_mlp": 1.0318656, + "epoch": 0.1296257327521419, + "flos": 25307861927040.0, + "grad_norm": 2.6236037783575332, + "language_loss": 0.76167995, + "learning_rate": 3.896784917960055e-06, + "loss": 0.78390324, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.1809082, + "step": 2156, + "time_per_iteration": 2.6570613384246826 + }, + { + "auxiliary_loss_clip": 0.01168174, + "auxiliary_loss_mlp": 0.01050096, + "balance_loss_clip": 1.05716872, + "balance_loss_mlp": 1.03183341, + "epoch": 0.12968585600480986, + "flos": 20006080673760.0, + "grad_norm": 1.7966697940630798, + "language_loss": 0.8681674, + "learning_rate": 3.896661384107648e-06, + "loss": 0.8903501, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.18249512, + "step": 2157, + "time_per_iteration": 2.8911209106445312 + }, + { + "auxiliary_loss_clip": 0.01172245, + "auxiliary_loss_mlp": 0.01049652, + "balance_loss_clip": 1.05573535, + "balance_loss_mlp": 1.03081727, + "epoch": 0.12974597925747783, + "flos": 34568345561760.0, + "grad_norm": 2.6714984708562244, + "language_loss": 0.81018096, + "learning_rate": 3.896537778333651e-06, + "loss": 0.83239996, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 1.1640625, + "router_z_loss_mlp": 0.18823242, + "step": 2158, + "time_per_iteration": 2.7216386795043945 + }, + { + "auxiliary_loss_clip": 0.01173195, + "auxiliary_loss_mlp": 0.01054056, + "balance_loss_clip": 1.05724859, + "balance_loss_mlp": 1.03550696, + "epoch": 0.1298061025101458, + "flos": 11814655131360.0, + "grad_norm": 2.3479974373108115, + "language_loss": 0.74812627, + "learning_rate": 3.896414100642752e-06, + "loss": 0.77039874, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 1.16015625, + "router_z_loss_mlp": 0.18554688, + "step": 2159, + "time_per_iteration": 2.6664340496063232 + }, + { + "auxiliary_loss_clip": 0.01165364, + "auxiliary_loss_mlp": 0.01044807, + "balance_loss_clip": 1.05418468, + "balance_loss_mlp": 1.02636516, + "epoch": 0.12986622576281376, + "flos": 33811752797280.0, + "grad_norm": 2.226976198701234, + "language_loss": 0.82542098, + "learning_rate": 3.89629035103964e-06, + "loss": 0.84752274, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.18444824, + "step": 2160, + "time_per_iteration": 2.730756998062134 + }, + { + "auxiliary_loss_clip": 0.01164274, + "auxiliary_loss_mlp": 0.01041363, + "balance_loss_clip": 1.05693352, + "balance_loss_mlp": 1.02323079, + "epoch": 0.12992634901548175, + "flos": 22943366933760.0, + "grad_norm": 1.5603126361389115, + "language_loss": 0.81911534, + "learning_rate": 3.896166529529008e-06, + "loss": 0.84117174, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.18139648, + "step": 2161, + "time_per_iteration": 2.6647982597351074 + }, + { + "auxiliary_loss_clip": 0.01166218, + "auxiliary_loss_mlp": 0.01051922, + "balance_loss_clip": 1.05409408, + "balance_loss_mlp": 1.03299189, + "epoch": 0.12998647226814972, + "flos": 35542605919680.0, + "grad_norm": 2.154250338520349, + "language_loss": 0.82153273, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84371412, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.18933105, + "step": 2162, + "time_per_iteration": 2.6933209896087646 + }, + { + "auxiliary_loss_clip": 0.0117039, + "auxiliary_loss_mlp": 0.01050864, + "balance_loss_clip": 1.05422592, + "balance_loss_mlp": 1.03260136, + "epoch": 0.13004659552081768, + "flos": 23883720678720.0, + "grad_norm": 2.7292776563063605, + "language_loss": 0.73057181, + "learning_rate": 3.895918670803968e-06, + "loss": 0.75278437, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.18261719, + "step": 2163, + "time_per_iteration": 2.6875720024108887 + }, + { + "auxiliary_loss_clip": 0.01172287, + "auxiliary_loss_mlp": 0.01047453, + "balance_loss_clip": 1.05582058, + "balance_loss_mlp": 1.02767563, + "epoch": 0.13010671877348565, + "flos": 27443466040320.0, + "grad_norm": 2.119702015599759, + "language_loss": 0.81927812, + "learning_rate": 3.895794633598958e-06, + "loss": 0.84147555, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 1.16601562, + "router_z_loss_mlp": 0.19775391, + "step": 2164, + "time_per_iteration": 2.6529316902160645 + }, + { + "auxiliary_loss_clip": 0.01169593, + "auxiliary_loss_mlp": 0.01041432, + "balance_loss_clip": 1.05541968, + "balance_loss_mlp": 1.02383709, + "epoch": 0.1301668420261536, + "flos": 29136522890880.0, + "grad_norm": 1.9793916691027873, + "language_loss": 0.72185487, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.74396515, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.17590332, + "step": 2165, + "time_per_iteration": 2.708627939224243 + }, + { + "auxiliary_loss_clip": 0.01172491, + "auxiliary_loss_mlp": 0.0104524, + "balance_loss_clip": 1.05681133, + "balance_loss_mlp": 1.02566624, + "epoch": 0.13022696527882158, + "flos": 28247822327520.0, + "grad_norm": 1.729648311448583, + "language_loss": 0.74978471, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.77196205, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 1.15722656, + "router_z_loss_mlp": 0.19567871, + "step": 2166, + "time_per_iteration": 2.682431697845459 + }, + { + "auxiliary_loss_clip": 0.01170723, + "auxiliary_loss_mlp": 0.01046266, + "balance_loss_clip": 1.05541635, + "balance_loss_mlp": 1.02874255, + "epoch": 0.13028708853148954, + "flos": 32834170022400.0, + "grad_norm": 1.5749744370551435, + "language_loss": 0.83032268, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85249257, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 1.15332031, + "router_z_loss_mlp": 0.1751709, + "step": 2167, + "time_per_iteration": 2.748572826385498 + }, + { + "auxiliary_loss_clip": 0.0116654, + "auxiliary_loss_mlp": 0.01052428, + "balance_loss_clip": 1.05412555, + "balance_loss_mlp": 1.0338906, + "epoch": 0.13034721178415754, + "flos": 25931131374240.0, + "grad_norm": 1.5777998939693032, + "language_loss": 0.83126611, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85345578, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.1854248, + "step": 2168, + "time_per_iteration": 2.658290147781372 + }, + { + "auxiliary_loss_clip": 0.0116901, + "auxiliary_loss_mlp": 0.01047967, + "balance_loss_clip": 1.05419552, + "balance_loss_mlp": 1.02794027, + "epoch": 0.1304073350368255, + "flos": 22899979277280.0, + "grad_norm": 2.0391681141217983, + "language_loss": 0.80043846, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.82260823, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 1.14746094, + "router_z_loss_mlp": 0.20031738, + "step": 2169, + "time_per_iteration": 2.6663014888763428 + }, + { + "auxiliary_loss_clip": 0.01171612, + "auxiliary_loss_mlp": 0.0104123, + "balance_loss_clip": 1.05740738, + "balance_loss_mlp": 1.02176332, + "epoch": 0.13046745828949347, + "flos": 34611328045440.0, + "grad_norm": 2.171416885341384, + "language_loss": 0.65745986, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.67958832, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 1.14160156, + "router_z_loss_mlp": 0.19482422, + "step": 2170, + "time_per_iteration": 2.933979034423828 + }, + { + "auxiliary_loss_clip": 0.01169478, + "auxiliary_loss_mlp": 0.01043713, + "balance_loss_clip": 1.05663598, + "balance_loss_mlp": 1.0255928, + "epoch": 0.13052758154216143, + "flos": 36124959264480.0, + "grad_norm": 2.206088140570405, + "language_loss": 0.67423558, + "learning_rate": 3.8949243605434e-06, + "loss": 0.6963675, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.18115234, + "step": 2171, + "time_per_iteration": 2.754852294921875 + }, + { + "auxiliary_loss_clip": 0.01172152, + "auxiliary_loss_mlp": 0.01045657, + "balance_loss_clip": 1.05621696, + "balance_loss_mlp": 1.02549827, + "epoch": 0.1305877047948294, + "flos": 23660664287040.0, + "grad_norm": 1.9991527807959841, + "language_loss": 0.71970719, + "learning_rate": 3.894799748360537e-06, + "loss": 0.7418853, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 1.15820312, + "router_z_loss_mlp": 0.20178223, + "step": 2172, + "time_per_iteration": 2.681419849395752 + }, + { + "auxiliary_loss_clip": 0.01165918, + "auxiliary_loss_mlp": 0.01040279, + "balance_loss_clip": 1.05779862, + "balance_loss_mlp": 1.02316046, + "epoch": 0.13064782804749736, + "flos": 20588798674080.0, + "grad_norm": 1.781248645007842, + "language_loss": 0.75398999, + "learning_rate": 3.894675064326678e-06, + "loss": 0.77605194, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.17114258, + "step": 2173, + "time_per_iteration": 2.6614768505096436 + }, + { + "auxiliary_loss_clip": 0.01170599, + "auxiliary_loss_mlp": 0.01051996, + "balance_loss_clip": 1.05573928, + "balance_loss_mlp": 1.03215957, + "epoch": 0.13070795130016533, + "flos": 29894088070080.0, + "grad_norm": 2.586747525272135, + "language_loss": 0.70951301, + "learning_rate": 3.894550308446551e-06, + "loss": 0.73173898, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 0.1986084, + "step": 2174, + "time_per_iteration": 2.675654411315918 + }, + { + "auxiliary_loss_clip": 0.01064361, + "auxiliary_loss_mlp": 0.0101915, + "balance_loss_clip": 1.02345586, + "balance_loss_mlp": 1.01694334, + "epoch": 0.13076807455283332, + "flos": 86701266155520.0, + "grad_norm": 0.8016217073196655, + "language_loss": 0.59071577, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61155093, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 0.40917969, + "router_z_loss_mlp": 0.02207947, + "step": 2175, + "time_per_iteration": 3.3842570781707764 + }, + { + "auxiliary_loss_clip": 0.01167843, + "auxiliary_loss_mlp": 0.01049273, + "balance_loss_clip": 1.05600393, + "balance_loss_mlp": 1.030581, + "epoch": 0.13082819780550128, + "flos": 24726926687040.0, + "grad_norm": 3.6886154615528337, + "language_loss": 0.80259508, + "learning_rate": 3.894300581166417e-06, + "loss": 0.82476628, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.18688965, + "step": 2176, + "time_per_iteration": 2.6486868858337402 + }, + { + "auxiliary_loss_clip": 0.01166504, + "auxiliary_loss_mlp": 0.01050842, + "balance_loss_clip": 1.05363464, + "balance_loss_mlp": 1.03057611, + "epoch": 0.13088832105816925, + "flos": 41736167049600.0, + "grad_norm": 1.9266763741459332, + "language_loss": 0.74253976, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76471323, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.20275879, + "step": 2177, + "time_per_iteration": 2.754058599472046 + }, + { + "auxiliary_loss_clip": 0.0116532, + "auxiliary_loss_mlp": 0.01043577, + "balance_loss_clip": 1.05333042, + "balance_loss_mlp": 1.02376401, + "epoch": 0.13094844431083721, + "flos": 21835256533920.0, + "grad_norm": 1.9795301109230223, + "language_loss": 0.82179856, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84388751, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 1.12060547, + "router_z_loss_mlp": 0.19799805, + "step": 2178, + "time_per_iteration": 2.653684616088867 + }, + { + "auxiliary_loss_clip": 0.01165317, + "auxiliary_loss_mlp": 0.01043447, + "balance_loss_clip": 1.05429196, + "balance_loss_mlp": 1.02521968, + "epoch": 0.13100856756350518, + "flos": 21123023840640.0, + "grad_norm": 2.8126673317385427, + "language_loss": 0.74579871, + "learning_rate": 3.893925451517562e-06, + "loss": 0.76788634, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.18225098, + "step": 2179, + "time_per_iteration": 2.686105728149414 + }, + { + "auxiliary_loss_clip": 0.01161363, + "auxiliary_loss_mlp": 0.01047685, + "balance_loss_clip": 1.0519948, + "balance_loss_mlp": 1.02993476, + "epoch": 0.13106869081617314, + "flos": 27089233747200.0, + "grad_norm": 2.173112896002274, + "language_loss": 0.84660912, + "learning_rate": 3.893800264659266e-06, + "loss": 0.86869955, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 1.09326172, + "router_z_loss_mlp": 0.1776123, + "step": 2180, + "time_per_iteration": 2.6915500164031982 + }, + { + "auxiliary_loss_clip": 0.01166318, + "auxiliary_loss_mlp": 0.01054969, + "balance_loss_clip": 1.05481279, + "balance_loss_mlp": 1.03667021, + "epoch": 0.13112881406884114, + "flos": 26555413753440.0, + "grad_norm": 2.226235330812351, + "language_loss": 0.89668143, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.91889435, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.18286133, + "step": 2181, + "time_per_iteration": 2.6684107780456543 + }, + { + "auxiliary_loss_clip": 0.01165796, + "auxiliary_loss_mlp": 0.01045552, + "balance_loss_clip": 1.05414939, + "balance_loss_mlp": 1.02805185, + "epoch": 0.1311889373215091, + "flos": 28468852855200.0, + "grad_norm": 2.1372293567446223, + "language_loss": 0.68807811, + "learning_rate": 3.893549675508137e-06, + "loss": 0.71019161, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.17492676, + "step": 2182, + "time_per_iteration": 2.7195346355438232 + }, + { + "auxiliary_loss_clip": 0.01164791, + "auxiliary_loss_mlp": 0.01048117, + "balance_loss_clip": 1.05092549, + "balance_loss_mlp": 1.02898371, + "epoch": 0.13124906057417707, + "flos": 26194860764640.0, + "grad_norm": 2.268834716635522, + "language_loss": 0.78500146, + "learning_rate": 3.893424273224806e-06, + "loss": 0.80713058, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.19140625, + "step": 2183, + "time_per_iteration": 2.7950186729431152 + }, + { + "auxiliary_loss_clip": 0.0116193, + "auxiliary_loss_mlp": 0.01043157, + "balance_loss_clip": 1.05041432, + "balance_loss_mlp": 1.02523994, + "epoch": 0.13130918382684503, + "flos": 28376648226720.0, + "grad_norm": 2.0165317927616733, + "language_loss": 0.85661697, + "learning_rate": 3.893298799142636e-06, + "loss": 0.87866777, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.17907715, + "step": 2184, + "time_per_iteration": 2.654871940612793 + }, + { + "auxiliary_loss_clip": 0.01163658, + "auxiliary_loss_mlp": 0.0104742, + "balance_loss_clip": 1.04956436, + "balance_loss_mlp": 1.02870417, + "epoch": 0.131369307079513, + "flos": 25441549899840.0, + "grad_norm": 2.03278492829652, + "language_loss": 0.82553816, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84764898, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.18725586, + "step": 2185, + "time_per_iteration": 2.6627209186553955 + }, + { + "auxiliary_loss_clip": 0.01165635, + "auxiliary_loss_mlp": 0.01051508, + "balance_loss_clip": 1.0500623, + "balance_loss_mlp": 1.03279161, + "epoch": 0.13142943033218096, + "flos": 21790572324480.0, + "grad_norm": 2.769898078014435, + "language_loss": 0.72686851, + "learning_rate": 3.893047635600818e-06, + "loss": 0.74903989, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 1.15429688, + "router_z_loss_mlp": 0.18713379, + "step": 2186, + "time_per_iteration": 4.087272882461548 + }, + { + "auxiliary_loss_clip": 0.01164301, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.05190706, + "balance_loss_mlp": 1.02556777, + "epoch": 0.13148955358484893, + "flos": 25619435874720.0, + "grad_norm": 1.9276166517586244, + "language_loss": 0.8006435, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82273495, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.19287109, + "step": 2187, + "time_per_iteration": 4.080174922943115 + }, + { + "auxiliary_loss_clip": 0.0105526, + "auxiliary_loss_mlp": 0.01014131, + "balance_loss_clip": 1.01465869, + "balance_loss_mlp": 1.01210749, + "epoch": 0.13154967683751692, + "flos": 85625522712000.0, + "grad_norm": 0.8404611174349393, + "language_loss": 0.59083283, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61152673, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 0.40600586, + "router_z_loss_mlp": 0.02023315, + "step": 2188, + "time_per_iteration": 3.2851624488830566 + }, + { + "auxiliary_loss_clip": 0.01168152, + "auxiliary_loss_mlp": 0.01047966, + "balance_loss_clip": 1.05676687, + "balance_loss_mlp": 1.03009701, + "epoch": 0.1316098000901849, + "flos": 24862964662080.0, + "grad_norm": 3.7708045174578038, + "language_loss": 0.7396636, + "learning_rate": 3.892670351915842e-06, + "loss": 0.76182473, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.17871094, + "step": 2189, + "time_per_iteration": 4.0983240604400635 + }, + { + "auxiliary_loss_clip": 0.01164275, + "auxiliary_loss_mlp": 0.01044801, + "balance_loss_clip": 1.05343199, + "balance_loss_mlp": 1.02747965, + "epoch": 0.13166992334285285, + "flos": 28334719192320.0, + "grad_norm": 2.1537180634691615, + "language_loss": 0.72679001, + "learning_rate": 3.892544447140657e-06, + "loss": 0.74888074, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.17321777, + "step": 2190, + "time_per_iteration": 2.64833402633667 + }, + { + "auxiliary_loss_clip": 0.01164819, + "auxiliary_loss_mlp": 0.01047941, + "balance_loss_clip": 1.0537405, + "balance_loss_mlp": 1.03067994, + "epoch": 0.13173004659552082, + "flos": 28468690786080.0, + "grad_norm": 1.908680776210559, + "language_loss": 0.74397653, + "learning_rate": 3.892418470599996e-06, + "loss": 0.76610416, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.17260742, + "step": 2191, + "time_per_iteration": 4.157379865646362 + }, + { + "auxiliary_loss_clip": 0.011635, + "auxiliary_loss_mlp": 0.01043721, + "balance_loss_clip": 1.05197716, + "balance_loss_mlp": 1.02566111, + "epoch": 0.13179016984818878, + "flos": 25931171891520.0, + "grad_norm": 1.9543193720736272, + "language_loss": 0.79012126, + "learning_rate": 3.892292422298637e-06, + "loss": 0.81219351, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 1.11523438, + "router_z_loss_mlp": 0.18066406, + "step": 2192, + "time_per_iteration": 2.681380033493042 + }, + { + "auxiliary_loss_clip": 0.01165611, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.05298209, + "balance_loss_mlp": 1.02462268, + "epoch": 0.13185029310085675, + "flos": 21693991829760.0, + "grad_norm": 1.9885795357844163, + "language_loss": 0.85414457, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87622213, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.17529297, + "step": 2193, + "time_per_iteration": 2.9022960662841797 + }, + { + "auxiliary_loss_clip": 0.01055224, + "auxiliary_loss_mlp": 0.0099999, + "balance_loss_clip": 1.01437819, + "balance_loss_mlp": 0.99800193, + "epoch": 0.1319104163535247, + "flos": 85233494147040.0, + "grad_norm": 0.7591273800211141, + "language_loss": 0.54121077, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56176287, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 0.40820312, + "router_z_loss_mlp": 0.01986694, + "step": 2194, + "time_per_iteration": 3.215142011642456 + }, + { + "auxiliary_loss_clip": 0.01160692, + "auxiliary_loss_mlp": 0.01038895, + "balance_loss_clip": 1.05077744, + "balance_loss_mlp": 1.02121639, + "epoch": 0.1319705396061927, + "flos": 30740900116320.0, + "grad_norm": 1.666856446137168, + "language_loss": 0.71905446, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74105036, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.17687988, + "step": 2195, + "time_per_iteration": 2.701366424560547 + }, + { + "auxiliary_loss_clip": 0.01165796, + "auxiliary_loss_mlp": 0.010409, + "balance_loss_clip": 1.05127668, + "balance_loss_mlp": 1.02177882, + "epoch": 0.13203066285886067, + "flos": 25307578306080.0, + "grad_norm": 2.1267097737676193, + "language_loss": 0.77964634, + "learning_rate": 3.891787511581859e-06, + "loss": 0.80171335, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 1.14257812, + "router_z_loss_mlp": 0.19104004, + "step": 2196, + "time_per_iteration": 2.6489322185516357 + }, + { + "auxiliary_loss_clip": 0.01163808, + "auxiliary_loss_mlp": 0.01043919, + "balance_loss_clip": 1.0507772, + "balance_loss_mlp": 1.02657425, + "epoch": 0.13209078611152864, + "flos": 26910902082240.0, + "grad_norm": 2.5983128507799758, + "language_loss": 0.74898332, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77106059, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.17346191, + "step": 2197, + "time_per_iteration": 2.7087714672088623 + }, + { + "auxiliary_loss_clip": 0.011652, + "auxiliary_loss_mlp": 0.01044492, + "balance_loss_clip": 1.05015266, + "balance_loss_mlp": 1.02675354, + "epoch": 0.1321509093641966, + "flos": 19871015113440.0, + "grad_norm": 1.8881504955822783, + "language_loss": 0.79363847, + "learning_rate": 3.891534625783685e-06, + "loss": 0.8157354, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 1.15136719, + "router_z_loss_mlp": 0.17724609, + "step": 2198, + "time_per_iteration": 2.7151198387145996 + }, + { + "auxiliary_loss_clip": 0.01161015, + "auxiliary_loss_mlp": 0.01048012, + "balance_loss_clip": 1.05100083, + "balance_loss_mlp": 1.03102469, + "epoch": 0.13221103261686457, + "flos": 20722405612320.0, + "grad_norm": 2.3259355724409416, + "language_loss": 0.82989025, + "learning_rate": 3.891408075291425e-06, + "loss": 0.85198051, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 1.09716797, + "router_z_loss_mlp": 0.1697998, + "step": 2199, + "time_per_iteration": 2.6475212574005127 + }, + { + "auxiliary_loss_clip": 0.01163102, + "auxiliary_loss_mlp": 0.01047547, + "balance_loss_clip": 1.05037165, + "balance_loss_mlp": 1.02961767, + "epoch": 0.13227115586953253, + "flos": 41773558148640.0, + "grad_norm": 1.7669610480801778, + "language_loss": 0.69527531, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71738183, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.17932129, + "step": 2200, + "time_per_iteration": 2.712782144546509 + }, + { + "auxiliary_loss_clip": 0.01161897, + "auxiliary_loss_mlp": 0.01050537, + "balance_loss_clip": 1.05052233, + "balance_loss_mlp": 1.03183329, + "epoch": 0.13233127912220052, + "flos": 25263825994080.0, + "grad_norm": 1.9081151914771697, + "language_loss": 0.84200656, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86413091, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.18713379, + "step": 2201, + "time_per_iteration": 2.662198543548584 + }, + { + "auxiliary_loss_clip": 0.01165035, + "auxiliary_loss_mlp": 0.01047151, + "balance_loss_clip": 1.05164933, + "balance_loss_mlp": 1.02948463, + "epoch": 0.1323914023748685, + "flos": 31488417010080.0, + "grad_norm": 2.0499372776934446, + "language_loss": 0.86658502, + "learning_rate": 3.891027993499554e-06, + "loss": 0.8887068, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.17663574, + "step": 2202, + "time_per_iteration": 2.679948329925537 + }, + { + "auxiliary_loss_clip": 0.01161612, + "auxiliary_loss_mlp": 0.01038629, + "balance_loss_clip": 1.05148411, + "balance_loss_mlp": 1.02177238, + "epoch": 0.13245152562753645, + "flos": 25931212408800.0, + "grad_norm": 2.0093538212076054, + "language_loss": 0.72566658, + "learning_rate": 3.89090115614658e-06, + "loss": 0.74766898, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.1685791, + "step": 2203, + "time_per_iteration": 2.678513526916504 + }, + { + "auxiliary_loss_clip": 0.01163322, + "auxiliary_loss_mlp": 0.01052489, + "balance_loss_clip": 1.05073059, + "balance_loss_mlp": 1.03559744, + "epoch": 0.13251164888020442, + "flos": 32471550652320.0, + "grad_norm": 2.3293126691619377, + "language_loss": 0.73496079, + "learning_rate": 3.890774247090444e-06, + "loss": 0.75711894, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 1.12695312, + "router_z_loss_mlp": 0.16882324, + "step": 2204, + "time_per_iteration": 2.671898365020752 + }, + { + "auxiliary_loss_clip": 0.01163275, + "auxiliary_loss_mlp": 0.01042426, + "balance_loss_clip": 1.05215096, + "balance_loss_mlp": 1.02436554, + "epoch": 0.13257177213287238, + "flos": 36395009350560.0, + "grad_norm": 1.7755620227165072, + "language_loss": 0.78361702, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80567402, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.18066406, + "step": 2205, + "time_per_iteration": 2.7283270359039307 + }, + { + "auxiliary_loss_clip": 0.01161237, + "auxiliary_loss_mlp": 0.01045775, + "balance_loss_clip": 1.05215025, + "balance_loss_mlp": 1.0281198, + "epoch": 0.13263189538554035, + "flos": 25976220756480.0, + "grad_norm": 2.1108077389852102, + "language_loss": 0.78950888, + "learning_rate": 3.890520213887941e-06, + "loss": 0.81157899, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.17663574, + "step": 2206, + "time_per_iteration": 2.858957290649414 + }, + { + "auxiliary_loss_clip": 0.01164235, + "auxiliary_loss_mlp": 0.01045016, + "balance_loss_clip": 1.05249524, + "balance_loss_mlp": 1.02867281, + "epoch": 0.13269201863820831, + "flos": 20589446950560.0, + "grad_norm": 2.7009112488116584, + "language_loss": 0.74400854, + "learning_rate": 3.890393089751208e-06, + "loss": 0.766101, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.16333008, + "step": 2207, + "time_per_iteration": 2.65320086479187 + }, + { + "auxiliary_loss_clip": 0.01156707, + "auxiliary_loss_mlp": 0.01041774, + "balance_loss_clip": 1.05029094, + "balance_loss_mlp": 1.02428591, + "epoch": 0.1327521418908763, + "flos": 29003118539040.0, + "grad_norm": 2.313893704613273, + "language_loss": 0.84319413, + "learning_rate": 3.890265893930578e-06, + "loss": 0.86517894, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.17480469, + "step": 2208, + "time_per_iteration": 2.6800858974456787 + }, + { + "auxiliary_loss_clip": 0.01158171, + "auxiliary_loss_mlp": 0.0104675, + "balance_loss_clip": 1.05367553, + "balance_loss_mlp": 1.03059697, + "epoch": 0.13281226514354427, + "flos": 32342603201280.0, + "grad_norm": 1.8741219298308336, + "language_loss": 0.85739356, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87944281, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.16162109, + "step": 2209, + "time_per_iteration": 2.6914610862731934 + }, + { + "auxiliary_loss_clip": 0.01161066, + "auxiliary_loss_mlp": 0.01037191, + "balance_loss_clip": 1.05158603, + "balance_loss_mlp": 1.02131224, + "epoch": 0.13287238839621224, + "flos": 29893723414560.0, + "grad_norm": 2.1148184395606275, + "language_loss": 0.82194221, + "learning_rate": 3.890011287256929e-06, + "loss": 0.84392476, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.15893555, + "step": 2210, + "time_per_iteration": 2.6749441623687744 + }, + { + "auxiliary_loss_clip": 0.01061287, + "auxiliary_loss_mlp": 0.01010945, + "balance_loss_clip": 1.02097237, + "balance_loss_mlp": 1.00889158, + "epoch": 0.1329325116488802, + "flos": 82601258552640.0, + "grad_norm": 0.7599648796800219, + "language_loss": 0.57969737, + "learning_rate": 3.889883876413563e-06, + "loss": 0.6004197, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 0.40258789, + "router_z_loss_mlp": 0.02053833, + "step": 2211, + "time_per_iteration": 3.395197868347168 + }, + { + "auxiliary_loss_clip": 0.01061274, + "auxiliary_loss_mlp": 0.01007516, + "balance_loss_clip": 1.02101946, + "balance_loss_mlp": 1.00544167, + "epoch": 0.13299263490154817, + "flos": 88174305410400.0, + "grad_norm": 0.8127165182624067, + "language_loss": 0.5526672, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57335508, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 0.02075195, + "step": 2212, + "time_per_iteration": 3.3031210899353027 + }, + { + "auxiliary_loss_clip": 0.01164973, + "auxiliary_loss_mlp": 0.01045699, + "balance_loss_clip": 1.05285263, + "balance_loss_mlp": 1.02731693, + "epoch": 0.13305275815421613, + "flos": 21879576087840.0, + "grad_norm": 2.5265812585795464, + "language_loss": 0.7478981, + "learning_rate": 3.889628839737908e-06, + "loss": 0.77000481, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 1.12109375, + "router_z_loss_mlp": 0.18371582, + "step": 2213, + "time_per_iteration": 2.608412265777588 + }, + { + "auxiliary_loss_clip": 0.01155899, + "auxiliary_loss_mlp": 0.01042713, + "balance_loss_clip": 1.05045676, + "balance_loss_mlp": 1.02660775, + "epoch": 0.13311288140688413, + "flos": 27261001612800.0, + "grad_norm": 1.8499746724812163, + "language_loss": 0.79689074, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81887686, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.16101074, + "step": 2214, + "time_per_iteration": 2.7524657249450684 + }, + { + "auxiliary_loss_clip": 0.01160682, + "auxiliary_loss_mlp": 0.01050695, + "balance_loss_clip": 1.05083275, + "balance_loss_mlp": 1.03320742, + "epoch": 0.1331730046595521, + "flos": 38887317311040.0, + "grad_norm": 1.9277251120423962, + "language_loss": 0.69827771, + "learning_rate": 3.889373516442597e-06, + "loss": 0.72039151, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.17480469, + "step": 2215, + "time_per_iteration": 2.7856245040893555 + }, + { + "auxiliary_loss_clip": 0.01163341, + "auxiliary_loss_mlp": 0.01042603, + "balance_loss_clip": 1.05203652, + "balance_loss_mlp": 1.02482867, + "epoch": 0.13323312791222006, + "flos": 27535792220640.0, + "grad_norm": 2.1588999530171087, + "language_loss": 0.80872571, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83078516, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.17773438, + "step": 2216, + "time_per_iteration": 2.665205955505371 + }, + { + "auxiliary_loss_clip": 0.01159658, + "auxiliary_loss_mlp": 0.01050355, + "balance_loss_clip": 1.04941273, + "balance_loss_mlp": 1.03242588, + "epoch": 0.13329325116488802, + "flos": 18406727591040.0, + "grad_norm": 2.6384893491397095, + "language_loss": 0.87299955, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89509964, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.17932129, + "step": 2217, + "time_per_iteration": 2.6221184730529785 + }, + { + "auxiliary_loss_clip": 0.01159835, + "auxiliary_loss_mlp": 0.01046198, + "balance_loss_clip": 1.05037463, + "balance_loss_mlp": 1.02784014, + "epoch": 0.133353374417556, + "flos": 33499571090400.0, + "grad_norm": 2.1664881408501, + "language_loss": 0.72841883, + "learning_rate": 3.888989994172501e-06, + "loss": 0.7504791, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.18371582, + "step": 2218, + "time_per_iteration": 2.898927927017212 + }, + { + "auxiliary_loss_clip": 0.0115919, + "auxiliary_loss_mlp": 0.0104367, + "balance_loss_clip": 1.04927063, + "balance_loss_mlp": 1.025455, + "epoch": 0.13341349767022395, + "flos": 29400900557760.0, + "grad_norm": 1.9394632186050988, + "language_loss": 0.8736068, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89563549, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.18225098, + "step": 2219, + "time_per_iteration": 2.659013032913208 + }, + { + "auxiliary_loss_clip": 0.01160409, + "auxiliary_loss_mlp": 0.01047899, + "balance_loss_clip": 1.05191612, + "balance_loss_mlp": 1.03131723, + "epoch": 0.13347362092289192, + "flos": 29447853734880.0, + "grad_norm": 5.5395116738151415, + "language_loss": 0.77007639, + "learning_rate": 3.888733954497574e-06, + "loss": 0.7921595, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.16589355, + "step": 2220, + "time_per_iteration": 2.6474997997283936 + }, + { + "auxiliary_loss_clip": 0.01154561, + "auxiliary_loss_mlp": 0.01040407, + "balance_loss_clip": 1.04766881, + "balance_loss_mlp": 1.02406347, + "epoch": 0.1335337441755599, + "flos": 22496646391200.0, + "grad_norm": 2.0681216670176936, + "language_loss": 0.78929758, + "learning_rate": 3.888605827226212e-06, + "loss": 0.81124723, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.16326904, + "step": 2221, + "time_per_iteration": 2.6354525089263916 + }, + { + "auxiliary_loss_clip": 0.01053482, + "auxiliary_loss_mlp": 0.01025701, + "balance_loss_clip": 1.01361203, + "balance_loss_mlp": 1.02348042, + "epoch": 0.13359386742822787, + "flos": 61756712392320.0, + "grad_norm": 0.9758220040151894, + "language_loss": 0.69006824, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.71086007, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 0.39868164, + "router_z_loss_mlp": 0.0222168, + "step": 2222, + "time_per_iteration": 3.1008005142211914 + }, + { + "auxiliary_loss_clip": 0.01161651, + "auxiliary_loss_mlp": 0.01046134, + "balance_loss_clip": 1.0526886, + "balance_loss_mlp": 1.02959931, + "epoch": 0.13365399068089584, + "flos": 27795105227520.0, + "grad_norm": 1.9603247740874226, + "language_loss": 0.67141253, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69349039, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.1652832, + "step": 2223, + "time_per_iteration": 2.664393901824951 + }, + { + "auxiliary_loss_clip": 0.0115903, + "auxiliary_loss_mlp": 0.01054074, + "balance_loss_clip": 1.04863334, + "balance_loss_mlp": 1.03565621, + "epoch": 0.1337141139335638, + "flos": 15290461389600.0, + "grad_norm": 2.358074073866739, + "language_loss": 0.82808751, + "learning_rate": 3.88822101573484e-06, + "loss": 0.85021853, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.18408203, + "step": 2224, + "time_per_iteration": 2.670646905899048 + }, + { + "auxiliary_loss_clip": 0.01163303, + "auxiliary_loss_mlp": 0.01043564, + "balance_loss_clip": 1.04995489, + "balance_loss_mlp": 1.02483606, + "epoch": 0.13377423718623177, + "flos": 28112392111680.0, + "grad_norm": 1.9317687256832832, + "language_loss": 0.65516317, + "learning_rate": 3.888092602028167e-06, + "loss": 0.67723179, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.18737793, + "step": 2225, + "time_per_iteration": 2.6897506713867188 + }, + { + "auxiliary_loss_clip": 0.01162754, + "auxiliary_loss_mlp": 0.01047068, + "balance_loss_clip": 1.05220664, + "balance_loss_mlp": 1.02955616, + "epoch": 0.13383436043889974, + "flos": 19786954458240.0, + "grad_norm": 2.298935770344574, + "language_loss": 0.89275044, + "learning_rate": 3.887964116724835e-06, + "loss": 0.91484869, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.17492676, + "step": 2226, + "time_per_iteration": 5.518297910690308 + }, + { + "auxiliary_loss_clip": 0.01161513, + "auxiliary_loss_mlp": 0.01044665, + "balance_loss_clip": 1.05139422, + "balance_loss_mlp": 1.02702165, + "epoch": 0.1338944836915677, + "flos": 29626387986240.0, + "grad_norm": 2.2906589093228154, + "language_loss": 0.73806012, + "learning_rate": 3.887835559829712e-06, + "loss": 0.76012194, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.1763916, + "step": 2227, + "time_per_iteration": 2.6957662105560303 + }, + { + "auxiliary_loss_clip": 0.01158881, + "auxiliary_loss_mlp": 0.01043381, + "balance_loss_clip": 1.04930472, + "balance_loss_mlp": 1.02563119, + "epoch": 0.1339546069442357, + "flos": 21474176820480.0, + "grad_norm": 1.9831676692451858, + "language_loss": 0.85401535, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.87603796, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.1776123, + "step": 2228, + "time_per_iteration": 2.609774351119995 + }, + { + "auxiliary_loss_clip": 0.01158497, + "auxiliary_loss_mlp": 0.01044185, + "balance_loss_clip": 1.05090284, + "balance_loss_mlp": 1.02710223, + "epoch": 0.13401473019690366, + "flos": 23171528502720.0, + "grad_norm": 1.798506810552355, + "language_loss": 0.80765206, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.82967889, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.17077637, + "step": 2229, + "time_per_iteration": 4.215746641159058 + }, + { + "auxiliary_loss_clip": 0.0116072, + "auxiliary_loss_mlp": 0.01051345, + "balance_loss_clip": 1.05163789, + "balance_loss_mlp": 1.03395236, + "epoch": 0.13407485344957162, + "flos": 32877314575200.0, + "grad_norm": 3.667591351170506, + "language_loss": 0.74148095, + "learning_rate": 3.887449459642378e-06, + "loss": 0.76360166, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.1739502, + "step": 2230, + "time_per_iteration": 4.201992750167847 + }, + { + "auxiliary_loss_clip": 0.01161292, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_clip": 1.05070543, + "balance_loss_mlp": 1.03447008, + "epoch": 0.1341349767022396, + "flos": 24818118383520.0, + "grad_norm": 2.845589782199938, + "language_loss": 0.80025136, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82238138, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.17224121, + "step": 2231, + "time_per_iteration": 2.62872576713562 + }, + { + "auxiliary_loss_clip": 0.01165432, + "auxiliary_loss_mlp": 0.01046866, + "balance_loss_clip": 1.05457997, + "balance_loss_mlp": 1.02844822, + "epoch": 0.13419509995490755, + "flos": 36438397007040.0, + "grad_norm": 1.547769912670362, + "language_loss": 0.72011787, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74224091, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 1.10742188, + "router_z_loss_mlp": 0.18432617, + "step": 2232, + "time_per_iteration": 2.708695650100708 + }, + { + "auxiliary_loss_clip": 0.01165649, + "auxiliary_loss_mlp": 0.01042629, + "balance_loss_clip": 1.05352759, + "balance_loss_mlp": 1.02446103, + "epoch": 0.13425522320757552, + "flos": 32876382677760.0, + "grad_norm": 3.222517385288686, + "language_loss": 0.65699929, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.67908204, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 1.12207031, + "router_z_loss_mlp": 0.18151855, + "step": 2233, + "time_per_iteration": 2.7206945419311523 + }, + { + "auxiliary_loss_clip": 0.01159459, + "auxiliary_loss_mlp": 0.01039286, + "balance_loss_clip": 1.04868615, + "balance_loss_mlp": 1.02133322, + "epoch": 0.1343153464602435, + "flos": 19251230152320.0, + "grad_norm": 2.683675502794049, + "language_loss": 0.81105196, + "learning_rate": 3.886933657403615e-06, + "loss": 0.8330394, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.17956543, + "step": 2234, + "time_per_iteration": 2.7826344966888428 + }, + { + "auxiliary_loss_clip": 0.01164427, + "auxiliary_loss_mlp": 0.0104967, + "balance_loss_clip": 1.05413675, + "balance_loss_mlp": 1.03227687, + "epoch": 0.13437546971291148, + "flos": 29669329952640.0, + "grad_norm": 1.9412651532212657, + "language_loss": 0.82282895, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84496993, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.1739502, + "step": 2235, + "time_per_iteration": 2.67732310295105 + }, + { + "auxiliary_loss_clip": 0.01159888, + "auxiliary_loss_mlp": 0.01052229, + "balance_loss_clip": 1.05008864, + "balance_loss_mlp": 1.03400218, + "epoch": 0.13443559296557944, + "flos": 32519719347840.0, + "grad_norm": 1.6977839995770103, + "language_loss": 0.86319065, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88531184, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.18225098, + "step": 2236, + "time_per_iteration": 2.6757616996765137 + }, + { + "auxiliary_loss_clip": 0.01164933, + "auxiliary_loss_mlp": 0.01043671, + "balance_loss_clip": 1.05358577, + "balance_loss_mlp": 1.02611184, + "epoch": 0.1344957162182474, + "flos": 26596086752160.0, + "grad_norm": 1.9873849160394939, + "language_loss": 0.77283001, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79491603, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 1.11328125, + "router_z_loss_mlp": 0.17541504, + "step": 2237, + "time_per_iteration": 2.6827170848846436 + }, + { + "auxiliary_loss_clip": 0.01162879, + "auxiliary_loss_mlp": 0.01044281, + "balance_loss_clip": 1.05249357, + "balance_loss_mlp": 1.02581584, + "epoch": 0.13455583947091537, + "flos": 24239897801280.0, + "grad_norm": 1.8792064713925771, + "language_loss": 0.7873655, + "learning_rate": 3.886416710321491e-06, + "loss": 0.80943704, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.18457031, + "step": 2238, + "time_per_iteration": 2.6334381103515625 + }, + { + "auxiliary_loss_clip": 0.01159878, + "auxiliary_loss_mlp": 0.01043564, + "balance_loss_clip": 1.05207705, + "balance_loss_mlp": 1.02527761, + "epoch": 0.13461596272358334, + "flos": 37640170657440.0, + "grad_norm": 2.375390368428131, + "language_loss": 0.6785301, + "learning_rate": 3.886287294705924e-06, + "loss": 0.70056462, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.1829834, + "step": 2239, + "time_per_iteration": 2.727158784866333 + }, + { + "auxiliary_loss_clip": 0.01167264, + "auxiliary_loss_mlp": 0.01045834, + "balance_loss_clip": 1.05360556, + "balance_loss_mlp": 1.0283103, + "epoch": 0.1346760859762513, + "flos": 15245169420960.0, + "grad_norm": 2.367852264263723, + "language_loss": 0.81364262, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83577359, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 1.13476562, + "router_z_loss_mlp": 0.1751709, + "step": 2240, + "time_per_iteration": 2.590571403503418 + }, + { + "auxiliary_loss_clip": 0.01164195, + "auxiliary_loss_mlp": 0.0104356, + "balance_loss_clip": 1.05226874, + "balance_loss_mlp": 1.02589297, + "epoch": 0.1347362092289193, + "flos": 26644376999520.0, + "grad_norm": 1.742338682184566, + "language_loss": 0.77819777, + "learning_rate": 3.886028248895093e-06, + "loss": 0.80027533, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.17663574, + "step": 2241, + "time_per_iteration": 2.678067684173584 + }, + { + "auxiliary_loss_clip": 0.01159773, + "auxiliary_loss_mlp": 0.01037982, + "balance_loss_clip": 1.0540204, + "balance_loss_mlp": 1.02231789, + "epoch": 0.13479633248158726, + "flos": 28686885104160.0, + "grad_norm": 1.8210235610277876, + "language_loss": 0.83438957, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85636711, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 1.05761719, + "router_z_loss_mlp": 0.15661621, + "step": 2242, + "time_per_iteration": 2.854538679122925 + }, + { + "auxiliary_loss_clip": 0.01164572, + "auxiliary_loss_mlp": 0.01054451, + "balance_loss_clip": 1.05370569, + "balance_loss_mlp": 1.03538942, + "epoch": 0.13485645573425523, + "flos": 35807307724800.0, + "grad_norm": 2.8635297571157947, + "language_loss": 0.65016097, + "learning_rate": 3.885768917010744e-06, + "loss": 0.67235118, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.1907959, + "step": 2243, + "time_per_iteration": 2.7021114826202393 + }, + { + "auxiliary_loss_clip": 0.01151781, + "auxiliary_loss_mlp": 0.01038683, + "balance_loss_clip": 1.04827511, + "balance_loss_mlp": 1.02131402, + "epoch": 0.1349165789869232, + "flos": 34211560680000.0, + "grad_norm": 1.6212950734815776, + "language_loss": 0.72554374, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74744844, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.17370605, + "step": 2244, + "time_per_iteration": 2.680342197418213 + }, + { + "auxiliary_loss_clip": 0.01158391, + "auxiliary_loss_mlp": 0.01048926, + "balance_loss_clip": 1.05035138, + "balance_loss_mlp": 1.03220057, + "epoch": 0.13497670223959116, + "flos": 27886621062240.0, + "grad_norm": 1.836303753562432, + "language_loss": 0.86443514, + "learning_rate": 3.88550929909221e-06, + "loss": 0.88650835, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.16723633, + "step": 2245, + "time_per_iteration": 2.7041757106781006 + }, + { + "auxiliary_loss_clip": 0.01152974, + "auxiliary_loss_mlp": 0.01048356, + "balance_loss_clip": 1.05022192, + "balance_loss_mlp": 1.03146386, + "epoch": 0.13503682549225912, + "flos": 20138958300960.0, + "grad_norm": 7.559438234648121, + "language_loss": 0.78880703, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81082034, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.16894531, + "step": 2246, + "time_per_iteration": 2.6456754207611084 + }, + { + "auxiliary_loss_clip": 0.01058893, + "auxiliary_loss_mlp": 0.01016979, + "balance_loss_clip": 1.01898313, + "balance_loss_mlp": 1.01476371, + "epoch": 0.1350969487449271, + "flos": 86337957991680.0, + "grad_norm": 3.237964390760194, + "language_loss": 0.60576129, + "learning_rate": 3.885249395178874e-06, + "loss": 0.62652004, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 0.39892578, + "router_z_loss_mlp": 0.02218628, + "step": 2247, + "time_per_iteration": 3.3773083686828613 + }, + { + "auxiliary_loss_clip": 0.01168364, + "auxiliary_loss_mlp": 0.01052585, + "balance_loss_clip": 1.05515337, + "balance_loss_mlp": 1.03333282, + "epoch": 0.13515707199759508, + "flos": 28157522011200.0, + "grad_norm": 1.891709625040162, + "language_loss": 0.81087017, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83307964, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.19238281, + "step": 2248, + "time_per_iteration": 2.674015998840332 + }, + { + "auxiliary_loss_clip": 0.01155011, + "auxiliary_loss_mlp": 0.01037929, + "balance_loss_clip": 1.05094981, + "balance_loss_mlp": 1.02135909, + "epoch": 0.13521719525026304, + "flos": 28292141881440.0, + "grad_norm": 1.715556622012686, + "language_loss": 0.77438706, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79631639, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.16552734, + "step": 2249, + "time_per_iteration": 2.661667823791504 + }, + { + "auxiliary_loss_clip": 0.01159313, + "auxiliary_loss_mlp": 0.01047815, + "balance_loss_clip": 1.05312657, + "balance_loss_mlp": 1.03101873, + "epoch": 0.135277318502931, + "flos": 30250224675360.0, + "grad_norm": 1.6494158359562805, + "language_loss": 0.84565622, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86772752, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 1.06396484, + "router_z_loss_mlp": 0.16796875, + "step": 2250, + "time_per_iteration": 2.7188079357147217 + }, + { + "auxiliary_loss_clip": 0.01161081, + "auxiliary_loss_mlp": 0.01045841, + "balance_loss_clip": 1.05207789, + "balance_loss_mlp": 1.02704215, + "epoch": 0.13533744175559898, + "flos": 26732894555520.0, + "grad_norm": 2.1307132923246606, + "language_loss": 0.8197546, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84182382, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.18798828, + "step": 2251, + "time_per_iteration": 2.6678378582000732 + }, + { + "auxiliary_loss_clip": 0.01159358, + "auxiliary_loss_mlp": 0.01047236, + "balance_loss_clip": 1.05040145, + "balance_loss_mlp": 1.02855551, + "epoch": 0.13539756500826694, + "flos": 25882841126880.0, + "grad_norm": 1.916052315775053, + "language_loss": 0.86041939, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88248539, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.18664551, + "step": 2252, + "time_per_iteration": 2.668086528778076 + }, + { + "auxiliary_loss_clip": 0.01058466, + "auxiliary_loss_mlp": 0.01008585, + "balance_loss_clip": 1.01919842, + "balance_loss_mlp": 1.00658798, + "epoch": 0.1354576882609349, + "flos": 77168382432480.0, + "grad_norm": 1.007827766388552, + "language_loss": 0.61874521, + "learning_rate": 3.884467967864485e-06, + "loss": 0.63941574, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.0199585, + "step": 2253, + "time_per_iteration": 3.3863649368286133 + }, + { + "auxiliary_loss_clip": 0.01160926, + "auxiliary_loss_mlp": 0.0104717, + "balance_loss_clip": 1.05298424, + "balance_loss_mlp": 1.03015852, + "epoch": 0.1355178115136029, + "flos": 31094889305760.0, + "grad_norm": 2.1172014961752192, + "language_loss": 0.89494228, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91702324, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.17016602, + "step": 2254, + "time_per_iteration": 2.9313952922821045 + }, + { + "auxiliary_loss_clip": 0.01167007, + "auxiliary_loss_mlp": 0.01045419, + "balance_loss_clip": 1.0529213, + "balance_loss_mlp": 1.02629781, + "epoch": 0.13557793476627086, + "flos": 26377082088480.0, + "grad_norm": 2.022956151522043, + "language_loss": 0.84050333, + "learning_rate": 3.884206920366591e-06, + "loss": 0.86262751, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 1.140625, + "router_z_loss_mlp": 0.19116211, + "step": 2255, + "time_per_iteration": 2.6527645587921143 + }, + { + "auxiliary_loss_clip": 0.01159994, + "auxiliary_loss_mlp": 0.0103885, + "balance_loss_clip": 1.05108416, + "balance_loss_mlp": 1.02092087, + "epoch": 0.13563805801893883, + "flos": 30418589089440.0, + "grad_norm": 2.384888622359924, + "language_loss": 0.74898231, + "learning_rate": 3.884076289441196e-06, + "loss": 0.7709707, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 1.08886719, + "router_z_loss_mlp": 0.17932129, + "step": 2256, + "time_per_iteration": 2.680280923843384 + }, + { + "auxiliary_loss_clip": 0.01165691, + "auxiliary_loss_mlp": 0.01046105, + "balance_loss_clip": 1.05226946, + "balance_loss_mlp": 1.02749658, + "epoch": 0.1356981812716068, + "flos": 17998492114080.0, + "grad_norm": 2.273068147046163, + "language_loss": 0.83230972, + "learning_rate": 3.88394558707144e-06, + "loss": 0.8544277, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.18603516, + "step": 2257, + "time_per_iteration": 2.6276559829711914 + }, + { + "auxiliary_loss_clip": 0.01167112, + "auxiliary_loss_mlp": 0.01046265, + "balance_loss_clip": 1.05188453, + "balance_loss_mlp": 1.02641654, + "epoch": 0.13575830452427476, + "flos": 13553976365280.0, + "grad_norm": 2.5740067520132643, + "language_loss": 0.81914914, + "learning_rate": 3.883814813262277e-06, + "loss": 0.8412829, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 1.15234375, + "router_z_loss_mlp": 0.19836426, + "step": 2258, + "time_per_iteration": 2.6130406856536865 + }, + { + "auxiliary_loss_clip": 0.01160759, + "auxiliary_loss_mlp": 0.01047576, + "balance_loss_clip": 1.05036569, + "balance_loss_mlp": 1.02695274, + "epoch": 0.13581842777694272, + "flos": 21915670633920.0, + "grad_norm": 2.320828582491252, + "language_loss": 0.82327616, + "learning_rate": 3.883683968018669e-06, + "loss": 0.8453595, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.20617676, + "step": 2259, + "time_per_iteration": 2.6165432929992676 + }, + { + "auxiliary_loss_clip": 0.0116228, + "auxiliary_loss_mlp": 0.01045764, + "balance_loss_clip": 1.05303526, + "balance_loss_mlp": 1.02946806, + "epoch": 0.1358785510296107, + "flos": 27890470203840.0, + "grad_norm": 2.169000249422346, + "language_loss": 0.73560357, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.75768399, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.16296387, + "step": 2260, + "time_per_iteration": 2.650930643081665 + }, + { + "auxiliary_loss_clip": 0.01160899, + "auxiliary_loss_mlp": 0.01049085, + "balance_loss_clip": 1.05207229, + "balance_loss_mlp": 1.03177607, + "epoch": 0.13593867428227868, + "flos": 31407638254560.0, + "grad_norm": 2.4169933949612465, + "language_loss": 0.74940157, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77150142, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.17297363, + "step": 2261, + "time_per_iteration": 2.7127268314361572 + }, + { + "auxiliary_loss_clip": 0.01162746, + "auxiliary_loss_mlp": 0.01045571, + "balance_loss_clip": 1.05155039, + "balance_loss_mlp": 1.02785611, + "epoch": 0.13599879753494665, + "flos": 38131089202080.0, + "grad_norm": 3.162175059465641, + "language_loss": 0.63263392, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65471709, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.17700195, + "step": 2262, + "time_per_iteration": 2.6959047317504883 + }, + { + "auxiliary_loss_clip": 0.01163876, + "auxiliary_loss_mlp": 0.0104368, + "balance_loss_clip": 1.05278492, + "balance_loss_mlp": 1.02615643, + "epoch": 0.1360589207876146, + "flos": 29181409686720.0, + "grad_norm": 4.127452525245647, + "language_loss": 0.8243295, + "learning_rate": 3.883159872799043e-06, + "loss": 0.84640503, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.17529297, + "step": 2263, + "time_per_iteration": 2.6585891246795654 + }, + { + "auxiliary_loss_clip": 0.01166609, + "auxiliary_loss_mlp": 0.01053701, + "balance_loss_clip": 1.05454051, + "balance_loss_mlp": 1.03366232, + "epoch": 0.13611904404028258, + "flos": 24372532324800.0, + "grad_norm": 1.8621020216964352, + "language_loss": 0.88141048, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.90361363, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.20031738, + "step": 2264, + "time_per_iteration": 2.6427037715911865 + }, + { + "auxiliary_loss_clip": 0.01168454, + "auxiliary_loss_mlp": 0.01044542, + "balance_loss_clip": 1.05537379, + "balance_loss_mlp": 1.0254091, + "epoch": 0.13617916729295054, + "flos": 18718301538720.0, + "grad_norm": 3.1194406096155736, + "language_loss": 0.72418559, + "learning_rate": 3.882897396711683e-06, + "loss": 0.74631554, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 1.12988281, + "router_z_loss_mlp": 0.19116211, + "step": 2265, + "time_per_iteration": 5.474350929260254 + }, + { + "auxiliary_loss_clip": 0.01160265, + "auxiliary_loss_mlp": 0.01038545, + "balance_loss_clip": 1.05261731, + "balance_loss_mlp": 1.02074671, + "epoch": 0.1362392905456185, + "flos": 33497059019040.0, + "grad_norm": 2.3678968290045335, + "language_loss": 0.66392708, + "learning_rate": 3.882766051566027e-06, + "loss": 0.68591523, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.17785645, + "step": 2266, + "time_per_iteration": 2.9065065383911133 + }, + { + "auxiliary_loss_clip": 0.01160068, + "auxiliary_loss_mlp": 0.01051648, + "balance_loss_clip": 1.05269098, + "balance_loss_mlp": 1.0337429, + "epoch": 0.1362994137982865, + "flos": 30516425619840.0, + "grad_norm": 1.5371179892968403, + "language_loss": 0.7637791, + "learning_rate": 3.882634635025694e-06, + "loss": 0.78589624, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.17907715, + "step": 2267, + "time_per_iteration": 2.697601318359375 + }, + { + "auxiliary_loss_clip": 0.01163403, + "auxiliary_loss_mlp": 0.01044387, + "balance_loss_clip": 1.05329955, + "balance_loss_mlp": 1.026577, + "epoch": 0.13635953705095447, + "flos": 24773758312320.0, + "grad_norm": 1.8786047756782696, + "language_loss": 0.81462896, + "learning_rate": 3.882503147095667e-06, + "loss": 0.83670682, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.17810059, + "step": 2268, + "time_per_iteration": 2.620213508605957 + }, + { + "auxiliary_loss_clip": 0.01162987, + "auxiliary_loss_mlp": 0.01041235, + "balance_loss_clip": 1.05599701, + "balance_loss_mlp": 1.0215888, + "epoch": 0.13641966030362243, + "flos": 38264372002080.0, + "grad_norm": 1.6753003834771403, + "language_loss": 0.76347619, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78551841, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.19641113, + "step": 2269, + "time_per_iteration": 5.722122669219971 + }, + { + "auxiliary_loss_clip": 0.01167464, + "auxiliary_loss_mlp": 0.01045778, + "balance_loss_clip": 1.05639708, + "balance_loss_mlp": 1.02739584, + "epoch": 0.1364797835562904, + "flos": 24988062971520.0, + "grad_norm": 1.8812108999495767, + "language_loss": 0.80643666, + "learning_rate": 3.882239957086477e-06, + "loss": 0.82856905, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.18371582, + "step": 2270, + "time_per_iteration": 2.615091562271118 + }, + { + "auxiliary_loss_clip": 0.01169383, + "auxiliary_loss_mlp": 0.01049647, + "balance_loss_clip": 1.05612636, + "balance_loss_mlp": 1.03103805, + "epoch": 0.13653990680895836, + "flos": 15957685735200.0, + "grad_norm": 2.7253304249450983, + "language_loss": 0.75889444, + "learning_rate": 3.882108255017295e-06, + "loss": 0.78108478, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 1.1328125, + "router_z_loss_mlp": 0.18603516, + "step": 2271, + "time_per_iteration": 2.623727798461914 + }, + { + "auxiliary_loss_clip": 0.0116667, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_clip": 1.05604959, + "balance_loss_mlp": 1.02799344, + "epoch": 0.13660003006162633, + "flos": 20678086058400.0, + "grad_norm": 4.564428800964784, + "language_loss": 0.80731487, + "learning_rate": 3.881976481578379e-06, + "loss": 0.82945335, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.19177246, + "step": 2272, + "time_per_iteration": 2.706712245941162 + }, + { + "auxiliary_loss_clip": 0.01062552, + "auxiliary_loss_mlp": 0.01008937, + "balance_loss_clip": 1.02331066, + "balance_loss_mlp": 1.00686097, + "epoch": 0.1366601533142943, + "flos": 83807083931040.0, + "grad_norm": 0.6878794402443773, + "language_loss": 0.607283, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62799788, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 0.02076721, + "step": 2273, + "time_per_iteration": 3.3936123847961426 + }, + { + "auxiliary_loss_clip": 0.01162901, + "auxiliary_loss_mlp": 0.01043826, + "balance_loss_clip": 1.05458808, + "balance_loss_mlp": 1.02508605, + "epoch": 0.13672027656696228, + "flos": 23481805897440.0, + "grad_norm": 2.585971733972019, + "language_loss": 0.77165842, + "learning_rate": 3.881712720611336e-06, + "loss": 0.79372567, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.18737793, + "step": 2274, + "time_per_iteration": 2.617706298828125 + }, + { + "auxiliary_loss_clip": 0.01163709, + "auxiliary_loss_mlp": 0.01040734, + "balance_loss_clip": 1.0542587, + "balance_loss_mlp": 1.02155352, + "epoch": 0.13678039981963025, + "flos": 29937880899360.0, + "grad_norm": 2.210331605324177, + "language_loss": 0.78163636, + "learning_rate": 3.881580733093211e-06, + "loss": 0.80368078, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.19152832, + "step": 2275, + "time_per_iteration": 2.67859148979187 + }, + { + "auxiliary_loss_clip": 0.01163485, + "auxiliary_loss_mlp": 0.01040701, + "balance_loss_clip": 1.05437899, + "balance_loss_mlp": 1.02273631, + "epoch": 0.13684052307229821, + "flos": 19119365457120.0, + "grad_norm": 2.3198287164464997, + "language_loss": 0.81980872, + "learning_rate": 3.881448674225356e-06, + "loss": 0.84185058, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.1796875, + "step": 2276, + "time_per_iteration": 2.6131293773651123 + }, + { + "auxiliary_loss_clip": 0.01172812, + "auxiliary_loss_mlp": 0.01054468, + "balance_loss_clip": 1.05600572, + "balance_loss_mlp": 1.03424978, + "epoch": 0.13690064632496618, + "flos": 34611368562720.0, + "grad_norm": 2.8011596432037273, + "language_loss": 0.70108664, + "learning_rate": 3.881316544012779e-06, + "loss": 0.72335947, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 1.16796875, + "router_z_loss_mlp": 0.20214844, + "step": 2277, + "time_per_iteration": 2.769835948944092 + }, + { + "auxiliary_loss_clip": 0.01168624, + "auxiliary_loss_mlp": 0.01052382, + "balance_loss_clip": 1.05694294, + "balance_loss_mlp": 1.03314185, + "epoch": 0.13696076957763414, + "flos": 28558707481440.0, + "grad_norm": 2.1690945753776094, + "language_loss": 0.8036648, + "learning_rate": 3.88118434246049e-06, + "loss": 0.82587492, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.19250488, + "step": 2278, + "time_per_iteration": 2.8730251789093018 + }, + { + "auxiliary_loss_clip": 0.01168843, + "auxiliary_loss_mlp": 0.01043077, + "balance_loss_clip": 1.05910146, + "balance_loss_mlp": 1.02455187, + "epoch": 0.1370208928303021, + "flos": 45388522212480.0, + "grad_norm": 2.036416390314655, + "language_loss": 0.7487601, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77087933, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.18518066, + "step": 2279, + "time_per_iteration": 2.8079700469970703 + }, + { + "auxiliary_loss_clip": 0.01168369, + "auxiliary_loss_mlp": 0.01045852, + "balance_loss_clip": 1.05485606, + "balance_loss_mlp": 1.02774382, + "epoch": 0.13708101608297008, + "flos": 32917339297440.0, + "grad_norm": 2.0817141210277192, + "language_loss": 0.76700532, + "learning_rate": 3.880919725356831e-06, + "loss": 0.7891475, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 1.13574219, + "router_z_loss_mlp": 0.1809082, + "step": 2280, + "time_per_iteration": 2.6616690158843994 + }, + { + "auxiliary_loss_clip": 0.01163087, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_clip": 1.05377328, + "balance_loss_mlp": 1.02988124, + "epoch": 0.13714113933563807, + "flos": 39727322454240.0, + "grad_norm": 2.22639786894761, + "language_loss": 0.79371464, + "learning_rate": 3.880787309815496e-06, + "loss": 0.81582153, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.17724609, + "step": 2281, + "time_per_iteration": 2.737859010696411 + }, + { + "auxiliary_loss_clip": 0.01172209, + "auxiliary_loss_mlp": 0.01053567, + "balance_loss_clip": 1.0571146, + "balance_loss_mlp": 1.03516114, + "epoch": 0.13720126258830603, + "flos": 19646986307040.0, + "grad_norm": 1.6062213209866474, + "language_loss": 0.83683187, + "learning_rate": 3.880654822954518e-06, + "loss": 0.85908961, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.18408203, + "step": 2282, + "time_per_iteration": 2.596400260925293 + }, + { + "auxiliary_loss_clip": 0.01163462, + "auxiliary_loss_mlp": 0.01053482, + "balance_loss_clip": 1.05355763, + "balance_loss_mlp": 1.03569627, + "epoch": 0.137261385840974, + "flos": 23126965845120.0, + "grad_norm": 1.6473651412206398, + "language_loss": 0.7364018, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.75857121, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.17797852, + "step": 2283, + "time_per_iteration": 2.6401915550231934 + }, + { + "auxiliary_loss_clip": 0.0116893, + "auxiliary_loss_mlp": 0.01052586, + "balance_loss_clip": 1.06008291, + "balance_loss_mlp": 1.0357064, + "epoch": 0.13732150909364196, + "flos": 28425181577760.0, + "grad_norm": 2.4828313091498146, + "language_loss": 0.8429122, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86512733, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.16870117, + "step": 2284, + "time_per_iteration": 2.6663174629211426 + }, + { + "auxiliary_loss_clip": 0.01170508, + "auxiliary_loss_mlp": 0.01056318, + "balance_loss_clip": 1.05445623, + "balance_loss_mlp": 1.03654075, + "epoch": 0.13738163234630993, + "flos": 35815127559840.0, + "grad_norm": 2.4294710502448003, + "language_loss": 0.74792391, + "learning_rate": 3.880256934503974e-06, + "loss": 0.77019215, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 1.16113281, + "router_z_loss_mlp": 0.19787598, + "step": 2285, + "time_per_iteration": 2.698397397994995 + }, + { + "auxiliary_loss_clip": 0.01162884, + "auxiliary_loss_mlp": 0.01048309, + "balance_loss_clip": 1.05343366, + "balance_loss_mlp": 1.03096414, + "epoch": 0.1374417555989779, + "flos": 32520448658880.0, + "grad_norm": 1.7143099291518291, + "language_loss": 0.74708092, + "learning_rate": 3.880124162414689e-06, + "loss": 0.76919287, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.17346191, + "step": 2286, + "time_per_iteration": 2.6968417167663574 + }, + { + "auxiliary_loss_clip": 0.01169938, + "auxiliary_loss_mlp": 0.01043758, + "balance_loss_clip": 1.05607939, + "balance_loss_mlp": 1.02414823, + "epoch": 0.1375018788516459, + "flos": 34658402774400.0, + "grad_norm": 2.548386109930482, + "language_loss": 0.86078918, + "learning_rate": 3.879991319030908e-06, + "loss": 0.88292617, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.19616699, + "step": 2287, + "time_per_iteration": 2.6719717979431152 + }, + { + "auxiliary_loss_clip": 0.01168042, + "auxiliary_loss_mlp": 0.01048416, + "balance_loss_clip": 1.05603433, + "balance_loss_mlp": 1.03018832, + "epoch": 0.13756200210431385, + "flos": 45653710224960.0, + "grad_norm": 1.8128842403932106, + "language_loss": 0.68199414, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70415866, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 1.11914062, + "router_z_loss_mlp": 0.18249512, + "step": 2288, + "time_per_iteration": 2.79106068611145 + }, + { + "auxiliary_loss_clip": 0.01166073, + "auxiliary_loss_mlp": 0.01056672, + "balance_loss_clip": 1.05491769, + "balance_loss_mlp": 1.03745568, + "epoch": 0.13762212535698182, + "flos": 27712179056160.0, + "grad_norm": 3.64855661396346, + "language_loss": 0.86882603, + "learning_rate": 3.879725418400005e-06, + "loss": 0.89105344, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.1920166, + "step": 2289, + "time_per_iteration": 2.642775058746338 + }, + { + "auxiliary_loss_clip": 0.01159164, + "auxiliary_loss_mlp": 0.01048479, + "balance_loss_clip": 1.05182326, + "balance_loss_mlp": 1.03093183, + "epoch": 0.13768224860964978, + "flos": 29226620620800.0, + "grad_norm": 2.206357498001803, + "language_loss": 0.74475694, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76683336, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.17529297, + "step": 2290, + "time_per_iteration": 2.786782741546631 + }, + { + "auxiliary_loss_clip": 0.01056506, + "auxiliary_loss_mlp": 0.01013175, + "balance_loss_clip": 1.0178597, + "balance_loss_mlp": 1.0109179, + "epoch": 0.13774237186231775, + "flos": 77591609303040.0, + "grad_norm": 0.7024578315609067, + "language_loss": 0.51623243, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53692925, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 0.38647461, + "router_z_loss_mlp": 0.02258301, + "step": 2291, + "time_per_iteration": 3.4132158756256104 + }, + { + "auxiliary_loss_clip": 0.01163222, + "auxiliary_loss_mlp": 0.01046283, + "balance_loss_clip": 1.05162597, + "balance_loss_mlp": 1.0274241, + "epoch": 0.1378024951149857, + "flos": 29626144882560.0, + "grad_norm": 2.019552028578782, + "language_loss": 0.71056247, + "learning_rate": 3.879326032870952e-06, + "loss": 0.73265755, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.1887207, + "step": 2292, + "time_per_iteration": 2.71109676361084 + }, + { + "auxiliary_loss_clip": 0.011636, + "auxiliary_loss_mlp": 0.01042694, + "balance_loss_clip": 1.05364859, + "balance_loss_mlp": 1.02518249, + "epoch": 0.13786261836765368, + "flos": 17109143274240.0, + "grad_norm": 4.492661023379105, + "language_loss": 0.79963028, + "learning_rate": 3.879192761826071e-06, + "loss": 0.8216933, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.17504883, + "step": 2293, + "time_per_iteration": 2.649613380432129 + }, + { + "auxiliary_loss_clip": 0.01164649, + "auxiliary_loss_mlp": 0.01046058, + "balance_loss_clip": 1.05208349, + "balance_loss_mlp": 1.02816498, + "epoch": 0.13792274162032167, + "flos": 35238284565120.0, + "grad_norm": 2.231234033094736, + "language_loss": 0.78143311, + "learning_rate": 3.879059419522011e-06, + "loss": 0.80354017, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.17907715, + "step": 2294, + "time_per_iteration": 2.72904896736145 + }, + { + "auxiliary_loss_clip": 0.01162709, + "auxiliary_loss_mlp": 0.01043375, + "balance_loss_clip": 1.05426335, + "balance_loss_mlp": 1.02692366, + "epoch": 0.13798286487298964, + "flos": 25796592538560.0, + "grad_norm": 3.7904488733490527, + "language_loss": 0.80130339, + "learning_rate": 3.878926005963831e-06, + "loss": 0.8233642, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.16448975, + "step": 2295, + "time_per_iteration": 2.6680867671966553 + }, + { + "auxiliary_loss_clip": 0.01161374, + "auxiliary_loss_mlp": 0.01043371, + "balance_loss_clip": 1.05123758, + "balance_loss_mlp": 1.0249176, + "epoch": 0.1380429881256576, + "flos": 27439819485120.0, + "grad_norm": 1.8404641779319917, + "language_loss": 0.78185552, + "learning_rate": 3.878792521156588e-06, + "loss": 0.80390298, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.18444824, + "step": 2296, + "time_per_iteration": 2.6303915977478027 + }, + { + "auxiliary_loss_clip": 0.0116593, + "auxiliary_loss_mlp": 0.01054148, + "balance_loss_clip": 1.05564141, + "balance_loss_mlp": 1.03500342, + "epoch": 0.13810311137832557, + "flos": 26105006138400.0, + "grad_norm": 1.8349662924157684, + "language_loss": 0.78758895, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.80978972, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.19140625, + "step": 2297, + "time_per_iteration": 2.672872304916382 + }, + { + "auxiliary_loss_clip": 0.01162206, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.05316973, + "balance_loss_mlp": 1.02432227, + "epoch": 0.13816323463099353, + "flos": 31713701852160.0, + "grad_norm": 2.185271554555207, + "language_loss": 0.69707793, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71911675, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.17358398, + "step": 2298, + "time_per_iteration": 2.673882007598877 + }, + { + "auxiliary_loss_clip": 0.01168423, + "auxiliary_loss_mlp": 0.01048721, + "balance_loss_clip": 1.05511832, + "balance_loss_mlp": 1.03061295, + "epoch": 0.1382233578836615, + "flos": 23481886932000.0, + "grad_norm": 2.3862085217669415, + "language_loss": 0.8665669, + "learning_rate": 3.878391639291116e-06, + "loss": 0.88873839, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.1809082, + "step": 2299, + "time_per_iteration": 2.6963064670562744 + }, + { + "auxiliary_loss_clip": 0.0116383, + "auxiliary_loss_mlp": 0.01053153, + "balance_loss_clip": 1.0520097, + "balance_loss_mlp": 1.03367424, + "epoch": 0.1382834811363295, + "flos": 31318431904800.0, + "grad_norm": 2.856962936616559, + "language_loss": 0.75486916, + "learning_rate": 3.878257869538267e-06, + "loss": 0.77703905, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 1.11767578, + "router_z_loss_mlp": 0.19470215, + "step": 2300, + "time_per_iteration": 2.6700565814971924 + }, + { + "auxiliary_loss_clip": 0.01164552, + "auxiliary_loss_mlp": 0.0104629, + "balance_loss_clip": 1.05510736, + "balance_loss_mlp": 1.02807474, + "epoch": 0.13834360438899745, + "flos": 24141372477120.0, + "grad_norm": 3.2830631484124404, + "language_loss": 0.82698762, + "learning_rate": 3.878124028561692e-06, + "loss": 0.84909606, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.18212891, + "step": 2301, + "time_per_iteration": 2.606860399246216 + }, + { + "auxiliary_loss_clip": 0.011597, + "auxiliary_loss_mlp": 0.01041197, + "balance_loss_clip": 1.05227375, + "balance_loss_mlp": 1.02339888, + "epoch": 0.13840372764166542, + "flos": 32521056418080.0, + "grad_norm": 1.9822161583490483, + "language_loss": 0.86056733, + "learning_rate": 3.877990116366466e-06, + "loss": 0.88257629, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 1.07373047, + "router_z_loss_mlp": 0.17785645, + "step": 2302, + "time_per_iteration": 2.6790144443511963 + }, + { + "auxiliary_loss_clip": 0.01054143, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.01584744, + "balance_loss_mlp": 1.03007817, + "epoch": 0.13846385089433338, + "flos": 86037688365120.0, + "grad_norm": 0.7564917745128922, + "language_loss": 0.65623391, + "learning_rate": 3.877856132957667e-06, + "loss": 0.6770981, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 0.02201843, + "step": 2303, + "time_per_iteration": 3.504446268081665 + }, + { + "auxiliary_loss_clip": 0.01159933, + "auxiliary_loss_mlp": 0.01041687, + "balance_loss_clip": 1.05215597, + "balance_loss_mlp": 1.02444959, + "epoch": 0.13852397414700135, + "flos": 21168599430240.0, + "grad_norm": 2.533330875112701, + "language_loss": 0.78482157, + "learning_rate": 3.877722078340374e-06, + "loss": 0.8068378, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.17236328, + "step": 2304, + "time_per_iteration": 2.655144214630127 + }, + { + "auxiliary_loss_clip": 0.01166368, + "auxiliary_loss_mlp": 0.01043104, + "balance_loss_clip": 1.05451906, + "balance_loss_mlp": 1.02563953, + "epoch": 0.13858409739966931, + "flos": 26287794704160.0, + "grad_norm": 1.9290169627457752, + "language_loss": 0.77739555, + "learning_rate": 3.877587952519672e-06, + "loss": 0.79949021, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.17456055, + "step": 2305, + "time_per_iteration": 4.076413631439209 + }, + { + "auxiliary_loss_clip": 0.01156449, + "auxiliary_loss_mlp": 0.01043704, + "balance_loss_clip": 1.04912114, + "balance_loss_mlp": 1.02608538, + "epoch": 0.13864422065233728, + "flos": 26331911671680.0, + "grad_norm": 1.8404831119082166, + "language_loss": 0.87614006, + "learning_rate": 3.877453755500647e-06, + "loss": 0.89814156, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.17614746, + "step": 2306, + "time_per_iteration": 2.6377532482147217 + }, + { + "auxiliary_loss_clip": 0.01053028, + "auxiliary_loss_mlp": 0.01007736, + "balance_loss_clip": 1.01486921, + "balance_loss_mlp": 1.00547671, + "epoch": 0.13870434390500527, + "flos": 65124917455680.0, + "grad_norm": 0.8735862676169597, + "language_loss": 0.59080493, + "learning_rate": 3.877319487288387e-06, + "loss": 0.61141253, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 0.3815918, + "router_z_loss_mlp": 0.02261353, + "step": 2307, + "time_per_iteration": 3.29575514793396 + }, + { + "auxiliary_loss_clip": 0.01168611, + "auxiliary_loss_mlp": 0.01041333, + "balance_loss_clip": 1.05374956, + "balance_loss_mlp": 1.02260518, + "epoch": 0.13876446715767324, + "flos": 27535022392320.0, + "grad_norm": 1.7199813396446917, + "language_loss": 0.79503, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81712937, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 1.15039062, + "router_z_loss_mlp": 0.18725586, + "step": 2308, + "time_per_iteration": 4.100496292114258 + }, + { + "auxiliary_loss_clip": 0.01160232, + "auxiliary_loss_mlp": 0.01043035, + "balance_loss_clip": 1.05203199, + "balance_loss_mlp": 1.02534473, + "epoch": 0.1388245904103412, + "flos": 25264474270560.0, + "grad_norm": 9.784493119043116, + "language_loss": 0.77529413, + "learning_rate": 3.877050737304533e-06, + "loss": 0.7973268, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.17687988, + "step": 2309, + "time_per_iteration": 4.193415880203247 + }, + { + "auxiliary_loss_clip": 0.01166671, + "auxiliary_loss_mlp": 0.01044802, + "balance_loss_clip": 1.05221117, + "balance_loss_mlp": 1.0267899, + "epoch": 0.13888471366300917, + "flos": 25081240014720.0, + "grad_norm": 1.8793385878428994, + "language_loss": 0.67536038, + "learning_rate": 3.876916255543129e-06, + "loss": 0.69747508, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.18017578, + "step": 2310, + "time_per_iteration": 2.7332987785339355 + }, + { + "auxiliary_loss_clip": 0.01163545, + "auxiliary_loss_mlp": 0.01047003, + "balance_loss_clip": 1.05252361, + "balance_loss_mlp": 1.0273211, + "epoch": 0.13894483691567713, + "flos": 16885317054240.0, + "grad_norm": 1.9968570138173378, + "language_loss": 0.83998835, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.86209381, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 1.11132812, + "router_z_loss_mlp": 0.19702148, + "step": 2311, + "time_per_iteration": 2.5968596935272217 + }, + { + "auxiliary_loss_clip": 0.01169579, + "auxiliary_loss_mlp": 0.0104539, + "balance_loss_clip": 1.05499542, + "balance_loss_mlp": 1.02759218, + "epoch": 0.1390049601683451, + "flos": 34204875328800.0, + "grad_norm": 2.341170583454056, + "language_loss": 0.8191871, + "learning_rate": 3.876647078506866e-06, + "loss": 0.84133679, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.17810059, + "step": 2312, + "time_per_iteration": 2.6843838691711426 + }, + { + "auxiliary_loss_clip": 0.01167284, + "auxiliary_loss_mlp": 0.01041773, + "balance_loss_clip": 1.05379105, + "balance_loss_mlp": 1.02470207, + "epoch": 0.13906508342101306, + "flos": 32649598696320.0, + "grad_norm": 1.8155021811154723, + "language_loss": 0.8659631, + "learning_rate": 3.876512383242215e-06, + "loss": 0.88805377, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.17077637, + "step": 2313, + "time_per_iteration": 2.6905288696289062 + }, + { + "auxiliary_loss_clip": 0.01162216, + "auxiliary_loss_mlp": 0.01052602, + "balance_loss_clip": 1.05228734, + "balance_loss_mlp": 1.03376698, + "epoch": 0.13912520667368106, + "flos": 29938083485760.0, + "grad_norm": 1.8972198927704391, + "language_loss": 0.80164969, + "learning_rate": 3.876377616820024e-06, + "loss": 0.82379794, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.18835449, + "step": 2314, + "time_per_iteration": 2.9829351902008057 + }, + { + "auxiliary_loss_clip": 0.01162113, + "auxiliary_loss_mlp": 0.0104689, + "balance_loss_clip": 1.05143595, + "balance_loss_mlp": 1.02900839, + "epoch": 0.13918532992634902, + "flos": 23652034106400.0, + "grad_norm": 2.679027859253834, + "language_loss": 0.85580599, + "learning_rate": 3.876242779245409e-06, + "loss": 0.87789601, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.17883301, + "step": 2315, + "time_per_iteration": 2.6133134365081787 + }, + { + "auxiliary_loss_clip": 0.01161846, + "auxiliary_loss_mlp": 0.0105305, + "balance_loss_clip": 1.05098999, + "balance_loss_mlp": 1.03326118, + "epoch": 0.139245453179017, + "flos": 26019405826560.0, + "grad_norm": 2.3091936569354163, + "language_loss": 0.77249515, + "learning_rate": 3.876107870523477e-06, + "loss": 0.79464412, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.19775391, + "step": 2316, + "time_per_iteration": 2.631230592727661 + }, + { + "auxiliary_loss_clip": 0.01161464, + "auxiliary_loss_mlp": 0.01065392, + "balance_loss_clip": 1.05101061, + "balance_loss_mlp": 1.04531717, + "epoch": 0.13930557643168495, + "flos": 23794554846240.0, + "grad_norm": 1.6362302836976286, + "language_loss": 0.76992738, + "learning_rate": 3.875972890659349e-06, + "loss": 0.79219592, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.20043945, + "step": 2317, + "time_per_iteration": 2.641723871231079 + }, + { + "auxiliary_loss_clip": 0.01166547, + "auxiliary_loss_mlp": 0.01051129, + "balance_loss_clip": 1.05389154, + "balance_loss_mlp": 1.03269935, + "epoch": 0.13936569968435292, + "flos": 31006695888000.0, + "grad_norm": 2.3257736380663143, + "language_loss": 0.80623513, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82841194, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 1.125, + "router_z_loss_mlp": 0.18432617, + "step": 2318, + "time_per_iteration": 2.677027463912964 + }, + { + "auxiliary_loss_clip": 0.01053247, + "auxiliary_loss_mlp": 0.01020247, + "balance_loss_clip": 1.01581657, + "balance_loss_mlp": 1.01773453, + "epoch": 0.13942582293702088, + "flos": 86354610593760.0, + "grad_norm": 0.8646729072919142, + "language_loss": 0.59061635, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61135125, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 0.37402344, + "router_z_loss_mlp": 0.02511597, + "step": 2319, + "time_per_iteration": 3.3349010944366455 + }, + { + "auxiliary_loss_clip": 0.01164613, + "auxiliary_loss_mlp": 0.01054942, + "balance_loss_clip": 1.05096936, + "balance_loss_mlp": 1.03636932, + "epoch": 0.13948594618968888, + "flos": 43429142865600.0, + "grad_norm": 2.6452143679189475, + "language_loss": 0.64924204, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67143762, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 1.13769531, + "router_z_loss_mlp": 0.18566895, + "step": 2320, + "time_per_iteration": 2.7826876640319824 + }, + { + "auxiliary_loss_clip": 0.01159478, + "auxiliary_loss_mlp": 0.0104806, + "balance_loss_clip": 1.05114961, + "balance_loss_mlp": 1.02979708, + "epoch": 0.13954606944235684, + "flos": 25752678157440.0, + "grad_norm": 1.631562448741613, + "language_loss": 0.71047378, + "learning_rate": 3.875432259883256e-06, + "loss": 0.73254913, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 1.08349609, + "router_z_loss_mlp": 0.18261719, + "step": 2321, + "time_per_iteration": 2.6617698669433594 + }, + { + "auxiliary_loss_clip": 0.0116179, + "auxiliary_loss_mlp": 0.01054523, + "balance_loss_clip": 1.04941475, + "balance_loss_mlp": 1.03453135, + "epoch": 0.1396061926950248, + "flos": 30559732241760.0, + "grad_norm": 2.4252182701650193, + "language_loss": 0.8596276, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88179076, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.19995117, + "step": 2322, + "time_per_iteration": 2.6763222217559814 + }, + { + "auxiliary_loss_clip": 0.01154564, + "auxiliary_loss_mlp": 0.01058483, + "balance_loss_clip": 1.04965067, + "balance_loss_mlp": 1.04135227, + "epoch": 0.13966631594769277, + "flos": 45922342206240.0, + "grad_norm": 1.8193397921006773, + "language_loss": 0.6698215, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69195199, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.17138672, + "step": 2323, + "time_per_iteration": 2.865600109100342 + }, + { + "auxiliary_loss_clip": 0.01170453, + "auxiliary_loss_mlp": 0.01054705, + "balance_loss_clip": 1.05345333, + "balance_loss_mlp": 1.03572726, + "epoch": 0.13972643920036074, + "flos": 20366066420640.0, + "grad_norm": 4.140800979683164, + "language_loss": 0.88784742, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91009897, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 1.16894531, + "router_z_loss_mlp": 0.18994141, + "step": 2324, + "time_per_iteration": 2.651266098022461 + }, + { + "auxiliary_loss_clip": 0.01162542, + "auxiliary_loss_mlp": 0.0106397, + "balance_loss_clip": 1.05022597, + "balance_loss_mlp": 1.04561198, + "epoch": 0.1397865624530287, + "flos": 28468812337920.0, + "grad_norm": 2.339717487249937, + "language_loss": 0.70725262, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.7295177, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.18359375, + "step": 2325, + "time_per_iteration": 2.679802894592285 + }, + { + "auxiliary_loss_clip": 0.01163062, + "auxiliary_loss_mlp": 0.01063418, + "balance_loss_clip": 1.05247855, + "balance_loss_mlp": 1.04546475, + "epoch": 0.13984668570569667, + "flos": 27794821606560.0, + "grad_norm": 1.7873808038545906, + "language_loss": 0.8189401, + "learning_rate": 3.874754871328688e-06, + "loss": 0.84120488, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 1.10449219, + "router_z_loss_mlp": 0.17944336, + "step": 2326, + "time_per_iteration": 2.696807384490967 + }, + { + "auxiliary_loss_clip": 0.01157585, + "auxiliary_loss_mlp": 0.01051324, + "balance_loss_clip": 1.05155015, + "balance_loss_mlp": 1.03481317, + "epoch": 0.13990680895836466, + "flos": 23750518913280.0, + "grad_norm": 2.1259129758598134, + "language_loss": 0.8918525, + "learning_rate": 3.874619180324534e-06, + "loss": 0.9139415, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.16491699, + "step": 2327, + "time_per_iteration": 2.817988634109497 + }, + { + "auxiliary_loss_clip": 0.01157548, + "auxiliary_loss_mlp": 0.01063596, + "balance_loss_clip": 1.0506115, + "balance_loss_mlp": 1.04547632, + "epoch": 0.13996693221103262, + "flos": 24774041933280.0, + "grad_norm": 2.156617213163244, + "language_loss": 0.85001922, + "learning_rate": 3.874483418234632e-06, + "loss": 0.87223065, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.18127441, + "step": 2328, + "time_per_iteration": 2.6594314575195312 + }, + { + "auxiliary_loss_clip": 0.01159835, + "auxiliary_loss_mlp": 0.01048532, + "balance_loss_clip": 1.05000758, + "balance_loss_mlp": 1.03032899, + "epoch": 0.1400270554637006, + "flos": 32479167900960.0, + "grad_norm": 1.6922678144749668, + "language_loss": 0.73650622, + "learning_rate": 3.874347585064131e-06, + "loss": 0.75858992, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.18200684, + "step": 2329, + "time_per_iteration": 2.6855428218841553 + }, + { + "auxiliary_loss_clip": 0.01159681, + "auxiliary_loss_mlp": 0.01049649, + "balance_loss_clip": 1.04833245, + "balance_loss_mlp": 1.03179157, + "epoch": 0.14008717871636855, + "flos": 23661353080800.0, + "grad_norm": 2.266426140317667, + "language_loss": 0.78464782, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80674112, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 1.11425781, + "router_z_loss_mlp": 0.17858887, + "step": 2330, + "time_per_iteration": 2.625584840774536 + }, + { + "auxiliary_loss_clip": 0.01157492, + "auxiliary_loss_mlp": 0.01045168, + "balance_loss_clip": 1.04951584, + "balance_loss_mlp": 1.02746582, + "epoch": 0.14014730196903652, + "flos": 18673900950240.0, + "grad_norm": 1.9639029936087726, + "language_loss": 0.71861404, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74064064, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.17687988, + "step": 2331, + "time_per_iteration": 2.6246554851531982 + }, + { + "auxiliary_loss_clip": 0.01154087, + "auxiliary_loss_mlp": 0.01055869, + "balance_loss_clip": 1.05199718, + "balance_loss_mlp": 1.03822577, + "epoch": 0.14020742522170448, + "flos": 18229003685280.0, + "grad_norm": 1.5829291892294313, + "language_loss": 0.72573698, + "learning_rate": 3.873939659120557e-06, + "loss": 0.74783653, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.1763916, + "step": 2332, + "time_per_iteration": 2.6621363162994385 + }, + { + "auxiliary_loss_clip": 0.01052386, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.01518703, + "balance_loss_mlp": 1.03337812, + "epoch": 0.14026754847437245, + "flos": 59575532689440.0, + "grad_norm": 0.8440441688360057, + "language_loss": 0.56119061, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58207023, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 0.37207031, + "router_z_loss_mlp": 0.02197266, + "step": 2333, + "time_per_iteration": 3.115232467651367 + }, + { + "auxiliary_loss_clip": 0.01158226, + "auxiliary_loss_mlp": 0.01045874, + "balance_loss_clip": 1.05053258, + "balance_loss_mlp": 1.02821946, + "epoch": 0.14032767172704044, + "flos": 31449202633440.0, + "grad_norm": 1.845025194784725, + "language_loss": 0.82864916, + "learning_rate": 3.873667353183016e-06, + "loss": 0.85069013, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.17663574, + "step": 2334, + "time_per_iteration": 2.693260908126831 + }, + { + "auxiliary_loss_clip": 0.01159338, + "auxiliary_loss_mlp": 0.01045254, + "balance_loss_clip": 1.05092144, + "balance_loss_mlp": 1.02812338, + "epoch": 0.1403877949797084, + "flos": 25888108373280.0, + "grad_norm": 1.9191789020147918, + "language_loss": 0.81150305, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83354896, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.17138672, + "step": 2335, + "time_per_iteration": 2.654733419418335 + }, + { + "auxiliary_loss_clip": 0.01166163, + "auxiliary_loss_mlp": 0.01045605, + "balance_loss_clip": 1.05319512, + "balance_loss_mlp": 1.02471972, + "epoch": 0.14044791823237637, + "flos": 27757227921120.0, + "grad_norm": 1.6841897953047373, + "language_loss": 0.81959724, + "learning_rate": 3.873394763046862e-06, + "loss": 0.84171486, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 1.12744141, + "router_z_loss_mlp": 0.20898438, + "step": 2336, + "time_per_iteration": 2.7316062450408936 + }, + { + "auxiliary_loss_clip": 0.01159436, + "auxiliary_loss_mlp": 0.01044234, + "balance_loss_clip": 1.051265, + "balance_loss_mlp": 1.02654314, + "epoch": 0.14050804148504434, + "flos": 28021605588000.0, + "grad_norm": 1.8440515519245717, + "language_loss": 0.80983436, + "learning_rate": 3.873258361417225e-06, + "loss": 0.83187115, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.17687988, + "step": 2337, + "time_per_iteration": 2.711233615875244 + }, + { + "auxiliary_loss_clip": 0.01159345, + "auxiliary_loss_mlp": 0.01043425, + "balance_loss_clip": 1.04972124, + "balance_loss_mlp": 1.02613962, + "epoch": 0.1405681647377123, + "flos": 27088990643520.0, + "grad_norm": 2.0499991431844204, + "language_loss": 0.79028189, + "learning_rate": 3.873121888753442e-06, + "loss": 0.8123095, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.17297363, + "step": 2338, + "time_per_iteration": 2.709683656692505 + }, + { + "auxiliary_loss_clip": 0.01167721, + "auxiliary_loss_mlp": 0.01048455, + "balance_loss_clip": 1.05567765, + "balance_loss_mlp": 1.02903628, + "epoch": 0.14062828799038027, + "flos": 28959001571520.0, + "grad_norm": 2.123731041067172, + "language_loss": 0.79900247, + "learning_rate": 3.87298534506069e-06, + "loss": 0.82116425, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.19421387, + "step": 2339, + "time_per_iteration": 2.9134156703948975 + }, + { + "auxiliary_loss_clip": 0.01162801, + "auxiliary_loss_mlp": 0.01053672, + "balance_loss_clip": 1.05356216, + "balance_loss_mlp": 1.03593349, + "epoch": 0.14068841124304826, + "flos": 47836348549920.0, + "grad_norm": 2.166343955592781, + "language_loss": 0.65323055, + "learning_rate": 3.872848730344146e-06, + "loss": 0.67539531, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.17736816, + "step": 2340, + "time_per_iteration": 2.8655459880828857 + }, + { + "auxiliary_loss_clip": 0.01157505, + "auxiliary_loss_mlp": 0.01047056, + "balance_loss_clip": 1.05265951, + "balance_loss_mlp": 1.02954459, + "epoch": 0.14074853449571623, + "flos": 24638733269280.0, + "grad_norm": 2.0988131367001204, + "language_loss": 0.78934908, + "learning_rate": 3.87271204460899e-06, + "loss": 0.81139463, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.17529297, + "step": 2341, + "time_per_iteration": 2.6418745517730713 + }, + { + "auxiliary_loss_clip": 0.01156756, + "auxiliary_loss_mlp": 0.01048108, + "balance_loss_clip": 1.05050647, + "balance_loss_mlp": 1.03050137, + "epoch": 0.1408086577483842, + "flos": 22458688050240.0, + "grad_norm": 1.925422184241849, + "language_loss": 0.79838598, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.82043469, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.17590332, + "step": 2342, + "time_per_iteration": 2.667726993560791 + }, + { + "auxiliary_loss_clip": 0.01160173, + "auxiliary_loss_mlp": 0.01041069, + "balance_loss_clip": 1.0560323, + "balance_loss_mlp": 1.02439177, + "epoch": 0.14086878100105216, + "flos": 30828283188480.0, + "grad_norm": 1.7665820267090437, + "language_loss": 0.77357912, + "learning_rate": 3.87243846010358e-06, + "loss": 0.79559159, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.16687012, + "step": 2343, + "time_per_iteration": 2.6857240200042725 + }, + { + "auxiliary_loss_clip": 0.01054825, + "auxiliary_loss_mlp": 0.01003955, + "balance_loss_clip": 1.0172472, + "balance_loss_mlp": 1.00161862, + "epoch": 0.14092890425372012, + "flos": 80507421404640.0, + "grad_norm": 0.8354326021976626, + "language_loss": 0.61577106, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63635886, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 0.37548828, + "router_z_loss_mlp": 0.02333069, + "step": 2344, + "time_per_iteration": 4.668176651000977 + }, + { + "auxiliary_loss_clip": 0.01156178, + "auxiliary_loss_mlp": 0.01039566, + "balance_loss_clip": 1.05148911, + "balance_loss_mlp": 1.02272153, + "epoch": 0.1409890275063881, + "flos": 28912980291840.0, + "grad_norm": 1.4939045677646228, + "language_loss": 0.64302909, + "learning_rate": 3.872164591585956e-06, + "loss": 0.66498655, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.16833496, + "step": 2345, + "time_per_iteration": 2.6847715377807617 + }, + { + "auxiliary_loss_clip": 0.01165367, + "auxiliary_loss_mlp": 0.01041102, + "balance_loss_clip": 1.0508945, + "balance_loss_mlp": 1.02149177, + "epoch": 0.14104915075905605, + "flos": 28825232564160.0, + "grad_norm": 2.061756938341533, + "language_loss": 0.73766404, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.75972873, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 1.14355469, + "router_z_loss_mlp": 0.19616699, + "step": 2346, + "time_per_iteration": 2.6920692920684814 + }, + { + "auxiliary_loss_clip": 0.01166269, + "auxiliary_loss_mlp": 0.01047076, + "balance_loss_clip": 1.05507076, + "balance_loss_mlp": 1.02859879, + "epoch": 0.14110927401172405, + "flos": 25130219055840.0, + "grad_norm": 1.9894085921149165, + "language_loss": 0.77280718, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.79494065, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.18457031, + "step": 2347, + "time_per_iteration": 4.09270167350769 + }, + { + "auxiliary_loss_clip": 0.01163209, + "auxiliary_loss_mlp": 0.01049431, + "balance_loss_clip": 1.05351329, + "balance_loss_mlp": 1.03281319, + "epoch": 0.141169397264392, + "flos": 34835032713600.0, + "grad_norm": 1.7383838688302904, + "language_loss": 0.7693311, + "learning_rate": 3.8717532563775e-06, + "loss": 0.79145753, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.16601562, + "step": 2348, + "time_per_iteration": 4.1363770961761475 + }, + { + "auxiliary_loss_clip": 0.01161403, + "auxiliary_loss_mlp": 0.01040437, + "balance_loss_clip": 1.05315781, + "balance_loss_mlp": 1.0226506, + "epoch": 0.14122952051705998, + "flos": 20855728929600.0, + "grad_norm": 1.7290021590651796, + "language_loss": 0.86756814, + "learning_rate": 3.871616002680272e-06, + "loss": 0.88958651, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.17797852, + "step": 2349, + "time_per_iteration": 2.6299853324890137 + }, + { + "auxiliary_loss_clip": 0.01164266, + "auxiliary_loss_mlp": 0.01046378, + "balance_loss_clip": 1.05750632, + "balance_loss_mlp": 1.02843738, + "epoch": 0.14128964376972794, + "flos": 35321859012960.0, + "grad_norm": 1.7049275479780912, + "language_loss": 0.88892931, + "learning_rate": 3.871478678011177e-06, + "loss": 0.91103578, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.17944336, + "step": 2350, + "time_per_iteration": 2.856837034225464 + }, + { + "auxiliary_loss_clip": 0.01167764, + "auxiliary_loss_mlp": 0.01044625, + "balance_loss_clip": 1.057639, + "balance_loss_mlp": 1.02564657, + "epoch": 0.1413497670223959, + "flos": 23171204364480.0, + "grad_norm": 1.982875568354535, + "language_loss": 0.80814373, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83026755, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.18969727, + "step": 2351, + "time_per_iteration": 2.7415103912353516 + }, + { + "auxiliary_loss_clip": 0.01164491, + "auxiliary_loss_mlp": 0.01041366, + "balance_loss_clip": 1.05427718, + "balance_loss_mlp": 1.02406931, + "epoch": 0.14140989027506387, + "flos": 36438640110720.0, + "grad_norm": 2.5230520121137134, + "language_loss": 0.8331694, + "learning_rate": 3.871203815778219e-06, + "loss": 0.85522795, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.1730957, + "step": 2352, + "time_per_iteration": 2.736527442932129 + }, + { + "auxiliary_loss_clip": 0.01058399, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.02066112, + "balance_loss_mlp": 1.02726114, + "epoch": 0.14147001352773186, + "flos": 75749265326880.0, + "grad_norm": 0.9135298514360601, + "language_loss": 0.6189394, + "learning_rate": 3.87106627822478e-06, + "loss": 0.63981956, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 0.37744141, + "router_z_loss_mlp": 0.02357483, + "step": 2353, + "time_per_iteration": 3.1955981254577637 + }, + { + "auxiliary_loss_clip": 0.01160282, + "auxiliary_loss_mlp": 0.01046694, + "balance_loss_clip": 1.05357981, + "balance_loss_mlp": 1.02914631, + "epoch": 0.14153013678039983, + "flos": 26865934251840.0, + "grad_norm": 1.6663315115347985, + "language_loss": 0.87045765, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.8925274, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.17553711, + "step": 2354, + "time_per_iteration": 2.658921480178833 + }, + { + "auxiliary_loss_clip": 0.01163535, + "auxiliary_loss_mlp": 0.01045634, + "balance_loss_clip": 1.05349898, + "balance_loss_mlp": 1.02681124, + "epoch": 0.1415902600330678, + "flos": 24062335964640.0, + "grad_norm": 1.7697725043473065, + "language_loss": 0.74085587, + "learning_rate": 3.870790990270057e-06, + "loss": 0.76294756, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.18811035, + "step": 2355, + "time_per_iteration": 2.6712119579315186 + }, + { + "auxiliary_loss_clip": 0.01059839, + "auxiliary_loss_mlp": 0.01006373, + "balance_loss_clip": 1.02234006, + "balance_loss_mlp": 1.00409007, + "epoch": 0.14165038328573576, + "flos": 80413312464000.0, + "grad_norm": 0.6797270407311201, + "language_loss": 0.51804703, + "learning_rate": 3.870653239879212e-06, + "loss": 0.53870916, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 0.37475586, + "router_z_loss_mlp": 0.02282715, + "step": 2356, + "time_per_iteration": 3.222501277923584 + }, + { + "auxiliary_loss_clip": 0.01163124, + "auxiliary_loss_mlp": 0.01051372, + "balance_loss_clip": 1.05491138, + "balance_loss_mlp": 1.03341937, + "epoch": 0.14171050653840372, + "flos": 14800596294240.0, + "grad_norm": 10.19689199905809, + "language_loss": 0.6985327, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.72067767, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.1796875, + "step": 2357, + "time_per_iteration": 2.6733522415161133 + }, + { + "auxiliary_loss_clip": 0.01165499, + "auxiliary_loss_mlp": 0.01045278, + "balance_loss_clip": 1.05305672, + "balance_loss_mlp": 1.02782595, + "epoch": 0.1417706297910717, + "flos": 24907324733280.0, + "grad_norm": 1.8894527276773292, + "language_loss": 0.82138693, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84349471, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 1.12304688, + "router_z_loss_mlp": 0.17468262, + "step": 2358, + "time_per_iteration": 2.6378870010375977 + }, + { + "auxiliary_loss_clip": 0.01167917, + "auxiliary_loss_mlp": 0.01046824, + "balance_loss_clip": 1.05436051, + "balance_loss_mlp": 1.02759576, + "epoch": 0.14183075304373965, + "flos": 27309170308320.0, + "grad_norm": 1.9221679775751872, + "language_loss": 0.72112989, + "learning_rate": 3.870239563115436e-06, + "loss": 0.74327731, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.19213867, + "step": 2359, + "time_per_iteration": 2.7022757530212402 + }, + { + "auxiliary_loss_clip": 0.01160037, + "auxiliary_loss_mlp": 0.01045814, + "balance_loss_clip": 1.0519712, + "balance_loss_mlp": 1.02812362, + "epoch": 0.14189087629640765, + "flos": 26333046155520.0, + "grad_norm": 3.1957206082064955, + "language_loss": 0.75403297, + "learning_rate": 3.870101529014526e-06, + "loss": 0.77609158, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.17700195, + "step": 2360, + "time_per_iteration": 2.7951741218566895 + }, + { + "auxiliary_loss_clip": 0.01162922, + "auxiliary_loss_mlp": 0.01043177, + "balance_loss_clip": 1.05569255, + "balance_loss_mlp": 1.0240562, + "epoch": 0.1419509995490756, + "flos": 24414663945600.0, + "grad_norm": 2.164593971483592, + "language_loss": 0.81700349, + "learning_rate": 3.869963423999178e-06, + "loss": 0.83906448, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.19128418, + "step": 2361, + "time_per_iteration": 2.664275646209717 + }, + { + "auxiliary_loss_clip": 0.01157584, + "auxiliary_loss_mlp": 0.01048604, + "balance_loss_clip": 1.05171585, + "balance_loss_mlp": 1.03071094, + "epoch": 0.14201112280174358, + "flos": 38975915901600.0, + "grad_norm": 1.8325552911323506, + "language_loss": 0.74061376, + "learning_rate": 3.86982524807463e-06, + "loss": 0.76267558, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.17895508, + "step": 2362, + "time_per_iteration": 2.7521774768829346 + }, + { + "auxiliary_loss_clip": 0.0116283, + "auxiliary_loss_mlp": 0.01048724, + "balance_loss_clip": 1.05505347, + "balance_loss_mlp": 1.03021061, + "epoch": 0.14207124605441154, + "flos": 50596235042400.0, + "grad_norm": 1.6705456124604776, + "language_loss": 0.74427432, + "learning_rate": 3.869687001246122e-06, + "loss": 0.76638985, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.18505859, + "step": 2363, + "time_per_iteration": 3.066319704055786 + }, + { + "auxiliary_loss_clip": 0.01159675, + "auxiliary_loss_mlp": 0.01047256, + "balance_loss_clip": 1.05228186, + "balance_loss_mlp": 1.0288744, + "epoch": 0.1421313693070795, + "flos": 38931515313120.0, + "grad_norm": 1.595320688417615, + "language_loss": 0.73170161, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.75377095, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.18371582, + "step": 2364, + "time_per_iteration": 2.7204341888427734 + }, + { + "auxiliary_loss_clip": 0.01155544, + "auxiliary_loss_mlp": 0.0105051, + "balance_loss_clip": 1.0524404, + "balance_loss_mlp": 1.03446436, + "epoch": 0.14219149255974747, + "flos": 32788634950080.0, + "grad_norm": 1.965518238561068, + "language_loss": 0.90543234, + "learning_rate": 3.869410294898195e-06, + "loss": 0.92749286, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.16040039, + "step": 2365, + "time_per_iteration": 2.6948821544647217 + }, + { + "auxiliary_loss_clip": 0.01159487, + "auxiliary_loss_mlp": 0.01048386, + "balance_loss_clip": 1.05132294, + "balance_loss_mlp": 1.02972937, + "epoch": 0.14225161581241544, + "flos": 33500421953280.0, + "grad_norm": 1.737095162166344, + "language_loss": 0.65648919, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67856795, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.18652344, + "step": 2366, + "time_per_iteration": 2.732661247253418 + }, + { + "auxiliary_loss_clip": 0.01157307, + "auxiliary_loss_mlp": 0.01052498, + "balance_loss_clip": 1.05118155, + "balance_loss_mlp": 1.0339129, + "epoch": 0.14231173906508343, + "flos": 12886306329600.0, + "grad_norm": 1.9189546423798094, + "language_loss": 0.8048203, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.8269183, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.18579102, + "step": 2367, + "time_per_iteration": 2.6341888904571533 + }, + { + "auxiliary_loss_clip": 0.01163748, + "auxiliary_loss_mlp": 0.0105338, + "balance_loss_clip": 1.05343378, + "balance_loss_mlp": 1.03512883, + "epoch": 0.1423718623177514, + "flos": 34607073731040.0, + "grad_norm": 2.033205929856036, + "language_loss": 0.82739365, + "learning_rate": 3.868994703727742e-06, + "loss": 0.84956497, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.18261719, + "step": 2368, + "time_per_iteration": 2.715188503265381 + }, + { + "auxiliary_loss_clip": 0.01160809, + "auxiliary_loss_mlp": 0.01046216, + "balance_loss_clip": 1.05349863, + "balance_loss_mlp": 1.02730894, + "epoch": 0.14243198557041936, + "flos": 23615980077600.0, + "grad_norm": 2.1365262053413083, + "language_loss": 0.8697502, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89182043, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.18908691, + "step": 2369, + "time_per_iteration": 2.6416149139404297 + }, + { + "auxiliary_loss_clip": 0.01164621, + "auxiliary_loss_mlp": 0.01042497, + "balance_loss_clip": 1.05252218, + "balance_loss_mlp": 1.02463913, + "epoch": 0.14249210882308733, + "flos": 35150536837440.0, + "grad_norm": 1.821049982453949, + "language_loss": 0.75964069, + "learning_rate": 3.868717288576354e-06, + "loss": 0.78171182, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 1.12011719, + "router_z_loss_mlp": 0.1784668, + "step": 2370, + "time_per_iteration": 2.7300407886505127 + }, + { + "auxiliary_loss_clip": 0.01160108, + "auxiliary_loss_mlp": 0.01048388, + "balance_loss_clip": 1.0527513, + "balance_loss_mlp": 1.03068507, + "epoch": 0.1425522320757553, + "flos": 26643242515680.0, + "grad_norm": 1.6250829124881172, + "language_loss": 0.83206642, + "learning_rate": 3.868578474705109e-06, + "loss": 0.85415137, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.17687988, + "step": 2371, + "time_per_iteration": 2.653050184249878 + }, + { + "auxiliary_loss_clip": 0.01162619, + "auxiliary_loss_mlp": 0.01048523, + "balance_loss_clip": 1.05401754, + "balance_loss_mlp": 1.03006947, + "epoch": 0.14261235532842326, + "flos": 21123185909760.0, + "grad_norm": 2.028912399319626, + "language_loss": 0.83021855, + "learning_rate": 3.868439589977181e-06, + "loss": 0.85232997, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.18444824, + "step": 2372, + "time_per_iteration": 2.6284031867980957 + }, + { + "auxiliary_loss_clip": 0.01163631, + "auxiliary_loss_mlp": 0.01043352, + "balance_loss_clip": 1.0553627, + "balance_loss_mlp": 1.0250299, + "epoch": 0.14267247858109125, + "flos": 22947904869120.0, + "grad_norm": 2.1754042213405076, + "language_loss": 0.84539878, + "learning_rate": 3.868300634397836e-06, + "loss": 0.8674686, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.18334961, + "step": 2373, + "time_per_iteration": 2.5987000465393066 + }, + { + "auxiliary_loss_clip": 0.01159241, + "auxiliary_loss_mlp": 0.01043891, + "balance_loss_clip": 1.05363464, + "balance_loss_mlp": 1.02753544, + "epoch": 0.14273260183375922, + "flos": 13859918411040.0, + "grad_norm": 1.9819438711349515, + "language_loss": 0.86052781, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88255918, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.16357422, + "step": 2374, + "time_per_iteration": 2.644925117492676 + }, + { + "auxiliary_loss_clip": 0.01165986, + "auxiliary_loss_mlp": 0.01048658, + "balance_loss_clip": 1.05439496, + "balance_loss_mlp": 1.02957249, + "epoch": 0.14279272508642718, + "flos": 33638283205920.0, + "grad_norm": 1.973783846883264, + "language_loss": 0.79340768, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81555414, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 1.11621094, + "router_z_loss_mlp": 0.19091797, + "step": 2375, + "time_per_iteration": 2.732339859008789 + }, + { + "auxiliary_loss_clip": 0.01164562, + "auxiliary_loss_mlp": 0.01055261, + "balance_loss_clip": 1.05721259, + "balance_loss_mlp": 1.03761864, + "epoch": 0.14285284833909515, + "flos": 19831111943040.0, + "grad_norm": 2.2904410737610315, + "language_loss": 0.76706773, + "learning_rate": 3.867883342604009e-06, + "loss": 0.78926593, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.1763916, + "step": 2376, + "time_per_iteration": 2.94163179397583 + }, + { + "auxiliary_loss_clip": 0.01163439, + "auxiliary_loss_mlp": 0.01045722, + "balance_loss_clip": 1.05583072, + "balance_loss_mlp": 1.02737546, + "epoch": 0.1429129715917631, + "flos": 24105764138400.0, + "grad_norm": 1.7451253789874226, + "language_loss": 0.93057513, + "learning_rate": 3.867744103671717e-06, + "loss": 0.9526667, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.18334961, + "step": 2377, + "time_per_iteration": 2.6770262718200684 + }, + { + "auxiliary_loss_clip": 0.01163138, + "auxiliary_loss_mlp": 0.01049242, + "balance_loss_clip": 1.05427396, + "balance_loss_mlp": 1.0292505, + "epoch": 0.14297309484443108, + "flos": 25792014085920.0, + "grad_norm": 1.840902115448403, + "language_loss": 0.91432291, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93644673, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 1.09033203, + "router_z_loss_mlp": 0.19995117, + "step": 2378, + "time_per_iteration": 2.642045259475708 + }, + { + "auxiliary_loss_clip": 0.01165327, + "auxiliary_loss_mlp": 0.01044654, + "balance_loss_clip": 1.0557189, + "balance_loss_mlp": 1.02682018, + "epoch": 0.14303321809709904, + "flos": 28780710423840.0, + "grad_norm": 6.5090771813441926, + "language_loss": 0.73712409, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.75922394, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 1.09472656, + "router_z_loss_mlp": 0.17834473, + "step": 2379, + "time_per_iteration": 2.6852173805236816 + }, + { + "auxiliary_loss_clip": 0.01160956, + "auxiliary_loss_mlp": 0.01048654, + "balance_loss_clip": 1.05376172, + "balance_loss_mlp": 1.03030729, + "epoch": 0.14309334134976703, + "flos": 19386538816320.0, + "grad_norm": 2.0981340024268285, + "language_loss": 0.78733754, + "learning_rate": 3.867325961945714e-06, + "loss": 0.80943364, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.18334961, + "step": 2380, + "time_per_iteration": 2.612675905227661 + }, + { + "auxiliary_loss_clip": 0.01166789, + "auxiliary_loss_mlp": 0.01047262, + "balance_loss_clip": 1.05717576, + "balance_loss_mlp": 1.02930903, + "epoch": 0.143153464602435, + "flos": 19920237258240.0, + "grad_norm": 2.053610437864997, + "language_loss": 0.88129973, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90344024, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.17944336, + "step": 2381, + "time_per_iteration": 2.6612422466278076 + }, + { + "auxiliary_loss_clip": 0.01161765, + "auxiliary_loss_mlp": 0.01051872, + "balance_loss_clip": 1.05507398, + "balance_loss_mlp": 1.03359699, + "epoch": 0.14321358785510296, + "flos": 20855526343200.0, + "grad_norm": 2.488769142806088, + "language_loss": 0.77024078, + "learning_rate": 3.867046846740299e-06, + "loss": 0.79237717, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.18273926, + "step": 2382, + "time_per_iteration": 2.6231772899627686 + }, + { + "auxiliary_loss_clip": 0.01161819, + "auxiliary_loss_mlp": 0.01050461, + "balance_loss_clip": 1.05337071, + "balance_loss_mlp": 1.03252053, + "epoch": 0.14327371110777093, + "flos": 32119708878720.0, + "grad_norm": 2.13093373517224, + "language_loss": 0.77036333, + "learning_rate": 3.866907182937039e-06, + "loss": 0.79248619, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.17944336, + "step": 2383, + "time_per_iteration": 4.090428352355957 + }, + { + "auxiliary_loss_clip": 0.01165633, + "auxiliary_loss_mlp": 0.0105302, + "balance_loss_clip": 1.05507445, + "balance_loss_mlp": 1.03286183, + "epoch": 0.1433338343604389, + "flos": 22056813786240.0, + "grad_norm": 2.081634464713114, + "language_loss": 0.87781781, + "learning_rate": 3.866767448340471e-06, + "loss": 0.90000433, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.20141602, + "step": 2384, + "time_per_iteration": 4.138821601867676 + }, + { + "auxiliary_loss_clip": 0.01170094, + "auxiliary_loss_mlp": 0.01050022, + "balance_loss_clip": 1.05672216, + "balance_loss_mlp": 1.03057861, + "epoch": 0.14339395761310686, + "flos": 18940588102080.0, + "grad_norm": 2.475715569196785, + "language_loss": 0.79844999, + "learning_rate": 3.866627642955895e-06, + "loss": 0.82065117, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 1.13378906, + "router_z_loss_mlp": 0.19445801, + "step": 2385, + "time_per_iteration": 2.6149613857269287 + }, + { + "auxiliary_loss_clip": 0.01164385, + "auxiliary_loss_mlp": 0.01042684, + "balance_loss_clip": 1.055475, + "balance_loss_mlp": 1.02510107, + "epoch": 0.14345408086577485, + "flos": 34835843059200.0, + "grad_norm": 1.7816610336987502, + "language_loss": 0.75421011, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77628082, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.17602539, + "step": 2386, + "time_per_iteration": 2.723592758178711 + }, + { + "auxiliary_loss_clip": 0.01165576, + "auxiliary_loss_mlp": 0.01041008, + "balance_loss_clip": 1.05702722, + "balance_loss_mlp": 1.02304339, + "epoch": 0.14351420411844282, + "flos": 24684714031680.0, + "grad_norm": 1.9674744143237748, + "language_loss": 0.78408056, + "learning_rate": 3.866347819843925e-06, + "loss": 0.80614638, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.17956543, + "step": 2387, + "time_per_iteration": 4.4247965812683105 + }, + { + "auxiliary_loss_clip": 0.01163043, + "auxiliary_loss_mlp": 0.01052553, + "balance_loss_clip": 1.05435467, + "balance_loss_mlp": 1.0331223, + "epoch": 0.14357432737111078, + "flos": 24239290042080.0, + "grad_norm": 2.297562386796433, + "language_loss": 0.81742692, + "learning_rate": 3.866207802127143e-06, + "loss": 0.83958292, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.19433594, + "step": 2388, + "time_per_iteration": 4.132163047790527 + }, + { + "auxiliary_loss_clip": 0.01164013, + "auxiliary_loss_mlp": 0.01042088, + "balance_loss_clip": 1.0558753, + "balance_loss_mlp": 1.02510047, + "epoch": 0.14363445062377875, + "flos": 34479584902080.0, + "grad_norm": 2.1537890142390737, + "language_loss": 0.82002181, + "learning_rate": 3.866067713643573e-06, + "loss": 0.8420828, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.17016602, + "step": 2389, + "time_per_iteration": 2.678666591644287 + }, + { + "auxiliary_loss_clip": 0.01167182, + "auxiliary_loss_mlp": 0.01051468, + "balance_loss_clip": 1.05475569, + "balance_loss_mlp": 1.0327282, + "epoch": 0.1436945738764467, + "flos": 22191231070080.0, + "grad_norm": 2.2098476257481003, + "language_loss": 0.83097285, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.85315931, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 1.12402344, + "router_z_loss_mlp": 0.18737793, + "step": 2390, + "time_per_iteration": 2.6413674354553223 + }, + { + "auxiliary_loss_clip": 0.01162274, + "auxiliary_loss_mlp": 0.01052241, + "balance_loss_clip": 1.05378222, + "balance_loss_mlp": 1.03433609, + "epoch": 0.14375469712911468, + "flos": 33321685115520.0, + "grad_norm": 2.24356961086357, + "language_loss": 0.74656558, + "learning_rate": 3.865787324397324e-06, + "loss": 0.76871073, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.17895508, + "step": 2391, + "time_per_iteration": 2.6864113807678223 + }, + { + "auxiliary_loss_clip": 0.01056727, + "auxiliary_loss_mlp": 0.01018174, + "balance_loss_clip": 1.01973009, + "balance_loss_mlp": 1.01566744, + "epoch": 0.14381482038178264, + "flos": 69419017945440.0, + "grad_norm": 2.708545454217413, + "language_loss": 0.61792624, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63867533, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 0.37036133, + "router_z_loss_mlp": 0.02505493, + "step": 2392, + "time_per_iteration": 3.1388063430786133 + }, + { + "auxiliary_loss_clip": 0.01166937, + "auxiliary_loss_mlp": 0.01050887, + "balance_loss_clip": 1.0529089, + "balance_loss_mlp": 1.03096652, + "epoch": 0.14387494363445064, + "flos": 17426713779360.0, + "grad_norm": 3.3541690062693714, + "language_loss": 0.77377397, + "learning_rate": 3.865506652147709e-06, + "loss": 0.7959522, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 1.13964844, + "router_z_loss_mlp": 0.19934082, + "step": 2393, + "time_per_iteration": 2.608229875564575 + }, + { + "auxiliary_loss_clip": 0.01164596, + "auxiliary_loss_mlp": 0.01050647, + "balance_loss_clip": 1.05533147, + "balance_loss_mlp": 1.03305125, + "epoch": 0.1439350668871186, + "flos": 32654825425440.0, + "grad_norm": 2.0755436407644168, + "language_loss": 0.7694875, + "learning_rate": 3.865366209909941e-06, + "loss": 0.79163992, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.17590332, + "step": 2394, + "time_per_iteration": 2.7242400646209717 + }, + { + "auxiliary_loss_clip": 0.0116291, + "auxiliary_loss_mlp": 0.01047243, + "balance_loss_clip": 1.05404663, + "balance_loss_mlp": 1.02949262, + "epoch": 0.14399519013978657, + "flos": 49663620097920.0, + "grad_norm": 1.57965959520309, + "language_loss": 0.86064237, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88274395, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.17736816, + "step": 2395, + "time_per_iteration": 2.810870409011841 + }, + { + "auxiliary_loss_clip": 0.01161205, + "auxiliary_loss_mlp": 0.01050145, + "balance_loss_clip": 1.05487943, + "balance_loss_mlp": 1.0325855, + "epoch": 0.14405531339245453, + "flos": 25084400362560.0, + "grad_norm": 1.5740592671000315, + "language_loss": 0.83036125, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85247481, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.17553711, + "step": 2396, + "time_per_iteration": 2.6662425994873047 + }, + { + "auxiliary_loss_clip": 0.01158433, + "auxiliary_loss_mlp": 0.01040801, + "balance_loss_clip": 1.05308294, + "balance_loss_mlp": 1.02406418, + "epoch": 0.1441154366451225, + "flos": 23882869815840.0, + "grad_norm": 2.174295766123508, + "language_loss": 0.82643223, + "learning_rate": 3.864944458808712e-06, + "loss": 0.84842467, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.16760254, + "step": 2397, + "time_per_iteration": 2.62579607963562 + }, + { + "auxiliary_loss_clip": 0.01164375, + "auxiliary_loss_mlp": 0.01047645, + "balance_loss_clip": 1.05411613, + "balance_loss_mlp": 1.0289408, + "epoch": 0.14417555989779046, + "flos": 22592700161280.0, + "grad_norm": 1.592155086815514, + "language_loss": 0.7971909, + "learning_rate": 3.86480373366343e-06, + "loss": 0.81931114, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 1.10351562, + "router_z_loss_mlp": 0.18713379, + "step": 2398, + "time_per_iteration": 2.6430540084838867 + }, + { + "auxiliary_loss_clip": 0.01161383, + "auxiliary_loss_mlp": 0.01047068, + "balance_loss_clip": 1.05465174, + "balance_loss_mlp": 1.02936506, + "epoch": 0.14423568315045843, + "flos": 31764261067200.0, + "grad_norm": 1.974017947709915, + "language_loss": 0.64542848, + "learning_rate": 3.864662937804603e-06, + "loss": 0.66751295, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.17700195, + "step": 2399, + "time_per_iteration": 2.9576268196105957 + }, + { + "auxiliary_loss_clip": 0.01163574, + "auxiliary_loss_mlp": 0.01044701, + "balance_loss_clip": 1.0562501, + "balance_loss_mlp": 1.02654576, + "epoch": 0.14429580640312642, + "flos": 25976544894720.0, + "grad_norm": 1.6473536516869947, + "language_loss": 0.82147002, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84355283, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.18164062, + "step": 2400, + "time_per_iteration": 2.6369645595550537 + }, + { + "auxiliary_loss_clip": 0.01166116, + "auxiliary_loss_mlp": 0.0105437, + "balance_loss_clip": 1.05438566, + "balance_loss_mlp": 1.03473663, + "epoch": 0.14435592965579438, + "flos": 31271721831360.0, + "grad_norm": 1.6158230458970133, + "language_loss": 0.73930126, + "learning_rate": 3.864381133967676e-06, + "loss": 0.76150614, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.19628906, + "step": 2401, + "time_per_iteration": 2.728921890258789 + }, + { + "auxiliary_loss_clip": 0.01161629, + "auxiliary_loss_mlp": 0.01039905, + "balance_loss_clip": 1.05443716, + "balance_loss_mlp": 1.02232206, + "epoch": 0.14441605290846235, + "flos": 28022294381760.0, + "grad_norm": 1.5080898688297562, + "language_loss": 0.81184083, + "learning_rate": 3.86424012600026e-06, + "loss": 0.83385617, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.17590332, + "step": 2402, + "time_per_iteration": 2.7199652194976807 + }, + { + "auxiliary_loss_clip": 0.01160869, + "auxiliary_loss_mlp": 0.0105068, + "balance_loss_clip": 1.05344224, + "balance_loss_mlp": 1.03283465, + "epoch": 0.14447617616113032, + "flos": 21167383911840.0, + "grad_norm": 2.4533712048717558, + "language_loss": 0.8453325, + "learning_rate": 3.864099047340673e-06, + "loss": 0.86744797, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.17834473, + "step": 2403, + "time_per_iteration": 2.6343188285827637 + }, + { + "auxiliary_loss_clip": 0.01160385, + "auxiliary_loss_mlp": 0.01049418, + "balance_loss_clip": 1.05195439, + "balance_loss_mlp": 1.03001046, + "epoch": 0.14453629941379828, + "flos": 29358606867840.0, + "grad_norm": 1.7031408272379993, + "language_loss": 0.69694328, + "learning_rate": 3.863957897994262e-06, + "loss": 0.71904129, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.19421387, + "step": 2404, + "time_per_iteration": 2.6418724060058594 + }, + { + "auxiliary_loss_clip": 0.01154288, + "auxiliary_loss_mlp": 0.01044568, + "balance_loss_clip": 1.04907489, + "balance_loss_mlp": 1.02756917, + "epoch": 0.14459642266646625, + "flos": 17606423031840.0, + "grad_norm": 2.052077246978049, + "language_loss": 0.73481834, + "learning_rate": 3.863816677966381e-06, + "loss": 0.75680697, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.16992188, + "step": 2405, + "time_per_iteration": 2.6233510971069336 + }, + { + "auxiliary_loss_clip": 0.01158403, + "auxiliary_loss_mlp": 0.01046882, + "balance_loss_clip": 1.05252063, + "balance_loss_mlp": 1.02909613, + "epoch": 0.14465654591913424, + "flos": 12039048593280.0, + "grad_norm": 2.31272564238306, + "language_loss": 0.72900856, + "learning_rate": 3.863675387262386e-06, + "loss": 0.75106138, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.17797852, + "step": 2406, + "time_per_iteration": 2.68943452835083 + }, + { + "auxiliary_loss_clip": 0.0116139, + "auxiliary_loss_mlp": 0.01047227, + "balance_loss_clip": 1.05225492, + "balance_loss_mlp": 1.02822483, + "epoch": 0.1447166691718022, + "flos": 30472349169600.0, + "grad_norm": 2.3591269186032906, + "language_loss": 0.75427741, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.77636361, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 1.09130859, + "router_z_loss_mlp": 0.19006348, + "step": 2407, + "time_per_iteration": 2.6921117305755615 + }, + { + "auxiliary_loss_clip": 0.01158745, + "auxiliary_loss_mlp": 0.01045216, + "balance_loss_clip": 1.05149245, + "balance_loss_mlp": 1.02765703, + "epoch": 0.14477679242447017, + "flos": 26732327313600.0, + "grad_norm": 1.51947307030756, + "language_loss": 0.795578, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.8176176, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.17541504, + "step": 2408, + "time_per_iteration": 2.6450235843658447 + }, + { + "auxiliary_loss_clip": 0.01163832, + "auxiliary_loss_mlp": 0.01045215, + "balance_loss_clip": 1.05575669, + "balance_loss_mlp": 1.02617741, + "epoch": 0.14483691567713813, + "flos": 25307902444320.0, + "grad_norm": 2.444481336806207, + "language_loss": 0.82007694, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84216744, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 1.08105469, + "router_z_loss_mlp": 0.19042969, + "step": 2409, + "time_per_iteration": 2.6616766452789307 + }, + { + "auxiliary_loss_clip": 0.01161313, + "auxiliary_loss_mlp": 0.01054951, + "balance_loss_clip": 1.05234838, + "balance_loss_mlp": 1.0363903, + "epoch": 0.1448970389298061, + "flos": 43205681301120.0, + "grad_norm": 1.9693139714047194, + "language_loss": 0.74928749, + "learning_rate": 3.863109517792446e-06, + "loss": 0.77145016, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 1.08984375, + "router_z_loss_mlp": 0.18554688, + "step": 2410, + "time_per_iteration": 2.7484939098358154 + }, + { + "auxiliary_loss_clip": 0.0115965, + "auxiliary_loss_mlp": 0.01044436, + "balance_loss_clip": 1.05262828, + "balance_loss_mlp": 1.02752066, + "epoch": 0.14495716218247406, + "flos": 18808196682240.0, + "grad_norm": 1.7590753142948448, + "language_loss": 0.81543142, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.83747226, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.16918945, + "step": 2411, + "time_per_iteration": 2.6265933513641357 + }, + { + "auxiliary_loss_clip": 0.01160713, + "auxiliary_loss_mlp": 0.01048593, + "balance_loss_clip": 1.0535152, + "balance_loss_mlp": 1.03060436, + "epoch": 0.14501728543514203, + "flos": 41113505361600.0, + "grad_norm": 2.04750528973134, + "language_loss": 0.69743413, + "learning_rate": 3.862826159140214e-06, + "loss": 0.71952724, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.17980957, + "step": 2412, + "time_per_iteration": 3.0189497470855713 + }, + { + "auxiliary_loss_clip": 0.01160387, + "auxiliary_loss_mlp": 0.01042376, + "balance_loss_clip": 1.05417991, + "balance_loss_mlp": 1.02460194, + "epoch": 0.14507740868781002, + "flos": 19030037555520.0, + "grad_norm": 2.084698616200848, + "language_loss": 0.76943839, + "learning_rate": 3.862684373853579e-06, + "loss": 0.79146606, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.17773438, + "step": 2413, + "time_per_iteration": 2.611415147781372 + }, + { + "auxiliary_loss_clip": 0.01065793, + "auxiliary_loss_mlp": 0.01007217, + "balance_loss_clip": 1.02817988, + "balance_loss_mlp": 1.00486851, + "epoch": 0.145137531940478, + "flos": 81357555867840.0, + "grad_norm": 0.917656815554266, + "language_loss": 0.58944225, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.61017239, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 0.3762207, + "router_z_loss_mlp": 0.02346802, + "step": 2414, + "time_per_iteration": 3.230470895767212 + }, + { + "auxiliary_loss_clip": 0.01065183, + "auxiliary_loss_mlp": 0.01002571, + "balance_loss_clip": 1.0275116, + "balance_loss_mlp": 1.00027132, + "epoch": 0.14519765519314595, + "flos": 82389546999360.0, + "grad_norm": 0.8503829555965539, + "language_loss": 0.6220063, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64268386, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 0.37646484, + "router_z_loss_mlp": 0.02297974, + "step": 2415, + "time_per_iteration": 3.226707935333252 + }, + { + "auxiliary_loss_clip": 0.01159021, + "auxiliary_loss_mlp": 0.01047894, + "balance_loss_clip": 1.05150557, + "balance_loss_mlp": 1.02945197, + "epoch": 0.14525777844581392, + "flos": 20986135002720.0, + "grad_norm": 2.499844551402935, + "language_loss": 0.71740818, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.73947728, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.18444824, + "step": 2416, + "time_per_iteration": 2.654658317565918 + }, + { + "auxiliary_loss_clip": 0.01060908, + "auxiliary_loss_mlp": 0.01004414, + "balance_loss_clip": 1.02394772, + "balance_loss_mlp": 1.00223398, + "epoch": 0.14531790169848188, + "flos": 79806330963360.0, + "grad_norm": 0.7094314572285795, + "language_loss": 0.60367894, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62433219, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 0.36938477, + "router_z_loss_mlp": 0.02183533, + "step": 2417, + "time_per_iteration": 3.295197010040283 + }, + { + "auxiliary_loss_clip": 0.01160302, + "auxiliary_loss_mlp": 0.01053685, + "balance_loss_clip": 1.05109942, + "balance_loss_mlp": 1.03550541, + "epoch": 0.14537802495114985, + "flos": 39733643149920.0, + "grad_norm": 2.607398061842593, + "language_loss": 0.79268289, + "learning_rate": 3.861974388030356e-06, + "loss": 0.81482279, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.18188477, + "step": 2418, + "time_per_iteration": 2.7392098903656006 + }, + { + "auxiliary_loss_clip": 0.0115689, + "auxiliary_loss_mlp": 0.01049327, + "balance_loss_clip": 1.05182076, + "balance_loss_mlp": 1.03168392, + "epoch": 0.1454381482038178, + "flos": 24680905407360.0, + "grad_norm": 1.7558716372570893, + "language_loss": 0.71627414, + "learning_rate": 3.861832179025394e-06, + "loss": 0.73833627, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.17663574, + "step": 2419, + "time_per_iteration": 2.6590592861175537 + }, + { + "auxiliary_loss_clip": 0.011577, + "auxiliary_loss_mlp": 0.01051649, + "balance_loss_clip": 1.05146003, + "balance_loss_mlp": 1.03369665, + "epoch": 0.1454982714564858, + "flos": 27934425102240.0, + "grad_norm": 2.387471061746189, + "language_loss": 0.9069891, + "learning_rate": 3.861689899419569e-06, + "loss": 0.92908263, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 1.06005859, + "router_z_loss_mlp": 0.17956543, + "step": 2420, + "time_per_iteration": 2.634596347808838 + }, + { + "auxiliary_loss_clip": 0.01158059, + "auxiliary_loss_mlp": 0.01052736, + "balance_loss_clip": 1.05203867, + "balance_loss_mlp": 1.03574872, + "epoch": 0.14555839470915377, + "flos": 24684470928000.0, + "grad_norm": 1.8539391365897029, + "language_loss": 0.83180451, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85391247, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.1697998, + "step": 2421, + "time_per_iteration": 2.6419739723205566 + }, + { + "auxiliary_loss_clip": 0.01160325, + "auxiliary_loss_mlp": 0.01057942, + "balance_loss_clip": 1.05156028, + "balance_loss_mlp": 1.04047751, + "epoch": 0.14561851796182174, + "flos": 27132905024640.0, + "grad_norm": 1.603556893315709, + "language_loss": 0.8174836, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83966625, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.17456055, + "step": 2422, + "time_per_iteration": 2.644115447998047 + }, + { + "auxiliary_loss_clip": 0.01057473, + "auxiliary_loss_mlp": 0.01044859, + "balance_loss_clip": 1.02070498, + "balance_loss_mlp": 1.04264331, + "epoch": 0.1456786412144897, + "flos": 64225850819040.0, + "grad_norm": 0.9102552769965355, + "language_loss": 0.63365161, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65467489, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 0.36767578, + "router_z_loss_mlp": 0.02220154, + "step": 2423, + "time_per_iteration": 4.8756184577941895 + }, + { + "auxiliary_loss_clip": 0.01159427, + "auxiliary_loss_mlp": 0.01043837, + "balance_loss_clip": 1.05319357, + "balance_loss_mlp": 1.02813733, + "epoch": 0.14573876446715767, + "flos": 28335326951520.0, + "grad_norm": 1.6036004824095575, + "language_loss": 0.82505381, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84708649, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.15710449, + "step": 2424, + "time_per_iteration": 2.8847219944000244 + }, + { + "auxiliary_loss_clip": 0.01156846, + "auxiliary_loss_mlp": 0.01049324, + "balance_loss_clip": 1.05163503, + "balance_loss_mlp": 1.03157401, + "epoch": 0.14579888771982563, + "flos": 22102956617760.0, + "grad_norm": 2.159303839145842, + "language_loss": 0.78382134, + "learning_rate": 3.860977442566429e-06, + "loss": 0.80588299, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.1776123, + "step": 2425, + "time_per_iteration": 2.6155691146850586 + }, + { + "auxiliary_loss_clip": 0.01162173, + "auxiliary_loss_mlp": 0.01049267, + "balance_loss_clip": 1.0547837, + "balance_loss_mlp": 1.03188646, + "epoch": 0.14585901097249362, + "flos": 28066330314720.0, + "grad_norm": 2.1495603786294715, + "language_loss": 0.8350181, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85713255, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.17382812, + "step": 2426, + "time_per_iteration": 4.135000944137573 + }, + { + "auxiliary_loss_clip": 0.01157854, + "auxiliary_loss_mlp": 0.01048326, + "balance_loss_clip": 1.05297065, + "balance_loss_mlp": 1.03079021, + "epoch": 0.1459191342251616, + "flos": 26733056624640.0, + "grad_norm": 1.807983472000742, + "language_loss": 0.87674767, + "learning_rate": 3.860691965808173e-06, + "loss": 0.89880943, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.17541504, + "step": 2427, + "time_per_iteration": 2.722597122192383 + }, + { + "auxiliary_loss_clip": 0.01164952, + "auxiliary_loss_mlp": 0.01046098, + "balance_loss_clip": 1.05380464, + "balance_loss_mlp": 1.02734661, + "epoch": 0.14597925747782955, + "flos": 18272755997280.0, + "grad_norm": 2.286441585978773, + "language_loss": 0.67476517, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69687569, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 1.11035156, + "router_z_loss_mlp": 0.1875, + "step": 2428, + "time_per_iteration": 4.083776235580444 + }, + { + "auxiliary_loss_clip": 0.01158755, + "auxiliary_loss_mlp": 0.01044665, + "balance_loss_clip": 1.0522114, + "balance_loss_mlp": 1.0257827, + "epoch": 0.14603938073049752, + "flos": 25708358603520.0, + "grad_norm": 1.6697383843784794, + "language_loss": 0.83934945, + "learning_rate": 3.860406206819417e-06, + "loss": 0.86138368, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.18884277, + "step": 2429, + "time_per_iteration": 2.6562373638153076 + }, + { + "auxiliary_loss_clip": 0.01155736, + "auxiliary_loss_mlp": 0.01044825, + "balance_loss_clip": 1.05014789, + "balance_loss_mlp": 1.02815902, + "epoch": 0.14609950398316549, + "flos": 24239006421120.0, + "grad_norm": 1.7768574451855823, + "language_loss": 0.7910434, + "learning_rate": 3.860263221502145e-06, + "loss": 0.81304902, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.16662598, + "step": 2430, + "time_per_iteration": 2.672563076019287 + }, + { + "auxiliary_loss_clip": 0.01161288, + "auxiliary_loss_mlp": 0.01047221, + "balance_loss_clip": 1.05369246, + "balance_loss_mlp": 1.02957773, + "epoch": 0.14615962723583345, + "flos": 27355677795360.0, + "grad_norm": 2.041348340617501, + "language_loss": 0.82624453, + "learning_rate": 3.860120165643504e-06, + "loss": 0.84832966, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 1.07617188, + "router_z_loss_mlp": 0.17651367, + "step": 2431, + "time_per_iteration": 2.6405134201049805 + }, + { + "auxiliary_loss_clip": 0.01164636, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.05433571, + "balance_loss_mlp": 1.03260326, + "epoch": 0.14621975048850142, + "flos": 27267038687520.0, + "grad_norm": 2.6727294700814026, + "language_loss": 0.78323984, + "learning_rate": 3.859977039248921e-06, + "loss": 0.80540478, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.19250488, + "step": 2432, + "time_per_iteration": 2.6999082565307617 + }, + { + "auxiliary_loss_clip": 0.0116063, + "auxiliary_loss_mlp": 0.0105029, + "balance_loss_clip": 1.05234027, + "balance_loss_mlp": 1.03165817, + "epoch": 0.1462798737411694, + "flos": 29759994924480.0, + "grad_norm": 1.8369350813164422, + "language_loss": 0.80260706, + "learning_rate": 3.859833842323822e-06, + "loss": 0.82471621, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 1.08154297, + "router_z_loss_mlp": 0.18652344, + "step": 2433, + "time_per_iteration": 2.679178476333618 + }, + { + "auxiliary_loss_clip": 0.01158453, + "auxiliary_loss_mlp": 0.01046168, + "balance_loss_clip": 1.05481362, + "balance_loss_mlp": 1.0267489, + "epoch": 0.14633999699383737, + "flos": 23482292104800.0, + "grad_norm": 2.0243705235800395, + "language_loss": 0.78468323, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80672944, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 1.03662109, + "router_z_loss_mlp": 0.19421387, + "step": 2434, + "time_per_iteration": 2.75521183013916 + }, + { + "auxiliary_loss_clip": 0.01056025, + "auxiliary_loss_mlp": 0.0100974, + "balance_loss_clip": 1.01876283, + "balance_loss_mlp": 1.00712442, + "epoch": 0.14640012024650534, + "flos": 76458621293280.0, + "grad_norm": 0.8539390821113356, + "language_loss": 0.5847528, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60541046, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 0.37255859, + "router_z_loss_mlp": 0.02616882, + "step": 2435, + "time_per_iteration": 3.357064723968506 + }, + { + "auxiliary_loss_clip": 0.01154382, + "auxiliary_loss_mlp": 0.01046975, + "balance_loss_clip": 1.05093431, + "balance_loss_mlp": 1.02918911, + "epoch": 0.1464602434991733, + "flos": 14978846924640.0, + "grad_norm": 2.065600735353634, + "language_loss": 0.88743222, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90944576, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.17785645, + "step": 2436, + "time_per_iteration": 2.658360719680786 + }, + { + "auxiliary_loss_clip": 0.01163047, + "auxiliary_loss_mlp": 0.01042388, + "balance_loss_clip": 1.05386198, + "balance_loss_mlp": 1.02455389, + "epoch": 0.14652036675184127, + "flos": 25529540731200.0, + "grad_norm": 2.446032985396703, + "language_loss": 0.74679005, + "learning_rate": 3.85926034942691e-06, + "loss": 0.76884449, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.17834473, + "step": 2437, + "time_per_iteration": 2.6643588542938232 + }, + { + "auxiliary_loss_clip": 0.01161089, + "auxiliary_loss_mlp": 0.01048094, + "balance_loss_clip": 1.05257571, + "balance_loss_mlp": 1.027614, + "epoch": 0.14658049000450923, + "flos": 33805148480640.0, + "grad_norm": 2.136811715517304, + "language_loss": 0.73561084, + "learning_rate": 3.859116799930736e-06, + "loss": 0.75770271, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.20471191, + "step": 2438, + "time_per_iteration": 2.701833963394165 + }, + { + "auxiliary_loss_clip": 0.01160476, + "auxiliary_loss_mlp": 0.01040787, + "balance_loss_clip": 1.05636048, + "balance_loss_mlp": 1.02393055, + "epoch": 0.14664061325717723, + "flos": 30427908063840.0, + "grad_norm": 2.06393277663241, + "language_loss": 0.74765217, + "learning_rate": 3.858973179936668e-06, + "loss": 0.76966482, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.16845703, + "step": 2439, + "time_per_iteration": 2.707271099090576 + }, + { + "auxiliary_loss_clip": 0.01160644, + "auxiliary_loss_mlp": 0.01043216, + "balance_loss_clip": 1.05601144, + "balance_loss_mlp": 1.02495289, + "epoch": 0.1467007365098452, + "flos": 49172701553280.0, + "grad_norm": 1.8654521987976955, + "language_loss": 0.74881041, + "learning_rate": 3.85882948945015e-06, + "loss": 0.77084899, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 1.04541016, + "router_z_loss_mlp": 0.18273926, + "step": 2440, + "time_per_iteration": 2.814908027648926 + }, + { + "auxiliary_loss_clip": 0.0115562, + "auxiliary_loss_mlp": 0.01047909, + "balance_loss_clip": 1.05292356, + "balance_loss_mlp": 1.03051591, + "epoch": 0.14676085976251316, + "flos": 32385990857760.0, + "grad_norm": 1.5067583115482153, + "language_loss": 0.82991552, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85195082, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.17382812, + "step": 2441, + "time_per_iteration": 2.6986067295074463 + }, + { + "auxiliary_loss_clip": 0.01165163, + "auxiliary_loss_mlp": 0.01047608, + "balance_loss_clip": 1.05341792, + "balance_loss_mlp": 1.02789068, + "epoch": 0.14682098301518112, + "flos": 28736107248960.0, + "grad_norm": 5.701538795920186, + "language_loss": 0.7200247, + "learning_rate": 3.858541897021563e-06, + "loss": 0.74215239, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.19714355, + "step": 2442, + "time_per_iteration": 2.662106990814209 + }, + { + "auxiliary_loss_clip": 0.01166839, + "auxiliary_loss_mlp": 0.01043536, + "balance_loss_clip": 1.05353785, + "balance_loss_mlp": 1.02508283, + "epoch": 0.1468811062678491, + "flos": 14216419671840.0, + "grad_norm": 3.101901802071133, + "language_loss": 0.8096298, + "learning_rate": 3.8583979950904e-06, + "loss": 0.83173358, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 1.13183594, + "router_z_loss_mlp": 0.18469238, + "step": 2443, + "time_per_iteration": 2.634866237640381 + }, + { + "auxiliary_loss_clip": 0.01161385, + "auxiliary_loss_mlp": 0.01049106, + "balance_loss_clip": 1.05461359, + "balance_loss_mlp": 1.03072429, + "epoch": 0.14694122952051705, + "flos": 28068032040480.0, + "grad_norm": 1.65345531513344, + "language_loss": 0.82976389, + "learning_rate": 3.858254022688599e-06, + "loss": 0.85186881, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.18383789, + "step": 2444, + "time_per_iteration": 2.6570727825164795 + }, + { + "auxiliary_loss_clip": 0.0116247, + "auxiliary_loss_mlp": 0.01047181, + "balance_loss_clip": 1.05437422, + "balance_loss_mlp": 1.02902555, + "epoch": 0.14700135277318502, + "flos": 32339645439840.0, + "grad_norm": 1.6439124249017611, + "language_loss": 0.71247083, + "learning_rate": 3.85810997982162e-06, + "loss": 0.7345674, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.18164062, + "step": 2445, + "time_per_iteration": 2.6848630905151367 + }, + { + "auxiliary_loss_clip": 0.01058931, + "auxiliary_loss_mlp": 0.01026177, + "balance_loss_clip": 1.02177, + "balance_loss_mlp": 1.02365136, + "epoch": 0.147061476025853, + "flos": 72540105703200.0, + "grad_norm": 0.8305709897101897, + "language_loss": 0.63146877, + "learning_rate": 3.857965866494923e-06, + "loss": 0.65231979, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 0.37158203, + "router_z_loss_mlp": 0.0252533, + "step": 2446, + "time_per_iteration": 3.1740963459014893 + }, + { + "auxiliary_loss_clip": 0.01165346, + "auxiliary_loss_mlp": 0.01043097, + "balance_loss_clip": 1.05689883, + "balance_loss_mlp": 1.02471459, + "epoch": 0.14712159927852098, + "flos": 34569885218400.0, + "grad_norm": 1.6606101724297968, + "language_loss": 0.75262475, + "learning_rate": 3.857821682713975e-06, + "loss": 0.77470922, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 1.08496094, + "router_z_loss_mlp": 0.18359375, + "step": 2447, + "time_per_iteration": 2.834432601928711 + }, + { + "auxiliary_loss_clip": 0.01160722, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.05332065, + "balance_loss_mlp": 1.02675438, + "epoch": 0.14718172253118894, + "flos": 33055484171040.0, + "grad_norm": 1.9299650990079193, + "language_loss": 0.85168815, + "learning_rate": 3.857677428484242e-06, + "loss": 0.87373757, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.17468262, + "step": 2448, + "time_per_iteration": 2.9851837158203125 + }, + { + "auxiliary_loss_clip": 0.01057157, + "auxiliary_loss_mlp": 0.01009218, + "balance_loss_clip": 1.02018762, + "balance_loss_mlp": 1.0066824, + "epoch": 0.1472418457838569, + "flos": 81396365071680.0, + "grad_norm": 0.7621696240801615, + "language_loss": 0.56834757, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58901131, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 0.36938477, + "router_z_loss_mlp": 0.02536011, + "step": 2449, + "time_per_iteration": 3.1863205432891846 + }, + { + "auxiliary_loss_clip": 0.01157806, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.05366683, + "balance_loss_mlp": 1.02443504, + "epoch": 0.14730196903652487, + "flos": 23883599126880.0, + "grad_norm": 1.775736480692529, + "language_loss": 0.85063255, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87264723, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.19226074, + "step": 2450, + "time_per_iteration": 2.6850404739379883 + }, + { + "auxiliary_loss_clip": 0.01161049, + "auxiliary_loss_mlp": 0.01046687, + "balance_loss_clip": 1.05312777, + "balance_loss_mlp": 1.02788782, + "epoch": 0.14736209228919284, + "flos": 19609514173440.0, + "grad_norm": 2.169645370222631, + "language_loss": 0.74780512, + "learning_rate": 3.857244243157052e-06, + "loss": 0.76988244, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 1.07958984, + "router_z_loss_mlp": 0.18798828, + "step": 2451, + "time_per_iteration": 2.668442487716675 + }, + { + "auxiliary_loss_clip": 0.01154643, + "auxiliary_loss_mlp": 0.01038808, + "balance_loss_clip": 1.05301356, + "balance_loss_mlp": 1.0217135, + "epoch": 0.1474222155418608, + "flos": 28112959353600.0, + "grad_norm": 3.031194599140657, + "language_loss": 0.82413375, + "learning_rate": 3.85709970718691e-06, + "loss": 0.8460682, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.17089844, + "step": 2452, + "time_per_iteration": 2.7152888774871826 + }, + { + "auxiliary_loss_clip": 0.01160819, + "auxiliary_loss_mlp": 0.0103749, + "balance_loss_clip": 1.05592716, + "balance_loss_mlp": 1.02050316, + "epoch": 0.1474823387945288, + "flos": 20765955337920.0, + "grad_norm": 1.546219054112001, + "language_loss": 0.73937649, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76135957, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.16967773, + "step": 2453, + "time_per_iteration": 2.6307260990142822 + }, + { + "auxiliary_loss_clip": 0.01162552, + "auxiliary_loss_mlp": 0.01048864, + "balance_loss_clip": 1.05228913, + "balance_loss_mlp": 1.03032732, + "epoch": 0.14754246204719676, + "flos": 21835337568480.0, + "grad_norm": 1.8275223388793262, + "language_loss": 0.75996554, + "learning_rate": 3.856810423987889e-06, + "loss": 0.7820797, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.1854248, + "step": 2454, + "time_per_iteration": 2.6521122455596924 + }, + { + "auxiliary_loss_clip": 0.01161577, + "auxiliary_loss_mlp": 0.01042049, + "balance_loss_clip": 1.05361295, + "balance_loss_mlp": 1.02408457, + "epoch": 0.14760258529986472, + "flos": 15958536598080.0, + "grad_norm": 1.8504561923420024, + "language_loss": 0.83075577, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85279208, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.1796875, + "step": 2455, + "time_per_iteration": 2.638275384902954 + }, + { + "auxiliary_loss_clip": 0.01163985, + "auxiliary_loss_mlp": 0.01047464, + "balance_loss_clip": 1.05219817, + "balance_loss_mlp": 1.0293318, + "epoch": 0.1476627085525327, + "flos": 37591191616320.0, + "grad_norm": 2.157912206777254, + "language_loss": 0.83935261, + "learning_rate": 3.85652085914712e-06, + "loss": 0.86146706, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 1.11816406, + "router_z_loss_mlp": 0.18127441, + "step": 2456, + "time_per_iteration": 2.7555947303771973 + }, + { + "auxiliary_loss_clip": 0.01157335, + "auxiliary_loss_mlp": 0.01044433, + "balance_loss_clip": 1.05238569, + "balance_loss_mlp": 1.02651572, + "epoch": 0.14772283180520066, + "flos": 26465559127200.0, + "grad_norm": 2.192150626309475, + "language_loss": 0.84590733, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86792505, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.17907715, + "step": 2457, + "time_per_iteration": 2.6801681518554688 + }, + { + "auxiliary_loss_clip": 0.01157097, + "auxiliary_loss_mlp": 0.01042441, + "balance_loss_clip": 1.05476046, + "balance_loss_mlp": 1.0243932, + "epoch": 0.14778295505786862, + "flos": 22903544797920.0, + "grad_norm": 1.807100709172367, + "language_loss": 0.75963748, + "learning_rate": 3.856231012708527e-06, + "loss": 0.78163284, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.18041992, + "step": 2458, + "time_per_iteration": 2.625671148300171 + }, + { + "auxiliary_loss_clip": 0.01168454, + "auxiliary_loss_mlp": 0.01048238, + "balance_loss_clip": 1.05525589, + "balance_loss_mlp": 1.02888989, + "epoch": 0.1478430783105366, + "flos": 27934911309600.0, + "grad_norm": 2.1972158311041303, + "language_loss": 0.83802283, + "learning_rate": 3.856085983903782e-06, + "loss": 0.86018974, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 1.13085938, + "router_z_loss_mlp": 0.19335938, + "step": 2459, + "time_per_iteration": 2.6884372234344482 + }, + { + "auxiliary_loss_clip": 0.01155427, + "auxiliary_loss_mlp": 0.01036945, + "balance_loss_clip": 1.05214572, + "balance_loss_mlp": 1.01931405, + "epoch": 0.14790320156320458, + "flos": 18406727591040.0, + "grad_norm": 2.2749950480560246, + "language_loss": 0.75399387, + "learning_rate": 3.855940884716071e-06, + "loss": 0.77591753, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.1763916, + "step": 2460, + "time_per_iteration": 2.670722484588623 + }, + { + "auxiliary_loss_clip": 0.01165573, + "auxiliary_loss_mlp": 0.01040882, + "balance_loss_clip": 1.05403519, + "balance_loss_mlp": 1.02238095, + "epoch": 0.14796332481587254, + "flos": 32341671303840.0, + "grad_norm": 1.7629098976011761, + "language_loss": 0.81526899, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83733344, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 1.11669922, + "router_z_loss_mlp": 0.18493652, + "step": 2461, + "time_per_iteration": 2.8816442489624023 + }, + { + "auxiliary_loss_clip": 0.01163457, + "auxiliary_loss_mlp": 0.0105239, + "balance_loss_clip": 1.05472779, + "balance_loss_mlp": 1.03282821, + "epoch": 0.1480234480685405, + "flos": 21430708129440.0, + "grad_norm": 4.43657481561455, + "language_loss": 0.66233087, + "learning_rate": 3.855650475213761e-06, + "loss": 0.68448931, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 1.08740234, + "router_z_loss_mlp": 0.19543457, + "step": 2462, + "time_per_iteration": 5.6367363929748535 + }, + { + "auxiliary_loss_clip": 0.01160235, + "auxiliary_loss_mlp": 0.01050403, + "balance_loss_clip": 1.05357194, + "balance_loss_mlp": 1.03150845, + "epoch": 0.14808357132120847, + "flos": 65383177013280.0, + "grad_norm": 2.4139721246341286, + "language_loss": 0.67300832, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69511473, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.18884277, + "step": 2463, + "time_per_iteration": 2.9680709838867188 + }, + { + "auxiliary_loss_clip": 0.01162522, + "auxiliary_loss_mlp": 0.01046489, + "balance_loss_clip": 1.0535624, + "balance_loss_mlp": 1.02758265, + "epoch": 0.14814369457387644, + "flos": 24195497212800.0, + "grad_norm": 4.38784049917083, + "language_loss": 0.7707392, + "learning_rate": 3.855359784245646e-06, + "loss": 0.79282933, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.18920898, + "step": 2464, + "time_per_iteration": 2.6626322269439697 + }, + { + "auxiliary_loss_clip": 0.01162089, + "auxiliary_loss_mlp": 0.01043957, + "balance_loss_clip": 1.0571599, + "balance_loss_mlp": 1.02687407, + "epoch": 0.1482038178265444, + "flos": 29181328652160.0, + "grad_norm": 1.963269859269241, + "language_loss": 0.79836404, + "learning_rate": 3.855214333225688e-06, + "loss": 0.82042444, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.17077637, + "step": 2465, + "time_per_iteration": 2.7057175636291504 + }, + { + "auxiliary_loss_clip": 0.01170886, + "auxiliary_loss_mlp": 0.01045907, + "balance_loss_clip": 1.05842483, + "balance_loss_mlp": 1.02689314, + "epoch": 0.1482639410792124, + "flos": 29493145703520.0, + "grad_norm": 1.5172604586484386, + "language_loss": 0.76170546, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78387338, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 1.12597656, + "router_z_loss_mlp": 0.18994141, + "step": 2466, + "time_per_iteration": 4.099774599075317 + }, + { + "auxiliary_loss_clip": 0.0106411, + "auxiliary_loss_mlp": 0.01028247, + "balance_loss_clip": 1.02609181, + "balance_loss_mlp": 1.02524567, + "epoch": 0.14832406433188036, + "flos": 80767504239840.0, + "grad_norm": 0.7885191307736104, + "language_loss": 0.60074961, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62167323, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 0.38037109, + "router_z_loss_mlp": 0.02998352, + "step": 2467, + "time_per_iteration": 4.63557243347168 + }, + { + "auxiliary_loss_clip": 0.01164837, + "auxiliary_loss_mlp": 0.01043742, + "balance_loss_clip": 1.05788708, + "balance_loss_mlp": 1.02533579, + "epoch": 0.14838418758454833, + "flos": 31007182095360.0, + "grad_norm": 2.413429944009483, + "language_loss": 0.87189204, + "learning_rate": 3.85477755808841e-06, + "loss": 0.89397788, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.18383789, + "step": 2468, + "time_per_iteration": 2.804349184036255 + }, + { + "auxiliary_loss_clip": 0.01166955, + "auxiliary_loss_mlp": 0.01052622, + "balance_loss_clip": 1.05647516, + "balance_loss_mlp": 1.03251088, + "epoch": 0.1484443108372163, + "flos": 28419022951200.0, + "grad_norm": 2.4541999514634982, + "language_loss": 0.75745332, + "learning_rate": 3.854631825701919e-06, + "loss": 0.77964908, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.2010498, + "step": 2469, + "time_per_iteration": 2.6705210208892822 + }, + { + "auxiliary_loss_clip": 0.01159829, + "auxiliary_loss_mlp": 0.01051308, + "balance_loss_clip": 1.05350161, + "balance_loss_mlp": 1.03347397, + "epoch": 0.14850443408988426, + "flos": 17872664493600.0, + "grad_norm": 2.796223936865341, + "language_loss": 0.76089478, + "learning_rate": 3.854486022987603e-06, + "loss": 0.78300619, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.1784668, + "step": 2470, + "time_per_iteration": 2.6608433723449707 + }, + { + "auxiliary_loss_clip": 0.01160219, + "auxiliary_loss_mlp": 0.01051653, + "balance_loss_clip": 1.05640149, + "balance_loss_mlp": 1.03389025, + "epoch": 0.14856455734255222, + "flos": 28734081384960.0, + "grad_norm": 1.7744964635438032, + "language_loss": 0.72317529, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74529397, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 1.03857422, + "router_z_loss_mlp": 0.1776123, + "step": 2471, + "time_per_iteration": 2.9220292568206787 + }, + { + "auxiliary_loss_clip": 0.01167211, + "auxiliary_loss_mlp": 0.01050949, + "balance_loss_clip": 1.05431986, + "balance_loss_mlp": 1.03166056, + "epoch": 0.1486246805952202, + "flos": 22057745683680.0, + "grad_norm": 1.8822245701864344, + "language_loss": 0.89620006, + "learning_rate": 3.854194206597615e-06, + "loss": 0.91838157, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 1.12792969, + "router_z_loss_mlp": 0.19274902, + "step": 2472, + "time_per_iteration": 2.6629323959350586 + }, + { + "auxiliary_loss_clip": 0.01161109, + "auxiliary_loss_mlp": 0.010578, + "balance_loss_clip": 1.05373263, + "balance_loss_mlp": 1.0391674, + "epoch": 0.14868480384788818, + "flos": 23615291283840.0, + "grad_norm": 2.461467319699841, + "language_loss": 0.80272329, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82491243, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 1.07324219, + "router_z_loss_mlp": 0.18640137, + "step": 2473, + "time_per_iteration": 2.6447970867156982 + }, + { + "auxiliary_loss_clip": 0.01165808, + "auxiliary_loss_mlp": 0.01064106, + "balance_loss_clip": 1.05563283, + "balance_loss_mlp": 1.04615319, + "epoch": 0.14874492710055615, + "flos": 27088828574400.0, + "grad_norm": 2.3694475556707464, + "language_loss": 0.77597868, + "learning_rate": 3.853902108962709e-06, + "loss": 0.7982778, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 1.1015625, + "router_z_loss_mlp": 0.17944336, + "step": 2474, + "time_per_iteration": 2.6530022621154785 + }, + { + "auxiliary_loss_clip": 0.0116471, + "auxiliary_loss_mlp": 0.01064151, + "balance_loss_clip": 1.05383515, + "balance_loss_mlp": 1.04503012, + "epoch": 0.1488050503532241, + "flos": 25750692810720.0, + "grad_norm": 1.7825717473088776, + "language_loss": 0.82548487, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84777355, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.19116211, + "step": 2475, + "time_per_iteration": 2.6758639812469482 + }, + { + "auxiliary_loss_clip": 0.0116408, + "auxiliary_loss_mlp": 0.01062673, + "balance_loss_clip": 1.05786836, + "balance_loss_mlp": 1.04473209, + "epoch": 0.14886517360589208, + "flos": 15601630164480.0, + "grad_norm": 1.7073072324871434, + "language_loss": 0.80332255, + "learning_rate": 3.85360973012719e-06, + "loss": 0.82559001, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.17932129, + "step": 2476, + "time_per_iteration": 2.6498398780822754 + }, + { + "auxiliary_loss_clip": 0.01158358, + "auxiliary_loss_mlp": 0.01061438, + "balance_loss_clip": 1.05679107, + "balance_loss_mlp": 1.04348516, + "epoch": 0.14892529685856004, + "flos": 35414387779680.0, + "grad_norm": 1.6979586287638335, + "language_loss": 0.77513903, + "learning_rate": 3.853463435273058e-06, + "loss": 0.797337, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.17956543, + "step": 2477, + "time_per_iteration": 2.7784886360168457 + }, + { + "auxiliary_loss_clip": 0.01062228, + "auxiliary_loss_mlp": 0.01027215, + "balance_loss_clip": 1.02562499, + "balance_loss_mlp": 1.02500677, + "epoch": 0.148985420111228, + "flos": 75562749171360.0, + "grad_norm": 1.3496245687298787, + "language_loss": 0.60162997, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62252444, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 0.36572266, + "router_z_loss_mlp": 0.02210999, + "step": 2478, + "time_per_iteration": 3.3294641971588135 + }, + { + "auxiliary_loss_clip": 0.01164621, + "auxiliary_loss_mlp": 0.01051339, + "balance_loss_clip": 1.05804038, + "balance_loss_mlp": 1.0340178, + "epoch": 0.149045543363896, + "flos": 29181490721280.0, + "grad_norm": 2.034879571651996, + "language_loss": 0.7115103, + "learning_rate": 3.853170634719787e-06, + "loss": 0.73366988, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.17321777, + "step": 2479, + "time_per_iteration": 2.7230629920959473 + }, + { + "auxiliary_loss_clip": 0.01162683, + "auxiliary_loss_mlp": 0.01051769, + "balance_loss_clip": 1.05555868, + "balance_loss_mlp": 1.03308868, + "epoch": 0.14910566661656396, + "flos": 28863839181600.0, + "grad_norm": 1.5094441290446965, + "language_loss": 0.80858684, + "learning_rate": 3.853024129031751e-06, + "loss": 0.83073133, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.18688965, + "step": 2480, + "time_per_iteration": 2.78377366065979 + }, + { + "auxiliary_loss_clip": 0.01164166, + "auxiliary_loss_mlp": 0.01049652, + "balance_loss_clip": 1.05533135, + "balance_loss_mlp": 1.03159165, + "epoch": 0.14916578986923193, + "flos": 25033395457440.0, + "grad_norm": 1.9592685641780903, + "language_loss": 0.84145629, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86359447, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.18054199, + "step": 2481, + "time_per_iteration": 2.682271957397461 + }, + { + "auxiliary_loss_clip": 0.01162457, + "auxiliary_loss_mlp": 0.01052217, + "balance_loss_clip": 1.05501533, + "balance_loss_mlp": 1.03263104, + "epoch": 0.1492259131218999, + "flos": 27444722076000.0, + "grad_norm": 2.1832545261985397, + "language_loss": 0.77605772, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79820442, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.19604492, + "step": 2482, + "time_per_iteration": 2.734065294265747 + }, + { + "auxiliary_loss_clip": 0.01171221, + "auxiliary_loss_mlp": 0.01041345, + "balance_loss_clip": 1.0584507, + "balance_loss_mlp": 1.02136517, + "epoch": 0.14928603637456786, + "flos": 28291574639520.0, + "grad_norm": 1.9114268247098114, + "language_loss": 0.7838062, + "learning_rate": 3.852584190388713e-06, + "loss": 0.80593187, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 1.12646484, + "router_z_loss_mlp": 0.19970703, + "step": 2483, + "time_per_iteration": 2.807854175567627 + }, + { + "auxiliary_loss_clip": 0.01157825, + "auxiliary_loss_mlp": 0.01041963, + "balance_loss_clip": 1.0553906, + "balance_loss_mlp": 1.02571511, + "epoch": 0.14934615962723582, + "flos": 26421361125120.0, + "grad_norm": 1.5202934428208614, + "language_loss": 0.70548809, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72748601, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 1.02294922, + "router_z_loss_mlp": 0.16247559, + "step": 2484, + "time_per_iteration": 2.8857181072235107 + }, + { + "auxiliary_loss_clip": 0.01166601, + "auxiliary_loss_mlp": 0.01044557, + "balance_loss_clip": 1.05609035, + "balance_loss_mlp": 1.02474451, + "epoch": 0.1494062828799038, + "flos": 32958782124480.0, + "grad_norm": 2.0531095883619415, + "language_loss": 0.84509653, + "learning_rate": 3.852290546699863e-06, + "loss": 0.86720812, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 1.10546875, + "router_z_loss_mlp": 0.19812012, + "step": 2485, + "time_per_iteration": 2.695326328277588 + }, + { + "auxiliary_loss_clip": 0.01167142, + "auxiliary_loss_mlp": 0.01044899, + "balance_loss_clip": 1.05789113, + "balance_loss_mlp": 1.02583742, + "epoch": 0.14946640613257178, + "flos": 25886568716640.0, + "grad_norm": 2.054890003868714, + "language_loss": 0.85262096, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.87474132, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 1.09277344, + "router_z_loss_mlp": 0.19067383, + "step": 2486, + "time_per_iteration": 2.7555508613586426 + }, + { + "auxiliary_loss_clip": 0.01159341, + "auxiliary_loss_mlp": 0.01042426, + "balance_loss_clip": 1.05463433, + "balance_loss_mlp": 1.02638066, + "epoch": 0.14952652938523975, + "flos": 16314592168800.0, + "grad_norm": 2.129262052640908, + "language_loss": 0.75044429, + "learning_rate": 3.851996622054842e-06, + "loss": 0.77246189, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.16027832, + "step": 2487, + "time_per_iteration": 2.6584150791168213 + }, + { + "auxiliary_loss_clip": 0.0115965, + "auxiliary_loss_mlp": 0.01047097, + "balance_loss_clip": 1.05402851, + "balance_loss_mlp": 1.02911997, + "epoch": 0.1495866526379077, + "flos": 43339855481280.0, + "grad_norm": 1.9256266645315194, + "language_loss": 0.71884179, + "learning_rate": 3.8518495543877e-06, + "loss": 0.74090928, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.17980957, + "step": 2488, + "time_per_iteration": 2.8608715534210205 + }, + { + "auxiliary_loss_clip": 0.01164597, + "auxiliary_loss_mlp": 0.01047166, + "balance_loss_clip": 1.05500317, + "balance_loss_mlp": 1.02889156, + "epoch": 0.14964677589057568, + "flos": 21515903268480.0, + "grad_norm": 2.71892323528331, + "language_loss": 0.70905018, + "learning_rate": 3.851702416498235e-06, + "loss": 0.73116779, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.18273926, + "step": 2489, + "time_per_iteration": 2.630764961242676 + }, + { + "auxiliary_loss_clip": 0.01163671, + "auxiliary_loss_mlp": 0.01049334, + "balance_loss_clip": 1.05476832, + "balance_loss_mlp": 1.03113103, + "epoch": 0.14970689914324364, + "flos": 24630832399680.0, + "grad_norm": 2.6423736422321493, + "language_loss": 0.8115555, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.83368552, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 1.08837891, + "router_z_loss_mlp": 0.18200684, + "step": 2490, + "time_per_iteration": 2.6696012020111084 + }, + { + "auxiliary_loss_clip": 0.01168261, + "auxiliary_loss_mlp": 0.01052688, + "balance_loss_clip": 1.05819607, + "balance_loss_mlp": 1.03555775, + "epoch": 0.1497670223959116, + "flos": 45428101244640.0, + "grad_norm": 5.7970785722542955, + "language_loss": 0.80365348, + "learning_rate": 3.851407930074666e-06, + "loss": 0.825863, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 1.09960938, + "router_z_loss_mlp": 0.17126465, + "step": 2491, + "time_per_iteration": 2.869926929473877 + }, + { + "auxiliary_loss_clip": 0.01165289, + "auxiliary_loss_mlp": 0.01046652, + "balance_loss_clip": 1.05368757, + "balance_loss_mlp": 1.02662492, + "epoch": 0.1498271456485796, + "flos": 29841948681120.0, + "grad_norm": 5.756831708800315, + "language_loss": 0.90677643, + "learning_rate": 3.851260581551727e-06, + "loss": 0.92889583, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 1.1171875, + "router_z_loss_mlp": 0.20043945, + "step": 2492, + "time_per_iteration": 2.7687060832977295 + }, + { + "auxiliary_loss_clip": 0.01162896, + "auxiliary_loss_mlp": 0.01057322, + "balance_loss_clip": 1.05621791, + "balance_loss_mlp": 1.03847551, + "epoch": 0.14988726890124757, + "flos": 19832286944160.0, + "grad_norm": 2.825472641452625, + "language_loss": 0.78833616, + "learning_rate": 3.851113162828802e-06, + "loss": 0.81053829, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.18823242, + "step": 2493, + "time_per_iteration": 2.7240421772003174 + }, + { + "auxiliary_loss_clip": 0.01161892, + "auxiliary_loss_mlp": 0.01042738, + "balance_loss_clip": 1.05424309, + "balance_loss_mlp": 1.02374816, + "epoch": 0.14994739215391553, + "flos": 25217480576160.0, + "grad_norm": 1.616681088272589, + "language_loss": 0.80135286, + "learning_rate": 3.85096567391148e-06, + "loss": 0.82339919, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.18994141, + "step": 2494, + "time_per_iteration": 2.6839869022369385 + }, + { + "auxiliary_loss_clip": 0.01158227, + "auxiliary_loss_mlp": 0.01048032, + "balance_loss_clip": 1.0544281, + "balance_loss_mlp": 1.02863669, + "epoch": 0.1500075154065835, + "flos": 86213987815680.0, + "grad_norm": 1.9121732759638872, + "language_loss": 0.66428959, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68635219, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.19384766, + "step": 2495, + "time_per_iteration": 3.1270196437835693 + }, + { + "auxiliary_loss_clip": 0.01060424, + "auxiliary_loss_mlp": 0.01001127, + "balance_loss_clip": 1.02290225, + "balance_loss_mlp": 0.99894392, + "epoch": 0.15006763865925146, + "flos": 82989802631520.0, + "grad_norm": 0.8872235572575321, + "language_loss": 0.59449512, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61511064, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 0.37475586, + "router_z_loss_mlp": 0.02186584, + "step": 2496, + "time_per_iteration": 3.4516689777374268 + }, + { + "auxiliary_loss_clip": 0.01160747, + "auxiliary_loss_mlp": 0.01053173, + "balance_loss_clip": 1.05199397, + "balance_loss_mlp": 1.03376627, + "epoch": 0.15012776191191943, + "flos": 23081552324640.0, + "grad_norm": 1.8334475402281867, + "language_loss": 0.65757823, + "learning_rate": 3.850522786049075e-06, + "loss": 0.67971742, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 1.08935547, + "router_z_loss_mlp": 0.19421387, + "step": 2497, + "time_per_iteration": 2.7269253730773926 + }, + { + "auxiliary_loss_clip": 0.01163795, + "auxiliary_loss_mlp": 0.01044913, + "balance_loss_clip": 1.05693197, + "balance_loss_mlp": 1.02701998, + "epoch": 0.1501878851645874, + "flos": 28920151850400.0, + "grad_norm": 1.4456724160450336, + "language_loss": 0.7539351, + "learning_rate": 3.850375016410121e-06, + "loss": 0.7760222, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.17883301, + "step": 2498, + "time_per_iteration": 2.8097124099731445 + }, + { + "auxiliary_loss_clip": 0.01168715, + "auxiliary_loss_mlp": 0.01042414, + "balance_loss_clip": 1.05778694, + "balance_loss_mlp": 1.02361476, + "epoch": 0.15024800841725539, + "flos": 24907365250560.0, + "grad_norm": 1.9998109060013889, + "language_loss": 0.72033238, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74244368, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 1.10839844, + "router_z_loss_mlp": 0.18811035, + "step": 2499, + "time_per_iteration": 2.6657230854034424 + }, + { + "auxiliary_loss_clip": 0.0116501, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_clip": 1.05660486, + "balance_loss_mlp": 1.03190279, + "epoch": 0.15030813166992335, + "flos": 38841214996800.0, + "grad_norm": 1.805242176034171, + "language_loss": 0.72252572, + "learning_rate": 3.850079266638601e-06, + "loss": 0.74468476, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.18981934, + "step": 2500, + "time_per_iteration": 2.803736448287964 + }, + { + "auxiliary_loss_clip": 0.01161691, + "auxiliary_loss_mlp": 0.0105675, + "balance_loss_clip": 1.05612946, + "balance_loss_mlp": 1.03808165, + "epoch": 0.15036825492259132, + "flos": 43511380243200.0, + "grad_norm": 1.8142557246090147, + "language_loss": 0.65345943, + "learning_rate": 3.849931286517249e-06, + "loss": 0.6756438, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.18664551, + "step": 2501, + "time_per_iteration": 2.8145203590393066 + }, + { + "auxiliary_loss_clip": 0.0116133, + "auxiliary_loss_mlp": 0.01056258, + "balance_loss_clip": 1.054533, + "balance_loss_mlp": 1.03648067, + "epoch": 0.15042837817525928, + "flos": 22986714072960.0, + "grad_norm": 2.066489226940213, + "language_loss": 0.83679891, + "learning_rate": 3.849783236246318e-06, + "loss": 0.85897475, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.19775391, + "step": 2502, + "time_per_iteration": 5.514384508132935 + }, + { + "auxiliary_loss_clip": 0.01157012, + "auxiliary_loss_mlp": 0.01052923, + "balance_loss_clip": 1.05174935, + "balance_loss_mlp": 1.03575647, + "epoch": 0.15048850142792725, + "flos": 23837334743520.0, + "grad_norm": 2.036719167546713, + "language_loss": 0.77498102, + "learning_rate": 3.849635115831421e-06, + "loss": 0.7970804, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.171875, + "step": 2503, + "time_per_iteration": 2.656179666519165 + }, + { + "auxiliary_loss_clip": 0.01157754, + "auxiliary_loss_mlp": 0.01043857, + "balance_loss_clip": 1.05403113, + "balance_loss_mlp": 1.02690589, + "epoch": 0.1505486246805952, + "flos": 26865974769120.0, + "grad_norm": 1.9833551153816795, + "language_loss": 0.84964287, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87165904, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.16943359, + "step": 2504, + "time_per_iteration": 2.648756504058838 + }, + { + "auxiliary_loss_clip": 0.01158041, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.05403388, + "balance_loss_mlp": 1.02356339, + "epoch": 0.15060874793326318, + "flos": 25310617102080.0, + "grad_norm": 1.6270784833073217, + "language_loss": 0.83133543, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85331959, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.16809082, + "step": 2505, + "time_per_iteration": 2.684333086013794 + }, + { + "auxiliary_loss_clip": 0.01160252, + "auxiliary_loss_mlp": 0.01047876, + "balance_loss_clip": 1.0547514, + "balance_loss_mlp": 1.03042352, + "epoch": 0.15066887118593117, + "flos": 20098730992320.0, + "grad_norm": 1.9247813746229367, + "language_loss": 0.75685012, + "learning_rate": 3.849190333779117e-06, + "loss": 0.77893138, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.17443848, + "step": 2506, + "time_per_iteration": 5.702074289321899 + }, + { + "auxiliary_loss_clip": 0.01163997, + "auxiliary_loss_mlp": 0.01045478, + "balance_loss_clip": 1.05545735, + "balance_loss_mlp": 1.0273937, + "epoch": 0.15072899443859913, + "flos": 24232847794560.0, + "grad_norm": 3.463658383261325, + "language_loss": 0.76111597, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78321069, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 1.08447266, + "router_z_loss_mlp": 0.1809082, + "step": 2507, + "time_per_iteration": 2.796875476837158 + }, + { + "auxiliary_loss_clip": 0.01154477, + "auxiliary_loss_mlp": 0.01043099, + "balance_loss_clip": 1.0516274, + "balance_loss_mlp": 1.02584982, + "epoch": 0.1507891176912671, + "flos": 25351290100800.0, + "grad_norm": 1.95095518298516, + "language_loss": 0.69057333, + "learning_rate": 3.848893461794131e-06, + "loss": 0.71254909, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 1.02832031, + "router_z_loss_mlp": 0.17236328, + "step": 2508, + "time_per_iteration": 2.6654787063598633 + }, + { + "auxiliary_loss_clip": 0.0116525, + "auxiliary_loss_mlp": 0.01049549, + "balance_loss_clip": 1.05862057, + "balance_loss_mlp": 1.03228736, + "epoch": 0.15084924094393506, + "flos": 28780669906560.0, + "grad_norm": 2.632612878406783, + "language_loss": 0.7720198, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.79416788, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.17272949, + "step": 2509, + "time_per_iteration": 2.6898443698883057 + }, + { + "auxiliary_loss_clip": 0.01169249, + "auxiliary_loss_mlp": 0.0105159, + "balance_loss_clip": 1.05564332, + "balance_loss_mlp": 1.03203964, + "epoch": 0.15090936419660303, + "flos": 23075960940000.0, + "grad_norm": 2.31507318267434, + "language_loss": 0.79975694, + "learning_rate": 3.848596309368246e-06, + "loss": 0.82196534, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 1.13671875, + "router_z_loss_mlp": 0.19543457, + "step": 2510, + "time_per_iteration": 2.6232330799102783 + }, + { + "auxiliary_loss_clip": 0.01163379, + "auxiliary_loss_mlp": 0.01053778, + "balance_loss_clip": 1.05552793, + "balance_loss_mlp": 1.03515792, + "epoch": 0.150969487449271, + "flos": 21876132119040.0, + "grad_norm": 1.910565151473622, + "language_loss": 0.74062562, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.76279718, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.18615723, + "step": 2511, + "time_per_iteration": 2.758549928665161 + }, + { + "auxiliary_loss_clip": 0.01162303, + "auxiliary_loss_mlp": 0.01041583, + "balance_loss_clip": 1.05665648, + "balance_loss_mlp": 1.02436924, + "epoch": 0.151029610701939, + "flos": 29581744294080.0, + "grad_norm": 1.994571923357722, + "language_loss": 0.69405723, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71609604, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.17199707, + "step": 2512, + "time_per_iteration": 2.676713705062866 + }, + { + "auxiliary_loss_clip": 0.01162625, + "auxiliary_loss_mlp": 0.01047441, + "balance_loss_clip": 1.0569005, + "balance_loss_mlp": 1.02991724, + "epoch": 0.15108973395460695, + "flos": 36927168135840.0, + "grad_norm": 2.3133398580305014, + "language_loss": 0.73657894, + "learning_rate": 3.84815005500134e-06, + "loss": 0.75867957, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.17529297, + "step": 2513, + "time_per_iteration": 2.7356574535369873 + }, + { + "auxiliary_loss_clip": 0.01058474, + "auxiliary_loss_mlp": 0.01014092, + "balance_loss_clip": 1.02154922, + "balance_loss_mlp": 1.01199079, + "epoch": 0.15114985720727492, + "flos": 73745687977920.0, + "grad_norm": 0.857258097619455, + "language_loss": 0.64721483, + "learning_rate": 3.84800116337411e-06, + "loss": 0.6679405, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 0.36938477, + "router_z_loss_mlp": 0.02102661, + "step": 2514, + "time_per_iteration": 3.2155494689941406 + }, + { + "auxiliary_loss_clip": 0.01156735, + "auxiliary_loss_mlp": 0.01037902, + "balance_loss_clip": 1.05253673, + "balance_loss_mlp": 1.02029467, + "epoch": 0.15120998045994288, + "flos": 25040607533280.0, + "grad_norm": 2.3843457336246794, + "language_loss": 0.73036683, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.75231326, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.17614746, + "step": 2515, + "time_per_iteration": 2.6384570598602295 + }, + { + "auxiliary_loss_clip": 0.01157911, + "auxiliary_loss_mlp": 0.01040294, + "balance_loss_clip": 1.05562747, + "balance_loss_mlp": 1.02231765, + "epoch": 0.15127010371261085, + "flos": 25842532783680.0, + "grad_norm": 2.577720735690167, + "language_loss": 0.77683169, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.7988137, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.1796875, + "step": 2516, + "time_per_iteration": 2.6675891876220703 + }, + { + "auxiliary_loss_clip": 0.01058592, + "auxiliary_loss_mlp": 0.01005525, + "balance_loss_clip": 1.02169335, + "balance_loss_mlp": 1.00337756, + "epoch": 0.1513302269652788, + "flos": 79704888395040.0, + "grad_norm": 0.7250380477631365, + "language_loss": 0.54689217, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56753337, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 0.02149963, + "step": 2517, + "time_per_iteration": 3.2981009483337402 + }, + { + "auxiliary_loss_clip": 0.01158229, + "auxiliary_loss_mlp": 0.01040027, + "balance_loss_clip": 1.05283976, + "balance_loss_mlp": 1.02144194, + "epoch": 0.15139035021794678, + "flos": 23349414477600.0, + "grad_norm": 1.8761136674078818, + "language_loss": 0.78545928, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.80744183, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.18579102, + "step": 2518, + "time_per_iteration": 2.6608684062957764 + }, + { + "auxiliary_loss_clip": 0.01164781, + "auxiliary_loss_mlp": 0.01048834, + "balance_loss_clip": 1.05629551, + "balance_loss_mlp": 1.03045225, + "epoch": 0.15145047347061477, + "flos": 32431647481920.0, + "grad_norm": 2.0377693119512585, + "language_loss": 0.70265543, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72479165, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 1.0859375, + "router_z_loss_mlp": 0.18383789, + "step": 2519, + "time_per_iteration": 2.9323437213897705 + }, + { + "auxiliary_loss_clip": 0.01162647, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.05711997, + "balance_loss_mlp": 1.02784288, + "epoch": 0.15151059672328274, + "flos": 24773677277760.0, + "grad_norm": 1.8623471447796776, + "language_loss": 0.7865833, + "learning_rate": 3.847106342204354e-06, + "loss": 0.80866671, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.17834473, + "step": 2520, + "time_per_iteration": 2.670802354812622 + }, + { + "auxiliary_loss_clip": 0.01162657, + "auxiliary_loss_mlp": 0.0104979, + "balance_loss_clip": 1.05498242, + "balance_loss_mlp": 1.03132439, + "epoch": 0.1515707199759507, + "flos": 33225226172640.0, + "grad_norm": 1.8337967558810537, + "language_loss": 0.75099528, + "learning_rate": 3.846956960161114e-06, + "loss": 0.77311969, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 1.07861328, + "router_z_loss_mlp": 0.18457031, + "step": 2521, + "time_per_iteration": 2.6981587409973145 + }, + { + "auxiliary_loss_clip": 0.01162757, + "auxiliary_loss_mlp": 0.01046376, + "balance_loss_clip": 1.05497754, + "balance_loss_mlp": 1.02748132, + "epoch": 0.15163084322861867, + "flos": 28781480252160.0, + "grad_norm": 3.8043732546564457, + "language_loss": 0.82260048, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84469181, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 1.07714844, + "router_z_loss_mlp": 0.18908691, + "step": 2522, + "time_per_iteration": 2.705735683441162 + }, + { + "auxiliary_loss_clip": 0.01060169, + "auxiliary_loss_mlp": 0.01008901, + "balance_loss_clip": 1.02358663, + "balance_loss_mlp": 1.00649011, + "epoch": 0.15169096648128663, + "flos": 81619461980640.0, + "grad_norm": 0.8241763828773552, + "language_loss": 0.57903689, + "learning_rate": 3.846657985969922e-06, + "loss": 0.59972763, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 0.36621094, + "router_z_loss_mlp": 0.02407837, + "step": 2523, + "time_per_iteration": 3.219693183898926 + }, + { + "auxiliary_loss_clip": 0.01159791, + "auxiliary_loss_mlp": 0.01048066, + "balance_loss_clip": 1.05520689, + "balance_loss_mlp": 1.02939773, + "epoch": 0.1517510897339546, + "flos": 35502986370240.0, + "grad_norm": 3.5948959496711113, + "language_loss": 0.75191426, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.77399284, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.18676758, + "step": 2524, + "time_per_iteration": 2.7527694702148438 + }, + { + "auxiliary_loss_clip": 0.01158877, + "auxiliary_loss_mlp": 0.01041819, + "balance_loss_clip": 1.05362296, + "balance_loss_mlp": 1.02394915, + "epoch": 0.1518112129866226, + "flos": 22459457878560.0, + "grad_norm": 1.755157874006897, + "language_loss": 0.74571472, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.76772165, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.17871094, + "step": 2525, + "time_per_iteration": 2.693458080291748 + }, + { + "auxiliary_loss_clip": 0.011631, + "auxiliary_loss_mlp": 0.0104932, + "balance_loss_clip": 1.05605102, + "balance_loss_mlp": 1.03094959, + "epoch": 0.15187133623929056, + "flos": 23702998494240.0, + "grad_norm": 1.9007172373649688, + "language_loss": 0.8005203, + "learning_rate": 3.846208999506402e-06, + "loss": 0.82264447, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.18383789, + "step": 2526, + "time_per_iteration": 2.66398024559021 + }, + { + "auxiliary_loss_clip": 0.01157026, + "auxiliary_loss_mlp": 0.01046192, + "balance_loss_clip": 1.05597544, + "balance_loss_mlp": 1.02895474, + "epoch": 0.15193145949195852, + "flos": 21434111580960.0, + "grad_norm": 2.8937954597808213, + "language_loss": 0.85605419, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87808639, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.17236328, + "step": 2527, + "time_per_iteration": 2.682004690170288 + }, + { + "auxiliary_loss_clip": 0.01160637, + "auxiliary_loss_mlp": 0.01040134, + "balance_loss_clip": 1.05523562, + "balance_loss_mlp": 1.02251458, + "epoch": 0.15199158274462649, + "flos": 44143320388320.0, + "grad_norm": 1.7035412398870184, + "language_loss": 0.6924541, + "learning_rate": 3.845909325145779e-06, + "loss": 0.7144618, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.17626953, + "step": 2528, + "time_per_iteration": 2.8170418739318848 + }, + { + "auxiliary_loss_clip": 0.01159584, + "auxiliary_loss_mlp": 0.01047201, + "balance_loss_clip": 1.05491209, + "balance_loss_mlp": 1.02937913, + "epoch": 0.15205170599729445, + "flos": 28155334078080.0, + "grad_norm": 1.7995317309586276, + "language_loss": 0.86904496, + "learning_rate": 3.845759382967026e-06, + "loss": 0.8911128, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.17822266, + "step": 2529, + "time_per_iteration": 2.677255868911743 + }, + { + "auxiliary_loss_clip": 0.01159089, + "auxiliary_loss_mlp": 0.01039939, + "balance_loss_clip": 1.05518699, + "balance_loss_mlp": 1.02175975, + "epoch": 0.15211182924996242, + "flos": 26732732486400.0, + "grad_norm": 2.1268914744828837, + "language_loss": 0.83483571, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85682595, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.18164062, + "step": 2530, + "time_per_iteration": 2.6724298000335693 + }, + { + "auxiliary_loss_clip": 0.01159253, + "auxiliary_loss_mlp": 0.01045535, + "balance_loss_clip": 1.05428064, + "balance_loss_mlp": 1.02716482, + "epoch": 0.15217195250263038, + "flos": 16937740064160.0, + "grad_norm": 1.847568130389204, + "language_loss": 0.80658001, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82862788, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 1.04980469, + "router_z_loss_mlp": 0.18359375, + "step": 2531, + "time_per_iteration": 2.699228286743164 + }, + { + "auxiliary_loss_clip": 0.0115802, + "auxiliary_loss_mlp": 0.01042665, + "balance_loss_clip": 1.05403137, + "balance_loss_mlp": 1.02573681, + "epoch": 0.15223207575529837, + "flos": 29938326589440.0, + "grad_norm": 1.648371772050807, + "language_loss": 0.7900542, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.81206107, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.16955566, + "step": 2532, + "time_per_iteration": 2.884319543838501 + }, + { + "auxiliary_loss_clip": 0.01155959, + "auxiliary_loss_mlp": 0.01047675, + "balance_loss_clip": 1.05406666, + "balance_loss_mlp": 1.02973461, + "epoch": 0.15229219900796634, + "flos": 31184824966560.0, + "grad_norm": 1.7307197106385375, + "language_loss": 0.87881207, + "learning_rate": 3.845158914395105e-06, + "loss": 0.90084845, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.17932129, + "step": 2533, + "time_per_iteration": 2.7155141830444336 + }, + { + "auxiliary_loss_clip": 0.01160922, + "auxiliary_loss_mlp": 0.01052271, + "balance_loss_clip": 1.0546298, + "balance_loss_mlp": 1.03349519, + "epoch": 0.1523523222606343, + "flos": 22229716135680.0, + "grad_norm": 6.92790160524744, + "language_loss": 0.7894209, + "learning_rate": 3.84500862231636e-06, + "loss": 0.81155282, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.18774414, + "step": 2534, + "time_per_iteration": 2.6616413593292236 + }, + { + "auxiliary_loss_clip": 0.01163635, + "auxiliary_loss_mlp": 0.01043378, + "balance_loss_clip": 1.05448651, + "balance_loss_mlp": 1.02464986, + "epoch": 0.15241244551330227, + "flos": 16179526608480.0, + "grad_norm": 2.3974653554387784, + "language_loss": 0.77045351, + "learning_rate": 3.844858260274702e-06, + "loss": 0.79252362, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 1.09130859, + "router_z_loss_mlp": 0.18713379, + "step": 2535, + "time_per_iteration": 2.646493911743164 + }, + { + "auxiliary_loss_clip": 0.01164243, + "auxiliary_loss_mlp": 0.01043839, + "balance_loss_clip": 1.0543294, + "balance_loss_mlp": 1.02602947, + "epoch": 0.15247256876597023, + "flos": 24057230787360.0, + "grad_norm": 2.4100260907606295, + "language_loss": 0.78426409, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80634493, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 1.09863281, + "router_z_loss_mlp": 0.17822266, + "step": 2536, + "time_per_iteration": 2.6396303176879883 + }, + { + "auxiliary_loss_clip": 0.01157788, + "auxiliary_loss_mlp": 0.01052658, + "balance_loss_clip": 1.05633223, + "balance_loss_mlp": 1.0356468, + "epoch": 0.1525326920186382, + "flos": 24862640523840.0, + "grad_norm": 2.0606376696450495, + "language_loss": 0.75399059, + "learning_rate": 3.844557326325461e-06, + "loss": 0.77609509, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 1.01464844, + "router_z_loss_mlp": 0.17004395, + "step": 2537, + "time_per_iteration": 2.6842148303985596 + }, + { + "auxiliary_loss_clip": 0.01163718, + "auxiliary_loss_mlp": 0.01046303, + "balance_loss_clip": 1.05718732, + "balance_loss_mlp": 1.02829063, + "epoch": 0.15259281527130616, + "flos": 16581927597120.0, + "grad_norm": 2.6453142862669314, + "language_loss": 0.77558887, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.79768908, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.18017578, + "step": 2538, + "time_per_iteration": 2.7547638416290283 + }, + { + "auxiliary_loss_clip": 0.01155866, + "auxiliary_loss_mlp": 0.01039807, + "balance_loss_clip": 1.05401111, + "balance_loss_mlp": 1.0238564, + "epoch": 0.15265293852397416, + "flos": 27894116759040.0, + "grad_norm": 1.6649305441136386, + "language_loss": 0.89952385, + "learning_rate": 3.844256112593029e-06, + "loss": 0.92148054, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.15936279, + "step": 2539, + "time_per_iteration": 2.6872246265411377 + }, + { + "auxiliary_loss_clip": 0.01159745, + "auxiliary_loss_mlp": 0.01047184, + "balance_loss_clip": 1.05491066, + "balance_loss_mlp": 1.0296483, + "epoch": 0.15271306177664212, + "flos": 35677387859040.0, + "grad_norm": 2.034927931600072, + "language_loss": 0.93642992, + "learning_rate": 3.844105400822391e-06, + "loss": 0.95849919, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 1.04833984, + "router_z_loss_mlp": 0.17529297, + "step": 2540, + "time_per_iteration": 2.7507076263427734 + }, + { + "auxiliary_loss_clip": 0.01155465, + "auxiliary_loss_mlp": 0.01039026, + "balance_loss_clip": 1.05413842, + "balance_loss_mlp": 1.02283788, + "epoch": 0.1527731850293101, + "flos": 38127766785120.0, + "grad_norm": 1.938281689240457, + "language_loss": 0.7533046, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77524948, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 1.01318359, + "router_z_loss_mlp": 0.1619873, + "step": 2541, + "time_per_iteration": 5.6597442626953125 + }, + { + "auxiliary_loss_clip": 0.01157759, + "auxiliary_loss_mlp": 0.01042945, + "balance_loss_clip": 1.05494499, + "balance_loss_mlp": 1.02577877, + "epoch": 0.15283330828197805, + "flos": 27310831516800.0, + "grad_norm": 1.5449942487990393, + "language_loss": 0.8129214, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83492845, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.17175293, + "step": 2542, + "time_per_iteration": 2.7052390575408936 + }, + { + "auxiliary_loss_clip": 0.01161116, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_clip": 1.05552888, + "balance_loss_mlp": 1.02767324, + "epoch": 0.15289343153464602, + "flos": 31140100239840.0, + "grad_norm": 2.275309348459378, + "language_loss": 0.77455604, + "learning_rate": 3.843652845961383e-06, + "loss": 0.79662275, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 1.05517578, + "router_z_loss_mlp": 0.17883301, + "step": 2543, + "time_per_iteration": 2.89931583404541 + }, + { + "auxiliary_loss_clip": 0.0115819, + "auxiliary_loss_mlp": 0.01045262, + "balance_loss_clip": 1.05495095, + "balance_loss_mlp": 1.02845395, + "epoch": 0.15295355478731398, + "flos": 27711044572320.0, + "grad_norm": 1.967142692724756, + "language_loss": 0.86437321, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88640773, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.16809082, + "step": 2544, + "time_per_iteration": 2.671844959259033 + }, + { + "auxiliary_loss_clip": 0.01162305, + "auxiliary_loss_mlp": 0.01050282, + "balance_loss_clip": 1.05392957, + "balance_loss_mlp": 1.03108943, + "epoch": 0.15301367803998198, + "flos": 28372799085120.0, + "grad_norm": 2.321221914056181, + "language_loss": 0.82390237, + "learning_rate": 3.843350793153673e-06, + "loss": 0.84602827, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 1.08300781, + "router_z_loss_mlp": 0.1920166, + "step": 2545, + "time_per_iteration": 4.207353353500366 + }, + { + "auxiliary_loss_clip": 0.01159452, + "auxiliary_loss_mlp": 0.01037797, + "balance_loss_clip": 1.05543399, + "balance_loss_mlp": 1.02046442, + "epoch": 0.15307380129264994, + "flos": 31586780265120.0, + "grad_norm": 2.0652170167842288, + "language_loss": 0.71139115, + "learning_rate": 3.843199661896884e-06, + "loss": 0.73336363, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.17333984, + "step": 2546, + "time_per_iteration": 2.716198682785034 + }, + { + "auxiliary_loss_clip": 0.01163427, + "auxiliary_loss_mlp": 0.01043394, + "balance_loss_clip": 1.05696452, + "balance_loss_mlp": 1.02525079, + "epoch": 0.1531339245453179, + "flos": 57318875644320.0, + "grad_norm": 1.6064053334564923, + "language_loss": 0.77722579, + "learning_rate": 3.843048460745779e-06, + "loss": 0.79929399, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 1.06445312, + "router_z_loss_mlp": 0.18139648, + "step": 2547, + "time_per_iteration": 2.8929221630096436 + }, + { + "auxiliary_loss_clip": 0.01163737, + "auxiliary_loss_mlp": 0.0105164, + "balance_loss_clip": 1.05681205, + "balance_loss_mlp": 1.03391314, + "epoch": 0.15319404779798587, + "flos": 43784509642560.0, + "grad_norm": 2.1293840288833366, + "language_loss": 0.74332321, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76547694, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 1.06787109, + "router_z_loss_mlp": 0.17736816, + "step": 2548, + "time_per_iteration": 2.7759439945220947 + }, + { + "auxiliary_loss_clip": 0.01161753, + "auxiliary_loss_mlp": 0.01051079, + "balance_loss_clip": 1.05609584, + "balance_loss_mlp": 1.03306615, + "epoch": 0.15325417105065384, + "flos": 31316568109920.0, + "grad_norm": 1.6532713156379741, + "language_loss": 0.80440807, + "learning_rate": 3.842745848783558e-06, + "loss": 0.82653642, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.18005371, + "step": 2549, + "time_per_iteration": 2.7229650020599365 + }, + { + "auxiliary_loss_clip": 0.01161531, + "auxiliary_loss_mlp": 0.01046512, + "balance_loss_clip": 1.05566704, + "balance_loss_mlp": 1.02849913, + "epoch": 0.1533142943033218, + "flos": 22904031005280.0, + "grad_norm": 1.6414571254662484, + "language_loss": 0.74946642, + "learning_rate": 3.842594437983917e-06, + "loss": 0.77154684, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.18005371, + "step": 2550, + "time_per_iteration": 2.6384778022766113 + }, + { + "auxiliary_loss_clip": 0.01163777, + "auxiliary_loss_mlp": 0.01039142, + "balance_loss_clip": 1.05578983, + "balance_loss_mlp": 1.0207715, + "epoch": 0.15337441755598977, + "flos": 28196290697760.0, + "grad_norm": 2.3628285027356566, + "language_loss": 0.76938164, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.79141086, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.18383789, + "step": 2551, + "time_per_iteration": 2.7064449787139893 + }, + { + "auxiliary_loss_clip": 0.01063037, + "auxiliary_loss_mlp": 0.01017837, + "balance_loss_clip": 1.02716613, + "balance_loss_mlp": 1.01540089, + "epoch": 0.15343454080865776, + "flos": 73041477706080.0, + "grad_norm": 0.9340786507856906, + "language_loss": 0.56677073, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58757949, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 0.35864258, + "router_z_loss_mlp": 0.02433777, + "step": 2552, + "time_per_iteration": 3.205134391784668 + }, + { + "auxiliary_loss_clip": 0.01164784, + "auxiliary_loss_mlp": 0.01043031, + "balance_loss_clip": 1.05608201, + "balance_loss_mlp": 1.02472091, + "epoch": 0.15349466406132573, + "flos": 14533058279520.0, + "grad_norm": 2.277087904257471, + "language_loss": 0.88562465, + "learning_rate": 3.84213978637978e-06, + "loss": 0.9077028, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 1.08642578, + "router_z_loss_mlp": 0.18322754, + "step": 2553, + "time_per_iteration": 2.7417845726013184 + }, + { + "auxiliary_loss_clip": 0.01164476, + "auxiliary_loss_mlp": 0.01044092, + "balance_loss_clip": 1.05645001, + "balance_loss_mlp": 1.02539957, + "epoch": 0.1535547873139937, + "flos": 29403412629120.0, + "grad_norm": 1.738645374654559, + "language_loss": 0.78141487, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80350053, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 1.08007812, + "router_z_loss_mlp": 0.18701172, + "step": 2554, + "time_per_iteration": 2.7297863960266113 + }, + { + "auxiliary_loss_clip": 0.01164458, + "auxiliary_loss_mlp": 0.01055145, + "balance_loss_clip": 1.05675364, + "balance_loss_mlp": 1.03695369, + "epoch": 0.15361491056666166, + "flos": 21434395201920.0, + "grad_norm": 2.0867933072568916, + "language_loss": 0.78077525, + "learning_rate": 3.841836336030151e-06, + "loss": 0.8029713, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 1.07763672, + "router_z_loss_mlp": 0.18188477, + "step": 2555, + "time_per_iteration": 2.9335179328918457 + }, + { + "auxiliary_loss_clip": 0.01159532, + "auxiliary_loss_mlp": 0.01047299, + "balance_loss_clip": 1.05651319, + "balance_loss_mlp": 1.03089547, + "epoch": 0.15367503381932962, + "flos": 30561555519360.0, + "grad_norm": 1.563299507747859, + "language_loss": 0.7694068, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.79147512, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.1640625, + "step": 2556, + "time_per_iteration": 2.7114081382751465 + }, + { + "auxiliary_loss_clip": 0.01156109, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.05476987, + "balance_loss_mlp": 1.02243197, + "epoch": 0.15373515707199759, + "flos": 26243313081120.0, + "grad_norm": 1.8712520081847206, + "language_loss": 0.90168399, + "learning_rate": 3.84153260631005e-06, + "loss": 0.92364752, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.17810059, + "step": 2557, + "time_per_iteration": 2.730616569519043 + }, + { + "auxiliary_loss_clip": 0.01160683, + "auxiliary_loss_mlp": 0.01047739, + "balance_loss_clip": 1.05476308, + "balance_loss_mlp": 1.02909505, + "epoch": 0.15379528032466555, + "flos": 31719212202240.0, + "grad_norm": 1.8847208484453961, + "language_loss": 0.70933616, + "learning_rate": 3.841380636700468e-06, + "loss": 0.7314204, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.18640137, + "step": 2558, + "time_per_iteration": 2.7169337272644043 + }, + { + "auxiliary_loss_clip": 0.01162445, + "auxiliary_loss_mlp": 0.0104459, + "balance_loss_clip": 1.05637729, + "balance_loss_mlp": 1.02611804, + "epoch": 0.15385540357733354, + "flos": 23522478896160.0, + "grad_norm": 1.9641096510725315, + "language_loss": 0.9225353, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94460565, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.18475342, + "step": 2559, + "time_per_iteration": 2.703650712966919 + }, + { + "auxiliary_loss_clip": 0.01164815, + "auxiliary_loss_mlp": 0.01053914, + "balance_loss_clip": 1.05798841, + "balance_loss_mlp": 1.0348171, + "epoch": 0.1539155268300015, + "flos": 34836248232000.0, + "grad_norm": 2.3388112820416116, + "language_loss": 0.64399904, + "learning_rate": 3.841076488011055e-06, + "loss": 0.66618633, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.19091797, + "step": 2560, + "time_per_iteration": 2.7297894954681396 + }, + { + "auxiliary_loss_clip": 0.01165464, + "auxiliary_loss_mlp": 0.01044919, + "balance_loss_clip": 1.05755782, + "balance_loss_mlp": 1.0260005, + "epoch": 0.15397565008266947, + "flos": 28733595177600.0, + "grad_norm": 1.5950793885030599, + "language_loss": 0.87957478, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.90167856, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 1.07910156, + "router_z_loss_mlp": 0.18920898, + "step": 2561, + "time_per_iteration": 2.7178308963775635 + }, + { + "auxiliary_loss_clip": 0.01157775, + "auxiliary_loss_mlp": 0.01042217, + "balance_loss_clip": 1.05618453, + "balance_loss_mlp": 1.02544439, + "epoch": 0.15403577333533744, + "flos": 20901466588320.0, + "grad_norm": 1.7463788085816387, + "language_loss": 0.82916158, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85116154, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.16772461, + "step": 2562, + "time_per_iteration": 2.6765189170837402 + }, + { + "auxiliary_loss_clip": 0.01170298, + "auxiliary_loss_mlp": 0.01047046, + "balance_loss_clip": 1.05933094, + "balance_loss_mlp": 1.02622008, + "epoch": 0.1540958965880054, + "flos": 21834729809280.0, + "grad_norm": 1.8979917995811657, + "language_loss": 0.74578369, + "learning_rate": 3.840619741387832e-06, + "loss": 0.76795709, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 1.109375, + "router_z_loss_mlp": 0.20812988, + "step": 2563, + "time_per_iteration": 2.730865240097046 + }, + { + "auxiliary_loss_clip": 0.01162748, + "auxiliary_loss_mlp": 0.01041272, + "balance_loss_clip": 1.0546844, + "balance_loss_mlp": 1.02218652, + "epoch": 0.15415601984067337, + "flos": 39243818571840.0, + "grad_norm": 2.0135943663414873, + "language_loss": 0.7630378, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.78507793, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.19067383, + "step": 2564, + "time_per_iteration": 2.783672332763672 + }, + { + "auxiliary_loss_clip": 0.01159709, + "auxiliary_loss_mlp": 0.01050734, + "balance_loss_clip": 1.05486488, + "balance_loss_mlp": 1.03324592, + "epoch": 0.15421614309334136, + "flos": 29314895073120.0, + "grad_norm": 3.4978473667779166, + "language_loss": 0.70188248, + "learning_rate": 3.840314894646969e-06, + "loss": 0.72398686, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.17492676, + "step": 2565, + "time_per_iteration": 2.883265495300293 + }, + { + "auxiliary_loss_clip": 0.01160589, + "auxiliary_loss_mlp": 0.01048942, + "balance_loss_clip": 1.05595422, + "balance_loss_mlp": 1.03024983, + "epoch": 0.15427626634600933, + "flos": 29756753542080.0, + "grad_norm": 1.9859402592726256, + "language_loss": 0.71866065, + "learning_rate": 3.840162366596259e-06, + "loss": 0.74075592, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.18676758, + "step": 2566, + "time_per_iteration": 2.80719256401062 + }, + { + "auxiliary_loss_clip": 0.01155315, + "auxiliary_loss_mlp": 0.01042742, + "balance_loss_clip": 1.05257654, + "balance_loss_mlp": 1.02533746, + "epoch": 0.1543363895986773, + "flos": 28469339062560.0, + "grad_norm": 1.7384054528605484, + "language_loss": 0.8502481, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87222868, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.1739502, + "step": 2567, + "time_per_iteration": 2.81064510345459 + }, + { + "auxiliary_loss_clip": 0.0116031, + "auxiliary_loss_mlp": 0.01043114, + "balance_loss_clip": 1.05733967, + "balance_loss_mlp": 1.02610254, + "epoch": 0.15439651285134526, + "flos": 29620310394240.0, + "grad_norm": 2.538022147125824, + "language_loss": 0.78175199, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80378622, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 1.03076172, + "router_z_loss_mlp": 0.17016602, + "step": 2568, + "time_per_iteration": 2.8917834758758545 + }, + { + "auxiliary_loss_clip": 0.01160435, + "auxiliary_loss_mlp": 0.01038874, + "balance_loss_clip": 1.05671108, + "balance_loss_mlp": 1.01965797, + "epoch": 0.15445663610401322, + "flos": 27401131833120.0, + "grad_norm": 2.096403084062675, + "language_loss": 0.70554113, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72753417, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.19226074, + "step": 2569, + "time_per_iteration": 2.740541696548462 + }, + { + "auxiliary_loss_clip": 0.01156141, + "auxiliary_loss_mlp": 0.01040997, + "balance_loss_clip": 1.05508459, + "balance_loss_mlp": 1.02402127, + "epoch": 0.1545167593566812, + "flos": 26905391732160.0, + "grad_norm": 1.6917171077571018, + "language_loss": 0.76748931, + "learning_rate": 3.839551556659884e-06, + "loss": 0.78946066, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 1.00927734, + "router_z_loss_mlp": 0.1697998, + "step": 2570, + "time_per_iteration": 2.64229679107666 + }, + { + "auxiliary_loss_clip": 0.01157611, + "auxiliary_loss_mlp": 0.01039037, + "balance_loss_clip": 1.05645275, + "balance_loss_mlp": 1.02126312, + "epoch": 0.15457688260934915, + "flos": 23572025179200.0, + "grad_norm": 2.96462204385677, + "language_loss": 0.77329767, + "learning_rate": 3.839398679771359e-06, + "loss": 0.79526412, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 1.01123047, + "router_z_loss_mlp": 0.1776123, + "step": 2571, + "time_per_iteration": 2.710857391357422 + }, + { + "auxiliary_loss_clip": 0.01160364, + "auxiliary_loss_mlp": 0.01048821, + "balance_loss_clip": 1.05621803, + "balance_loss_mlp": 1.03110623, + "epoch": 0.15463700586201715, + "flos": 29448583045920.0, + "grad_norm": 3.240836859318164, + "language_loss": 0.8245908, + "learning_rate": 3.839245733132652e-06, + "loss": 0.84668267, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.17724609, + "step": 2572, + "time_per_iteration": 2.675676107406616 + }, + { + "auxiliary_loss_clip": 0.01163825, + "auxiliary_loss_mlp": 0.01044872, + "balance_loss_clip": 1.05777359, + "balance_loss_mlp": 1.02707434, + "epoch": 0.1546971291146851, + "flos": 27358027797600.0, + "grad_norm": 1.566377091895519, + "language_loss": 0.90699136, + "learning_rate": 3.839092716749563e-06, + "loss": 0.92907834, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.17785645, + "step": 2573, + "time_per_iteration": 2.7428817749023438 + }, + { + "auxiliary_loss_clip": 0.01162268, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_clip": 1.05749357, + "balance_loss_mlp": 1.02933979, + "epoch": 0.15475725236735308, + "flos": 21390156682560.0, + "grad_norm": 1.710464615704499, + "language_loss": 0.70316708, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72525656, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.17346191, + "step": 2574, + "time_per_iteration": 2.648632526397705 + }, + { + "auxiliary_loss_clip": 0.01160483, + "auxiliary_loss_mlp": 0.01048989, + "balance_loss_clip": 1.05503345, + "balance_loss_mlp": 1.03043962, + "epoch": 0.15481737562002104, + "flos": 27530322387840.0, + "grad_norm": 1.6198795121505087, + "language_loss": 0.82432353, + "learning_rate": 3.838786474773448e-06, + "loss": 0.8464182, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.18554688, + "step": 2575, + "time_per_iteration": 2.714468479156494 + }, + { + "auxiliary_loss_clip": 0.01162492, + "auxiliary_loss_mlp": 0.01047926, + "balance_loss_clip": 1.05532384, + "balance_loss_mlp": 1.03085518, + "epoch": 0.154877498872689, + "flos": 30383993682720.0, + "grad_norm": 1.8608174800187773, + "language_loss": 0.85089958, + "learning_rate": 3.838633249192036e-06, + "loss": 0.87300372, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 1.07275391, + "router_z_loss_mlp": 0.1706543, + "step": 2576, + "time_per_iteration": 2.687947988510132 + }, + { + "auxiliary_loss_clip": 0.01159299, + "auxiliary_loss_mlp": 0.01040857, + "balance_loss_clip": 1.05427909, + "balance_loss_mlp": 1.02305913, + "epoch": 0.15493762212535697, + "flos": 34346180550240.0, + "grad_norm": 1.6476873140161459, + "language_loss": 0.81780398, + "learning_rate": 3.838479953889465e-06, + "loss": 0.83980554, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 1.04931641, + "router_z_loss_mlp": 0.17797852, + "step": 2577, + "time_per_iteration": 2.7632062435150146 + }, + { + "auxiliary_loss_clip": 0.01165971, + "auxiliary_loss_mlp": 0.01049622, + "balance_loss_clip": 1.06052685, + "balance_loss_mlp": 1.03202724, + "epoch": 0.15499774537802496, + "flos": 31006979508960.0, + "grad_norm": 2.2841827180302245, + "language_loss": 0.76513088, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78728682, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 1.05419922, + "router_z_loss_mlp": 0.17590332, + "step": 2578, + "time_per_iteration": 2.6961426734924316 + }, + { + "auxiliary_loss_clip": 0.01163697, + "auxiliary_loss_mlp": 0.01046914, + "balance_loss_clip": 1.05839467, + "balance_loss_mlp": 1.02892506, + "epoch": 0.15505786863069293, + "flos": 26956194050880.0, + "grad_norm": 1.9964581788197977, + "language_loss": 0.82275724, + "learning_rate": 3.83817315414411e-06, + "loss": 0.84486336, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.17993164, + "step": 2579, + "time_per_iteration": 2.7333638668060303 + }, + { + "auxiliary_loss_clip": 0.01164107, + "auxiliary_loss_mlp": 0.01051037, + "balance_loss_clip": 1.05990613, + "balance_loss_mlp": 1.03308451, + "epoch": 0.1551179918833609, + "flos": 23082524739360.0, + "grad_norm": 1.6974404533590517, + "language_loss": 0.80718821, + "learning_rate": 3.838019649712958e-06, + "loss": 0.82933962, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.17944336, + "step": 2580, + "time_per_iteration": 2.7187042236328125 + }, + { + "auxiliary_loss_clip": 0.01071347, + "auxiliary_loss_mlp": 0.01007058, + "balance_loss_clip": 1.03596902, + "balance_loss_mlp": 1.00437438, + "epoch": 0.15517811513602886, + "flos": 80825153978880.0, + "grad_norm": 0.8355319387645544, + "language_loss": 0.58907223, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60985625, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 0.35375977, + "router_z_loss_mlp": 0.02687073, + "step": 2581, + "time_per_iteration": 6.447328090667725 + }, + { + "auxiliary_loss_clip": 0.01164158, + "auxiliary_loss_mlp": 0.01047431, + "balance_loss_clip": 1.05793333, + "balance_loss_mlp": 1.02977622, + "epoch": 0.15523823838869683, + "flos": 29311126966080.0, + "grad_norm": 2.9504101011547164, + "language_loss": 0.85342032, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.8755362, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 1.06201172, + "router_z_loss_mlp": 0.17651367, + "step": 2582, + "time_per_iteration": 2.7896127700805664 + }, + { + "auxiliary_loss_clip": 0.01164043, + "auxiliary_loss_mlp": 0.01052946, + "balance_loss_clip": 1.05831981, + "balance_loss_mlp": 1.03454065, + "epoch": 0.1552983616413648, + "flos": 24995923323840.0, + "grad_norm": 2.275359508503232, + "language_loss": 0.79052043, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.81269026, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 1.05566406, + "router_z_loss_mlp": 0.18408203, + "step": 2583, + "time_per_iteration": 2.7143735885620117 + }, + { + "auxiliary_loss_clip": 0.01162818, + "auxiliary_loss_mlp": 0.01048687, + "balance_loss_clip": 1.05809999, + "balance_loss_mlp": 1.03023362, + "epoch": 0.15535848489403276, + "flos": 39198324016800.0, + "grad_norm": 1.876371237249381, + "language_loss": 0.76282895, + "learning_rate": 3.837404935067705e-06, + "loss": 0.784944, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.18444824, + "step": 2584, + "time_per_iteration": 4.31025242805481 + }, + { + "auxiliary_loss_clip": 0.01158365, + "auxiliary_loss_mlp": 0.01039303, + "balance_loss_clip": 1.05384254, + "balance_loss_mlp": 1.02155328, + "epoch": 0.15541860814670075, + "flos": 23303879405280.0, + "grad_norm": 1.8459890348477583, + "language_loss": 0.75640494, + "learning_rate": 3.837251082205368e-06, + "loss": 0.77838159, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.1776123, + "step": 2585, + "time_per_iteration": 4.145686864852905 + }, + { + "auxiliary_loss_clip": 0.01157624, + "auxiliary_loss_mlp": 0.01046564, + "balance_loss_clip": 1.0553726, + "balance_loss_mlp": 1.0288136, + "epoch": 0.1554787313993687, + "flos": 23393693514240.0, + "grad_norm": 2.662964119281251, + "language_loss": 0.61636865, + "learning_rate": 3.837097159674286e-06, + "loss": 0.63841051, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 1.02197266, + "router_z_loss_mlp": 0.1776123, + "step": 2586, + "time_per_iteration": 2.654102087020874 + }, + { + "auxiliary_loss_clip": 0.01160568, + "auxiliary_loss_mlp": 0.01043649, + "balance_loss_clip": 1.05433238, + "balance_loss_mlp": 1.02636385, + "epoch": 0.15553885465203668, + "flos": 19698355867680.0, + "grad_norm": 1.603275299802998, + "language_loss": 0.81204432, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83408648, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 1.06298828, + "router_z_loss_mlp": 0.17272949, + "step": 2587, + "time_per_iteration": 2.654426097869873 + }, + { + "auxiliary_loss_clip": 0.01166474, + "auxiliary_loss_mlp": 0.01059426, + "balance_loss_clip": 1.05752039, + "balance_loss_mlp": 1.03867173, + "epoch": 0.15559897790470464, + "flos": 30917570572800.0, + "grad_norm": 2.0437784372717243, + "language_loss": 0.88699567, + "learning_rate": 3.836789105629236e-06, + "loss": 0.90925467, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.20751953, + "step": 2588, + "time_per_iteration": 2.7097530364990234 + }, + { + "auxiliary_loss_clip": 0.01161521, + "auxiliary_loss_mlp": 0.01051631, + "balance_loss_clip": 1.05669713, + "balance_loss_mlp": 1.03179455, + "epoch": 0.1556591011573726, + "flos": 28246363705440.0, + "grad_norm": 2.850731440140474, + "language_loss": 0.64709371, + "learning_rate": 3.83663497412695e-06, + "loss": 0.66922522, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.19824219, + "step": 2589, + "time_per_iteration": 2.649932861328125 + }, + { + "auxiliary_loss_clip": 0.01160934, + "auxiliary_loss_mlp": 0.01042465, + "balance_loss_clip": 1.05562818, + "balance_loss_mlp": 1.02312887, + "epoch": 0.15571922441004057, + "flos": 30958284088800.0, + "grad_norm": 1.9183580980467305, + "language_loss": 0.82884526, + "learning_rate": 3.836480772979281e-06, + "loss": 0.85087919, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.19335938, + "step": 2590, + "time_per_iteration": 2.7189431190490723 + }, + { + "auxiliary_loss_clip": 0.01160612, + "auxiliary_loss_mlp": 0.01043736, + "balance_loss_clip": 1.05319369, + "balance_loss_mlp": 1.02559209, + "epoch": 0.15577934766270854, + "flos": 17694170759520.0, + "grad_norm": 2.0432937226621917, + "language_loss": 0.78926611, + "learning_rate": 3.836326502192077e-06, + "loss": 0.81130958, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.18139648, + "step": 2591, + "time_per_iteration": 2.873253345489502 + }, + { + "auxiliary_loss_clip": 0.01157301, + "auxiliary_loss_mlp": 0.01050166, + "balance_loss_clip": 1.054546, + "balance_loss_mlp": 1.03315496, + "epoch": 0.15583947091537653, + "flos": 45654561087840.0, + "grad_norm": 2.2559951631251587, + "language_loss": 0.65158904, + "learning_rate": 3.836172161771189e-06, + "loss": 0.67366374, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.17004395, + "step": 2592, + "time_per_iteration": 2.8008456230163574 + }, + { + "auxiliary_loss_clip": 0.01165085, + "auxiliary_loss_mlp": 0.01047917, + "balance_loss_clip": 1.05787492, + "balance_loss_mlp": 1.02916515, + "epoch": 0.1558995941680445, + "flos": 26642918377440.0, + "grad_norm": 2.2264312084184787, + "language_loss": 0.82352918, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84565926, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.1875, + "step": 2593, + "time_per_iteration": 2.653554916381836 + }, + { + "auxiliary_loss_clip": 0.01157961, + "auxiliary_loss_mlp": 0.01044753, + "balance_loss_clip": 1.0555315, + "balance_loss_mlp": 1.02620387, + "epoch": 0.15595971742071246, + "flos": 24150894037920.0, + "grad_norm": 1.9790646756589478, + "language_loss": 0.72902822, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.75105536, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.1854248, + "step": 2594, + "time_per_iteration": 2.6730237007141113 + }, + { + "auxiliary_loss_clip": 0.01155969, + "auxiliary_loss_mlp": 0.01039182, + "balance_loss_clip": 1.05480981, + "balance_loss_mlp": 1.02107394, + "epoch": 0.15601984067338043, + "flos": 32607061902720.0, + "grad_norm": 2.1426106496196407, + "language_loss": 0.81627238, + "learning_rate": 3.835708722764952e-06, + "loss": 0.83822381, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.18103027, + "step": 2595, + "time_per_iteration": 2.6801211833953857 + }, + { + "auxiliary_loss_clip": 0.01160261, + "auxiliary_loss_mlp": 0.01043124, + "balance_loss_clip": 1.05419087, + "balance_loss_mlp": 1.02512336, + "epoch": 0.1560799639260484, + "flos": 22414044358080.0, + "grad_norm": 1.7631500336030534, + "language_loss": 0.87031931, + "learning_rate": 3.835554103867876e-06, + "loss": 0.89235318, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.17980957, + "step": 2596, + "time_per_iteration": 2.712923526763916 + }, + { + "auxiliary_loss_clip": 0.01157486, + "auxiliary_loss_mlp": 0.01044376, + "balance_loss_clip": 1.0556128, + "balance_loss_mlp": 1.02692437, + "epoch": 0.15614008717871636, + "flos": 27575695391040.0, + "grad_norm": 1.696997690340415, + "language_loss": 0.68600303, + "learning_rate": 3.835399415366404e-06, + "loss": 0.7080217, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 1.01855469, + "router_z_loss_mlp": 0.17443848, + "step": 2597, + "time_per_iteration": 2.6690120697021484 + }, + { + "auxiliary_loss_clip": 0.01157151, + "auxiliary_loss_mlp": 0.01043351, + "balance_loss_clip": 1.05715775, + "balance_loss_mlp": 1.02663815, + "epoch": 0.15620021043138435, + "flos": 27755971885440.0, + "grad_norm": 1.7371420560015147, + "language_loss": 0.79502618, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.81703115, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.16711426, + "step": 2598, + "time_per_iteration": 2.6628692150115967 + }, + { + "auxiliary_loss_clip": 0.01154288, + "auxiliary_loss_mlp": 0.01035825, + "balance_loss_clip": 1.05331492, + "balance_loss_mlp": 1.01806283, + "epoch": 0.15626033368405232, + "flos": 16002329427360.0, + "grad_norm": 1.7665494227922736, + "language_loss": 0.82627839, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.84817952, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 1.01025391, + "router_z_loss_mlp": 0.17749023, + "step": 2599, + "time_per_iteration": 2.6287992000579834 + }, + { + "auxiliary_loss_clip": 0.01169996, + "auxiliary_loss_mlp": 0.0105518, + "balance_loss_clip": 1.05970848, + "balance_loss_mlp": 1.03521252, + "epoch": 0.15632045693672028, + "flos": 20099176682400.0, + "grad_norm": 2.0790951188974556, + "language_loss": 0.81673104, + "learning_rate": 3.834934932294287e-06, + "loss": 0.83898282, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 1.10253906, + "router_z_loss_mlp": 0.19958496, + "step": 2600, + "time_per_iteration": 2.655200958251953 + }, + { + "auxiliary_loss_clip": 0.01165545, + "auxiliary_loss_mlp": 0.01049839, + "balance_loss_clip": 1.05956554, + "balance_loss_mlp": 1.03170693, + "epoch": 0.15638058018938825, + "flos": 25441630934400.0, + "grad_norm": 1.845690974084469, + "language_loss": 0.88453233, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90668619, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.18151855, + "step": 2601, + "time_per_iteration": 2.674541711807251 + }, + { + "auxiliary_loss_clip": 0.01167291, + "auxiliary_loss_mlp": 0.0106498, + "balance_loss_clip": 1.06031156, + "balance_loss_mlp": 1.0451082, + "epoch": 0.1564407034420562, + "flos": 26688575001600.0, + "grad_norm": 1.800482604201836, + "language_loss": 0.78242081, + "learning_rate": 3.834624928998508e-06, + "loss": 0.80474353, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.19873047, + "step": 2602, + "time_per_iteration": 2.6876583099365234 + }, + { + "auxiliary_loss_clip": 0.01162318, + "auxiliary_loss_mlp": 0.01044278, + "balance_loss_clip": 1.05806899, + "balance_loss_mlp": 1.02590752, + "epoch": 0.15650082669472418, + "flos": 26642715791040.0, + "grad_norm": 1.809980756833659, + "language_loss": 0.73812294, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.76018894, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.18371582, + "step": 2603, + "time_per_iteration": 2.6609771251678467 + }, + { + "auxiliary_loss_clip": 0.01165806, + "auxiliary_loss_mlp": 0.01049305, + "balance_loss_clip": 1.0603348, + "balance_loss_mlp": 1.03087497, + "epoch": 0.15656094994739214, + "flos": 16839660430080.0, + "grad_norm": 3.7583762770106186, + "language_loss": 0.87875909, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.90091014, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.18432617, + "step": 2604, + "time_per_iteration": 2.8516359329223633 + }, + { + "auxiliary_loss_clip": 0.01167727, + "auxiliary_loss_mlp": 0.01046216, + "balance_loss_clip": 1.05894935, + "balance_loss_mlp": 1.02814364, + "epoch": 0.15662107320006013, + "flos": 33322009253760.0, + "grad_norm": 2.1005602133458443, + "language_loss": 0.85319614, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87533557, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.18066406, + "step": 2605, + "time_per_iteration": 2.7123653888702393 + }, + { + "auxiliary_loss_clip": 0.01166587, + "auxiliary_loss_mlp": 0.01053816, + "balance_loss_clip": 1.05694771, + "balance_loss_mlp": 1.0345397, + "epoch": 0.1566811964527281, + "flos": 32561891485920.0, + "grad_norm": 4.643139898778063, + "language_loss": 0.73381746, + "learning_rate": 3.834004087624087e-06, + "loss": 0.7560215, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 1.09667969, + "router_z_loss_mlp": 0.19274902, + "step": 2606, + "time_per_iteration": 2.688183307647705 + }, + { + "auxiliary_loss_clip": 0.01167049, + "auxiliary_loss_mlp": 0.01044109, + "balance_loss_clip": 1.06235135, + "balance_loss_mlp": 1.02770615, + "epoch": 0.15674131970539606, + "flos": 19649417343840.0, + "grad_norm": 2.712441993286893, + "language_loss": 0.76585674, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.78796834, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.1640625, + "step": 2607, + "time_per_iteration": 2.6712846755981445 + }, + { + "auxiliary_loss_clip": 0.01163431, + "auxiliary_loss_mlp": 0.01043911, + "balance_loss_clip": 1.05881572, + "balance_loss_mlp": 1.02624416, + "epoch": 0.15680144295806403, + "flos": 23390978856480.0, + "grad_norm": 2.696767598034187, + "language_loss": 0.81891048, + "learning_rate": 3.833693249639615e-06, + "loss": 0.84098393, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.17663574, + "step": 2608, + "time_per_iteration": 2.7364561557769775 + }, + { + "auxiliary_loss_clip": 0.01169862, + "auxiliary_loss_mlp": 0.01049689, + "balance_loss_clip": 1.06143928, + "balance_loss_mlp": 1.03019845, + "epoch": 0.156861566210732, + "flos": 25396622586720.0, + "grad_norm": 2.4707262980561473, + "language_loss": 0.7258848, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74808025, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 1.08544922, + "router_z_loss_mlp": 0.19494629, + "step": 2609, + "time_per_iteration": 2.665938138961792 + }, + { + "auxiliary_loss_clip": 0.01170702, + "auxiliary_loss_mlp": 0.01042425, + "balance_loss_clip": 1.06080437, + "balance_loss_mlp": 1.02375686, + "epoch": 0.15692168946339996, + "flos": 24462427468320.0, + "grad_norm": 1.807162565024287, + "language_loss": 0.71917927, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74131054, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 1.10058594, + "router_z_loss_mlp": 0.18652344, + "step": 2610, + "time_per_iteration": 2.6488380432128906 + }, + { + "auxiliary_loss_clip": 0.01170523, + "auxiliary_loss_mlp": 0.01051508, + "balance_loss_clip": 1.06101465, + "balance_loss_mlp": 1.03177929, + "epoch": 0.15698181271606793, + "flos": 26109908729280.0, + "grad_norm": 1.7917478992126938, + "language_loss": 0.72881651, + "learning_rate": 3.833226471173919e-06, + "loss": 0.75103676, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 1.09570312, + "router_z_loss_mlp": 0.19702148, + "step": 2611, + "time_per_iteration": 2.682544469833374 + }, + { + "auxiliary_loss_clip": 0.01168262, + "auxiliary_loss_mlp": 0.01047434, + "balance_loss_clip": 1.06127453, + "balance_loss_mlp": 1.02918267, + "epoch": 0.15704193596873592, + "flos": 25436485239840.0, + "grad_norm": 2.1500545041461856, + "language_loss": 0.70584792, + "learning_rate": 3.833070739311887e-06, + "loss": 0.72800493, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.18237305, + "step": 2612, + "time_per_iteration": 2.659012794494629 + }, + { + "auxiliary_loss_clip": 0.01167009, + "auxiliary_loss_mlp": 0.01052579, + "balance_loss_clip": 1.05964255, + "balance_loss_mlp": 1.03475738, + "epoch": 0.15710205922140388, + "flos": 26555170649760.0, + "grad_norm": 1.9261452061378557, + "language_loss": 0.76100457, + "learning_rate": 3.83291493793963e-06, + "loss": 0.7832005, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.17822266, + "step": 2613, + "time_per_iteration": 2.6847312450408936 + }, + { + "auxiliary_loss_clip": 0.01164223, + "auxiliary_loss_mlp": 0.01059697, + "balance_loss_clip": 1.05654442, + "balance_loss_mlp": 1.04094541, + "epoch": 0.15716218247407185, + "flos": 30515007515040.0, + "grad_norm": 1.7901283753062813, + "language_loss": 0.65954858, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68178773, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 1.07666016, + "router_z_loss_mlp": 0.18725586, + "step": 2614, + "time_per_iteration": 2.715587615966797 + }, + { + "auxiliary_loss_clip": 0.01168307, + "auxiliary_loss_mlp": 0.01048567, + "balance_loss_clip": 1.05765092, + "balance_loss_mlp": 1.02932692, + "epoch": 0.1572223057267398, + "flos": 24638166027360.0, + "grad_norm": 2.4395227087020297, + "language_loss": 0.75067031, + "learning_rate": 3.832603126688072e-06, + "loss": 0.77283901, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 1.10644531, + "router_z_loss_mlp": 0.19226074, + "step": 2615, + "time_per_iteration": 2.63519549369812 + }, + { + "auxiliary_loss_clip": 0.01159219, + "auxiliary_loss_mlp": 0.01049571, + "balance_loss_clip": 1.05783868, + "balance_loss_mlp": 1.03215504, + "epoch": 0.15728242897940778, + "flos": 25574873217120.0, + "grad_norm": 1.641888522087756, + "language_loss": 0.73198533, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75407326, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 1.01464844, + "router_z_loss_mlp": 0.1739502, + "step": 2616, + "time_per_iteration": 2.7045466899871826 + }, + { + "auxiliary_loss_clip": 0.01164083, + "auxiliary_loss_mlp": 0.01050326, + "balance_loss_clip": 1.05690992, + "balance_loss_mlp": 1.03146732, + "epoch": 0.15734255223207574, + "flos": 28111622283360.0, + "grad_norm": 2.3822890879709386, + "language_loss": 0.72528696, + "learning_rate": 3.832291037466539e-06, + "loss": 0.74743104, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 1.07128906, + "router_z_loss_mlp": 0.1887207, + "step": 2617, + "time_per_iteration": 2.8622725009918213 + }, + { + "auxiliary_loss_clip": 0.01159304, + "auxiliary_loss_mlp": 0.01041478, + "balance_loss_clip": 1.05541909, + "balance_loss_mlp": 1.0233345, + "epoch": 0.15740267548474374, + "flos": 25077350355840.0, + "grad_norm": 2.1363695268385787, + "language_loss": 0.74546295, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76747084, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.18151855, + "step": 2618, + "time_per_iteration": 2.6285340785980225 + }, + { + "auxiliary_loss_clip": 0.01165093, + "auxiliary_loss_mlp": 0.01046458, + "balance_loss_clip": 1.05535483, + "balance_loss_mlp": 1.02706265, + "epoch": 0.1574627987374117, + "flos": 27659067252480.0, + "grad_norm": 2.0074876895427347, + "language_loss": 0.78989363, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.81200916, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.1940918, + "step": 2619, + "time_per_iteration": 2.730781078338623 + }, + { + "auxiliary_loss_clip": 0.01161048, + "auxiliary_loss_mlp": 0.01052279, + "balance_loss_clip": 1.05795693, + "balance_loss_mlp": 1.03436208, + "epoch": 0.15752292199007967, + "flos": 20499875945280.0, + "grad_norm": 1.7857363503990642, + "language_loss": 0.77033675, + "learning_rate": 3.831822382544101e-06, + "loss": 0.7924701, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.17907715, + "step": 2620, + "time_per_iteration": 5.497500896453857 + }, + { + "auxiliary_loss_clip": 0.01163168, + "auxiliary_loss_mlp": 0.01050335, + "balance_loss_clip": 1.05595922, + "balance_loss_mlp": 1.03060555, + "epoch": 0.15758304524274763, + "flos": 36394401591360.0, + "grad_norm": 3.099348970215685, + "language_loss": 0.70653421, + "learning_rate": 3.831666025302944e-06, + "loss": 0.72866923, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.1973877, + "step": 2621, + "time_per_iteration": 2.7714500427246094 + }, + { + "auxiliary_loss_clip": 0.01162226, + "auxiliary_loss_mlp": 0.01051514, + "balance_loss_clip": 1.05540323, + "balance_loss_mlp": 1.03171325, + "epoch": 0.1576431684954156, + "flos": 65377788215040.0, + "grad_norm": 2.0102917063310657, + "language_loss": 0.72687566, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74901307, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 1.06884766, + "router_z_loss_mlp": 0.19812012, + "step": 2622, + "time_per_iteration": 2.921243190765381 + }, + { + "auxiliary_loss_clip": 0.01159331, + "auxiliary_loss_mlp": 0.01044365, + "balance_loss_clip": 1.05528402, + "balance_loss_mlp": 1.02669835, + "epoch": 0.15770329174808356, + "flos": 25396825173120.0, + "grad_norm": 1.8037650554190408, + "language_loss": 0.87601966, + "learning_rate": 3.831353102455684e-06, + "loss": 0.89805663, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.17651367, + "step": 2623, + "time_per_iteration": 2.66641902923584 + }, + { + "auxiliary_loss_clip": 0.01161465, + "auxiliary_loss_mlp": 0.01045707, + "balance_loss_clip": 1.05725348, + "balance_loss_mlp": 1.02826667, + "epoch": 0.15776341500075153, + "flos": 30473564688000.0, + "grad_norm": 1.6427097751732747, + "language_loss": 0.81762946, + "learning_rate": 3.831196536861448e-06, + "loss": 0.83970124, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.17443848, + "step": 2624, + "time_per_iteration": 5.523432731628418 + }, + { + "auxiliary_loss_clip": 0.01161181, + "auxiliary_loss_mlp": 0.01046516, + "balance_loss_clip": 1.05360222, + "balance_loss_mlp": 1.02776396, + "epoch": 0.15782353825341952, + "flos": 26732205761760.0, + "grad_norm": 2.14569404795905, + "language_loss": 0.79772246, + "learning_rate": 3.831039901828054e-06, + "loss": 0.81979942, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.1875, + "step": 2625, + "time_per_iteration": 2.7056984901428223 + }, + { + "auxiliary_loss_clip": 0.01160322, + "auxiliary_loss_mlp": 0.01044451, + "balance_loss_clip": 1.05621958, + "balance_loss_mlp": 1.02723694, + "epoch": 0.15788366150608749, + "flos": 31941863421120.0, + "grad_norm": 2.4305509115157906, + "language_loss": 0.79955208, + "learning_rate": 3.830883197361445e-06, + "loss": 0.82159978, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.17199707, + "step": 2626, + "time_per_iteration": 2.6725869178771973 + }, + { + "auxiliary_loss_clip": 0.01164166, + "auxiliary_loss_mlp": 0.01046423, + "balance_loss_clip": 1.05977046, + "balance_loss_mlp": 1.0276711, + "epoch": 0.15794378475875545, + "flos": 33812076935520.0, + "grad_norm": 3.0642184497326923, + "language_loss": 0.7366119, + "learning_rate": 3.830726423467561e-06, + "loss": 0.75871778, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.18762207, + "step": 2627, + "time_per_iteration": 2.8983988761901855 + }, + { + "auxiliary_loss_clip": 0.01160718, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.05657077, + "balance_loss_mlp": 1.03448963, + "epoch": 0.15800390801142342, + "flos": 14800596294240.0, + "grad_norm": 2.4023581757592285, + "language_loss": 0.85293388, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87506813, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.18225098, + "step": 2628, + "time_per_iteration": 2.8429441452026367 + }, + { + "auxiliary_loss_clip": 0.01157251, + "auxiliary_loss_mlp": 0.01039034, + "balance_loss_clip": 1.05425858, + "balance_loss_mlp": 1.02241635, + "epoch": 0.15806403126409138, + "flos": 25263907028640.0, + "grad_norm": 1.8868618166709323, + "language_loss": 0.7717694, + "learning_rate": 3.830412667421752e-06, + "loss": 0.79373229, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.16625977, + "step": 2629, + "time_per_iteration": 2.665172576904297 + }, + { + "auxiliary_loss_clip": 0.01161587, + "auxiliary_loss_mlp": 0.01051071, + "balance_loss_clip": 1.05534518, + "balance_loss_mlp": 1.03184247, + "epoch": 0.15812415451675935, + "flos": 21745888115040.0, + "grad_norm": 2.55087454521535, + "language_loss": 0.73607725, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.75820386, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 1.06152344, + "router_z_loss_mlp": 0.19238281, + "step": 2630, + "time_per_iteration": 2.6362388134002686 + }, + { + "auxiliary_loss_clip": 0.01162976, + "auxiliary_loss_mlp": 0.01045925, + "balance_loss_clip": 1.05378246, + "balance_loss_mlp": 1.02782965, + "epoch": 0.15818427776942734, + "flos": 24506544435840.0, + "grad_norm": 2.2352159746446, + "language_loss": 0.83565176, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.85774076, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.1809082, + "step": 2631, + "time_per_iteration": 2.654822587966919 + }, + { + "auxiliary_loss_clip": 0.01158336, + "auxiliary_loss_mlp": 0.01045607, + "balance_loss_clip": 1.05294085, + "balance_loss_mlp": 1.02825058, + "epoch": 0.1582444010220953, + "flos": 25886528199360.0, + "grad_norm": 1.6487375143408118, + "language_loss": 0.78541267, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.80745208, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 1.05273438, + "router_z_loss_mlp": 0.17346191, + "step": 2632, + "time_per_iteration": 2.6382317543029785 + }, + { + "auxiliary_loss_clip": 0.01160695, + "auxiliary_loss_mlp": 0.01057671, + "balance_loss_clip": 1.05493069, + "balance_loss_mlp": 1.03932428, + "epoch": 0.15830452427476327, + "flos": 21790572324480.0, + "grad_norm": 1.8471290825828173, + "language_loss": 0.83467424, + "learning_rate": 3.829784322464594e-06, + "loss": 0.85685784, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.18334961, + "step": 2633, + "time_per_iteration": 2.6391496658325195 + }, + { + "auxiliary_loss_clip": 0.01164585, + "auxiliary_loss_mlp": 0.01050472, + "balance_loss_clip": 1.05657434, + "balance_loss_mlp": 1.03216171, + "epoch": 0.15836464752743123, + "flos": 29937921416640.0, + "grad_norm": 2.0057413227242886, + "language_loss": 0.77353328, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79568386, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 1.07958984, + "router_z_loss_mlp": 0.18310547, + "step": 2634, + "time_per_iteration": 2.682814359664917 + }, + { + "auxiliary_loss_clip": 0.01161773, + "auxiliary_loss_mlp": 0.01045341, + "balance_loss_clip": 1.05441725, + "balance_loss_mlp": 1.02698243, + "epoch": 0.1584247707800992, + "flos": 24551390714400.0, + "grad_norm": 2.059003708276468, + "language_loss": 0.88933551, + "learning_rate": 3.829469733648552e-06, + "loss": 0.91140664, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 1.07421875, + "router_z_loss_mlp": 0.18359375, + "step": 2635, + "time_per_iteration": 2.633277177810669 + }, + { + "auxiliary_loss_clip": 0.01159664, + "auxiliary_loss_mlp": 0.01056105, + "balance_loss_clip": 1.05140471, + "balance_loss_mlp": 1.03731775, + "epoch": 0.15848489403276717, + "flos": 24862762075680.0, + "grad_norm": 1.9890014993692915, + "language_loss": 0.75979966, + "learning_rate": 3.829312335177034e-06, + "loss": 0.78195733, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.18786621, + "step": 2636, + "time_per_iteration": 2.667050361633301 + }, + { + "auxiliary_loss_clip": 0.01164097, + "auxiliary_loss_mlp": 0.01046923, + "balance_loss_clip": 1.05491531, + "balance_loss_mlp": 1.02776647, + "epoch": 0.15854501728543513, + "flos": 48012694868160.0, + "grad_norm": 2.1096891127634914, + "language_loss": 0.72077894, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74288917, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 1.09082031, + "router_z_loss_mlp": 0.19177246, + "step": 2637, + "time_per_iteration": 2.7927753925323486 + }, + { + "auxiliary_loss_clip": 0.01157551, + "auxiliary_loss_mlp": 0.01045599, + "balance_loss_clip": 1.05313015, + "balance_loss_mlp": 1.02821815, + "epoch": 0.15860514053810312, + "flos": 30339633611520.0, + "grad_norm": 3.0272227675840604, + "language_loss": 0.77707493, + "learning_rate": 3.82899733013685e-06, + "loss": 0.79910648, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.17382812, + "step": 2638, + "time_per_iteration": 2.7009437084198 + }, + { + "auxiliary_loss_clip": 0.01161652, + "auxiliary_loss_mlp": 0.01056781, + "balance_loss_clip": 1.05369425, + "balance_loss_mlp": 1.03690839, + "epoch": 0.1586652637907711, + "flos": 31942430663040.0, + "grad_norm": 2.6240461728603086, + "language_loss": 0.75439692, + "learning_rate": 3.828839723580128e-06, + "loss": 0.77658123, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.19885254, + "step": 2639, + "time_per_iteration": 2.7381091117858887 + }, + { + "auxiliary_loss_clip": 0.01160167, + "auxiliary_loss_mlp": 0.01055982, + "balance_loss_clip": 1.05276942, + "balance_loss_mlp": 1.03776658, + "epoch": 0.15872538704343905, + "flos": 24150245761440.0, + "grad_norm": 1.8508577246344755, + "language_loss": 0.8149966, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83715808, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 1.07470703, + "router_z_loss_mlp": 0.18200684, + "step": 2640, + "time_per_iteration": 2.8926339149475098 + }, + { + "auxiliary_loss_clip": 0.01156566, + "auxiliary_loss_mlp": 0.01050707, + "balance_loss_clip": 1.05387139, + "balance_loss_mlp": 1.03309965, + "epoch": 0.15878551029610702, + "flos": 34746312571200.0, + "grad_norm": 1.430041303672335, + "language_loss": 0.66871285, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69078559, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 1.02685547, + "router_z_loss_mlp": 0.17602539, + "step": 2641, + "time_per_iteration": 2.7450015544891357 + }, + { + "auxiliary_loss_clip": 0.01168733, + "auxiliary_loss_mlp": 0.01053676, + "balance_loss_clip": 1.05678463, + "balance_loss_mlp": 1.03479302, + "epoch": 0.15884563354877498, + "flos": 29537343705600.0, + "grad_norm": 2.0823969349961566, + "language_loss": 0.75509375, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77731788, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 1.12060547, + "router_z_loss_mlp": 0.18884277, + "step": 2642, + "time_per_iteration": 2.6819310188293457 + }, + { + "auxiliary_loss_clip": 0.01161194, + "auxiliary_loss_mlp": 0.01052097, + "balance_loss_clip": 1.05823874, + "balance_loss_mlp": 1.03445423, + "epoch": 0.15890575680144295, + "flos": 29223986997600.0, + "grad_norm": 2.2781756445948784, + "language_loss": 0.70401955, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72615242, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 1.02832031, + "router_z_loss_mlp": 0.1763916, + "step": 2643, + "time_per_iteration": 2.7156546115875244 + }, + { + "auxiliary_loss_clip": 0.01160367, + "auxiliary_loss_mlp": 0.01047191, + "balance_loss_clip": 1.0577116, + "balance_loss_mlp": 1.03062081, + "epoch": 0.15896588005411091, + "flos": 25886649751200.0, + "grad_norm": 2.1091723523038763, + "language_loss": 0.78413403, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80620968, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 1.02685547, + "router_z_loss_mlp": 0.16589355, + "step": 2644, + "time_per_iteration": 2.7270009517669678 + }, + { + "auxiliary_loss_clip": 0.01158456, + "auxiliary_loss_mlp": 0.01048822, + "balance_loss_clip": 1.05420828, + "balance_loss_mlp": 1.03190649, + "epoch": 0.1590260033067789, + "flos": 29715148645920.0, + "grad_norm": 2.186444269554566, + "language_loss": 0.8231284, + "learning_rate": 3.827892628103657e-06, + "loss": 0.84520119, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.16931152, + "step": 2645, + "time_per_iteration": 2.716425657272339 + }, + { + "auxiliary_loss_clip": 0.01163305, + "auxiliary_loss_mlp": 0.01054983, + "balance_loss_clip": 1.05452776, + "balance_loss_mlp": 1.03594494, + "epoch": 0.15908612655944687, + "flos": 39107253872160.0, + "grad_norm": 1.9631697706071742, + "language_loss": 0.70069605, + "learning_rate": 3.827734536224087e-06, + "loss": 0.72287893, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 1.08691406, + "router_z_loss_mlp": 0.19042969, + "step": 2646, + "time_per_iteration": 2.7505545616149902 + }, + { + "auxiliary_loss_clip": 0.01157194, + "auxiliary_loss_mlp": 0.01044841, + "balance_loss_clip": 1.05444789, + "balance_loss_mlp": 1.02782965, + "epoch": 0.15914624981211484, + "flos": 21701528043840.0, + "grad_norm": 2.255706494871575, + "language_loss": 0.62651253, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64853293, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.17016602, + "step": 2647, + "time_per_iteration": 2.677948236465454 + }, + { + "auxiliary_loss_clip": 0.01160904, + "auxiliary_loss_mlp": 0.01044158, + "balance_loss_clip": 1.05646682, + "balance_loss_mlp": 1.0267055, + "epoch": 0.1592063730647828, + "flos": 21612605315040.0, + "grad_norm": 1.9083812887441336, + "language_loss": 0.89733154, + "learning_rate": 3.827418144547318e-06, + "loss": 0.9193821, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.17443848, + "step": 2648, + "time_per_iteration": 2.6317150592803955 + }, + { + "auxiliary_loss_clip": 0.01158048, + "auxiliary_loss_mlp": 0.01046398, + "balance_loss_clip": 1.05658674, + "balance_loss_mlp": 1.02993476, + "epoch": 0.15926649631745077, + "flos": 22944298831200.0, + "grad_norm": 1.945478183538898, + "language_loss": 0.91780472, + "learning_rate": 3.827259844762114e-06, + "loss": 0.9398492, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 1.01464844, + "router_z_loss_mlp": 0.16455078, + "step": 2649, + "time_per_iteration": 2.703568458557129 + }, + { + "auxiliary_loss_clip": 0.01171739, + "auxiliary_loss_mlp": 0.0104454, + "balance_loss_clip": 1.05731118, + "balance_loss_mlp": 1.02550197, + "epoch": 0.15932661957011873, + "flos": 21434152098240.0, + "grad_norm": 3.1187016429748344, + "language_loss": 0.71721768, + "learning_rate": 3.827101475687033e-06, + "loss": 0.73938042, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 1.14453125, + "router_z_loss_mlp": 0.19042969, + "step": 2650, + "time_per_iteration": 2.694931983947754 + }, + { + "auxiliary_loss_clip": 0.01156285, + "auxiliary_loss_mlp": 0.01039705, + "balance_loss_clip": 1.05498791, + "balance_loss_mlp": 1.02417195, + "epoch": 0.15938674282278673, + "flos": 16270029511200.0, + "grad_norm": 2.4217577797625, + "language_loss": 0.71608859, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73804843, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.15527344, + "step": 2651, + "time_per_iteration": 2.6930391788482666 + }, + { + "auxiliary_loss_clip": 0.01161222, + "auxiliary_loss_mlp": 0.01049138, + "balance_loss_clip": 1.05482316, + "balance_loss_mlp": 1.03098297, + "epoch": 0.1594468660754547, + "flos": 27441359141760.0, + "grad_norm": 1.9553929856181587, + "language_loss": 0.80255651, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.82466018, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 1.06494141, + "router_z_loss_mlp": 0.18151855, + "step": 2652, + "time_per_iteration": 2.6848177909851074 + }, + { + "auxiliary_loss_clip": 0.01157686, + "auxiliary_loss_mlp": 0.01040268, + "balance_loss_clip": 1.05656219, + "balance_loss_mlp": 1.02341199, + "epoch": 0.15950698932812266, + "flos": 18312010891200.0, + "grad_norm": 4.529286614141368, + "language_loss": 0.69992262, + "learning_rate": 3.826625952782601e-06, + "loss": 0.72190213, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.16845703, + "step": 2653, + "time_per_iteration": 2.895374298095703 + }, + { + "auxiliary_loss_clip": 0.0116246, + "auxiliary_loss_mlp": 0.01037356, + "balance_loss_clip": 1.05737662, + "balance_loss_mlp": 1.01985621, + "epoch": 0.15956711258079062, + "flos": 36795870682560.0, + "grad_norm": 2.5531508716723637, + "language_loss": 0.76508915, + "learning_rate": 3.826467306608095e-06, + "loss": 0.78708732, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.17492676, + "step": 2654, + "time_per_iteration": 2.725398302078247 + }, + { + "auxiliary_loss_clip": 0.01157274, + "auxiliary_loss_mlp": 0.01040429, + "balance_loss_clip": 1.05339336, + "balance_loss_mlp": 1.02356124, + "epoch": 0.1596272358334586, + "flos": 25664160601440.0, + "grad_norm": 2.477068775235689, + "language_loss": 0.81768858, + "learning_rate": 3.826308591173765e-06, + "loss": 0.83966565, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.16870117, + "step": 2655, + "time_per_iteration": 2.701115369796753 + }, + { + "auxiliary_loss_clip": 0.0116267, + "auxiliary_loss_mlp": 0.01045925, + "balance_loss_clip": 1.05663693, + "balance_loss_mlp": 1.02910471, + "epoch": 0.15968735908612655, + "flos": 18629662430880.0, + "grad_norm": 2.419094781592723, + "language_loss": 0.7353726, + "learning_rate": 3.826149806485631e-06, + "loss": 0.75745857, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 1.05957031, + "router_z_loss_mlp": 0.16821289, + "step": 2656, + "time_per_iteration": 2.6683125495910645 + }, + { + "auxiliary_loss_clip": 0.0115758, + "auxiliary_loss_mlp": 0.0104497, + "balance_loss_clip": 1.05625248, + "balance_loss_mlp": 1.02862597, + "epoch": 0.15974748233879452, + "flos": 64265220914400.0, + "grad_norm": 3.4213591764004, + "language_loss": 0.77663827, + "learning_rate": 3.825990952549713e-06, + "loss": 0.79866379, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 1.01318359, + "router_z_loss_mlp": 0.16333008, + "step": 2657, + "time_per_iteration": 2.9303555488586426 + }, + { + "auxiliary_loss_clip": 0.01159698, + "auxiliary_loss_mlp": 0.01044289, + "balance_loss_clip": 1.05734622, + "balance_loss_mlp": 1.02777839, + "epoch": 0.1598076055914625, + "flos": 22858252829280.0, + "grad_norm": 1.634181658961108, + "language_loss": 0.74497259, + "learning_rate": 3.825832029372035e-06, + "loss": 0.76701248, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.16503906, + "step": 2658, + "time_per_iteration": 2.691495418548584 + }, + { + "auxiliary_loss_clip": 0.01160883, + "auxiliary_loss_mlp": 0.01046993, + "balance_loss_clip": 1.05536866, + "balance_loss_mlp": 1.02794361, + "epoch": 0.15986772884413047, + "flos": 41914093541760.0, + "grad_norm": 1.7766758697435419, + "language_loss": 0.75337446, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77545321, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 1.05615234, + "router_z_loss_mlp": 0.19055176, + "step": 2659, + "time_per_iteration": 2.7815308570861816 + }, + { + "auxiliary_loss_clip": 0.0116208, + "auxiliary_loss_mlp": 0.0104624, + "balance_loss_clip": 1.05574632, + "balance_loss_mlp": 1.02851367, + "epoch": 0.15992785209679844, + "flos": 26911550358720.0, + "grad_norm": 5.1283719630788225, + "language_loss": 0.91215324, + "learning_rate": 3.825513975315508e-06, + "loss": 0.93423647, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.17712402, + "step": 2660, + "time_per_iteration": 5.531864404678345 + }, + { + "auxiliary_loss_clip": 0.01162369, + "auxiliary_loss_mlp": 0.01047191, + "balance_loss_clip": 1.05637932, + "balance_loss_mlp": 1.02939296, + "epoch": 0.1599879753494664, + "flos": 40349497934880.0, + "grad_norm": 1.970038463117506, + "language_loss": 0.77917361, + "learning_rate": 3.82535484444872e-06, + "loss": 0.80126923, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 1.06054688, + "router_z_loss_mlp": 0.17797852, + "step": 2661, + "time_per_iteration": 2.772207260131836 + }, + { + "auxiliary_loss_clip": 0.01163375, + "auxiliary_loss_mlp": 0.01044278, + "balance_loss_clip": 1.05691421, + "balance_loss_mlp": 1.0270282, + "epoch": 0.16004809860213437, + "flos": 34212573612000.0, + "grad_norm": 3.1264852852131493, + "language_loss": 0.74504918, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76712573, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.17260742, + "step": 2662, + "time_per_iteration": 2.7128827571868896 + }, + { + "auxiliary_loss_clip": 0.01161512, + "auxiliary_loss_mlp": 0.01049015, + "balance_loss_clip": 1.05510056, + "balance_loss_mlp": 1.0315156, + "epoch": 0.16010822185480234, + "flos": 27796847470560.0, + "grad_norm": 1.9255362764253143, + "language_loss": 0.82442391, + "learning_rate": 3.825036375068263e-06, + "loss": 0.84652913, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.17492676, + "step": 2663, + "time_per_iteration": 4.135187387466431 + }, + { + "auxiliary_loss_clip": 0.01162908, + "auxiliary_loss_mlp": 0.01048078, + "balance_loss_clip": 1.0566752, + "balance_loss_mlp": 1.02999425, + "epoch": 0.16016834510747033, + "flos": 24506503918560.0, + "grad_norm": 3.569576104621337, + "language_loss": 0.79976803, + "learning_rate": 3.824877036566672e-06, + "loss": 0.82187796, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.18078613, + "step": 2664, + "time_per_iteration": 4.4555418491363525 + }, + { + "auxiliary_loss_clip": 0.01158254, + "auxiliary_loss_mlp": 0.01048793, + "balance_loss_clip": 1.05396652, + "balance_loss_mlp": 1.03203225, + "epoch": 0.1602284683601383, + "flos": 25836819847200.0, + "grad_norm": 1.6740968073998335, + "language_loss": 0.94058621, + "learning_rate": 3.824717628865561e-06, + "loss": 0.96265668, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.16760254, + "step": 2665, + "time_per_iteration": 2.7242367267608643 + }, + { + "auxiliary_loss_clip": 0.01160572, + "auxiliary_loss_mlp": 0.0104282, + "balance_loss_clip": 1.05404854, + "balance_loss_mlp": 1.0252248, + "epoch": 0.16028859161280626, + "flos": 17872542941760.0, + "grad_norm": 2.598225727789389, + "language_loss": 0.84969163, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87172556, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 1.06542969, + "router_z_loss_mlp": 0.17590332, + "step": 2666, + "time_per_iteration": 2.663588762283325 + }, + { + "auxiliary_loss_clip": 0.01159308, + "auxiliary_loss_mlp": 0.01043905, + "balance_loss_clip": 1.05444872, + "balance_loss_mlp": 1.02734673, + "epoch": 0.16034871486547422, + "flos": 25612993627200.0, + "grad_norm": 1.8321374634570444, + "language_loss": 0.81119448, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83322656, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.16552734, + "step": 2667, + "time_per_iteration": 2.668694019317627 + }, + { + "auxiliary_loss_clip": 0.01160111, + "auxiliary_loss_mlp": 0.0104523, + "balance_loss_clip": 1.05668211, + "balance_loss_mlp": 1.02728891, + "epoch": 0.1604088381181422, + "flos": 26109584591040.0, + "grad_norm": 1.732188101960551, + "language_loss": 0.73411787, + "learning_rate": 3.824238990625567e-06, + "loss": 0.75617129, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.17932129, + "step": 2668, + "time_per_iteration": 2.7142088413238525 + }, + { + "auxiliary_loss_clip": 0.01160007, + "auxiliary_loss_mlp": 0.01046413, + "balance_loss_clip": 1.05521894, + "balance_loss_mlp": 1.02936625, + "epoch": 0.16046896137081015, + "flos": 29135185820640.0, + "grad_norm": 1.802995448344171, + "language_loss": 0.77391541, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79597962, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.17053223, + "step": 2669, + "time_per_iteration": 2.6922354698181152 + }, + { + "auxiliary_loss_clip": 0.01062693, + "auxiliary_loss_mlp": 0.01003789, + "balance_loss_clip": 1.02714193, + "balance_loss_mlp": 1.00166011, + "epoch": 0.16052908462347812, + "flos": 72976777960320.0, + "grad_norm": 0.8019892670946371, + "language_loss": 0.55575705, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57642192, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 0.35571289, + "router_z_loss_mlp": 0.02131653, + "step": 2670, + "time_per_iteration": 3.15929913520813 + }, + { + "auxiliary_loss_clip": 0.01158687, + "auxiliary_loss_mlp": 0.01041402, + "balance_loss_clip": 1.05253959, + "balance_loss_mlp": 1.02478385, + "epoch": 0.1605892078761461, + "flos": 22325283698400.0, + "grad_norm": 1.9866862515466266, + "language_loss": 0.77553201, + "learning_rate": 3.82375972980766e-06, + "loss": 0.79753286, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.1661377, + "step": 2671, + "time_per_iteration": 2.7066612243652344 + }, + { + "auxiliary_loss_clip": 0.01163501, + "auxiliary_loss_mlp": 0.01038218, + "balance_loss_clip": 1.05774236, + "balance_loss_mlp": 1.02139735, + "epoch": 0.16064933112881408, + "flos": 39243453916320.0, + "grad_norm": 2.0583623935800572, + "language_loss": 0.64598364, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.66800082, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 1.05712891, + "router_z_loss_mlp": 0.16796875, + "step": 2672, + "time_per_iteration": 2.782851457595825 + }, + { + "auxiliary_loss_clip": 0.01161697, + "auxiliary_loss_mlp": 0.01035784, + "balance_loss_clip": 1.05413759, + "balance_loss_mlp": 1.01693726, + "epoch": 0.16070945438148204, + "flos": 24195051522720.0, + "grad_norm": 1.985779131229522, + "language_loss": 0.85503745, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.87701225, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.18847656, + "step": 2673, + "time_per_iteration": 2.7570688724517822 + }, + { + "auxiliary_loss_clip": 0.01160476, + "auxiliary_loss_mlp": 0.01043481, + "balance_loss_clip": 1.05577683, + "balance_loss_mlp": 1.02601719, + "epoch": 0.16076957763415, + "flos": 23078027321280.0, + "grad_norm": 2.896171969020434, + "language_loss": 0.7320857, + "learning_rate": 3.823279846575403e-06, + "loss": 0.75412524, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.17456055, + "step": 2674, + "time_per_iteration": 2.7073607444763184 + }, + { + "auxiliary_loss_clip": 0.01157795, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.05350351, + "balance_loss_mlp": 1.02337158, + "epoch": 0.16082970088681797, + "flos": 20455556391360.0, + "grad_norm": 1.71787128455517, + "language_loss": 0.8444044, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86640048, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 1.04296875, + "router_z_loss_mlp": 0.18444824, + "step": 2675, + "time_per_iteration": 2.661252737045288 + }, + { + "auxiliary_loss_clip": 0.01163572, + "auxiliary_loss_mlp": 0.01041958, + "balance_loss_clip": 1.05888999, + "balance_loss_mlp": 1.02363539, + "epoch": 0.16088982413948594, + "flos": 42891716833920.0, + "grad_norm": 2.3377330411037205, + "language_loss": 0.82233685, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84439218, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 1.04394531, + "router_z_loss_mlp": 0.18334961, + "step": 2676, + "time_per_iteration": 3.0407137870788574 + }, + { + "auxiliary_loss_clip": 0.01158539, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_clip": 1.05655122, + "balance_loss_mlp": 1.02593553, + "epoch": 0.1609499473921539, + "flos": 22726185547680.0, + "grad_norm": 2.6927558368262536, + "language_loss": 0.73060894, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75261104, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 1.01855469, + "router_z_loss_mlp": 0.15734863, + "step": 2677, + "time_per_iteration": 2.6469743251800537 + }, + { + "auxiliary_loss_clip": 0.01157924, + "auxiliary_loss_mlp": 0.01038872, + "balance_loss_clip": 1.05554855, + "balance_loss_mlp": 1.02156281, + "epoch": 0.1610100706448219, + "flos": 40445430153120.0, + "grad_norm": 1.645586419807107, + "language_loss": 0.76445603, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78642398, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.17297363, + "step": 2678, + "time_per_iteration": 2.768512010574341 + }, + { + "auxiliary_loss_clip": 0.01160361, + "auxiliary_loss_mlp": 0.01039447, + "balance_loss_clip": 1.05477667, + "balance_loss_mlp": 1.021101, + "epoch": 0.16107019389748986, + "flos": 42313779872640.0, + "grad_norm": 2.081358989751095, + "language_loss": 0.70517349, + "learning_rate": 3.822478658490228e-06, + "loss": 0.72717166, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 0.18359375, + "step": 2679, + "time_per_iteration": 2.8205277919769287 + }, + { + "auxiliary_loss_clip": 0.01060315, + "auxiliary_loss_mlp": 0.01004413, + "balance_loss_clip": 1.02476549, + "balance_loss_mlp": 1.0024755, + "epoch": 0.16113031715015783, + "flos": 80184948308640.0, + "grad_norm": 0.781387914717221, + "language_loss": 0.51860058, + "learning_rate": 3.822318213523154e-06, + "loss": 0.53924781, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.01934814, + "step": 2680, + "time_per_iteration": 3.312448740005493 + }, + { + "auxiliary_loss_clip": 0.01162091, + "auxiliary_loss_mlp": 0.01042377, + "balance_loss_clip": 1.05533564, + "balance_loss_mlp": 1.02404213, + "epoch": 0.1611904404028258, + "flos": 25393219135200.0, + "grad_norm": 1.6497263585228998, + "language_loss": 0.80540335, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82744807, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 1.06933594, + "router_z_loss_mlp": 0.18334961, + "step": 2681, + "time_per_iteration": 2.6965935230255127 + }, + { + "auxiliary_loss_clip": 0.01158842, + "auxiliary_loss_mlp": 0.01046062, + "balance_loss_clip": 1.05728531, + "balance_loss_mlp": 1.02906299, + "epoch": 0.16125056365549376, + "flos": 32962469196960.0, + "grad_norm": 1.9036633092901099, + "language_loss": 0.69132948, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71337855, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.1697998, + "step": 2682, + "time_per_iteration": 2.69711971282959 + }, + { + "auxiliary_loss_clip": 0.01162214, + "auxiliary_loss_mlp": 0.01043501, + "balance_loss_clip": 1.0586493, + "balance_loss_mlp": 1.02583408, + "epoch": 0.16131068690816172, + "flos": 23521587516000.0, + "grad_norm": 1.9593277800781836, + "language_loss": 0.87367046, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89572763, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.17675781, + "step": 2683, + "time_per_iteration": 2.65582537651062 + }, + { + "auxiliary_loss_clip": 0.01163404, + "auxiliary_loss_mlp": 0.01050962, + "balance_loss_clip": 1.05881619, + "balance_loss_mlp": 1.03234124, + "epoch": 0.16137081016082971, + "flos": 43120243058400.0, + "grad_norm": 1.7904074116598823, + "language_loss": 0.74261022, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76475388, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 1.04443359, + "router_z_loss_mlp": 0.18615723, + "step": 2684, + "time_per_iteration": 2.7893974781036377 + }, + { + "auxiliary_loss_clip": 0.01163086, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_clip": 1.0573833, + "balance_loss_mlp": 1.02515864, + "epoch": 0.16143093341349768, + "flos": 41775421943520.0, + "grad_norm": 1.7358181406361437, + "language_loss": 0.70017809, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72224033, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.17980957, + "step": 2685, + "time_per_iteration": 2.7666103839874268 + }, + { + "auxiliary_loss_clip": 0.01157279, + "auxiliary_loss_mlp": 0.0104577, + "balance_loss_clip": 1.05740678, + "balance_loss_mlp": 1.02847242, + "epoch": 0.16149105666616564, + "flos": 34165498883040.0, + "grad_norm": 1.7679955376675855, + "language_loss": 0.71427649, + "learning_rate": 3.821354092781567e-06, + "loss": 0.73630702, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.17321777, + "step": 2686, + "time_per_iteration": 2.710989475250244 + }, + { + "auxiliary_loss_clip": 0.01162748, + "auxiliary_loss_mlp": 0.0104627, + "balance_loss_clip": 1.05908835, + "balance_loss_mlp": 1.02837718, + "epoch": 0.1615511799188336, + "flos": 23699392456320.0, + "grad_norm": 2.3176338144909012, + "language_loss": 0.8170777, + "learning_rate": 3.821193164224981e-06, + "loss": 0.83916789, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.17883301, + "step": 2687, + "time_per_iteration": 2.735978841781616 + }, + { + "auxiliary_loss_clip": 0.01160811, + "auxiliary_loss_mlp": 0.01043619, + "balance_loss_clip": 1.05282724, + "balance_loss_mlp": 1.0251056, + "epoch": 0.16161130317150157, + "flos": 27887958132480.0, + "grad_norm": 2.081688857660537, + "language_loss": 0.71977878, + "learning_rate": 3.821032166608568e-06, + "loss": 0.74182308, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 1.078125, + "router_z_loss_mlp": 0.18505859, + "step": 2688, + "time_per_iteration": 2.711402177810669 + }, + { + "auxiliary_loss_clip": 0.01161358, + "auxiliary_loss_mlp": 0.01047602, + "balance_loss_clip": 1.0574441, + "balance_loss_mlp": 1.03113937, + "epoch": 0.16167142642416954, + "flos": 31859990699040.0, + "grad_norm": 2.0630619452533816, + "language_loss": 0.7571249, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.7792145, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 1.03955078, + "router_z_loss_mlp": 0.16467285, + "step": 2689, + "time_per_iteration": 2.9745452404022217 + }, + { + "auxiliary_loss_clip": 0.01163193, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_clip": 1.05958104, + "balance_loss_mlp": 1.02991033, + "epoch": 0.1617315496768375, + "flos": 27796320745920.0, + "grad_norm": 2.0578535005105913, + "language_loss": 0.87549567, + "learning_rate": 3.820709964220683e-06, + "loss": 0.8975997, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 1.03662109, + "router_z_loss_mlp": 0.17297363, + "step": 2690, + "time_per_iteration": 2.704227924346924 + }, + { + "auxiliary_loss_clip": 0.01159339, + "auxiliary_loss_mlp": 0.01048103, + "balance_loss_clip": 1.05782509, + "balance_loss_mlp": 1.0324868, + "epoch": 0.1617916729295055, + "flos": 26865691148160.0, + "grad_norm": 1.7440434535095675, + "language_loss": 0.88145494, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.90352935, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.15625, + "step": 2691, + "time_per_iteration": 2.655045747756958 + }, + { + "auxiliary_loss_clip": 0.0116318, + "auxiliary_loss_mlp": 0.01050653, + "balance_loss_clip": 1.05408943, + "balance_loss_mlp": 1.03196144, + "epoch": 0.16185179618217346, + "flos": 28599826170240.0, + "grad_norm": 2.4816341947117966, + "language_loss": 0.82008672, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84222507, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 1.09179688, + "router_z_loss_mlp": 0.18713379, + "step": 2692, + "time_per_iteration": 2.7190182209014893 + }, + { + "auxiliary_loss_clip": 0.01164528, + "auxiliary_loss_mlp": 0.01049387, + "balance_loss_clip": 1.05488563, + "balance_loss_mlp": 1.03124344, + "epoch": 0.16191191943484143, + "flos": 31318229318400.0, + "grad_norm": 2.1506065555318603, + "language_loss": 0.80965781, + "learning_rate": 3.820226142842862e-06, + "loss": 0.831797, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 1.09765625, + "router_z_loss_mlp": 0.18139648, + "step": 2693, + "time_per_iteration": 2.735548496246338 + }, + { + "auxiliary_loss_clip": 0.01157858, + "auxiliary_loss_mlp": 0.01052655, + "balance_loss_clip": 1.05605435, + "balance_loss_mlp": 1.03654933, + "epoch": 0.1619720426875094, + "flos": 28647306072000.0, + "grad_norm": 1.5567888274651103, + "language_loss": 0.83886534, + "learning_rate": 3.820064730995783e-06, + "loss": 0.8609705, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 1.01757812, + "router_z_loss_mlp": 0.16113281, + "step": 2694, + "time_per_iteration": 2.745983600616455 + }, + { + "auxiliary_loss_clip": 0.01161027, + "auxiliary_loss_mlp": 0.01054538, + "balance_loss_clip": 1.05379951, + "balance_loss_mlp": 1.03626359, + "epoch": 0.16203216594017736, + "flos": 29448218390400.0, + "grad_norm": 1.9389012965228445, + "language_loss": 0.69178402, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71393967, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.18286133, + "step": 2695, + "time_per_iteration": 2.676422357559204 + }, + { + "auxiliary_loss_clip": 0.01165026, + "auxiliary_loss_mlp": 0.0105137, + "balance_loss_clip": 1.05840969, + "balance_loss_mlp": 1.03319049, + "epoch": 0.16209228919284532, + "flos": 27261001612800.0, + "grad_norm": 2.7226255318669974, + "language_loss": 0.82286572, + "learning_rate": 3.819741700256637e-06, + "loss": 0.84502971, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 1.06591797, + "router_z_loss_mlp": 0.1817627, + "step": 2696, + "time_per_iteration": 2.684717893600464 + }, + { + "auxiliary_loss_clip": 0.01169409, + "auxiliary_loss_mlp": 0.01052636, + "balance_loss_clip": 1.0580647, + "balance_loss_mlp": 1.03355074, + "epoch": 0.1621524124455133, + "flos": 19297291949280.0, + "grad_norm": 2.083288825705699, + "language_loss": 0.88930058, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.91152102, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 1.11230469, + "router_z_loss_mlp": 0.1907959, + "step": 2697, + "time_per_iteration": 2.631753444671631 + }, + { + "auxiliary_loss_clip": 0.01153055, + "auxiliary_loss_mlp": 0.01041664, + "balance_loss_clip": 1.05355346, + "balance_loss_mlp": 1.02573752, + "epoch": 0.16221253569818128, + "flos": 36834922990080.0, + "grad_norm": 1.521821070315038, + "language_loss": 0.80615306, + "learning_rate": 3.819418393498343e-06, + "loss": 0.8281002, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 0.99511719, + "router_z_loss_mlp": 0.15930176, + "step": 2698, + "time_per_iteration": 2.7268741130828857 + }, + { + "auxiliary_loss_clip": 0.01155919, + "auxiliary_loss_mlp": 0.01043517, + "balance_loss_clip": 1.05637765, + "balance_loss_mlp": 1.02630329, + "epoch": 0.16227265895084925, + "flos": 30024777764160.0, + "grad_norm": 1.668859947655622, + "language_loss": 0.77663875, + "learning_rate": 3.819256636627339e-06, + "loss": 0.7986331, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.17224121, + "step": 2699, + "time_per_iteration": 5.670321464538574 + }, + { + "auxiliary_loss_clip": 0.01159555, + "auxiliary_loss_mlp": 0.01041345, + "balance_loss_clip": 1.05622232, + "balance_loss_mlp": 1.02553773, + "epoch": 0.1623327822035172, + "flos": 23883315505920.0, + "grad_norm": 1.8889708966499823, + "language_loss": 0.86358351, + "learning_rate": 3.81909481076994e-06, + "loss": 0.88559252, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 1.03564453, + "router_z_loss_mlp": 0.15808105, + "step": 2700, + "time_per_iteration": 2.8827428817749023 + }, + { + "auxiliary_loss_clip": 0.01155513, + "auxiliary_loss_mlp": 0.0104487, + "balance_loss_clip": 1.05385673, + "balance_loss_mlp": 1.02646375, + "epoch": 0.16239290545618518, + "flos": 32297351749920.0, + "grad_norm": 1.7570799152960264, + "language_loss": 0.80676115, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82876503, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.18408203, + "step": 2701, + "time_per_iteration": 2.771275758743286 + }, + { + "auxiliary_loss_clip": 0.01162442, + "auxiliary_loss_mlp": 0.01042881, + "balance_loss_clip": 1.05941105, + "balance_loss_mlp": 1.02548814, + "epoch": 0.16245302870885314, + "flos": 19342178745120.0, + "grad_norm": 1.5276865517473812, + "language_loss": 0.72812521, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75017846, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.17382812, + "step": 2702, + "time_per_iteration": 2.6958417892456055 + }, + { + "auxiliary_loss_clip": 0.0116086, + "auxiliary_loss_mlp": 0.01047151, + "balance_loss_clip": 1.05561078, + "balance_loss_mlp": 1.02788675, + "epoch": 0.1625131519615211, + "flos": 18005542120800.0, + "grad_norm": 2.0926119918317423, + "language_loss": 0.72931123, + "learning_rate": 3.81860891934076e-06, + "loss": 0.75139129, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.19274902, + "step": 2703, + "time_per_iteration": 4.105945110321045 + }, + { + "auxiliary_loss_clip": 0.01158574, + "auxiliary_loss_mlp": 0.01045531, + "balance_loss_clip": 1.05324674, + "balance_loss_mlp": 1.02681494, + "epoch": 0.1625732752141891, + "flos": 34435265348160.0, + "grad_norm": 2.3379767448323276, + "language_loss": 0.70422053, + "learning_rate": 3.818446817599176e-06, + "loss": 0.72626156, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 1.05371094, + "router_z_loss_mlp": 0.18701172, + "step": 2704, + "time_per_iteration": 4.1850666999816895 + }, + { + "auxiliary_loss_clip": 0.01061932, + "auxiliary_loss_mlp": 0.01009009, + "balance_loss_clip": 1.0266006, + "balance_loss_mlp": 1.00699902, + "epoch": 0.16263339846685707, + "flos": 82154294906400.0, + "grad_norm": 0.7777867930319724, + "language_loss": 0.533337, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55404639, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 0.35327148, + "router_z_loss_mlp": 0.02009583, + "step": 2705, + "time_per_iteration": 3.2821879386901855 + }, + { + "auxiliary_loss_clip": 0.01161616, + "auxiliary_loss_mlp": 0.01038993, + "balance_loss_clip": 1.05582786, + "balance_loss_mlp": 1.02167225, + "epoch": 0.16269352171952503, + "flos": 17471803161600.0, + "grad_norm": 2.5356507993035238, + "language_loss": 0.74789792, + "learning_rate": 3.818122407255102e-06, + "loss": 0.76990402, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.17321777, + "step": 2706, + "time_per_iteration": 2.6749460697174072 + }, + { + "auxiliary_loss_clip": 0.01160412, + "auxiliary_loss_mlp": 0.01046448, + "balance_loss_clip": 1.05649972, + "balance_loss_mlp": 1.02980685, + "epoch": 0.162753644972193, + "flos": 34607438386560.0, + "grad_norm": 1.791270757000268, + "language_loss": 0.72378379, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74585235, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.16625977, + "step": 2707, + "time_per_iteration": 2.7164723873138428 + }, + { + "auxiliary_loss_clip": 0.01158584, + "auxiliary_loss_mlp": 0.01045314, + "balance_loss_clip": 1.05535674, + "balance_loss_mlp": 1.02813613, + "epoch": 0.16281376822486096, + "flos": 24328172253600.0, + "grad_norm": 4.5897355969136155, + "language_loss": 0.83542067, + "learning_rate": 3.817797721137495e-06, + "loss": 0.85745966, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.17175293, + "step": 2708, + "time_per_iteration": 2.6961066722869873 + }, + { + "auxiliary_loss_clip": 0.01163751, + "auxiliary_loss_mlp": 0.01041656, + "balance_loss_clip": 1.05490351, + "balance_loss_mlp": 1.02208221, + "epoch": 0.16287389147752893, + "flos": 25931414995200.0, + "grad_norm": 2.2791553299203, + "language_loss": 0.86347908, + "learning_rate": 3.817635274679006e-06, + "loss": 0.88553315, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.19580078, + "step": 2709, + "time_per_iteration": 2.6491103172302246 + }, + { + "auxiliary_loss_clip": 0.01159363, + "auxiliary_loss_mlp": 0.01049048, + "balance_loss_clip": 1.05332327, + "balance_loss_mlp": 1.03102374, + "epoch": 0.1629340147301969, + "flos": 23482292104800.0, + "grad_norm": 1.9185590758680837, + "language_loss": 0.91177732, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93386143, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.18005371, + "step": 2710, + "time_per_iteration": 2.6860153675079346 + }, + { + "auxiliary_loss_clip": 0.01158554, + "auxiliary_loss_mlp": 0.01047575, + "balance_loss_clip": 1.0560801, + "balance_loss_mlp": 1.02975333, + "epoch": 0.16299413798286488, + "flos": 25929267579360.0, + "grad_norm": 2.129788375874118, + "language_loss": 0.81246471, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83452606, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.17834473, + "step": 2711, + "time_per_iteration": 2.6441092491149902 + }, + { + "auxiliary_loss_clip": 0.01161289, + "auxiliary_loss_mlp": 0.01036462, + "balance_loss_clip": 1.05195022, + "balance_loss_mlp": 1.01914096, + "epoch": 0.16305426123553285, + "flos": 22903747384320.0, + "grad_norm": 2.579054215427534, + "language_loss": 0.81208587, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83406335, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 0.17321777, + "step": 2712, + "time_per_iteration": 2.9524528980255127 + }, + { + "auxiliary_loss_clip": 0.01161826, + "auxiliary_loss_mlp": 0.01046354, + "balance_loss_clip": 1.05469823, + "balance_loss_mlp": 1.02791214, + "epoch": 0.16311438448820081, + "flos": 26951777667360.0, + "grad_norm": 1.784183329734205, + "language_loss": 0.76810467, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79018641, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 1.0703125, + "router_z_loss_mlp": 0.1842041, + "step": 2713, + "time_per_iteration": 2.748267412185669 + }, + { + "auxiliary_loss_clip": 0.01159004, + "auxiliary_loss_mlp": 0.01054002, + "balance_loss_clip": 1.05866718, + "balance_loss_mlp": 1.03564334, + "epoch": 0.16317450774086878, + "flos": 20093990470560.0, + "grad_norm": 2.198108027107681, + "language_loss": 0.79720795, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.81933808, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 1.00292969, + "router_z_loss_mlp": 0.18359375, + "step": 2714, + "time_per_iteration": 2.676079034805298 + }, + { + "auxiliary_loss_clip": 0.01159271, + "auxiliary_loss_mlp": 0.01052559, + "balance_loss_clip": 1.05517435, + "balance_loss_mlp": 1.03522587, + "epoch": 0.16323463099353674, + "flos": 29715756405120.0, + "grad_norm": 1.7737256075285939, + "language_loss": 0.7822364, + "learning_rate": 3.816659148720702e-06, + "loss": 0.80435467, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 1.04052734, + "router_z_loss_mlp": 0.17333984, + "step": 2715, + "time_per_iteration": 2.7342171669006348 + }, + { + "auxiliary_loss_clip": 0.01157008, + "auxiliary_loss_mlp": 0.01040232, + "balance_loss_clip": 1.05330229, + "balance_loss_mlp": 1.02345884, + "epoch": 0.1632947542462047, + "flos": 30384236786400.0, + "grad_norm": 2.422240556943723, + "language_loss": 0.81685257, + "learning_rate": 3.816496219917336e-06, + "loss": 0.83882499, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.16772461, + "step": 2716, + "time_per_iteration": 2.7263834476470947 + }, + { + "auxiliary_loss_clip": 0.01163596, + "auxiliary_loss_mlp": 0.01052093, + "balance_loss_clip": 1.05855024, + "balance_loss_mlp": 1.03492653, + "epoch": 0.1633548774988727, + "flos": 30383669544480.0, + "grad_norm": 2.2693504948521417, + "language_loss": 0.86338162, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88553846, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.171875, + "step": 2717, + "time_per_iteration": 2.7864365577697754 + }, + { + "auxiliary_loss_clip": 0.01155417, + "auxiliary_loss_mlp": 0.01045047, + "balance_loss_clip": 1.05391467, + "balance_loss_mlp": 1.02832234, + "epoch": 0.16341500075154067, + "flos": 37146253834080.0, + "grad_norm": 2.469759149028146, + "language_loss": 0.76571643, + "learning_rate": 3.816170155671629e-06, + "loss": 0.7877211, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.1673584, + "step": 2718, + "time_per_iteration": 2.7297937870025635 + }, + { + "auxiliary_loss_clip": 0.01159649, + "auxiliary_loss_mlp": 0.01041182, + "balance_loss_clip": 1.05427694, + "balance_loss_mlp": 1.02508855, + "epoch": 0.16347512400420863, + "flos": 27801831096000.0, + "grad_norm": 2.5152475589441994, + "language_loss": 0.73981923, + "learning_rate": 3.816007020241652e-06, + "loss": 0.76182753, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.1607666, + "step": 2719, + "time_per_iteration": 2.759596586227417 + }, + { + "auxiliary_loss_clip": 0.01157736, + "auxiliary_loss_mlp": 0.01040565, + "balance_loss_clip": 1.05280435, + "balance_loss_mlp": 1.02368498, + "epoch": 0.1635352472568766, + "flos": 27617745977280.0, + "grad_norm": 1.6157081043478043, + "language_loss": 0.72518206, + "learning_rate": 3.815843815948507e-06, + "loss": 0.74716508, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 1.05029297, + "router_z_loss_mlp": 0.16870117, + "step": 2720, + "time_per_iteration": 2.7030422687530518 + }, + { + "auxiliary_loss_clip": 0.0115714, + "auxiliary_loss_mlp": 0.01046479, + "balance_loss_clip": 1.05496788, + "balance_loss_mlp": 1.02757287, + "epoch": 0.16359537050954456, + "flos": 18940831205760.0, + "grad_norm": 2.0197830611838743, + "language_loss": 0.7481997, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.7702359, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 1.02148438, + "router_z_loss_mlp": 0.18896484, + "step": 2721, + "time_per_iteration": 2.6713554859161377 + }, + { + "auxiliary_loss_clip": 0.01160741, + "auxiliary_loss_mlp": 0.01048568, + "balance_loss_clip": 1.05331969, + "balance_loss_mlp": 1.03049612, + "epoch": 0.16365549376221253, + "flos": 26954613876960.0, + "grad_norm": 2.5698900332187984, + "language_loss": 0.79271847, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.81481159, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 1.07519531, + "router_z_loss_mlp": 0.18066406, + "step": 2722, + "time_per_iteration": 2.6501991748809814 + }, + { + "auxiliary_loss_clip": 0.0116265, + "auxiliary_loss_mlp": 0.01051961, + "balance_loss_clip": 1.05422187, + "balance_loss_mlp": 1.03188574, + "epoch": 0.1637156170148805, + "flos": 29358849971520.0, + "grad_norm": 2.2192014672936957, + "language_loss": 0.84485316, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86699927, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 1.08398438, + "router_z_loss_mlp": 0.20068359, + "step": 2723, + "time_per_iteration": 2.6963062286376953 + }, + { + "auxiliary_loss_clip": 0.0115263, + "auxiliary_loss_mlp": 0.01044106, + "balance_loss_clip": 1.05331039, + "balance_loss_mlp": 1.02621293, + "epoch": 0.1637757402675485, + "flos": 32562215624160.0, + "grad_norm": 2.0014990097765955, + "language_loss": 0.70556551, + "learning_rate": 3.815190310268058e-06, + "loss": 0.72753286, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 0.99462891, + "router_z_loss_mlp": 0.17895508, + "step": 2724, + "time_per_iteration": 2.7610116004943848 + }, + { + "auxiliary_loss_clip": 0.01153715, + "auxiliary_loss_mlp": 0.0104386, + "balance_loss_clip": 1.0531857, + "balance_loss_mlp": 1.02763569, + "epoch": 0.16383586352021645, + "flos": 19653388037280.0, + "grad_norm": 5.231819061691567, + "language_loss": 0.70485914, + "learning_rate": 3.815026761751955e-06, + "loss": 0.72683483, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 1.00439453, + "router_z_loss_mlp": 0.16235352, + "step": 2725, + "time_per_iteration": 2.9274048805236816 + }, + { + "auxiliary_loss_clip": 0.0115308, + "auxiliary_loss_mlp": 0.01044051, + "balance_loss_clip": 1.05358744, + "balance_loss_mlp": 1.0275408, + "epoch": 0.16389598677288442, + "flos": 23386805576640.0, + "grad_norm": 1.9356283451821965, + "language_loss": 0.88683754, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90880883, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.16516113, + "step": 2726, + "time_per_iteration": 2.6664040088653564 + }, + { + "auxiliary_loss_clip": 0.01159472, + "auxiliary_loss_mlp": 0.01051003, + "balance_loss_clip": 1.0553205, + "balance_loss_mlp": 1.03394997, + "epoch": 0.16395611002555238, + "flos": 26242948425600.0, + "grad_norm": 1.819927302737958, + "language_loss": 0.74160606, + "learning_rate": 3.814699458247963e-06, + "loss": 0.76371086, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.17047119, + "step": 2727, + "time_per_iteration": 2.723167896270752 + }, + { + "auxiliary_loss_clip": 0.0115285, + "auxiliary_loss_mlp": 0.01049442, + "balance_loss_clip": 1.05298781, + "balance_loss_mlp": 1.03346765, + "epoch": 0.16401623327822035, + "flos": 26198871975360.0, + "grad_norm": 1.7852518839222253, + "language_loss": 0.8279618, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84998477, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.15966797, + "step": 2728, + "time_per_iteration": 2.7040960788726807 + }, + { + "auxiliary_loss_clip": 0.01159806, + "auxiliary_loss_mlp": 0.01046574, + "balance_loss_clip": 1.05448723, + "balance_loss_mlp": 1.02860951, + "epoch": 0.1640763565308883, + "flos": 16626206633760.0, + "grad_norm": 2.363385295781439, + "language_loss": 0.85478127, + "learning_rate": 3.814371879489633e-06, + "loss": 0.87684506, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 1.05322266, + "router_z_loss_mlp": 0.1796875, + "step": 2729, + "time_per_iteration": 2.6767430305480957 + }, + { + "auxiliary_loss_clip": 0.01154229, + "auxiliary_loss_mlp": 0.01043527, + "balance_loss_clip": 1.05155993, + "balance_loss_mlp": 1.02726734, + "epoch": 0.16413647978355628, + "flos": 18853407616320.0, + "grad_norm": 2.0153446798918733, + "language_loss": 0.72780895, + "learning_rate": 3.814207986905616e-06, + "loss": 0.7497865, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.16259766, + "step": 2730, + "time_per_iteration": 2.686022996902466 + }, + { + "auxiliary_loss_clip": 0.01159171, + "auxiliary_loss_mlp": 0.0104845, + "balance_loss_clip": 1.05189133, + "balance_loss_mlp": 1.02960348, + "epoch": 0.16419660303622427, + "flos": 55983211434720.0, + "grad_norm": 1.6320976738431088, + "language_loss": 0.74238634, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76446259, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 1.07226562, + "router_z_loss_mlp": 0.18847656, + "step": 2731, + "time_per_iteration": 2.848249912261963 + }, + { + "auxiliary_loss_clip": 0.01160183, + "auxiliary_loss_mlp": 0.01042819, + "balance_loss_clip": 1.05612266, + "balance_loss_mlp": 1.02471066, + "epoch": 0.16425672628889224, + "flos": 23126155499520.0, + "grad_norm": 2.2701468482352167, + "language_loss": 0.7895292, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.8115592, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 1.04150391, + "router_z_loss_mlp": 0.18115234, + "step": 2732, + "time_per_iteration": 2.7064454555511475 + }, + { + "auxiliary_loss_clip": 0.01158181, + "auxiliary_loss_mlp": 0.01051925, + "balance_loss_clip": 1.05344105, + "balance_loss_mlp": 1.03447294, + "epoch": 0.1643168495415602, + "flos": 29667344605920.0, + "grad_norm": 2.118813007473557, + "language_loss": 0.69292974, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71503079, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 1.04638672, + "router_z_loss_mlp": 0.17456055, + "step": 2733, + "time_per_iteration": 2.660733938217163 + }, + { + "auxiliary_loss_clip": 0.0115751, + "auxiliary_loss_mlp": 0.01048822, + "balance_loss_clip": 1.05404401, + "balance_loss_mlp": 1.02991581, + "epoch": 0.16437697279422817, + "flos": 32248818398880.0, + "grad_norm": 1.7730880474872115, + "language_loss": 0.8071115, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.82917488, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.18908691, + "step": 2734, + "time_per_iteration": 2.7520105838775635 + }, + { + "auxiliary_loss_clip": 0.01157733, + "auxiliary_loss_mlp": 0.01049165, + "balance_loss_clip": 1.05395269, + "balance_loss_mlp": 1.03159356, + "epoch": 0.16443709604689613, + "flos": 42137352519840.0, + "grad_norm": 2.0590195783216574, + "language_loss": 0.81942296, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84149194, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.17565918, + "step": 2735, + "time_per_iteration": 2.774473190307617 + }, + { + "auxiliary_loss_clip": 0.01154396, + "auxiliary_loss_mlp": 0.01041326, + "balance_loss_clip": 1.05403447, + "balance_loss_mlp": 1.02443361, + "epoch": 0.1644972192995641, + "flos": 28379727540000.0, + "grad_norm": 2.476128022775962, + "language_loss": 0.77980614, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80176336, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 1.00439453, + "router_z_loss_mlp": 0.16882324, + "step": 2736, + "time_per_iteration": 2.8309667110443115 + }, + { + "auxiliary_loss_clip": 0.01158142, + "auxiliary_loss_mlp": 0.01048769, + "balance_loss_clip": 1.05545902, + "balance_loss_mlp": 1.03154325, + "epoch": 0.1645573425522321, + "flos": 32920985852640.0, + "grad_norm": 1.6393851538177435, + "language_loss": 0.81527817, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83734739, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 1.02783203, + "router_z_loss_mlp": 0.17224121, + "step": 2737, + "time_per_iteration": 2.952899932861328 + }, + { + "auxiliary_loss_clip": 0.01159623, + "auxiliary_loss_mlp": 0.01043674, + "balance_loss_clip": 1.05473542, + "balance_loss_mlp": 1.02662706, + "epoch": 0.16461746580490005, + "flos": 35059102037280.0, + "grad_norm": 1.9021429842049968, + "language_loss": 0.86780405, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.88983709, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.17053223, + "step": 2738, + "time_per_iteration": 2.7468934059143066 + }, + { + "auxiliary_loss_clip": 0.01156698, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.05347443, + "balance_loss_mlp": 1.02948904, + "epoch": 0.16467758905756802, + "flos": 30420736505280.0, + "grad_norm": 1.6963518040897747, + "language_loss": 0.72172236, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74375892, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.17456055, + "step": 2739, + "time_per_iteration": 5.731045246124268 + }, + { + "auxiliary_loss_clip": 0.01151933, + "auxiliary_loss_mlp": 0.01041458, + "balance_loss_clip": 1.05118394, + "balance_loss_mlp": 1.02346945, + "epoch": 0.16473771231023598, + "flos": 30293733883680.0, + "grad_norm": 1.7393653286222264, + "language_loss": 0.81634343, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.8382774, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.17993164, + "step": 2740, + "time_per_iteration": 2.70198655128479 + }, + { + "auxiliary_loss_clip": 0.01164006, + "auxiliary_loss_mlp": 0.01049161, + "balance_loss_clip": 1.05724931, + "balance_loss_mlp": 1.0289315, + "epoch": 0.16479783556290395, + "flos": 48682674388800.0, + "grad_norm": 3.3957238896550166, + "language_loss": 0.69495797, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71708965, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 1.06835938, + "router_z_loss_mlp": 0.20239258, + "step": 2741, + "time_per_iteration": 2.8067517280578613 + }, + { + "auxiliary_loss_clip": 0.01156734, + "auxiliary_loss_mlp": 0.01038149, + "balance_loss_clip": 1.05309045, + "balance_loss_mlp": 1.02072048, + "epoch": 0.16485795881557191, + "flos": 24278788039680.0, + "grad_norm": 1.6826113793164197, + "language_loss": 0.79898894, + "learning_rate": 3.812235911671472e-06, + "loss": 0.82093775, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.17431641, + "step": 2742, + "time_per_iteration": 4.118621349334717 + }, + { + "auxiliary_loss_clip": 0.01154877, + "auxiliary_loss_mlp": 0.01047026, + "balance_loss_clip": 1.05418718, + "balance_loss_mlp": 1.02908552, + "epoch": 0.16491808206823988, + "flos": 25082820188640.0, + "grad_norm": 2.010863242863246, + "language_loss": 0.84552395, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.86754298, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.17944336, + "step": 2743, + "time_per_iteration": 2.6794588565826416 + }, + { + "auxiliary_loss_clip": 0.01152554, + "auxiliary_loss_mlp": 0.01044776, + "balance_loss_clip": 1.0523293, + "balance_loss_mlp": 1.02658474, + "epoch": 0.16497820532090787, + "flos": 29042454467520.0, + "grad_norm": 8.196925454770167, + "language_loss": 0.85656476, + "learning_rate": 3.811906270092265e-06, + "loss": 0.87853807, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.18200684, + "step": 2744, + "time_per_iteration": 4.275534629821777 + }, + { + "auxiliary_loss_clip": 0.01151505, + "auxiliary_loss_mlp": 0.01042348, + "balance_loss_clip": 1.05386972, + "balance_loss_mlp": 1.02578962, + "epoch": 0.16503832857357584, + "flos": 31094443615680.0, + "grad_norm": 1.8932448973454459, + "language_loss": 0.82860804, + "learning_rate": 3.811741346238036e-06, + "loss": 0.85054654, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.16552734, + "step": 2745, + "time_per_iteration": 2.6796939373016357 + }, + { + "auxiliary_loss_clip": 0.011609, + "auxiliary_loss_mlp": 0.01051275, + "balance_loss_clip": 1.05738211, + "balance_loss_mlp": 1.03372741, + "epoch": 0.1650984518262438, + "flos": 21568123692000.0, + "grad_norm": 2.576215056003136, + "language_loss": 0.76517648, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.7872982, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.17541504, + "step": 2746, + "time_per_iteration": 2.659250497817993 + }, + { + "auxiliary_loss_clip": 0.01154692, + "auxiliary_loss_mlp": 0.01052793, + "balance_loss_clip": 1.0531131, + "balance_loss_mlp": 1.03551996, + "epoch": 0.16515857507891177, + "flos": 22815513449280.0, + "grad_norm": 1.6071121283727325, + "language_loss": 0.80740464, + "learning_rate": 3.811411292431592e-06, + "loss": 0.82947946, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.17272949, + "step": 2747, + "time_per_iteration": 2.6478915214538574 + }, + { + "auxiliary_loss_clip": 0.01161439, + "auxiliary_loss_mlp": 0.01043325, + "balance_loss_clip": 1.05712962, + "balance_loss_mlp": 1.02586055, + "epoch": 0.16521869833157973, + "flos": 18315576411840.0, + "grad_norm": 2.6393155031040334, + "language_loss": 0.69488478, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71693248, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.17456055, + "step": 2748, + "time_per_iteration": 2.90751576423645 + }, + { + "auxiliary_loss_clip": 0.01161279, + "auxiliary_loss_mlp": 0.01048386, + "balance_loss_clip": 1.05844367, + "balance_loss_mlp": 1.03158998, + "epoch": 0.1652788215842477, + "flos": 26992936873440.0, + "grad_norm": 2.13830223628004, + "language_loss": 0.88051248, + "learning_rate": 3.811080963869561e-06, + "loss": 0.90260911, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 1.02880859, + "router_z_loss_mlp": 0.16784668, + "step": 2749, + "time_per_iteration": 2.662869930267334 + }, + { + "auxiliary_loss_clip": 0.01156916, + "auxiliary_loss_mlp": 0.01045224, + "balance_loss_clip": 1.05278349, + "balance_loss_mlp": 1.02710462, + "epoch": 0.16533894483691566, + "flos": 22369846356000.0, + "grad_norm": 2.7782523505426875, + "language_loss": 0.79624474, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.81826615, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.18115234, + "step": 2750, + "time_per_iteration": 2.648144483566284 + }, + { + "auxiliary_loss_clip": 0.01157346, + "auxiliary_loss_mlp": 0.01043943, + "balance_loss_clip": 1.05535436, + "balance_loss_mlp": 1.0265739, + "epoch": 0.16539906808958366, + "flos": 27310872034080.0, + "grad_norm": 1.7122592425193799, + "language_loss": 0.95259655, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.97460938, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 1.01757812, + "router_z_loss_mlp": 0.1739502, + "step": 2751, + "time_per_iteration": 2.676353693008423 + }, + { + "auxiliary_loss_clip": 0.01154273, + "auxiliary_loss_mlp": 0.01048523, + "balance_loss_clip": 1.0555687, + "balance_loss_mlp": 1.03148758, + "epoch": 0.16545919134225162, + "flos": 27712260090720.0, + "grad_norm": 2.5887356938272026, + "language_loss": 0.71060908, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73263705, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.17028809, + "step": 2752, + "time_per_iteration": 2.6941580772399902 + }, + { + "auxiliary_loss_clip": 0.01059885, + "auxiliary_loss_mlp": 0.01006303, + "balance_loss_clip": 1.02535701, + "balance_loss_mlp": 1.00427377, + "epoch": 0.1655193145949196, + "flos": 82733893076160.0, + "grad_norm": 0.777265857196124, + "language_loss": 0.54180795, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56246984, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 0.3449707, + "router_z_loss_mlp": 0.02029419, + "step": 2753, + "time_per_iteration": 3.379937171936035 + }, + { + "auxiliary_loss_clip": 0.01157374, + "auxiliary_loss_mlp": 0.01040509, + "balance_loss_clip": 1.05527389, + "balance_loss_mlp": 1.0227344, + "epoch": 0.16557943784758755, + "flos": 29627198331840.0, + "grad_norm": 1.822631315160001, + "language_loss": 0.7525301, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.77450895, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.17785645, + "step": 2754, + "time_per_iteration": 2.681745767593384 + }, + { + "auxiliary_loss_clip": 0.01167219, + "auxiliary_loss_mlp": 0.01053287, + "balance_loss_clip": 1.05836201, + "balance_loss_mlp": 1.03330767, + "epoch": 0.16563956110025552, + "flos": 24506058228480.0, + "grad_norm": 2.5803301083489885, + "language_loss": 0.87062347, + "learning_rate": 3.810088330151188e-06, + "loss": 0.89282858, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 1.08789062, + "router_z_loss_mlp": 0.19970703, + "step": 2755, + "time_per_iteration": 2.6786856651306152 + }, + { + "auxiliary_loss_clip": 0.01155697, + "auxiliary_loss_mlp": 0.01051087, + "balance_loss_clip": 1.05401802, + "balance_loss_mlp": 1.03380167, + "epoch": 0.16569968435292348, + "flos": 34208562401280.0, + "grad_norm": 1.6986133283058504, + "language_loss": 0.73201692, + "learning_rate": 3.80992265092595e-06, + "loss": 0.75408477, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.17285156, + "step": 2756, + "time_per_iteration": 2.7241697311401367 + }, + { + "auxiliary_loss_clip": 0.0115175, + "auxiliary_loss_mlp": 0.01048361, + "balance_loss_clip": 1.05489552, + "balance_loss_mlp": 1.03124285, + "epoch": 0.16575980760559147, + "flos": 32030988736320.0, + "grad_norm": 1.6654902461495382, + "language_loss": 0.75020015, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.7722013, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.17126465, + "step": 2757, + "time_per_iteration": 2.702639102935791 + }, + { + "auxiliary_loss_clip": 0.01158106, + "auxiliary_loss_mlp": 0.01047318, + "balance_loss_clip": 1.05710077, + "balance_loss_mlp": 1.03042603, + "epoch": 0.16581993085825944, + "flos": 32877476644320.0, + "grad_norm": 1.8663386601197367, + "language_loss": 0.84971952, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.87177378, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.16894531, + "step": 2758, + "time_per_iteration": 2.763688564300537 + }, + { + "auxiliary_loss_clip": 0.01157169, + "auxiliary_loss_mlp": 0.01047363, + "balance_loss_clip": 1.05606031, + "balance_loss_mlp": 1.0316993, + "epoch": 0.1658800541109274, + "flos": 26421806815200.0, + "grad_norm": 1.8615085457747345, + "language_loss": 0.79175681, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81380212, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 1.01025391, + "router_z_loss_mlp": 0.15673828, + "step": 2759, + "time_per_iteration": 2.720595598220825 + }, + { + "auxiliary_loss_clip": 0.01153569, + "auxiliary_loss_mlp": 0.01047567, + "balance_loss_clip": 1.0521698, + "balance_loss_mlp": 1.02959037, + "epoch": 0.16594017736359537, + "flos": 20054289886560.0, + "grad_norm": 2.624775563691504, + "language_loss": 0.75285906, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.7748704, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.17980957, + "step": 2760, + "time_per_iteration": 2.775028705596924 + }, + { + "auxiliary_loss_clip": 0.01161367, + "auxiliary_loss_mlp": 0.01038124, + "balance_loss_clip": 1.05544591, + "balance_loss_mlp": 1.0213933, + "epoch": 0.16600030061626334, + "flos": 27622283912640.0, + "grad_norm": 1.9661183255806598, + "language_loss": 0.72891325, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.7509082, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.16729736, + "step": 2761, + "time_per_iteration": 2.7526562213897705 + }, + { + "auxiliary_loss_clip": 0.01157343, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.05584478, + "balance_loss_mlp": 1.02511084, + "epoch": 0.1660604238689313, + "flos": 32209522987680.0, + "grad_norm": 2.0109388367119134, + "language_loss": 0.89179355, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91378862, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.17053223, + "step": 2762, + "time_per_iteration": 2.68058180809021 + }, + { + "auxiliary_loss_clip": 0.0115558, + "auxiliary_loss_mlp": 0.01047935, + "balance_loss_clip": 1.05324483, + "balance_loss_mlp": 1.02986312, + "epoch": 0.16612054712159927, + "flos": 28113567112800.0, + "grad_norm": 1.7010291337549917, + "language_loss": 0.87951058, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.90154576, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.18066406, + "step": 2763, + "time_per_iteration": 2.670254707336426 + }, + { + "auxiliary_loss_clip": 0.01057401, + "auxiliary_loss_mlp": 0.01004029, + "balance_loss_clip": 1.02194142, + "balance_loss_mlp": 1.00194478, + "epoch": 0.16618067037426726, + "flos": 72285047010720.0, + "grad_norm": 0.8056474280265775, + "language_loss": 0.59821928, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61883354, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 0.35522461, + "router_z_loss_mlp": 0.02085876, + "step": 2764, + "time_per_iteration": 3.377962112426758 + }, + { + "auxiliary_loss_clip": 0.0115874, + "auxiliary_loss_mlp": 0.01047308, + "balance_loss_clip": 1.05561137, + "balance_loss_mlp": 1.0281868, + "epoch": 0.16624079362693522, + "flos": 33188523867360.0, + "grad_norm": 1.8741779305479866, + "language_loss": 0.81980467, + "learning_rate": 3.808428450193401e-06, + "loss": 0.84186512, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.19104004, + "step": 2765, + "time_per_iteration": 2.705185890197754 + }, + { + "auxiliary_loss_clip": 0.01162499, + "auxiliary_loss_mlp": 0.01052065, + "balance_loss_clip": 1.05431271, + "balance_loss_mlp": 1.03268123, + "epoch": 0.1663009168796032, + "flos": 13330150145280.0, + "grad_norm": 2.230759294895638, + "language_loss": 0.69819057, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.7203362, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 1.08203125, + "router_z_loss_mlp": 0.19384766, + "step": 2766, + "time_per_iteration": 2.6193530559539795 + }, + { + "auxiliary_loss_clip": 0.01155218, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_clip": 1.0560447, + "balance_loss_mlp": 1.02542901, + "epoch": 0.16636104013227115, + "flos": 21834648774720.0, + "grad_norm": 4.091696843094521, + "language_loss": 0.88437963, + "learning_rate": 3.808095651090769e-06, + "loss": 0.90635252, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.16650391, + "step": 2767, + "time_per_iteration": 2.6639623641967773 + }, + { + "auxiliary_loss_clip": 0.01054994, + "auxiliary_loss_mlp": 0.01004701, + "balance_loss_clip": 1.01942134, + "balance_loss_mlp": 1.0027194, + "epoch": 0.16642116338493912, + "flos": 81418122851040.0, + "grad_norm": 0.641917508312314, + "language_loss": 0.52917057, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.54976749, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 0.35620117, + "router_z_loss_mlp": 0.01980591, + "step": 2768, + "time_per_iteration": 3.3521571159362793 + }, + { + "auxiliary_loss_clip": 0.01157903, + "auxiliary_loss_mlp": 0.01043013, + "balance_loss_clip": 1.05215585, + "balance_loss_mlp": 1.02511954, + "epoch": 0.16648128663760708, + "flos": 23215483401120.0, + "grad_norm": 2.7251216605498896, + "language_loss": 0.84938467, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87139374, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 1.05664062, + "router_z_loss_mlp": 0.17907715, + "step": 2769, + "time_per_iteration": 2.6400113105773926 + }, + { + "auxiliary_loss_clip": 0.0105509, + "auxiliary_loss_mlp": 0.0100585, + "balance_loss_clip": 1.01923263, + "balance_loss_mlp": 1.00379944, + "epoch": 0.16654140989027508, + "flos": 85579866087840.0, + "grad_norm": 0.8095030736036831, + "language_loss": 0.57448137, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59509075, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 0.35864258, + "router_z_loss_mlp": 0.02050781, + "step": 2770, + "time_per_iteration": 3.125946283340454 + }, + { + "auxiliary_loss_clip": 0.01054911, + "auxiliary_loss_mlp": 0.01004708, + "balance_loss_clip": 1.01895952, + "balance_loss_mlp": 1.00263703, + "epoch": 0.16660153314294304, + "flos": 85748352053760.0, + "grad_norm": 0.856368100354507, + "language_loss": 0.56219971, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58279586, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 0.35913086, + "router_z_loss_mlp": 0.02072144, + "step": 2771, + "time_per_iteration": 3.127326726913452 + }, + { + "auxiliary_loss_clip": 0.0115388, + "auxiliary_loss_mlp": 0.01050956, + "balance_loss_clip": 1.0519588, + "balance_loss_mlp": 1.03259754, + "epoch": 0.166661656395611, + "flos": 28156468561920.0, + "grad_norm": 2.1125548901735587, + "language_loss": 0.69839066, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72043902, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 1.01904297, + "router_z_loss_mlp": 0.18347168, + "step": 2772, + "time_per_iteration": 2.7514662742614746 + }, + { + "auxiliary_loss_clip": 0.01150854, + "auxiliary_loss_mlp": 0.01040346, + "balance_loss_clip": 1.04886293, + "balance_loss_mlp": 1.02247691, + "epoch": 0.16672177964827897, + "flos": 34613232357600.0, + "grad_norm": 2.1081184401723263, + "language_loss": 0.86224568, + "learning_rate": 3.807095608468975e-06, + "loss": 0.88415766, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.17858887, + "step": 2773, + "time_per_iteration": 2.8333890438079834 + }, + { + "auxiliary_loss_clip": 0.01152782, + "auxiliary_loss_mlp": 0.01041258, + "balance_loss_clip": 1.05192435, + "balance_loss_mlp": 1.02423465, + "epoch": 0.16678190290094694, + "flos": 23295492328320.0, + "grad_norm": 2.4194049688776254, + "language_loss": 0.82328808, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84522843, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.17016602, + "step": 2774, + "time_per_iteration": 2.621879816055298 + }, + { + "auxiliary_loss_clip": 0.01155828, + "auxiliary_loss_mlp": 0.01040364, + "balance_loss_clip": 1.05174863, + "balance_loss_mlp": 1.02275634, + "epoch": 0.1668420261536149, + "flos": 26599368651840.0, + "grad_norm": 2.16268937560537, + "language_loss": 0.83089066, + "learning_rate": 3.806761712658952e-06, + "loss": 0.85285258, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.17602539, + "step": 2775, + "time_per_iteration": 2.6621828079223633 + }, + { + "auxiliary_loss_clip": 0.01153166, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.05271983, + "balance_loss_mlp": 1.02973783, + "epoch": 0.16690214940628287, + "flos": 23260005541440.0, + "grad_norm": 1.8606295597313376, + "language_loss": 0.80665243, + "learning_rate": 3.806594661981897e-06, + "loss": 0.82864946, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.16784668, + "step": 2776, + "time_per_iteration": 2.6427974700927734 + }, + { + "auxiliary_loss_clip": 0.01148773, + "auxiliary_loss_mlp": 0.01043876, + "balance_loss_clip": 1.05244434, + "balance_loss_mlp": 1.02713871, + "epoch": 0.16696227265895086, + "flos": 22681582372800.0, + "grad_norm": 1.9469462800983834, + "language_loss": 0.80629158, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82821798, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.1673584, + "step": 2777, + "time_per_iteration": 2.6504948139190674 + }, + { + "auxiliary_loss_clip": 0.01152138, + "auxiliary_loss_mlp": 0.01042593, + "balance_loss_clip": 1.05131233, + "balance_loss_mlp": 1.02509296, + "epoch": 0.16702239591161883, + "flos": 28423803990240.0, + "grad_norm": 1.7722894045509645, + "language_loss": 0.85142779, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87337518, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 1.00830078, + "router_z_loss_mlp": 0.1751709, + "step": 2778, + "time_per_iteration": 5.534465312957764 + }, + { + "auxiliary_loss_clip": 0.01157122, + "auxiliary_loss_mlp": 0.01035625, + "balance_loss_clip": 1.0535593, + "balance_loss_mlp": 1.01774347, + "epoch": 0.1670825191642868, + "flos": 29804517064800.0, + "grad_norm": 1.815416470430344, + "language_loss": 0.7453205, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76724797, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 1.03417969, + "router_z_loss_mlp": 0.17883301, + "step": 2779, + "time_per_iteration": 2.701972723007202 + }, + { + "auxiliary_loss_clip": 0.01154804, + "auxiliary_loss_mlp": 0.01040571, + "balance_loss_clip": 1.05286241, + "balance_loss_mlp": 1.02264166, + "epoch": 0.16714264241695476, + "flos": 32698820841120.0, + "grad_norm": 2.1517984212261005, + "language_loss": 0.65426791, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67622161, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 1.02001953, + "router_z_loss_mlp": 0.17919922, + "step": 2780, + "time_per_iteration": 2.703151226043701 + }, + { + "auxiliary_loss_clip": 0.01153865, + "auxiliary_loss_mlp": 0.01042493, + "balance_loss_clip": 1.05361164, + "balance_loss_mlp": 1.02467179, + "epoch": 0.16720276566962272, + "flos": 26643566653920.0, + "grad_norm": 3.2444336629133144, + "language_loss": 0.7855947, + "learning_rate": 3.805758381129643e-06, + "loss": 0.8075583, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 1.00292969, + "router_z_loss_mlp": 0.17822266, + "step": 2781, + "time_per_iteration": 2.6760976314544678 + }, + { + "auxiliary_loss_clip": 0.01156632, + "auxiliary_loss_mlp": 0.01041603, + "balance_loss_clip": 1.05367994, + "balance_loss_mlp": 1.02484274, + "epoch": 0.1672628889222907, + "flos": 26198304733440.0, + "grad_norm": 1.4888990968763316, + "language_loss": 0.75486898, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77685142, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.16748047, + "step": 2782, + "time_per_iteration": 4.203087091445923 + }, + { + "auxiliary_loss_clip": 0.01162708, + "auxiliary_loss_mlp": 0.01050657, + "balance_loss_clip": 1.05615568, + "balance_loss_mlp": 1.03172708, + "epoch": 0.16732301217495865, + "flos": 37551288445920.0, + "grad_norm": 1.9714852048553273, + "language_loss": 0.67614043, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.69827402, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 1.06738281, + "router_z_loss_mlp": 0.18933105, + "step": 2783, + "time_per_iteration": 4.41610860824585 + }, + { + "auxiliary_loss_clip": 0.01154584, + "auxiliary_loss_mlp": 0.01046776, + "balance_loss_clip": 1.05309641, + "balance_loss_mlp": 1.02980113, + "epoch": 0.16738313542762664, + "flos": 28644226758720.0, + "grad_norm": 1.661798884228536, + "language_loss": 0.70247573, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72448933, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.16967773, + "step": 2784, + "time_per_iteration": 2.672165870666504 + }, + { + "auxiliary_loss_clip": 0.01158854, + "auxiliary_loss_mlp": 0.01049707, + "balance_loss_clip": 1.05311739, + "balance_loss_mlp": 1.0302999, + "epoch": 0.1674432586802946, + "flos": 36349393243680.0, + "grad_norm": 1.7129832419902136, + "language_loss": 0.6042577, + "learning_rate": 3.805088123868126e-06, + "loss": 0.62634337, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 1.05859375, + "router_z_loss_mlp": 0.1940918, + "step": 2785, + "time_per_iteration": 2.689812421798706 + }, + { + "auxiliary_loss_clip": 0.01059902, + "auxiliary_loss_mlp": 0.01018742, + "balance_loss_clip": 1.02397871, + "balance_loss_mlp": 1.01643944, + "epoch": 0.16750338193296258, + "flos": 80698759116480.0, + "grad_norm": 0.7836631595991344, + "language_loss": 0.58842671, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60921311, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 0.35888672, + "router_z_loss_mlp": 0.02302551, + "step": 2786, + "time_per_iteration": 3.3254597187042236 + }, + { + "auxiliary_loss_clip": 0.01159452, + "auxiliary_loss_mlp": 0.0104445, + "balance_loss_clip": 1.05492544, + "balance_loss_mlp": 1.02665186, + "epoch": 0.16756350518563054, + "flos": 31356187659360.0, + "grad_norm": 1.823253534294736, + "language_loss": 0.76119769, + "learning_rate": 3.80475258451721e-06, + "loss": 0.78323674, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 1.04492188, + "router_z_loss_mlp": 0.17797852, + "step": 2787, + "time_per_iteration": 2.721630096435547 + }, + { + "auxiliary_loss_clip": 0.01156864, + "auxiliary_loss_mlp": 0.01039468, + "balance_loss_clip": 1.05468702, + "balance_loss_mlp": 1.02296901, + "epoch": 0.1676236284382985, + "flos": 29085639537600.0, + "grad_norm": 2.107646892419614, + "language_loss": 0.77175236, + "learning_rate": 3.804584712183972e-06, + "loss": 0.79371566, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 1.02099609, + "router_z_loss_mlp": 0.16503906, + "step": 2788, + "time_per_iteration": 2.770024299621582 + }, + { + "auxiliary_loss_clip": 0.01056359, + "auxiliary_loss_mlp": 0.01008674, + "balance_loss_clip": 1.02076507, + "balance_loss_mlp": 1.0065428, + "epoch": 0.16768375169096647, + "flos": 73060433442720.0, + "grad_norm": 0.8596944851668686, + "language_loss": 0.59329414, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61394447, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 0.02133179, + "step": 2789, + "time_per_iteration": 3.0860233306884766 + }, + { + "auxiliary_loss_clip": 0.01159729, + "auxiliary_loss_mlp": 0.01053325, + "balance_loss_clip": 1.0548526, + "balance_loss_mlp": 1.03547943, + "epoch": 0.16774387494363446, + "flos": 46900654292160.0, + "grad_norm": 1.4890850644141533, + "language_loss": 0.70132917, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72345972, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 1.04882812, + "router_z_loss_mlp": 0.17858887, + "step": 2790, + "time_per_iteration": 2.7935588359832764 + }, + { + "auxiliary_loss_clip": 0.01153485, + "auxiliary_loss_mlp": 0.01051357, + "balance_loss_clip": 1.05136633, + "balance_loss_mlp": 1.03476346, + "epoch": 0.16780399819630243, + "flos": 27622648568160.0, + "grad_norm": 1.6699093771084965, + "language_loss": 0.79119587, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81324434, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 1.02099609, + "router_z_loss_mlp": 0.16595459, + "step": 2791, + "time_per_iteration": 2.6754167079925537 + }, + { + "auxiliary_loss_clip": 0.01159129, + "auxiliary_loss_mlp": 0.01049269, + "balance_loss_clip": 1.05566049, + "balance_loss_mlp": 1.03200746, + "epoch": 0.1678641214489704, + "flos": 40135112241120.0, + "grad_norm": 1.9064019871596052, + "language_loss": 0.716214, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73829794, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.17272949, + "step": 2792, + "time_per_iteration": 2.801751136779785 + }, + { + "auxiliary_loss_clip": 0.01156852, + "auxiliary_loss_mlp": 0.01043612, + "balance_loss_clip": 1.05357504, + "balance_loss_mlp": 1.02644587, + "epoch": 0.16792424470163836, + "flos": 24373342670400.0, + "grad_norm": 1.8365408140295867, + "language_loss": 0.71303904, + "learning_rate": 3.803744324194691e-06, + "loss": 0.7350437, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.17175293, + "step": 2793, + "time_per_iteration": 2.7050514221191406 + }, + { + "auxiliary_loss_clip": 0.01157566, + "auxiliary_loss_mlp": 0.01045689, + "balance_loss_clip": 1.05447638, + "balance_loss_mlp": 1.02803385, + "epoch": 0.16798436795430632, + "flos": 24062092860960.0, + "grad_norm": 2.9626529684263794, + "language_loss": 0.76996219, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79199481, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 1.03027344, + "router_z_loss_mlp": 0.17675781, + "step": 2794, + "time_per_iteration": 2.6806464195251465 + }, + { + "auxiliary_loss_clip": 0.01158354, + "auxiliary_loss_mlp": 0.01046972, + "balance_loss_clip": 1.05524743, + "balance_loss_mlp": 1.02985382, + "epoch": 0.1680444912069743, + "flos": 34294689437760.0, + "grad_norm": 2.220261965222018, + "language_loss": 0.71178138, + "learning_rate": 3.803407690167187e-06, + "loss": 0.73383468, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.17126465, + "step": 2795, + "time_per_iteration": 2.746575355529785 + }, + { + "auxiliary_loss_clip": 0.01153435, + "auxiliary_loss_mlp": 0.01035773, + "balance_loss_clip": 1.05294859, + "balance_loss_mlp": 1.01940513, + "epoch": 0.16810461445964225, + "flos": 22055395681440.0, + "grad_norm": 2.129877517945735, + "language_loss": 0.84402084, + "learning_rate": 3.803239270572142e-06, + "loss": 0.86591291, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 1.00439453, + "router_z_loss_mlp": 0.16357422, + "step": 2796, + "time_per_iteration": 2.82413911819458 + }, + { + "auxiliary_loss_clip": 0.01156912, + "auxiliary_loss_mlp": 0.01045028, + "balance_loss_clip": 1.05224609, + "balance_loss_mlp": 1.02763534, + "epoch": 0.16816473771231025, + "flos": 29137009098240.0, + "grad_norm": 1.930735243035259, + "language_loss": 0.81365955, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83567894, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 1.046875, + "router_z_loss_mlp": 0.1739502, + "step": 2797, + "time_per_iteration": 2.6767477989196777 + }, + { + "auxiliary_loss_clip": 0.01146911, + "auxiliary_loss_mlp": 0.01038851, + "balance_loss_clip": 1.05123734, + "balance_loss_mlp": 1.02397358, + "epoch": 0.1682248609649782, + "flos": 27800777646720.0, + "grad_norm": 1.4510470067260168, + "language_loss": 0.7520473, + "learning_rate": 3.802902226251401e-06, + "loss": 0.77390492, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.14855957, + "step": 2798, + "time_per_iteration": 2.702972650527954 + }, + { + "auxiliary_loss_clip": 0.0115537, + "auxiliary_loss_mlp": 0.01043865, + "balance_loss_clip": 1.05404949, + "balance_loss_mlp": 1.02833223, + "epoch": 0.16828498421764618, + "flos": 25264352718720.0, + "grad_norm": 2.3275537115904474, + "language_loss": 0.79636788, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81836021, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.15527344, + "step": 2799, + "time_per_iteration": 2.643407106399536 + }, + { + "auxiliary_loss_clip": 0.01153537, + "auxiliary_loss_mlp": 0.01036665, + "balance_loss_clip": 1.05006921, + "balance_loss_mlp": 1.01937985, + "epoch": 0.16834510747031414, + "flos": 35900079595200.0, + "grad_norm": 2.423992447124065, + "language_loss": 0.7057969, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.72769892, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 1.03369141, + "router_z_loss_mlp": 0.17285156, + "step": 2800, + "time_per_iteration": 2.7523577213287354 + }, + { + "auxiliary_loss_clip": 0.01150907, + "auxiliary_loss_mlp": 0.0103973, + "balance_loss_clip": 1.05024242, + "balance_loss_mlp": 1.02159858, + "epoch": 0.1684052307229821, + "flos": 22141279614240.0, + "grad_norm": 2.2458670596077916, + "language_loss": 0.83426762, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.85617399, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.18127441, + "step": 2801, + "time_per_iteration": 2.6461007595062256 + }, + { + "auxiliary_loss_clip": 0.01150478, + "auxiliary_loss_mlp": 0.01052787, + "balance_loss_clip": 1.04782772, + "balance_loss_mlp": 1.03593087, + "epoch": 0.16846535397565007, + "flos": 20225044820160.0, + "grad_norm": 2.3831157965659346, + "language_loss": 0.8243767, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.84640932, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.16833496, + "step": 2802, + "time_per_iteration": 2.679551124572754 + }, + { + "auxiliary_loss_clip": 0.01156009, + "auxiliary_loss_mlp": 0.01041597, + "balance_loss_clip": 1.05297208, + "balance_loss_mlp": 1.0243237, + "epoch": 0.16852547722831807, + "flos": 37105742904480.0, + "grad_norm": 2.125075493640921, + "language_loss": 0.80773163, + "learning_rate": 3.802058419152413e-06, + "loss": 0.82970762, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.17272949, + "step": 2803, + "time_per_iteration": 2.7536489963531494 + }, + { + "auxiliary_loss_clip": 0.01151851, + "auxiliary_loss_mlp": 0.01043727, + "balance_loss_clip": 1.05130816, + "balance_loss_mlp": 1.02675128, + "epoch": 0.16858560048098603, + "flos": 40887086035680.0, + "grad_norm": 2.241888627825606, + "language_loss": 0.76225007, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78420579, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.17004395, + "step": 2804, + "time_per_iteration": 2.8256850242614746 + }, + { + "auxiliary_loss_clip": 0.010478, + "auxiliary_loss_mlp": 0.01011145, + "balance_loss_clip": 1.01326644, + "balance_loss_mlp": 1.00910068, + "epoch": 0.168645723733654, + "flos": 82205664467040.0, + "grad_norm": 0.8389232492847667, + "language_loss": 0.55459368, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57518315, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 0.34521484, + "router_z_loss_mlp": 0.02044678, + "step": 2805, + "time_per_iteration": 3.2391178607940674 + }, + { + "auxiliary_loss_clip": 0.01145203, + "auxiliary_loss_mlp": 0.01040456, + "balance_loss_clip": 1.04696369, + "balance_loss_mlp": 1.02481604, + "epoch": 0.16870584698632196, + "flos": 26020378241280.0, + "grad_norm": 1.890688308939245, + "language_loss": 0.72953779, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.75139433, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 0.98242188, + "router_z_loss_mlp": 0.15637207, + "step": 2806, + "time_per_iteration": 2.665097713470459 + }, + { + "auxiliary_loss_clip": 0.01147698, + "auxiliary_loss_mlp": 0.01042248, + "balance_loss_clip": 1.04956675, + "balance_loss_mlp": 1.02567816, + "epoch": 0.16876597023898993, + "flos": 25307902444320.0, + "grad_norm": 2.245306550115147, + "language_loss": 0.70196116, + "learning_rate": 3.80138214341862e-06, + "loss": 0.72386062, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.16552734, + "step": 2807, + "time_per_iteration": 2.6604766845703125 + }, + { + "auxiliary_loss_clip": 0.01151519, + "auxiliary_loss_mlp": 0.01043994, + "balance_loss_clip": 1.04945374, + "balance_loss_mlp": 1.02686393, + "epoch": 0.1688260934916579, + "flos": 24774285036960.0, + "grad_norm": 2.919980288296814, + "language_loss": 0.70112777, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.7230829, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 1.02099609, + "router_z_loss_mlp": 0.17126465, + "step": 2808, + "time_per_iteration": 2.881922960281372 + }, + { + "auxiliary_loss_clip": 0.01152789, + "auxiliary_loss_mlp": 0.01040742, + "balance_loss_clip": 1.0493927, + "balance_loss_mlp": 1.02291453, + "epoch": 0.16888621674432586, + "flos": 24819617522880.0, + "grad_norm": 2.7032347017988196, + "language_loss": 0.800228, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.82216328, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 1.03320312, + "router_z_loss_mlp": 0.17828369, + "step": 2809, + "time_per_iteration": 2.7489123344421387 + }, + { + "auxiliary_loss_clip": 0.01157294, + "auxiliary_loss_mlp": 0.01038092, + "balance_loss_clip": 1.05124497, + "balance_loss_mlp": 1.02110481, + "epoch": 0.16894633999699385, + "flos": 19822360210560.0, + "grad_norm": 2.095486614392925, + "language_loss": 0.87911421, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.90106809, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 1.0625, + "router_z_loss_mlp": 0.16992188, + "step": 2810, + "time_per_iteration": 2.6125271320343018 + }, + { + "auxiliary_loss_clip": 0.01156204, + "auxiliary_loss_mlp": 0.01048693, + "balance_loss_clip": 1.05252337, + "balance_loss_mlp": 1.03126442, + "epoch": 0.16900646324966181, + "flos": 23928242819040.0, + "grad_norm": 2.144062090136257, + "language_loss": 0.92558229, + "learning_rate": 3.800704774747416e-06, + "loss": 0.94763124, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.17443848, + "step": 2811, + "time_per_iteration": 2.651522159576416 + }, + { + "auxiliary_loss_clip": 0.01156279, + "auxiliary_loss_mlp": 0.01043718, + "balance_loss_clip": 1.05338609, + "balance_loss_mlp": 1.02746987, + "epoch": 0.16906658650232978, + "flos": 26866582528320.0, + "grad_norm": 5.105631606443288, + "language_loss": 0.78962636, + "learning_rate": 3.800535261856291e-06, + "loss": 0.81162626, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 1.02929688, + "router_z_loss_mlp": 0.16247559, + "step": 2812, + "time_per_iteration": 2.636977434158325 + }, + { + "auxiliary_loss_clip": 0.01152273, + "auxiliary_loss_mlp": 0.01045832, + "balance_loss_clip": 1.05387282, + "balance_loss_mlp": 1.02964354, + "epoch": 0.16912670975499774, + "flos": 14353551613440.0, + "grad_norm": 2.5256883957422476, + "language_loss": 0.75188726, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.77386832, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.1619873, + "step": 2813, + "time_per_iteration": 2.6527326107025146 + }, + { + "auxiliary_loss_clip": 0.01154974, + "auxiliary_loss_mlp": 0.01042863, + "balance_loss_clip": 1.05118573, + "balance_loss_mlp": 1.02597094, + "epoch": 0.1691868330076657, + "flos": 20940437861280.0, + "grad_norm": 2.7981414484353286, + "language_loss": 0.68770778, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.70968616, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.16894531, + "step": 2814, + "time_per_iteration": 2.7294769287109375 + }, + { + "auxiliary_loss_clip": 0.01152665, + "auxiliary_loss_mlp": 0.01040941, + "balance_loss_clip": 1.05220866, + "balance_loss_mlp": 1.02452552, + "epoch": 0.16924695626033368, + "flos": 27353084689440.0, + "grad_norm": 2.0737231761399757, + "language_loss": 0.61617929, + "learning_rate": 3.800026313549776e-06, + "loss": 0.63811535, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.16418457, + "step": 2815, + "time_per_iteration": 2.659109115600586 + }, + { + "auxiliary_loss_clip": 0.0115045, + "auxiliary_loss_mlp": 0.01039312, + "balance_loss_clip": 1.05041122, + "balance_loss_mlp": 1.02315879, + "epoch": 0.16930707951300164, + "flos": 31407921875520.0, + "grad_norm": 1.7671390970832357, + "language_loss": 0.82355988, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.84545755, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 1.00146484, + "router_z_loss_mlp": 0.16149902, + "step": 2816, + "time_per_iteration": 2.7153279781341553 + }, + { + "auxiliary_loss_clip": 0.01156762, + "auxiliary_loss_mlp": 0.01045242, + "balance_loss_clip": 1.05443931, + "balance_loss_mlp": 1.02821851, + "epoch": 0.16936720276566963, + "flos": 27756174471840.0, + "grad_norm": 2.622183629294233, + "language_loss": 0.87774169, + "learning_rate": 3.799686673382153e-06, + "loss": 0.8997618, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 1.02294922, + "router_z_loss_mlp": 0.17028809, + "step": 2817, + "time_per_iteration": 2.6366007328033447 + }, + { + "auxiliary_loss_clip": 0.01154406, + "auxiliary_loss_mlp": 0.01047094, + "balance_loss_clip": 1.05385947, + "balance_loss_mlp": 1.02961802, + "epoch": 0.1694273260183376, + "flos": 23884044816960.0, + "grad_norm": 2.065608262907928, + "language_loss": 0.81007922, + "learning_rate": 3.799516750928672e-06, + "loss": 0.83209425, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.17480469, + "step": 2818, + "time_per_iteration": 5.54097318649292 + }, + { + "auxiliary_loss_clip": 0.01152101, + "auxiliary_loss_mlp": 0.01041582, + "balance_loss_clip": 1.05182409, + "balance_loss_mlp": 1.02455926, + "epoch": 0.16948744927100556, + "flos": 15200525728800.0, + "grad_norm": 3.2748926672780363, + "language_loss": 0.80753833, + "learning_rate": 3.799346760237336e-06, + "loss": 0.82947516, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 1.00292969, + "router_z_loss_mlp": 0.17016602, + "step": 2819, + "time_per_iteration": 2.6871726512908936 + }, + { + "auxiliary_loss_clip": 0.0105253, + "auxiliary_loss_mlp": 0.0100539, + "balance_loss_clip": 1.01838422, + "balance_loss_mlp": 1.00303555, + "epoch": 0.16954757252367353, + "flos": 86992378876800.0, + "grad_norm": 0.9434533427614085, + "language_loss": 0.61186361, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63244283, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 0.34130859, + "router_z_loss_mlp": 0.02352905, + "step": 2820, + "time_per_iteration": 3.3487508296966553 + }, + { + "auxiliary_loss_clip": 0.01155426, + "auxiliary_loss_mlp": 0.01045595, + "balance_loss_clip": 1.05381942, + "balance_loss_mlp": 1.02972853, + "epoch": 0.1696076957763415, + "flos": 36126620472960.0, + "grad_norm": 1.9516181540438524, + "language_loss": 0.78326523, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.80527544, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.15856934, + "step": 2821, + "time_per_iteration": 4.2570812702178955 + }, + { + "auxiliary_loss_clip": 0.01155292, + "auxiliary_loss_mlp": 0.0105105, + "balance_loss_clip": 1.0536865, + "balance_loss_mlp": 1.03313279, + "epoch": 0.16966781902900946, + "flos": 29759346648000.0, + "grad_norm": 1.9235506265789115, + "language_loss": 0.78209865, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.80416203, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.17919922, + "step": 2822, + "time_per_iteration": 4.263019323348999 + }, + { + "auxiliary_loss_clip": 0.01150406, + "auxiliary_loss_mlp": 0.01041442, + "balance_loss_clip": 1.0529592, + "balance_loss_mlp": 1.02519393, + "epoch": 0.16972794228167745, + "flos": 28112230042560.0, + "grad_norm": 2.412206105148483, + "language_loss": 0.74682415, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.76874262, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.16235352, + "step": 2823, + "time_per_iteration": 2.7102575302124023 + }, + { + "auxiliary_loss_clip": 0.01153156, + "auxiliary_loss_mlp": 0.01049223, + "balance_loss_clip": 1.05246162, + "balance_loss_mlp": 1.03221202, + "epoch": 0.16978806553434542, + "flos": 42986919741120.0, + "grad_norm": 2.0892872460681535, + "language_loss": 0.59847903, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62050283, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 1.00732422, + "router_z_loss_mlp": 0.17004395, + "step": 2824, + "time_per_iteration": 2.807997941970825 + }, + { + "auxiliary_loss_clip": 0.01156719, + "auxiliary_loss_mlp": 0.01041685, + "balance_loss_clip": 1.05712247, + "balance_loss_mlp": 1.02447128, + "epoch": 0.16984818878701338, + "flos": 39064798113120.0, + "grad_norm": 1.6686013173731888, + "language_loss": 0.7311039, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.753088, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.17224121, + "step": 2825, + "time_per_iteration": 2.775622844696045 + }, + { + "auxiliary_loss_clip": 0.01160052, + "auxiliary_loss_mlp": 0.01048824, + "balance_loss_clip": 1.05358028, + "balance_loss_mlp": 1.02975035, + "epoch": 0.16990831203968135, + "flos": 27843638578560.0, + "grad_norm": 2.0932796680103416, + "language_loss": 0.85968959, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.88177836, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.19055176, + "step": 2826, + "time_per_iteration": 2.6398439407348633 + }, + { + "auxiliary_loss_clip": 0.01157287, + "auxiliary_loss_mlp": 0.01050207, + "balance_loss_clip": 1.05203927, + "balance_loss_mlp": 1.03275442, + "epoch": 0.1699684352923493, + "flos": 28113121422720.0, + "grad_norm": 3.4445259366034917, + "language_loss": 0.82333046, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.8454054, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 1.05224609, + "router_z_loss_mlp": 0.17443848, + "step": 2827, + "time_per_iteration": 2.6681032180786133 + }, + { + "auxiliary_loss_clip": 0.0115829, + "auxiliary_loss_mlp": 0.01043734, + "balance_loss_clip": 1.05320346, + "balance_loss_mlp": 1.02542341, + "epoch": 0.17002855854501728, + "flos": 26153985179520.0, + "grad_norm": 2.0278365393460778, + "language_loss": 0.74032509, + "learning_rate": 3.797813774376267e-06, + "loss": 0.76234537, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 1.05126953, + "router_z_loss_mlp": 0.18310547, + "step": 2828, + "time_per_iteration": 2.6602842807769775 + }, + { + "auxiliary_loss_clip": 0.01047821, + "auxiliary_loss_mlp": 0.01005111, + "balance_loss_clip": 1.01352596, + "balance_loss_mlp": 1.00289059, + "epoch": 0.17008868179768524, + "flos": 87188376075840.0, + "grad_norm": 0.7531322619702533, + "language_loss": 0.56475008, + "learning_rate": 3.797643101661336e-06, + "loss": 0.58527935, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 0.34350586, + "router_z_loss_mlp": 0.0222168, + "step": 2829, + "time_per_iteration": 3.33809494972229 + }, + { + "auxiliary_loss_clip": 0.01151104, + "auxiliary_loss_mlp": 0.01044887, + "balance_loss_clip": 1.04956388, + "balance_loss_mlp": 1.02756631, + "epoch": 0.17014880505035324, + "flos": 30383710061760.0, + "grad_norm": 1.8523067103759352, + "language_loss": 0.83314312, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85510302, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 1.01464844, + "router_z_loss_mlp": 0.17333984, + "step": 2830, + "time_per_iteration": 2.7118217945098877 + }, + { + "auxiliary_loss_clip": 0.01152132, + "auxiliary_loss_mlp": 0.01040316, + "balance_loss_clip": 1.05075669, + "balance_loss_mlp": 1.02241051, + "epoch": 0.1702089283030212, + "flos": 36438842697120.0, + "grad_norm": 2.670861565729344, + "language_loss": 0.78809178, + "learning_rate": 3.797301551737529e-06, + "loss": 0.81001627, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.17919922, + "step": 2831, + "time_per_iteration": 2.917011022567749 + }, + { + "auxiliary_loss_clip": 0.0115536, + "auxiliary_loss_mlp": 0.01045102, + "balance_loss_clip": 1.05111909, + "balance_loss_mlp": 1.02705348, + "epoch": 0.17026905155568917, + "flos": 21651698139840.0, + "grad_norm": 2.351038183745423, + "language_loss": 0.79785824, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81986278, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 1.04199219, + "router_z_loss_mlp": 0.18054199, + "step": 2832, + "time_per_iteration": 2.6453425884246826 + }, + { + "auxiliary_loss_clip": 0.01155398, + "auxiliary_loss_mlp": 0.0104753, + "balance_loss_clip": 1.05180633, + "balance_loss_mlp": 1.02986312, + "epoch": 0.17032917480835713, + "flos": 28246242153600.0, + "grad_norm": 1.765087138978658, + "language_loss": 0.88791513, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.90994442, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 1.03515625, + "router_z_loss_mlp": 0.17663574, + "step": 2833, + "time_per_iteration": 2.690089225769043 + }, + { + "auxiliary_loss_clip": 0.01151672, + "auxiliary_loss_mlp": 0.01044503, + "balance_loss_clip": 1.05019307, + "balance_loss_mlp": 1.02787364, + "epoch": 0.1703892980610251, + "flos": 47836429584480.0, + "grad_norm": 2.206892780793721, + "language_loss": 0.72277993, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74474168, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.16638184, + "step": 2834, + "time_per_iteration": 2.8360376358032227 + }, + { + "auxiliary_loss_clip": 0.01155104, + "auxiliary_loss_mlp": 0.01051721, + "balance_loss_clip": 1.05296528, + "balance_loss_mlp": 1.0357945, + "epoch": 0.17044942131369306, + "flos": 28112554180800.0, + "grad_norm": 2.2370629347070863, + "language_loss": 0.8605929, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.88266122, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 1.02148438, + "router_z_loss_mlp": 0.15942383, + "step": 2835, + "time_per_iteration": 2.6409223079681396 + }, + { + "auxiliary_loss_clip": 0.01157577, + "auxiliary_loss_mlp": 0.01050941, + "balance_loss_clip": 1.0514307, + "balance_loss_mlp": 1.03146267, + "epoch": 0.17050954456636103, + "flos": 20810315409120.0, + "grad_norm": 4.260771453422608, + "language_loss": 0.73763287, + "learning_rate": 3.796446484348989e-06, + "loss": 0.75971812, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 1.06103516, + "router_z_loss_mlp": 0.19494629, + "step": 2836, + "time_per_iteration": 2.645354986190796 + }, + { + "auxiliary_loss_clip": 0.01156122, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.04969013, + "balance_loss_mlp": 1.02173042, + "epoch": 0.17056966781902902, + "flos": 20544195499200.0, + "grad_norm": 3.060705191671431, + "language_loss": 0.79995352, + "learning_rate": 3.796275266481036e-06, + "loss": 0.82192659, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 1.06640625, + "router_z_loss_mlp": 0.19458008, + "step": 2837, + "time_per_iteration": 2.6125261783599854 + }, + { + "auxiliary_loss_clip": 0.01152842, + "auxiliary_loss_mlp": 0.01042601, + "balance_loss_clip": 1.05421638, + "balance_loss_mlp": 1.02579236, + "epoch": 0.17062979107169698, + "flos": 21612645832320.0, + "grad_norm": 1.7390298527811883, + "language_loss": 0.83406472, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85601914, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 0.98535156, + "router_z_loss_mlp": 0.16809082, + "step": 2838, + "time_per_iteration": 2.6410722732543945 + }, + { + "auxiliary_loss_clip": 0.01149202, + "auxiliary_loss_mlp": 0.0104523, + "balance_loss_clip": 1.05000925, + "balance_loss_mlp": 1.02817142, + "epoch": 0.17068991432436495, + "flos": 27485557143840.0, + "grad_norm": 1.9633881928034274, + "language_loss": 0.93345881, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95540309, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.17053223, + "step": 2839, + "time_per_iteration": 2.6807315349578857 + }, + { + "auxiliary_loss_clip": 0.01154468, + "auxiliary_loss_mlp": 0.01041933, + "balance_loss_clip": 1.05131912, + "balance_loss_mlp": 1.02331221, + "epoch": 0.17075003757703291, + "flos": 31583012158080.0, + "grad_norm": 1.9209172214656438, + "language_loss": 0.83386892, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.85583299, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 1.03222656, + "router_z_loss_mlp": 0.18603516, + "step": 2840, + "time_per_iteration": 2.7194509506225586 + }, + { + "auxiliary_loss_clip": 0.01157081, + "auxiliary_loss_mlp": 0.01049768, + "balance_loss_clip": 1.05322099, + "balance_loss_mlp": 1.03140962, + "epoch": 0.17081016082970088, + "flos": 24551269162560.0, + "grad_norm": 1.7796395852014266, + "language_loss": 0.76042062, + "learning_rate": 3.79558971392481e-06, + "loss": 0.78248912, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.18359375, + "step": 2841, + "time_per_iteration": 2.652498960494995 + }, + { + "auxiliary_loss_clip": 0.01153461, + "auxiliary_loss_mlp": 0.01047019, + "balance_loss_clip": 1.05108094, + "balance_loss_mlp": 1.02960217, + "epoch": 0.17087028408236885, + "flos": 30427867546560.0, + "grad_norm": 2.0894568954092976, + "language_loss": 0.77468497, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79668975, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 1.02392578, + "router_z_loss_mlp": 0.1739502, + "step": 2842, + "time_per_iteration": 2.6974897384643555 + }, + { + "auxiliary_loss_clip": 0.01149689, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.05107713, + "balance_loss_mlp": 1.02275181, + "epoch": 0.17093040733503684, + "flos": 23254738295040.0, + "grad_norm": 1.9646316368713328, + "language_loss": 0.8555541, + "learning_rate": 3.795246529087043e-06, + "loss": 0.87745029, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.17175293, + "step": 2843, + "time_per_iteration": 2.652688980102539 + }, + { + "auxiliary_loss_clip": 0.01151676, + "auxiliary_loss_mlp": 0.01043293, + "balance_loss_clip": 1.05249691, + "balance_loss_mlp": 1.02655554, + "epoch": 0.1709905305877048, + "flos": 15958374528960.0, + "grad_norm": 1.707764416952809, + "language_loss": 0.68247294, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.70442259, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.16723633, + "step": 2844, + "time_per_iteration": 2.8533129692077637 + }, + { + "auxiliary_loss_clip": 0.01151621, + "auxiliary_loss_mlp": 0.01049139, + "balance_loss_clip": 1.05125904, + "balance_loss_mlp": 1.03101897, + "epoch": 0.17105065384037277, + "flos": 23439066517440.0, + "grad_norm": 1.7458602372869467, + "language_loss": 0.78306556, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.8050732, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 1.00390625, + "router_z_loss_mlp": 0.18127441, + "step": 2845, + "time_per_iteration": 2.669294595718384 + }, + { + "auxiliary_loss_clip": 0.01153044, + "auxiliary_loss_mlp": 0.01043991, + "balance_loss_clip": 1.05224121, + "balance_loss_mlp": 1.02771938, + "epoch": 0.17111077709304073, + "flos": 22592092402080.0, + "grad_norm": 2.654492924463593, + "language_loss": 0.78535843, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.80732882, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.16271973, + "step": 2846, + "time_per_iteration": 2.6329495906829834 + }, + { + "auxiliary_loss_clip": 0.01151998, + "auxiliary_loss_mlp": 0.01040268, + "balance_loss_clip": 1.05172324, + "balance_loss_mlp": 1.02319753, + "epoch": 0.1711709003457087, + "flos": 30606239728800.0, + "grad_norm": 1.6966628554272498, + "language_loss": 0.80188596, + "learning_rate": 3.794559342552472e-06, + "loss": 0.82380867, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 1.00244141, + "router_z_loss_mlp": 0.17053223, + "step": 2847, + "time_per_iteration": 2.7370214462280273 + }, + { + "auxiliary_loss_clip": 0.01151808, + "auxiliary_loss_mlp": 0.01046774, + "balance_loss_clip": 1.04756522, + "balance_loss_mlp": 1.028952, + "epoch": 0.17123102359837666, + "flos": 21434314167360.0, + "grad_norm": 2.7622475606577788, + "language_loss": 0.86858749, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.89057332, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.17834473, + "step": 2848, + "time_per_iteration": 2.6100502014160156 + }, + { + "auxiliary_loss_clip": 0.011528, + "auxiliary_loss_mlp": 0.01046667, + "balance_loss_clip": 1.05155122, + "balance_loss_mlp": 1.02867866, + "epoch": 0.17129114685104463, + "flos": 31937852210400.0, + "grad_norm": 1.9969102287232738, + "language_loss": 0.75300324, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77499795, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.17980957, + "step": 2849, + "time_per_iteration": 2.677119255065918 + }, + { + "auxiliary_loss_clip": 0.01048459, + "auxiliary_loss_mlp": 0.01001918, + "balance_loss_clip": 1.013942, + "balance_loss_mlp": 0.99978411, + "epoch": 0.17135127010371262, + "flos": 84523044214080.0, + "grad_norm": 0.7882204056099534, + "language_loss": 0.57540572, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59590948, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 0.3449707, + "router_z_loss_mlp": 0.0213623, + "step": 2850, + "time_per_iteration": 3.277456521987915 + }, + { + "auxiliary_loss_clip": 0.01148496, + "auxiliary_loss_mlp": 0.0103948, + "balance_loss_clip": 1.05159009, + "balance_loss_mlp": 1.0232203, + "epoch": 0.1714113933563806, + "flos": 28736107248960.0, + "grad_norm": 3.48273520630691, + "language_loss": 0.80698764, + "learning_rate": 3.793871067220031e-06, + "loss": 0.82886744, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.16271973, + "step": 2851, + "time_per_iteration": 2.6924314498901367 + }, + { + "auxiliary_loss_clip": 0.01151564, + "auxiliary_loss_mlp": 0.01040674, + "balance_loss_clip": 1.05248678, + "balance_loss_mlp": 1.02436638, + "epoch": 0.17147151660904855, + "flos": 25798051160640.0, + "grad_norm": 2.068995142802405, + "language_loss": 0.92867273, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.95059514, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.16296387, + "step": 2852, + "time_per_iteration": 2.7030768394470215 + }, + { + "auxiliary_loss_clip": 0.01152207, + "auxiliary_loss_mlp": 0.01048215, + "balance_loss_clip": 1.05002534, + "balance_loss_mlp": 1.03110838, + "epoch": 0.17153163986171652, + "flos": 22725942444000.0, + "grad_norm": 2.057692942057177, + "language_loss": 0.69067591, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.7126801, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 1.02148438, + "router_z_loss_mlp": 0.17102051, + "step": 2853, + "time_per_iteration": 2.6377415657043457 + }, + { + "auxiliary_loss_clip": 0.0115615, + "auxiliary_loss_mlp": 0.0105016, + "balance_loss_clip": 1.05351686, + "balance_loss_mlp": 1.03337502, + "epoch": 0.17159176311438448, + "flos": 22235874762240.0, + "grad_norm": 2.1116110284462613, + "language_loss": 0.6633603, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.68542343, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 1.02587891, + "router_z_loss_mlp": 0.16784668, + "step": 2854, + "time_per_iteration": 2.6439337730407715 + }, + { + "auxiliary_loss_clip": 0.01151832, + "auxiliary_loss_mlp": 0.01043675, + "balance_loss_clip": 1.0521847, + "balance_loss_mlp": 1.02764153, + "epoch": 0.17165188636705245, + "flos": 25305917097600.0, + "grad_norm": 1.6375897689905523, + "language_loss": 0.89445204, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91640711, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 0.99658203, + "router_z_loss_mlp": 0.16033936, + "step": 2855, + "time_per_iteration": 2.697946548461914 + }, + { + "auxiliary_loss_clip": 0.01154607, + "auxiliary_loss_mlp": 0.01042946, + "balance_loss_clip": 1.05301285, + "balance_loss_mlp": 1.02713919, + "epoch": 0.17171200961972044, + "flos": 30383223854400.0, + "grad_norm": 2.04763654419481, + "language_loss": 0.83665323, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.85862875, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 1.01611328, + "router_z_loss_mlp": 0.15808105, + "step": 2856, + "time_per_iteration": 2.717097043991089 + }, + { + "auxiliary_loss_clip": 0.01156012, + "auxiliary_loss_mlp": 0.01047094, + "balance_loss_clip": 1.05473733, + "balance_loss_mlp": 1.03001165, + "epoch": 0.1717721328723884, + "flos": 24595831820160.0, + "grad_norm": 1.9430177543695146, + "language_loss": 0.86484241, + "learning_rate": 3.792836613639026e-06, + "loss": 0.88687342, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.1706543, + "step": 2857, + "time_per_iteration": 4.3347227573394775 + }, + { + "auxiliary_loss_clip": 0.01155314, + "auxiliary_loss_mlp": 0.01053387, + "balance_loss_clip": 1.05358624, + "balance_loss_mlp": 1.03574991, + "epoch": 0.17183225612505637, + "flos": 28506689644320.0, + "grad_norm": 2.0343887607260043, + "language_loss": 0.77812648, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80021346, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.17645264, + "step": 2858, + "time_per_iteration": 4.055436849594116 + }, + { + "auxiliary_loss_clip": 0.0116304, + "auxiliary_loss_mlp": 0.01051784, + "balance_loss_clip": 1.05327034, + "balance_loss_mlp": 1.03260362, + "epoch": 0.17189237937772434, + "flos": 22102794548640.0, + "grad_norm": 2.006505255499478, + "language_loss": 0.77124405, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.7933923, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 1.09912109, + "router_z_loss_mlp": 0.19177246, + "step": 2859, + "time_per_iteration": 2.6560826301574707 + }, + { + "auxiliary_loss_clip": 0.01154472, + "auxiliary_loss_mlp": 0.01042997, + "balance_loss_clip": 1.05473876, + "balance_loss_mlp": 1.02529407, + "epoch": 0.1719525026303923, + "flos": 28380132712800.0, + "grad_norm": 2.060150881701003, + "language_loss": 0.77083075, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79280543, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.17687988, + "step": 2860, + "time_per_iteration": 2.677035331726074 + }, + { + "auxiliary_loss_clip": 0.01155656, + "auxiliary_loss_mlp": 0.01042338, + "balance_loss_clip": 1.05270112, + "balance_loss_mlp": 1.0259347, + "epoch": 0.17201262588306027, + "flos": 25393219135200.0, + "grad_norm": 2.3256274874035214, + "language_loss": 0.81771338, + "learning_rate": 3.792145618140317e-06, + "loss": 0.83969331, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 1.02832031, + "router_z_loss_mlp": 0.16394043, + "step": 2861, + "time_per_iteration": 4.071540594100952 + }, + { + "auxiliary_loss_clip": 0.01156603, + "auxiliary_loss_mlp": 0.01047042, + "balance_loss_clip": 1.05497468, + "balance_loss_mlp": 1.03075767, + "epoch": 0.17207274913572823, + "flos": 24862640523840.0, + "grad_norm": 2.8260024055252826, + "language_loss": 0.8590188, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.88105524, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 1.01611328, + "router_z_loss_mlp": 0.1628418, + "step": 2862, + "time_per_iteration": 4.074292898178101 + }, + { + "auxiliary_loss_clip": 0.01149493, + "auxiliary_loss_mlp": 0.01043337, + "balance_loss_clip": 1.0513072, + "balance_loss_mlp": 1.02757716, + "epoch": 0.17213287238839622, + "flos": 32699063944800.0, + "grad_norm": 4.308926243738506, + "language_loss": 0.77826864, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80019689, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.15759277, + "step": 2863, + "time_per_iteration": 2.752502679824829 + }, + { + "auxiliary_loss_clip": 0.01150767, + "auxiliary_loss_mlp": 0.01042267, + "balance_loss_clip": 1.05124235, + "balance_loss_mlp": 1.026281, + "epoch": 0.1721929956410642, + "flos": 31763288652480.0, + "grad_norm": 1.6277222340191895, + "language_loss": 0.72509801, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74702835, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.15991211, + "step": 2864, + "time_per_iteration": 2.702301502227783 + }, + { + "auxiliary_loss_clip": 0.01157127, + "auxiliary_loss_mlp": 0.01052226, + "balance_loss_clip": 1.05391479, + "balance_loss_mlp": 1.03610885, + "epoch": 0.17225311889373215, + "flos": 27177791820480.0, + "grad_norm": 1.8397197337501487, + "language_loss": 0.73154664, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.75364017, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.16113281, + "step": 2865, + "time_per_iteration": 2.696117639541626 + }, + { + "auxiliary_loss_clip": 0.01158264, + "auxiliary_loss_mlp": 0.01044851, + "balance_loss_clip": 1.05703497, + "balance_loss_mlp": 1.02817345, + "epoch": 0.17231324214640012, + "flos": 25975450928160.0, + "grad_norm": 2.2749452056020174, + "language_loss": 0.78387445, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.80590558, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 1.01318359, + "router_z_loss_mlp": 0.16687012, + "step": 2866, + "time_per_iteration": 2.6639485359191895 + }, + { + "auxiliary_loss_clip": 0.01156536, + "auxiliary_loss_mlp": 0.01047951, + "balance_loss_clip": 1.05465364, + "balance_loss_mlp": 1.03022492, + "epoch": 0.17237336539906808, + "flos": 24014572441920.0, + "grad_norm": 1.8832111018938755, + "language_loss": 0.79706675, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.81911159, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 1.01904297, + "router_z_loss_mlp": 0.17724609, + "step": 2867, + "time_per_iteration": 2.7119228839874268 + }, + { + "auxiliary_loss_clip": 0.01153009, + "auxiliary_loss_mlp": 0.01039562, + "balance_loss_clip": 1.0517838, + "balance_loss_mlp": 1.02249169, + "epoch": 0.17243348865173605, + "flos": 21389508406080.0, + "grad_norm": 3.688061153559696, + "language_loss": 0.79575175, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.81767744, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.1706543, + "step": 2868, + "time_per_iteration": 2.852015733718872 + }, + { + "auxiliary_loss_clip": 0.01161908, + "auxiliary_loss_mlp": 0.01039083, + "balance_loss_clip": 1.05719328, + "balance_loss_mlp": 1.02327609, + "epoch": 0.17249361190440402, + "flos": 22280518454400.0, + "grad_norm": 2.8209547462371236, + "language_loss": 0.83699942, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.85900927, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 1.04785156, + "router_z_loss_mlp": 0.15795898, + "step": 2869, + "time_per_iteration": 2.6852643489837646 + }, + { + "auxiliary_loss_clip": 0.01156087, + "auxiliary_loss_mlp": 0.01050653, + "balance_loss_clip": 1.05290794, + "balance_loss_mlp": 1.03277183, + "epoch": 0.172553735157072, + "flos": 25837427606400.0, + "grad_norm": 2.307565075451072, + "language_loss": 0.77333736, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79540473, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 1.03173828, + "router_z_loss_mlp": 0.17871094, + "step": 2870, + "time_per_iteration": 2.6622302532196045 + }, + { + "auxiliary_loss_clip": 0.0115031, + "auxiliary_loss_mlp": 0.01038174, + "balance_loss_clip": 1.05511415, + "balance_loss_mlp": 1.02335596, + "epoch": 0.17261385840973997, + "flos": 27177994406880.0, + "grad_norm": 1.8046036761846431, + "language_loss": 0.76891083, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.79079562, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.14819336, + "step": 2871, + "time_per_iteration": 2.6590545177459717 + }, + { + "auxiliary_loss_clip": 0.01155847, + "auxiliary_loss_mlp": 0.01042424, + "balance_loss_clip": 1.05343771, + "balance_loss_mlp": 1.02521014, + "epoch": 0.17267398166240794, + "flos": 34071876149760.0, + "grad_norm": 2.6383292863641903, + "language_loss": 0.74048191, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76246464, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 1.02392578, + "router_z_loss_mlp": 0.17199707, + "step": 2872, + "time_per_iteration": 2.825721025466919 + }, + { + "auxiliary_loss_clip": 0.01149753, + "auxiliary_loss_mlp": 0.01039398, + "balance_loss_clip": 1.05223703, + "balance_loss_mlp": 1.02299476, + "epoch": 0.1727341049150759, + "flos": 26777538247680.0, + "grad_norm": 2.805550038371508, + "language_loss": 0.82330549, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84519696, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.1640625, + "step": 2873, + "time_per_iteration": 2.6724841594696045 + }, + { + "auxiliary_loss_clip": 0.01151847, + "auxiliary_loss_mlp": 0.01042497, + "balance_loss_clip": 1.05134988, + "balance_loss_mlp": 1.02449608, + "epoch": 0.17279422816774387, + "flos": 22095622990080.0, + "grad_norm": 2.424181376151429, + "language_loss": 0.74738258, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.76932603, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.18005371, + "step": 2874, + "time_per_iteration": 2.678284168243408 + }, + { + "auxiliary_loss_clip": 0.0115602, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_clip": 1.0534718, + "balance_loss_mlp": 1.02710843, + "epoch": 0.17285435142041183, + "flos": 26643728723040.0, + "grad_norm": 9.151454670243911, + "language_loss": 0.81046969, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.8324843, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 1.02490234, + "router_z_loss_mlp": 0.18347168, + "step": 2875, + "time_per_iteration": 2.66799259185791 + }, + { + "auxiliary_loss_clip": 0.0115814, + "auxiliary_loss_mlp": 0.01047927, + "balance_loss_clip": 1.05434799, + "balance_loss_mlp": 1.0298667, + "epoch": 0.17291447467307983, + "flos": 22413112460640.0, + "grad_norm": 2.3237879180323637, + "language_loss": 0.87560165, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.89766228, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.18054199, + "step": 2876, + "time_per_iteration": 2.6771798133850098 + }, + { + "auxiliary_loss_clip": 0.0115395, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.0555774, + "balance_loss_mlp": 1.02202988, + "epoch": 0.1729745979257478, + "flos": 22725658823040.0, + "grad_norm": 1.9607534471719228, + "language_loss": 0.84745395, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86937535, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.16174316, + "step": 2877, + "time_per_iteration": 2.652008056640625 + }, + { + "auxiliary_loss_clip": 0.01155078, + "auxiliary_loss_mlp": 0.01040423, + "balance_loss_clip": 1.05536699, + "balance_loss_mlp": 1.02333987, + "epoch": 0.17303472117841576, + "flos": 28063088932320.0, + "grad_norm": 2.0819983118630456, + "language_loss": 0.79104531, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.81300032, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.17089844, + "step": 2878, + "time_per_iteration": 2.7201998233795166 + }, + { + "auxiliary_loss_clip": 0.01153013, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.05415237, + "balance_loss_mlp": 1.02522528, + "epoch": 0.17309484443108372, + "flos": 31316284488960.0, + "grad_norm": 1.9143540290614276, + "language_loss": 0.7059586, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72790515, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.1640625, + "step": 2879, + "time_per_iteration": 2.7054574489593506 + }, + { + "auxiliary_loss_clip": 0.01152985, + "auxiliary_loss_mlp": 0.0104236, + "balance_loss_clip": 1.05257452, + "balance_loss_mlp": 1.02586174, + "epoch": 0.1731549676837517, + "flos": 16536757180320.0, + "grad_norm": 3.4413195951172795, + "language_loss": 0.83219194, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85414541, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 1.00439453, + "router_z_loss_mlp": 0.16491699, + "step": 2880, + "time_per_iteration": 2.797672748565674 + }, + { + "auxiliary_loss_clip": 0.01158749, + "auxiliary_loss_mlp": 0.01040986, + "balance_loss_clip": 1.05540681, + "balance_loss_mlp": 1.02374792, + "epoch": 0.17321509093641965, + "flos": 27890389169280.0, + "grad_norm": 3.9620479473811936, + "language_loss": 0.81083274, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.83283007, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.17236328, + "step": 2881, + "time_per_iteration": 2.8245913982391357 + }, + { + "auxiliary_loss_clip": 0.01153957, + "auxiliary_loss_mlp": 0.01042372, + "balance_loss_clip": 1.0549891, + "balance_loss_mlp": 1.0269587, + "epoch": 0.17327521418908762, + "flos": 29715594336000.0, + "grad_norm": 2.812067003646865, + "language_loss": 0.77307963, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79504287, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 0.1541748, + "step": 2882, + "time_per_iteration": 2.719599485397339 + }, + { + "auxiliary_loss_clip": 0.01155983, + "auxiliary_loss_mlp": 0.01047817, + "balance_loss_clip": 1.05753517, + "balance_loss_mlp": 1.03147924, + "epoch": 0.1733353374417556, + "flos": 40396856284800.0, + "grad_norm": 2.945562442964791, + "language_loss": 0.76309383, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78513181, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.16314697, + "step": 2883, + "time_per_iteration": 2.811089038848877 + }, + { + "auxiliary_loss_clip": 0.01151705, + "auxiliary_loss_mlp": 0.01040558, + "balance_loss_clip": 1.05318379, + "balance_loss_mlp": 1.02469134, + "epoch": 0.17339546069442358, + "flos": 29759751820800.0, + "grad_norm": 2.035667889087317, + "language_loss": 0.85509998, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.87702262, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.15869141, + "step": 2884, + "time_per_iteration": 2.6979196071624756 + }, + { + "auxiliary_loss_clip": 0.0115518, + "auxiliary_loss_mlp": 0.01042783, + "balance_loss_clip": 1.05530179, + "balance_loss_mlp": 1.02684438, + "epoch": 0.17345558394709154, + "flos": 33500138332320.0, + "grad_norm": 1.8021609728274133, + "language_loss": 0.74065709, + "learning_rate": 3.787976825866055e-06, + "loss": 0.76263666, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.15924072, + "step": 2885, + "time_per_iteration": 2.7193801403045654 + }, + { + "auxiliary_loss_clip": 0.01153665, + "auxiliary_loss_mlp": 0.0104094, + "balance_loss_clip": 1.05797601, + "balance_loss_mlp": 1.02576447, + "epoch": 0.1735157071997595, + "flos": 30117428082720.0, + "grad_norm": 1.7906944157930413, + "language_loss": 0.70625114, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.72819722, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.1517334, + "step": 2886, + "time_per_iteration": 2.698934316635132 + }, + { + "auxiliary_loss_clip": 0.01152844, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.05124259, + "balance_loss_mlp": 1.01877928, + "epoch": 0.17357583045242747, + "flos": 26464991885280.0, + "grad_norm": 2.033065619515811, + "language_loss": 0.69234133, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.71422565, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 1.01708984, + "router_z_loss_mlp": 0.16809082, + "step": 2887, + "time_per_iteration": 2.6713368892669678 + }, + { + "auxiliary_loss_clip": 0.0115489, + "auxiliary_loss_mlp": 0.01040883, + "balance_loss_clip": 1.05494142, + "balance_loss_mlp": 1.02549291, + "epoch": 0.17363595370509544, + "flos": 18761243505120.0, + "grad_norm": 1.7300366359415809, + "language_loss": 0.84858286, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87054062, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.15380859, + "step": 2888, + "time_per_iteration": 2.638084650039673 + }, + { + "auxiliary_loss_clip": 0.01153569, + "auxiliary_loss_mlp": 0.01042599, + "balance_loss_clip": 1.05393672, + "balance_loss_mlp": 1.02602899, + "epoch": 0.1736960769577634, + "flos": 28602864966240.0, + "grad_norm": 2.7792504925117294, + "language_loss": 0.78213632, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.80409801, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 0.99511719, + "router_z_loss_mlp": 0.16577148, + "step": 2889, + "time_per_iteration": 2.7241733074188232 + }, + { + "auxiliary_loss_clip": 0.01153441, + "auxiliary_loss_mlp": 0.01041903, + "balance_loss_clip": 1.05866098, + "balance_loss_mlp": 1.02536821, + "epoch": 0.1737562002104314, + "flos": 22680772027200.0, + "grad_norm": 2.2377226982140077, + "language_loss": 0.84070694, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.86266041, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.16516113, + "step": 2890, + "time_per_iteration": 2.6666531562805176 + }, + { + "auxiliary_loss_clip": 0.01159212, + "auxiliary_loss_mlp": 0.0104152, + "balance_loss_clip": 1.05805802, + "balance_loss_mlp": 1.02491379, + "epoch": 0.17381632346309936, + "flos": 19520469892800.0, + "grad_norm": 2.0576208314435833, + "language_loss": 0.82382709, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84583437, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.16601562, + "step": 2891, + "time_per_iteration": 2.6601550579071045 + }, + { + "auxiliary_loss_clip": 0.01156731, + "auxiliary_loss_mlp": 0.01041171, + "balance_loss_clip": 1.05405629, + "balance_loss_mlp": 1.02354002, + "epoch": 0.17387644671576732, + "flos": 16313417167680.0, + "grad_norm": 2.3836386581451303, + "language_loss": 0.8135252, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.83550417, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 1.02685547, + "router_z_loss_mlp": 0.1763916, + "step": 2892, + "time_per_iteration": 2.6095492839813232 + }, + { + "auxiliary_loss_clip": 0.01158387, + "auxiliary_loss_mlp": 0.01047088, + "balance_loss_clip": 1.05677235, + "balance_loss_mlp": 1.02927792, + "epoch": 0.1739365699684353, + "flos": 32476696346880.0, + "grad_norm": 2.172625488323678, + "language_loss": 0.74225497, + "learning_rate": 3.786578545502627e-06, + "loss": 0.76430976, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 1.01611328, + "router_z_loss_mlp": 0.17810059, + "step": 2893, + "time_per_iteration": 2.886096239089966 + }, + { + "auxiliary_loss_clip": 0.0115353, + "auxiliary_loss_mlp": 0.01037552, + "balance_loss_clip": 1.05289495, + "balance_loss_mlp": 1.02082705, + "epoch": 0.17399669322110325, + "flos": 28513820685600.0, + "grad_norm": 2.056843682436563, + "language_loss": 0.82530999, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.84722078, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.16723633, + "step": 2894, + "time_per_iteration": 2.755770206451416 + }, + { + "auxiliary_loss_clip": 0.01154842, + "auxiliary_loss_mlp": 0.01040037, + "balance_loss_clip": 1.05445862, + "balance_loss_mlp": 1.02124941, + "epoch": 0.17405681647377122, + "flos": 26910577944000.0, + "grad_norm": 8.135633852980808, + "language_loss": 0.74301088, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76495969, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.18786621, + "step": 2895, + "time_per_iteration": 2.6477956771850586 + }, + { + "auxiliary_loss_clip": 0.01079467, + "auxiliary_loss_mlp": 0.01003862, + "balance_loss_clip": 1.04509711, + "balance_loss_mlp": 1.00200677, + "epoch": 0.1741169397264392, + "flos": 74990444112000.0, + "grad_norm": 0.865386300712343, + "language_loss": 0.62780559, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.6486389, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 0.34350586, + "router_z_loss_mlp": 0.01852417, + "step": 2896, + "time_per_iteration": 3.347667932510376 + }, + { + "auxiliary_loss_clip": 0.01151134, + "auxiliary_loss_mlp": 0.01035693, + "balance_loss_clip": 1.0502398, + "balance_loss_mlp": 1.01894391, + "epoch": 0.17417706297910718, + "flos": 32967371787840.0, + "grad_norm": 1.8451394476177698, + "language_loss": 0.75753188, + "learning_rate": 3.785877779175034e-06, + "loss": 0.77940023, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 1.00927734, + "router_z_loss_mlp": 0.1673584, + "step": 2897, + "time_per_iteration": 5.601726531982422 + }, + { + "auxiliary_loss_clip": 0.01150737, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.05365574, + "balance_loss_mlp": 1.01817846, + "epoch": 0.17423718623177514, + "flos": 40888706726880.0, + "grad_norm": 1.7807552376602367, + "language_loss": 0.69137555, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71323013, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.16522217, + "step": 2898, + "time_per_iteration": 2.761599063873291 + }, + { + "auxiliary_loss_clip": 0.01161497, + "auxiliary_loss_mlp": 0.01038768, + "balance_loss_clip": 1.05736518, + "balance_loss_mlp": 1.02220953, + "epoch": 0.1742973094844431, + "flos": 33188321280960.0, + "grad_norm": 2.7521902224544275, + "language_loss": 0.75906193, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78106463, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 1.04003906, + "router_z_loss_mlp": 0.16577148, + "step": 2899, + "time_per_iteration": 2.7659575939178467 + }, + { + "auxiliary_loss_clip": 0.01151648, + "auxiliary_loss_mlp": 0.01039059, + "balance_loss_clip": 1.05367279, + "balance_loss_mlp": 1.02209568, + "epoch": 0.17435743273711107, + "flos": 27711854917920.0, + "grad_norm": 1.9687739205903905, + "language_loss": 0.72403103, + "learning_rate": 3.785351493339121e-06, + "loss": 0.74593812, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.16943359, + "step": 2900, + "time_per_iteration": 4.2164061069488525 + }, + { + "auxiliary_loss_clip": 0.01155736, + "auxiliary_loss_mlp": 0.01046317, + "balance_loss_clip": 1.05565667, + "balance_loss_mlp": 1.03079534, + "epoch": 0.17441755598977904, + "flos": 50818724192160.0, + "grad_norm": 1.6947855179081783, + "language_loss": 0.69762349, + "learning_rate": 3.785175929316863e-06, + "loss": 0.71964401, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.15515137, + "step": 2901, + "time_per_iteration": 2.845201253890991 + }, + { + "auxiliary_loss_clip": 0.01157475, + "auxiliary_loss_mlp": 0.01043406, + "balance_loss_clip": 1.05554032, + "balance_loss_mlp": 1.02720559, + "epoch": 0.174477679242447, + "flos": 32075916049440.0, + "grad_norm": 1.7638037559806181, + "language_loss": 0.76183426, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78384304, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.16186523, + "step": 2902, + "time_per_iteration": 4.16296911239624 + }, + { + "auxiliary_loss_clip": 0.01153368, + "auxiliary_loss_mlp": 0.01046691, + "balance_loss_clip": 1.05295312, + "balance_loss_mlp": 1.0307169, + "epoch": 0.174537802495115, + "flos": 21790653359040.0, + "grad_norm": 2.116231885976131, + "language_loss": 0.82121181, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.84321237, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 1.00292969, + "router_z_loss_mlp": 0.15991211, + "step": 2903, + "time_per_iteration": 2.6692991256713867 + }, + { + "auxiliary_loss_clip": 0.01154477, + "auxiliary_loss_mlp": 0.01039688, + "balance_loss_clip": 1.05534124, + "balance_loss_mlp": 1.02328444, + "epoch": 0.17459792574778296, + "flos": 20671319672640.0, + "grad_norm": 1.8767367453112453, + "language_loss": 0.73389018, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75583184, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.1640625, + "step": 2904, + "time_per_iteration": 2.9143567085266113 + }, + { + "auxiliary_loss_clip": 0.01153121, + "auxiliary_loss_mlp": 0.01038088, + "balance_loss_clip": 1.0534029, + "balance_loss_mlp": 1.02250743, + "epoch": 0.17465804900045093, + "flos": 31139978688000.0, + "grad_norm": 2.0248868539414278, + "language_loss": 0.64251637, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.66442841, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.15588379, + "step": 2905, + "time_per_iteration": 2.722511053085327 + }, + { + "auxiliary_loss_clip": 0.01164791, + "auxiliary_loss_mlp": 0.01045691, + "balance_loss_clip": 1.05810058, + "balance_loss_mlp": 1.02852499, + "epoch": 0.1747181722531189, + "flos": 29443761489600.0, + "grad_norm": 1.942046607311452, + "language_loss": 0.79259336, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81469822, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 1.06689453, + "router_z_loss_mlp": 0.17175293, + "step": 2906, + "time_per_iteration": 2.656301259994507 + }, + { + "auxiliary_loss_clip": 0.01158449, + "auxiliary_loss_mlp": 0.01048592, + "balance_loss_clip": 1.05759633, + "balance_loss_mlp": 1.03214097, + "epoch": 0.17477829550578686, + "flos": 21656924868960.0, + "grad_norm": 1.9087745298009333, + "language_loss": 0.81056714, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83263755, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 1.00830078, + "router_z_loss_mlp": 0.16455078, + "step": 2907, + "time_per_iteration": 2.6544253826141357 + }, + { + "auxiliary_loss_clip": 0.01157905, + "auxiliary_loss_mlp": 0.01047046, + "balance_loss_clip": 1.05599439, + "balance_loss_mlp": 1.03050017, + "epoch": 0.17483841875845482, + "flos": 18763066782720.0, + "grad_norm": 2.2902592889378965, + "language_loss": 0.81153262, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83358204, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 1.01904297, + "router_z_loss_mlp": 0.16552734, + "step": 2908, + "time_per_iteration": 2.6252081394195557 + }, + { + "auxiliary_loss_clip": 0.01156117, + "auxiliary_loss_mlp": 0.01045828, + "balance_loss_clip": 1.05579996, + "balance_loss_mlp": 1.02924645, + "epoch": 0.17489854201112282, + "flos": 20942544759840.0, + "grad_norm": 2.442747283374494, + "language_loss": 0.8097055, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.83172494, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 1.00439453, + "router_z_loss_mlp": 0.16589355, + "step": 2909, + "time_per_iteration": 2.670088052749634 + }, + { + "auxiliary_loss_clip": 0.01159545, + "auxiliary_loss_mlp": 0.01046572, + "balance_loss_clip": 1.05667853, + "balance_loss_mlp": 1.02953744, + "epoch": 0.17495866526379078, + "flos": 24105399482880.0, + "grad_norm": 1.7649645701757095, + "language_loss": 0.76768655, + "learning_rate": 3.783592807684017e-06, + "loss": 0.78974771, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 1.02832031, + "router_z_loss_mlp": 0.17053223, + "step": 2910, + "time_per_iteration": 2.7660956382751465 + }, + { + "auxiliary_loss_clip": 0.01158372, + "auxiliary_loss_mlp": 0.01043763, + "balance_loss_clip": 1.05611372, + "balance_loss_mlp": 1.02637041, + "epoch": 0.17501878851645875, + "flos": 34790024365920.0, + "grad_norm": 1.971706718026025, + "language_loss": 0.8734532, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89547461, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 1.02392578, + "router_z_loss_mlp": 0.1739502, + "step": 2911, + "time_per_iteration": 2.7218291759490967 + }, + { + "auxiliary_loss_clip": 0.0115256, + "auxiliary_loss_mlp": 0.01047391, + "balance_loss_clip": 1.05232871, + "balance_loss_mlp": 1.03049874, + "epoch": 0.1750789117691267, + "flos": 21879495053280.0, + "grad_norm": 3.916725775292339, + "language_loss": 0.89358586, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.9155854, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 1.00146484, + "router_z_loss_mlp": 0.16894531, + "step": 2912, + "time_per_iteration": 2.6573846340179443 + }, + { + "auxiliary_loss_clip": 0.01157061, + "auxiliary_loss_mlp": 0.01042384, + "balance_loss_clip": 1.05394077, + "balance_loss_mlp": 1.02468157, + "epoch": 0.17513903502179468, + "flos": 22280032247040.0, + "grad_norm": 1.6888028173719272, + "language_loss": 0.7233063, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74530071, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 1.03125, + "router_z_loss_mlp": 0.17700195, + "step": 2913, + "time_per_iteration": 2.6426641941070557 + }, + { + "auxiliary_loss_clip": 0.01157368, + "auxiliary_loss_mlp": 0.01043597, + "balance_loss_clip": 1.05734062, + "balance_loss_mlp": 1.02719355, + "epoch": 0.17519915827446264, + "flos": 25263907028640.0, + "grad_norm": 2.2362219404932504, + "language_loss": 0.69366944, + "learning_rate": 3.782887439295741e-06, + "loss": 0.71567911, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.16394043, + "step": 2914, + "time_per_iteration": 2.6893436908721924 + }, + { + "auxiliary_loss_clip": 0.01156022, + "auxiliary_loss_mlp": 0.01046055, + "balance_loss_clip": 1.05539107, + "balance_loss_mlp": 1.02874565, + "epoch": 0.1752592815271306, + "flos": 25042552362720.0, + "grad_norm": 1.927624607647323, + "language_loss": 0.93316078, + "learning_rate": 3.782710928163772e-06, + "loss": 0.9551816, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.17297363, + "step": 2915, + "time_per_iteration": 2.716082811355591 + }, + { + "auxiliary_loss_clip": 0.01153114, + "auxiliary_loss_mlp": 0.01045094, + "balance_loss_clip": 1.05567968, + "balance_loss_mlp": 1.02735543, + "epoch": 0.1753194047797986, + "flos": 26599449686400.0, + "grad_norm": 1.8281565857232696, + "language_loss": 0.81000042, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83198255, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.17736816, + "step": 2916, + "time_per_iteration": 2.767673969268799 + }, + { + "auxiliary_loss_clip": 0.01153611, + "auxiliary_loss_mlp": 0.0105151, + "balance_loss_clip": 1.0516715, + "balance_loss_mlp": 1.03448701, + "epoch": 0.17537952803246656, + "flos": 25219506440160.0, + "grad_norm": 1.900937844026038, + "language_loss": 0.73836303, + "learning_rate": 3.782357703104799e-06, + "loss": 0.76041424, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.17028809, + "step": 2917, + "time_per_iteration": 2.8214542865753174 + }, + { + "auxiliary_loss_clip": 0.01151256, + "auxiliary_loss_mlp": 0.01049884, + "balance_loss_clip": 1.05425477, + "balance_loss_mlp": 1.03221679, + "epoch": 0.17543965128513453, + "flos": 28424857439520.0, + "grad_norm": 1.9745812995780194, + "language_loss": 0.77202624, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.79403758, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 0.97021484, + "router_z_loss_mlp": 0.17663574, + "step": 2918, + "time_per_iteration": 2.6637730598449707 + }, + { + "auxiliary_loss_clip": 0.01157013, + "auxiliary_loss_mlp": 0.01041677, + "balance_loss_clip": 1.05309176, + "balance_loss_mlp": 1.02458203, + "epoch": 0.1754997745378025, + "flos": 35503188956640.0, + "grad_norm": 3.3677546604321993, + "language_loss": 0.73951828, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76150525, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.17114258, + "step": 2919, + "time_per_iteration": 2.7053399085998535 + }, + { + "auxiliary_loss_clip": 0.01157192, + "auxiliary_loss_mlp": 0.01043157, + "balance_loss_clip": 1.05307972, + "balance_loss_mlp": 1.02703953, + "epoch": 0.17555989779047046, + "flos": 37060613004960.0, + "grad_norm": 1.9158771475289282, + "language_loss": 0.73995328, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76195675, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.16113281, + "step": 2920, + "time_per_iteration": 2.756835460662842 + }, + { + "auxiliary_loss_clip": 0.01150405, + "auxiliary_loss_mlp": 0.01038115, + "balance_loss_clip": 1.05236113, + "balance_loss_mlp": 1.02258205, + "epoch": 0.17562002104313842, + "flos": 28416591914400.0, + "grad_norm": 2.3493858612228435, + "language_loss": 0.79689515, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.8187803, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.15515137, + "step": 2921, + "time_per_iteration": 2.6617963314056396 + }, + { + "auxiliary_loss_clip": 0.01160213, + "auxiliary_loss_mlp": 0.01040545, + "balance_loss_clip": 1.05625165, + "balance_loss_mlp": 1.02380824, + "epoch": 0.1756801442958064, + "flos": 30249738468000.0, + "grad_norm": 1.6548814261285445, + "language_loss": 0.87720048, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.89920807, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 1.0390625, + "router_z_loss_mlp": 0.1673584, + "step": 2922, + "time_per_iteration": 2.6877310276031494 + }, + { + "auxiliary_loss_clip": 0.01155385, + "auxiliary_loss_mlp": 0.0104645, + "balance_loss_clip": 1.05305576, + "balance_loss_mlp": 1.02915239, + "epoch": 0.17574026754847438, + "flos": 31447379355840.0, + "grad_norm": 2.9209826613806307, + "language_loss": 0.63212359, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.65414196, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.17297363, + "step": 2923, + "time_per_iteration": 2.710826873779297 + }, + { + "auxiliary_loss_clip": 0.01159605, + "auxiliary_loss_mlp": 0.01043188, + "balance_loss_clip": 1.05648422, + "balance_loss_mlp": 1.02575982, + "epoch": 0.17580039080114235, + "flos": 21300383090880.0, + "grad_norm": 2.5342729169126677, + "language_loss": 0.80436629, + "learning_rate": 3.78111928675413e-06, + "loss": 0.8263942, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 1.03076172, + "router_z_loss_mlp": 0.17419434, + "step": 2924, + "time_per_iteration": 2.641096830368042 + }, + { + "auxiliary_loss_clip": 0.01159498, + "auxiliary_loss_mlp": 0.01051183, + "balance_loss_clip": 1.05434263, + "balance_loss_mlp": 1.03227615, + "epoch": 0.1758605140538103, + "flos": 18137609402400.0, + "grad_norm": 2.1342334717621787, + "language_loss": 0.70820498, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73031175, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 1.05175781, + "router_z_loss_mlp": 0.18896484, + "step": 2925, + "time_per_iteration": 2.6655609607696533 + }, + { + "auxiliary_loss_clip": 0.0115401, + "auxiliary_loss_mlp": 0.01043294, + "balance_loss_clip": 1.05544364, + "balance_loss_mlp": 1.02711701, + "epoch": 0.17592063730647828, + "flos": 28069895835360.0, + "grad_norm": 2.984394073141194, + "language_loss": 0.71768814, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.73966122, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.16174316, + "step": 2926, + "time_per_iteration": 2.7335150241851807 + }, + { + "auxiliary_loss_clip": 0.01159662, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.05529761, + "balance_loss_mlp": 1.02216554, + "epoch": 0.17598076055914624, + "flos": 25308753307200.0, + "grad_norm": 2.01486970545801, + "language_loss": 0.84497666, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.86698127, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 1.04345703, + "router_z_loss_mlp": 0.1862793, + "step": 2927, + "time_per_iteration": 2.7136292457580566 + }, + { + "auxiliary_loss_clip": 0.01154633, + "auxiliary_loss_mlp": 0.01044137, + "balance_loss_clip": 1.05590558, + "balance_loss_mlp": 1.02878904, + "epoch": 0.1760408838118142, + "flos": 41600858385600.0, + "grad_norm": 2.180859580693959, + "language_loss": 0.71779507, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.73978281, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.15344238, + "step": 2928, + "time_per_iteration": 2.7742371559143066 + }, + { + "auxiliary_loss_clip": 0.01155473, + "auxiliary_loss_mlp": 0.01039322, + "balance_loss_clip": 1.05736494, + "balance_loss_mlp": 1.0226922, + "epoch": 0.1761010070644822, + "flos": 29490755184000.0, + "grad_norm": 3.0672569537521963, + "language_loss": 0.82897305, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85092103, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.16638184, + "step": 2929, + "time_per_iteration": 2.8746654987335205 + }, + { + "auxiliary_loss_clip": 0.01155206, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.05511546, + "balance_loss_mlp": 1.02003622, + "epoch": 0.17616113031715017, + "flos": 32429540583360.0, + "grad_norm": 1.8088644350021603, + "language_loss": 0.79371041, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81561768, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.15478516, + "step": 2930, + "time_per_iteration": 2.7661688327789307 + }, + { + "auxiliary_loss_clip": 0.01154813, + "auxiliary_loss_mlp": 0.01039711, + "balance_loss_clip": 1.05519104, + "balance_loss_mlp": 1.02290297, + "epoch": 0.17622125356981813, + "flos": 31318877594880.0, + "grad_norm": 2.3879856620582203, + "language_loss": 0.76989836, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.79184365, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.16821289, + "step": 2931, + "time_per_iteration": 2.7301995754241943 + }, + { + "auxiliary_loss_clip": 0.01155972, + "auxiliary_loss_mlp": 0.01038979, + "balance_loss_clip": 1.05583382, + "balance_loss_mlp": 1.02332687, + "epoch": 0.1762813768224861, + "flos": 20143415201760.0, + "grad_norm": 2.7992163505597683, + "language_loss": 0.76265657, + "learning_rate": 3.779699901503696e-06, + "loss": 0.7846061, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.15661621, + "step": 2932, + "time_per_iteration": 2.651219606399536 + }, + { + "auxiliary_loss_clip": 0.0116229, + "auxiliary_loss_mlp": 0.0104, + "balance_loss_clip": 1.05594325, + "balance_loss_mlp": 1.02272701, + "epoch": 0.17634150007515406, + "flos": 13680573814080.0, + "grad_norm": 2.689344237229282, + "language_loss": 0.90568215, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.92770499, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 1.06347656, + "router_z_loss_mlp": 0.17285156, + "step": 2933, + "time_per_iteration": 2.6753971576690674 + }, + { + "auxiliary_loss_clip": 0.01149039, + "auxiliary_loss_mlp": 0.01046893, + "balance_loss_clip": 1.05370057, + "balance_loss_mlp": 1.0319916, + "epoch": 0.17640162332782203, + "flos": 28870078842720.0, + "grad_norm": 1.751333348642621, + "language_loss": 0.87793183, + "learning_rate": 3.779344380192448e-06, + "loss": 0.89989114, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.14880371, + "step": 2934, + "time_per_iteration": 2.710235834121704 + }, + { + "auxiliary_loss_clip": 0.01147074, + "auxiliary_loss_mlp": 0.01040355, + "balance_loss_clip": 1.0518961, + "balance_loss_mlp": 1.02513218, + "epoch": 0.17646174658049, + "flos": 65645650368000.0, + "grad_norm": 1.6193988973981321, + "language_loss": 0.70718896, + "learning_rate": 3.779166518324077e-06, + "loss": 0.72906327, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.15234375, + "step": 2935, + "time_per_iteration": 2.935438632965088 + }, + { + "auxiliary_loss_clip": 0.01159837, + "auxiliary_loss_mlp": 0.01041306, + "balance_loss_clip": 1.05569124, + "balance_loss_mlp": 1.02502227, + "epoch": 0.17652186983315798, + "flos": 29582635674240.0, + "grad_norm": 1.9925621888334644, + "language_loss": 0.6969322, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.71894366, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 1.04101562, + "router_z_loss_mlp": 0.16296387, + "step": 2936, + "time_per_iteration": 4.095510005950928 + }, + { + "auxiliary_loss_clip": 0.01157843, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.05992079, + "balance_loss_mlp": 1.02100086, + "epoch": 0.17658199308582595, + "flos": 33500259884160.0, + "grad_norm": 2.162876564345019, + "language_loss": 0.71654165, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73848116, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.15100098, + "step": 2937, + "time_per_iteration": 4.098953723907471 + }, + { + "auxiliary_loss_clip": 0.01160925, + "auxiliary_loss_mlp": 0.01039772, + "balance_loss_clip": 1.05737233, + "balance_loss_mlp": 1.02308309, + "epoch": 0.17664211633849392, + "flos": 27355475208960.0, + "grad_norm": 3.5447816724843855, + "language_loss": 0.75821227, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.7802192, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 1.03613281, + "router_z_loss_mlp": 0.16687012, + "step": 2938, + "time_per_iteration": 2.6492550373077393 + }, + { + "auxiliary_loss_clip": 0.01155621, + "auxiliary_loss_mlp": 0.01038139, + "balance_loss_clip": 1.0547359, + "balance_loss_mlp": 1.02236819, + "epoch": 0.17670223959116188, + "flos": 30157979529600.0, + "grad_norm": 2.223421388629424, + "language_loss": 0.70827115, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.73020875, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.15759277, + "step": 2939, + "time_per_iteration": 2.679488182067871 + }, + { + "auxiliary_loss_clip": 0.01155098, + "auxiliary_loss_mlp": 0.0103965, + "balance_loss_clip": 1.05616343, + "balance_loss_mlp": 1.02409291, + "epoch": 0.17676236284382985, + "flos": 27487826111520.0, + "grad_norm": 2.2333006317233597, + "language_loss": 0.74118984, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.76313728, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.15563965, + "step": 2940, + "time_per_iteration": 4.171106815338135 + }, + { + "auxiliary_loss_clip": 0.01156428, + "auxiliary_loss_mlp": 0.01041355, + "balance_loss_clip": 1.05640268, + "balance_loss_mlp": 1.02409291, + "epoch": 0.1768224860964978, + "flos": 15111805586400.0, + "grad_norm": 7.7855977504065725, + "language_loss": 0.85342515, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87540305, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 1.00146484, + "router_z_loss_mlp": 0.17260742, + "step": 2941, + "time_per_iteration": 2.777398109436035 + }, + { + "auxiliary_loss_clip": 0.0115426, + "auxiliary_loss_mlp": 0.01034335, + "balance_loss_clip": 1.05527377, + "balance_loss_mlp": 1.01859915, + "epoch": 0.1768826093491658, + "flos": 29716242612480.0, + "grad_norm": 2.600552949907601, + "language_loss": 0.76547348, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.78735936, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.15740967, + "step": 2942, + "time_per_iteration": 4.133780241012573 + }, + { + "auxiliary_loss_clip": 0.01156665, + "auxiliary_loss_mlp": 0.01037646, + "balance_loss_clip": 1.05499637, + "balance_loss_mlp": 1.02144527, + "epoch": 0.17694273260183377, + "flos": 28781156113920.0, + "grad_norm": 2.6263975879316566, + "language_loss": 0.80285776, + "learning_rate": 3.77774119516197e-06, + "loss": 0.82480085, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.16186523, + "step": 2943, + "time_per_iteration": 2.6950573921203613 + }, + { + "auxiliary_loss_clip": 0.01155962, + "auxiliary_loss_mlp": 0.01041292, + "balance_loss_clip": 1.05235791, + "balance_loss_mlp": 1.0247401, + "epoch": 0.17700285585450173, + "flos": 32654258183520.0, + "grad_norm": 1.8999985454627566, + "language_loss": 0.80295867, + "learning_rate": 3.777562726341155e-06, + "loss": 0.82493126, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.16546631, + "step": 2944, + "time_per_iteration": 2.7528486251831055 + }, + { + "auxiliary_loss_clip": 0.01153348, + "auxiliary_loss_mlp": 0.01046792, + "balance_loss_clip": 1.05362737, + "balance_loss_mlp": 1.03129482, + "epoch": 0.1770629791071697, + "flos": 52197573471840.0, + "grad_norm": 2.1288271756189423, + "language_loss": 0.73677278, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.75877416, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.1550293, + "step": 2945, + "time_per_iteration": 2.933727264404297 + }, + { + "auxiliary_loss_clip": 0.01151739, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_clip": 1.05366826, + "balance_loss_mlp": 1.02652454, + "epoch": 0.17712310235983766, + "flos": 21165033909600.0, + "grad_norm": 2.7062248282632275, + "language_loss": 0.77879453, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.80073035, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.15307617, + "step": 2946, + "time_per_iteration": 2.620527744293213 + }, + { + "auxiliary_loss_clip": 0.0115089, + "auxiliary_loss_mlp": 0.01039813, + "balance_loss_clip": 1.05262423, + "balance_loss_mlp": 1.02401817, + "epoch": 0.17718322561250563, + "flos": 29136806511840.0, + "grad_norm": 1.803696497631926, + "language_loss": 0.76220214, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78410923, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.15808105, + "step": 2947, + "time_per_iteration": 2.7078986167907715 + }, + { + "auxiliary_loss_clip": 0.01149164, + "auxiliary_loss_mlp": 0.01038084, + "balance_loss_clip": 1.0510304, + "balance_loss_mlp": 1.0217762, + "epoch": 0.1772433488651736, + "flos": 44497471646880.0, + "grad_norm": 2.361324024104218, + "language_loss": 0.73097622, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.75284868, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.16308594, + "step": 2948, + "time_per_iteration": 2.7953126430511475 + }, + { + "auxiliary_loss_clip": 0.01152352, + "auxiliary_loss_mlp": 0.01041934, + "balance_loss_clip": 1.0556488, + "balance_loss_mlp": 1.02675903, + "epoch": 0.1773034721178416, + "flos": 32561486313120.0, + "grad_norm": 1.7786479998649738, + "language_loss": 0.81565809, + "learning_rate": 3.776669371292171e-06, + "loss": 0.83760095, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.15185547, + "step": 2949, + "time_per_iteration": 2.7510533332824707 + }, + { + "auxiliary_loss_clip": 0.01064896, + "auxiliary_loss_mlp": 0.0100101, + "balance_loss_clip": 1.03086555, + "balance_loss_mlp": 0.99876493, + "epoch": 0.17736359537050955, + "flos": 69694173208800.0, + "grad_norm": 0.7463909256234441, + "language_loss": 0.65027255, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.67093158, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 0.0224762, + "step": 2950, + "time_per_iteration": 3.335192918777466 + }, + { + "auxiliary_loss_clip": 0.01147447, + "auxiliary_loss_mlp": 0.01037822, + "balance_loss_clip": 1.0506289, + "balance_loss_mlp": 1.02243221, + "epoch": 0.17742371862317752, + "flos": 33187835073600.0, + "grad_norm": 2.041580889517647, + "language_loss": 0.83643675, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.85828948, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.15380859, + "step": 2951, + "time_per_iteration": 2.7222371101379395 + }, + { + "auxiliary_loss_clip": 0.01152661, + "auxiliary_loss_mlp": 0.01041235, + "balance_loss_clip": 1.05174732, + "balance_loss_mlp": 1.02535677, + "epoch": 0.17748384187584548, + "flos": 25575075803520.0, + "grad_norm": 3.767011645772533, + "language_loss": 0.79815817, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82009721, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 1.00927734, + "router_z_loss_mlp": 0.15881348, + "step": 2952, + "time_per_iteration": 2.797725200653076 + }, + { + "auxiliary_loss_clip": 0.01150783, + "auxiliary_loss_mlp": 0.01041662, + "balance_loss_clip": 1.05211759, + "balance_loss_mlp": 1.02448416, + "epoch": 0.17754396512851345, + "flos": 30517519586400.0, + "grad_norm": 2.219332206958462, + "language_loss": 0.79282939, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.81475383, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.17175293, + "step": 2953, + "time_per_iteration": 2.8401968479156494 + }, + { + "auxiliary_loss_clip": 0.01151801, + "auxiliary_loss_mlp": 0.01035779, + "balance_loss_clip": 1.05340779, + "balance_loss_mlp": 1.02056766, + "epoch": 0.1776040883811814, + "flos": 39110211633600.0, + "grad_norm": 1.9939650319900608, + "language_loss": 0.88067567, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.90255141, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.15222168, + "step": 2954, + "time_per_iteration": 2.8072261810302734 + }, + { + "auxiliary_loss_clip": 0.01152097, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_clip": 1.05359268, + "balance_loss_mlp": 1.03079319, + "epoch": 0.17766421163384938, + "flos": 26325388389600.0, + "grad_norm": 2.0150709681805723, + "language_loss": 0.84853202, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.87052512, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 0.98535156, + "router_z_loss_mlp": 0.1640625, + "step": 2955, + "time_per_iteration": 2.6457390785217285 + }, + { + "auxiliary_loss_clip": 0.01149307, + "auxiliary_loss_mlp": 0.01036322, + "balance_loss_clip": 1.05171561, + "balance_loss_mlp": 1.02084923, + "epoch": 0.17772433488651737, + "flos": 27356204520000.0, + "grad_norm": 2.215421186109582, + "language_loss": 0.70970285, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73155916, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.15466309, + "step": 2956, + "time_per_iteration": 2.6618752479553223 + }, + { + "auxiliary_loss_clip": 0.01150439, + "auxiliary_loss_mlp": 0.01043594, + "balance_loss_clip": 1.0533483, + "balance_loss_mlp": 1.02776325, + "epoch": 0.17778445813918534, + "flos": 31273504591680.0, + "grad_norm": 1.856654867628095, + "language_loss": 0.82795882, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.84989911, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 0.97070312, + "router_z_loss_mlp": 0.15844727, + "step": 2957, + "time_per_iteration": 2.682889223098755 + }, + { + "auxiliary_loss_clip": 0.0114752, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.05183959, + "balance_loss_mlp": 1.02331746, + "epoch": 0.1778445813918533, + "flos": 31272856315200.0, + "grad_norm": 1.6521845644962203, + "language_loss": 0.75372684, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.77558398, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.14868164, + "step": 2958, + "time_per_iteration": 2.715803384780884 + }, + { + "auxiliary_loss_clip": 0.01153267, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.05387068, + "balance_loss_mlp": 1.02195024, + "epoch": 0.17790470464452127, + "flos": 27266228341920.0, + "grad_norm": 2.3956646858872523, + "language_loss": 0.7999382, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.82184702, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 0.99365234, + "router_z_loss_mlp": 0.15661621, + "step": 2959, + "time_per_iteration": 2.6785552501678467 + }, + { + "auxiliary_loss_clip": 0.01155946, + "auxiliary_loss_mlp": 0.01043364, + "balance_loss_clip": 1.05412197, + "balance_loss_mlp": 1.02705574, + "epoch": 0.17796482789718923, + "flos": 22898439620640.0, + "grad_norm": 2.357286528953672, + "language_loss": 0.51883781, + "learning_rate": 3.774698062689362e-06, + "loss": 0.54083085, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 1.01855469, + "router_z_loss_mlp": 0.16308594, + "step": 2960, + "time_per_iteration": 2.6178228855133057 + }, + { + "auxiliary_loss_clip": 0.01155928, + "auxiliary_loss_mlp": 0.01047203, + "balance_loss_clip": 1.05566573, + "balance_loss_mlp": 1.03164577, + "epoch": 0.1780249511498572, + "flos": 28602864966240.0, + "grad_norm": 1.7720445237563127, + "language_loss": 0.8861922, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.90822351, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.15539551, + "step": 2961, + "time_per_iteration": 2.6976397037506104 + }, + { + "auxiliary_loss_clip": 0.01154528, + "auxiliary_loss_mlp": 0.01049902, + "balance_loss_clip": 1.05249524, + "balance_loss_mlp": 1.03222311, + "epoch": 0.1780850744025252, + "flos": 28513253443680.0, + "grad_norm": 1.8690540560671502, + "language_loss": 0.78985173, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81189603, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.17675781, + "step": 2962, + "time_per_iteration": 2.6377527713775635 + }, + { + "auxiliary_loss_clip": 0.01154874, + "auxiliary_loss_mlp": 0.01044235, + "balance_loss_clip": 1.05450583, + "balance_loss_mlp": 1.02719975, + "epoch": 0.17814519765519315, + "flos": 16804092608640.0, + "grad_norm": 1.6648560844126854, + "language_loss": 0.74702317, + "learning_rate": 3.774159019458203e-06, + "loss": 0.76901424, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.17041016, + "step": 2963, + "time_per_iteration": 2.6317200660705566 + }, + { + "auxiliary_loss_clip": 0.01158085, + "auxiliary_loss_mlp": 0.01041878, + "balance_loss_clip": 1.05440652, + "balance_loss_mlp": 1.02450967, + "epoch": 0.17820532090786112, + "flos": 26816185382400.0, + "grad_norm": 2.07592028773692, + "language_loss": 0.78662193, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.80862153, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 1.03710938, + "router_z_loss_mlp": 0.17370605, + "step": 2964, + "time_per_iteration": 2.6349971294403076 + }, + { + "auxiliary_loss_clip": 0.01154448, + "auxiliary_loss_mlp": 0.01043397, + "balance_loss_clip": 1.05563796, + "balance_loss_mlp": 1.02726841, + "epoch": 0.17826544416052909, + "flos": 30250022088960.0, + "grad_norm": 1.5853580971464025, + "language_loss": 0.81122893, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83320743, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.16125488, + "step": 2965, + "time_per_iteration": 2.8986239433288574 + }, + { + "auxiliary_loss_clip": 0.01150444, + "auxiliary_loss_mlp": 0.01044434, + "balance_loss_clip": 1.05219197, + "balance_loss_mlp": 1.0297358, + "epoch": 0.17832556741319705, + "flos": 16935552131040.0, + "grad_norm": 2.2379998318296774, + "language_loss": 0.95056474, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.97251356, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.14685059, + "step": 2966, + "time_per_iteration": 2.677647590637207 + }, + { + "auxiliary_loss_clip": 0.01154254, + "auxiliary_loss_mlp": 0.01046445, + "balance_loss_clip": 1.05455816, + "balance_loss_mlp": 1.02923131, + "epoch": 0.17838569066586502, + "flos": 44712100444320.0, + "grad_norm": 2.471464812846584, + "language_loss": 0.72662866, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.74863571, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.17211914, + "step": 2967, + "time_per_iteration": 2.8138184547424316 + }, + { + "auxiliary_loss_clip": 0.01148896, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.0532515, + "balance_loss_mlp": 1.02389491, + "epoch": 0.17844581391853298, + "flos": 22851526960800.0, + "grad_norm": 2.259615702815566, + "language_loss": 0.76934975, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79123318, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.15563965, + "step": 2968, + "time_per_iteration": 2.6300036907196045 + }, + { + "auxiliary_loss_clip": 0.01150773, + "auxiliary_loss_mlp": 0.01037035, + "balance_loss_clip": 1.05358887, + "balance_loss_mlp": 1.02203846, + "epoch": 0.17850593717120097, + "flos": 33407771634720.0, + "grad_norm": 2.082294211803883, + "language_loss": 0.76036859, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.78224665, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.15002441, + "step": 2969, + "time_per_iteration": 2.7430505752563477 + }, + { + "auxiliary_loss_clip": 0.01067194, + "auxiliary_loss_mlp": 0.010111, + "balance_loss_clip": 1.03238785, + "balance_loss_mlp": 1.00874543, + "epoch": 0.17856606042386894, + "flos": 81749584432800.0, + "grad_norm": 0.8442174157177277, + "language_loss": 0.690162, + "learning_rate": 3.772898897567171e-06, + "loss": 0.71094495, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.02352905, + "step": 2970, + "time_per_iteration": 3.3694920539855957 + }, + { + "auxiliary_loss_clip": 0.01158635, + "auxiliary_loss_mlp": 0.01040169, + "balance_loss_clip": 1.05507076, + "balance_loss_mlp": 1.02429652, + "epoch": 0.1786261836765369, + "flos": 45120619542240.0, + "grad_norm": 1.7403606822704487, + "language_loss": 0.67244238, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69443041, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 1.03466797, + "router_z_loss_mlp": 0.15875244, + "step": 2971, + "time_per_iteration": 2.8427014350891113 + }, + { + "auxiliary_loss_clip": 0.01155071, + "auxiliary_loss_mlp": 0.01045532, + "balance_loss_clip": 1.05372143, + "balance_loss_mlp": 1.02821064, + "epoch": 0.17868630692920487, + "flos": 30292518365280.0, + "grad_norm": 1.674719526104368, + "language_loss": 0.8956691, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.91767514, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.17321777, + "step": 2972, + "time_per_iteration": 2.7273576259613037 + }, + { + "auxiliary_loss_clip": 0.01155048, + "auxiliary_loss_mlp": 0.01047829, + "balance_loss_clip": 1.0555774, + "balance_loss_mlp": 1.03192651, + "epoch": 0.17874643018187283, + "flos": 20721230611200.0, + "grad_norm": 2.108565946101937, + "language_loss": 0.8812151, + "learning_rate": 3.77235783676401e-06, + "loss": 0.9032439, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 0.99511719, + "router_z_loss_mlp": 0.15905762, + "step": 2973, + "time_per_iteration": 2.651099920272827 + }, + { + "auxiliary_loss_clip": 0.01155744, + "auxiliary_loss_mlp": 0.01049991, + "balance_loss_clip": 1.05704236, + "balance_loss_mlp": 1.03321242, + "epoch": 0.1788065534345408, + "flos": 25663633876800.0, + "grad_norm": 2.0210752104734824, + "language_loss": 0.76076478, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.78282213, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.16778564, + "step": 2974, + "time_per_iteration": 2.7127676010131836 + }, + { + "auxiliary_loss_clip": 0.01155579, + "auxiliary_loss_mlp": 0.01045825, + "balance_loss_clip": 1.05608225, + "balance_loss_mlp": 1.02970231, + "epoch": 0.17886667668720876, + "flos": 29270129829120.0, + "grad_norm": 4.546476726144313, + "language_loss": 0.74516255, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.76717663, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 0.99511719, + "router_z_loss_mlp": 0.16131592, + "step": 2975, + "time_per_iteration": 4.124554872512817 + }, + { + "auxiliary_loss_clip": 0.0115124, + "auxiliary_loss_mlp": 0.01044118, + "balance_loss_clip": 1.05402243, + "balance_loss_mlp": 1.02764297, + "epoch": 0.17892679993987676, + "flos": 31407719289120.0, + "grad_norm": 1.6157479556400578, + "language_loss": 0.72966951, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.75162309, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 0.97021484, + "router_z_loss_mlp": 0.16479492, + "step": 2976, + "time_per_iteration": 4.15038275718689 + }, + { + "auxiliary_loss_clip": 0.01149866, + "auxiliary_loss_mlp": 0.01031617, + "balance_loss_clip": 1.0569092, + "balance_loss_mlp": 1.01822436, + "epoch": 0.17898692319254472, + "flos": 31359469559040.0, + "grad_norm": 1.708645546267468, + "language_loss": 0.77459657, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.7964114, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.13391113, + "step": 2977, + "time_per_iteration": 2.9632158279418945 + }, + { + "auxiliary_loss_clip": 0.0115766, + "auxiliary_loss_mlp": 0.01039092, + "balance_loss_clip": 1.06036258, + "balance_loss_mlp": 1.02392876, + "epoch": 0.1790470464452127, + "flos": 23571498454560.0, + "grad_norm": 2.093460737261836, + "language_loss": 0.80119157, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.8231591, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.1517334, + "step": 2978, + "time_per_iteration": 2.7058801651000977 + }, + { + "auxiliary_loss_clip": 0.01157063, + "auxiliary_loss_mlp": 0.01041047, + "balance_loss_clip": 1.05609536, + "balance_loss_mlp": 1.02481055, + "epoch": 0.17910716969788065, + "flos": 36660805122240.0, + "grad_norm": 1.5055691231612713, + "language_loss": 0.76307273, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78505385, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 1.01025391, + "router_z_loss_mlp": 0.16247559, + "step": 2979, + "time_per_iteration": 4.2101891040802 + }, + { + "auxiliary_loss_clip": 0.01152261, + "auxiliary_loss_mlp": 0.01039872, + "balance_loss_clip": 1.05589092, + "balance_loss_mlp": 1.02501845, + "epoch": 0.17916729295054862, + "flos": 23705591600160.0, + "grad_norm": 1.7315402780926132, + "language_loss": 0.69126904, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.71319032, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.14868164, + "step": 2980, + "time_per_iteration": 2.6743156909942627 + }, + { + "auxiliary_loss_clip": 0.01155635, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.05593097, + "balance_loss_mlp": 1.02267265, + "epoch": 0.17922741620321658, + "flos": 17828547526080.0, + "grad_norm": 3.0613263661795616, + "language_loss": 0.70813227, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.73009735, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.18200684, + "step": 2981, + "time_per_iteration": 4.177434206008911 + }, + { + "auxiliary_loss_clip": 0.01159014, + "auxiliary_loss_mlp": 0.01048543, + "balance_loss_clip": 1.05810606, + "balance_loss_mlp": 1.03228271, + "epoch": 0.17928753945588458, + "flos": 20944854244800.0, + "grad_norm": 5.636729154543654, + "language_loss": 0.81864083, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.84071642, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.16259766, + "step": 2982, + "time_per_iteration": 2.6239588260650635 + }, + { + "auxiliary_loss_clip": 0.01151894, + "auxiliary_loss_mlp": 0.01039156, + "balance_loss_clip": 1.05519891, + "balance_loss_mlp": 1.0239805, + "epoch": 0.17934766270855254, + "flos": 38309704488000.0, + "grad_norm": 1.805651526789099, + "language_loss": 0.82981169, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.85172212, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.1519165, + "step": 2983, + "time_per_iteration": 2.74359130859375 + }, + { + "auxiliary_loss_clip": 0.01154509, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.05347288, + "balance_loss_mlp": 1.02632928, + "epoch": 0.1794077859612205, + "flos": 25397473449600.0, + "grad_norm": 1.9324951075159618, + "language_loss": 0.85223377, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.8742069, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 1.01025391, + "router_z_loss_mlp": 0.16467285, + "step": 2984, + "time_per_iteration": 2.64878249168396 + }, + { + "auxiliary_loss_clip": 0.01153623, + "auxiliary_loss_mlp": 0.01040013, + "balance_loss_clip": 1.05344152, + "balance_loss_mlp": 1.02385426, + "epoch": 0.17946790921388847, + "flos": 35369946673920.0, + "grad_norm": 1.4785380317080372, + "language_loss": 0.89225292, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91418922, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 1.00244141, + "router_z_loss_mlp": 0.16131592, + "step": 2985, + "time_per_iteration": 2.7386419773101807 + }, + { + "auxiliary_loss_clip": 0.01148226, + "auxiliary_loss_mlp": 0.01040932, + "balance_loss_clip": 1.05415249, + "balance_loss_mlp": 1.02702045, + "epoch": 0.17952803246655644, + "flos": 25304134337280.0, + "grad_norm": 1.9295468575740007, + "language_loss": 0.69465119, + "learning_rate": 3.770006252694922e-06, + "loss": 0.71654278, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.13891602, + "step": 2986, + "time_per_iteration": 2.676074266433716 + }, + { + "auxiliary_loss_clip": 0.01149759, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.05347848, + "balance_loss_mlp": 1.02277672, + "epoch": 0.1795881557192244, + "flos": 34478653004640.0, + "grad_norm": 2.4604550764312445, + "language_loss": 0.77111518, + "learning_rate": 3.769824891588688e-06, + "loss": 0.79299569, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.15484619, + "step": 2987, + "time_per_iteration": 2.7655882835388184 + }, + { + "auxiliary_loss_clip": 0.01154319, + "auxiliary_loss_mlp": 0.01038641, + "balance_loss_clip": 1.05386186, + "balance_loss_mlp": 1.02184439, + "epoch": 0.17964827897189237, + "flos": 22637019715200.0, + "grad_norm": 2.0191422802951573, + "language_loss": 0.77921855, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.80114812, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 1.00390625, + "router_z_loss_mlp": 0.16796875, + "step": 2988, + "time_per_iteration": 2.8289501667022705 + }, + { + "auxiliary_loss_clip": 0.01064167, + "auxiliary_loss_mlp": 0.01002301, + "balance_loss_clip": 1.02937007, + "balance_loss_mlp": 0.99965411, + "epoch": 0.17970840222456036, + "flos": 70973524749600.0, + "grad_norm": 0.7616433414746588, + "language_loss": 0.62716687, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64783144, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 0.02648926, + "step": 2989, + "time_per_iteration": 3.483565092086792 + }, + { + "auxiliary_loss_clip": 0.0115356, + "auxiliary_loss_mlp": 0.01037431, + "balance_loss_clip": 1.05537033, + "balance_loss_mlp": 1.02243447, + "epoch": 0.17976852547722832, + "flos": 24771246240960.0, + "grad_norm": 2.211051363124648, + "language_loss": 0.71023607, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.73214602, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 0.98242188, + "router_z_loss_mlp": 0.14990234, + "step": 2990, + "time_per_iteration": 2.7856814861297607 + }, + { + "auxiliary_loss_clip": 0.01155315, + "auxiliary_loss_mlp": 0.01036233, + "balance_loss_clip": 1.05463648, + "balance_loss_mlp": 1.0207237, + "epoch": 0.1798286487298963, + "flos": 48406749297120.0, + "grad_norm": 2.150259862129585, + "language_loss": 0.69356012, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.71547556, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.15515137, + "step": 2991, + "time_per_iteration": 2.832725763320923 + }, + { + "auxiliary_loss_clip": 0.01151597, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.05404019, + "balance_loss_mlp": 1.02038646, + "epoch": 0.17988877198256426, + "flos": 31140302826240.0, + "grad_norm": 1.6856591992205174, + "language_loss": 0.82906014, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.85093665, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 0.97607422, + "router_z_loss_mlp": 0.15661621, + "step": 2992, + "time_per_iteration": 2.8129589557647705 + }, + { + "auxiliary_loss_clip": 0.01145749, + "auxiliary_loss_mlp": 0.01036704, + "balance_loss_clip": 1.05129051, + "balance_loss_mlp": 1.0215466, + "epoch": 0.17994889523523222, + "flos": 22948755732000.0, + "grad_norm": 2.117919297016812, + "language_loss": 0.82570326, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.84752786, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.15148926, + "step": 2993, + "time_per_iteration": 2.6829869747161865 + }, + { + "auxiliary_loss_clip": 0.01149026, + "auxiliary_loss_mlp": 0.01040693, + "balance_loss_clip": 1.05041647, + "balance_loss_mlp": 1.0251658, + "epoch": 0.18000901848790019, + "flos": 25751827294560.0, + "grad_norm": 1.8653728072830615, + "language_loss": 0.78980553, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.81170273, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.15539551, + "step": 2994, + "time_per_iteration": 2.6634247303009033 + }, + { + "auxiliary_loss_clip": 0.01153687, + "auxiliary_loss_mlp": 0.0104143, + "balance_loss_clip": 1.05455387, + "balance_loss_mlp": 1.02582574, + "epoch": 0.18006914174056818, + "flos": 23972521855680.0, + "grad_norm": 1.9429774167748954, + "language_loss": 0.80542815, + "learning_rate": 3.768371587287296e-06, + "loss": 0.82737923, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.15600586, + "step": 2995, + "time_per_iteration": 2.708897352218628 + }, + { + "auxiliary_loss_clip": 0.01149861, + "auxiliary_loss_mlp": 0.0103879, + "balance_loss_clip": 1.05247855, + "balance_loss_mlp": 1.02453923, + "epoch": 0.18012926499323614, + "flos": 23793744500640.0, + "grad_norm": 1.6533771028243411, + "language_loss": 0.84756529, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86945182, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.14263916, + "step": 2996, + "time_per_iteration": 2.64850115776062 + }, + { + "auxiliary_loss_clip": 0.01142984, + "auxiliary_loss_mlp": 0.01032814, + "balance_loss_clip": 1.04954028, + "balance_loss_mlp": 1.01829398, + "epoch": 0.1801893882459041, + "flos": 23750518913280.0, + "grad_norm": 1.698811194296839, + "language_loss": 0.88344884, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.9052068, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.14526367, + "step": 2997, + "time_per_iteration": 2.6782453060150146 + }, + { + "auxiliary_loss_clip": 0.01150475, + "auxiliary_loss_mlp": 0.01043405, + "balance_loss_clip": 1.04877996, + "balance_loss_mlp": 1.02732933, + "epoch": 0.18024951149857207, + "flos": 32787986673600.0, + "grad_norm": 1.8226032872315585, + "language_loss": 0.85539019, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87732899, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.16070557, + "step": 2998, + "time_per_iteration": 2.694082260131836 + }, + { + "auxiliary_loss_clip": 0.01147959, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.05394208, + "balance_loss_mlp": 1.02161419, + "epoch": 0.18030963475124004, + "flos": 36883334789280.0, + "grad_norm": 1.817449305998101, + "language_loss": 0.84441781, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86625695, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.14343262, + "step": 2999, + "time_per_iteration": 2.7866742610931396 + }, + { + "auxiliary_loss_clip": 0.01146946, + "auxiliary_loss_mlp": 0.01039966, + "balance_loss_clip": 1.04948258, + "balance_loss_mlp": 1.02352726, + "epoch": 0.180369758003908, + "flos": 27219963958560.0, + "grad_norm": 1.9203033157709322, + "language_loss": 0.75037843, + "learning_rate": 3.76746109252814e-06, + "loss": 0.77224755, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.16418457, + "step": 3000, + "time_per_iteration": 2.702864646911621 + }, + { + "auxiliary_loss_clip": 0.01148785, + "auxiliary_loss_mlp": 0.01049098, + "balance_loss_clip": 1.052724, + "balance_loss_mlp": 1.03374362, + "epoch": 0.18042988125657597, + "flos": 28958150708640.0, + "grad_norm": 1.8542968911146611, + "language_loss": 0.71020079, + "learning_rate": 3.76727879248177e-06, + "loss": 0.73217958, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.15356445, + "step": 3001, + "time_per_iteration": 3.045454263687134 + }, + { + "auxiliary_loss_clip": 0.01153711, + "auxiliary_loss_mlp": 0.01050506, + "balance_loss_clip": 1.05226767, + "balance_loss_mlp": 1.03446078, + "epoch": 0.18049000450924396, + "flos": 29399644522080.0, + "grad_norm": 2.084419133997616, + "language_loss": 0.87930143, + "learning_rate": 3.767096425420011e-06, + "loss": 0.90134358, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.16052246, + "step": 3002, + "time_per_iteration": 2.6788747310638428 + }, + { + "auxiliary_loss_clip": 0.01147225, + "auxiliary_loss_mlp": 0.01045444, + "balance_loss_clip": 1.04993761, + "balance_loss_mlp": 1.03054333, + "epoch": 0.18055012776191193, + "flos": 27043698674880.0, + "grad_norm": 1.958582444765506, + "language_loss": 0.80358255, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.82550931, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 0.97314453, + "router_z_loss_mlp": 0.14904785, + "step": 3003, + "time_per_iteration": 2.709468364715576 + }, + { + "auxiliary_loss_clip": 0.01151521, + "auxiliary_loss_mlp": 0.01041, + "balance_loss_clip": 1.05219316, + "balance_loss_mlp": 1.02570534, + "epoch": 0.1806102510145799, + "flos": 35280942910560.0, + "grad_norm": 2.1640373969073328, + "language_loss": 0.66808456, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69000977, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 0.99316406, + "router_z_loss_mlp": 0.15270996, + "step": 3004, + "time_per_iteration": 2.757054328918457 + }, + { + "auxiliary_loss_clip": 0.01150758, + "auxiliary_loss_mlp": 0.01046913, + "balance_loss_clip": 1.05195689, + "balance_loss_mlp": 1.031165, + "epoch": 0.18067037426724786, + "flos": 23215078228320.0, + "grad_norm": 1.7638620905733466, + "language_loss": 0.85449886, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.87647557, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.15759277, + "step": 3005, + "time_per_iteration": 2.645453453063965 + }, + { + "auxiliary_loss_clip": 0.01145879, + "auxiliary_loss_mlp": 0.01043204, + "balance_loss_clip": 1.05047441, + "balance_loss_mlp": 1.02877927, + "epoch": 0.18073049751991582, + "flos": 33499530573120.0, + "grad_norm": 1.5890547400551263, + "language_loss": 0.83440822, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85629904, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.14440918, + "step": 3006, + "time_per_iteration": 2.713791608810425 + }, + { + "auxiliary_loss_clip": 0.01147466, + "auxiliary_loss_mlp": 0.01044834, + "balance_loss_clip": 1.04993844, + "balance_loss_mlp": 1.02881217, + "epoch": 0.1807906207725838, + "flos": 35056022724000.0, + "grad_norm": 1.7456125209057833, + "language_loss": 0.77019221, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79211521, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.16040039, + "step": 3007, + "time_per_iteration": 2.7075443267822266 + }, + { + "auxiliary_loss_clip": 0.01059222, + "auxiliary_loss_mlp": 0.01007708, + "balance_loss_clip": 1.02413106, + "balance_loss_mlp": 1.00535488, + "epoch": 0.18085074402525175, + "flos": 78660626878080.0, + "grad_norm": 0.7972526837399962, + "language_loss": 0.56926882, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.58993816, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 0.35058594, + "router_z_loss_mlp": 0.02351379, + "step": 3008, + "time_per_iteration": 3.4314541816711426 + }, + { + "auxiliary_loss_clip": 0.01152872, + "auxiliary_loss_mlp": 0.01045695, + "balance_loss_clip": 1.054443, + "balance_loss_mlp": 1.0291847, + "epoch": 0.18091086727791975, + "flos": 28646860381920.0, + "grad_norm": 2.669750990370078, + "language_loss": 0.67544502, + "learning_rate": 3.765817980138021e-06, + "loss": 0.69743073, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.16516113, + "step": 3009, + "time_per_iteration": 2.6798436641693115 + }, + { + "auxiliary_loss_clip": 0.01151196, + "auxiliary_loss_mlp": 0.01042509, + "balance_loss_clip": 1.05275595, + "balance_loss_mlp": 1.02794743, + "epoch": 0.1809709905305877, + "flos": 29493105186240.0, + "grad_norm": 1.8535648450162043, + "language_loss": 0.75635809, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.7782951, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.14562988, + "step": 3010, + "time_per_iteration": 2.7007880210876465 + }, + { + "auxiliary_loss_clip": 0.01142297, + "auxiliary_loss_mlp": 0.01037494, + "balance_loss_clip": 1.04973328, + "balance_loss_mlp": 1.02356434, + "epoch": 0.18103111378325568, + "flos": 26418727501920.0, + "grad_norm": 1.493960784162108, + "language_loss": 0.67394388, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.69574177, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.1394043, + "step": 3011, + "time_per_iteration": 2.695183753967285 + }, + { + "auxiliary_loss_clip": 0.01143902, + "auxiliary_loss_mlp": 0.01043192, + "balance_loss_clip": 1.04885495, + "balance_loss_mlp": 1.02808845, + "epoch": 0.18109123703592364, + "flos": 65511395153280.0, + "grad_norm": 1.6146540367726157, + "language_loss": 0.71346521, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73533612, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.15100098, + "step": 3012, + "time_per_iteration": 2.9915053844451904 + }, + { + "auxiliary_loss_clip": 0.01145296, + "auxiliary_loss_mlp": 0.01042211, + "balance_loss_clip": 1.05230677, + "balance_loss_mlp": 1.0276978, + "epoch": 0.1811513602885916, + "flos": 43739987502240.0, + "grad_norm": 1.8582884411174814, + "language_loss": 0.6171242, + "learning_rate": 3.765085966704609e-06, + "loss": 0.63899934, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.14520264, + "step": 3013, + "time_per_iteration": 3.0586485862731934 + }, + { + "auxiliary_loss_clip": 0.01149356, + "auxiliary_loss_mlp": 0.01042616, + "balance_loss_clip": 1.05225754, + "balance_loss_mlp": 1.02822733, + "epoch": 0.18121148354125957, + "flos": 28958758467840.0, + "grad_norm": 1.671361661516945, + "language_loss": 0.75974369, + "learning_rate": 3.764902795998309e-06, + "loss": 0.78166342, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.14398193, + "step": 3014, + "time_per_iteration": 4.121519565582275 + }, + { + "auxiliary_loss_clip": 0.01154908, + "auxiliary_loss_mlp": 0.01041317, + "balance_loss_clip": 1.05399227, + "balance_loss_mlp": 1.02428174, + "epoch": 0.18127160679392756, + "flos": 35055212378400.0, + "grad_norm": 1.830705349099661, + "language_loss": 0.65743768, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.67939997, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 1.00927734, + "router_z_loss_mlp": 0.17028809, + "step": 3015, + "time_per_iteration": 2.7615230083465576 + }, + { + "auxiliary_loss_clip": 0.01146002, + "auxiliary_loss_mlp": 0.01037423, + "balance_loss_clip": 1.05210459, + "balance_loss_mlp": 1.02242076, + "epoch": 0.18133173004659553, + "flos": 24995923323840.0, + "grad_norm": 1.998146170233956, + "language_loss": 0.78156567, + "learning_rate": 3.764536253816785e-06, + "loss": 0.80339992, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.14984131, + "step": 3016, + "time_per_iteration": 4.141610383987427 + }, + { + "auxiliary_loss_clip": 0.011547, + "auxiliary_loss_mlp": 0.01047858, + "balance_loss_clip": 1.05543923, + "balance_loss_mlp": 1.03181195, + "epoch": 0.1813918532992635, + "flos": 27884757267360.0, + "grad_norm": 1.6756105152297085, + "language_loss": 0.83151603, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85354155, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.16040039, + "step": 3017, + "time_per_iteration": 2.6973726749420166 + }, + { + "auxiliary_loss_clip": 0.01145232, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.05178046, + "balance_loss_mlp": 1.02085483, + "epoch": 0.18145197655193146, + "flos": 44007849655200.0, + "grad_norm": 2.0996323117006357, + "language_loss": 0.67121631, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69302702, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.14978027, + "step": 3018, + "time_per_iteration": 2.834723472595215 + }, + { + "auxiliary_loss_clip": 0.01149394, + "auxiliary_loss_mlp": 0.01029782, + "balance_loss_clip": 1.05170071, + "balance_loss_mlp": 1.01460075, + "epoch": 0.18151209980459942, + "flos": 29314368348480.0, + "grad_norm": 2.011434263802063, + "language_loss": 0.76275831, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.78455007, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.15185547, + "step": 3019, + "time_per_iteration": 4.1419289112091064 + }, + { + "auxiliary_loss_clip": 0.01150731, + "auxiliary_loss_mlp": 0.0103913, + "balance_loss_clip": 1.05284142, + "balance_loss_mlp": 1.02191591, + "epoch": 0.1815722230572674, + "flos": 29226053378880.0, + "grad_norm": 2.264705377731266, + "language_loss": 0.81534785, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.83724642, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.17211914, + "step": 3020, + "time_per_iteration": 2.6729326248168945 + }, + { + "auxiliary_loss_clip": 0.01149896, + "auxiliary_loss_mlp": 0.01037523, + "balance_loss_clip": 1.05192578, + "balance_loss_mlp": 1.01982093, + "epoch": 0.18163234630993536, + "flos": 29756956128480.0, + "grad_norm": 1.888313680291257, + "language_loss": 0.7692458, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79112005, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.17712402, + "step": 3021, + "time_per_iteration": 4.128240346908569 + }, + { + "auxiliary_loss_clip": 0.01143416, + "auxiliary_loss_mlp": 0.01035604, + "balance_loss_clip": 1.04821968, + "balance_loss_mlp": 1.02051198, + "epoch": 0.18169246956260335, + "flos": 30116496185280.0, + "grad_norm": 1.8656378770255373, + "language_loss": 0.85269082, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87448102, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.15087891, + "step": 3022, + "time_per_iteration": 2.7283153533935547 + }, + { + "auxiliary_loss_clip": 0.01150136, + "auxiliary_loss_mlp": 0.01038675, + "balance_loss_clip": 1.05111861, + "balance_loss_mlp": 1.02198541, + "epoch": 0.1817525928152713, + "flos": 29581987397760.0, + "grad_norm": 2.0155669194684815, + "language_loss": 0.6960367, + "learning_rate": 3.763251248837859e-06, + "loss": 0.71792477, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.16687012, + "step": 3023, + "time_per_iteration": 2.6852803230285645 + }, + { + "auxiliary_loss_clip": 0.0114868, + "auxiliary_loss_mlp": 0.01037595, + "balance_loss_clip": 1.05169165, + "balance_loss_mlp": 1.02203846, + "epoch": 0.18181271606793928, + "flos": 20098933578720.0, + "grad_norm": 1.788529287185815, + "language_loss": 0.74099278, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76285559, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.15533447, + "step": 3024, + "time_per_iteration": 2.920405387878418 + }, + { + "auxiliary_loss_clip": 0.011493, + "auxiliary_loss_mlp": 0.01037195, + "balance_loss_clip": 1.05153143, + "balance_loss_mlp": 1.02126873, + "epoch": 0.18187283932060724, + "flos": 22673114261280.0, + "grad_norm": 2.2441281424397688, + "language_loss": 0.88634652, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90821141, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.15917969, + "step": 3025, + "time_per_iteration": 2.6186859607696533 + }, + { + "auxiliary_loss_clip": 0.01150537, + "auxiliary_loss_mlp": 0.0104519, + "balance_loss_clip": 1.05403829, + "balance_loss_mlp": 1.02847695, + "epoch": 0.1819329625732752, + "flos": 24729560310240.0, + "grad_norm": 1.7696334124055935, + "language_loss": 0.78742254, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.80937982, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.16723633, + "step": 3026, + "time_per_iteration": 2.6809492111206055 + }, + { + "auxiliary_loss_clip": 0.01152504, + "auxiliary_loss_mlp": 0.0105294, + "balance_loss_clip": 1.05315638, + "balance_loss_mlp": 1.03647161, + "epoch": 0.18199308582594317, + "flos": 31622753259360.0, + "grad_norm": 1.5871258326557458, + "language_loss": 0.75954425, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78159869, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 0.99316406, + "router_z_loss_mlp": 0.16473389, + "step": 3027, + "time_per_iteration": 2.6710550785064697 + }, + { + "auxiliary_loss_clip": 0.01152792, + "auxiliary_loss_mlp": 0.01048381, + "balance_loss_clip": 1.0513643, + "balance_loss_mlp": 1.03065491, + "epoch": 0.18205320907861114, + "flos": 18763269369120.0, + "grad_norm": 2.219232341850235, + "language_loss": 0.85548031, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87749207, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.17736816, + "step": 3028, + "time_per_iteration": 2.686800956726074 + }, + { + "auxiliary_loss_clip": 0.0114899, + "auxiliary_loss_mlp": 0.01040974, + "balance_loss_clip": 1.0525471, + "balance_loss_mlp": 1.02544129, + "epoch": 0.18211333233127913, + "flos": 31267710620640.0, + "grad_norm": 1.7650330824063083, + "language_loss": 0.82662517, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.84852481, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.15527344, + "step": 3029, + "time_per_iteration": 2.7383248805999756 + }, + { + "auxiliary_loss_clip": 0.01152543, + "auxiliary_loss_mlp": 0.01037428, + "balance_loss_clip": 1.05263019, + "balance_loss_mlp": 1.0209775, + "epoch": 0.1821734555839471, + "flos": 18273323239200.0, + "grad_norm": 1.8820896921983836, + "language_loss": 0.78373015, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80562985, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 0.99951172, + "router_z_loss_mlp": 0.16442871, + "step": 3030, + "time_per_iteration": 2.648861885070801 + }, + { + "auxiliary_loss_clip": 0.0115183, + "auxiliary_loss_mlp": 0.01039973, + "balance_loss_clip": 1.0518434, + "balance_loss_mlp": 1.02355838, + "epoch": 0.18223357883661506, + "flos": 24640678098720.0, + "grad_norm": 1.9442852747287542, + "language_loss": 0.8500585, + "learning_rate": 3.761778660099352e-06, + "loss": 0.87197661, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 0.99951172, + "router_z_loss_mlp": 0.16418457, + "step": 3031, + "time_per_iteration": 2.690215587615967 + }, + { + "auxiliary_loss_clip": 0.01150574, + "auxiliary_loss_mlp": 0.01039719, + "balance_loss_clip": 1.0516088, + "balance_loss_mlp": 1.02431703, + "epoch": 0.18229370208928303, + "flos": 18585626497920.0, + "grad_norm": 1.9552230046882908, + "language_loss": 0.79712272, + "learning_rate": 3.76159428580299e-06, + "loss": 0.8190257, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.15393066, + "step": 3032, + "time_per_iteration": 2.6762044429779053 + }, + { + "auxiliary_loss_clip": 0.01157254, + "auxiliary_loss_mlp": 0.01043952, + "balance_loss_clip": 1.05508447, + "balance_loss_mlp": 1.02722692, + "epoch": 0.182353825341951, + "flos": 29090582645760.0, + "grad_norm": 1.8595390343846414, + "language_loss": 0.81026495, + "learning_rate": 3.761409844706795e-06, + "loss": 0.832277, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.16723633, + "step": 3033, + "time_per_iteration": 2.6930692195892334 + }, + { + "auxiliary_loss_clip": 0.01058942, + "auxiliary_loss_mlp": 0.01008413, + "balance_loss_clip": 1.02355671, + "balance_loss_mlp": 1.0064801, + "epoch": 0.18241394859461896, + "flos": 74666147738400.0, + "grad_norm": 0.8815768484415816, + "language_loss": 0.63603818, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.6567117, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 0.35327148, + "router_z_loss_mlp": 0.01930237, + "step": 3034, + "time_per_iteration": 3.1885221004486084 + }, + { + "auxiliary_loss_clip": 0.01150622, + "auxiliary_loss_mlp": 0.01040157, + "balance_loss_clip": 1.05270529, + "balance_loss_mlp": 1.02415895, + "epoch": 0.18247407184728695, + "flos": 22539669392160.0, + "grad_norm": 1.9169320206604026, + "language_loss": 0.79974008, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.82164782, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.15991211, + "step": 3035, + "time_per_iteration": 2.7210967540740967 + }, + { + "auxiliary_loss_clip": 0.01145367, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.04933739, + "balance_loss_mlp": 1.02383065, + "epoch": 0.18253419509995492, + "flos": 26594952268320.0, + "grad_norm": 1.853812893049906, + "language_loss": 0.84494102, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.86678863, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.15563965, + "step": 3036, + "time_per_iteration": 2.770688772201538 + }, + { + "auxiliary_loss_clip": 0.01145076, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.0513339, + "balance_loss_mlp": 1.02087164, + "epoch": 0.18259431835262288, + "flos": 24587404225920.0, + "grad_norm": 2.010273342300416, + "language_loss": 0.7988196, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82063162, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.15258789, + "step": 3037, + "time_per_iteration": 2.8475160598754883 + }, + { + "auxiliary_loss_clip": 0.01152973, + "auxiliary_loss_mlp": 0.01038366, + "balance_loss_clip": 1.05451488, + "balance_loss_mlp": 1.02029395, + "epoch": 0.18265444160529085, + "flos": 20720865955680.0, + "grad_norm": 2.919599132246742, + "language_loss": 0.79843628, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.82034969, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.18078613, + "step": 3038, + "time_per_iteration": 2.6731064319610596 + }, + { + "auxiliary_loss_clip": 0.01147218, + "auxiliary_loss_mlp": 0.01040536, + "balance_loss_clip": 1.0505662, + "balance_loss_mlp": 1.02494311, + "epoch": 0.1827145648579588, + "flos": 42004312823520.0, + "grad_norm": 2.1648461018948546, + "language_loss": 0.67826933, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.70014685, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.15576172, + "step": 3039, + "time_per_iteration": 2.7838149070739746 + }, + { + "auxiliary_loss_clip": 0.01150876, + "auxiliary_loss_mlp": 0.01041592, + "balance_loss_clip": 1.0526396, + "balance_loss_mlp": 1.02565348, + "epoch": 0.18277468811062678, + "flos": 65023272300960.0, + "grad_norm": 1.6767430297393524, + "language_loss": 0.73436278, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.75628746, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.1595459, + "step": 3040, + "time_per_iteration": 3.0035526752471924 + }, + { + "auxiliary_loss_clip": 0.011483, + "auxiliary_loss_mlp": 0.01040361, + "balance_loss_clip": 1.0512414, + "balance_loss_mlp": 1.02373087, + "epoch": 0.18283481136329474, + "flos": 38620954297440.0, + "grad_norm": 2.6981897371584127, + "language_loss": 0.60502028, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62690681, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.16638184, + "step": 3041, + "time_per_iteration": 2.7340612411499023 + }, + { + "auxiliary_loss_clip": 0.01148426, + "auxiliary_loss_mlp": 0.01042145, + "balance_loss_clip": 1.05136538, + "balance_loss_mlp": 1.02578902, + "epoch": 0.18289493461596273, + "flos": 64841334598080.0, + "grad_norm": 1.952637766863619, + "language_loss": 0.5984714, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.62037706, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 0.97021484, + "router_z_loss_mlp": 0.16357422, + "step": 3042, + "time_per_iteration": 3.022183656692505 + }, + { + "auxiliary_loss_clip": 0.0115098, + "auxiliary_loss_mlp": 0.01041639, + "balance_loss_clip": 1.05434024, + "balance_loss_mlp": 1.02646399, + "epoch": 0.1829550578686307, + "flos": 31224930723360.0, + "grad_norm": 1.8130314516651855, + "language_loss": 0.87338829, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.89531451, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 0.96679688, + "router_z_loss_mlp": 0.15185547, + "step": 3043, + "time_per_iteration": 2.6972155570983887 + }, + { + "auxiliary_loss_clip": 0.01149912, + "auxiliary_loss_mlp": 0.01043524, + "balance_loss_clip": 1.05156612, + "balance_loss_mlp": 1.02727568, + "epoch": 0.18301518112129866, + "flos": 27578450566080.0, + "grad_norm": 1.7802399735451342, + "language_loss": 0.70757604, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.72951037, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.16271973, + "step": 3044, + "time_per_iteration": 2.689429759979248 + }, + { + "auxiliary_loss_clip": 0.01153856, + "auxiliary_loss_mlp": 0.01046417, + "balance_loss_clip": 1.05279613, + "balance_loss_mlp": 1.02926302, + "epoch": 0.18307530437396663, + "flos": 41513313244320.0, + "grad_norm": 2.1382065117044737, + "language_loss": 0.63878894, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66079164, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 1.00976562, + "router_z_loss_mlp": 0.17150879, + "step": 3045, + "time_per_iteration": 2.7542457580566406 + }, + { + "auxiliary_loss_clip": 0.0114906, + "auxiliary_loss_mlp": 0.01043687, + "balance_loss_clip": 1.05224526, + "balance_loss_mlp": 1.02765334, + "epoch": 0.1831354276266346, + "flos": 25966415574720.0, + "grad_norm": 2.689455164182377, + "language_loss": 0.79414386, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.81607127, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.16027832, + "step": 3046, + "time_per_iteration": 2.6611311435699463 + }, + { + "auxiliary_loss_clip": 0.01150206, + "auxiliary_loss_mlp": 0.01039806, + "balance_loss_clip": 1.04991579, + "balance_loss_mlp": 1.02330732, + "epoch": 0.18319555087930256, + "flos": 25664444222400.0, + "grad_norm": 1.774050240724348, + "language_loss": 0.7893672, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.81126738, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.16503906, + "step": 3047, + "time_per_iteration": 2.72695255279541 + }, + { + "auxiliary_loss_clip": 0.01146898, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.05283153, + "balance_loss_mlp": 1.02424574, + "epoch": 0.18325567413197055, + "flos": 41955495851520.0, + "grad_norm": 1.5150903962938074, + "language_loss": 0.80736303, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.82922566, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.15136719, + "step": 3048, + "time_per_iteration": 2.791926383972168 + }, + { + "auxiliary_loss_clip": 0.01150982, + "auxiliary_loss_mlp": 0.01038173, + "balance_loss_clip": 1.05200171, + "balance_loss_mlp": 1.02195501, + "epoch": 0.18331579738463852, + "flos": 25085453811840.0, + "grad_norm": 3.372251961472111, + "language_loss": 0.86613512, + "learning_rate": 3.758449708105424e-06, + "loss": 0.88802665, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.16217041, + "step": 3049, + "time_per_iteration": 2.9392411708831787 + }, + { + "auxiliary_loss_clip": 0.01155283, + "auxiliary_loss_mlp": 0.01042615, + "balance_loss_clip": 1.05210412, + "balance_loss_mlp": 1.02578306, + "epoch": 0.18337592063730648, + "flos": 23925649713120.0, + "grad_norm": 2.372075426805791, + "language_loss": 0.77643543, + "learning_rate": 3.75826413248424e-06, + "loss": 0.79841447, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 1.03076172, + "router_z_loss_mlp": 0.16845703, + "step": 3050, + "time_per_iteration": 2.6445250511169434 + }, + { + "auxiliary_loss_clip": 0.0114542, + "auxiliary_loss_mlp": 0.01036856, + "balance_loss_clip": 1.04856098, + "balance_loss_mlp": 1.02128732, + "epoch": 0.18343604388997445, + "flos": 25442238693600.0, + "grad_norm": 2.4029438414413558, + "language_loss": 0.99255836, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.01438117, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.15563965, + "step": 3051, + "time_per_iteration": 2.690349578857422 + }, + { + "auxiliary_loss_clip": 0.01144796, + "auxiliary_loss_mlp": 0.01037226, + "balance_loss_clip": 1.04986989, + "balance_loss_mlp": 1.02128768, + "epoch": 0.1834961671426424, + "flos": 29765869930080.0, + "grad_norm": 1.6691618541303779, + "language_loss": 0.86341763, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.88523787, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 0.94970703, + "router_z_loss_mlp": 0.15930176, + "step": 3052, + "time_per_iteration": 2.720628023147583 + }, + { + "auxiliary_loss_clip": 0.01144272, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.04894912, + "balance_loss_mlp": 1.02183962, + "epoch": 0.18355629039531038, + "flos": 25931293443360.0, + "grad_norm": 2.0261340900498293, + "language_loss": 0.73456788, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.7563839, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.15490723, + "step": 3053, + "time_per_iteration": 2.676039218902588 + }, + { + "auxiliary_loss_clip": 0.01151067, + "auxiliary_loss_mlp": 0.01040002, + "balance_loss_clip": 1.05262673, + "balance_loss_mlp": 1.023718, + "epoch": 0.18361641364797834, + "flos": 34967464650720.0, + "grad_norm": 1.926827590812356, + "language_loss": 0.62107635, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.64298701, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.16296387, + "step": 3054, + "time_per_iteration": 4.2414257526397705 + }, + { + "auxiliary_loss_clip": 0.01146027, + "auxiliary_loss_mlp": 0.01039086, + "balance_loss_clip": 1.04975677, + "balance_loss_mlp": 1.02407706, + "epoch": 0.18367653690064634, + "flos": 25525610555040.0, + "grad_norm": 2.612872615848965, + "language_loss": 0.7834779, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80532897, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.15026855, + "step": 3055, + "time_per_iteration": 4.214590072631836 + }, + { + "auxiliary_loss_clip": 0.0114387, + "auxiliary_loss_mlp": 0.01046356, + "balance_loss_clip": 1.04968202, + "balance_loss_mlp": 1.03118706, + "epoch": 0.1837366601533143, + "flos": 35100625898880.0, + "grad_norm": 1.7538521623387209, + "language_loss": 0.70007563, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72197783, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.15167236, + "step": 3056, + "time_per_iteration": 2.7322192192077637 + }, + { + "auxiliary_loss_clip": 0.0114667, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.05057764, + "balance_loss_mlp": 1.02083099, + "epoch": 0.18379678340598227, + "flos": 25931090856960.0, + "grad_norm": 1.5777512973945222, + "language_loss": 0.80336326, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82519192, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.15368652, + "step": 3057, + "time_per_iteration": 2.6791255474090576 + }, + { + "auxiliary_loss_clip": 0.01153275, + "auxiliary_loss_mlp": 0.01043956, + "balance_loss_clip": 1.05003846, + "balance_loss_mlp": 1.02582407, + "epoch": 0.18385690665865023, + "flos": 24952576184640.0, + "grad_norm": 2.3855381202452226, + "language_loss": 0.82756913, + "learning_rate": 3.756777127858533e-06, + "loss": 0.84954143, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 1.03271484, + "router_z_loss_mlp": 0.18127441, + "step": 3058, + "time_per_iteration": 4.223698854446411 + }, + { + "auxiliary_loss_clip": 0.01145601, + "auxiliary_loss_mlp": 0.01044947, + "balance_loss_clip": 1.04780626, + "balance_loss_mlp": 1.02958131, + "epoch": 0.1839170299113182, + "flos": 31897179211680.0, + "grad_norm": 3.0676785285481953, + "language_loss": 0.8611241, + "learning_rate": 3.756590952429017e-06, + "loss": 0.88302958, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.15356445, + "step": 3059, + "time_per_iteration": 2.7378454208374023 + }, + { + "auxiliary_loss_clip": 0.01141508, + "auxiliary_loss_mlp": 0.01039619, + "balance_loss_clip": 1.04736626, + "balance_loss_mlp": 1.02468812, + "epoch": 0.18397715316398616, + "flos": 38752170716160.0, + "grad_norm": 1.7945481886088552, + "language_loss": 0.72748709, + "learning_rate": 3.756404710389396e-06, + "loss": 0.74929833, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.14923096, + "step": 3060, + "time_per_iteration": 4.4624412059783936 + }, + { + "auxiliary_loss_clip": 0.01148427, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.05125999, + "balance_loss_mlp": 1.02178288, + "epoch": 0.18403727641665413, + "flos": 30031544149920.0, + "grad_norm": 1.7561396386811792, + "language_loss": 0.72681582, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.74867624, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 0.97070312, + "router_z_loss_mlp": 0.1583252, + "step": 3061, + "time_per_iteration": 2.694420099258423 + }, + { + "auxiliary_loss_clip": 0.01145902, + "auxiliary_loss_mlp": 0.01041298, + "balance_loss_clip": 1.05071497, + "balance_loss_mlp": 1.02557433, + "epoch": 0.18409739966932212, + "flos": 28601689965120.0, + "grad_norm": 3.865183912370833, + "language_loss": 0.81671715, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.83858919, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.15722656, + "step": 3062, + "time_per_iteration": 2.7154147624969482 + }, + { + "auxiliary_loss_clip": 0.01151974, + "auxiliary_loss_mlp": 0.01038702, + "balance_loss_clip": 1.05200052, + "balance_loss_mlp": 1.02227473, + "epoch": 0.18415752292199009, + "flos": 26688453449760.0, + "grad_norm": 1.9277960362260504, + "language_loss": 0.73411918, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.75602591, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.16418457, + "step": 3063, + "time_per_iteration": 2.6839263439178467 + }, + { + "auxiliary_loss_clip": 0.01142511, + "auxiliary_loss_mlp": 0.01039876, + "balance_loss_clip": 1.0475924, + "balance_loss_mlp": 1.02592826, + "epoch": 0.18421764617465805, + "flos": 31006533818880.0, + "grad_norm": 2.3249387453045323, + "language_loss": 0.65967619, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.68150008, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.13946533, + "step": 3064, + "time_per_iteration": 2.6943767070770264 + }, + { + "auxiliary_loss_clip": 0.01146262, + "auxiliary_loss_mlp": 0.01039071, + "balance_loss_clip": 1.05127203, + "balance_loss_mlp": 1.0236336, + "epoch": 0.18427776942732602, + "flos": 33187754039040.0, + "grad_norm": 2.041536611565689, + "language_loss": 0.69025576, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.71210909, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.15429688, + "step": 3065, + "time_per_iteration": 2.7193386554718018 + }, + { + "auxiliary_loss_clip": 0.0115208, + "auxiliary_loss_mlp": 0.01040343, + "balance_loss_clip": 1.05318689, + "balance_loss_mlp": 1.0240829, + "epoch": 0.18433789267999398, + "flos": 33986316355200.0, + "grad_norm": 4.606519466393612, + "language_loss": 0.73054028, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.75246453, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.16259766, + "step": 3066, + "time_per_iteration": 2.720345973968506 + }, + { + "auxiliary_loss_clip": 0.01149373, + "auxiliary_loss_mlp": 0.01035889, + "balance_loss_clip": 1.0521884, + "balance_loss_mlp": 1.02154851, + "epoch": 0.18439801593266195, + "flos": 21788992150560.0, + "grad_norm": 2.092272126511206, + "language_loss": 0.82082248, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.84267515, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.14331055, + "step": 3067, + "time_per_iteration": 2.6506123542785645 + }, + { + "auxiliary_loss_clip": 0.01064883, + "auxiliary_loss_mlp": 0.01016269, + "balance_loss_clip": 1.03027427, + "balance_loss_mlp": 1.01426756, + "epoch": 0.18445813918532994, + "flos": 68805715749120.0, + "grad_norm": 0.7966322706472528, + "language_loss": 0.5972563, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61806786, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 0.34619141, + "router_z_loss_mlp": 0.02000427, + "step": 3068, + "time_per_iteration": 3.1351394653320312 + }, + { + "auxiliary_loss_clip": 0.01146575, + "auxiliary_loss_mlp": 0.01037849, + "balance_loss_clip": 1.0528754, + "balance_loss_mlp": 1.02314496, + "epoch": 0.1845182624379979, + "flos": 25572239593920.0, + "grad_norm": 1.7203017957427738, + "language_loss": 0.7620635, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78390777, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.14715576, + "step": 3069, + "time_per_iteration": 2.6660666465759277 + }, + { + "auxiliary_loss_clip": 0.01147811, + "auxiliary_loss_mlp": 0.0103859, + "balance_loss_clip": 1.05110657, + "balance_loss_mlp": 1.02330685, + "epoch": 0.18457838569066587, + "flos": 24996044875680.0, + "grad_norm": 1.8933760700246887, + "language_loss": 0.84833503, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.87019897, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.1529541, + "step": 3070, + "time_per_iteration": 2.669238328933716 + }, + { + "auxiliary_loss_clip": 0.0114752, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.05005908, + "balance_loss_mlp": 1.02109933, + "epoch": 0.18463850894333383, + "flos": 30517560103680.0, + "grad_norm": 2.2328353198889817, + "language_loss": 0.77919137, + "learning_rate": 3.754351653708265e-06, + "loss": 0.80103707, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.15942383, + "step": 3071, + "time_per_iteration": 2.6919398307800293 + }, + { + "auxiliary_loss_clip": 0.01149664, + "auxiliary_loss_mlp": 0.01042711, + "balance_loss_clip": 1.05254698, + "balance_loss_mlp": 1.02721429, + "epoch": 0.1846986321960018, + "flos": 20499835428000.0, + "grad_norm": 3.9959157423330445, + "language_loss": 0.77188766, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79381132, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 0.97070312, + "router_z_loss_mlp": 0.15484619, + "step": 3072, + "time_per_iteration": 2.895946502685547 + }, + { + "auxiliary_loss_clip": 0.01147241, + "auxiliary_loss_mlp": 0.01038312, + "balance_loss_clip": 1.04850328, + "balance_loss_mlp": 1.02193236, + "epoch": 0.18475875544866976, + "flos": 25397635518720.0, + "grad_norm": 2.3521001224314566, + "language_loss": 0.86152005, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88337553, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.16394043, + "step": 3073, + "time_per_iteration": 2.6737191677093506 + }, + { + "auxiliary_loss_clip": 0.01148007, + "auxiliary_loss_mlp": 0.01040878, + "balance_loss_clip": 1.05195904, + "balance_loss_mlp": 1.02653754, + "epoch": 0.18481887870133773, + "flos": 27578450566080.0, + "grad_norm": 2.1206002903441283, + "language_loss": 0.91987765, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.9417665, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.14355469, + "step": 3074, + "time_per_iteration": 2.714076519012451 + }, + { + "auxiliary_loss_clip": 0.01143912, + "auxiliary_loss_mlp": 0.01035828, + "balance_loss_clip": 1.04913938, + "balance_loss_mlp": 1.01962733, + "epoch": 0.18487900195400572, + "flos": 35946546564960.0, + "grad_norm": 1.7413171319420822, + "language_loss": 0.65298355, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67478096, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.1619873, + "step": 3075, + "time_per_iteration": 2.715593099594116 + }, + { + "auxiliary_loss_clip": 0.01146285, + "auxiliary_loss_mlp": 0.01042045, + "balance_loss_clip": 1.05043483, + "balance_loss_mlp": 1.02675009, + "epoch": 0.1849391252066737, + "flos": 25171499813760.0, + "grad_norm": 1.8811379637104924, + "language_loss": 0.72473347, + "learning_rate": 3.753415784551761e-06, + "loss": 0.74661672, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.1529541, + "step": 3076, + "time_per_iteration": 2.6817941665649414 + }, + { + "auxiliary_loss_clip": 0.01148796, + "auxiliary_loss_mlp": 0.0103849, + "balance_loss_clip": 1.05010033, + "balance_loss_mlp": 1.02345753, + "epoch": 0.18499924845934165, + "flos": 17605288548000.0, + "grad_norm": 2.983444787692609, + "language_loss": 0.80166316, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.82353604, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.15032959, + "step": 3077, + "time_per_iteration": 2.6276063919067383 + }, + { + "auxiliary_loss_clip": 0.01142413, + "auxiliary_loss_mlp": 0.01037416, + "balance_loss_clip": 1.04855323, + "balance_loss_mlp": 1.02268207, + "epoch": 0.18505937171200962, + "flos": 28953207600480.0, + "grad_norm": 3.0072539179511857, + "language_loss": 0.78913558, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.81093395, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.14746094, + "step": 3078, + "time_per_iteration": 2.7208144664764404 + }, + { + "auxiliary_loss_clip": 0.01145596, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.05079603, + "balance_loss_mlp": 1.02219868, + "epoch": 0.18511949496467758, + "flos": 31673839199040.0, + "grad_norm": 1.8895483483100337, + "language_loss": 0.77930105, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.80111897, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.13995361, + "step": 3079, + "time_per_iteration": 2.705488681793213 + }, + { + "auxiliary_loss_clip": 0.01140549, + "auxiliary_loss_mlp": 0.01036605, + "balance_loss_clip": 1.04613078, + "balance_loss_mlp": 1.02181125, + "epoch": 0.18517961821734555, + "flos": 51754702070880.0, + "grad_norm": 1.8679726898695932, + "language_loss": 0.81933522, + "learning_rate": 3.752665892369369e-06, + "loss": 0.84110677, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.14788818, + "step": 3080, + "time_per_iteration": 2.919661283493042 + }, + { + "auxiliary_loss_clip": 0.01147423, + "auxiliary_loss_mlp": 0.01038898, + "balance_loss_clip": 1.04921412, + "balance_loss_mlp": 1.02369857, + "epoch": 0.18523974147001354, + "flos": 29404141940160.0, + "grad_norm": 2.082888131961588, + "language_loss": 0.74327093, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.76513422, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.15209961, + "step": 3081, + "time_per_iteration": 2.7151827812194824 + }, + { + "auxiliary_loss_clip": 0.01147185, + "auxiliary_loss_mlp": 0.01043944, + "balance_loss_clip": 1.05084658, + "balance_loss_mlp": 1.02788639, + "epoch": 0.1852998647226815, + "flos": 33404408700480.0, + "grad_norm": 2.013921806196751, + "language_loss": 0.71500415, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.73691541, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.16064453, + "step": 3082, + "time_per_iteration": 2.7533748149871826 + }, + { + "auxiliary_loss_clip": 0.01149326, + "auxiliary_loss_mlp": 0.01043569, + "balance_loss_clip": 1.05099916, + "balance_loss_mlp": 1.02753544, + "epoch": 0.18535998797534947, + "flos": 22369481700480.0, + "grad_norm": 2.3932130874799538, + "language_loss": 0.69807822, + "learning_rate": 3.752102775364407e-06, + "loss": 0.72000718, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.16027832, + "step": 3083, + "time_per_iteration": 2.6468427181243896 + }, + { + "auxiliary_loss_clip": 0.01142746, + "auxiliary_loss_mlp": 0.01041253, + "balance_loss_clip": 1.0493468, + "balance_loss_mlp": 1.02670383, + "epoch": 0.18542011122801744, + "flos": 46181493144000.0, + "grad_norm": 11.623112781485021, + "language_loss": 0.69167006, + "learning_rate": 3.751914936806767e-06, + "loss": 0.71351004, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.14556885, + "step": 3084, + "time_per_iteration": 2.8202743530273438 + }, + { + "auxiliary_loss_clip": 0.01142374, + "auxiliary_loss_mlp": 0.01032385, + "balance_loss_clip": 1.04837489, + "balance_loss_mlp": 1.01792526, + "epoch": 0.1854802344806854, + "flos": 30733039764000.0, + "grad_norm": 1.6490580985045458, + "language_loss": 0.77790701, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.7996546, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.14465332, + "step": 3085, + "time_per_iteration": 2.9456093311309814 + }, + { + "auxiliary_loss_clip": 0.01141519, + "auxiliary_loss_mlp": 0.01041449, + "balance_loss_clip": 1.04668283, + "balance_loss_mlp": 1.02675056, + "epoch": 0.18554035773335337, + "flos": 32561567347680.0, + "grad_norm": 2.044990347803474, + "language_loss": 0.73658544, + "learning_rate": 3.751539060400244e-06, + "loss": 0.75841516, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.14697266, + "step": 3086, + "time_per_iteration": 2.712559223175049 + }, + { + "auxiliary_loss_clip": 0.01144362, + "auxiliary_loss_mlp": 0.01040263, + "balance_loss_clip": 1.05048323, + "balance_loss_mlp": 1.0249213, + "epoch": 0.18560048098602133, + "flos": 27267767998560.0, + "grad_norm": 2.225472255906338, + "language_loss": 0.69795704, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.71980327, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.15356445, + "step": 3087, + "time_per_iteration": 2.666513204574585 + }, + { + "auxiliary_loss_clip": 0.0114677, + "auxiliary_loss_mlp": 0.01044434, + "balance_loss_clip": 1.05004382, + "balance_loss_mlp": 1.02841258, + "epoch": 0.18566060423868933, + "flos": 21656600730720.0, + "grad_norm": 2.0855388851927734, + "language_loss": 0.72794253, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.74985456, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.16027832, + "step": 3088, + "time_per_iteration": 2.621932029724121 + }, + { + "auxiliary_loss_clip": 0.01140983, + "auxiliary_loss_mlp": 0.01038834, + "balance_loss_clip": 1.04775226, + "balance_loss_mlp": 1.0239327, + "epoch": 0.1857207274913573, + "flos": 30111917732640.0, + "grad_norm": 2.189087803914107, + "language_loss": 0.91913885, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94093704, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.14904785, + "step": 3089, + "time_per_iteration": 2.7126502990722656 + }, + { + "auxiliary_loss_clip": 0.01143798, + "auxiliary_loss_mlp": 0.01040077, + "balance_loss_clip": 1.0495528, + "balance_loss_mlp": 1.02510393, + "epoch": 0.18578085074402526, + "flos": 34346545205760.0, + "grad_norm": 5.810230050728649, + "language_loss": 0.58273363, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.60457242, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.14978027, + "step": 3090, + "time_per_iteration": 2.7017781734466553 + }, + { + "auxiliary_loss_clip": 0.01143043, + "auxiliary_loss_mlp": 0.01042014, + "balance_loss_clip": 1.04908609, + "balance_loss_mlp": 1.02719629, + "epoch": 0.18584097399669322, + "flos": 29226336999840.0, + "grad_norm": 1.8578243191024655, + "language_loss": 0.81880689, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.84065747, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.14831543, + "step": 3091, + "time_per_iteration": 2.676473617553711 + }, + { + "auxiliary_loss_clip": 0.01147408, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_clip": 1.04996324, + "balance_loss_mlp": 1.02982712, + "epoch": 0.18590109724936119, + "flos": 20989254833280.0, + "grad_norm": 2.665012257710558, + "language_loss": 0.83425951, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.85618794, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.15612793, + "step": 3092, + "time_per_iteration": 2.644575595855713 + }, + { + "auxiliary_loss_clip": 0.01149851, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.04917955, + "balance_loss_mlp": 1.02898955, + "epoch": 0.18596122050202915, + "flos": 21029806280160.0, + "grad_norm": 2.6776124863188526, + "language_loss": 0.93395394, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95590091, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 1.00683594, + "router_z_loss_mlp": 0.15863037, + "step": 3093, + "time_per_iteration": 2.63883638381958 + }, + { + "auxiliary_loss_clip": 0.01147517, + "auxiliary_loss_mlp": 0.01038019, + "balance_loss_clip": 1.05164027, + "balance_loss_mlp": 1.02298677, + "epoch": 0.18602134375469712, + "flos": 23209486843680.0, + "grad_norm": 1.695663714177885, + "language_loss": 0.77121478, + "learning_rate": 3.750032898603443e-06, + "loss": 0.7930702, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.15026855, + "step": 3094, + "time_per_iteration": 4.16315484046936 + }, + { + "auxiliary_loss_clip": 0.01146239, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.05128765, + "balance_loss_mlp": 1.02712631, + "epoch": 0.1860814670073651, + "flos": 62171748421920.0, + "grad_norm": 1.7062178247482604, + "language_loss": 0.69441032, + "learning_rate": 3.749844329677425e-06, + "loss": 0.71628952, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 0.94970703, + "router_z_loss_mlp": 0.14562988, + "step": 3095, + "time_per_iteration": 4.344973564147949 + }, + { + "auxiliary_loss_clip": 0.01149675, + "auxiliary_loss_mlp": 0.01039988, + "balance_loss_clip": 1.05144382, + "balance_loss_mlp": 1.02381146, + "epoch": 0.18614159026003307, + "flos": 23660785838880.0, + "grad_norm": 2.312299797847983, + "language_loss": 0.80749893, + "learning_rate": 3.749655694397135e-06, + "loss": 0.82939553, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 0.98242188, + "router_z_loss_mlp": 0.16186523, + "step": 3096, + "time_per_iteration": 2.6610465049743652 + }, + { + "auxiliary_loss_clip": 0.01146374, + "auxiliary_loss_mlp": 0.01040771, + "balance_loss_clip": 1.04815245, + "balance_loss_mlp": 1.0252378, + "epoch": 0.18620171351270104, + "flos": 26599003996320.0, + "grad_norm": 2.2696966671973966, + "language_loss": 0.75039488, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.77226639, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.15563965, + "step": 3097, + "time_per_iteration": 2.884392499923706 + }, + { + "auxiliary_loss_clip": 0.01145433, + "auxiliary_loss_mlp": 0.01037172, + "balance_loss_clip": 1.0510937, + "balance_loss_mlp": 1.0220263, + "epoch": 0.186261836765369, + "flos": 19965650778720.0, + "grad_norm": 2.0593231283073465, + "language_loss": 0.66407537, + "learning_rate": 3.749278224802352e-06, + "loss": 0.6859014, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 0.94335938, + "router_z_loss_mlp": 0.15142822, + "step": 3098, + "time_per_iteration": 4.115146636962891 + }, + { + "auxiliary_loss_clip": 0.0115036, + "auxiliary_loss_mlp": 0.01046264, + "balance_loss_clip": 1.05160987, + "balance_loss_mlp": 1.02912128, + "epoch": 0.18632196001803697, + "flos": 28516170687840.0, + "grad_norm": 1.5272220502125755, + "language_loss": 0.69322878, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.71519506, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.17138672, + "step": 3099, + "time_per_iteration": 4.096353530883789 + }, + { + "auxiliary_loss_clip": 0.01146972, + "auxiliary_loss_mlp": 0.01046217, + "balance_loss_clip": 1.04902554, + "balance_loss_mlp": 1.02998078, + "epoch": 0.18638208327070493, + "flos": 27444924662400.0, + "grad_norm": 1.707787256560791, + "language_loss": 0.7187072, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.74063909, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.16223145, + "step": 3100, + "time_per_iteration": 2.6770989894866943 + }, + { + "auxiliary_loss_clip": 0.01150869, + "auxiliary_loss_mlp": 0.01043585, + "balance_loss_clip": 1.05161297, + "balance_loss_mlp": 1.02688372, + "epoch": 0.18644220652337293, + "flos": 35588465130240.0, + "grad_norm": 2.100023964763504, + "language_loss": 0.80229956, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82424414, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.16674805, + "step": 3101, + "time_per_iteration": 2.7388644218444824 + }, + { + "auxiliary_loss_clip": 0.01145774, + "auxiliary_loss_mlp": 0.01039215, + "balance_loss_clip": 1.05117822, + "balance_loss_mlp": 1.02484477, + "epoch": 0.1865023297760409, + "flos": 29582554639680.0, + "grad_norm": 1.9410669933252134, + "language_loss": 0.77277684, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.79462671, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.14361572, + "step": 3102, + "time_per_iteration": 2.7399113178253174 + }, + { + "auxiliary_loss_clip": 0.0114743, + "auxiliary_loss_mlp": 0.01038742, + "balance_loss_clip": 1.04872966, + "balance_loss_mlp": 1.02305353, + "epoch": 0.18656245302870886, + "flos": 23341392056160.0, + "grad_norm": 2.319563108834855, + "language_loss": 0.76677382, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.78863549, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.15698242, + "step": 3103, + "time_per_iteration": 2.6987993717193604 + }, + { + "auxiliary_loss_clip": 0.01147492, + "auxiliary_loss_mlp": 0.01041788, + "balance_loss_clip": 1.05082846, + "balance_loss_mlp": 1.02630281, + "epoch": 0.18662257628137682, + "flos": 21702338389440.0, + "grad_norm": 2.1046302964940033, + "language_loss": 0.78897381, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81086665, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 0.96679688, + "router_z_loss_mlp": 0.15478516, + "step": 3104, + "time_per_iteration": 2.872122049331665 + }, + { + "auxiliary_loss_clip": 0.01147575, + "auxiliary_loss_mlp": 0.01041794, + "balance_loss_clip": 1.05238295, + "balance_loss_mlp": 1.02636838, + "epoch": 0.1866826995340448, + "flos": 29314854555840.0, + "grad_norm": 2.3408580213799213, + "language_loss": 0.85080457, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87269825, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.1541748, + "step": 3105, + "time_per_iteration": 2.7610278129577637 + }, + { + "auxiliary_loss_clip": 0.01150583, + "auxiliary_loss_mlp": 0.01039012, + "balance_loss_clip": 1.04813075, + "balance_loss_mlp": 1.02232313, + "epoch": 0.18674282278671275, + "flos": 31897949040000.0, + "grad_norm": 2.03430326647922, + "language_loss": 0.86916745, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.89106345, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.16687012, + "step": 3106, + "time_per_iteration": 2.7300007343292236 + }, + { + "auxiliary_loss_clip": 0.0115329, + "auxiliary_loss_mlp": 0.01040837, + "balance_loss_clip": 1.05228734, + "balance_loss_mlp": 1.02524436, + "epoch": 0.18680294603938072, + "flos": 23430719957760.0, + "grad_norm": 2.7107594956888517, + "language_loss": 0.7828446, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80478585, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 1.01025391, + "router_z_loss_mlp": 0.15576172, + "step": 3107, + "time_per_iteration": 2.6755616664886475 + }, + { + "auxiliary_loss_clip": 0.01150069, + "auxiliary_loss_mlp": 0.01048216, + "balance_loss_clip": 1.04979062, + "balance_loss_mlp": 1.03109729, + "epoch": 0.1868630692920487, + "flos": 34831467192960.0, + "grad_norm": 2.462221796846266, + "language_loss": 0.73956668, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76154947, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 1.00244141, + "router_z_loss_mlp": 0.17126465, + "step": 3108, + "time_per_iteration": 2.8614940643310547 + }, + { + "auxiliary_loss_clip": 0.01149315, + "auxiliary_loss_mlp": 0.01039743, + "balance_loss_clip": 1.05045724, + "balance_loss_mlp": 1.0232327, + "epoch": 0.18692319254471668, + "flos": 21033614904480.0, + "grad_norm": 1.7669533832869766, + "language_loss": 0.74480021, + "learning_rate": 3.747197400772658e-06, + "loss": 0.76669079, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.16503906, + "step": 3109, + "time_per_iteration": 2.7047555446624756 + }, + { + "auxiliary_loss_clip": 0.01147207, + "auxiliary_loss_mlp": 0.01043753, + "balance_loss_clip": 1.04992843, + "balance_loss_mlp": 1.02701581, + "epoch": 0.18698331579738464, + "flos": 28290804811200.0, + "grad_norm": 1.8291075045843999, + "language_loss": 0.84498823, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86689788, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 0.97167969, + "router_z_loss_mlp": 0.16729736, + "step": 3110, + "time_per_iteration": 2.6684882640838623 + }, + { + "auxiliary_loss_clip": 0.01150104, + "auxiliary_loss_mlp": 0.01034875, + "balance_loss_clip": 1.05306911, + "balance_loss_mlp": 1.01940715, + "epoch": 0.1870434390500526, + "flos": 31136251098240.0, + "grad_norm": 1.6878262491290268, + "language_loss": 0.84417641, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86602628, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.15466309, + "step": 3111, + "time_per_iteration": 2.684950351715088 + }, + { + "auxiliary_loss_clip": 0.01144506, + "auxiliary_loss_mlp": 0.01037414, + "balance_loss_clip": 1.0486896, + "balance_loss_mlp": 1.02217877, + "epoch": 0.18710356230272057, + "flos": 23794797949920.0, + "grad_norm": 2.2140042353645257, + "language_loss": 0.76705736, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.78887659, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.15246582, + "step": 3112, + "time_per_iteration": 2.6312754154205322 + }, + { + "auxiliary_loss_clip": 0.01147874, + "auxiliary_loss_mlp": 0.01041839, + "balance_loss_clip": 1.04956651, + "balance_loss_mlp": 1.02669907, + "epoch": 0.18716368555538854, + "flos": 32654096114400.0, + "grad_norm": 2.254333521118736, + "language_loss": 0.64850605, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.67040324, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.15124512, + "step": 3113, + "time_per_iteration": 2.69628643989563 + }, + { + "auxiliary_loss_clip": 0.0115523, + "auxiliary_loss_mlp": 0.01040206, + "balance_loss_clip": 1.05322242, + "balance_loss_mlp": 1.02394605, + "epoch": 0.1872238088080565, + "flos": 30737213043840.0, + "grad_norm": 2.1957657381876565, + "language_loss": 0.81468546, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83663988, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.16259766, + "step": 3114, + "time_per_iteration": 2.6914877891540527 + }, + { + "auxiliary_loss_clip": 0.01148668, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.05016279, + "balance_loss_mlp": 1.02799678, + "epoch": 0.1872839320607245, + "flos": 29448947701440.0, + "grad_norm": 1.8995922303848456, + "language_loss": 0.57837141, + "learning_rate": 3.74605902628851e-06, + "loss": 0.60031462, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.17651367, + "step": 3115, + "time_per_iteration": 2.674196481704712 + }, + { + "auxiliary_loss_clip": 0.0114846, + "auxiliary_loss_mlp": 0.01047883, + "balance_loss_clip": 1.05268669, + "balance_loss_mlp": 1.03294563, + "epoch": 0.18734405531339246, + "flos": 25836779329920.0, + "grad_norm": 1.9345399009322082, + "language_loss": 0.71266496, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73462844, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.14929199, + "step": 3116, + "time_per_iteration": 2.6800975799560547 + }, + { + "auxiliary_loss_clip": 0.01141329, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.04747343, + "balance_loss_mlp": 1.01705265, + "epoch": 0.18740417856606043, + "flos": 21033250248960.0, + "grad_norm": 2.5200300202798025, + "language_loss": 0.78511465, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.80685347, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.15490723, + "step": 3117, + "time_per_iteration": 2.6685051918029785 + }, + { + "auxiliary_loss_clip": 0.01147132, + "auxiliary_loss_mlp": 0.01042148, + "balance_loss_clip": 1.05337882, + "balance_loss_mlp": 1.02623391, + "epoch": 0.1874643018187284, + "flos": 39728659524480.0, + "grad_norm": 1.88604894394921, + "language_loss": 0.83810735, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86000019, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.15893555, + "step": 3118, + "time_per_iteration": 2.7792088985443115 + }, + { + "auxiliary_loss_clip": 0.01146917, + "auxiliary_loss_mlp": 0.01037629, + "balance_loss_clip": 1.04966187, + "balance_loss_mlp": 1.02239394, + "epoch": 0.18752442507139636, + "flos": 28379849091840.0, + "grad_norm": 1.8342732035722271, + "language_loss": 0.76378936, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.78563476, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.15228271, + "step": 3119, + "time_per_iteration": 2.6953036785125732 + }, + { + "auxiliary_loss_clip": 0.01148114, + "auxiliary_loss_mlp": 0.01044384, + "balance_loss_clip": 1.05033147, + "balance_loss_mlp": 1.02889276, + "epoch": 0.18758454832406432, + "flos": 26552942199360.0, + "grad_norm": 3.147324374469211, + "language_loss": 0.82240665, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84433162, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.15484619, + "step": 3120, + "time_per_iteration": 2.708745241165161 + }, + { + "auxiliary_loss_clip": 0.01141823, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.04779959, + "balance_loss_mlp": 1.02119899, + "epoch": 0.1876446715767323, + "flos": 36082300919040.0, + "grad_norm": 1.7500875704926127, + "language_loss": 0.84796095, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.86973506, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.14404297, + "step": 3121, + "time_per_iteration": 2.9551501274108887 + }, + { + "auxiliary_loss_clip": 0.0114793, + "auxiliary_loss_mlp": 0.01038329, + "balance_loss_clip": 1.05197144, + "balance_loss_mlp": 1.02291501, + "epoch": 0.18770479482940028, + "flos": 37021317593760.0, + "grad_norm": 1.8380074304934255, + "language_loss": 0.70122945, + "learning_rate": 3.744727910244937e-06, + "loss": 0.72309208, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.1541748, + "step": 3122, + "time_per_iteration": 2.7348275184631348 + }, + { + "auxiliary_loss_clip": 0.0114584, + "auxiliary_loss_mlp": 0.01037892, + "balance_loss_clip": 1.05043387, + "balance_loss_mlp": 1.02086866, + "epoch": 0.18776491808206824, + "flos": 17650175343840.0, + "grad_norm": 2.3827675044714343, + "language_loss": 0.70948398, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.73132122, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.17028809, + "step": 3123, + "time_per_iteration": 2.6238207817077637 + }, + { + "auxiliary_loss_clip": 0.01144809, + "auxiliary_loss_mlp": 0.01039763, + "balance_loss_clip": 1.05056477, + "balance_loss_mlp": 1.02508855, + "epoch": 0.1878250413347362, + "flos": 29893480310880.0, + "grad_norm": 2.061245003619166, + "language_loss": 0.73869556, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76054132, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 0.94335938, + "router_z_loss_mlp": 0.14666748, + "step": 3124, + "time_per_iteration": 2.6950244903564453 + }, + { + "auxiliary_loss_clip": 0.01147772, + "auxiliary_loss_mlp": 0.01044586, + "balance_loss_clip": 1.05021107, + "balance_loss_mlp": 1.0273124, + "epoch": 0.18788516458740417, + "flos": 48548621760480.0, + "grad_norm": 1.6524871847288543, + "language_loss": 0.80623078, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.82815433, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.17285156, + "step": 3125, + "time_per_iteration": 2.8230841159820557 + }, + { + "auxiliary_loss_clip": 0.01061517, + "auxiliary_loss_mlp": 0.01006076, + "balance_loss_clip": 1.02738988, + "balance_loss_mlp": 1.0036726, + "epoch": 0.18794528784007214, + "flos": 78946715456640.0, + "grad_norm": 0.9432179982302347, + "language_loss": 0.63598812, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65666407, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 0.34106445, + "router_z_loss_mlp": 0.02401733, + "step": 3126, + "time_per_iteration": 3.3961617946624756 + }, + { + "auxiliary_loss_clip": 0.01146514, + "auxiliary_loss_mlp": 0.01038262, + "balance_loss_clip": 1.05301726, + "balance_loss_mlp": 1.02266908, + "epoch": 0.1880054110927401, + "flos": 34925251995360.0, + "grad_norm": 2.0138584052637314, + "language_loss": 0.813012, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83485979, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.15612793, + "step": 3127, + "time_per_iteration": 2.8159995079040527 + }, + { + "auxiliary_loss_clip": 0.01060796, + "auxiliary_loss_mlp": 0.01002399, + "balance_loss_clip": 1.02667236, + "balance_loss_mlp": 1.00010669, + "epoch": 0.1880655343454081, + "flos": 78689306761920.0, + "grad_norm": 0.7594212809651097, + "language_loss": 0.6181469, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.63877881, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 0.34033203, + "router_z_loss_mlp": 0.0229187, + "step": 3128, + "time_per_iteration": 3.320852279663086 + }, + { + "auxiliary_loss_clip": 0.01152804, + "auxiliary_loss_mlp": 0.01035679, + "balance_loss_clip": 1.0523212, + "balance_loss_mlp": 1.01998532, + "epoch": 0.18812565759807606, + "flos": 39199174879680.0, + "grad_norm": 2.182608079916337, + "language_loss": 0.71096534, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73285013, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.15698242, + "step": 3129, + "time_per_iteration": 2.766566753387451 + }, + { + "auxiliary_loss_clip": 0.01147901, + "auxiliary_loss_mlp": 0.01047026, + "balance_loss_clip": 1.05132222, + "balance_loss_mlp": 1.03037274, + "epoch": 0.18818578085074403, + "flos": 25166718774720.0, + "grad_norm": 2.0670908675068445, + "language_loss": 0.85298955, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87493885, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.16662598, + "step": 3130, + "time_per_iteration": 2.7369651794433594 + }, + { + "auxiliary_loss_clip": 0.01146596, + "auxiliary_loss_mlp": 0.01041462, + "balance_loss_clip": 1.04828715, + "balance_loss_mlp": 1.02560735, + "epoch": 0.188245904103412, + "flos": 35191858112640.0, + "grad_norm": 1.7946532322594793, + "language_loss": 0.76696569, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.78884625, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.15856934, + "step": 3131, + "time_per_iteration": 2.7184560298919678 + }, + { + "auxiliary_loss_clip": 0.01148056, + "auxiliary_loss_mlp": 0.01042872, + "balance_loss_clip": 1.05229068, + "balance_loss_mlp": 1.02618229, + "epoch": 0.18830602735607996, + "flos": 35899755456960.0, + "grad_norm": 2.333078541030696, + "language_loss": 0.81465054, + "learning_rate": 3.74282069289017e-06, + "loss": 0.83655977, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.16687012, + "step": 3132, + "time_per_iteration": 2.7506277561187744 + }, + { + "auxiliary_loss_clip": 0.01150411, + "auxiliary_loss_mlp": 0.01043494, + "balance_loss_clip": 1.05129695, + "balance_loss_mlp": 1.02759147, + "epoch": 0.18836615060874792, + "flos": 35231801800320.0, + "grad_norm": 1.9767354110752333, + "language_loss": 0.79834223, + "learning_rate": 3.742629607551614e-06, + "loss": 0.82028127, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.15905762, + "step": 3133, + "time_per_iteration": 2.8701512813568115 + }, + { + "auxiliary_loss_clip": 0.011489, + "auxiliary_loss_mlp": 0.01045986, + "balance_loss_clip": 1.05172265, + "balance_loss_mlp": 1.02999997, + "epoch": 0.18842627386141592, + "flos": 27578734187040.0, + "grad_norm": 2.0647769516942462, + "language_loss": 0.82920855, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.85115743, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.15991211, + "step": 3134, + "time_per_iteration": 5.527334213256836 + }, + { + "auxiliary_loss_clip": 0.01145445, + "auxiliary_loss_mlp": 0.01041605, + "balance_loss_clip": 1.05066061, + "balance_loss_mlp": 1.02600074, + "epoch": 0.18848639711408388, + "flos": 29984874593760.0, + "grad_norm": 1.4491311288423356, + "language_loss": 0.82961953, + "learning_rate": 3.742247238639684e-06, + "loss": 0.85149002, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.15625, + "step": 3135, + "time_per_iteration": 2.69828724861145 + }, + { + "auxiliary_loss_clip": 0.01148568, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.04965234, + "balance_loss_mlp": 1.02571106, + "epoch": 0.18854652036675185, + "flos": 41691604392000.0, + "grad_norm": 2.6150144897953966, + "language_loss": 0.78643119, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.80833608, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.16210938, + "step": 3136, + "time_per_iteration": 2.7390005588531494 + }, + { + "auxiliary_loss_clip": 0.01148867, + "auxiliary_loss_mlp": 0.01041753, + "balance_loss_clip": 1.05262589, + "balance_loss_mlp": 1.02495611, + "epoch": 0.1886066436194198, + "flos": 29530050595200.0, + "grad_norm": 2.1463530302362033, + "language_loss": 0.81416357, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83606976, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.16784668, + "step": 3137, + "time_per_iteration": 4.181447744369507 + }, + { + "auxiliary_loss_clip": 0.01148612, + "auxiliary_loss_mlp": 0.01044996, + "balance_loss_clip": 1.05250359, + "balance_loss_mlp": 1.02974963, + "epoch": 0.18866676687208778, + "flos": 25930726201440.0, + "grad_norm": 1.5823056008514538, + "language_loss": 0.8120687, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83400476, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.15246582, + "step": 3138, + "time_per_iteration": 2.6625115871429443 + }, + { + "auxiliary_loss_clip": 0.01150914, + "auxiliary_loss_mlp": 0.01049424, + "balance_loss_clip": 1.05039239, + "balance_loss_mlp": 1.03318739, + "epoch": 0.18872689012475574, + "flos": 45525046394880.0, + "grad_norm": 2.130206175166147, + "language_loss": 0.6387831, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.66078639, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.16235352, + "step": 3139, + "time_per_iteration": 4.359669208526611 + }, + { + "auxiliary_loss_clip": 0.01145099, + "auxiliary_loss_mlp": 0.01046661, + "balance_loss_clip": 1.04845488, + "balance_loss_mlp": 1.02888739, + "epoch": 0.1887870133774237, + "flos": 26420307675840.0, + "grad_norm": 1.8758157122059056, + "language_loss": 0.71248114, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73439884, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.1776123, + "step": 3140, + "time_per_iteration": 2.6448428630828857 + }, + { + "auxiliary_loss_clip": 0.01147107, + "auxiliary_loss_mlp": 0.01041743, + "balance_loss_clip": 1.04962373, + "balance_loss_mlp": 1.02491021, + "epoch": 0.1888471366300917, + "flos": 19430939404800.0, + "grad_norm": 4.66562415083805, + "language_loss": 0.8685416, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.89043009, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.16833496, + "step": 3141, + "time_per_iteration": 2.6518115997314453 + }, + { + "auxiliary_loss_clip": 0.01149829, + "auxiliary_loss_mlp": 0.01043281, + "balance_loss_clip": 1.04984105, + "balance_loss_mlp": 1.02618623, + "epoch": 0.18890725988275966, + "flos": 22636938680640.0, + "grad_norm": 1.895225172848109, + "language_loss": 0.77351987, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79545093, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.17102051, + "step": 3142, + "time_per_iteration": 2.6691179275512695 + }, + { + "auxiliary_loss_clip": 0.01145279, + "auxiliary_loss_mlp": 0.01040225, + "balance_loss_clip": 1.05119991, + "balance_loss_mlp": 1.02579498, + "epoch": 0.18896738313542763, + "flos": 35192263285440.0, + "grad_norm": 2.0071604670481262, + "language_loss": 0.78524435, + "learning_rate": 3.740715120924971e-06, + "loss": 0.8070994, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.14447021, + "step": 3143, + "time_per_iteration": 2.7309694290161133 + }, + { + "auxiliary_loss_clip": 0.01146406, + "auxiliary_loss_mlp": 0.01045306, + "balance_loss_clip": 1.04922795, + "balance_loss_mlp": 1.02951074, + "epoch": 0.1890275063880956, + "flos": 27348789857760.0, + "grad_norm": 2.0704722706391063, + "language_loss": 0.71614116, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73805833, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.15795898, + "step": 3144, + "time_per_iteration": 2.8898329734802246 + }, + { + "auxiliary_loss_clip": 0.01148813, + "auxiliary_loss_mlp": 0.01043244, + "balance_loss_clip": 1.04974043, + "balance_loss_mlp": 1.02672207, + "epoch": 0.18908762964076356, + "flos": 29582676191520.0, + "grad_norm": 2.568007147857549, + "language_loss": 0.73362559, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.75554609, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.16503906, + "step": 3145, + "time_per_iteration": 2.7094075679779053 + }, + { + "auxiliary_loss_clip": 0.01141901, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.04656231, + "balance_loss_mlp": 1.02278221, + "epoch": 0.18914775289343153, + "flos": 20722284060480.0, + "grad_norm": 2.2029635565767234, + "language_loss": 0.76529914, + "learning_rate": 3.740139487448616e-06, + "loss": 0.78709686, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.15087891, + "step": 3146, + "time_per_iteration": 2.6279759407043457 + }, + { + "auxiliary_loss_clip": 0.01144348, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.04736233, + "balance_loss_mlp": 1.02304721, + "epoch": 0.1892078761460995, + "flos": 26287794704160.0, + "grad_norm": 2.1354566762295386, + "language_loss": 0.78672606, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.80855548, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.15539551, + "step": 3147, + "time_per_iteration": 2.659257411956787 + }, + { + "auxiliary_loss_clip": 0.01146793, + "auxiliary_loss_mlp": 0.01041535, + "balance_loss_clip": 1.04947448, + "balance_loss_mlp": 1.02531064, + "epoch": 0.18926799939876748, + "flos": 28066816522080.0, + "grad_norm": 2.706076858578021, + "language_loss": 0.66959494, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69147825, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.16223145, + "step": 3148, + "time_per_iteration": 2.7180521488189697 + }, + { + "auxiliary_loss_clip": 0.01145559, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.04875803, + "balance_loss_mlp": 1.01979733, + "epoch": 0.18932812265143545, + "flos": 27177710785920.0, + "grad_norm": 2.492492294531442, + "language_loss": 0.75668055, + "learning_rate": 3.739563260095902e-06, + "loss": 0.77849269, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.15844727, + "step": 3149, + "time_per_iteration": 2.7418670654296875 + }, + { + "auxiliary_loss_clip": 0.01142832, + "auxiliary_loss_mlp": 0.01040683, + "balance_loss_clip": 1.05012798, + "balance_loss_mlp": 1.02517354, + "epoch": 0.1893882459041034, + "flos": 22725415719360.0, + "grad_norm": 2.241767029165857, + "language_loss": 0.80245918, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.82429433, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.1550293, + "step": 3150, + "time_per_iteration": 2.6513025760650635 + }, + { + "auxiliary_loss_clip": 0.01149498, + "auxiliary_loss_mlp": 0.01044617, + "balance_loss_clip": 1.05046272, + "balance_loss_mlp": 1.02908397, + "epoch": 0.18944836915677138, + "flos": 27934263033120.0, + "grad_norm": 2.8644295029154723, + "language_loss": 0.85090744, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87284863, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.15551758, + "step": 3151, + "time_per_iteration": 2.6761703491210938 + }, + { + "auxiliary_loss_clip": 0.01149206, + "auxiliary_loss_mlp": 0.01046498, + "balance_loss_clip": 1.05046844, + "balance_loss_mlp": 1.0307622, + "epoch": 0.18950849240943934, + "flos": 32697726874560.0, + "grad_norm": 1.6857839275016893, + "language_loss": 0.74603862, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.7679956, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 0.98828125, + "router_z_loss_mlp": 0.1574707, + "step": 3152, + "time_per_iteration": 2.7037627696990967 + }, + { + "auxiliary_loss_clip": 0.01147385, + "auxiliary_loss_mlp": 0.01042776, + "balance_loss_clip": 1.05101264, + "balance_loss_mlp": 1.0257405, + "epoch": 0.1895686156621073, + "flos": 30470444857440.0, + "grad_norm": 1.8388316029379288, + "language_loss": 0.75704646, + "learning_rate": 3.738794033491209e-06, + "loss": 0.77894813, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.17041016, + "step": 3153, + "time_per_iteration": 2.664658784866333 + }, + { + "auxiliary_loss_clip": 0.01148917, + "auxiliary_loss_mlp": 0.01045405, + "balance_loss_clip": 1.0497551, + "balance_loss_mlp": 1.02903724, + "epoch": 0.1896287389147753, + "flos": 26776889971200.0, + "grad_norm": 2.346769526113632, + "language_loss": 0.79844946, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.82039261, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.16369629, + "step": 3154, + "time_per_iteration": 2.6554388999938965 + }, + { + "auxiliary_loss_clip": 0.0115277, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_clip": 1.05050623, + "balance_loss_mlp": 1.02652049, + "epoch": 0.18968886216744327, + "flos": 22182641406720.0, + "grad_norm": 2.5714962035469853, + "language_loss": 0.72814226, + "learning_rate": 3.738409024548223e-06, + "loss": 0.7501018, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 1.02294922, + "router_z_loss_mlp": 0.16662598, + "step": 3155, + "time_per_iteration": 2.6050047874450684 + }, + { + "auxiliary_loss_clip": 0.01146656, + "auxiliary_loss_mlp": 0.0104173, + "balance_loss_clip": 1.05094624, + "balance_loss_mlp": 1.02539885, + "epoch": 0.18974898542011123, + "flos": 24907324733280.0, + "grad_norm": 1.8680026204946993, + "language_loss": 0.74007881, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76196265, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.16320801, + "step": 3156, + "time_per_iteration": 2.6795928478240967 + }, + { + "auxiliary_loss_clip": 0.01149712, + "auxiliary_loss_mlp": 0.01040441, + "balance_loss_clip": 1.05178475, + "balance_loss_mlp": 1.02443159, + "epoch": 0.1898091086727792, + "flos": 29267091033120.0, + "grad_norm": 1.611314464509906, + "language_loss": 0.6818862, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70378774, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.16015625, + "step": 3157, + "time_per_iteration": 2.8639652729034424 + }, + { + "auxiliary_loss_clip": 0.01149863, + "auxiliary_loss_mlp": 0.01043747, + "balance_loss_clip": 1.05118942, + "balance_loss_mlp": 1.02699804, + "epoch": 0.18986923192544716, + "flos": 33723275758560.0, + "grad_norm": 2.2908481570522237, + "language_loss": 0.80163789, + "learning_rate": 3.737831016747176e-06, + "loss": 0.82357401, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.16748047, + "step": 3158, + "time_per_iteration": 2.6781859397888184 + }, + { + "auxiliary_loss_clip": 0.01154998, + "auxiliary_loss_mlp": 0.0104645, + "balance_loss_clip": 1.05275524, + "balance_loss_mlp": 1.0284965, + "epoch": 0.18992935517811513, + "flos": 31096145341440.0, + "grad_norm": 2.026181516563571, + "language_loss": 0.72201502, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74402952, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 1.0234375, + "router_z_loss_mlp": 0.17944336, + "step": 3159, + "time_per_iteration": 2.6623854637145996 + }, + { + "auxiliary_loss_clip": 0.01153049, + "auxiliary_loss_mlp": 0.01051296, + "balance_loss_clip": 1.05533147, + "balance_loss_mlp": 1.03323627, + "epoch": 0.1899894784307831, + "flos": 21256630778880.0, + "grad_norm": 1.9043694499912904, + "language_loss": 0.85347831, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.87552178, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.18066406, + "step": 3160, + "time_per_iteration": 2.6458356380462646 + }, + { + "auxiliary_loss_clip": 0.01145441, + "auxiliary_loss_mlp": 0.0104537, + "balance_loss_clip": 1.05167985, + "balance_loss_mlp": 1.02969408, + "epoch": 0.19004960168345109, + "flos": 33546443232960.0, + "grad_norm": 1.8134131296403484, + "language_loss": 0.73184597, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.75375408, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.15673828, + "step": 3161, + "time_per_iteration": 2.728209972381592 + }, + { + "auxiliary_loss_clip": 0.01146691, + "auxiliary_loss_mlp": 0.0104598, + "balance_loss_clip": 1.0510962, + "balance_loss_mlp": 1.02931428, + "epoch": 0.19010972493611905, + "flos": 47167908685920.0, + "grad_norm": 1.6580078612855331, + "language_loss": 0.80808079, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.83000755, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.16674805, + "step": 3162, + "time_per_iteration": 2.8006114959716797 + }, + { + "auxiliary_loss_clip": 0.01151163, + "auxiliary_loss_mlp": 0.01043632, + "balance_loss_clip": 1.05258298, + "balance_loss_mlp": 1.02638245, + "epoch": 0.19016984818878702, + "flos": 23482373139360.0, + "grad_norm": 26.143931646937805, + "language_loss": 0.75761175, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77955973, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.17260742, + "step": 3163, + "time_per_iteration": 2.647534132003784 + }, + { + "auxiliary_loss_clip": 0.01148163, + "auxiliary_loss_mlp": 0.01035372, + "balance_loss_clip": 1.05337119, + "balance_loss_mlp": 1.01919568, + "epoch": 0.19022997144145498, + "flos": 45877090754880.0, + "grad_norm": 1.8730000945065521, + "language_loss": 0.74198496, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76382041, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.16174316, + "step": 3164, + "time_per_iteration": 2.7734122276306152 + }, + { + "auxiliary_loss_clip": 0.01150949, + "auxiliary_loss_mlp": 0.01038408, + "balance_loss_clip": 1.05434656, + "balance_loss_mlp": 1.02149212, + "epoch": 0.19029009469412295, + "flos": 75083290666560.0, + "grad_norm": 4.108297487699057, + "language_loss": 0.66627765, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.68817115, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.16906738, + "step": 3165, + "time_per_iteration": 3.0170466899871826 + }, + { + "auxiliary_loss_clip": 0.01149391, + "auxiliary_loss_mlp": 0.01046033, + "balance_loss_clip": 1.05273294, + "balance_loss_mlp": 1.02837825, + "epoch": 0.1903502179467909, + "flos": 17027189517600.0, + "grad_norm": 2.2126172184948754, + "language_loss": 0.74450171, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76645595, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.17651367, + "step": 3166, + "time_per_iteration": 2.591148614883423 + }, + { + "auxiliary_loss_clip": 0.01065277, + "auxiliary_loss_mlp": 0.01014478, + "balance_loss_clip": 1.03009593, + "balance_loss_mlp": 1.012115, + "epoch": 0.1904103411994589, + "flos": 81630766301760.0, + "grad_norm": 0.8566714799491376, + "language_loss": 0.50404358, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52484119, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 0.35131836, + "router_z_loss_mlp": 0.02360535, + "step": 3167, + "time_per_iteration": 3.292724847793579 + }, + { + "auxiliary_loss_clip": 0.01146708, + "auxiliary_loss_mlp": 0.0104934, + "balance_loss_clip": 1.05130029, + "balance_loss_mlp": 1.03319931, + "epoch": 0.19047046445212687, + "flos": 26733137659200.0, + "grad_norm": 1.6871786187898126, + "language_loss": 0.74694037, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76890087, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.16125488, + "step": 3168, + "time_per_iteration": 2.6334805488586426 + }, + { + "auxiliary_loss_clip": 0.01062711, + "auxiliary_loss_mlp": 0.01006821, + "balance_loss_clip": 1.02798867, + "balance_loss_mlp": 1.00439906, + "epoch": 0.19053058770479483, + "flos": 72304124299200.0, + "grad_norm": 0.8697510991394141, + "language_loss": 0.6000365, + "learning_rate": 3.73570658211056e-06, + "loss": 0.62073177, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 0.34692383, + "router_z_loss_mlp": 0.02420044, + "step": 3169, + "time_per_iteration": 3.2747652530670166 + }, + { + "auxiliary_loss_clip": 0.01154081, + "auxiliary_loss_mlp": 0.01048443, + "balance_loss_clip": 1.05186987, + "balance_loss_mlp": 1.03153861, + "epoch": 0.1905907109574628, + "flos": 29225688723360.0, + "grad_norm": 3.4686436842543613, + "language_loss": 0.78084838, + "learning_rate": 3.735513056633436e-06, + "loss": 0.80287361, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 1.02246094, + "router_z_loss_mlp": 0.16906738, + "step": 3170, + "time_per_iteration": 2.8721063137054443 + }, + { + "auxiliary_loss_clip": 0.01147037, + "auxiliary_loss_mlp": 0.0103894, + "balance_loss_clip": 1.05143332, + "balance_loss_mlp": 1.02216673, + "epoch": 0.19065083421013077, + "flos": 25395366551040.0, + "grad_norm": 1.7720867522279182, + "language_loss": 0.78471839, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80657816, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.16772461, + "step": 3171, + "time_per_iteration": 2.7046871185302734 + }, + { + "auxiliary_loss_clip": 0.01154274, + "auxiliary_loss_mlp": 0.01039167, + "balance_loss_clip": 1.05294466, + "balance_loss_mlp": 1.02173924, + "epoch": 0.19071095746279873, + "flos": 38174800996800.0, + "grad_norm": 3.440392223390907, + "language_loss": 0.77942264, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80135709, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.17431641, + "step": 3172, + "time_per_iteration": 2.7895150184631348 + }, + { + "auxiliary_loss_clip": 0.01149498, + "auxiliary_loss_mlp": 0.01050044, + "balance_loss_clip": 1.0520947, + "balance_loss_mlp": 1.03370023, + "epoch": 0.1907710807154667, + "flos": 17516852026560.0, + "grad_norm": 1.6699874093316966, + "language_loss": 0.80497622, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82697165, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.16345215, + "step": 3173, + "time_per_iteration": 4.204849720001221 + }, + { + "auxiliary_loss_clip": 0.01150289, + "auxiliary_loss_mlp": 0.01053745, + "balance_loss_clip": 1.05151892, + "balance_loss_mlp": 1.03713942, + "epoch": 0.1908312039681347, + "flos": 32832468296640.0, + "grad_norm": 1.6368475372447195, + "language_loss": 0.78603005, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.80807042, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.16595459, + "step": 3174, + "time_per_iteration": 2.6805505752563477 + }, + { + "auxiliary_loss_clip": 0.01152744, + "auxiliary_loss_mlp": 0.01044832, + "balance_loss_clip": 1.0544306, + "balance_loss_mlp": 1.02829814, + "epoch": 0.19089132722080265, + "flos": 17686634545440.0, + "grad_norm": 1.6925677496357847, + "language_loss": 0.8094939, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.83146971, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.16540527, + "step": 3175, + "time_per_iteration": 2.645230293273926 + }, + { + "auxiliary_loss_clip": 0.0115047, + "auxiliary_loss_mlp": 0.01053661, + "balance_loss_clip": 1.05134201, + "balance_loss_mlp": 1.03670919, + "epoch": 0.19095145047347062, + "flos": 17024961067200.0, + "grad_norm": 2.7625287742426456, + "language_loss": 0.8574574, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87949866, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.16931152, + "step": 3176, + "time_per_iteration": 2.5971953868865967 + }, + { + "auxiliary_loss_clip": 0.01153791, + "auxiliary_loss_mlp": 0.01054106, + "balance_loss_clip": 1.05259609, + "balance_loss_mlp": 1.03511608, + "epoch": 0.19101157372613858, + "flos": 30873656191680.0, + "grad_norm": 2.1040279146223697, + "language_loss": 0.8120715, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.83415043, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.18994141, + "step": 3177, + "time_per_iteration": 4.115736722946167 + }, + { + "auxiliary_loss_clip": 0.01147486, + "auxiliary_loss_mlp": 0.01043271, + "balance_loss_clip": 1.05085647, + "balance_loss_mlp": 1.02724934, + "epoch": 0.19107169697880655, + "flos": 25085170190880.0, + "grad_norm": 2.2840000631795156, + "language_loss": 0.74533403, + "learning_rate": 3.73396248424356e-06, + "loss": 0.7672416, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.16027832, + "step": 3178, + "time_per_iteration": 4.023594617843628 + }, + { + "auxiliary_loss_clip": 0.01151059, + "auxiliary_loss_mlp": 0.01042551, + "balance_loss_clip": 1.05214429, + "balance_loss_mlp": 1.02694666, + "epoch": 0.19113182023147451, + "flos": 27043739192160.0, + "grad_norm": 1.8008480177467154, + "language_loss": 0.813528, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83546412, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.15625, + "step": 3179, + "time_per_iteration": 2.650116443634033 + }, + { + "auxiliary_loss_clip": 0.01151428, + "auxiliary_loss_mlp": 0.01042728, + "balance_loss_clip": 1.05298173, + "balance_loss_mlp": 1.02638435, + "epoch": 0.19119194348414248, + "flos": 22673965124160.0, + "grad_norm": 2.6013601851912487, + "language_loss": 0.7994197, + "learning_rate": 3.733574183478691e-06, + "loss": 0.82136124, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.16333008, + "step": 3180, + "time_per_iteration": 2.8805840015411377 + }, + { + "auxiliary_loss_clip": 0.01146908, + "auxiliary_loss_mlp": 0.01045061, + "balance_loss_clip": 1.04972494, + "balance_loss_mlp": 1.02833641, + "epoch": 0.19125206673681047, + "flos": 23216131677600.0, + "grad_norm": 2.344427505783738, + "language_loss": 0.79804111, + "learning_rate": 3.733379934486615e-06, + "loss": 0.81996083, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.1673584, + "step": 3181, + "time_per_iteration": 2.7994184494018555 + }, + { + "auxiliary_loss_clip": 0.01149938, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.05024481, + "balance_loss_mlp": 1.03630376, + "epoch": 0.19131218998947844, + "flos": 26466369472800.0, + "grad_norm": 2.0044625088683072, + "language_loss": 0.74290007, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.76493096, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.1685791, + "step": 3182, + "time_per_iteration": 2.6266062259674072 + }, + { + "auxiliary_loss_clip": 0.01147523, + "auxiliary_loss_mlp": 0.01043232, + "balance_loss_clip": 1.05044603, + "balance_loss_mlp": 1.02679336, + "epoch": 0.1913723132421464, + "flos": 22503372259680.0, + "grad_norm": 1.7932944528420502, + "language_loss": 0.65256143, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.67446899, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 0.97167969, + "router_z_loss_mlp": 0.16448975, + "step": 3183, + "time_per_iteration": 2.6315207481384277 + }, + { + "auxiliary_loss_clip": 0.01148955, + "auxiliary_loss_mlp": 0.0104397, + "balance_loss_clip": 1.05026913, + "balance_loss_mlp": 1.02658927, + "epoch": 0.19143243649481437, + "flos": 33141489655680.0, + "grad_norm": 1.5584276294269463, + "language_loss": 0.73259979, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.754529, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.17382812, + "step": 3184, + "time_per_iteration": 2.692283868789673 + }, + { + "auxiliary_loss_clip": 0.01149949, + "auxiliary_loss_mlp": 0.0104612, + "balance_loss_clip": 1.05080748, + "balance_loss_mlp": 1.02788103, + "epoch": 0.19149255974748233, + "flos": 26502220915200.0, + "grad_norm": 2.0214676663200066, + "language_loss": 0.88234603, + "learning_rate": 3.732602281292598e-06, + "loss": 0.90430677, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.18237305, + "step": 3185, + "time_per_iteration": 2.6481051445007324 + }, + { + "auxiliary_loss_clip": 0.01145598, + "auxiliary_loss_mlp": 0.01040099, + "balance_loss_clip": 1.04819751, + "balance_loss_mlp": 1.02274251, + "epoch": 0.1915526830001503, + "flos": 28020309035040.0, + "grad_norm": 1.9272130642653627, + "language_loss": 0.73096699, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.75282395, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.17352295, + "step": 3186, + "time_per_iteration": 2.643444538116455 + }, + { + "auxiliary_loss_clip": 0.01153807, + "auxiliary_loss_mlp": 0.01043173, + "balance_loss_clip": 1.05353045, + "balance_loss_mlp": 1.02451658, + "epoch": 0.1916128062528183, + "flos": 31897381798080.0, + "grad_norm": 2.474895498117045, + "language_loss": 0.83403832, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.85600805, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.18652344, + "step": 3187, + "time_per_iteration": 2.6989707946777344 + }, + { + "auxiliary_loss_clip": 0.01052567, + "auxiliary_loss_mlp": 0.01028061, + "balance_loss_clip": 1.01892912, + "balance_loss_mlp": 1.02591991, + "epoch": 0.19167292950548626, + "flos": 67019927545440.0, + "grad_norm": 0.8610279208531866, + "language_loss": 0.5599159, + "learning_rate": 3.732018351516544e-06, + "loss": 0.58072209, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 0.33618164, + "router_z_loss_mlp": 0.0214386, + "step": 3188, + "time_per_iteration": 3.3025906085968018 + }, + { + "auxiliary_loss_clip": 0.01148237, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_clip": 1.05094016, + "balance_loss_mlp": 1.03199375, + "epoch": 0.19173305275815422, + "flos": 36528130081440.0, + "grad_norm": 1.95023818756211, + "language_loss": 0.70112175, + "learning_rate": 3.731823576891397e-06, + "loss": 0.72309148, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.16748047, + "step": 3189, + "time_per_iteration": 2.733687400817871 + }, + { + "auxiliary_loss_clip": 0.01141492, + "auxiliary_loss_mlp": 0.01036809, + "balance_loss_clip": 1.04799271, + "balance_loss_mlp": 1.02102613, + "epoch": 0.1917931760108222, + "flos": 30203474084640.0, + "grad_norm": 2.013900633477541, + "language_loss": 0.7423535, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76413649, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.15771484, + "step": 3190, + "time_per_iteration": 2.6690893173217773 + }, + { + "auxiliary_loss_clip": 0.01146473, + "auxiliary_loss_mlp": 0.01055786, + "balance_loss_clip": 1.05060005, + "balance_loss_mlp": 1.03934669, + "epoch": 0.19185329926349015, + "flos": 22992791664960.0, + "grad_norm": 2.078783114506068, + "language_loss": 0.84668148, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86870408, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.16442871, + "step": 3191, + "time_per_iteration": 2.800537586212158 + }, + { + "auxiliary_loss_clip": 0.01140888, + "auxiliary_loss_mlp": 0.01041354, + "balance_loss_clip": 1.04684246, + "balance_loss_mlp": 1.02511764, + "epoch": 0.19191342251615812, + "flos": 26950886287200.0, + "grad_norm": 1.7147057552036558, + "language_loss": 0.89652342, + "learning_rate": 3.73123885901997e-06, + "loss": 0.91834581, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.16247559, + "step": 3192, + "time_per_iteration": 2.7130625247955322 + }, + { + "auxiliary_loss_clip": 0.01153936, + "auxiliary_loss_mlp": 0.01050442, + "balance_loss_clip": 1.05350018, + "balance_loss_mlp": 1.03157091, + "epoch": 0.19197354576882608, + "flos": 27087613056000.0, + "grad_norm": 1.7161292237922687, + "language_loss": 0.75136852, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.77341229, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.18859863, + "step": 3193, + "time_per_iteration": 2.956261157989502 + }, + { + "auxiliary_loss_clip": 0.01152298, + "auxiliary_loss_mlp": 0.01045059, + "balance_loss_clip": 1.05090785, + "balance_loss_mlp": 1.02635515, + "epoch": 0.19203366902149407, + "flos": 30379050574560.0, + "grad_norm": 1.733314711334484, + "language_loss": 0.75067061, + "learning_rate": 3.730848718849612e-06, + "loss": 0.77264416, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.18713379, + "step": 3194, + "time_per_iteration": 2.704038143157959 + }, + { + "auxiliary_loss_clip": 0.01051715, + "auxiliary_loss_mlp": 0.01005065, + "balance_loss_clip": 1.0183537, + "balance_loss_mlp": 1.0027796, + "epoch": 0.19209379227416204, + "flos": 83483435833920.0, + "grad_norm": 0.7925058087314204, + "language_loss": 0.68519759, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70576537, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 0.33349609, + "router_z_loss_mlp": 0.02287292, + "step": 3195, + "time_per_iteration": 3.1765429973602295 + }, + { + "auxiliary_loss_clip": 0.01151112, + "auxiliary_loss_mlp": 0.01044546, + "balance_loss_clip": 1.05291998, + "balance_loss_mlp": 1.0277493, + "epoch": 0.19215391552683, + "flos": 26911793462400.0, + "grad_norm": 2.2819967049105507, + "language_loss": 0.72949463, + "learning_rate": 3.730458316143429e-06, + "loss": 0.75145119, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.16784668, + "step": 3196, + "time_per_iteration": 2.6895320415496826 + }, + { + "auxiliary_loss_clip": 0.01153726, + "auxiliary_loss_mlp": 0.01043182, + "balance_loss_clip": 1.05628037, + "balance_loss_mlp": 1.02617085, + "epoch": 0.19221403877949797, + "flos": 24773677277760.0, + "grad_norm": 1.9158847009289302, + "language_loss": 0.83644068, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.85840976, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.17016602, + "step": 3197, + "time_per_iteration": 2.660687208175659 + }, + { + "auxiliary_loss_clip": 0.01151486, + "auxiliary_loss_mlp": 0.0104664, + "balance_loss_clip": 1.0516057, + "balance_loss_mlp": 1.02889013, + "epoch": 0.19227416203216594, + "flos": 28290966880320.0, + "grad_norm": 2.244342424199745, + "language_loss": 0.80422145, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82620269, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 0.99951172, + "router_z_loss_mlp": 0.1776123, + "step": 3198, + "time_per_iteration": 2.6899399757385254 + }, + { + "auxiliary_loss_clip": 0.01150396, + "auxiliary_loss_mlp": 0.01049418, + "balance_loss_clip": 1.05061221, + "balance_loss_mlp": 1.03192985, + "epoch": 0.1923342852848339, + "flos": 31452970740480.0, + "grad_norm": 1.9125500122858772, + "language_loss": 0.78686255, + "learning_rate": 3.729872219959029e-06, + "loss": 0.80886066, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.17492676, + "step": 3199, + "time_per_iteration": 2.694334030151367 + }, + { + "auxiliary_loss_clip": 0.01151093, + "auxiliary_loss_mlp": 0.0104167, + "balance_loss_clip": 1.05385947, + "balance_loss_mlp": 1.02467084, + "epoch": 0.19239440853750187, + "flos": 20899805379840.0, + "grad_norm": 3.2442818872559442, + "language_loss": 0.83131057, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.85323822, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.17004395, + "step": 3200, + "time_per_iteration": 2.641209602355957 + }, + { + "auxiliary_loss_clip": 0.01151459, + "auxiliary_loss_mlp": 0.01049484, + "balance_loss_clip": 1.05347383, + "balance_loss_mlp": 1.0326519, + "epoch": 0.19245453179016986, + "flos": 20053844196480.0, + "grad_norm": 2.0408289315478902, + "language_loss": 0.78812563, + "learning_rate": 3.729481161172443e-06, + "loss": 0.81013507, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.16845703, + "step": 3201, + "time_per_iteration": 2.6317684650421143 + }, + { + "auxiliary_loss_clip": 0.01150755, + "auxiliary_loss_mlp": 0.01043308, + "balance_loss_clip": 1.05057502, + "balance_loss_mlp": 1.02597523, + "epoch": 0.19251465504283782, + "flos": 24684957135360.0, + "grad_norm": 1.8633068382893234, + "language_loss": 0.69836026, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.72030091, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.17346191, + "step": 3202, + "time_per_iteration": 2.7767434120178223 + }, + { + "auxiliary_loss_clip": 0.01145453, + "auxiliary_loss_mlp": 0.01036319, + "balance_loss_clip": 1.05032623, + "balance_loss_mlp": 1.01977229, + "epoch": 0.1925747782955058, + "flos": 23750316326880.0, + "grad_norm": 1.7648109765844064, + "language_loss": 0.91304249, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93486023, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.16540527, + "step": 3203, + "time_per_iteration": 2.9158499240875244 + }, + { + "auxiliary_loss_clip": 0.01150461, + "auxiliary_loss_mlp": 0.01047659, + "balance_loss_clip": 1.05051351, + "balance_loss_mlp": 1.02975392, + "epoch": 0.19263490154817375, + "flos": 21701771147520.0, + "grad_norm": 2.3196091481123116, + "language_loss": 0.81313694, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.83511817, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.17895508, + "step": 3204, + "time_per_iteration": 2.878748655319214 + }, + { + "auxiliary_loss_clip": 0.01146831, + "auxiliary_loss_mlp": 0.01043403, + "balance_loss_clip": 1.05137169, + "balance_loss_mlp": 1.02667832, + "epoch": 0.19269502480084172, + "flos": 21300909815520.0, + "grad_norm": 1.985872841490513, + "language_loss": 0.75467181, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.77657419, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.1673584, + "step": 3205, + "time_per_iteration": 2.920328378677368 + }, + { + "auxiliary_loss_clip": 0.01153119, + "auxiliary_loss_mlp": 0.0104669, + "balance_loss_clip": 1.05362666, + "balance_loss_mlp": 1.02934468, + "epoch": 0.19275514805350968, + "flos": 26242826873760.0, + "grad_norm": 2.460057509721092, + "language_loss": 0.82715249, + "learning_rate": 3.728502366649107e-06, + "loss": 0.8491506, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.17358398, + "step": 3206, + "time_per_iteration": 3.24149227142334 + }, + { + "auxiliary_loss_clip": 0.01051174, + "auxiliary_loss_mlp": 0.01007008, + "balance_loss_clip": 1.01798499, + "balance_loss_mlp": 1.00468683, + "epoch": 0.19281527130617768, + "flos": 58198101514560.0, + "grad_norm": 0.8399393342422854, + "language_loss": 0.60617363, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62675542, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.02319336, + "step": 3207, + "time_per_iteration": 3.3704845905303955 + }, + { + "auxiliary_loss_clip": 0.01153116, + "auxiliary_loss_mlp": 0.01046434, + "balance_loss_clip": 1.0535121, + "balance_loss_mlp": 1.02922606, + "epoch": 0.19287539455884564, + "flos": 14399289272160.0, + "grad_norm": 2.384405140129474, + "language_loss": 0.7473253, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.76932073, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.17218018, + "step": 3208, + "time_per_iteration": 2.61472749710083 + }, + { + "auxiliary_loss_clip": 0.01153248, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_clip": 1.0515815, + "balance_loss_mlp": 1.02587867, + "epoch": 0.1929355178115136, + "flos": 25174498092480.0, + "grad_norm": 2.259760278646027, + "language_loss": 0.60999656, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.63197064, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.18273926, + "step": 3209, + "time_per_iteration": 2.64449143409729 + }, + { + "auxiliary_loss_clip": 0.01151066, + "auxiliary_loss_mlp": 0.0104672, + "balance_loss_clip": 1.05016434, + "balance_loss_mlp": 1.02830207, + "epoch": 0.19299564106418157, + "flos": 49795403758560.0, + "grad_norm": 1.886462964827513, + "language_loss": 0.80072755, + "learning_rate": 3.727718151176243e-06, + "loss": 0.82270539, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 1.00927734, + "router_z_loss_mlp": 0.1842041, + "step": 3210, + "time_per_iteration": 2.8331758975982666 + }, + { + "auxiliary_loss_clip": 0.01143385, + "auxiliary_loss_mlp": 0.01041827, + "balance_loss_clip": 1.04776227, + "balance_loss_mlp": 1.02579331, + "epoch": 0.19305576431684954, + "flos": 13862754620640.0, + "grad_norm": 2.195901449734656, + "language_loss": 0.83152437, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.85337651, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.16027832, + "step": 3211, + "time_per_iteration": 2.611870288848877 + }, + { + "auxiliary_loss_clip": 0.01050724, + "auxiliary_loss_mlp": 0.01005841, + "balance_loss_clip": 1.01768386, + "balance_loss_mlp": 1.00359821, + "epoch": 0.1931158875695175, + "flos": 66516043471200.0, + "grad_norm": 0.9649850078499435, + "language_loss": 0.63641584, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65698147, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 0.33032227, + "router_z_loss_mlp": 0.02246094, + "step": 3212, + "time_per_iteration": 4.50884747505188 + }, + { + "auxiliary_loss_clip": 0.01146971, + "auxiliary_loss_mlp": 0.01038764, + "balance_loss_clip": 1.05170035, + "balance_loss_mlp": 1.0229094, + "epoch": 0.19317601082218547, + "flos": 24194646349920.0, + "grad_norm": 1.8561428580157997, + "language_loss": 0.7672677, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.78912508, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.15856934, + "step": 3213, + "time_per_iteration": 4.102145433425903 + }, + { + "auxiliary_loss_clip": 0.0115189, + "auxiliary_loss_mlp": 0.01045757, + "balance_loss_clip": 1.05154145, + "balance_loss_mlp": 1.02775621, + "epoch": 0.19323613407485346, + "flos": 16047580878720.0, + "grad_norm": 2.1182695126079527, + "language_loss": 0.70786017, + "learning_rate": 3.726932887459503e-06, + "loss": 0.7298367, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.17993164, + "step": 3214, + "time_per_iteration": 2.6581242084503174 + }, + { + "auxiliary_loss_clip": 0.01146679, + "auxiliary_loss_mlp": 0.01043066, + "balance_loss_clip": 1.04913533, + "balance_loss_mlp": 1.02532816, + "epoch": 0.19329625732752143, + "flos": 17115585521760.0, + "grad_norm": 3.6800019494878167, + "language_loss": 0.75312889, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77502638, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 0.97607422, + "router_z_loss_mlp": 0.17724609, + "step": 3215, + "time_per_iteration": 2.6032042503356934 + }, + { + "auxiliary_loss_clip": 0.01145119, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_clip": 1.04932642, + "balance_loss_mlp": 1.02982223, + "epoch": 0.1933563805801894, + "flos": 28603756346400.0, + "grad_norm": 1.9106781169973128, + "language_loss": 0.88414967, + "learning_rate": 3.72653986265854e-06, + "loss": 0.9060573, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.15808105, + "step": 3216, + "time_per_iteration": 4.27911639213562 + }, + { + "auxiliary_loss_clip": 0.01148165, + "auxiliary_loss_mlp": 0.01048888, + "balance_loss_clip": 1.05221868, + "balance_loss_mlp": 1.03324771, + "epoch": 0.19341650383285736, + "flos": 24996206944800.0, + "grad_norm": 1.6110331794581283, + "language_loss": 0.79815006, + "learning_rate": 3.726343252048485e-06, + "loss": 0.82012057, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.15625, + "step": 3217, + "time_per_iteration": 2.814411163330078 + }, + { + "auxiliary_loss_clip": 0.01154685, + "auxiliary_loss_mlp": 0.01048121, + "balance_loss_clip": 1.05302739, + "balance_loss_mlp": 1.02908301, + "epoch": 0.19347662708552532, + "flos": 21790734393600.0, + "grad_norm": 2.240541101998409, + "language_loss": 0.61897922, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.6410073, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.19030762, + "step": 3218, + "time_per_iteration": 4.0803070068359375 + }, + { + "auxiliary_loss_clip": 0.01152222, + "auxiliary_loss_mlp": 0.01044421, + "balance_loss_clip": 1.05362511, + "balance_loss_mlp": 1.02757692, + "epoch": 0.1935367503381933, + "flos": 22191879346560.0, + "grad_norm": 1.6102053255024324, + "language_loss": 0.80211419, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82408065, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.16845703, + "step": 3219, + "time_per_iteration": 2.745224952697754 + }, + { + "auxiliary_loss_clip": 0.01150914, + "auxiliary_loss_mlp": 0.01044441, + "balance_loss_clip": 1.05385983, + "balance_loss_mlp": 1.02712011, + "epoch": 0.19359687359086128, + "flos": 19470923609760.0, + "grad_norm": 2.002300837877506, + "language_loss": 0.86118668, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.88314021, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 0.97070312, + "router_z_loss_mlp": 0.17321777, + "step": 3220, + "time_per_iteration": 2.6390562057495117 + }, + { + "auxiliary_loss_clip": 0.01144143, + "auxiliary_loss_mlp": 0.01040243, + "balance_loss_clip": 1.05294132, + "balance_loss_mlp": 1.02482963, + "epoch": 0.19365699684352924, + "flos": 25886852337600.0, + "grad_norm": 2.266878114442832, + "language_loss": 0.84433347, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86617732, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.15411377, + "step": 3221, + "time_per_iteration": 2.6619713306427 + }, + { + "auxiliary_loss_clip": 0.01148247, + "auxiliary_loss_mlp": 0.01042907, + "balance_loss_clip": 1.05445075, + "balance_loss_mlp": 1.02797031, + "epoch": 0.1937171200961972, + "flos": 21122780736960.0, + "grad_norm": 2.245380486703036, + "language_loss": 0.85564744, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.87755895, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.14929199, + "step": 3222, + "time_per_iteration": 2.627371072769165 + }, + { + "auxiliary_loss_clip": 0.01148102, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.05033016, + "balance_loss_mlp": 1.02172256, + "epoch": 0.19377724334886517, + "flos": 27620784773280.0, + "grad_norm": 1.8718241934513842, + "language_loss": 0.78594321, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80780673, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.16503906, + "step": 3223, + "time_per_iteration": 2.6781582832336426 + }, + { + "auxiliary_loss_clip": 0.01149072, + "auxiliary_loss_mlp": 0.01040974, + "balance_loss_clip": 1.05489349, + "balance_loss_mlp": 1.02522612, + "epoch": 0.19383736660153314, + "flos": 18398259479520.0, + "grad_norm": 2.0289723882217783, + "language_loss": 0.75770378, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77960426, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.15759277, + "step": 3224, + "time_per_iteration": 2.6533827781677246 + }, + { + "auxiliary_loss_clip": 0.01146833, + "auxiliary_loss_mlp": 0.01037454, + "balance_loss_clip": 1.05259514, + "balance_loss_mlp": 1.02180171, + "epoch": 0.1938974898542011, + "flos": 57497815068480.0, + "grad_norm": 2.231955240781305, + "language_loss": 0.7088201, + "learning_rate": 3.7247680111229e-06, + "loss": 0.73066294, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.15649414, + "step": 3225, + "time_per_iteration": 2.8609657287597656 + }, + { + "auxiliary_loss_clip": 0.01147203, + "auxiliary_loss_mlp": 0.01040045, + "balance_loss_clip": 1.05129194, + "balance_loss_mlp": 1.02415442, + "epoch": 0.19395761310686907, + "flos": 31496885121600.0, + "grad_norm": 4.619456326490558, + "language_loss": 0.69257545, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71444798, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.15881348, + "step": 3226, + "time_per_iteration": 2.714954137802124 + }, + { + "auxiliary_loss_clip": 0.01147409, + "auxiliary_loss_mlp": 0.0103972, + "balance_loss_clip": 1.05419946, + "balance_loss_mlp": 1.02292299, + "epoch": 0.19401773635953706, + "flos": 28113121422720.0, + "grad_norm": 3.711403240738024, + "language_loss": 0.76635617, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78822744, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.16796875, + "step": 3227, + "time_per_iteration": 2.6628687381744385 + }, + { + "auxiliary_loss_clip": 0.01150539, + "auxiliary_loss_mlp": 0.01037976, + "balance_loss_clip": 1.05297399, + "balance_loss_mlp": 1.02230597, + "epoch": 0.19407785961220503, + "flos": 19426563538560.0, + "grad_norm": 2.545423326305042, + "language_loss": 0.69468713, + "learning_rate": 3.724176216414662e-06, + "loss": 0.71657234, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.15679932, + "step": 3228, + "time_per_iteration": 2.6587023735046387 + }, + { + "auxiliary_loss_clip": 0.01151108, + "auxiliary_loss_mlp": 0.01039338, + "balance_loss_clip": 1.05475378, + "balance_loss_mlp": 1.02376986, + "epoch": 0.194137982864873, + "flos": 31630127404320.0, + "grad_norm": 1.9752532272531673, + "language_loss": 0.74471503, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76661944, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.15576172, + "step": 3229, + "time_per_iteration": 2.8872649669647217 + }, + { + "auxiliary_loss_clip": 0.01147181, + "auxiliary_loss_mlp": 0.01040713, + "balance_loss_clip": 1.0524404, + "balance_loss_mlp": 1.02497101, + "epoch": 0.19419810611754096, + "flos": 16180944713280.0, + "grad_norm": 5.500209232334229, + "language_loss": 0.65343225, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.67531121, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.15753174, + "step": 3230, + "time_per_iteration": 2.674567222595215 + }, + { + "auxiliary_loss_clip": 0.01141764, + "auxiliary_loss_mlp": 0.01038703, + "balance_loss_clip": 1.04917943, + "balance_loss_mlp": 1.02263343, + "epoch": 0.19425822937020892, + "flos": 19163603976480.0, + "grad_norm": 2.053177348992433, + "language_loss": 0.81785834, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.83966291, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.1607666, + "step": 3231, + "time_per_iteration": 2.683242082595825 + }, + { + "auxiliary_loss_clip": 0.01149155, + "auxiliary_loss_mlp": 0.01038313, + "balance_loss_clip": 1.05286896, + "balance_loss_mlp": 1.02057445, + "epoch": 0.1943183526228769, + "flos": 28194062247360.0, + "grad_norm": 1.7232850575881693, + "language_loss": 0.87100756, + "learning_rate": 3.72338624150555e-06, + "loss": 0.89288235, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.1776123, + "step": 3232, + "time_per_iteration": 2.7441258430480957 + }, + { + "auxiliary_loss_clip": 0.0114733, + "auxiliary_loss_mlp": 0.01043254, + "balance_loss_clip": 1.05354667, + "balance_loss_mlp": 1.02717876, + "epoch": 0.19437847587554485, + "flos": 30155143320000.0, + "grad_norm": 2.1061049546197483, + "language_loss": 0.85222399, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87412989, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.16070557, + "step": 3233, + "time_per_iteration": 2.698530435562134 + }, + { + "auxiliary_loss_clip": 0.01148542, + "auxiliary_loss_mlp": 0.01046865, + "balance_loss_clip": 1.05049253, + "balance_loss_mlp": 1.03116465, + "epoch": 0.19443859912821285, + "flos": 28206176914080.0, + "grad_norm": 2.3962257621725342, + "language_loss": 0.89031088, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91226494, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.15698242, + "step": 3234, + "time_per_iteration": 2.7276673316955566 + }, + { + "auxiliary_loss_clip": 0.01147423, + "auxiliary_loss_mlp": 0.01036301, + "balance_loss_clip": 1.04838037, + "balance_loss_mlp": 1.01988542, + "epoch": 0.1944987223808808, + "flos": 18405674141760.0, + "grad_norm": 2.3434054277030674, + "language_loss": 0.78404617, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80588341, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.1640625, + "step": 3235, + "time_per_iteration": 2.6807641983032227 + }, + { + "auxiliary_loss_clip": 0.01148394, + "auxiliary_loss_mlp": 0.01039519, + "balance_loss_clip": 1.05325794, + "balance_loss_mlp": 1.02455235, + "epoch": 0.19455884563354878, + "flos": 20898144171360.0, + "grad_norm": 3.956618962832481, + "language_loss": 0.79558396, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81746304, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.14971924, + "step": 3236, + "time_per_iteration": 2.6276674270629883 + }, + { + "auxiliary_loss_clip": 0.0114702, + "auxiliary_loss_mlp": 0.01040659, + "balance_loss_clip": 1.05260515, + "balance_loss_mlp": 1.02406561, + "epoch": 0.19461896888621674, + "flos": 24639989304960.0, + "grad_norm": 1.6586957123426693, + "language_loss": 0.75733, + "learning_rate": 3.72239730252843e-06, + "loss": 0.77920681, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.16589355, + "step": 3237, + "time_per_iteration": 2.658628463745117 + }, + { + "auxiliary_loss_clip": 0.01150293, + "auxiliary_loss_mlp": 0.01046316, + "balance_loss_clip": 1.05106091, + "balance_loss_mlp": 1.03087854, + "epoch": 0.1946790921388847, + "flos": 30873129467040.0, + "grad_norm": 1.6157281717528782, + "language_loss": 0.74795169, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.76991779, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.15441895, + "step": 3238, + "time_per_iteration": 2.658945322036743 + }, + { + "auxiliary_loss_clip": 0.01150653, + "auxiliary_loss_mlp": 0.01043116, + "balance_loss_clip": 1.0536691, + "balance_loss_mlp": 1.02623558, + "epoch": 0.19473921539155267, + "flos": 24639989304960.0, + "grad_norm": 2.023760520207433, + "language_loss": 0.73958427, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.76152194, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.16882324, + "step": 3239, + "time_per_iteration": 2.663022756576538 + }, + { + "auxiliary_loss_clip": 0.01145461, + "auxiliary_loss_mlp": 0.0104098, + "balance_loss_clip": 1.04990816, + "balance_loss_mlp": 1.02486324, + "epoch": 0.19479933864422067, + "flos": 25486963420320.0, + "grad_norm": 1.7056517395693822, + "language_loss": 0.73549336, + "learning_rate": 3.721803155320412e-06, + "loss": 0.75735784, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.16101074, + "step": 3240, + "time_per_iteration": 2.634202003479004 + }, + { + "auxiliary_loss_clip": 0.01149924, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.0537585, + "balance_loss_mlp": 1.02168322, + "epoch": 0.19485946189688863, + "flos": 28424857439520.0, + "grad_norm": 2.486651894273374, + "language_loss": 0.66527736, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.68714517, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.1517334, + "step": 3241, + "time_per_iteration": 2.7396702766418457 + }, + { + "auxiliary_loss_clip": 0.01146796, + "auxiliary_loss_mlp": 0.01045875, + "balance_loss_clip": 1.05127048, + "balance_loss_mlp": 1.03005624, + "epoch": 0.1949195851495566, + "flos": 28424816922240.0, + "grad_norm": 1.4414045116705942, + "language_loss": 0.82844609, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.85037279, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.15820312, + "step": 3242, + "time_per_iteration": 2.852787971496582 + }, + { + "auxiliary_loss_clip": 0.01055073, + "auxiliary_loss_mlp": 0.01018422, + "balance_loss_clip": 1.02211535, + "balance_loss_mlp": 1.01637483, + "epoch": 0.19497970840222456, + "flos": 79267648896000.0, + "grad_norm": 0.8319948139349753, + "language_loss": 0.57465434, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59538937, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 0.32910156, + "router_z_loss_mlp": 0.02047729, + "step": 3243, + "time_per_iteration": 3.2915408611297607 + }, + { + "auxiliary_loss_clip": 0.01147711, + "auxiliary_loss_mlp": 0.01040841, + "balance_loss_clip": 1.05120015, + "balance_loss_mlp": 1.02452087, + "epoch": 0.19503983165489253, + "flos": 23970577026240.0, + "grad_norm": 1.9345474692157827, + "language_loss": 0.83725953, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.8591451, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.16320801, + "step": 3244, + "time_per_iteration": 2.7797951698303223 + }, + { + "auxiliary_loss_clip": 0.01150634, + "auxiliary_loss_mlp": 0.01045937, + "balance_loss_clip": 1.05500674, + "balance_loss_mlp": 1.0299753, + "epoch": 0.1950999549075605, + "flos": 25797524436000.0, + "grad_norm": 1.8621175123257825, + "language_loss": 0.76824594, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79021168, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.15966797, + "step": 3245, + "time_per_iteration": 2.6782724857330322 + }, + { + "auxiliary_loss_clip": 0.0114649, + "auxiliary_loss_mlp": 0.01037859, + "balance_loss_clip": 1.04993129, + "balance_loss_mlp": 1.02139616, + "epoch": 0.19516007816022846, + "flos": 25483884107040.0, + "grad_norm": 2.009453842296034, + "language_loss": 0.83800197, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.85984546, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.16455078, + "step": 3246, + "time_per_iteration": 2.6775963306427 + }, + { + "auxiliary_loss_clip": 0.01149498, + "auxiliary_loss_mlp": 0.01045709, + "balance_loss_clip": 1.05196905, + "balance_loss_mlp": 1.02975857, + "epoch": 0.19522020141289645, + "flos": 20633888056320.0, + "grad_norm": 2.0473863346584604, + "language_loss": 0.76240093, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.78435302, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.15942383, + "step": 3247, + "time_per_iteration": 2.6526172161102295 + }, + { + "auxiliary_loss_clip": 0.01148626, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.05217767, + "balance_loss_mlp": 1.02041888, + "epoch": 0.19528032466556441, + "flos": 32610100698720.0, + "grad_norm": 1.934139843408274, + "language_loss": 0.75410199, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77594161, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.14916992, + "step": 3248, + "time_per_iteration": 2.7193236351013184 + }, + { + "auxiliary_loss_clip": 0.01146958, + "auxiliary_loss_mlp": 0.01042005, + "balance_loss_clip": 1.04921532, + "balance_loss_mlp": 1.02642441, + "epoch": 0.19534044791823238, + "flos": 25664282153280.0, + "grad_norm": 2.0655437751368466, + "language_loss": 0.7830652, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.80495483, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.15576172, + "step": 3249, + "time_per_iteration": 2.6380107402801514 + }, + { + "auxiliary_loss_clip": 0.01147399, + "auxiliary_loss_mlp": 0.01040918, + "balance_loss_clip": 1.05105162, + "balance_loss_mlp": 1.02530205, + "epoch": 0.19540057117090034, + "flos": 27264567133440.0, + "grad_norm": 1.5869283138844308, + "language_loss": 0.73128545, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.75316858, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.15600586, + "step": 3250, + "time_per_iteration": 2.6918587684631348 + }, + { + "auxiliary_loss_clip": 0.01143332, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.04935408, + "balance_loss_mlp": 1.02261996, + "epoch": 0.1954606944235683, + "flos": 24772178138400.0, + "grad_norm": 2.459188136833548, + "language_loss": 0.79535693, + "learning_rate": 3.719619589699017e-06, + "loss": 0.81716239, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.14599609, + "step": 3251, + "time_per_iteration": 2.6246302127838135 + }, + { + "auxiliary_loss_clip": 0.01145106, + "auxiliary_loss_mlp": 0.01040996, + "balance_loss_clip": 1.04878974, + "balance_loss_mlp": 1.02471161, + "epoch": 0.19552081767623627, + "flos": 21166492531680.0, + "grad_norm": 2.5907768448402133, + "language_loss": 0.83493686, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.85679787, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.16271973, + "step": 3252, + "time_per_iteration": 4.059183120727539 + }, + { + "auxiliary_loss_clip": 0.01150856, + "auxiliary_loss_mlp": 0.01044021, + "balance_loss_clip": 1.05063844, + "balance_loss_mlp": 1.02708125, + "epoch": 0.19558094092890424, + "flos": 39020073386400.0, + "grad_norm": 2.131909474618766, + "language_loss": 0.73575002, + "learning_rate": 3.719221729768117e-06, + "loss": 0.75769877, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.16955566, + "step": 3253, + "time_per_iteration": 4.4238080978393555 + }, + { + "auxiliary_loss_clip": 0.01149487, + "auxiliary_loss_mlp": 0.01045886, + "balance_loss_clip": 1.04909778, + "balance_loss_mlp": 1.03022182, + "epoch": 0.19564106418157223, + "flos": 27173051298720.0, + "grad_norm": 1.9047450785422941, + "language_loss": 0.76463258, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.78658628, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 1.00292969, + "router_z_loss_mlp": 0.15649414, + "step": 3254, + "time_per_iteration": 2.6411635875701904 + }, + { + "auxiliary_loss_clip": 0.01053312, + "auxiliary_loss_mlp": 0.01004352, + "balance_loss_clip": 1.02024555, + "balance_loss_mlp": 1.00225735, + "epoch": 0.1957011874342402, + "flos": 66336374736000.0, + "grad_norm": 0.7664751172051967, + "language_loss": 0.55319458, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.57377124, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 0.33056641, + "router_z_loss_mlp": 0.02096558, + "step": 3255, + "time_per_iteration": 3.2585716247558594 + }, + { + "auxiliary_loss_clip": 0.01150025, + "auxiliary_loss_mlp": 0.01044131, + "balance_loss_clip": 1.05259061, + "balance_loss_mlp": 1.02757239, + "epoch": 0.19576131068690816, + "flos": 20141348820480.0, + "grad_norm": 3.2857291503856416, + "language_loss": 0.70824981, + "learning_rate": 3.718624450942688e-06, + "loss": 0.73019129, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.16564941, + "step": 3256, + "time_per_iteration": 4.075100421905518 + }, + { + "auxiliary_loss_clip": 0.01143925, + "auxiliary_loss_mlp": 0.01038477, + "balance_loss_clip": 1.04861128, + "balance_loss_mlp": 1.02280116, + "epoch": 0.19582143393957613, + "flos": 17960614807680.0, + "grad_norm": 2.2757525314807125, + "language_loss": 0.8012439, + "learning_rate": 3.718425227649987e-06, + "loss": 0.8230679, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.15673828, + "step": 3257, + "time_per_iteration": 4.10721755027771 + }, + { + "auxiliary_loss_clip": 0.01146026, + "auxiliary_loss_mlp": 0.01038446, + "balance_loss_clip": 1.05111432, + "balance_loss_mlp": 1.02349687, + "epoch": 0.1958815571922441, + "flos": 29804354995680.0, + "grad_norm": 2.047555896607059, + "language_loss": 0.74971068, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.7715553, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.14941406, + "step": 3258, + "time_per_iteration": 2.6606647968292236 + }, + { + "auxiliary_loss_clip": 0.01150479, + "auxiliary_loss_mlp": 0.01042538, + "balance_loss_clip": 1.05187154, + "balance_loss_mlp": 1.02596748, + "epoch": 0.19594168044491206, + "flos": 30383669544480.0, + "grad_norm": 1.9465765830885453, + "language_loss": 0.73929298, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.7612232, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 0.98486328, + "router_z_loss_mlp": 0.16552734, + "step": 3259, + "time_per_iteration": 2.716299057006836 + }, + { + "auxiliary_loss_clip": 0.01151299, + "auxiliary_loss_mlp": 0.01043408, + "balance_loss_clip": 1.05181789, + "balance_loss_mlp": 1.02699339, + "epoch": 0.19600180369758005, + "flos": 14711349427200.0, + "grad_norm": 2.917285943443975, + "language_loss": 0.77093434, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.79288149, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 0.99511719, + "router_z_loss_mlp": 0.16394043, + "step": 3260, + "time_per_iteration": 2.6158409118652344 + }, + { + "auxiliary_loss_clip": 0.01146657, + "auxiliary_loss_mlp": 0.01045338, + "balance_loss_clip": 1.04928875, + "balance_loss_mlp": 1.02930415, + "epoch": 0.19606192695024802, + "flos": 25441468865280.0, + "grad_norm": 2.6874163611910866, + "language_loss": 0.81897759, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.84089756, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.16027832, + "step": 3261, + "time_per_iteration": 2.7053275108337402 + }, + { + "auxiliary_loss_clip": 0.01147162, + "auxiliary_loss_mlp": 0.01046657, + "balance_loss_clip": 1.05148339, + "balance_loss_mlp": 1.03088593, + "epoch": 0.19612205020291598, + "flos": 34746474640320.0, + "grad_norm": 1.7263596049277148, + "language_loss": 0.76644099, + "learning_rate": 3.717428133894807e-06, + "loss": 0.78837919, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.15771484, + "step": 3262, + "time_per_iteration": 2.7500972747802734 + }, + { + "auxiliary_loss_clip": 0.01147463, + "auxiliary_loss_mlp": 0.01049054, + "balance_loss_clip": 1.05386555, + "balance_loss_mlp": 1.03350925, + "epoch": 0.19618217345558395, + "flos": 31184622380160.0, + "grad_norm": 1.9615546256507226, + "language_loss": 0.86367071, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88563585, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.15551758, + "step": 3263, + "time_per_iteration": 2.7298431396484375 + }, + { + "auxiliary_loss_clip": 0.01146851, + "auxiliary_loss_mlp": 0.01042189, + "balance_loss_clip": 1.05073345, + "balance_loss_mlp": 1.02735925, + "epoch": 0.1962422967082519, + "flos": 30828080602080.0, + "grad_norm": 2.07148607283617, + "language_loss": 0.73814273, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76003313, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.14825439, + "step": 3264, + "time_per_iteration": 2.7134790420532227 + }, + { + "auxiliary_loss_clip": 0.01147158, + "auxiliary_loss_mlp": 0.01049296, + "balance_loss_clip": 1.05404842, + "balance_loss_mlp": 1.0341264, + "epoch": 0.19630241996091988, + "flos": 22948593662880.0, + "grad_norm": 2.283569616371034, + "language_loss": 0.78758389, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.80954838, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.15167236, + "step": 3265, + "time_per_iteration": 2.91424298286438 + }, + { + "auxiliary_loss_clip": 0.01061229, + "auxiliary_loss_mlp": 0.01019326, + "balance_loss_clip": 1.02820206, + "balance_loss_mlp": 1.01727152, + "epoch": 0.19636254321358784, + "flos": 76045361673600.0, + "grad_norm": 0.790839744763209, + "language_loss": 0.53523582, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55604136, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 0.32983398, + "router_z_loss_mlp": 0.02055359, + "step": 3266, + "time_per_iteration": 3.3040502071380615 + }, + { + "auxiliary_loss_clip": 0.01151206, + "auxiliary_loss_mlp": 0.01049268, + "balance_loss_clip": 1.0516479, + "balance_loss_mlp": 1.03197098, + "epoch": 0.19642266646625584, + "flos": 25708844810880.0, + "grad_norm": 2.0676429477492433, + "language_loss": 0.79892188, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.82092667, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.17297363, + "step": 3267, + "time_per_iteration": 2.6476781368255615 + }, + { + "auxiliary_loss_clip": 0.01143533, + "auxiliary_loss_mlp": 0.01044218, + "balance_loss_clip": 1.05064416, + "balance_loss_mlp": 1.02944803, + "epoch": 0.1964827897189238, + "flos": 17739584280000.0, + "grad_norm": 2.619457162977936, + "language_loss": 0.86710072, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.88897824, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.14764404, + "step": 3268, + "time_per_iteration": 2.6896984577178955 + }, + { + "auxiliary_loss_clip": 0.01148403, + "auxiliary_loss_mlp": 0.01043707, + "balance_loss_clip": 1.0548737, + "balance_loss_mlp": 1.02851939, + "epoch": 0.19654291297159177, + "flos": 23482251587520.0, + "grad_norm": 2.1875169974801865, + "language_loss": 0.68751979, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.70944089, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.15185547, + "step": 3269, + "time_per_iteration": 2.6658647060394287 + }, + { + "auxiliary_loss_clip": 0.011516, + "auxiliary_loss_mlp": 0.01047429, + "balance_loss_clip": 1.05393147, + "balance_loss_mlp": 1.03102016, + "epoch": 0.19660303622425973, + "flos": 31451957808480.0, + "grad_norm": 3.4966531525888, + "language_loss": 0.80775827, + "learning_rate": 3.715829397778135e-06, + "loss": 0.82974857, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.16412354, + "step": 3270, + "time_per_iteration": 2.7457082271575928 + }, + { + "auxiliary_loss_clip": 0.01144152, + "auxiliary_loss_mlp": 0.01040358, + "balance_loss_clip": 1.04979289, + "balance_loss_mlp": 1.02524209, + "epoch": 0.1966631594769277, + "flos": 25130138021280.0, + "grad_norm": 2.085857619091796, + "language_loss": 0.84195286, + "learning_rate": 3.715629262894028e-06, + "loss": 0.86379796, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.15124512, + "step": 3271, + "time_per_iteration": 2.644238233566284 + }, + { + "auxiliary_loss_clip": 0.01145552, + "auxiliary_loss_mlp": 0.01045104, + "balance_loss_clip": 1.05200887, + "balance_loss_mlp": 1.02946413, + "epoch": 0.19672328272959566, + "flos": 28825273081440.0, + "grad_norm": 2.2729733913189714, + "language_loss": 0.79984009, + "learning_rate": 3.715429062953087e-06, + "loss": 0.82174659, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.15631104, + "step": 3272, + "time_per_iteration": 2.6506478786468506 + }, + { + "auxiliary_loss_clip": 0.01149886, + "auxiliary_loss_mlp": 0.01040685, + "balance_loss_clip": 1.0535779, + "balance_loss_mlp": 1.02416301, + "epoch": 0.19678340598226365, + "flos": 28200301908480.0, + "grad_norm": 1.9400305165339584, + "language_loss": 0.81369758, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.8356033, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.16516113, + "step": 3273, + "time_per_iteration": 2.709594249725342 + }, + { + "auxiliary_loss_clip": 0.01149868, + "auxiliary_loss_mlp": 0.01047407, + "balance_loss_clip": 1.05185008, + "balance_loss_mlp": 1.03180242, + "epoch": 0.19684352923493162, + "flos": 29937799864800.0, + "grad_norm": 2.209910809185183, + "language_loss": 0.78010267, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.80207539, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.15625, + "step": 3274, + "time_per_iteration": 2.6949679851531982 + }, + { + "auxiliary_loss_clip": 0.01149506, + "auxiliary_loss_mlp": 0.01042026, + "balance_loss_clip": 1.0526402, + "balance_loss_mlp": 1.02570629, + "epoch": 0.19690365248759958, + "flos": 26596370373120.0, + "grad_norm": 2.6472374854419267, + "language_loss": 0.8127656, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83468097, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.16320801, + "step": 3275, + "time_per_iteration": 2.6616342067718506 + }, + { + "auxiliary_loss_clip": 0.01149416, + "auxiliary_loss_mlp": 0.01040271, + "balance_loss_clip": 1.05139315, + "balance_loss_mlp": 1.02417779, + "epoch": 0.19696377574026755, + "flos": 23253360707520.0, + "grad_norm": 2.281028722177479, + "language_loss": 0.81460559, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83650249, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.16088867, + "step": 3276, + "time_per_iteration": 2.6874799728393555 + }, + { + "auxiliary_loss_clip": 0.01147837, + "auxiliary_loss_mlp": 0.01035208, + "balance_loss_clip": 1.05165601, + "balance_loss_mlp": 1.01937735, + "epoch": 0.19702389899293551, + "flos": 27845948063520.0, + "grad_norm": 2.7791986585880437, + "language_loss": 0.89380497, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91563547, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.1583252, + "step": 3277, + "time_per_iteration": 2.8488712310791016 + }, + { + "auxiliary_loss_clip": 0.01152632, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.05239296, + "balance_loss_mlp": 1.02927125, + "epoch": 0.19708402224560348, + "flos": 27935681137920.0, + "grad_norm": 3.3307974884271365, + "language_loss": 0.62143183, + "learning_rate": 3.714226497539239e-06, + "loss": 0.6434356, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 1.00146484, + "router_z_loss_mlp": 0.18469238, + "step": 3278, + "time_per_iteration": 2.810403347015381 + }, + { + "auxiliary_loss_clip": 0.01152655, + "auxiliary_loss_mlp": 0.0105208, + "balance_loss_clip": 1.05405879, + "balance_loss_mlp": 1.03555763, + "epoch": 0.19714414549827144, + "flos": 31314177590400.0, + "grad_norm": 2.336655767585001, + "language_loss": 0.73350084, + "learning_rate": 3.714025842413166e-06, + "loss": 0.75554818, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.16516113, + "step": 3279, + "time_per_iteration": 2.6826109886169434 + }, + { + "auxiliary_loss_clip": 0.01149817, + "auxiliary_loss_mlp": 0.01041406, + "balance_loss_clip": 1.05183935, + "balance_loss_mlp": 1.02638531, + "epoch": 0.19720426875093944, + "flos": 29182301066880.0, + "grad_norm": 1.6157410813214996, + "language_loss": 0.82654864, + "learning_rate": 3.713825122291061e-06, + "loss": 0.84846085, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.15014648, + "step": 3280, + "time_per_iteration": 2.6592233180999756 + }, + { + "auxiliary_loss_clip": 0.01150259, + "auxiliary_loss_mlp": 0.01035277, + "balance_loss_clip": 1.05387378, + "balance_loss_mlp": 1.02087641, + "epoch": 0.1972643920036074, + "flos": 16938226271520.0, + "grad_norm": 1.9593083357035024, + "language_loss": 0.77836192, + "learning_rate": 3.713624337180536e-06, + "loss": 0.80021727, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.14404297, + "step": 3281, + "time_per_iteration": 2.632519483566284 + }, + { + "auxiliary_loss_clip": 0.01146008, + "auxiliary_loss_mlp": 0.01039013, + "balance_loss_clip": 1.05322433, + "balance_loss_mlp": 1.0250535, + "epoch": 0.19732451525627537, + "flos": 24237669350880.0, + "grad_norm": 1.6113046672969706, + "language_loss": 0.79327738, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81512761, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.13977051, + "step": 3282, + "time_per_iteration": 2.6596264839172363 + }, + { + "auxiliary_loss_clip": 0.01157199, + "auxiliary_loss_mlp": 0.0104171, + "balance_loss_clip": 1.05779672, + "balance_loss_mlp": 1.02601063, + "epoch": 0.19738463850894333, + "flos": 30472916411520.0, + "grad_norm": 1.9828221724126385, + "language_loss": 0.72142154, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.74341071, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 0.99462891, + "router_z_loss_mlp": 0.15698242, + "step": 3283, + "time_per_iteration": 2.700730085372925 + }, + { + "auxiliary_loss_clip": 0.0115233, + "auxiliary_loss_mlp": 0.01044051, + "balance_loss_clip": 1.05506051, + "balance_loss_mlp": 1.02826726, + "epoch": 0.1974447617616113, + "flos": 22413598668000.0, + "grad_norm": 1.6947204280421877, + "language_loss": 0.78910625, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81107008, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.15759277, + "step": 3284, + "time_per_iteration": 2.6402196884155273 + }, + { + "auxiliary_loss_clip": 0.01153395, + "auxiliary_loss_mlp": 0.01039791, + "balance_loss_clip": 1.05398214, + "balance_loss_mlp": 1.02343571, + "epoch": 0.19750488501427926, + "flos": 27934222515840.0, + "grad_norm": 1.9773895275816846, + "language_loss": 0.86054653, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.88247836, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.16345215, + "step": 3285, + "time_per_iteration": 2.6519365310668945 + }, + { + "auxiliary_loss_clip": 0.01151806, + "auxiliary_loss_mlp": 0.01038859, + "balance_loss_clip": 1.05728543, + "balance_loss_mlp": 1.02332568, + "epoch": 0.19756500826694723, + "flos": 26687764656000.0, + "grad_norm": 2.312883318259662, + "language_loss": 0.88516486, + "learning_rate": 3.712619437068174e-06, + "loss": 0.90707147, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.15539551, + "step": 3286, + "time_per_iteration": 2.685141086578369 + }, + { + "auxiliary_loss_clip": 0.01159728, + "auxiliary_loss_mlp": 0.01043448, + "balance_loss_clip": 1.05977023, + "balance_loss_mlp": 1.02520919, + "epoch": 0.19762513151961522, + "flos": 18496541700000.0, + "grad_norm": 2.2842077950933946, + "language_loss": 0.78487444, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80690622, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.18249512, + "step": 3287, + "time_per_iteration": 2.656498670578003 + }, + { + "auxiliary_loss_clip": 0.01154949, + "auxiliary_loss_mlp": 0.01043236, + "balance_loss_clip": 1.05548716, + "balance_loss_mlp": 1.02595067, + "epoch": 0.1976852547722832, + "flos": 20718070263360.0, + "grad_norm": 2.1889243459314085, + "language_loss": 0.80668849, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.82867038, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 0.99462891, + "router_z_loss_mlp": 0.17285156, + "step": 3288, + "time_per_iteration": 2.722647190093994 + }, + { + "auxiliary_loss_clip": 0.01148046, + "auxiliary_loss_mlp": 0.010489, + "balance_loss_clip": 1.05456614, + "balance_loss_mlp": 1.03351045, + "epoch": 0.19774537802495115, + "flos": 24773636760480.0, + "grad_norm": 1.646154071944119, + "language_loss": 0.7263298, + "learning_rate": 3.712015717627374e-06, + "loss": 0.7482993, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.15393066, + "step": 3289, + "time_per_iteration": 2.6701254844665527 + }, + { + "auxiliary_loss_clip": 0.01151941, + "auxiliary_loss_mlp": 0.01045872, + "balance_loss_clip": 1.05561948, + "balance_loss_mlp": 1.02976692, + "epoch": 0.19780550127761912, + "flos": 33233248594080.0, + "grad_norm": 1.8703285312566267, + "language_loss": 0.7925126, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81449074, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.16088867, + "step": 3290, + "time_per_iteration": 2.9517600536346436 + }, + { + "auxiliary_loss_clip": 0.01062919, + "auxiliary_loss_mlp": 0.01001636, + "balance_loss_clip": 1.02967215, + "balance_loss_mlp": 0.99954987, + "epoch": 0.19786562453028708, + "flos": 77543724228480.0, + "grad_norm": 1.6860878109428616, + "language_loss": 0.6040194, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62466496, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 0.02087402, + "step": 3291, + "time_per_iteration": 4.78715443611145 + }, + { + "auxiliary_loss_clip": 0.01156371, + "auxiliary_loss_mlp": 0.01047356, + "balance_loss_clip": 1.05418384, + "balance_loss_mlp": 1.02991593, + "epoch": 0.19792574778295505, + "flos": 32075429842080.0, + "grad_norm": 1.704458718281709, + "language_loss": 0.8101964, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83223373, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 1.02050781, + "router_z_loss_mlp": 0.17431641, + "step": 3292, + "time_per_iteration": 4.187391757965088 + }, + { + "auxiliary_loss_clip": 0.01147934, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_clip": 1.05513334, + "balance_loss_mlp": 1.03046417, + "epoch": 0.19798587103562304, + "flos": 24328455874560.0, + "grad_norm": 2.436759784846056, + "language_loss": 0.81912053, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.84106278, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.15814209, + "step": 3293, + "time_per_iteration": 2.662221908569336 + }, + { + "auxiliary_loss_clip": 0.01165361, + "auxiliary_loss_mlp": 0.01047859, + "balance_loss_clip": 1.06254649, + "balance_loss_mlp": 1.02991772, + "epoch": 0.198045994288291, + "flos": 24550782955200.0, + "grad_norm": 1.745363639863399, + "language_loss": 0.60919601, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63132823, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.17956543, + "step": 3294, + "time_per_iteration": 2.7062180042266846 + }, + { + "auxiliary_loss_clip": 0.01151251, + "auxiliary_loss_mlp": 0.0104496, + "balance_loss_clip": 1.05461788, + "balance_loss_mlp": 1.02966559, + "epoch": 0.19810611754095897, + "flos": 21923976676320.0, + "grad_norm": 2.1678112260874434, + "language_loss": 0.87319702, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89515913, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.15319824, + "step": 3295, + "time_per_iteration": 4.082299470901489 + }, + { + "auxiliary_loss_clip": 0.0115069, + "auxiliary_loss_mlp": 0.01045574, + "balance_loss_clip": 1.05456579, + "balance_loss_mlp": 1.03029132, + "epoch": 0.19816624079362694, + "flos": 18406362935520.0, + "grad_norm": 3.2364912460867097, + "language_loss": 0.80593145, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.82789403, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.15283203, + "step": 3296, + "time_per_iteration": 4.166049242019653 + }, + { + "auxiliary_loss_clip": 0.01157025, + "auxiliary_loss_mlp": 0.01046627, + "balance_loss_clip": 1.05557656, + "balance_loss_mlp": 1.02925849, + "epoch": 0.1982263640462949, + "flos": 30383710061760.0, + "grad_norm": 1.8908207044133525, + "language_loss": 0.68292284, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70495939, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.17370605, + "step": 3297, + "time_per_iteration": 2.720072031021118 + }, + { + "auxiliary_loss_clip": 0.011464, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.05305684, + "balance_loss_mlp": 1.02064514, + "epoch": 0.19828648729896287, + "flos": 24863005179360.0, + "grad_norm": 1.6367484045859564, + "language_loss": 0.81308627, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83490825, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 0.93310547, + "router_z_loss_mlp": 0.15148926, + "step": 3298, + "time_per_iteration": 2.6529924869537354 + }, + { + "auxiliary_loss_clip": 0.01158077, + "auxiliary_loss_mlp": 0.01041711, + "balance_loss_clip": 1.05635512, + "balance_loss_mlp": 1.0230962, + "epoch": 0.19834661055163083, + "flos": 23037313805280.0, + "grad_norm": 2.1340121469826543, + "language_loss": 0.85051024, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.87250817, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.18621826, + "step": 3299, + "time_per_iteration": 2.6735031604766846 + }, + { + "auxiliary_loss_clip": 0.0106108, + "auxiliary_loss_mlp": 0.01008931, + "balance_loss_clip": 1.02768302, + "balance_loss_mlp": 1.00655687, + "epoch": 0.19840673380429882, + "flos": 73527574694400.0, + "grad_norm": 0.7650771642375207, + "language_loss": 0.53233457, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55303466, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 0.33374023, + "router_z_loss_mlp": 0.02371216, + "step": 3300, + "time_per_iteration": 3.3091304302215576 + }, + { + "auxiliary_loss_clip": 0.01152719, + "auxiliary_loss_mlp": 0.01050561, + "balance_loss_clip": 1.05474138, + "balance_loss_mlp": 1.03238189, + "epoch": 0.1984668570569668, + "flos": 24284379424320.0, + "grad_norm": 2.0370719327281135, + "language_loss": 0.73414814, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.756181, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.18164062, + "step": 3301, + "time_per_iteration": 2.8430018424987793 + }, + { + "auxiliary_loss_clip": 0.01152159, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.05481851, + "balance_loss_mlp": 1.02194643, + "epoch": 0.19852698030963475, + "flos": 19072493314560.0, + "grad_norm": 2.2221977225170617, + "language_loss": 0.88174474, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90363795, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.15197754, + "step": 3302, + "time_per_iteration": 2.621342897415161 + }, + { + "auxiliary_loss_clip": 0.01150825, + "auxiliary_loss_mlp": 0.01051494, + "balance_loss_clip": 1.05242682, + "balance_loss_mlp": 1.03473282, + "epoch": 0.19858710356230272, + "flos": 52644861256320.0, + "grad_norm": 1.757313855410646, + "language_loss": 0.73206151, + "learning_rate": 3.709190638115111e-06, + "loss": 0.75408471, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.16760254, + "step": 3303, + "time_per_iteration": 2.8384363651275635 + }, + { + "auxiliary_loss_clip": 0.01152302, + "auxiliary_loss_mlp": 0.01046276, + "balance_loss_clip": 1.05513406, + "balance_loss_mlp": 1.02964628, + "epoch": 0.19864722681497068, + "flos": 42884747861760.0, + "grad_norm": 4.332600177145593, + "language_loss": 0.74630153, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.7682873, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.16625977, + "step": 3304, + "time_per_iteration": 2.794574499130249 + }, + { + "auxiliary_loss_clip": 0.01151517, + "auxiliary_loss_mlp": 0.01042966, + "balance_loss_clip": 1.0545609, + "balance_loss_mlp": 1.02739751, + "epoch": 0.19870735006763865, + "flos": 23703889874400.0, + "grad_norm": 1.7157275886356067, + "language_loss": 0.86225855, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.88420337, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.15563965, + "step": 3305, + "time_per_iteration": 2.643414258956909 + }, + { + "auxiliary_loss_clip": 0.01154483, + "auxiliary_loss_mlp": 0.01044242, + "balance_loss_clip": 1.0535208, + "balance_loss_mlp": 1.0283041, + "epoch": 0.19876747332030664, + "flos": 28733027935680.0, + "grad_norm": 1.6723923448662648, + "language_loss": 0.68520117, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70718843, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 1.01025391, + "router_z_loss_mlp": 0.15942383, + "step": 3306, + "time_per_iteration": 2.703242778778076 + }, + { + "auxiliary_loss_clip": 0.01149954, + "auxiliary_loss_mlp": 0.01046475, + "balance_loss_clip": 1.053303, + "balance_loss_mlp": 1.03121579, + "epoch": 0.1988275965729746, + "flos": 24195740316480.0, + "grad_norm": 1.7981862695833686, + "language_loss": 0.76773292, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78969717, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.15252686, + "step": 3307, + "time_per_iteration": 2.649392604827881 + }, + { + "auxiliary_loss_clip": 0.01150371, + "auxiliary_loss_mlp": 0.01049708, + "balance_loss_clip": 1.05436635, + "balance_loss_mlp": 1.03462851, + "epoch": 0.19888771982564257, + "flos": 28691585108640.0, + "grad_norm": 2.9348811632927783, + "language_loss": 0.75947881, + "learning_rate": 3.708178601452737e-06, + "loss": 0.7814796, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.15075684, + "step": 3308, + "time_per_iteration": 2.810596227645874 + }, + { + "auxiliary_loss_clip": 0.01152866, + "auxiliary_loss_mlp": 0.01036941, + "balance_loss_clip": 1.05475712, + "balance_loss_mlp": 1.02096736, + "epoch": 0.19894784307831054, + "flos": 22147073585280.0, + "grad_norm": 1.6760090247905024, + "language_loss": 0.76041067, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.7823087, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.15991211, + "step": 3309, + "time_per_iteration": 2.687025547027588 + }, + { + "auxiliary_loss_clip": 0.01149604, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_clip": 1.05332923, + "balance_loss_mlp": 1.03138638, + "epoch": 0.1990079663309785, + "flos": 29622295740960.0, + "grad_norm": 1.8649593843968648, + "language_loss": 0.88056856, + "learning_rate": 3.707773333313917e-06, + "loss": 0.90254813, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.16967773, + "step": 3310, + "time_per_iteration": 2.6792147159576416 + }, + { + "auxiliary_loss_clip": 0.01149886, + "auxiliary_loss_mlp": 0.01042893, + "balance_loss_clip": 1.0533576, + "balance_loss_mlp": 1.02666855, + "epoch": 0.19906808958364647, + "flos": 42583222199520.0, + "grad_norm": 2.7583159969870645, + "language_loss": 0.64668405, + "learning_rate": 3.70757060210226e-06, + "loss": 0.66861188, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.16210938, + "step": 3311, + "time_per_iteration": 2.831831455230713 + }, + { + "auxiliary_loss_clip": 0.01152661, + "auxiliary_loss_mlp": 0.01043883, + "balance_loss_clip": 1.05292201, + "balance_loss_mlp": 1.02784896, + "epoch": 0.19912821283631443, + "flos": 29315421797760.0, + "grad_norm": 2.427655379820718, + "language_loss": 0.74067318, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76263857, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.16052246, + "step": 3312, + "time_per_iteration": 2.687742233276367 + }, + { + "auxiliary_loss_clip": 0.01150312, + "auxiliary_loss_mlp": 0.0104499, + "balance_loss_clip": 1.0527283, + "balance_loss_mlp": 1.02909994, + "epoch": 0.19918833608898243, + "flos": 24232523656320.0, + "grad_norm": 2.2700262463390177, + "language_loss": 0.83726734, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.85922033, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 0.97607422, + "router_z_loss_mlp": 0.15893555, + "step": 3313, + "time_per_iteration": 2.9020755290985107 + }, + { + "auxiliary_loss_clip": 0.01151874, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.05510008, + "balance_loss_mlp": 1.02418947, + "epoch": 0.1992484593416504, + "flos": 35503391543040.0, + "grad_norm": 2.3912775842795098, + "language_loss": 0.808061, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.82997924, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.15759277, + "step": 3314, + "time_per_iteration": 2.757645606994629 + }, + { + "auxiliary_loss_clip": 0.01144589, + "auxiliary_loss_mlp": 0.01044173, + "balance_loss_clip": 1.05051613, + "balance_loss_mlp": 1.029176, + "epoch": 0.19930858259431836, + "flos": 28424776404960.0, + "grad_norm": 1.4935220324354364, + "language_loss": 0.87335086, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.8952384, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.14984131, + "step": 3315, + "time_per_iteration": 2.7014431953430176 + }, + { + "auxiliary_loss_clip": 0.01151232, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.05414319, + "balance_loss_mlp": 1.02351499, + "epoch": 0.19936870584698632, + "flos": 30967238407680.0, + "grad_norm": 1.6406385395283107, + "language_loss": 0.71074009, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.73264241, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 0.97167969, + "router_z_loss_mlp": 0.1550293, + "step": 3316, + "time_per_iteration": 2.7298409938812256 + }, + { + "auxiliary_loss_clip": 0.01058559, + "auxiliary_loss_mlp": 0.01002933, + "balance_loss_clip": 1.02512217, + "balance_loss_mlp": 1.00082564, + "epoch": 0.1994288290996543, + "flos": 75855853589760.0, + "grad_norm": 0.8714302083550486, + "language_loss": 0.66338235, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68399727, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 0.33422852, + "router_z_loss_mlp": 0.02108765, + "step": 3317, + "time_per_iteration": 3.3689117431640625 + }, + { + "auxiliary_loss_clip": 0.01153676, + "auxiliary_loss_mlp": 0.01043896, + "balance_loss_clip": 1.05226398, + "balance_loss_mlp": 1.02799332, + "epoch": 0.19948895235232225, + "flos": 23215240297440.0, + "grad_norm": 2.184044593252815, + "language_loss": 0.74187249, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76384819, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.15905762, + "step": 3318, + "time_per_iteration": 2.672032594680786 + }, + { + "auxiliary_loss_clip": 0.01145702, + "auxiliary_loss_mlp": 0.01037636, + "balance_loss_clip": 1.05134058, + "balance_loss_mlp": 1.0227648, + "epoch": 0.19954907560499022, + "flos": 46144223596800.0, + "grad_norm": 1.924730423011253, + "language_loss": 0.78949898, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.81133235, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.14880371, + "step": 3319, + "time_per_iteration": 2.8838257789611816 + }, + { + "auxiliary_loss_clip": 0.01151489, + "auxiliary_loss_mlp": 0.01040139, + "balance_loss_clip": 1.05267143, + "balance_loss_mlp": 1.02257943, + "epoch": 0.1996091988576582, + "flos": 60481649332800.0, + "grad_norm": 1.9722534206835691, + "language_loss": 0.75735229, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.77926856, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.17553711, + "step": 3320, + "time_per_iteration": 2.9829213619232178 + }, + { + "auxiliary_loss_clip": 0.01147263, + "auxiliary_loss_mlp": 0.01042178, + "balance_loss_clip": 1.05130482, + "balance_loss_mlp": 1.02613187, + "epoch": 0.19966932211032618, + "flos": 27620582186880.0, + "grad_norm": 2.6842130134474593, + "language_loss": 0.80356526, + "learning_rate": 3.705539729936701e-06, + "loss": 0.8254596, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.16052246, + "step": 3321, + "time_per_iteration": 2.703291177749634 + }, + { + "auxiliary_loss_clip": 0.01057622, + "auxiliary_loss_mlp": 0.01003921, + "balance_loss_clip": 1.02420259, + "balance_loss_mlp": 1.00182581, + "epoch": 0.19972944536299414, + "flos": 65990853658080.0, + "grad_norm": 0.87867001827104, + "language_loss": 0.65195966, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67257506, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 0.33349609, + "router_z_loss_mlp": 0.02096558, + "step": 3322, + "time_per_iteration": 3.057461738586426 + }, + { + "auxiliary_loss_clip": 0.0105708, + "auxiliary_loss_mlp": 0.01003552, + "balance_loss_clip": 1.02377415, + "balance_loss_mlp": 1.00140071, + "epoch": 0.1997895686156621, + "flos": 84629383022880.0, + "grad_norm": 0.7857590304501509, + "language_loss": 0.56967527, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59028161, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 0.33276367, + "router_z_loss_mlp": 0.02154541, + "step": 3323, + "time_per_iteration": 3.3555071353912354 + }, + { + "auxiliary_loss_clip": 0.01146856, + "auxiliary_loss_mlp": 0.01039397, + "balance_loss_clip": 1.05151093, + "balance_loss_mlp": 1.02310085, + "epoch": 0.19984969186833007, + "flos": 22637060232480.0, + "grad_norm": 1.846940313514776, + "language_loss": 0.80314785, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.82501042, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.1628418, + "step": 3324, + "time_per_iteration": 2.655515193939209 + }, + { + "auxiliary_loss_clip": 0.01146374, + "auxiliary_loss_mlp": 0.01039623, + "balance_loss_clip": 1.05015492, + "balance_loss_mlp": 1.0228622, + "epoch": 0.19990981512099804, + "flos": 32250236503680.0, + "grad_norm": 2.631817430027952, + "language_loss": 0.53994763, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.56180763, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.16760254, + "step": 3325, + "time_per_iteration": 2.974917411804199 + }, + { + "auxiliary_loss_clip": 0.01145971, + "auxiliary_loss_mlp": 0.01039916, + "balance_loss_clip": 1.04974723, + "balance_loss_mlp": 1.02462173, + "epoch": 0.19996993837366603, + "flos": 19920682948320.0, + "grad_norm": 2.0812366539394334, + "language_loss": 0.86321747, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.8850764, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.1529541, + "step": 3326, + "time_per_iteration": 2.761448383331299 + }, + { + "auxiliary_loss_clip": 0.01146707, + "auxiliary_loss_mlp": 0.01037194, + "balance_loss_clip": 1.05278325, + "balance_loss_mlp": 1.0221858, + "epoch": 0.200030061626334, + "flos": 25434661962240.0, + "grad_norm": 4.790570280556171, + "language_loss": 0.71851212, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.74035114, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.15020752, + "step": 3327, + "time_per_iteration": 2.6866862773895264 + }, + { + "auxiliary_loss_clip": 0.01147923, + "auxiliary_loss_mlp": 0.01039437, + "balance_loss_clip": 1.04973912, + "balance_loss_mlp": 1.02289033, + "epoch": 0.20009018487900196, + "flos": 28996028015040.0, + "grad_norm": 3.132729452014916, + "language_loss": 0.77015483, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.79202843, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.16552734, + "step": 3328, + "time_per_iteration": 2.664863109588623 + }, + { + "auxiliary_loss_clip": 0.01139197, + "auxiliary_loss_mlp": 0.01038575, + "balance_loss_clip": 1.04794431, + "balance_loss_mlp": 1.02450812, + "epoch": 0.20015030813166992, + "flos": 34302266169120.0, + "grad_norm": 1.7438406543136333, + "language_loss": 0.6995582, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.72133595, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.14080811, + "step": 3329, + "time_per_iteration": 2.8020706176757812 + }, + { + "auxiliary_loss_clip": 0.01151357, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_clip": 1.05306005, + "balance_loss_mlp": 1.02687311, + "epoch": 0.2002104313843379, + "flos": 31807648723680.0, + "grad_norm": 3.5233529032985653, + "language_loss": 0.81177139, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.83373225, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.17858887, + "step": 3330, + "time_per_iteration": 2.7530677318573 + }, + { + "auxiliary_loss_clip": 0.01147152, + "auxiliary_loss_mlp": 0.01037876, + "balance_loss_clip": 1.0494808, + "balance_loss_mlp": 1.02150857, + "epoch": 0.20027055463700585, + "flos": 28024077142080.0, + "grad_norm": 2.005620322982223, + "language_loss": 0.76095414, + "learning_rate": 3.703502390349417e-06, + "loss": 0.78280443, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.16369629, + "step": 3331, + "time_per_iteration": 5.653068780899048 + }, + { + "auxiliary_loss_clip": 0.0114715, + "auxiliary_loss_mlp": 0.01042334, + "balance_loss_clip": 1.04925823, + "balance_loss_mlp": 1.02570462, + "epoch": 0.20033067788967382, + "flos": 20945259417600.0, + "grad_norm": 2.174006281719193, + "language_loss": 0.79050601, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.81240088, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.16625977, + "step": 3332, + "time_per_iteration": 2.662548542022705 + }, + { + "auxiliary_loss_clip": 0.01052327, + "auxiliary_loss_mlp": 0.01004043, + "balance_loss_clip": 1.01972675, + "balance_loss_mlp": 1.0019244, + "epoch": 0.2003908011423418, + "flos": 74220318576000.0, + "grad_norm": 0.9402879290150585, + "language_loss": 0.62014908, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64071274, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 0.32543945, + "router_z_loss_mlp": 0.02120972, + "step": 3333, + "time_per_iteration": 3.1842424869537354 + }, + { + "auxiliary_loss_clip": 0.01145483, + "auxiliary_loss_mlp": 0.01042418, + "balance_loss_clip": 1.04733336, + "balance_loss_mlp": 1.02647352, + "epoch": 0.20045092439500978, + "flos": 29536816980960.0, + "grad_norm": 2.6534723084642904, + "language_loss": 0.8133353, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.83521426, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.15942383, + "step": 3334, + "time_per_iteration": 4.196434259414673 + }, + { + "auxiliary_loss_clip": 0.01152554, + "auxiliary_loss_mlp": 0.01045959, + "balance_loss_clip": 1.05175185, + "balance_loss_mlp": 1.0287931, + "epoch": 0.20051104764767774, + "flos": 35859487631040.0, + "grad_norm": 1.8971426549809178, + "language_loss": 0.74499375, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76697886, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.17175293, + "step": 3335, + "time_per_iteration": 4.333533525466919 + }, + { + "auxiliary_loss_clip": 0.01151125, + "auxiliary_loss_mlp": 0.01053875, + "balance_loss_clip": 1.05349135, + "balance_loss_mlp": 1.03787768, + "epoch": 0.2005711709003457, + "flos": 28691787695040.0, + "grad_norm": 1.726947265269328, + "language_loss": 0.79766405, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.81971407, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.16003418, + "step": 3336, + "time_per_iteration": 2.755204916000366 + }, + { + "auxiliary_loss_clip": 0.01150839, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_clip": 1.04830813, + "balance_loss_mlp": 1.02372038, + "epoch": 0.20063129415301367, + "flos": 27483693348960.0, + "grad_norm": 1.9505962614463843, + "language_loss": 0.7752912, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.797212, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 1.02392578, + "router_z_loss_mlp": 0.17504883, + "step": 3337, + "time_per_iteration": 2.645184278488159 + }, + { + "auxiliary_loss_clip": 0.0115023, + "auxiliary_loss_mlp": 0.01040105, + "balance_loss_clip": 1.05200243, + "balance_loss_mlp": 1.02310598, + "epoch": 0.20069141740568164, + "flos": 31674690061920.0, + "grad_norm": 2.01692108897103, + "language_loss": 0.68874907, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.71065247, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 0.98242188, + "router_z_loss_mlp": 0.16992188, + "step": 3338, + "time_per_iteration": 2.689805269241333 + }, + { + "auxiliary_loss_clip": 0.01147666, + "auxiliary_loss_mlp": 0.01044177, + "balance_loss_clip": 1.05074704, + "balance_loss_mlp": 1.02869213, + "epoch": 0.2007515406583496, + "flos": 30249860019840.0, + "grad_norm": 1.8331782310573688, + "language_loss": 0.68722785, + "learning_rate": 3.701867867326735e-06, + "loss": 0.70914632, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.15490723, + "step": 3339, + "time_per_iteration": 2.6772146224975586 + }, + { + "auxiliary_loss_clip": 0.01153714, + "auxiliary_loss_mlp": 0.01034877, + "balance_loss_clip": 1.05337739, + "balance_loss_mlp": 1.01939142, + "epoch": 0.2008116639110176, + "flos": 46278154673280.0, + "grad_norm": 2.353520691190051, + "language_loss": 0.66216099, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.68404692, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.15478516, + "step": 3340, + "time_per_iteration": 2.8401029109954834 + }, + { + "auxiliary_loss_clip": 0.01150978, + "auxiliary_loss_mlp": 0.01035469, + "balance_loss_clip": 1.05053735, + "balance_loss_mlp": 1.01829076, + "epoch": 0.20087178716368556, + "flos": 25307659340640.0, + "grad_norm": 3.1694331041360435, + "language_loss": 0.74238479, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76424927, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.17175293, + "step": 3341, + "time_per_iteration": 2.68554949760437 + }, + { + "auxiliary_loss_clip": 0.01145439, + "auxiliary_loss_mlp": 0.01040815, + "balance_loss_clip": 1.05117798, + "balance_loss_mlp": 1.02548504, + "epoch": 0.20093191041635353, + "flos": 29092608509760.0, + "grad_norm": 2.8814292135037993, + "language_loss": 0.71822149, + "learning_rate": 3.70125385615256e-06, + "loss": 0.74008405, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.15344238, + "step": 3342, + "time_per_iteration": 2.7331244945526123 + }, + { + "auxiliary_loss_clip": 0.01147067, + "auxiliary_loss_mlp": 0.01042093, + "balance_loss_clip": 1.04923034, + "balance_loss_mlp": 1.02648842, + "epoch": 0.2009920336690215, + "flos": 26591427264960.0, + "grad_norm": 1.984463873539461, + "language_loss": 0.7218011, + "learning_rate": 3.701049056727384e-06, + "loss": 0.7436927, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.15600586, + "step": 3343, + "time_per_iteration": 2.6950416564941406 + }, + { + "auxiliary_loss_clip": 0.01148792, + "auxiliary_loss_mlp": 0.01044721, + "balance_loss_clip": 1.05087817, + "balance_loss_mlp": 1.02803147, + "epoch": 0.20105215692168946, + "flos": 32164028432640.0, + "grad_norm": 1.9906187890438982, + "language_loss": 0.80940312, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.83133823, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.16699219, + "step": 3344, + "time_per_iteration": 2.7392754554748535 + }, + { + "auxiliary_loss_clip": 0.01147278, + "auxiliary_loss_mlp": 0.01040175, + "balance_loss_clip": 1.04966092, + "balance_loss_mlp": 1.02442122, + "epoch": 0.20111228017435742, + "flos": 22948512628320.0, + "grad_norm": 2.168467876428072, + "language_loss": 0.83653939, + "learning_rate": 3.700639264372948e-06, + "loss": 0.85841393, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.15759277, + "step": 3345, + "time_per_iteration": 2.670781373977661 + }, + { + "auxiliary_loss_clip": 0.01139617, + "auxiliary_loss_mlp": 0.01034479, + "balance_loss_clip": 1.04940295, + "balance_loss_mlp": 1.0203588, + "epoch": 0.20117240342702541, + "flos": 24373504739520.0, + "grad_norm": 1.9360708987430408, + "language_loss": 0.68247956, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.70422053, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.14123535, + "step": 3346, + "time_per_iteration": 2.713167428970337 + }, + { + "auxiliary_loss_clip": 0.01146364, + "auxiliary_loss_mlp": 0.01042227, + "balance_loss_clip": 1.05097389, + "balance_loss_mlp": 1.02670598, + "epoch": 0.20123252667969338, + "flos": 28239516285120.0, + "grad_norm": 2.2364574618709816, + "language_loss": 0.74084055, + "learning_rate": 3.70022921406487e-06, + "loss": 0.76272643, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.15509033, + "step": 3347, + "time_per_iteration": 2.7650418281555176 + }, + { + "auxiliary_loss_clip": 0.01145515, + "auxiliary_loss_mlp": 0.0104348, + "balance_loss_clip": 1.0507499, + "balance_loss_mlp": 1.02911496, + "epoch": 0.20129264993236134, + "flos": 28335326951520.0, + "grad_norm": 1.8912434821468522, + "language_loss": 0.8686316, + "learning_rate": 3.70002409219765e-06, + "loss": 0.89052165, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 0.94677734, + "router_z_loss_mlp": 0.14367676, + "step": 3348, + "time_per_iteration": 2.762230157852173 + }, + { + "auxiliary_loss_clip": 0.01141553, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.04809606, + "balance_loss_mlp": 1.01924849, + "epoch": 0.2013527731850293, + "flos": 26687724138720.0, + "grad_norm": 1.7518020366551277, + "language_loss": 0.70502901, + "learning_rate": 3.699818905865346e-06, + "loss": 0.72680247, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.16534424, + "step": 3349, + "time_per_iteration": 2.7048068046569824 + }, + { + "auxiliary_loss_clip": 0.0114926, + "auxiliary_loss_mlp": 0.01042998, + "balance_loss_clip": 1.05324841, + "balance_loss_mlp": 1.02642798, + "epoch": 0.20141289643769728, + "flos": 22013223543360.0, + "grad_norm": 1.557255195640877, + "language_loss": 0.71146995, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.7333926, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.16564941, + "step": 3350, + "time_per_iteration": 2.676288604736328 + }, + { + "auxiliary_loss_clip": 0.01149082, + "auxiliary_loss_mlp": 0.01038273, + "balance_loss_clip": 1.05175221, + "balance_loss_mlp": 1.0211066, + "epoch": 0.20147301969036524, + "flos": 29225810275200.0, + "grad_norm": 2.2365791032673332, + "language_loss": 0.76886171, + "learning_rate": 3.69940833983661e-06, + "loss": 0.79073524, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.17163086, + "step": 3351, + "time_per_iteration": 2.6857426166534424 + }, + { + "auxiliary_loss_clip": 0.01153273, + "auxiliary_loss_mlp": 0.01038102, + "balance_loss_clip": 1.05400014, + "balance_loss_mlp": 1.02132893, + "epoch": 0.2015331429430332, + "flos": 31223917791360.0, + "grad_norm": 2.000380955890002, + "language_loss": 0.80410582, + "learning_rate": 3.699202960155748e-06, + "loss": 0.82601953, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 0.99267578, + "router_z_loss_mlp": 0.16772461, + "step": 3352, + "time_per_iteration": 2.758098602294922 + }, + { + "auxiliary_loss_clip": 0.01146968, + "auxiliary_loss_mlp": 0.01038988, + "balance_loss_clip": 1.05169797, + "balance_loss_mlp": 1.02300239, + "epoch": 0.2015932661957012, + "flos": 32610262767840.0, + "grad_norm": 1.8203759311330372, + "language_loss": 0.80428433, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.82614386, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.15991211, + "step": 3353, + "time_per_iteration": 2.736833095550537 + }, + { + "auxiliary_loss_clip": 0.01143428, + "auxiliary_loss_mlp": 0.01043253, + "balance_loss_clip": 1.04995143, + "balance_loss_mlp": 1.02789247, + "epoch": 0.20165338944836916, + "flos": 19075167455040.0, + "grad_norm": 2.528595880954126, + "language_loss": 0.90227211, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.92413884, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.15350342, + "step": 3354, + "time_per_iteration": 2.752826690673828 + }, + { + "auxiliary_loss_clip": 0.01056681, + "auxiliary_loss_mlp": 0.0100114, + "balance_loss_clip": 1.02397203, + "balance_loss_mlp": 0.99898863, + "epoch": 0.20171351270103713, + "flos": 70665070632480.0, + "grad_norm": 0.8269122320994604, + "language_loss": 0.55835718, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57893538, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 0.32666016, + "router_z_loss_mlp": 0.02153015, + "step": 3355, + "time_per_iteration": 3.2152516841888428 + }, + { + "auxiliary_loss_clip": 0.01144123, + "auxiliary_loss_mlp": 0.01040277, + "balance_loss_clip": 1.05217421, + "balance_loss_mlp": 1.0251019, + "epoch": 0.2017736359537051, + "flos": 25397676036000.0, + "grad_norm": 1.6347908821112203, + "language_loss": 0.84207082, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86391479, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.1517334, + "step": 3356, + "time_per_iteration": 2.7870850563049316 + }, + { + "auxiliary_loss_clip": 0.0115559, + "auxiliary_loss_mlp": 0.01041438, + "balance_loss_clip": 1.0542171, + "balance_loss_mlp": 1.02323532, + "epoch": 0.20183375920637306, + "flos": 20855485825920.0, + "grad_norm": 2.985834368046431, + "language_loss": 0.69583327, + "learning_rate": 3.698175095398085e-06, + "loss": 0.71780354, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.18212891, + "step": 3357, + "time_per_iteration": 2.735734224319458 + }, + { + "auxiliary_loss_clip": 0.01149154, + "auxiliary_loss_mlp": 0.01038023, + "balance_loss_clip": 1.05145907, + "balance_loss_mlp": 1.02176285, + "epoch": 0.20189388245904102, + "flos": 22770424067040.0, + "grad_norm": 1.8484261221572265, + "language_loss": 0.71957743, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.74144918, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.16259766, + "step": 3358, + "time_per_iteration": 2.7022056579589844 + }, + { + "auxiliary_loss_clip": 0.01140302, + "auxiliary_loss_mlp": 0.01046127, + "balance_loss_clip": 1.04797101, + "balance_loss_mlp": 1.03179789, + "epoch": 0.20195400571170902, + "flos": 20496350941920.0, + "grad_norm": 2.1145731385215822, + "language_loss": 0.83296365, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85482788, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.14318848, + "step": 3359, + "time_per_iteration": 2.6503608226776123 + }, + { + "auxiliary_loss_clip": 0.0105312, + "auxiliary_loss_mlp": 0.01004215, + "balance_loss_clip": 1.02045774, + "balance_loss_mlp": 1.0021348, + "epoch": 0.20201412896437698, + "flos": 81967981337280.0, + "grad_norm": 0.7689643425342328, + "language_loss": 0.58943427, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61000764, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 0.32666016, + "router_z_loss_mlp": 0.02081299, + "step": 3360, + "time_per_iteration": 3.2769484519958496 + }, + { + "auxiliary_loss_clip": 0.01149772, + "auxiliary_loss_mlp": 0.0104641, + "balance_loss_clip": 1.05231476, + "balance_loss_mlp": 1.02962518, + "epoch": 0.20207425221704495, + "flos": 26020823931360.0, + "grad_norm": 3.130722289069699, + "language_loss": 0.62550819, + "learning_rate": 3.697351644435763e-06, + "loss": 0.64747, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.16772461, + "step": 3361, + "time_per_iteration": 2.67720103263855 + }, + { + "auxiliary_loss_clip": 0.01147214, + "auxiliary_loss_mlp": 0.01050089, + "balance_loss_clip": 1.05316126, + "balance_loss_mlp": 1.03479433, + "epoch": 0.2021343754697129, + "flos": 27487947663360.0, + "grad_norm": 1.890411338858909, + "language_loss": 0.7546773, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.77665031, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.15307617, + "step": 3362, + "time_per_iteration": 2.7149593830108643 + }, + { + "auxiliary_loss_clip": 0.01145807, + "auxiliary_loss_mlp": 0.01041188, + "balance_loss_clip": 1.05129433, + "balance_loss_mlp": 1.0257268, + "epoch": 0.20219449872238088, + "flos": 23260289162400.0, + "grad_norm": 1.6372225189724685, + "language_loss": 0.76476026, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.78663015, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.15454102, + "step": 3363, + "time_per_iteration": 2.823369264602661 + }, + { + "auxiliary_loss_clip": 0.01143303, + "auxiliary_loss_mlp": 0.01043828, + "balance_loss_clip": 1.04843509, + "balance_loss_mlp": 1.02896857, + "epoch": 0.20225462197504884, + "flos": 30160126945440.0, + "grad_norm": 1.6968426389520406, + "language_loss": 0.7495653, + "learning_rate": 3.696733380367391e-06, + "loss": 0.77143663, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.14868164, + "step": 3364, + "time_per_iteration": 2.7481510639190674 + }, + { + "auxiliary_loss_clip": 0.01149269, + "auxiliary_loss_mlp": 0.01045213, + "balance_loss_clip": 1.05126154, + "balance_loss_mlp": 1.02811825, + "epoch": 0.2023147452277168, + "flos": 26866825632000.0, + "grad_norm": 4.704843042642046, + "language_loss": 0.7131421, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.73508692, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.17102051, + "step": 3365, + "time_per_iteration": 2.685703992843628 + }, + { + "auxiliary_loss_clip": 0.01148912, + "auxiliary_loss_mlp": 0.01045437, + "balance_loss_clip": 1.05237556, + "balance_loss_mlp": 1.02960563, + "epoch": 0.2023748684803848, + "flos": 21653035210080.0, + "grad_norm": 2.2384895608250637, + "language_loss": 0.8578403, + "learning_rate": 3.696320882607286e-06, + "loss": 0.87978375, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.15826416, + "step": 3366, + "time_per_iteration": 2.6684608459472656 + }, + { + "auxiliary_loss_clip": 0.01145588, + "auxiliary_loss_mlp": 0.01042964, + "balance_loss_clip": 1.05115747, + "balance_loss_mlp": 1.02788389, + "epoch": 0.20243499173305277, + "flos": 37996590883680.0, + "grad_norm": 1.7381640345445797, + "language_loss": 0.69787645, + "learning_rate": 3.696114537236335e-06, + "loss": 0.71976197, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.15087891, + "step": 3367, + "time_per_iteration": 2.757392168045044 + }, + { + "auxiliary_loss_clip": 0.01150402, + "auxiliary_loss_mlp": 0.01040557, + "balance_loss_clip": 1.049227, + "balance_loss_mlp": 1.02283061, + "epoch": 0.20249511498572073, + "flos": 41291674957440.0, + "grad_norm": 1.9483246870218525, + "language_loss": 0.68471611, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.7066257, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 1.01123047, + "router_z_loss_mlp": 0.17724609, + "step": 3368, + "time_per_iteration": 2.786214590072632 + }, + { + "auxiliary_loss_clip": 0.01147795, + "auxiliary_loss_mlp": 0.01049948, + "balance_loss_clip": 1.05332863, + "balance_loss_mlp": 1.03365254, + "epoch": 0.2025552382383887, + "flos": 25887338544960.0, + "grad_norm": 1.7255469040700258, + "language_loss": 0.77004337, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.7920208, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.16308594, + "step": 3369, + "time_per_iteration": 2.710075855255127 + }, + { + "auxiliary_loss_clip": 0.01150896, + "auxiliary_loss_mlp": 0.01050711, + "balance_loss_clip": 1.05092669, + "balance_loss_mlp": 1.03490424, + "epoch": 0.20261536149105666, + "flos": 17872340355360.0, + "grad_norm": 2.911233020017161, + "language_loss": 0.64930362, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67131966, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 0.99902344, + "router_z_loss_mlp": 0.15808105, + "step": 3370, + "time_per_iteration": 5.682526588439941 + }, + { + "auxiliary_loss_clip": 0.01052778, + "auxiliary_loss_mlp": 0.0100181, + "balance_loss_clip": 1.02031922, + "balance_loss_mlp": 0.99988461, + "epoch": 0.20267548474372463, + "flos": 81489825735840.0, + "grad_norm": 0.6894219735981796, + "language_loss": 0.58091396, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60145986, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 0.01922607, + "step": 3371, + "time_per_iteration": 3.3958334922790527 + }, + { + "auxiliary_loss_clip": 0.01145495, + "auxiliary_loss_mlp": 0.01037366, + "balance_loss_clip": 1.04864359, + "balance_loss_mlp": 1.02179718, + "epoch": 0.2027356079963926, + "flos": 30115888426080.0, + "grad_norm": 1.9244806268367054, + "language_loss": 0.91921782, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.94104642, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.15563965, + "step": 3372, + "time_per_iteration": 2.8648457527160645 + }, + { + "auxiliary_loss_clip": 0.01150796, + "auxiliary_loss_mlp": 0.0104409, + "balance_loss_clip": 1.05292761, + "balance_loss_mlp": 1.02740026, + "epoch": 0.20279573124906058, + "flos": 32205228156000.0, + "grad_norm": 1.615474507356893, + "language_loss": 0.78790617, + "learning_rate": 3.694875114631167e-06, + "loss": 0.80985492, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.16687012, + "step": 3373, + "time_per_iteration": 2.812879800796509 + }, + { + "auxiliary_loss_clip": 0.01142658, + "auxiliary_loss_mlp": 0.01039385, + "balance_loss_clip": 1.04996514, + "balance_loss_mlp": 1.02349424, + "epoch": 0.20285585450172855, + "flos": 41244032986560.0, + "grad_norm": 1.9577554200409202, + "language_loss": 0.71298707, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.73480749, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.15881348, + "step": 3374, + "time_per_iteration": 5.805597543716431 + }, + { + "auxiliary_loss_clip": 0.01051793, + "auxiliary_loss_mlp": 0.01001556, + "balance_loss_clip": 1.01925898, + "balance_loss_mlp": 0.99954617, + "epoch": 0.20291597775439651, + "flos": 86835804991200.0, + "grad_norm": 0.9815148535606819, + "language_loss": 0.625135, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64566851, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 0.32495117, + "router_z_loss_mlp": 0.02009583, + "step": 3375, + "time_per_iteration": 3.2934956550598145 + }, + { + "auxiliary_loss_clip": 0.01143714, + "auxiliary_loss_mlp": 0.01044672, + "balance_loss_clip": 1.04826403, + "balance_loss_mlp": 1.02967584, + "epoch": 0.20297610100706448, + "flos": 23787423804960.0, + "grad_norm": 1.6800053808782103, + "language_loss": 0.82495224, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84683609, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 0.95361328, + "router_z_loss_mlp": 0.14990234, + "step": 3376, + "time_per_iteration": 2.6308953762054443 + }, + { + "auxiliary_loss_clip": 0.01146675, + "auxiliary_loss_mlp": 0.01036753, + "balance_loss_clip": 1.04919982, + "balance_loss_mlp": 1.01986098, + "epoch": 0.20303622425973245, + "flos": 30559529655360.0, + "grad_norm": 2.4077410146651546, + "language_loss": 0.81698698, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.83882129, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.16882324, + "step": 3377, + "time_per_iteration": 2.7205305099487305 + }, + { + "auxiliary_loss_clip": 0.01144034, + "auxiliary_loss_mlp": 0.01047738, + "balance_loss_clip": 1.04987943, + "balance_loss_mlp": 1.03048813, + "epoch": 0.2030963475124004, + "flos": 26820237110400.0, + "grad_norm": 4.083573489457402, + "language_loss": 0.77010798, + "learning_rate": 3.69384049496805e-06, + "loss": 0.79202574, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.17248535, + "step": 3378, + "time_per_iteration": 2.7130613327026367 + }, + { + "auxiliary_loss_clip": 0.01149724, + "auxiliary_loss_mlp": 0.01042367, + "balance_loss_clip": 1.05275893, + "balance_loss_mlp": 1.02527261, + "epoch": 0.2031564707650684, + "flos": 23794311742560.0, + "grad_norm": 1.801035335901052, + "language_loss": 0.8024199, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.82434082, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.17114258, + "step": 3379, + "time_per_iteration": 2.644249200820923 + }, + { + "auxiliary_loss_clip": 0.01141088, + "auxiliary_loss_mlp": 0.01037906, + "balance_loss_clip": 1.04940259, + "balance_loss_mlp": 1.02323174, + "epoch": 0.20321659401773637, + "flos": 27756214989120.0, + "grad_norm": 1.7709107632076113, + "language_loss": 0.86737859, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.88916856, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.14685059, + "step": 3380, + "time_per_iteration": 2.661606788635254 + }, + { + "auxiliary_loss_clip": 0.01148389, + "auxiliary_loss_mlp": 0.0104514, + "balance_loss_clip": 1.05346, + "balance_loss_mlp": 1.02893901, + "epoch": 0.20327671727040433, + "flos": 27400767177600.0, + "grad_norm": 2.3693056405241975, + "language_loss": 0.74684536, + "learning_rate": 3.693218952340186e-06, + "loss": 0.76878059, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.1619873, + "step": 3381, + "time_per_iteration": 2.6959116458892822 + }, + { + "auxiliary_loss_clip": 0.01147346, + "auxiliary_loss_mlp": 0.01040785, + "balance_loss_clip": 1.04940224, + "balance_loss_mlp": 1.0247395, + "epoch": 0.2033368405230723, + "flos": 23836726984320.0, + "grad_norm": 1.9214758567724457, + "language_loss": 0.79633689, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81821817, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.16040039, + "step": 3382, + "time_per_iteration": 2.6514246463775635 + }, + { + "auxiliary_loss_clip": 0.01149802, + "auxiliary_loss_mlp": 0.01038066, + "balance_loss_clip": 1.04959142, + "balance_loss_mlp": 1.02090013, + "epoch": 0.20339696377574026, + "flos": 16849465611840.0, + "grad_norm": 1.7557074352159654, + "language_loss": 0.80219328, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.824072, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 1.00195312, + "router_z_loss_mlp": 0.17175293, + "step": 3383, + "time_per_iteration": 2.6733086109161377 + }, + { + "auxiliary_loss_clip": 0.0114496, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.04955411, + "balance_loss_mlp": 1.01650453, + "epoch": 0.20345708702840823, + "flos": 24818158900800.0, + "grad_norm": 2.263441833409191, + "language_loss": 0.74283791, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76462197, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.16943359, + "step": 3384, + "time_per_iteration": 2.6501080989837646 + }, + { + "auxiliary_loss_clip": 0.01154537, + "auxiliary_loss_mlp": 0.01042769, + "balance_loss_clip": 1.05300379, + "balance_loss_mlp": 1.0245893, + "epoch": 0.2035172102810762, + "flos": 24812000274240.0, + "grad_norm": 2.6528264776865935, + "language_loss": 0.76383322, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.7858063, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 1.01464844, + "router_z_loss_mlp": 0.18164062, + "step": 3385, + "time_per_iteration": 2.6564972400665283 + }, + { + "auxiliary_loss_clip": 0.01145825, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_clip": 1.04934096, + "balance_loss_mlp": 1.03369617, + "epoch": 0.2035773335337442, + "flos": 28468731303360.0, + "grad_norm": 1.8752532662962296, + "language_loss": 0.68617821, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70814323, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.1697998, + "step": 3386, + "time_per_iteration": 2.6820735931396484 + }, + { + "auxiliary_loss_clip": 0.01147864, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_clip": 1.05002117, + "balance_loss_mlp": 1.03295755, + "epoch": 0.20363745678641215, + "flos": 34569196424640.0, + "grad_norm": 1.3437656501266153, + "language_loss": 0.81049848, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83247977, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.1730957, + "step": 3387, + "time_per_iteration": 2.6750614643096924 + }, + { + "auxiliary_loss_clip": 0.01142784, + "auxiliary_loss_mlp": 0.01040631, + "balance_loss_clip": 1.04994869, + "balance_loss_mlp": 1.02490711, + "epoch": 0.20369758003908012, + "flos": 23080863530880.0, + "grad_norm": 2.9763990479657627, + "language_loss": 0.79600221, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.8178364, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.1572876, + "step": 3388, + "time_per_iteration": 2.6595726013183594 + }, + { + "auxiliary_loss_clip": 0.01148705, + "auxiliary_loss_mlp": 0.0103553, + "balance_loss_clip": 1.0510416, + "balance_loss_mlp": 1.01868546, + "epoch": 0.20375770329174808, + "flos": 23435865652320.0, + "grad_norm": 2.001833124443637, + "language_loss": 0.71721172, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.73905402, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.16833496, + "step": 3389, + "time_per_iteration": 2.6377480030059814 + }, + { + "auxiliary_loss_clip": 0.01145923, + "auxiliary_loss_mlp": 0.01043158, + "balance_loss_clip": 1.05133271, + "balance_loss_mlp": 1.02698147, + "epoch": 0.20381782654441605, + "flos": 23660826356160.0, + "grad_norm": 2.2999122719222256, + "language_loss": 0.87521678, + "learning_rate": 3.691350858126404e-06, + "loss": 0.8971076, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.16174316, + "step": 3390, + "time_per_iteration": 2.6741068363189697 + }, + { + "auxiliary_loss_clip": 0.01144287, + "auxiliary_loss_mlp": 0.01041187, + "balance_loss_clip": 1.05028152, + "balance_loss_mlp": 1.02512932, + "epoch": 0.203877949797084, + "flos": 30295070953920.0, + "grad_norm": 2.520858102871637, + "language_loss": 0.71164578, + "learning_rate": 3.691142971316662e-06, + "loss": 0.73350048, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.1607666, + "step": 3391, + "time_per_iteration": 2.69218111038208 + }, + { + "auxiliary_loss_clip": 0.01145607, + "auxiliary_loss_mlp": 0.01043549, + "balance_loss_clip": 1.05050445, + "balance_loss_mlp": 1.02730107, + "epoch": 0.20393807304975198, + "flos": 21968012609280.0, + "grad_norm": 2.3638599359368144, + "language_loss": 0.86436254, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88625407, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.16235352, + "step": 3392, + "time_per_iteration": 2.6460559368133545 + }, + { + "auxiliary_loss_clip": 0.01148385, + "auxiliary_loss_mlp": 0.01046083, + "balance_loss_clip": 1.04982138, + "balance_loss_mlp": 1.03016901, + "epoch": 0.20399819630241997, + "flos": 29537667843840.0, + "grad_norm": 1.5425003288331682, + "language_loss": 0.80794001, + "learning_rate": 3.69072700532013e-06, + "loss": 0.82988471, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.15905762, + "step": 3393, + "time_per_iteration": 2.6581685543060303 + }, + { + "auxiliary_loss_clip": 0.01142445, + "auxiliary_loss_mlp": 0.01033055, + "balance_loss_clip": 1.04818654, + "balance_loss_mlp": 1.01800489, + "epoch": 0.20405831955508794, + "flos": 25352303032800.0, + "grad_norm": 2.0264916394086328, + "language_loss": 0.86098182, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.8827368, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.15045166, + "step": 3394, + "time_per_iteration": 2.6461987495422363 + }, + { + "auxiliary_loss_clip": 0.01144356, + "auxiliary_loss_mlp": 0.01040136, + "balance_loss_clip": 1.05086517, + "balance_loss_mlp": 1.02557504, + "epoch": 0.2041184428077559, + "flos": 18896754755520.0, + "grad_norm": 5.262021795328707, + "language_loss": 0.83704686, + "learning_rate": 3.69031078287345e-06, + "loss": 0.85889173, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.14556885, + "step": 3395, + "time_per_iteration": 2.604001522064209 + }, + { + "auxiliary_loss_clip": 0.01150095, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.05056465, + "balance_loss_mlp": 1.01673579, + "epoch": 0.20417856606042387, + "flos": 19021407374880.0, + "grad_norm": 1.874179490367534, + "language_loss": 0.83714205, + "learning_rate": 3.690102575501033e-06, + "loss": 0.85897619, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.16564941, + "step": 3396, + "time_per_iteration": 2.724222183227539 + }, + { + "auxiliary_loss_clip": 0.0114304, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.0487473, + "balance_loss_mlp": 1.01948929, + "epoch": 0.20423868931309183, + "flos": 29626023330720.0, + "grad_norm": 2.081006119752782, + "language_loss": 0.77121413, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79300714, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.16760254, + "step": 3397, + "time_per_iteration": 2.6761395931243896 + }, + { + "auxiliary_loss_clip": 0.01144853, + "auxiliary_loss_mlp": 0.01037041, + "balance_loss_clip": 1.05049419, + "balance_loss_mlp": 1.02284276, + "epoch": 0.2042988125657598, + "flos": 22714638122880.0, + "grad_norm": 3.0475357848223767, + "language_loss": 0.87258303, + "learning_rate": 3.689685968497518e-06, + "loss": 0.89440197, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.14202881, + "step": 3398, + "time_per_iteration": 2.5862619876861572 + }, + { + "auxiliary_loss_clip": 0.01149421, + "auxiliary_loss_mlp": 0.01042603, + "balance_loss_clip": 1.05290329, + "balance_loss_mlp": 1.02702236, + "epoch": 0.2043589358184278, + "flos": 21782549903040.0, + "grad_norm": 2.139257271262234, + "language_loss": 0.77921706, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.80113733, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.15551758, + "step": 3399, + "time_per_iteration": 2.674078941345215 + }, + { + "auxiliary_loss_clip": 0.01146112, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.0496161, + "balance_loss_mlp": 1.01892567, + "epoch": 0.20441905907109575, + "flos": 26155241215200.0, + "grad_norm": 2.080855736544472, + "language_loss": 0.76303518, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.78484172, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.15625, + "step": 3400, + "time_per_iteration": 2.655850648880005 + }, + { + "auxiliary_loss_clip": 0.01141976, + "auxiliary_loss_mlp": 0.0103952, + "balance_loss_clip": 1.04883909, + "balance_loss_mlp": 1.02477407, + "epoch": 0.20447918232376372, + "flos": 33809686416000.0, + "grad_norm": 1.699641336009866, + "language_loss": 0.79148972, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81330466, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 0.93164062, + "router_z_loss_mlp": 0.14746094, + "step": 3401, + "time_per_iteration": 2.722790479660034 + }, + { + "auxiliary_loss_clip": 0.01144507, + "auxiliary_loss_mlp": 0.01038119, + "balance_loss_clip": 1.04707754, + "balance_loss_mlp": 1.02260947, + "epoch": 0.20453930557643168, + "flos": 37245954159360.0, + "grad_norm": 1.801356474066611, + "language_loss": 0.69972849, + "learning_rate": 3.688851985676991e-06, + "loss": 0.72155476, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.1550293, + "step": 3402, + "time_per_iteration": 2.80094051361084 + }, + { + "auxiliary_loss_clip": 0.0114827, + "auxiliary_loss_mlp": 0.01039927, + "balance_loss_clip": 1.05088139, + "balance_loss_mlp": 1.02332127, + "epoch": 0.20459942882909965, + "flos": 23168935396800.0, + "grad_norm": 1.9619351497237287, + "language_loss": 0.80996346, + "learning_rate": 3.688643329848496e-06, + "loss": 0.83184552, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.16601562, + "step": 3403, + "time_per_iteration": 2.747100830078125 + }, + { + "auxiliary_loss_clip": 0.01148175, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.0509913, + "balance_loss_mlp": 1.02096367, + "epoch": 0.20465955208176762, + "flos": 24817753728000.0, + "grad_norm": 2.194617426865429, + "language_loss": 0.83024693, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.85209298, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.15435791, + "step": 3404, + "time_per_iteration": 2.6559407711029053 + }, + { + "auxiliary_loss_clip": 0.01143843, + "auxiliary_loss_mlp": 0.01043731, + "balance_loss_clip": 1.04675877, + "balance_loss_mlp": 1.02754283, + "epoch": 0.20471967533443558, + "flos": 25930928787840.0, + "grad_norm": 2.116306715673809, + "language_loss": 0.85913789, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.88101363, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 0.97070312, + "router_z_loss_mlp": 0.1619873, + "step": 3405, + "time_per_iteration": 2.6849098205566406 + }, + { + "auxiliary_loss_clip": 0.01142263, + "auxiliary_loss_mlp": 0.01037558, + "balance_loss_clip": 1.04836857, + "balance_loss_mlp": 1.02188158, + "epoch": 0.20477979858710357, + "flos": 17694373345920.0, + "grad_norm": 2.3274653293914778, + "language_loss": 0.84626305, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.8680613, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.15673828, + "step": 3406, + "time_per_iteration": 2.6093478202819824 + }, + { + "auxiliary_loss_clip": 0.01144315, + "auxiliary_loss_mlp": 0.01033433, + "balance_loss_clip": 1.05012918, + "balance_loss_mlp": 1.01924086, + "epoch": 0.20483992183977154, + "flos": 13909991418720.0, + "grad_norm": 2.3536685102622332, + "language_loss": 0.67554969, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.69732714, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.14202881, + "step": 3407, + "time_per_iteration": 2.657203197479248 + }, + { + "auxiliary_loss_clip": 0.01143503, + "auxiliary_loss_mlp": 0.01039535, + "balance_loss_clip": 1.0495162, + "balance_loss_mlp": 1.02344143, + "epoch": 0.2049000450924395, + "flos": 23257452952800.0, + "grad_norm": 2.454854957879006, + "language_loss": 0.84692353, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86875391, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.16101074, + "step": 3408, + "time_per_iteration": 2.688392162322998 + }, + { + "auxiliary_loss_clip": 0.01149049, + "auxiliary_loss_mlp": 0.01042114, + "balance_loss_clip": 1.05197167, + "balance_loss_mlp": 1.02693319, + "epoch": 0.20496016834510747, + "flos": 17783458143840.0, + "grad_norm": 2.6143753440557127, + "language_loss": 0.64346957, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.66538119, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.15185547, + "step": 3409, + "time_per_iteration": 2.6939589977264404 + }, + { + "auxiliary_loss_clip": 0.01142056, + "auxiliary_loss_mlp": 0.01042281, + "balance_loss_clip": 1.04708195, + "balance_loss_mlp": 1.02661693, + "epoch": 0.20502029159777543, + "flos": 26999136017280.0, + "grad_norm": 3.435864021512424, + "language_loss": 0.80331433, + "learning_rate": 3.687180946553745e-06, + "loss": 0.8251577, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 0.94970703, + "router_z_loss_mlp": 0.15649414, + "step": 3410, + "time_per_iteration": 5.6777119636535645 + }, + { + "auxiliary_loss_clip": 0.01146386, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.05151677, + "balance_loss_mlp": 1.02561164, + "epoch": 0.2050804148504434, + "flos": 30954921154560.0, + "grad_norm": 3.5355395058485275, + "language_loss": 0.76364613, + "learning_rate": 3.686971778678803e-06, + "loss": 0.78551555, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.14935303, + "step": 3411, + "time_per_iteration": 2.724210023880005 + }, + { + "auxiliary_loss_clip": 0.01147969, + "auxiliary_loss_mlp": 0.01038983, + "balance_loss_clip": 1.05247974, + "balance_loss_mlp": 1.02451086, + "epoch": 0.2051405381031114, + "flos": 28821869629920.0, + "grad_norm": 3.2300308976113596, + "language_loss": 0.73403001, + "learning_rate": 3.686762546833722e-06, + "loss": 0.75589955, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.14477539, + "step": 3412, + "time_per_iteration": 2.6649858951568604 + }, + { + "auxiliary_loss_clip": 0.01149968, + "auxiliary_loss_mlp": 0.01046837, + "balance_loss_clip": 1.04992688, + "balance_loss_mlp": 1.02994466, + "epoch": 0.20520066135577936, + "flos": 23874523256160.0, + "grad_norm": 2.490944897327513, + "language_loss": 0.77477479, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.79674286, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.16894531, + "step": 3413, + "time_per_iteration": 4.20958685874939 + }, + { + "auxiliary_loss_clip": 0.01146084, + "auxiliary_loss_mlp": 0.01047017, + "balance_loss_clip": 1.05342102, + "balance_loss_mlp": 1.03125715, + "epoch": 0.20526078460844732, + "flos": 21568447830240.0, + "grad_norm": 2.889952972599903, + "language_loss": 0.85084796, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.87277901, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.1574707, + "step": 3414, + "time_per_iteration": 4.1505043506622314 + }, + { + "auxiliary_loss_clip": 0.01144897, + "auxiliary_loss_mlp": 0.01038249, + "balance_loss_clip": 1.04933107, + "balance_loss_mlp": 1.02315676, + "epoch": 0.2053209078611153, + "flos": 26235412211520.0, + "grad_norm": 1.9487953829802653, + "language_loss": 0.80706179, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.82889318, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.15081787, + "step": 3415, + "time_per_iteration": 2.6393606662750244 + }, + { + "auxiliary_loss_clip": 0.01145945, + "auxiliary_loss_mlp": 0.01036456, + "balance_loss_clip": 1.05192971, + "balance_loss_mlp": 1.02291417, + "epoch": 0.20538103111378325, + "flos": 31315312074240.0, + "grad_norm": 1.8149884863529633, + "language_loss": 0.73315257, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.75497651, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.13549805, + "step": 3416, + "time_per_iteration": 2.7402336597442627 + }, + { + "auxiliary_loss_clip": 0.01147131, + "auxiliary_loss_mlp": 0.01042799, + "balance_loss_clip": 1.05006051, + "balance_loss_mlp": 1.02686071, + "epoch": 0.20544115436645122, + "flos": 28246566291840.0, + "grad_norm": 2.1808466583088584, + "language_loss": 0.78637111, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.80827045, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 0.97021484, + "router_z_loss_mlp": 0.15930176, + "step": 3417, + "time_per_iteration": 2.653888463973999 + }, + { + "auxiliary_loss_clip": 0.0114685, + "auxiliary_loss_mlp": 0.01046206, + "balance_loss_clip": 1.05114686, + "balance_loss_mlp": 1.0300293, + "epoch": 0.20550127761911918, + "flos": 23660178079680.0, + "grad_norm": 5.564737314352657, + "language_loss": 0.87250853, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89443904, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.16174316, + "step": 3418, + "time_per_iteration": 2.736009120941162 + }, + { + "auxiliary_loss_clip": 0.01143713, + "auxiliary_loss_mlp": 0.01043182, + "balance_loss_clip": 1.04881454, + "balance_loss_mlp": 1.02785194, + "epoch": 0.20556140087178718, + "flos": 27934425102240.0, + "grad_norm": 2.2391545505804937, + "language_loss": 0.61756611, + "learning_rate": 3.685296133421035e-06, + "loss": 0.63943505, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.15332031, + "step": 3419, + "time_per_iteration": 2.769305467605591 + }, + { + "auxiliary_loss_clip": 0.01152541, + "auxiliary_loss_mlp": 0.01054794, + "balance_loss_clip": 1.05382848, + "balance_loss_mlp": 1.03793812, + "epoch": 0.20562152412445514, + "flos": 24147531103680.0, + "grad_norm": 2.127771624478613, + "language_loss": 0.86200392, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88407737, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.16845703, + "step": 3420, + "time_per_iteration": 2.6738078594207764 + }, + { + "auxiliary_loss_clip": 0.01147681, + "auxiliary_loss_mlp": 0.0104114, + "balance_loss_clip": 1.05241346, + "balance_loss_mlp": 1.02589321, + "epoch": 0.2056816473771231, + "flos": 38439786422880.0, + "grad_norm": 3.16815236221751, + "language_loss": 0.71565235, + "learning_rate": 3.684876582881668e-06, + "loss": 0.7375406, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.15240479, + "step": 3421, + "time_per_iteration": 2.791421413421631 + }, + { + "auxiliary_loss_clip": 0.01145005, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.05103242, + "balance_loss_mlp": 1.01867414, + "epoch": 0.20574177062979107, + "flos": 28380092195520.0, + "grad_norm": 2.6214000756510334, + "language_loss": 0.70713198, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.72892416, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.15551758, + "step": 3422, + "time_per_iteration": 2.6723623275756836 + }, + { + "auxiliary_loss_clip": 0.01058667, + "auxiliary_loss_mlp": 0.01015847, + "balance_loss_clip": 1.02603197, + "balance_loss_mlp": 1.0138886, + "epoch": 0.20580189388245904, + "flos": 85796966439360.0, + "grad_norm": 0.7467578917949627, + "language_loss": 0.55533111, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57607627, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.01956177, + "step": 3423, + "time_per_iteration": 3.3873138427734375 + }, + { + "auxiliary_loss_clip": 0.01148552, + "auxiliary_loss_mlp": 0.01037203, + "balance_loss_clip": 1.0513351, + "balance_loss_mlp": 1.02116919, + "epoch": 0.205862017135127, + "flos": 37507130961120.0, + "grad_norm": 2.030390846615748, + "language_loss": 0.71430385, + "learning_rate": 3.684246777912353e-06, + "loss": 0.73616135, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.16040039, + "step": 3424, + "time_per_iteration": 2.7539637088775635 + }, + { + "auxiliary_loss_clip": 0.01147459, + "auxiliary_loss_mlp": 0.01044033, + "balance_loss_clip": 1.05400944, + "balance_loss_mlp": 1.02856016, + "epoch": 0.20592214038779497, + "flos": 26020297206720.0, + "grad_norm": 1.5948450117158717, + "language_loss": 0.75010747, + "learning_rate": 3.684036715178351e-06, + "loss": 0.77202237, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.15490723, + "step": 3425, + "time_per_iteration": 2.7464499473571777 + }, + { + "auxiliary_loss_clip": 0.0114771, + "auxiliary_loss_mlp": 0.01051268, + "balance_loss_clip": 1.05429077, + "balance_loss_mlp": 1.03604484, + "epoch": 0.20598226364046296, + "flos": 27932763893760.0, + "grad_norm": 1.9188113460040919, + "language_loss": 0.87933069, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90132046, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.15222168, + "step": 3426, + "time_per_iteration": 2.725773572921753 + }, + { + "auxiliary_loss_clip": 0.01145417, + "auxiliary_loss_mlp": 0.01038308, + "balance_loss_clip": 1.05298936, + "balance_loss_mlp": 1.02279294, + "epoch": 0.20604238689313092, + "flos": 29136887546400.0, + "grad_norm": 1.7125661278989135, + "language_loss": 0.77212143, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.79395866, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.1552124, + "step": 3427, + "time_per_iteration": 2.7321712970733643 + }, + { + "auxiliary_loss_clip": 0.01150565, + "auxiliary_loss_mlp": 0.01044986, + "balance_loss_clip": 1.05297971, + "balance_loss_mlp": 1.02821326, + "epoch": 0.2061025101457989, + "flos": 27444397937760.0, + "grad_norm": 1.5257288666391393, + "language_loss": 0.73918873, + "learning_rate": 3.683406143855174e-06, + "loss": 0.76114416, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.16796875, + "step": 3428, + "time_per_iteration": 2.714193344116211 + }, + { + "auxiliary_loss_clip": 0.01149201, + "auxiliary_loss_mlp": 0.01042821, + "balance_loss_clip": 1.05152833, + "balance_loss_mlp": 1.02651346, + "epoch": 0.20616263339846685, + "flos": 27795510400320.0, + "grad_norm": 2.396192785218722, + "language_loss": 0.73101008, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.75293028, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.16320801, + "step": 3429, + "time_per_iteration": 2.679800510406494 + }, + { + "auxiliary_loss_clip": 0.01154216, + "auxiliary_loss_mlp": 0.01043444, + "balance_loss_clip": 1.05687284, + "balance_loss_mlp": 1.02720821, + "epoch": 0.20622275665113482, + "flos": 25480926345600.0, + "grad_norm": 3.1401665852451925, + "language_loss": 0.85005057, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87202722, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.16259766, + "step": 3430, + "time_per_iteration": 2.697042942047119 + }, + { + "auxiliary_loss_clip": 0.01151104, + "auxiliary_loss_mlp": 0.01044734, + "balance_loss_clip": 1.05405307, + "balance_loss_mlp": 1.02880204, + "epoch": 0.20628287990380278, + "flos": 23616182664000.0, + "grad_norm": 1.615322140973592, + "language_loss": 0.68693477, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.70889318, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.15948486, + "step": 3431, + "time_per_iteration": 2.6795949935913086 + }, + { + "auxiliary_loss_clip": 0.01063469, + "auxiliary_loss_mlp": 0.0100416, + "balance_loss_clip": 1.03030729, + "balance_loss_mlp": 1.00193369, + "epoch": 0.20634300315647078, + "flos": 87267777243840.0, + "grad_norm": 0.809232237544711, + "language_loss": 0.60246998, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.6231463, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 0.33178711, + "router_z_loss_mlp": 0.02229309, + "step": 3432, + "time_per_iteration": 3.4834444522857666 + }, + { + "auxiliary_loss_clip": 0.01149097, + "auxiliary_loss_mlp": 0.01039789, + "balance_loss_clip": 1.05453372, + "balance_loss_mlp": 1.02420878, + "epoch": 0.20640312640913874, + "flos": 26507852817120.0, + "grad_norm": 2.071912449901576, + "language_loss": 0.72187495, + "learning_rate": 3.682353915057679e-06, + "loss": 0.7437638, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.15576172, + "step": 3433, + "time_per_iteration": 2.6855721473693848 + }, + { + "auxiliary_loss_clip": 0.0115321, + "auxiliary_loss_mlp": 0.01045894, + "balance_loss_clip": 1.0538708, + "balance_loss_mlp": 1.02913904, + "epoch": 0.2064632496618067, + "flos": 25081118462880.0, + "grad_norm": 2.757134488264628, + "language_loss": 0.86681157, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.88880265, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 0.99267578, + "router_z_loss_mlp": 0.16760254, + "step": 3434, + "time_per_iteration": 2.7030279636383057 + }, + { + "auxiliary_loss_clip": 0.01153952, + "auxiliary_loss_mlp": 0.01037897, + "balance_loss_clip": 1.05378938, + "balance_loss_mlp": 1.02162528, + "epoch": 0.20652337291447467, + "flos": 36391322278080.0, + "grad_norm": 1.9626919181076181, + "language_loss": 0.69496715, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.71688563, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 1.00048828, + "router_z_loss_mlp": 0.1628418, + "step": 3435, + "time_per_iteration": 2.764216899871826 + }, + { + "auxiliary_loss_clip": 0.0114969, + "auxiliary_loss_mlp": 0.01042837, + "balance_loss_clip": 1.05514622, + "balance_loss_mlp": 1.02550364, + "epoch": 0.20658349616714264, + "flos": 31986345044160.0, + "grad_norm": 1.9335912678356655, + "language_loss": 0.8951875, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91711271, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.17333984, + "step": 3436, + "time_per_iteration": 2.6793360710144043 + }, + { + "auxiliary_loss_clip": 0.01147746, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.05197668, + "balance_loss_mlp": 1.01815128, + "epoch": 0.2066436194198106, + "flos": 31718807029440.0, + "grad_norm": 3.486110029361534, + "language_loss": 0.76658732, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.78840172, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.15533447, + "step": 3437, + "time_per_iteration": 2.7135746479034424 + }, + { + "auxiliary_loss_clip": 0.01147561, + "auxiliary_loss_mlp": 0.01039453, + "balance_loss_clip": 1.05029261, + "balance_loss_mlp": 1.0243969, + "epoch": 0.20670374267247857, + "flos": 26064859864320.0, + "grad_norm": 2.274763740651553, + "language_loss": 0.77384657, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.7957167, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 0.97167969, + "router_z_loss_mlp": 0.1505127, + "step": 3438, + "time_per_iteration": 2.6459453105926514 + }, + { + "auxiliary_loss_clip": 0.01058928, + "auxiliary_loss_mlp": 0.01004412, + "balance_loss_clip": 1.0257585, + "balance_loss_mlp": 1.00223672, + "epoch": 0.20676386592514656, + "flos": 81001176158880.0, + "grad_norm": 0.8396282935497562, + "language_loss": 0.67130482, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69193822, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 0.33154297, + "router_z_loss_mlp": 0.02178955, + "step": 3439, + "time_per_iteration": 3.2823197841644287 + }, + { + "auxiliary_loss_clip": 0.01148913, + "auxiliary_loss_mlp": 0.01039098, + "balance_loss_clip": 1.05047107, + "balance_loss_mlp": 1.02327895, + "epoch": 0.20682398917781453, + "flos": 21078218079360.0, + "grad_norm": 2.0783468646887564, + "language_loss": 0.83890051, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.8607806, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 0.98535156, + "router_z_loss_mlp": 0.15808105, + "step": 3440, + "time_per_iteration": 2.6444547176361084 + }, + { + "auxiliary_loss_clip": 0.0115117, + "auxiliary_loss_mlp": 0.01040177, + "balance_loss_clip": 1.05422688, + "balance_loss_mlp": 1.02538359, + "epoch": 0.2068841124304825, + "flos": 22057745683680.0, + "grad_norm": 2.436250267303725, + "language_loss": 0.84718072, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.86909419, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.14794922, + "step": 3441, + "time_per_iteration": 2.6566100120544434 + }, + { + "auxiliary_loss_clip": 0.01145861, + "auxiliary_loss_mlp": 0.01040341, + "balance_loss_clip": 1.05172431, + "balance_loss_mlp": 1.02424765, + "epoch": 0.20694423568315046, + "flos": 33369165017280.0, + "grad_norm": 1.8213676058777013, + "language_loss": 0.85741085, + "learning_rate": 3.680455884806959e-06, + "loss": 0.87927294, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.16088867, + "step": 3442, + "time_per_iteration": 2.733048677444458 + }, + { + "auxiliary_loss_clip": 0.01154808, + "auxiliary_loss_mlp": 0.01040224, + "balance_loss_clip": 1.05627751, + "balance_loss_mlp": 1.02379704, + "epoch": 0.20700435893581842, + "flos": 24684470928000.0, + "grad_norm": 1.984038357663269, + "language_loss": 0.73257852, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75452888, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.16430664, + "step": 3443, + "time_per_iteration": 2.6670992374420166 + }, + { + "auxiliary_loss_clip": 0.011465, + "auxiliary_loss_mlp": 0.01039722, + "balance_loss_clip": 1.05178916, + "balance_loss_mlp": 1.02522612, + "epoch": 0.2070644821884864, + "flos": 25174133436960.0, + "grad_norm": 1.784637566844918, + "language_loss": 0.85424262, + "learning_rate": 3.680033399147797e-06, + "loss": 0.87610483, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.14501953, + "step": 3444, + "time_per_iteration": 2.733355760574341 + }, + { + "auxiliary_loss_clip": 0.01055729, + "auxiliary_loss_mlp": 0.01005687, + "balance_loss_clip": 1.02293587, + "balance_loss_mlp": 1.00368261, + "epoch": 0.20712460544115438, + "flos": 80461967366880.0, + "grad_norm": 0.6931278336482315, + "language_loss": 0.57159114, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.59220529, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 0.32788086, + "router_z_loss_mlp": 0.02003479, + "step": 3445, + "time_per_iteration": 3.2394859790802 + }, + { + "auxiliary_loss_clip": 0.0114643, + "auxiliary_loss_mlp": 0.01044429, + "balance_loss_clip": 1.05198479, + "balance_loss_mlp": 1.0284307, + "epoch": 0.20718472869382235, + "flos": 23703363149760.0, + "grad_norm": 2.287961123924176, + "language_loss": 0.78478837, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80669701, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.15991211, + "step": 3446, + "time_per_iteration": 2.677935838699341 + }, + { + "auxiliary_loss_clip": 0.01156234, + "auxiliary_loss_mlp": 0.01045326, + "balance_loss_clip": 1.05422676, + "balance_loss_mlp": 1.02716994, + "epoch": 0.2072448519464903, + "flos": 29894047552800.0, + "grad_norm": 2.155009419795917, + "language_loss": 0.62064856, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64266419, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 1.01904297, + "router_z_loss_mlp": 0.18164062, + "step": 3447, + "time_per_iteration": 2.6874217987060547 + }, + { + "auxiliary_loss_clip": 0.01149674, + "auxiliary_loss_mlp": 0.01045004, + "balance_loss_clip": 1.05354548, + "balance_loss_mlp": 1.02904224, + "epoch": 0.20730497519915828, + "flos": 28157197872960.0, + "grad_norm": 2.227429826422927, + "language_loss": 0.86068571, + "learning_rate": 3.679187663409184e-06, + "loss": 0.88263249, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.15966797, + "step": 3448, + "time_per_iteration": 2.9960503578186035 + }, + { + "auxiliary_loss_clip": 0.01150803, + "auxiliary_loss_mlp": 0.0104297, + "balance_loss_clip": 1.05253124, + "balance_loss_mlp": 1.02512467, + "epoch": 0.20736509845182624, + "flos": 25708520672640.0, + "grad_norm": 8.42945900567028, + "language_loss": 0.74996591, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.77190369, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.17834473, + "step": 3449, + "time_per_iteration": 4.171693563461304 + }, + { + "auxiliary_loss_clip": 0.01155231, + "auxiliary_loss_mlp": 0.01051266, + "balance_loss_clip": 1.05390978, + "balance_loss_mlp": 1.03330147, + "epoch": 0.2074252217044942, + "flos": 21516511027680.0, + "grad_norm": 2.0826944386431725, + "language_loss": 0.76475215, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.78681719, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 1.01318359, + "router_z_loss_mlp": 0.1796875, + "step": 3450, + "time_per_iteration": 4.06573748588562 + }, + { + "auxiliary_loss_clip": 0.01152362, + "auxiliary_loss_mlp": 0.01042138, + "balance_loss_clip": 1.05186343, + "balance_loss_mlp": 1.02604532, + "epoch": 0.20748534495716217, + "flos": 28732582245600.0, + "grad_norm": 1.5625550309313725, + "language_loss": 0.82321596, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.84516096, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.16088867, + "step": 3451, + "time_per_iteration": 2.675807237625122 + }, + { + "auxiliary_loss_clip": 0.0105588, + "auxiliary_loss_mlp": 0.01002056, + "balance_loss_clip": 1.02345145, + "balance_loss_mlp": 0.99999034, + "epoch": 0.20754546820983016, + "flos": 63759195774720.0, + "grad_norm": 0.7838419477155029, + "language_loss": 0.56592488, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58650422, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 0.32446289, + "router_z_loss_mlp": 0.0206604, + "step": 3452, + "time_per_iteration": 3.2274889945983887 + }, + { + "auxiliary_loss_clip": 0.01153134, + "auxiliary_loss_mlp": 0.01042425, + "balance_loss_clip": 1.0536952, + "balance_loss_mlp": 1.02546132, + "epoch": 0.20760559146249813, + "flos": 24907203181440.0, + "grad_norm": 3.952562091755774, + "language_loss": 0.88350379, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.90545934, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 0.99365234, + "router_z_loss_mlp": 0.16967773, + "step": 3453, + "time_per_iteration": 5.613086223602295 + }, + { + "auxiliary_loss_clip": 0.01153702, + "auxiliary_loss_mlp": 0.01040486, + "balance_loss_clip": 1.0555979, + "balance_loss_mlp": 1.0233916, + "epoch": 0.2076657147151661, + "flos": 28291736708640.0, + "grad_norm": 1.623773734520095, + "language_loss": 0.80056792, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82250983, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.17114258, + "step": 3454, + "time_per_iteration": 2.6837124824523926 + }, + { + "auxiliary_loss_clip": 0.01150419, + "auxiliary_loss_mlp": 0.01046796, + "balance_loss_clip": 1.05284512, + "balance_loss_mlp": 1.03041637, + "epoch": 0.20772583796783406, + "flos": 22321515591360.0, + "grad_norm": 2.801550550007747, + "language_loss": 0.76383144, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.78580356, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.16381836, + "step": 3455, + "time_per_iteration": 2.6991171836853027 + }, + { + "auxiliary_loss_clip": 0.01151667, + "auxiliary_loss_mlp": 0.01041823, + "balance_loss_clip": 1.05502975, + "balance_loss_mlp": 1.02599812, + "epoch": 0.20778596122050202, + "flos": 21478755273120.0, + "grad_norm": 1.9511521320965222, + "language_loss": 0.8072753, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.82921016, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.1583252, + "step": 3456, + "time_per_iteration": 2.659961462020874 + }, + { + "auxiliary_loss_clip": 0.01155232, + "auxiliary_loss_mlp": 0.01039821, + "balance_loss_clip": 1.05641222, + "balance_loss_mlp": 1.02257109, + "epoch": 0.20784608447317, + "flos": 29047843265760.0, + "grad_norm": 1.4997526209189418, + "language_loss": 0.77811342, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.80006397, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.17260742, + "step": 3457, + "time_per_iteration": 2.687814712524414 + }, + { + "auxiliary_loss_clip": 0.0115565, + "auxiliary_loss_mlp": 0.01053668, + "balance_loss_clip": 1.05519509, + "balance_loss_mlp": 1.03552437, + "epoch": 0.20790620772583795, + "flos": 21523277413440.0, + "grad_norm": 1.8920906482719264, + "language_loss": 0.8379941, + "learning_rate": 3.677068867939333e-06, + "loss": 0.86008728, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 1.00439453, + "router_z_loss_mlp": 0.18164062, + "step": 3458, + "time_per_iteration": 2.6524593830108643 + }, + { + "auxiliary_loss_clip": 0.01148454, + "auxiliary_loss_mlp": 0.01036691, + "balance_loss_clip": 1.05387473, + "balance_loss_mlp": 1.02078843, + "epoch": 0.20796633097850595, + "flos": 33767716864320.0, + "grad_norm": 1.7660456555906723, + "language_loss": 0.76317823, + "learning_rate": 3.676856638489272e-06, + "loss": 0.78502965, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.15905762, + "step": 3459, + "time_per_iteration": 2.740236520767212 + }, + { + "auxiliary_loss_clip": 0.01145268, + "auxiliary_loss_mlp": 0.0103872, + "balance_loss_clip": 1.05097902, + "balance_loss_mlp": 1.02317452, + "epoch": 0.2080264542311739, + "flos": 23483345554080.0, + "grad_norm": 3.0733334778406576, + "language_loss": 0.77094471, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.79278463, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.15539551, + "step": 3460, + "time_per_iteration": 2.7214889526367188 + }, + { + "auxiliary_loss_clip": 0.01148139, + "auxiliary_loss_mlp": 0.01043545, + "balance_loss_clip": 1.05067897, + "balance_loss_mlp": 1.02723074, + "epoch": 0.20808657748384188, + "flos": 33587805025440.0, + "grad_norm": 3.2919897255334005, + "language_loss": 0.75442612, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.77634299, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.16314697, + "step": 3461, + "time_per_iteration": 2.726468324661255 + }, + { + "auxiliary_loss_clip": 0.01153101, + "auxiliary_loss_mlp": 0.01039435, + "balance_loss_clip": 1.05138731, + "balance_loss_mlp": 1.02255487, + "epoch": 0.20814670073650984, + "flos": 32832346744800.0, + "grad_norm": 2.0181609448854236, + "language_loss": 0.88178432, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.90370965, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.16882324, + "step": 3462, + "time_per_iteration": 2.750760555267334 + }, + { + "auxiliary_loss_clip": 0.01058056, + "auxiliary_loss_mlp": 0.01002369, + "balance_loss_clip": 1.02554488, + "balance_loss_mlp": 1.00038862, + "epoch": 0.2082068239891778, + "flos": 85629209784480.0, + "grad_norm": 0.759945316077046, + "language_loss": 0.59024894, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.6108532, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 0.32519531, + "router_z_loss_mlp": 0.01979065, + "step": 3463, + "time_per_iteration": 3.4227609634399414 + }, + { + "auxiliary_loss_clip": 0.01152213, + "auxiliary_loss_mlp": 0.01044736, + "balance_loss_clip": 1.0523876, + "balance_loss_mlp": 1.0269618, + "epoch": 0.20826694724184577, + "flos": 30027127766400.0, + "grad_norm": 2.8628398483421025, + "language_loss": 0.66024494, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68221444, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 0.99755859, + "router_z_loss_mlp": 0.17773438, + "step": 3464, + "time_per_iteration": 2.763446807861328 + }, + { + "auxiliary_loss_clip": 0.01156697, + "auxiliary_loss_mlp": 0.01049928, + "balance_loss_clip": 1.05602908, + "balance_loss_mlp": 1.03139138, + "epoch": 0.20832707049451377, + "flos": 15731590547520.0, + "grad_norm": 3.544047131209996, + "language_loss": 0.84223783, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.86430407, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 1.00585938, + "router_z_loss_mlp": 0.18530273, + "step": 3465, + "time_per_iteration": 2.6796064376831055 + }, + { + "auxiliary_loss_clip": 0.01150554, + "auxiliary_loss_mlp": 0.01040576, + "balance_loss_clip": 1.05265188, + "balance_loss_mlp": 1.02455473, + "epoch": 0.20838719374718173, + "flos": 27087288917760.0, + "grad_norm": 2.3013654621202226, + "language_loss": 0.82283247, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.84474373, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.16027832, + "step": 3466, + "time_per_iteration": 2.731032371520996 + }, + { + "auxiliary_loss_clip": 0.01148661, + "auxiliary_loss_mlp": 0.01037913, + "balance_loss_clip": 1.05361962, + "balance_loss_mlp": 1.02292848, + "epoch": 0.2084473169998497, + "flos": 18496177044480.0, + "grad_norm": 2.257191330336058, + "language_loss": 0.8275702, + "learning_rate": 3.675156514448716e-06, + "loss": 0.84943599, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.14978027, + "step": 3467, + "time_per_iteration": 2.7339601516723633 + }, + { + "auxiliary_loss_clip": 0.01146078, + "auxiliary_loss_mlp": 0.0103933, + "balance_loss_clip": 1.05411386, + "balance_loss_mlp": 1.02401185, + "epoch": 0.20850744025251766, + "flos": 21300585677280.0, + "grad_norm": 2.071369073584707, + "language_loss": 0.81592846, + "learning_rate": 3.674943713009518e-06, + "loss": 0.8377825, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 0.91992188, + "router_z_loss_mlp": 0.15319824, + "step": 3468, + "time_per_iteration": 2.682621479034424 + }, + { + "auxiliary_loss_clip": 0.01156236, + "auxiliary_loss_mlp": 0.01045669, + "balance_loss_clip": 1.05428493, + "balance_loss_mlp": 1.02684581, + "epoch": 0.20856756350518563, + "flos": 31358659213440.0, + "grad_norm": 1.8106150896402893, + "language_loss": 0.90048701, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92250603, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 1.01904297, + "router_z_loss_mlp": 0.18823242, + "step": 3469, + "time_per_iteration": 2.761781930923462 + }, + { + "auxiliary_loss_clip": 0.0115505, + "auxiliary_loss_mlp": 0.01046643, + "balance_loss_clip": 1.05729139, + "balance_loss_mlp": 1.02958393, + "epoch": 0.2086276867578536, + "flos": 46233592015680.0, + "grad_norm": 2.2434735925391793, + "language_loss": 0.76933122, + "learning_rate": 3.674517919597092e-06, + "loss": 0.7913481, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.17077637, + "step": 3470, + "time_per_iteration": 2.8377602100372314 + }, + { + "auxiliary_loss_clip": 0.01151666, + "auxiliary_loss_mlp": 0.01047413, + "balance_loss_clip": 1.05541229, + "balance_loss_mlp": 1.03078365, + "epoch": 0.20868781001052156, + "flos": 31185108587520.0, + "grad_norm": 1.8382720170976352, + "language_loss": 0.75986767, + "learning_rate": 3.674304927640011e-06, + "loss": 0.78185844, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.16625977, + "step": 3471, + "time_per_iteration": 2.723991870880127 + }, + { + "auxiliary_loss_clip": 0.01154263, + "auxiliary_loss_mlp": 0.01050562, + "balance_loss_clip": 1.0520668, + "balance_loss_mlp": 1.03185821, + "epoch": 0.20874793326318955, + "flos": 33591492097920.0, + "grad_norm": 2.1218561695379887, + "language_loss": 0.7567184, + "learning_rate": 3.67409187219312e-06, + "loss": 0.77876663, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 1.02148438, + "router_z_loss_mlp": 0.18701172, + "step": 3472, + "time_per_iteration": 2.730875253677368 + }, + { + "auxiliary_loss_clip": 0.01149144, + "auxiliary_loss_mlp": 0.01042169, + "balance_loss_clip": 1.05402231, + "balance_loss_mlp": 1.02600384, + "epoch": 0.20880805651585752, + "flos": 22633373160000.0, + "grad_norm": 2.2228088500700114, + "language_loss": 0.84252191, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.86443502, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.16174316, + "step": 3473, + "time_per_iteration": 2.676421642303467 + }, + { + "auxiliary_loss_clip": 0.01062436, + "auxiliary_loss_mlp": 0.01002628, + "balance_loss_clip": 1.02982628, + "balance_loss_mlp": 1.0005852, + "epoch": 0.20886817976852548, + "flos": 80468247545280.0, + "grad_norm": 0.8791712292027858, + "language_loss": 0.63697159, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65762222, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 0.02043152, + "step": 3474, + "time_per_iteration": 3.2383527755737305 + }, + { + "auxiliary_loss_clip": 0.01152689, + "auxiliary_loss_mlp": 0.0104122, + "balance_loss_clip": 1.05464673, + "balance_loss_mlp": 1.02447069, + "epoch": 0.20892830302119345, + "flos": 44586353858400.0, + "grad_norm": 14.385271682939083, + "language_loss": 0.69967681, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72161585, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.16748047, + "step": 3475, + "time_per_iteration": 2.842660427093506 + }, + { + "auxiliary_loss_clip": 0.01153805, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_clip": 1.05658793, + "balance_loss_mlp": 1.03133178, + "epoch": 0.2089884262738614, + "flos": 25571753386560.0, + "grad_norm": 1.600534159648691, + "language_loss": 0.70022625, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72224045, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.1628418, + "step": 3476, + "time_per_iteration": 2.7040388584136963 + }, + { + "auxiliary_loss_clip": 0.01152193, + "auxiliary_loss_mlp": 0.01047665, + "balance_loss_clip": 1.05578375, + "balance_loss_mlp": 1.03113067, + "epoch": 0.20904854952652938, + "flos": 27801263854080.0, + "grad_norm": 1.750037264741046, + "language_loss": 0.88745654, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.90945512, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.1652832, + "step": 3477, + "time_per_iteration": 2.7140183448791504 + }, + { + "auxiliary_loss_clip": 0.01147713, + "auxiliary_loss_mlp": 0.01044132, + "balance_loss_clip": 1.05105317, + "balance_loss_mlp": 1.02747846, + "epoch": 0.20910867277919734, + "flos": 33316458386400.0, + "grad_norm": 2.5498564459959665, + "language_loss": 0.68123758, + "learning_rate": 3.672812206678344e-06, + "loss": 0.70315599, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.16662598, + "step": 3478, + "time_per_iteration": 2.74043345451355 + }, + { + "auxiliary_loss_clip": 0.01152314, + "auxiliary_loss_mlp": 0.01047277, + "balance_loss_clip": 1.05493498, + "balance_loss_mlp": 1.02981257, + "epoch": 0.20916879603186533, + "flos": 17471884196160.0, + "grad_norm": 7.087705345882575, + "language_loss": 0.8509419, + "learning_rate": 3.672598707029127e-06, + "loss": 0.8729378, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.17468262, + "step": 3479, + "time_per_iteration": 2.658020257949829 + }, + { + "auxiliary_loss_clip": 0.01152957, + "auxiliary_loss_mlp": 0.01053334, + "balance_loss_clip": 1.05468178, + "balance_loss_mlp": 1.03595352, + "epoch": 0.2092289192845333, + "flos": 27177710785920.0, + "grad_norm": 2.728282756948289, + "language_loss": 0.745049, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.7671119, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.17370605, + "step": 3480, + "time_per_iteration": 2.76727032661438 + }, + { + "auxiliary_loss_clip": 0.01149348, + "auxiliary_loss_mlp": 0.01047236, + "balance_loss_clip": 1.05483651, + "balance_loss_mlp": 1.03174448, + "epoch": 0.20928904253720126, + "flos": 18095963988960.0, + "grad_norm": 2.0498559543692707, + "language_loss": 0.75550097, + "learning_rate": 3.67217151746346e-06, + "loss": 0.77746683, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.15472412, + "step": 3481, + "time_per_iteration": 2.651466131210327 + }, + { + "auxiliary_loss_clip": 0.01152133, + "auxiliary_loss_mlp": 0.01038346, + "balance_loss_clip": 1.0547756, + "balance_loss_mlp": 1.02203846, + "epoch": 0.20934916578986923, + "flos": 28380983575680.0, + "grad_norm": 1.997075095657245, + "language_loss": 0.85492802, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87683284, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.16296387, + "step": 3482, + "time_per_iteration": 2.709290027618408 + }, + { + "auxiliary_loss_clip": 0.01153231, + "auxiliary_loss_mlp": 0.01039328, + "balance_loss_clip": 1.05688286, + "balance_loss_mlp": 1.02276969, + "epoch": 0.2094092890425372, + "flos": 39064919664960.0, + "grad_norm": 2.240665816176839, + "language_loss": 0.71014726, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.73207289, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.16564941, + "step": 3483, + "time_per_iteration": 2.7551608085632324 + }, + { + "auxiliary_loss_clip": 0.01153022, + "auxiliary_loss_mlp": 0.01052027, + "balance_loss_clip": 1.05421817, + "balance_loss_mlp": 1.03509903, + "epoch": 0.20946941229520516, + "flos": 24417013947840.0, + "grad_norm": 2.490031946676763, + "language_loss": 0.75187224, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.7739228, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.16918945, + "step": 3484, + "time_per_iteration": 2.741692543029785 + }, + { + "auxiliary_loss_clip": 0.01149144, + "auxiliary_loss_mlp": 0.01039555, + "balance_loss_clip": 1.05421364, + "balance_loss_mlp": 1.02273464, + "epoch": 0.20952953554787315, + "flos": 37509643032480.0, + "grad_norm": 1.9312121620484117, + "language_loss": 0.70747435, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.72936136, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.16821289, + "step": 3485, + "time_per_iteration": 2.7989842891693115 + }, + { + "auxiliary_loss_clip": 0.011567, + "auxiliary_loss_mlp": 0.01043287, + "balance_loss_clip": 1.05741191, + "balance_loss_mlp": 1.0263356, + "epoch": 0.20958965880054112, + "flos": 33006302543520.0, + "grad_norm": 2.0739192977686836, + "language_loss": 0.82792914, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.84992909, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 0.99316406, + "router_z_loss_mlp": 0.16955566, + "step": 3486, + "time_per_iteration": 2.7485527992248535 + }, + { + "auxiliary_loss_clip": 0.01149052, + "auxiliary_loss_mlp": 0.01043165, + "balance_loss_clip": 1.05244792, + "balance_loss_mlp": 1.02752507, + "epoch": 0.20964978205320908, + "flos": 41736936877920.0, + "grad_norm": 1.7401393382901051, + "language_loss": 0.87254655, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89446867, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.15637207, + "step": 3487, + "time_per_iteration": 2.855224370956421 + }, + { + "auxiliary_loss_clip": 0.01149766, + "auxiliary_loss_mlp": 0.01040523, + "balance_loss_clip": 1.05293918, + "balance_loss_mlp": 1.02342856, + "epoch": 0.20970990530587705, + "flos": 28647346589280.0, + "grad_norm": 2.423110663768116, + "language_loss": 0.72327864, + "learning_rate": 3.670674357028504e-06, + "loss": 0.74518156, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.17089844, + "step": 3488, + "time_per_iteration": 2.6909728050231934 + }, + { + "auxiliary_loss_clip": 0.01151608, + "auxiliary_loss_mlp": 0.01037658, + "balance_loss_clip": 1.05521691, + "balance_loss_mlp": 1.02164853, + "epoch": 0.209770028558545, + "flos": 22636371438720.0, + "grad_norm": 2.318399152573459, + "language_loss": 0.81481111, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.83670378, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.16015625, + "step": 3489, + "time_per_iteration": 5.5998334884643555 + }, + { + "auxiliary_loss_clip": 0.01151774, + "auxiliary_loss_mlp": 0.01039316, + "balance_loss_clip": 1.05493855, + "balance_loss_mlp": 1.02341366, + "epoch": 0.20983015181121298, + "flos": 26377122605760.0, + "grad_norm": 1.9969931936244316, + "language_loss": 0.73660183, + "learning_rate": 3.670246026613266e-06, + "loss": 0.75851274, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.15905762, + "step": 3490, + "time_per_iteration": 2.779714345932007 + }, + { + "auxiliary_loss_clip": 0.01147728, + "auxiliary_loss_mlp": 0.01047411, + "balance_loss_clip": 1.05646241, + "balance_loss_mlp": 1.03210461, + "epoch": 0.20989027506388094, + "flos": 20273294550240.0, + "grad_norm": 1.869940631848733, + "language_loss": 0.70035815, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72230953, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.15307617, + "step": 3491, + "time_per_iteration": 2.7541868686676025 + }, + { + "auxiliary_loss_clip": 0.01149717, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.05219018, + "balance_loss_mlp": 1.02404714, + "epoch": 0.20995039831654894, + "flos": 28329857118720.0, + "grad_norm": 3.206180519465885, + "language_loss": 0.80016553, + "learning_rate": 3.669817442854444e-06, + "loss": 0.82206839, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 0.97607422, + "router_z_loss_mlp": 0.16522217, + "step": 3492, + "time_per_iteration": 4.276106119155884 + }, + { + "auxiliary_loss_clip": 0.01151991, + "auxiliary_loss_mlp": 0.01036328, + "balance_loss_clip": 1.05618334, + "balance_loss_mlp": 1.02016366, + "epoch": 0.2100105215692169, + "flos": 22143670133760.0, + "grad_norm": 2.8270363430655507, + "language_loss": 0.86845362, + "learning_rate": 3.669603055991502e-06, + "loss": 0.89033687, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.16162109, + "step": 3493, + "time_per_iteration": 4.133756160736084 + }, + { + "auxiliary_loss_clip": 0.01144522, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.05284786, + "balance_loss_mlp": 1.02008486, + "epoch": 0.21007064482188487, + "flos": 19475988269760.0, + "grad_norm": 2.029312464201746, + "language_loss": 0.68772197, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.709512, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.14404297, + "step": 3494, + "time_per_iteration": 2.692028045654297 + }, + { + "auxiliary_loss_clip": 0.01151997, + "auxiliary_loss_mlp": 0.01037352, + "balance_loss_clip": 1.05434322, + "balance_loss_mlp": 1.02134204, + "epoch": 0.21013076807455283, + "flos": 39333186990720.0, + "grad_norm": 2.133768891517955, + "language_loss": 0.79074955, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.81264305, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.15997314, + "step": 3495, + "time_per_iteration": 2.7409794330596924 + }, + { + "auxiliary_loss_clip": 0.0114962, + "auxiliary_loss_mlp": 0.0104379, + "balance_loss_clip": 1.05350232, + "balance_loss_mlp": 1.02738714, + "epoch": 0.2101908913272208, + "flos": 28914844086720.0, + "grad_norm": 1.8599417646192993, + "language_loss": 0.77589166, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79782581, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.16394043, + "step": 3496, + "time_per_iteration": 2.72810959815979 + }, + { + "auxiliary_loss_clip": 0.01150584, + "auxiliary_loss_mlp": 0.01048294, + "balance_loss_clip": 1.05296564, + "balance_loss_mlp": 1.03134227, + "epoch": 0.21025101457988876, + "flos": 24862559489280.0, + "grad_norm": 2.097289507942934, + "language_loss": 0.82401532, + "learning_rate": 3.668744875505915e-06, + "loss": 0.84600407, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.16943359, + "step": 3497, + "time_per_iteration": 2.6572299003601074 + }, + { + "auxiliary_loss_clip": 0.01155545, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_clip": 1.05718553, + "balance_loss_mlp": 1.02803087, + "epoch": 0.21031113783255675, + "flos": 31451876773920.0, + "grad_norm": 1.903623655887436, + "language_loss": 0.67151874, + "learning_rate": 3.668530172166741e-06, + "loss": 0.69351983, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.1652832, + "step": 3498, + "time_per_iteration": 2.7421278953552246 + }, + { + "auxiliary_loss_clip": 0.01154581, + "auxiliary_loss_mlp": 0.0103955, + "balance_loss_clip": 1.05508363, + "balance_loss_mlp": 1.02261019, + "epoch": 0.21037126108522472, + "flos": 26866704080160.0, + "grad_norm": 1.9389484470807241, + "language_loss": 0.8080259, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.8299672, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.16955566, + "step": 3499, + "time_per_iteration": 2.6843795776367188 + }, + { + "auxiliary_loss_clip": 0.01150513, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.0554738, + "balance_loss_mlp": 1.02238798, + "epoch": 0.21043138433789269, + "flos": 30913761948480.0, + "grad_norm": 1.8890845586899143, + "language_loss": 0.78190565, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80378491, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.15020752, + "step": 3500, + "time_per_iteration": 2.767273426055908 + }, + { + "auxiliary_loss_clip": 0.01149965, + "auxiliary_loss_mlp": 0.01040037, + "balance_loss_clip": 1.0553112, + "balance_loss_mlp": 1.02419376, + "epoch": 0.21049150759056065, + "flos": 31184662897440.0, + "grad_norm": 1.81050491369931, + "language_loss": 0.74128222, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.76318216, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.15838623, + "step": 3501, + "time_per_iteration": 2.709620714187622 + }, + { + "auxiliary_loss_clip": 0.0114982, + "auxiliary_loss_mlp": 0.01035485, + "balance_loss_clip": 1.05535579, + "balance_loss_mlp": 1.01966596, + "epoch": 0.21055163084322862, + "flos": 29890157893920.0, + "grad_norm": 1.5757911000653233, + "language_loss": 0.75247902, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77433205, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.15820312, + "step": 3502, + "time_per_iteration": 2.7511143684387207 + }, + { + "auxiliary_loss_clip": 0.01149765, + "auxiliary_loss_mlp": 0.01031026, + "balance_loss_clip": 1.0543884, + "balance_loss_mlp": 1.01521909, + "epoch": 0.21061175409589658, + "flos": 31407233081760.0, + "grad_norm": 2.264222637945093, + "language_loss": 0.77066958, + "learning_rate": 3.667455706571316e-06, + "loss": 0.79247755, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 0.95410156, + "router_z_loss_mlp": 0.15783691, + "step": 3503, + "time_per_iteration": 2.8323235511779785 + }, + { + "auxiliary_loss_clip": 0.01160214, + "auxiliary_loss_mlp": 0.0104276, + "balance_loss_clip": 1.0575372, + "balance_loss_mlp": 1.02351975, + "epoch": 0.21067187734856455, + "flos": 23170515570720.0, + "grad_norm": 2.4489200458891114, + "language_loss": 0.78466177, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.80669153, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.19238281, + "step": 3504, + "time_per_iteration": 2.7024004459381104 + }, + { + "auxiliary_loss_clip": 0.01156159, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_clip": 1.05532658, + "balance_loss_mlp": 1.02871943, + "epoch": 0.21073200060123254, + "flos": 30116536702560.0, + "grad_norm": 1.9538099857729512, + "language_loss": 0.76558673, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.78760159, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.1661377, + "step": 3505, + "time_per_iteration": 2.8004443645477295 + }, + { + "auxiliary_loss_clip": 0.01149343, + "auxiliary_loss_mlp": 0.01047896, + "balance_loss_clip": 1.05544043, + "balance_loss_mlp": 1.03130162, + "epoch": 0.2107921238539005, + "flos": 34836491335680.0, + "grad_norm": 1.9901549021165124, + "language_loss": 0.64114368, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.6631161, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.16583252, + "step": 3506, + "time_per_iteration": 2.783552646636963 + }, + { + "auxiliary_loss_clip": 0.01149183, + "auxiliary_loss_mlp": 0.0104238, + "balance_loss_clip": 1.05347908, + "balance_loss_mlp": 1.02616751, + "epoch": 0.21085224710656847, + "flos": 31586294057760.0, + "grad_norm": 2.0145360440170292, + "language_loss": 0.82051247, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84242809, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.16210938, + "step": 3507, + "time_per_iteration": 2.8114490509033203 + }, + { + "auxiliary_loss_clip": 0.01150822, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_clip": 1.05381656, + "balance_loss_mlp": 1.02715492, + "epoch": 0.21091237035923643, + "flos": 18273647377440.0, + "grad_norm": 1.7750203181553739, + "language_loss": 0.7574333, + "learning_rate": 3.666379660223824e-06, + "loss": 0.77937472, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.16162109, + "step": 3508, + "time_per_iteration": 2.6668314933776855 + }, + { + "auxiliary_loss_clip": 0.01154512, + "auxiliary_loss_mlp": 0.01037023, + "balance_loss_clip": 1.05610597, + "balance_loss_mlp": 1.02081025, + "epoch": 0.2109724936119044, + "flos": 19965245605920.0, + "grad_norm": 4.802996680454918, + "language_loss": 0.84951591, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.87143129, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 0.98486328, + "router_z_loss_mlp": 0.16223145, + "step": 3509, + "time_per_iteration": 2.7252249717712402 + }, + { + "auxiliary_loss_clip": 0.01158304, + "auxiliary_loss_mlp": 0.01039736, + "balance_loss_clip": 1.05681503, + "balance_loss_mlp": 1.0221765, + "epoch": 0.21103261686457236, + "flos": 38440353664800.0, + "grad_norm": 1.8249361331612566, + "language_loss": 0.67971057, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.70169103, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.17553711, + "step": 3510, + "time_per_iteration": 2.745502471923828 + }, + { + "auxiliary_loss_clip": 0.01157282, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.05757308, + "balance_loss_mlp": 1.02302337, + "epoch": 0.21109274011724033, + "flos": 33365842600320.0, + "grad_norm": 1.6549676110383233, + "language_loss": 0.72133911, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74330074, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.1585083, + "step": 3511, + "time_per_iteration": 2.7459981441497803 + }, + { + "auxiliary_loss_clip": 0.01160872, + "auxiliary_loss_mlp": 0.01039581, + "balance_loss_clip": 1.05910635, + "balance_loss_mlp": 1.0215807, + "epoch": 0.21115286336990832, + "flos": 21744388975680.0, + "grad_norm": 2.3516481847327846, + "language_loss": 0.68874931, + "learning_rate": 3.665517685689794e-06, + "loss": 0.7107538, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 1.01757812, + "router_z_loss_mlp": 0.17993164, + "step": 3512, + "time_per_iteration": 2.6608426570892334 + }, + { + "auxiliary_loss_clip": 0.01154157, + "auxiliary_loss_mlp": 0.01047196, + "balance_loss_clip": 1.05416822, + "balance_loss_mlp": 1.03010106, + "epoch": 0.2112129866225763, + "flos": 33187591969920.0, + "grad_norm": 2.0669971730764147, + "language_loss": 0.73025227, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.75226581, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.17089844, + "step": 3513, + "time_per_iteration": 2.824617624282837 + }, + { + "auxiliary_loss_clip": 0.01153325, + "auxiliary_loss_mlp": 0.01039203, + "balance_loss_clip": 1.05867052, + "balance_loss_mlp": 1.02407575, + "epoch": 0.21127310987524425, + "flos": 28957299845760.0, + "grad_norm": 2.1647716447012804, + "language_loss": 0.74111885, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76304406, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.15112305, + "step": 3514, + "time_per_iteration": 2.7833375930786133 + }, + { + "auxiliary_loss_clip": 0.01156597, + "auxiliary_loss_mlp": 0.01040936, + "balance_loss_clip": 1.05665004, + "balance_loss_mlp": 1.0249027, + "epoch": 0.21133323312791222, + "flos": 22369198079520.0, + "grad_norm": 2.010562492638396, + "language_loss": 0.76065534, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.78263068, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.16040039, + "step": 3515, + "time_per_iteration": 2.6660280227661133 + }, + { + "auxiliary_loss_clip": 0.01154382, + "auxiliary_loss_mlp": 0.01045127, + "balance_loss_clip": 1.05818725, + "balance_loss_mlp": 1.02852154, + "epoch": 0.21139335638058018, + "flos": 21879495053280.0, + "grad_norm": 1.8940825689339273, + "language_loss": 0.67737877, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.69937384, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.16589355, + "step": 3516, + "time_per_iteration": 2.7721333503723145 + }, + { + "auxiliary_loss_clip": 0.01158882, + "auxiliary_loss_mlp": 0.0104403, + "balance_loss_clip": 1.05876994, + "balance_loss_mlp": 1.02678001, + "epoch": 0.21145347963324815, + "flos": 29983172868000.0, + "grad_norm": 2.2177749269595943, + "language_loss": 0.8553915, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87742066, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.17260742, + "step": 3517, + "time_per_iteration": 2.713277816772461 + }, + { + "auxiliary_loss_clip": 0.0115582, + "auxiliary_loss_mlp": 0.01037734, + "balance_loss_clip": 1.05753529, + "balance_loss_mlp": 1.02108085, + "epoch": 0.21151360288591614, + "flos": 43740433192320.0, + "grad_norm": 2.029794462922291, + "language_loss": 0.62864363, + "learning_rate": 3.664222829354512e-06, + "loss": 0.65057921, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.16662598, + "step": 3518, + "time_per_iteration": 2.8878540992736816 + }, + { + "auxiliary_loss_clip": 0.01154068, + "auxiliary_loss_mlp": 0.01049633, + "balance_loss_clip": 1.05633855, + "balance_loss_mlp": 1.033885, + "epoch": 0.2115737261385841, + "flos": 30068286972480.0, + "grad_norm": 2.5093193991858898, + "language_loss": 0.89164889, + "learning_rate": 3.664006799041303e-06, + "loss": 0.91368592, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.15759277, + "step": 3519, + "time_per_iteration": 2.691432476043701 + }, + { + "auxiliary_loss_clip": 0.01158667, + "auxiliary_loss_mlp": 0.01051814, + "balance_loss_clip": 1.05860364, + "balance_loss_mlp": 1.03476715, + "epoch": 0.21163384939125207, + "flos": 30782383460640.0, + "grad_norm": 1.9828836162129444, + "language_loss": 0.8160044, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.8381092, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 1.00146484, + "router_z_loss_mlp": 0.17053223, + "step": 3520, + "time_per_iteration": 2.7646756172180176 + }, + { + "auxiliary_loss_clip": 0.01154619, + "auxiliary_loss_mlp": 0.01046177, + "balance_loss_clip": 1.05899501, + "balance_loss_mlp": 1.03002453, + "epoch": 0.21169397264392004, + "flos": 31808053896480.0, + "grad_norm": 1.9702559254481258, + "language_loss": 0.76317376, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.7851817, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.16162109, + "step": 3521, + "time_per_iteration": 2.722428798675537 + }, + { + "auxiliary_loss_clip": 0.01154256, + "auxiliary_loss_mlp": 0.01040887, + "balance_loss_clip": 1.05679286, + "balance_loss_mlp": 1.02600348, + "epoch": 0.211754095896588, + "flos": 28197303629760.0, + "grad_norm": 1.7936093032255653, + "language_loss": 0.75419724, + "learning_rate": 3.663358329538626e-06, + "loss": 0.77614868, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 0.97314453, + "router_z_loss_mlp": 0.14868164, + "step": 3522, + "time_per_iteration": 2.736245632171631 + }, + { + "auxiliary_loss_clip": 0.01155064, + "auxiliary_loss_mlp": 0.01046686, + "balance_loss_clip": 1.05641794, + "balance_loss_mlp": 1.02946031, + "epoch": 0.21181421914925597, + "flos": 34071997701600.0, + "grad_norm": 2.097966152400112, + "language_loss": 0.70695472, + "learning_rate": 3.663142046877374e-06, + "loss": 0.7289722, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.17211914, + "step": 3523, + "time_per_iteration": 2.7369017601013184 + }, + { + "auxiliary_loss_clip": 0.01157456, + "auxiliary_loss_mlp": 0.01041454, + "balance_loss_clip": 1.05983889, + "balance_loss_mlp": 1.02524781, + "epoch": 0.21187434240192393, + "flos": 20900129518080.0, + "grad_norm": 2.672389184053344, + "language_loss": 0.76545781, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.78744692, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 0.97558594, + "router_z_loss_mlp": 0.16204834, + "step": 3524, + "time_per_iteration": 2.66597580909729 + }, + { + "auxiliary_loss_clip": 0.01156211, + "auxiliary_loss_mlp": 0.0104261, + "balance_loss_clip": 1.05437088, + "balance_loss_mlp": 1.02662373, + "epoch": 0.21193446565459192, + "flos": 27268254205920.0, + "grad_norm": 1.8064251416433226, + "language_loss": 0.81605977, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83804798, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 1.01660156, + "router_z_loss_mlp": 0.15991211, + "step": 3525, + "time_per_iteration": 2.7250587940216064 + }, + { + "auxiliary_loss_clip": 0.01157051, + "auxiliary_loss_mlp": 0.01037895, + "balance_loss_clip": 1.05916834, + "balance_loss_mlp": 1.02151513, + "epoch": 0.2119945889072599, + "flos": 33189091109280.0, + "grad_norm": 3.4009345486390115, + "language_loss": 0.75368613, + "learning_rate": 3.662492820527356e-06, + "loss": 0.7756356, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.16369629, + "step": 3526, + "time_per_iteration": 2.7998650074005127 + }, + { + "auxiliary_loss_clip": 0.01156407, + "auxiliary_loss_mlp": 0.01040931, + "balance_loss_clip": 1.05654693, + "balance_loss_mlp": 1.02406335, + "epoch": 0.21205471215992786, + "flos": 25614735870240.0, + "grad_norm": 2.0499542884884576, + "language_loss": 0.77090931, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79288268, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.1685791, + "step": 3527, + "time_per_iteration": 2.746917247772217 + }, + { + "auxiliary_loss_clip": 0.01154821, + "auxiliary_loss_mlp": 0.01047673, + "balance_loss_clip": 1.05702329, + "balance_loss_mlp": 1.03016162, + "epoch": 0.21211483541259582, + "flos": 25352910792000.0, + "grad_norm": 2.2028531028079796, + "language_loss": 0.78054434, + "learning_rate": 3.662059687737528e-06, + "loss": 0.80256927, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.17504883, + "step": 3528, + "time_per_iteration": 5.8170764446258545 + }, + { + "auxiliary_loss_clip": 0.0115554, + "auxiliary_loss_mlp": 0.01048193, + "balance_loss_clip": 1.05795527, + "balance_loss_mlp": 1.03173041, + "epoch": 0.21217495866526379, + "flos": 23171447468160.0, + "grad_norm": 1.93696673724811, + "language_loss": 0.81586599, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.83790332, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.16455078, + "step": 3529, + "time_per_iteration": 2.6806652545928955 + }, + { + "auxiliary_loss_clip": 0.01157309, + "auxiliary_loss_mlp": 0.01039835, + "balance_loss_clip": 1.05723357, + "balance_loss_mlp": 1.02377784, + "epoch": 0.21223508191793175, + "flos": 25218331439040.0, + "grad_norm": 4.487564474047329, + "language_loss": 0.77042854, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.7924, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 1.00097656, + "router_z_loss_mlp": 0.16064453, + "step": 3530, + "time_per_iteration": 2.6965887546539307 + }, + { + "auxiliary_loss_clip": 0.01155521, + "auxiliary_loss_mlp": 0.01044171, + "balance_loss_clip": 1.05876553, + "balance_loss_mlp": 1.02849543, + "epoch": 0.21229520517059972, + "flos": 26376514846560.0, + "grad_norm": 2.720552397316369, + "language_loss": 0.83429027, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85628724, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.15661621, + "step": 3531, + "time_per_iteration": 2.7944037914276123 + }, + { + "auxiliary_loss_clip": 0.01157945, + "auxiliary_loss_mlp": 0.01040901, + "balance_loss_clip": 1.0588069, + "balance_loss_mlp": 1.02336514, + "epoch": 0.2123553284232677, + "flos": 17071387519680.0, + "grad_norm": 2.8333949568738768, + "language_loss": 0.73326492, + "learning_rate": 3.661192665917977e-06, + "loss": 0.75525343, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 0.99121094, + "router_z_loss_mlp": 0.17529297, + "step": 3532, + "time_per_iteration": 5.613589525222778 + }, + { + "auxiliary_loss_clip": 0.01158246, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_clip": 1.05987215, + "balance_loss_mlp": 1.02842116, + "epoch": 0.21241545167593567, + "flos": 22325526802080.0, + "grad_norm": 2.1842074668291005, + "language_loss": 0.74146152, + "learning_rate": 3.660975752961054e-06, + "loss": 0.76350272, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 0.98486328, + "router_z_loss_mlp": 0.17431641, + "step": 3533, + "time_per_iteration": 2.6773500442504883 + }, + { + "auxiliary_loss_clip": 0.01161479, + "auxiliary_loss_mlp": 0.01045142, + "balance_loss_clip": 1.06118536, + "balance_loss_mlp": 1.02851248, + "epoch": 0.21247557492860364, + "flos": 42358220978400.0, + "grad_norm": 2.1726526936566652, + "language_loss": 0.70811677, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73018301, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 1.00292969, + "router_z_loss_mlp": 0.16638184, + "step": 3534, + "time_per_iteration": 2.8615574836730957 + }, + { + "auxiliary_loss_clip": 0.01162447, + "auxiliary_loss_mlp": 0.01041778, + "balance_loss_clip": 1.0625515, + "balance_loss_mlp": 1.02450466, + "epoch": 0.2125356981812716, + "flos": 26910253805760.0, + "grad_norm": 2.551451362259805, + "language_loss": 0.71894485, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.74098706, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.17272949, + "step": 3535, + "time_per_iteration": 2.714277982711792 + }, + { + "auxiliary_loss_clip": 0.01156003, + "auxiliary_loss_mlp": 0.01052455, + "balance_loss_clip": 1.05981231, + "balance_loss_mlp": 1.03605211, + "epoch": 0.21259582143393957, + "flos": 34835356851840.0, + "grad_norm": 1.8719587737157888, + "language_loss": 0.70597768, + "learning_rate": 3.660324636216996e-06, + "loss": 0.72806227, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.16394043, + "step": 3536, + "time_per_iteration": 2.804661750793457 + }, + { + "auxiliary_loss_clip": 0.01159285, + "auxiliary_loss_mlp": 0.0104821, + "balance_loss_clip": 1.05906975, + "balance_loss_mlp": 1.03135395, + "epoch": 0.21265594468660753, + "flos": 24551471748960.0, + "grad_norm": 2.591427658023111, + "language_loss": 0.87837458, + "learning_rate": 3.660107471371981e-06, + "loss": 0.90044951, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 1.00244141, + "router_z_loss_mlp": 0.1685791, + "step": 3537, + "time_per_iteration": 2.7059030532836914 + }, + { + "auxiliary_loss_clip": 0.01154142, + "auxiliary_loss_mlp": 0.01041541, + "balance_loss_clip": 1.05761409, + "balance_loss_mlp": 1.02444625, + "epoch": 0.21271606793927553, + "flos": 28157724597600.0, + "grad_norm": 1.7534598735092615, + "language_loss": 0.80493635, + "learning_rate": 3.659890243575524e-06, + "loss": 0.82689321, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.17089844, + "step": 3538, + "time_per_iteration": 2.7474615573883057 + }, + { + "auxiliary_loss_clip": 0.01152329, + "auxiliary_loss_mlp": 0.01041823, + "balance_loss_clip": 1.05556893, + "balance_loss_mlp": 1.02668357, + "epoch": 0.2127761911919435, + "flos": 32200082461440.0, + "grad_norm": 1.6463341275777135, + "language_loss": 0.8712275, + "learning_rate": 3.659672952835863e-06, + "loss": 0.89316899, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.15148926, + "step": 3539, + "time_per_iteration": 2.8770086765289307 + }, + { + "auxiliary_loss_clip": 0.01157685, + "auxiliary_loss_mlp": 0.01047029, + "balance_loss_clip": 1.0587101, + "balance_loss_mlp": 1.03022075, + "epoch": 0.21283631444461146, + "flos": 24682688167680.0, + "grad_norm": 2.9304427771023955, + "language_loss": 0.57463878, + "learning_rate": 3.659455599161237e-06, + "loss": 0.59668595, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.16821289, + "step": 3540, + "time_per_iteration": 2.6780667304992676 + }, + { + "auxiliary_loss_clip": 0.01160533, + "auxiliary_loss_mlp": 0.01040826, + "balance_loss_clip": 1.06119466, + "balance_loss_mlp": 1.0248754, + "epoch": 0.21289643769727942, + "flos": 16492883316480.0, + "grad_norm": 2.3716389173481556, + "language_loss": 0.75847447, + "learning_rate": 3.659238182559888e-06, + "loss": 0.78048801, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 0.99365234, + "router_z_loss_mlp": 0.1595459, + "step": 3541, + "time_per_iteration": 2.7984414100646973 + }, + { + "auxiliary_loss_clip": 0.01156757, + "auxiliary_loss_mlp": 0.01043876, + "balance_loss_clip": 1.06079853, + "balance_loss_mlp": 1.02763963, + "epoch": 0.2129565609499474, + "flos": 30294301125600.0, + "grad_norm": 2.263575100217958, + "language_loss": 0.69316083, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.71516716, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.16235352, + "step": 3542, + "time_per_iteration": 2.7403862476348877 + }, + { + "auxiliary_loss_clip": 0.01155552, + "auxiliary_loss_mlp": 0.01041578, + "balance_loss_clip": 1.05887341, + "balance_loss_mlp": 1.02611685, + "epoch": 0.21301668420261535, + "flos": 28869309014400.0, + "grad_norm": 2.1513738790141006, + "language_loss": 0.75743157, + "learning_rate": 3.658803160610004e-06, + "loss": 0.77940285, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.15478516, + "step": 3543, + "time_per_iteration": 2.750948905944824 + }, + { + "auxiliary_loss_clip": 0.01157834, + "auxiliary_loss_mlp": 0.01043414, + "balance_loss_clip": 1.06201124, + "balance_loss_mlp": 1.02692699, + "epoch": 0.21307680745528332, + "flos": 19965569744160.0, + "grad_norm": 2.2564729140808795, + "language_loss": 0.66530365, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.68731606, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.16491699, + "step": 3544, + "time_per_iteration": 2.679687976837158 + }, + { + "auxiliary_loss_clip": 0.01153149, + "auxiliary_loss_mlp": 0.01045704, + "balance_loss_clip": 1.05589628, + "balance_loss_mlp": 1.0294559, + "epoch": 0.2131369307079513, + "flos": 23304568199040.0, + "grad_norm": 1.6925205608489513, + "language_loss": 0.70764673, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.72963536, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 0.97314453, + "router_z_loss_mlp": 0.16247559, + "step": 3545, + "time_per_iteration": 2.6991569995880127 + }, + { + "auxiliary_loss_clip": 0.01161048, + "auxiliary_loss_mlp": 0.0104687, + "balance_loss_clip": 1.06106853, + "balance_loss_mlp": 1.03021646, + "epoch": 0.21319705396061928, + "flos": 37061099212320.0, + "grad_norm": 1.975908781794066, + "language_loss": 0.72141135, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74349052, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 1.00048828, + "router_z_loss_mlp": 0.16662598, + "step": 3546, + "time_per_iteration": 2.7603464126586914 + }, + { + "auxiliary_loss_clip": 0.01156448, + "auxiliary_loss_mlp": 0.01042018, + "balance_loss_clip": 1.05844545, + "balance_loss_mlp": 1.02544749, + "epoch": 0.21325717721328724, + "flos": 26547593918400.0, + "grad_norm": 2.285947034944586, + "language_loss": 0.79981029, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82179493, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.16564941, + "step": 3547, + "time_per_iteration": 2.711420774459839 + }, + { + "auxiliary_loss_clip": 0.0115729, + "auxiliary_loss_mlp": 0.01042145, + "balance_loss_clip": 1.05677509, + "balance_loss_mlp": 1.02524066, + "epoch": 0.2133173004659552, + "flos": 35058089105280.0, + "grad_norm": 2.4686406347167833, + "language_loss": 0.74458647, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.76658082, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.16906738, + "step": 3548, + "time_per_iteration": 2.782299757003784 + }, + { + "auxiliary_loss_clip": 0.01159835, + "auxiliary_loss_mlp": 0.01053313, + "balance_loss_clip": 1.05925035, + "balance_loss_mlp": 1.0355984, + "epoch": 0.21337742371862317, + "flos": 20544317051040.0, + "grad_norm": 2.0696249512922744, + "language_loss": 0.7464813, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76861274, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 1.00537109, + "router_z_loss_mlp": 0.17712402, + "step": 3549, + "time_per_iteration": 2.8554584980010986 + }, + { + "auxiliary_loss_clip": 0.01156065, + "auxiliary_loss_mlp": 0.01044453, + "balance_loss_clip": 1.05843735, + "balance_loss_mlp": 1.02858663, + "epoch": 0.21343754697129114, + "flos": 29803706719200.0, + "grad_norm": 2.2123526490251857, + "language_loss": 0.80909491, + "learning_rate": 3.657278602806357e-06, + "loss": 0.83110011, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.15869141, + "step": 3550, + "time_per_iteration": 2.725259304046631 + }, + { + "auxiliary_loss_clip": 0.01151842, + "auxiliary_loss_mlp": 0.01043357, + "balance_loss_clip": 1.05640841, + "balance_loss_mlp": 1.02783608, + "epoch": 0.21349767022395913, + "flos": 23523046138080.0, + "grad_norm": 1.7856149047764631, + "language_loss": 0.88026166, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90221369, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.15515137, + "step": 3551, + "time_per_iteration": 2.751166820526123 + }, + { + "auxiliary_loss_clip": 0.01153681, + "auxiliary_loss_mlp": 0.01048922, + "balance_loss_clip": 1.057019, + "balance_loss_mlp": 1.03254235, + "epoch": 0.2135577934766271, + "flos": 21167262360000.0, + "grad_norm": 2.078873582411819, + "language_loss": 0.83383799, + "learning_rate": 3.656842449140983e-06, + "loss": 0.85586405, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.16394043, + "step": 3552, + "time_per_iteration": 2.706563949584961 + }, + { + "auxiliary_loss_clip": 0.0115512, + "auxiliary_loss_mlp": 0.01048346, + "balance_loss_clip": 1.05793297, + "balance_loss_mlp": 1.03164458, + "epoch": 0.21361791672929506, + "flos": 29355243933600.0, + "grad_norm": 2.1428949525807615, + "language_loss": 0.76280516, + "learning_rate": 3.656624278062713e-06, + "loss": 0.78483987, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.16699219, + "step": 3553, + "time_per_iteration": 2.6946866512298584 + }, + { + "auxiliary_loss_clip": 0.01153384, + "auxiliary_loss_mlp": 0.01040315, + "balance_loss_clip": 1.05843425, + "balance_loss_mlp": 1.0254972, + "epoch": 0.21367803998196302, + "flos": 27042726260160.0, + "grad_norm": 1.6146062009573223, + "language_loss": 0.72631145, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.74824846, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.14807129, + "step": 3554, + "time_per_iteration": 2.705054521560669 + }, + { + "auxiliary_loss_clip": 0.01154567, + "auxiliary_loss_mlp": 0.0103846, + "balance_loss_clip": 1.0576725, + "balance_loss_mlp": 1.02243876, + "epoch": 0.213738163234631, + "flos": 25486193592000.0, + "grad_norm": 2.2450728261834634, + "language_loss": 0.67755103, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.69948137, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.16027832, + "step": 3555, + "time_per_iteration": 2.672718048095703 + }, + { + "auxiliary_loss_clip": 0.01157042, + "auxiliary_loss_mlp": 0.0103736, + "balance_loss_clip": 1.05743313, + "balance_loss_mlp": 1.0205034, + "epoch": 0.21379828648729896, + "flos": 34658078636160.0, + "grad_norm": 1.7152236168285249, + "language_loss": 0.64656436, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.66850835, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.1685791, + "step": 3556, + "time_per_iteration": 2.7355847358703613 + }, + { + "auxiliary_loss_clip": 0.01155984, + "auxiliary_loss_mlp": 0.01049508, + "balance_loss_clip": 1.05766165, + "balance_loss_mlp": 1.03210354, + "epoch": 0.21385840973996692, + "flos": 31091080681440.0, + "grad_norm": 1.7704778744762304, + "language_loss": 0.7251941, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.74724901, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.17407227, + "step": 3557, + "time_per_iteration": 2.6860830783843994 + }, + { + "auxiliary_loss_clip": 0.01157343, + "auxiliary_loss_mlp": 0.010438, + "balance_loss_clip": 1.05653572, + "balance_loss_mlp": 1.02626467, + "epoch": 0.2139185329926349, + "flos": 34256852648640.0, + "grad_norm": 2.1112468602999828, + "language_loss": 0.67418247, + "learning_rate": 3.655532480546528e-06, + "loss": 0.69619393, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 1.0078125, + "router_z_loss_mlp": 0.17529297, + "step": 3558, + "time_per_iteration": 2.7215099334716797 + }, + { + "auxiliary_loss_clip": 0.01157481, + "auxiliary_loss_mlp": 0.01038436, + "balance_loss_clip": 1.05471039, + "balance_loss_mlp": 1.02140069, + "epoch": 0.21397865624530288, + "flos": 23926581610560.0, + "grad_norm": 2.6193039148263404, + "language_loss": 0.79973418, + "learning_rate": 3.655313932676286e-06, + "loss": 0.8216933, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.17041016, + "step": 3559, + "time_per_iteration": 2.699488639831543 + }, + { + "auxiliary_loss_clip": 0.01153556, + "auxiliary_loss_mlp": 0.01039392, + "balance_loss_clip": 1.05558562, + "balance_loss_mlp": 1.02438354, + "epoch": 0.21403877949797084, + "flos": 30116658254400.0, + "grad_norm": 1.6641831007985117, + "language_loss": 0.67456186, + "learning_rate": 3.655095322036373e-06, + "loss": 0.6964913, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.15002441, + "step": 3560, + "time_per_iteration": 2.7271721363067627 + }, + { + "auxiliary_loss_clip": 0.0115892, + "auxiliary_loss_mlp": 0.0104706, + "balance_loss_clip": 1.05893874, + "balance_loss_mlp": 1.03037047, + "epoch": 0.2140989027506388, + "flos": 24234914175840.0, + "grad_norm": 2.098521686089679, + "language_loss": 0.72926253, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75132227, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 0.16699219, + "step": 3561, + "time_per_iteration": 2.6937592029571533 + }, + { + "auxiliary_loss_clip": 0.01157978, + "auxiliary_loss_mlp": 0.01045258, + "balance_loss_clip": 1.05864573, + "balance_loss_mlp": 1.02912879, + "epoch": 0.21415902600330677, + "flos": 23348847235680.0, + "grad_norm": 2.2347473055173626, + "language_loss": 0.77059603, + "learning_rate": 3.654657912480698e-06, + "loss": 0.79262829, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 0.99365234, + "router_z_loss_mlp": 0.16125488, + "step": 3562, + "time_per_iteration": 2.656527519226074 + }, + { + "auxiliary_loss_clip": 0.01155121, + "auxiliary_loss_mlp": 0.01039684, + "balance_loss_clip": 1.05775142, + "balance_loss_mlp": 1.02300644, + "epoch": 0.21421914925597474, + "flos": 27177265095840.0, + "grad_norm": 1.6542067738717565, + "language_loss": 0.8458975, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.86784554, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 0.97314453, + "router_z_loss_mlp": 0.16662598, + "step": 3563, + "time_per_iteration": 2.6978845596313477 + }, + { + "auxiliary_loss_clip": 0.01154021, + "auxiliary_loss_mlp": 0.01036315, + "balance_loss_clip": 1.05708361, + "balance_loss_mlp": 1.02099621, + "epoch": 0.2142792725086427, + "flos": 41335508304000.0, + "grad_norm": 1.5368434308414347, + "language_loss": 0.76550752, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.78741086, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.15307617, + "step": 3564, + "time_per_iteration": 2.8383495807647705 + }, + { + "auxiliary_loss_clip": 0.01152358, + "auxiliary_loss_mlp": 0.01043924, + "balance_loss_clip": 1.0569098, + "balance_loss_mlp": 1.02755618, + "epoch": 0.2143393957613107, + "flos": 24233860726560.0, + "grad_norm": 1.8280144102982108, + "language_loss": 0.88576388, + "learning_rate": 3.654001327581981e-06, + "loss": 0.90772671, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 0.95410156, + "router_z_loss_mlp": 0.16375732, + "step": 3565, + "time_per_iteration": 2.7076432704925537 + }, + { + "auxiliary_loss_clip": 0.01062314, + "auxiliary_loss_mlp": 0.0100576, + "balance_loss_clip": 1.02909958, + "balance_loss_mlp": 1.00359023, + "epoch": 0.21439951901397866, + "flos": 83621418638400.0, + "grad_norm": 0.8354416157840305, + "language_loss": 0.52269334, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54337406, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 0.33178711, + "router_z_loss_mlp": 0.02172852, + "step": 3566, + "time_per_iteration": 3.2141900062561035 + }, + { + "auxiliary_loss_clip": 0.01149928, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.0556581, + "balance_loss_mlp": 1.01894701, + "epoch": 0.21445964226664663, + "flos": 24017125030560.0, + "grad_norm": 1.804969050842099, + "language_loss": 0.67239559, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69423777, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.15344238, + "step": 3567, + "time_per_iteration": 2.710219621658325 + }, + { + "auxiliary_loss_clip": 0.01150157, + "auxiliary_loss_mlp": 0.01037559, + "balance_loss_clip": 1.05714369, + "balance_loss_mlp": 1.02225304, + "epoch": 0.2145197655193146, + "flos": 37957498058880.0, + "grad_norm": 1.6672073560767804, + "language_loss": 0.7398262, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.76170331, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.15319824, + "step": 3568, + "time_per_iteration": 4.229829549789429 + }, + { + "auxiliary_loss_clip": 0.01150925, + "auxiliary_loss_mlp": 0.01046911, + "balance_loss_clip": 1.05627525, + "balance_loss_mlp": 1.0311749, + "epoch": 0.21457988877198256, + "flos": 24551350197120.0, + "grad_norm": 1.69431819753847, + "language_loss": 0.77688497, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.79886335, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.1574707, + "step": 3569, + "time_per_iteration": 2.683384895324707 + }, + { + "auxiliary_loss_clip": 0.01161957, + "auxiliary_loss_mlp": 0.01038486, + "balance_loss_clip": 1.0593338, + "balance_loss_mlp": 1.020473, + "epoch": 0.21464001202465052, + "flos": 22681015130880.0, + "grad_norm": 16.353952878989436, + "language_loss": 0.69569373, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.7176981, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.18005371, + "step": 3570, + "time_per_iteration": 2.6931207180023193 + }, + { + "auxiliary_loss_clip": 0.01157635, + "auxiliary_loss_mlp": 0.01042442, + "balance_loss_clip": 1.05782151, + "balance_loss_mlp": 1.02641416, + "epoch": 0.21470013527731852, + "flos": 26643404584800.0, + "grad_norm": 4.466276302513099, + "language_loss": 0.78494018, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.80694097, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.16027832, + "step": 3571, + "time_per_iteration": 2.719947338104248 + }, + { + "auxiliary_loss_clip": 0.01156292, + "auxiliary_loss_mlp": 0.01047102, + "balance_loss_clip": 1.05871964, + "balance_loss_mlp": 1.03005481, + "epoch": 0.21476025852998648, + "flos": 21479200963200.0, + "grad_norm": 2.6906441633603686, + "language_loss": 0.82619238, + "learning_rate": 3.652467101342991e-06, + "loss": 0.84822637, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 0.97607422, + "router_z_loss_mlp": 0.17041016, + "step": 3572, + "time_per_iteration": 5.661884069442749 + }, + { + "auxiliary_loss_clip": 0.01160136, + "auxiliary_loss_mlp": 0.01042027, + "balance_loss_clip": 1.05766797, + "balance_loss_mlp": 1.02559972, + "epoch": 0.21482038178265445, + "flos": 30295557161280.0, + "grad_norm": 3.7591902247082793, + "language_loss": 0.65314651, + "learning_rate": 3.652247675452598e-06, + "loss": 0.67516816, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 1.02490234, + "router_z_loss_mlp": 0.16418457, + "step": 3573, + "time_per_iteration": 2.7250587940216064 + }, + { + "auxiliary_loss_clip": 0.01147778, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_clip": 1.0535481, + "balance_loss_mlp": 1.03013766, + "epoch": 0.2148805050353224, + "flos": 28379930126400.0, + "grad_norm": 1.9864472106357918, + "language_loss": 0.75300342, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77494109, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.15844727, + "step": 3574, + "time_per_iteration": 2.72361421585083 + }, + { + "auxiliary_loss_clip": 0.01150692, + "auxiliary_loss_mlp": 0.01040753, + "balance_loss_clip": 1.0552206, + "balance_loss_mlp": 1.02460039, + "epoch": 0.21494062828799038, + "flos": 26016407547840.0, + "grad_norm": 1.905754375959609, + "language_loss": 0.72111738, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.74303186, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.16149902, + "step": 3575, + "time_per_iteration": 2.76698899269104 + }, + { + "auxiliary_loss_clip": 0.01156189, + "auxiliary_loss_mlp": 0.01041315, + "balance_loss_clip": 1.06026578, + "balance_loss_mlp": 1.0259608, + "epoch": 0.21500075154065834, + "flos": 22992548561280.0, + "grad_norm": 2.4718665430761946, + "language_loss": 0.68244475, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.70441979, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.15344238, + "step": 3576, + "time_per_iteration": 2.7345168590545654 + }, + { + "auxiliary_loss_clip": 0.01157579, + "auxiliary_loss_mlp": 0.01041717, + "balance_loss_clip": 1.05657434, + "balance_loss_mlp": 1.02480137, + "epoch": 0.2150608747933263, + "flos": 22502805017760.0, + "grad_norm": 2.124781408902381, + "language_loss": 0.88748115, + "learning_rate": 3.651369345440292e-06, + "loss": 0.90947413, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.16906738, + "step": 3577, + "time_per_iteration": 2.6699793338775635 + }, + { + "auxiliary_loss_clip": 0.01057062, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.02493453, + "balance_loss_mlp": 1.02780473, + "epoch": 0.2151209980459943, + "flos": 81263852100000.0, + "grad_norm": 0.8125534910853771, + "language_loss": 0.5614692, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.58233505, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 0.32128906, + "router_z_loss_mlp": 0.01721191, + "step": 3578, + "time_per_iteration": 3.3133347034454346 + }, + { + "auxiliary_loss_clip": 0.01151927, + "auxiliary_loss_mlp": 0.01043609, + "balance_loss_clip": 1.05558658, + "balance_loss_mlp": 1.02805185, + "epoch": 0.21518112129866226, + "flos": 26327373736320.0, + "grad_norm": 1.7951586129519974, + "language_loss": 0.88398278, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90593803, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.15545654, + "step": 3579, + "time_per_iteration": 2.6792471408843994 + }, + { + "auxiliary_loss_clip": 0.01152966, + "auxiliary_loss_mlp": 0.01048366, + "balance_loss_clip": 1.05489123, + "balance_loss_mlp": 1.03177214, + "epoch": 0.21524124455133023, + "flos": 24462103330080.0, + "grad_norm": 2.226001651981557, + "language_loss": 0.77906072, + "learning_rate": 3.650709940390972e-06, + "loss": 0.80107403, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.16601562, + "step": 3580, + "time_per_iteration": 2.724484443664551 + }, + { + "auxiliary_loss_clip": 0.01150714, + "auxiliary_loss_mlp": 0.01044306, + "balance_loss_clip": 1.05596256, + "balance_loss_mlp": 1.02790284, + "epoch": 0.2153013678039982, + "flos": 29226255965280.0, + "grad_norm": 1.9822462274017614, + "language_loss": 0.72489041, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.7468406, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.16394043, + "step": 3581, + "time_per_iteration": 2.725914239883423 + }, + { + "auxiliary_loss_clip": 0.01150383, + "auxiliary_loss_mlp": 0.0104476, + "balance_loss_clip": 1.05550957, + "balance_loss_mlp": 1.02712846, + "epoch": 0.21536149105666616, + "flos": 25129854400320.0, + "grad_norm": 2.343787413574907, + "language_loss": 0.70876932, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.73072076, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.17626953, + "step": 3582, + "time_per_iteration": 2.6741654872894287 + }, + { + "auxiliary_loss_clip": 0.01149406, + "auxiliary_loss_mlp": 0.01043595, + "balance_loss_clip": 1.05320334, + "balance_loss_mlp": 1.02698898, + "epoch": 0.21542161430933413, + "flos": 15691241687040.0, + "grad_norm": 4.084282868763301, + "language_loss": 0.8377856, + "learning_rate": 3.650049971985889e-06, + "loss": 0.85971558, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.16601562, + "step": 3583, + "time_per_iteration": 2.732745885848999 + }, + { + "auxiliary_loss_clip": 0.01154408, + "auxiliary_loss_mlp": 0.0104495, + "balance_loss_clip": 1.05498374, + "balance_loss_mlp": 1.02905905, + "epoch": 0.21548173756200212, + "flos": 31852495002240.0, + "grad_norm": 2.887064905757286, + "language_loss": 0.82939929, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.85139287, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.15893555, + "step": 3584, + "time_per_iteration": 2.697356939315796 + }, + { + "auxiliary_loss_clip": 0.01151197, + "auxiliary_loss_mlp": 0.01044251, + "balance_loss_clip": 1.0558238, + "balance_loss_mlp": 1.02772856, + "epoch": 0.21554186081467008, + "flos": 27044387468640.0, + "grad_norm": 2.7802664019412466, + "language_loss": 0.90365535, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.92560983, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 0.95361328, + "router_z_loss_mlp": 0.1652832, + "step": 3585, + "time_per_iteration": 2.702924966812134 + }, + { + "auxiliary_loss_clip": 0.01154464, + "auxiliary_loss_mlp": 0.01039301, + "balance_loss_clip": 1.05775666, + "balance_loss_mlp": 1.02335072, + "epoch": 0.21560198406733805, + "flos": 28024198693920.0, + "grad_norm": 1.7762971741243778, + "language_loss": 0.75074196, + "learning_rate": 3.649389440450277e-06, + "loss": 0.77267969, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.1595459, + "step": 3586, + "time_per_iteration": 2.6867995262145996 + }, + { + "auxiliary_loss_clip": 0.01153724, + "auxiliary_loss_mlp": 0.01043375, + "balance_loss_clip": 1.0550704, + "balance_loss_mlp": 1.02791393, + "epoch": 0.215662107320006, + "flos": 27801304371360.0, + "grad_norm": 1.9558942306084963, + "language_loss": 0.83240712, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.8543781, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.15454102, + "step": 3587, + "time_per_iteration": 2.7036335468292236 + }, + { + "auxiliary_loss_clip": 0.01152999, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.05639529, + "balance_loss_mlp": 1.02095902, + "epoch": 0.21572223057267398, + "flos": 37685300556960.0, + "grad_norm": 1.8054715569476865, + "language_loss": 0.75475383, + "learning_rate": 3.648948773354224e-06, + "loss": 0.77665889, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.16540527, + "step": 3588, + "time_per_iteration": 2.76631236076355 + }, + { + "auxiliary_loss_clip": 0.01152216, + "auxiliary_loss_mlp": 0.01037897, + "balance_loss_clip": 1.05481744, + "balance_loss_mlp": 1.0217917, + "epoch": 0.21578235382534194, + "flos": 32836357955520.0, + "grad_norm": 1.7908131513448422, + "language_loss": 0.8089366, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.83083773, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.16101074, + "step": 3589, + "time_per_iteration": 2.7294821739196777 + }, + { + "auxiliary_loss_clip": 0.01154115, + "auxiliary_loss_mlp": 0.01032883, + "balance_loss_clip": 1.05568814, + "balance_loss_mlp": 1.01711178, + "epoch": 0.2158424770780099, + "flos": 29804598099360.0, + "grad_norm": 1.8681641512636655, + "language_loss": 0.7263, + "learning_rate": 3.648507856144961e-06, + "loss": 0.74817002, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.15771484, + "step": 3590, + "time_per_iteration": 2.680131673812866 + }, + { + "auxiliary_loss_clip": 0.01158336, + "auxiliary_loss_mlp": 0.01040633, + "balance_loss_clip": 1.05624759, + "balance_loss_mlp": 1.02321637, + "epoch": 0.2159026003306779, + "flos": 29224189584000.0, + "grad_norm": 3.076479465781626, + "language_loss": 0.83975846, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86174822, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 1.02148438, + "router_z_loss_mlp": 0.17419434, + "step": 3591, + "time_per_iteration": 2.7669999599456787 + }, + { + "auxiliary_loss_clip": 0.01157415, + "auxiliary_loss_mlp": 0.01045578, + "balance_loss_clip": 1.05551088, + "balance_loss_mlp": 1.02642059, + "epoch": 0.21596272358334587, + "flos": 36657077532480.0, + "grad_norm": 1.8990617690091944, + "language_loss": 0.689089, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71111888, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 1.01904297, + "router_z_loss_mlp": 0.19152832, + "step": 3592, + "time_per_iteration": 2.741460084915161 + }, + { + "auxiliary_loss_clip": 0.01156131, + "auxiliary_loss_mlp": 0.01047036, + "balance_loss_clip": 1.05693793, + "balance_loss_mlp": 1.03013229, + "epoch": 0.21602284683601383, + "flos": 24863045696640.0, + "grad_norm": 4.474424105405351, + "language_loss": 0.83431882, + "learning_rate": 3.647846011515108e-06, + "loss": 0.85635054, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.16906738, + "step": 3593, + "time_per_iteration": 2.7104716300964355 + }, + { + "auxiliary_loss_clip": 0.01157291, + "auxiliary_loss_mlp": 0.01046344, + "balance_loss_clip": 1.05682778, + "balance_loss_mlp": 1.0295589, + "epoch": 0.2160829700886818, + "flos": 25352546136480.0, + "grad_norm": 2.907291079014195, + "language_loss": 0.75584501, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.77788138, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 1.00390625, + "router_z_loss_mlp": 0.16784668, + "step": 3594, + "time_per_iteration": 2.7158193588256836 + }, + { + "auxiliary_loss_clip": 0.01147313, + "auxiliary_loss_mlp": 0.01042557, + "balance_loss_clip": 1.05241513, + "balance_loss_mlp": 1.02624965, + "epoch": 0.21614309334134976, + "flos": 27222516547200.0, + "grad_norm": 2.0096681512568195, + "language_loss": 0.80245745, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.82435608, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.16296387, + "step": 3595, + "time_per_iteration": 2.6752939224243164 + }, + { + "auxiliary_loss_clip": 0.01155091, + "auxiliary_loss_mlp": 0.01038025, + "balance_loss_clip": 1.05419803, + "balance_loss_mlp": 1.02141881, + "epoch": 0.21620321659401773, + "flos": 23927513508000.0, + "grad_norm": 1.9051664173472722, + "language_loss": 0.78419077, + "learning_rate": 3.647183604506897e-06, + "loss": 0.80612195, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.1661377, + "step": 3596, + "time_per_iteration": 2.669234275817871 + }, + { + "auxiliary_loss_clip": 0.01150322, + "auxiliary_loss_mlp": 0.01042077, + "balance_loss_clip": 1.05568409, + "balance_loss_mlp": 1.02649641, + "epoch": 0.2162633398466857, + "flos": 22993926148800.0, + "grad_norm": 1.8136569206693949, + "language_loss": 0.82932115, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.85124511, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.15563965, + "step": 3597, + "time_per_iteration": 2.6762588024139404 + }, + { + "auxiliary_loss_clip": 0.01152822, + "auxiliary_loss_mlp": 0.01043704, + "balance_loss_clip": 1.05387342, + "balance_loss_mlp": 1.02676404, + "epoch": 0.21632346309935369, + "flos": 22901762037600.0, + "grad_norm": 1.773815825414721, + "language_loss": 0.80452633, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.82649153, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.16943359, + "step": 3598, + "time_per_iteration": 2.6845180988311768 + }, + { + "auxiliary_loss_clip": 0.01157028, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_clip": 1.05569434, + "balance_loss_mlp": 1.02970195, + "epoch": 0.21638358635202165, + "flos": 32121126983520.0, + "grad_norm": 1.703902352943449, + "language_loss": 0.81764311, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.8396883, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.17785645, + "step": 3599, + "time_per_iteration": 2.710411310195923 + }, + { + "auxiliary_loss_clip": 0.01148866, + "auxiliary_loss_mlp": 0.01036358, + "balance_loss_clip": 1.05409873, + "balance_loss_mlp": 1.02085495, + "epoch": 0.21644370960468962, + "flos": 25308023996160.0, + "grad_norm": 1.9788611087948207, + "language_loss": 0.76395094, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.7858032, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.15509033, + "step": 3600, + "time_per_iteration": 2.7047817707061768 + }, + { + "auxiliary_loss_clip": 0.01149802, + "auxiliary_loss_mlp": 0.01042199, + "balance_loss_clip": 1.05357003, + "balance_loss_mlp": 1.02814424, + "epoch": 0.21650383285735758, + "flos": 29226742172640.0, + "grad_norm": 2.189581067707227, + "language_loss": 0.80321848, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.82513845, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.14050293, + "step": 3601, + "time_per_iteration": 2.770843505859375 + }, + { + "auxiliary_loss_clip": 0.01153185, + "auxiliary_loss_mlp": 0.0104365, + "balance_loss_clip": 1.05551052, + "balance_loss_mlp": 1.02779531, + "epoch": 0.21656395611002555, + "flos": 28914236327520.0, + "grad_norm": 1.9260365946491427, + "language_loss": 0.83175719, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.85372555, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 0.97607422, + "router_z_loss_mlp": 0.15856934, + "step": 3602, + "time_per_iteration": 2.7381718158721924 + }, + { + "auxiliary_loss_clip": 0.01151324, + "auxiliary_loss_mlp": 0.01041713, + "balance_loss_clip": 1.05482817, + "balance_loss_mlp": 1.02585816, + "epoch": 0.2166240793626935, + "flos": 25216913334240.0, + "grad_norm": 2.1194832882994477, + "language_loss": 0.74574935, + "learning_rate": 3.645635802397693e-06, + "loss": 0.76767963, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.15869141, + "step": 3603, + "time_per_iteration": 2.691680908203125 + }, + { + "auxiliary_loss_clip": 0.01151538, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.05746007, + "balance_loss_mlp": 1.02478564, + "epoch": 0.2166842026153615, + "flos": 26332478913600.0, + "grad_norm": 1.7997599986838453, + "language_loss": 0.74552333, + "learning_rate": 3.645414438132855e-06, + "loss": 0.76743817, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.15161133, + "step": 3604, + "time_per_iteration": 2.7141475677490234 + }, + { + "auxiliary_loss_clip": 0.01146258, + "auxiliary_loss_mlp": 0.01036573, + "balance_loss_clip": 1.05332243, + "balance_loss_mlp": 1.02146363, + "epoch": 0.21674432586802947, + "flos": 31274071833600.0, + "grad_norm": 1.8296602214749587, + "language_loss": 0.7995109, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.82133919, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.15130615, + "step": 3605, + "time_per_iteration": 2.72603440284729 + }, + { + "auxiliary_loss_clip": 0.01060057, + "auxiliary_loss_mlp": 0.01006775, + "balance_loss_clip": 1.02774346, + "balance_loss_mlp": 1.00490642, + "epoch": 0.21680444912069743, + "flos": 68840959432320.0, + "grad_norm": 0.7149336533622878, + "language_loss": 0.58436, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.60502839, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 0.32324219, + "router_z_loss_mlp": 0.01864624, + "step": 3606, + "time_per_iteration": 3.456838607788086 + }, + { + "auxiliary_loss_clip": 0.01155363, + "auxiliary_loss_mlp": 0.01045573, + "balance_loss_clip": 1.05601275, + "balance_loss_mlp": 1.02866888, + "epoch": 0.2168645723733654, + "flos": 29137333236480.0, + "grad_norm": 3.2066509700690182, + "language_loss": 0.72887039, + "learning_rate": 3.644749971006248e-06, + "loss": 0.7508797, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 0.99267578, + "router_z_loss_mlp": 0.16906738, + "step": 3607, + "time_per_iteration": 5.564659595489502 + }, + { + "auxiliary_loss_clip": 0.01156776, + "auxiliary_loss_mlp": 0.01046891, + "balance_loss_clip": 1.05715358, + "balance_loss_mlp": 1.029737, + "epoch": 0.21692469562603336, + "flos": 20677478299200.0, + "grad_norm": 2.559002206504367, + "language_loss": 0.76654488, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.78858155, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 0.99658203, + "router_z_loss_mlp": 0.17150879, + "step": 3608, + "time_per_iteration": 2.672132968902588 + }, + { + "auxiliary_loss_clip": 0.01155553, + "auxiliary_loss_mlp": 0.01043732, + "balance_loss_clip": 1.05646038, + "balance_loss_mlp": 1.02890289, + "epoch": 0.21698481887870133, + "flos": 30650640317280.0, + "grad_norm": 1.786878719211463, + "language_loss": 0.74458909, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.76658201, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.14855957, + "step": 3609, + "time_per_iteration": 2.7060675621032715 + }, + { + "auxiliary_loss_clip": 0.01155186, + "auxiliary_loss_mlp": 0.01048107, + "balance_loss_clip": 1.05707324, + "balance_loss_mlp": 1.03235948, + "epoch": 0.2170449421313693, + "flos": 21834932395680.0, + "grad_norm": 2.390160256410335, + "language_loss": 0.88277996, + "learning_rate": 3.6440849425579e-06, + "loss": 0.90481293, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.1574707, + "step": 3610, + "time_per_iteration": 2.642237901687622 + }, + { + "auxiliary_loss_clip": 0.0115512, + "auxiliary_loss_mlp": 0.01035706, + "balance_loss_clip": 1.0587132, + "balance_loss_mlp": 1.01981568, + "epoch": 0.2171050653840373, + "flos": 27623580465600.0, + "grad_norm": 1.7515020729072568, + "language_loss": 0.77408737, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.79599571, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.15881348, + "step": 3611, + "time_per_iteration": 4.154945135116577 + }, + { + "auxiliary_loss_clip": 0.01149125, + "auxiliary_loss_mlp": 0.01047925, + "balance_loss_clip": 1.05560267, + "balance_loss_mlp": 1.03224277, + "epoch": 0.21716518863670525, + "flos": 23793866052480.0, + "grad_norm": 2.043838396797084, + "language_loss": 0.63229239, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65426296, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.15673828, + "step": 3612, + "time_per_iteration": 4.138311147689819 + }, + { + "auxiliary_loss_clip": 0.0115257, + "auxiliary_loss_mlp": 0.01040049, + "balance_loss_clip": 1.05511689, + "balance_loss_mlp": 1.02469456, + "epoch": 0.21722531188937322, + "flos": 24150205244160.0, + "grad_norm": 1.9581251495572207, + "language_loss": 0.76034826, + "learning_rate": 3.643419353014776e-06, + "loss": 0.78227443, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.15344238, + "step": 3613, + "time_per_iteration": 2.643665075302124 + }, + { + "auxiliary_loss_clip": 0.01148008, + "auxiliary_loss_mlp": 0.01040576, + "balance_loss_clip": 1.05307317, + "balance_loss_mlp": 1.02547276, + "epoch": 0.21728543514204118, + "flos": 16270070028480.0, + "grad_norm": 2.103158894134777, + "language_loss": 0.70928311, + "learning_rate": 3.643197365185261e-06, + "loss": 0.73116899, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.15100098, + "step": 3614, + "time_per_iteration": 2.6868114471435547 + }, + { + "auxiliary_loss_clip": 0.01155236, + "auxiliary_loss_mlp": 0.0104343, + "balance_loss_clip": 1.0588311, + "balance_loss_mlp": 1.02727687, + "epoch": 0.21734555839470915, + "flos": 18585504946080.0, + "grad_norm": 1.883308348076143, + "language_loss": 0.73227656, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.75426316, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.16149902, + "step": 3615, + "time_per_iteration": 2.6480119228363037 + }, + { + "auxiliary_loss_clip": 0.01156037, + "auxiliary_loss_mlp": 0.01041754, + "balance_loss_clip": 1.05523264, + "balance_loss_mlp": 1.02465951, + "epoch": 0.2174056816473771, + "flos": 24372289221120.0, + "grad_norm": 2.4190796565478565, + "language_loss": 0.90413171, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92610961, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.17102051, + "step": 3616, + "time_per_iteration": 2.743652820587158 + }, + { + "auxiliary_loss_clip": 0.01154239, + "auxiliary_loss_mlp": 0.01042475, + "balance_loss_clip": 1.05646193, + "balance_loss_mlp": 1.02626276, + "epoch": 0.21746580490004508, + "flos": 20361893140800.0, + "grad_norm": 2.5085559184439656, + "language_loss": 0.8124007, + "learning_rate": 3.642531027869148e-06, + "loss": 0.83436787, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.16223145, + "step": 3617, + "time_per_iteration": 2.6509199142456055 + }, + { + "auxiliary_loss_clip": 0.01156343, + "auxiliary_loss_mlp": 0.01037219, + "balance_loss_clip": 1.05795979, + "balance_loss_mlp": 1.02200222, + "epoch": 0.21752592815271307, + "flos": 31448108666880.0, + "grad_norm": 2.3072473184896003, + "language_loss": 0.75421113, + "learning_rate": 3.642308790849329e-06, + "loss": 0.77614671, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.15216064, + "step": 3618, + "time_per_iteration": 2.7342305183410645 + }, + { + "auxiliary_loss_clip": 0.01154009, + "auxiliary_loss_mlp": 0.01042178, + "balance_loss_clip": 1.05597556, + "balance_loss_mlp": 1.02609611, + "epoch": 0.21758605140538104, + "flos": 13732267512960.0, + "grad_norm": 3.055101907243125, + "language_loss": 0.69016898, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71213078, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 0.97998047, + "router_z_loss_mlp": 0.1607666, + "step": 3619, + "time_per_iteration": 2.6539323329925537 + }, + { + "auxiliary_loss_clip": 0.01155878, + "auxiliary_loss_mlp": 0.01043863, + "balance_loss_clip": 1.05661428, + "balance_loss_mlp": 1.02807355, + "epoch": 0.217646174658049, + "flos": 23479982619840.0, + "grad_norm": 1.8333780639223733, + "language_loss": 0.782022, + "learning_rate": 3.641864129988579e-06, + "loss": 0.80401945, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 0.99414062, + "router_z_loss_mlp": 0.15789795, + "step": 3620, + "time_per_iteration": 2.6987433433532715 + }, + { + "auxiliary_loss_clip": 0.01149803, + "auxiliary_loss_mlp": 0.01039596, + "balance_loss_clip": 1.05520797, + "balance_loss_mlp": 1.02466464, + "epoch": 0.21770629791071697, + "flos": 26777497730400.0, + "grad_norm": 1.9160813821856604, + "language_loss": 0.79693687, + "learning_rate": 3.641641706164509e-06, + "loss": 0.81883085, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.14935303, + "step": 3621, + "time_per_iteration": 2.684513807296753 + }, + { + "auxiliary_loss_clip": 0.01148274, + "auxiliary_loss_mlp": 0.01039934, + "balance_loss_clip": 1.05301607, + "balance_loss_mlp": 1.02510405, + "epoch": 0.21776642116338493, + "flos": 30428191684800.0, + "grad_norm": 1.6916681172272237, + "language_loss": 0.87614405, + "learning_rate": 3.641419220089221e-06, + "loss": 0.89802611, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.14819336, + "step": 3622, + "time_per_iteration": 2.760704755783081 + }, + { + "auxiliary_loss_clip": 0.01154785, + "auxiliary_loss_mlp": 0.01042412, + "balance_loss_clip": 1.0557611, + "balance_loss_mlp": 1.02451849, + "epoch": 0.2178265444160529, + "flos": 21745442424960.0, + "grad_norm": 2.1835293005356498, + "language_loss": 0.76695609, + "learning_rate": 3.641196671771152e-06, + "loss": 0.78892803, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.17895508, + "step": 3623, + "time_per_iteration": 2.678987741470337 + }, + { + "auxiliary_loss_clip": 0.01155475, + "auxiliary_loss_mlp": 0.01048776, + "balance_loss_clip": 1.05588889, + "balance_loss_mlp": 1.03166926, + "epoch": 0.2178866676687209, + "flos": 21613051005120.0, + "grad_norm": 1.9747436768848632, + "language_loss": 0.84917164, + "learning_rate": 3.640974061218741e-06, + "loss": 0.87121415, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 0.99609375, + "router_z_loss_mlp": 0.17102051, + "step": 3624, + "time_per_iteration": 2.7063345909118652 + }, + { + "auxiliary_loss_clip": 0.01158153, + "auxiliary_loss_mlp": 0.01053503, + "balance_loss_clip": 1.05962038, + "balance_loss_mlp": 1.0371598, + "epoch": 0.21794679092138886, + "flos": 20677113643680.0, + "grad_norm": 2.491251141960984, + "language_loss": 0.77389574, + "learning_rate": 3.640751388440429e-06, + "loss": 0.79601228, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.16326904, + "step": 3625, + "time_per_iteration": 2.752464771270752 + }, + { + "auxiliary_loss_clip": 0.01062029, + "auxiliary_loss_mlp": 0.01006396, + "balance_loss_clip": 1.02938867, + "balance_loss_mlp": 1.0044558, + "epoch": 0.21800691417405682, + "flos": 77749601293440.0, + "grad_norm": 0.8878551335995163, + "language_loss": 0.60715568, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62783992, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 0.32666016, + "router_z_loss_mlp": 0.01937866, + "step": 3626, + "time_per_iteration": 3.3684439659118652 + }, + { + "auxiliary_loss_clip": 0.01158244, + "auxiliary_loss_mlp": 0.01039719, + "balance_loss_clip": 1.05736935, + "balance_loss_mlp": 1.02232659, + "epoch": 0.21806703742672479, + "flos": 26376433812000.0, + "grad_norm": 2.0640397504581545, + "language_loss": 0.90681553, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.9287951, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.17382812, + "step": 3627, + "time_per_iteration": 2.6699845790863037 + }, + { + "auxiliary_loss_clip": 0.01151585, + "auxiliary_loss_mlp": 0.01037195, + "balance_loss_clip": 1.05438542, + "balance_loss_mlp": 1.02079141, + "epoch": 0.21812716067939275, + "flos": 23616709388640.0, + "grad_norm": 7.001050354887347, + "language_loss": 0.73286128, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.75474906, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.16394043, + "step": 3628, + "time_per_iteration": 2.663236379623413 + }, + { + "auxiliary_loss_clip": 0.01151551, + "auxiliary_loss_mlp": 0.01037867, + "balance_loss_clip": 1.05524611, + "balance_loss_mlp": 1.02239347, + "epoch": 0.21818728393206072, + "flos": 28733797764000.0, + "grad_norm": 1.7217332780546848, + "language_loss": 0.7676307, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.78952491, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.15478516, + "step": 3629, + "time_per_iteration": 2.7416787147521973 + }, + { + "auxiliary_loss_clip": 0.01153565, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.05610204, + "balance_loss_mlp": 1.01768088, + "epoch": 0.21824740718472868, + "flos": 36882240822720.0, + "grad_norm": 1.6056566745103393, + "language_loss": 0.71302187, + "learning_rate": 3.63963709145597e-06, + "loss": 0.73488206, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.14770508, + "step": 3630, + "time_per_iteration": 2.789870262145996 + }, + { + "auxiliary_loss_clip": 0.01147689, + "auxiliary_loss_mlp": 0.01039854, + "balance_loss_clip": 1.05548072, + "balance_loss_mlp": 1.02562058, + "epoch": 0.21830753043739667, + "flos": 31890534377760.0, + "grad_norm": 2.5104401392170446, + "language_loss": 0.76830769, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.79018319, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.14251709, + "step": 3631, + "time_per_iteration": 2.694783926010132 + }, + { + "auxiliary_loss_clip": 0.01151998, + "auxiliary_loss_mlp": 0.0103815, + "balance_loss_clip": 1.05416119, + "balance_loss_mlp": 1.02271295, + "epoch": 0.21836765369006464, + "flos": 26504206261920.0, + "grad_norm": 2.4222011876892413, + "language_loss": 0.74715149, + "learning_rate": 3.639190937376594e-06, + "loss": 0.76905298, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.15429688, + "step": 3632, + "time_per_iteration": 2.705395460128784 + }, + { + "auxiliary_loss_clip": 0.01151373, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.05500424, + "balance_loss_mlp": 1.02713704, + "epoch": 0.2184277769427326, + "flos": 24327888632640.0, + "grad_norm": 2.2615641362295262, + "language_loss": 0.83860302, + "learning_rate": 3.638967767095249e-06, + "loss": 0.86053348, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.14556885, + "step": 3633, + "time_per_iteration": 2.7752795219421387 + }, + { + "auxiliary_loss_clip": 0.0115196, + "auxiliary_loss_mlp": 0.01046662, + "balance_loss_clip": 1.0560317, + "balance_loss_mlp": 1.03061652, + "epoch": 0.21848790019540057, + "flos": 24819090798240.0, + "grad_norm": 2.0060486452652615, + "language_loss": 0.81486166, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.83684778, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.16064453, + "step": 3634, + "time_per_iteration": 2.677826166152954 + }, + { + "auxiliary_loss_clip": 0.01156327, + "auxiliary_loss_mlp": 0.01039277, + "balance_loss_clip": 1.05755699, + "balance_loss_mlp": 1.02342772, + "epoch": 0.21854802344806853, + "flos": 18852678305280.0, + "grad_norm": 2.0657757067348532, + "language_loss": 0.74849081, + "learning_rate": 3.638521240091558e-06, + "loss": 0.7704469, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 0.98632812, + "router_z_loss_mlp": 0.1585083, + "step": 3635, + "time_per_iteration": 2.633776903152466 + }, + { + "auxiliary_loss_clip": 0.0115039, + "auxiliary_loss_mlp": 0.01049686, + "balance_loss_clip": 1.05576181, + "balance_loss_mlp": 1.03445148, + "epoch": 0.2186081467007365, + "flos": 19914726908160.0, + "grad_norm": 2.55662717755253, + "language_loss": 0.88103908, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90303981, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.15222168, + "step": 3636, + "time_per_iteration": 2.671614170074463 + }, + { + "auxiliary_loss_clip": 0.01151367, + "auxiliary_loss_mlp": 0.01045728, + "balance_loss_clip": 1.05372596, + "balance_loss_mlp": 1.02977777, + "epoch": 0.2186682699534045, + "flos": 26465356540800.0, + "grad_norm": 2.157944002529091, + "language_loss": 0.75816143, + "learning_rate": 3.638074464556311e-06, + "loss": 0.78013235, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.1595459, + "step": 3637, + "time_per_iteration": 2.7560319900512695 + }, + { + "auxiliary_loss_clip": 0.01158119, + "auxiliary_loss_mlp": 0.01041706, + "balance_loss_clip": 1.05646658, + "balance_loss_mlp": 1.02415895, + "epoch": 0.21872839320607246, + "flos": 21646147272480.0, + "grad_norm": 2.8340136364420925, + "language_loss": 0.90041465, + "learning_rate": 3.63785098361053e-06, + "loss": 0.92241287, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 1.015625, + "router_z_loss_mlp": 0.17541504, + "step": 3638, + "time_per_iteration": 2.6425061225891113 + }, + { + "auxiliary_loss_clip": 0.01153118, + "auxiliary_loss_mlp": 0.01047785, + "balance_loss_clip": 1.05592823, + "balance_loss_mlp": 1.03096485, + "epoch": 0.21878851645874042, + "flos": 22760213712480.0, + "grad_norm": 4.9969423142348965, + "language_loss": 0.89651185, + "learning_rate": 3.637627440557275e-06, + "loss": 0.91852093, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.16821289, + "step": 3639, + "time_per_iteration": 2.5967607498168945 + }, + { + "auxiliary_loss_clip": 0.01155633, + "auxiliary_loss_mlp": 0.01038309, + "balance_loss_clip": 1.05733514, + "balance_loss_mlp": 1.02290726, + "epoch": 0.2188486397114084, + "flos": 31185392208480.0, + "grad_norm": 2.746837643988669, + "language_loss": 0.79201376, + "learning_rate": 3.637403835405024e-06, + "loss": 0.81395316, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 0.98291016, + "router_z_loss_mlp": 0.15393066, + "step": 3640, + "time_per_iteration": 2.7200350761413574 + }, + { + "auxiliary_loss_clip": 0.01157231, + "auxiliary_loss_mlp": 0.0105323, + "balance_loss_clip": 1.05848742, + "balance_loss_mlp": 1.03514552, + "epoch": 0.21890876296407635, + "flos": 21831974634240.0, + "grad_norm": 2.175008402434044, + "language_loss": 0.71696806, + "learning_rate": 3.637180168162255e-06, + "loss": 0.73907274, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.1809082, + "step": 3641, + "time_per_iteration": 2.6231725215911865 + }, + { + "auxiliary_loss_clip": 0.01155709, + "auxiliary_loss_mlp": 0.01035779, + "balance_loss_clip": 1.05992877, + "balance_loss_mlp": 1.02036476, + "epoch": 0.21896888621674432, + "flos": 21657249007200.0, + "grad_norm": 2.0369035707281085, + "language_loss": 0.81240308, + "learning_rate": 3.63695643883745e-06, + "loss": 0.83431798, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.15405273, + "step": 3642, + "time_per_iteration": 2.6731691360473633 + }, + { + "auxiliary_loss_clip": 0.01157831, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.05941772, + "balance_loss_mlp": 1.02478743, + "epoch": 0.21902900946941228, + "flos": 28157724597600.0, + "grad_norm": 1.9499229310489172, + "language_loss": 0.71794146, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.73993659, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.16870117, + "step": 3643, + "time_per_iteration": 2.653022289276123 + }, + { + "auxiliary_loss_clip": 0.01156058, + "auxiliary_loss_mlp": 0.01042706, + "balance_loss_clip": 1.05745566, + "balance_loss_mlp": 1.02626681, + "epoch": 0.21908913272208028, + "flos": 58789564896960.0, + "grad_norm": 2.56620411994041, + "language_loss": 0.68791938, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70990694, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 0.98486328, + "router_z_loss_mlp": 0.16442871, + "step": 3644, + "time_per_iteration": 2.921645402908325 + }, + { + "auxiliary_loss_clip": 0.01157948, + "auxiliary_loss_mlp": 0.0104354, + "balance_loss_clip": 1.05686629, + "balance_loss_mlp": 1.02693439, + "epoch": 0.21914925597474824, + "flos": 27133431749280.0, + "grad_norm": 2.600166880738169, + "language_loss": 0.77677637, + "learning_rate": 3.636284878455669e-06, + "loss": 0.79879123, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.16601562, + "step": 3645, + "time_per_iteration": 2.713773012161255 + }, + { + "auxiliary_loss_clip": 0.01155991, + "auxiliary_loss_mlp": 0.01048116, + "balance_loss_clip": 1.06044638, + "balance_loss_mlp": 1.03267884, + "epoch": 0.2192093792274162, + "flos": 26997961016160.0, + "grad_norm": 1.6921080700356408, + "language_loss": 0.82439733, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84643841, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.15441895, + "step": 3646, + "time_per_iteration": 4.18591570854187 + }, + { + "auxiliary_loss_clip": 0.01151193, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.05695045, + "balance_loss_mlp": 1.02207255, + "epoch": 0.21926950248008417, + "flos": 19119487008960.0, + "grad_norm": 1.718167138673878, + "language_loss": 0.829377, + "learning_rate": 3.635836861279901e-06, + "loss": 0.85126674, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.15722656, + "step": 3647, + "time_per_iteration": 4.20621657371521 + }, + { + "auxiliary_loss_clip": 0.01150213, + "auxiliary_loss_mlp": 0.01041106, + "balance_loss_clip": 1.05363071, + "balance_loss_mlp": 1.02559638, + "epoch": 0.21932962573275214, + "flos": 36927168135840.0, + "grad_norm": 1.72038869102057, + "language_loss": 0.72523808, + "learning_rate": 3.635612759641123e-06, + "loss": 0.74715132, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.1550293, + "step": 3648, + "time_per_iteration": 2.742798328399658 + }, + { + "auxiliary_loss_clip": 0.01154984, + "auxiliary_loss_mlp": 0.01043409, + "balance_loss_clip": 1.05513346, + "balance_loss_mlp": 1.02605247, + "epoch": 0.2193897489854201, + "flos": 13153196067840.0, + "grad_norm": 5.103411728368746, + "language_loss": 0.74137104, + "learning_rate": 3.635388595979745e-06, + "loss": 0.7633549, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.17358398, + "step": 3649, + "time_per_iteration": 2.6542584896087646 + }, + { + "auxiliary_loss_clip": 0.01148785, + "auxiliary_loss_mlp": 0.01038683, + "balance_loss_clip": 1.05508566, + "balance_loss_mlp": 1.02309632, + "epoch": 0.21944987223808807, + "flos": 23346578268000.0, + "grad_norm": 2.1243487574430535, + "language_loss": 0.86235058, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88422525, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.15594482, + "step": 3650, + "time_per_iteration": 2.6479220390319824 + }, + { + "auxiliary_loss_clip": 0.011538, + "auxiliary_loss_mlp": 0.01040882, + "balance_loss_clip": 1.05538785, + "balance_loss_mlp": 1.02426434, + "epoch": 0.21950999549075606, + "flos": 27712260090720.0, + "grad_norm": 2.554563107334926, + "language_loss": 0.83892101, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.86086786, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.16601562, + "step": 3651, + "time_per_iteration": 5.670179843902588 + }, + { + "auxiliary_loss_clip": 0.01151264, + "auxiliary_loss_mlp": 0.01041379, + "balance_loss_clip": 1.05479169, + "balance_loss_mlp": 1.02587581, + "epoch": 0.21957011874342403, + "flos": 12886184777760.0, + "grad_norm": 1.98926649389326, + "language_loss": 0.74476492, + "learning_rate": 3.634715732945027e-06, + "loss": 0.76669133, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.15496826, + "step": 3652, + "time_per_iteration": 2.729886054992676 + }, + { + "auxiliary_loss_clip": 0.01069546, + "auxiliary_loss_mlp": 0.0100319, + "balance_loss_clip": 1.03662181, + "balance_loss_mlp": 1.00129056, + "epoch": 0.219630241996092, + "flos": 80227241998560.0, + "grad_norm": 0.7463653684051289, + "language_loss": 0.51570946, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.5364368, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 0.32910156, + "router_z_loss_mlp": 0.01896667, + "step": 3653, + "time_per_iteration": 3.2630996704101562 + }, + { + "auxiliary_loss_clip": 0.01157902, + "auxiliary_loss_mlp": 0.01048299, + "balance_loss_clip": 1.06087327, + "balance_loss_mlp": 1.03209901, + "epoch": 0.21969036524875996, + "flos": 28914722534880.0, + "grad_norm": 1.8163862945509994, + "language_loss": 0.75037944, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77244151, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.1619873, + "step": 3654, + "time_per_iteration": 2.713439702987671 + }, + { + "auxiliary_loss_clip": 0.01160096, + "auxiliary_loss_mlp": 0.01037715, + "balance_loss_clip": 1.06070065, + "balance_loss_mlp": 1.02115679, + "epoch": 0.21975048850142792, + "flos": 23966282194560.0, + "grad_norm": 2.7084000417968026, + "language_loss": 0.72854686, + "learning_rate": 3.634042312013064e-06, + "loss": 0.750525, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 0.99511719, + "router_z_loss_mlp": 0.16564941, + "step": 3655, + "time_per_iteration": 2.6261961460113525 + }, + { + "auxiliary_loss_clip": 0.01155739, + "auxiliary_loss_mlp": 0.01040562, + "balance_loss_clip": 1.05790901, + "balance_loss_mlp": 1.02510023, + "epoch": 0.21981061175409589, + "flos": 27392542169760.0, + "grad_norm": 1.65935967725226, + "language_loss": 0.80510962, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.82707262, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.15478516, + "step": 3656, + "time_per_iteration": 2.711864948272705 + }, + { + "auxiliary_loss_clip": 0.01156035, + "auxiliary_loss_mlp": 0.01034348, + "balance_loss_clip": 1.05993903, + "balance_loss_mlp": 1.01867247, + "epoch": 0.21987073500676388, + "flos": 22147681344480.0, + "grad_norm": 2.468260295895558, + "language_loss": 0.8492077, + "learning_rate": 3.63359305489566e-06, + "loss": 0.87111157, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.15673828, + "step": 3657, + "time_per_iteration": 2.629072427749634 + }, + { + "auxiliary_loss_clip": 0.01156105, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.05675554, + "balance_loss_mlp": 1.01669097, + "epoch": 0.21993085825943184, + "flos": 31269817519200.0, + "grad_norm": 1.7215350667408922, + "language_loss": 0.8050552, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.82694662, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 0.99365234, + "router_z_loss_mlp": 0.16345215, + "step": 3658, + "time_per_iteration": 2.710712432861328 + }, + { + "auxiliary_loss_clip": 0.01068445, + "auxiliary_loss_mlp": 0.01006585, + "balance_loss_clip": 1.03572929, + "balance_loss_mlp": 1.00477505, + "epoch": 0.2199909815120998, + "flos": 86539708644480.0, + "grad_norm": 0.7934982173437122, + "language_loss": 0.58198661, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60273695, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 0.32739258, + "router_z_loss_mlp": 0.01808167, + "step": 3659, + "time_per_iteration": 3.4076004028320312 + }, + { + "auxiliary_loss_clip": 0.01153752, + "auxiliary_loss_mlp": 0.01039996, + "balance_loss_clip": 1.05776405, + "balance_loss_mlp": 1.02378368, + "epoch": 0.22005110476476777, + "flos": 26287511083200.0, + "grad_norm": 2.391872402367788, + "language_loss": 0.74469292, + "learning_rate": 3.632918704645772e-06, + "loss": 0.76663041, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.16192627, + "step": 3660, + "time_per_iteration": 2.708880662918091 + }, + { + "auxiliary_loss_clip": 0.01155565, + "auxiliary_loss_mlp": 0.01039869, + "balance_loss_clip": 1.0574683, + "balance_loss_mlp": 1.02352524, + "epoch": 0.22011122801743574, + "flos": 26911023634080.0, + "grad_norm": 2.6335331916677562, + "language_loss": 0.81644464, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83839893, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.16333008, + "step": 3661, + "time_per_iteration": 2.726426601409912 + }, + { + "auxiliary_loss_clip": 0.01153361, + "auxiliary_loss_mlp": 0.01038347, + "balance_loss_clip": 1.05657899, + "balance_loss_mlp": 1.02344608, + "epoch": 0.2201713512701037, + "flos": 32564727695520.0, + "grad_norm": 1.8025977870768697, + "language_loss": 0.73270875, + "learning_rate": 3.632468828196102e-06, + "loss": 0.7546258, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.14892578, + "step": 3662, + "time_per_iteration": 2.7144830226898193 + }, + { + "auxiliary_loss_clip": 0.01152158, + "auxiliary_loss_mlp": 0.01048431, + "balance_loss_clip": 1.05833292, + "balance_loss_mlp": 1.03373241, + "epoch": 0.22023147452277167, + "flos": 27042604708320.0, + "grad_norm": 1.7053761764184658, + "language_loss": 0.78470027, + "learning_rate": 3.632243797111929e-06, + "loss": 0.80670619, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.14697266, + "step": 3663, + "time_per_iteration": 2.7053239345550537 + }, + { + "auxiliary_loss_clip": 0.01157152, + "auxiliary_loss_mlp": 0.01042742, + "balance_loss_clip": 1.05870605, + "balance_loss_mlp": 1.02570653, + "epoch": 0.22029159777543966, + "flos": 27483571797120.0, + "grad_norm": 2.015966596023241, + "language_loss": 0.80292648, + "learning_rate": 3.632018704132908e-06, + "loss": 0.82492542, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.17041016, + "step": 3664, + "time_per_iteration": 2.672711133956909 + }, + { + "auxiliary_loss_clip": 0.01160077, + "auxiliary_loss_mlp": 0.010464, + "balance_loss_clip": 1.05737114, + "balance_loss_mlp": 1.02795851, + "epoch": 0.22035172102810763, + "flos": 15913852388640.0, + "grad_norm": 2.514918749446536, + "language_loss": 0.76319748, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.78526223, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 1.02734375, + "router_z_loss_mlp": 0.1842041, + "step": 3665, + "time_per_iteration": 2.676900625228882 + }, + { + "auxiliary_loss_clip": 0.01152408, + "auxiliary_loss_mlp": 0.01046152, + "balance_loss_clip": 1.05599666, + "balance_loss_mlp": 1.03041625, + "epoch": 0.2204118442807756, + "flos": 14844186537120.0, + "grad_norm": 3.097316221338639, + "language_loss": 0.97858208, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00056767, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.15734863, + "step": 3666, + "time_per_iteration": 2.6258771419525146 + }, + { + "auxiliary_loss_clip": 0.01152367, + "auxiliary_loss_mlp": 0.01044633, + "balance_loss_clip": 1.05475736, + "balance_loss_mlp": 1.02795529, + "epoch": 0.22047196753344356, + "flos": 48942838258560.0, + "grad_norm": 1.976368099387219, + "language_loss": 0.80491048, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82688046, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.16662598, + "step": 3667, + "time_per_iteration": 2.9168262481689453 + }, + { + "auxiliary_loss_clip": 0.01157517, + "auxiliary_loss_mlp": 0.01048259, + "balance_loss_clip": 1.05727553, + "balance_loss_mlp": 1.03056884, + "epoch": 0.22053209078611152, + "flos": 25259855300640.0, + "grad_norm": 2.0245451325525763, + "language_loss": 0.7734108, + "learning_rate": 3.631117713439087e-06, + "loss": 0.79546857, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 1.00146484, + "router_z_loss_mlp": 0.17675781, + "step": 3668, + "time_per_iteration": 2.6390669345855713 + }, + { + "auxiliary_loss_clip": 0.01157298, + "auxiliary_loss_mlp": 0.01041868, + "balance_loss_clip": 1.05939996, + "balance_loss_mlp": 1.02515459, + "epoch": 0.2205922140387795, + "flos": 30159316599840.0, + "grad_norm": 1.7498086876025498, + "language_loss": 0.71020734, + "learning_rate": 3.630892311113904e-06, + "loss": 0.73219901, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.16711426, + "step": 3669, + "time_per_iteration": 2.755669116973877 + }, + { + "auxiliary_loss_clip": 0.01153717, + "auxiliary_loss_mlp": 0.01038848, + "balance_loss_clip": 1.05515552, + "balance_loss_mlp": 1.02285051, + "epoch": 0.22065233729144745, + "flos": 28647184520160.0, + "grad_norm": 1.7221120638121508, + "language_loss": 0.85535657, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.87728226, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 0.98535156, + "router_z_loss_mlp": 0.15979004, + "step": 3670, + "time_per_iteration": 2.684803009033203 + }, + { + "auxiliary_loss_clip": 0.01160897, + "auxiliary_loss_mlp": 0.01036877, + "balance_loss_clip": 1.05968881, + "balance_loss_mlp": 1.02056909, + "epoch": 0.22071246054411545, + "flos": 42983394737760.0, + "grad_norm": 1.7618247842161128, + "language_loss": 0.76957285, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.79155064, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 1.01220703, + "router_z_loss_mlp": 0.16320801, + "step": 3671, + "time_per_iteration": 2.7670156955718994 + }, + { + "auxiliary_loss_clip": 0.01155089, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.05713773, + "balance_loss_mlp": 1.02008843, + "epoch": 0.2207725837967834, + "flos": 22146830481600.0, + "grad_norm": 12.355936516441172, + "language_loss": 0.81379449, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.83570498, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.15869141, + "step": 3672, + "time_per_iteration": 2.632983684539795 + }, + { + "auxiliary_loss_clip": 0.01158872, + "auxiliary_loss_mlp": 0.01049446, + "balance_loss_clip": 1.05943882, + "balance_loss_mlp": 1.03323352, + "epoch": 0.22083270704945138, + "flos": 24989845731840.0, + "grad_norm": 3.775586375458885, + "language_loss": 0.73441082, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75649405, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 0.99365234, + "router_z_loss_mlp": 0.16210938, + "step": 3673, + "time_per_iteration": 2.6921322345733643 + }, + { + "auxiliary_loss_clip": 0.01159231, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.06022906, + "balance_loss_mlp": 1.02263474, + "epoch": 0.22089283030211934, + "flos": 41641855522560.0, + "grad_norm": 2.546606545990875, + "language_loss": 0.76136702, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.78335208, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.16638184, + "step": 3674, + "time_per_iteration": 2.7898828983306885 + }, + { + "auxiliary_loss_clip": 0.01154256, + "auxiliary_loss_mlp": 0.01042519, + "balance_loss_clip": 1.05786657, + "balance_loss_mlp": 1.02488828, + "epoch": 0.2209529535547873, + "flos": 22057664649120.0, + "grad_norm": 2.1024457643092656, + "language_loss": 0.7464577, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76842541, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.1763916, + "step": 3675, + "time_per_iteration": 2.6611006259918213 + }, + { + "auxiliary_loss_clip": 0.01156701, + "auxiliary_loss_mlp": 0.0103793, + "balance_loss_clip": 1.05844998, + "balance_loss_mlp": 1.02170587, + "epoch": 0.22101307680745527, + "flos": 33233086524960.0, + "grad_norm": 1.7148005015874344, + "language_loss": 0.80276144, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82470775, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 0.98242188, + "router_z_loss_mlp": 0.16223145, + "step": 3676, + "time_per_iteration": 2.730858325958252 + }, + { + "auxiliary_loss_clip": 0.01156848, + "auxiliary_loss_mlp": 0.01046785, + "balance_loss_clip": 1.05703354, + "balance_loss_mlp": 1.03118074, + "epoch": 0.22107320006012326, + "flos": 20185951995360.0, + "grad_norm": 2.2077108174943203, + "language_loss": 0.7536242, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.77566057, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.15600586, + "step": 3677, + "time_per_iteration": 2.605922222137451 + }, + { + "auxiliary_loss_clip": 0.0115309, + "auxiliary_loss_mlp": 0.0104431, + "balance_loss_clip": 1.05591679, + "balance_loss_mlp": 1.0290159, + "epoch": 0.22113332331279123, + "flos": 26910983116800.0, + "grad_norm": 9.52172570367039, + "language_loss": 0.83312857, + "learning_rate": 3.628860908251712e-06, + "loss": 0.8551026, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.1529541, + "step": 3678, + "time_per_iteration": 2.769479274749756 + }, + { + "auxiliary_loss_clip": 0.0115684, + "auxiliary_loss_mlp": 0.0104929, + "balance_loss_clip": 1.06013942, + "balance_loss_mlp": 1.03300595, + "epoch": 0.2211934465654592, + "flos": 32473617033600.0, + "grad_norm": 1.9048438211023035, + "language_loss": 0.88889349, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.91095483, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 0.96679688, + "router_z_loss_mlp": 0.1628418, + "step": 3679, + "time_per_iteration": 2.728724956512451 + }, + { + "auxiliary_loss_clip": 0.01157421, + "auxiliary_loss_mlp": 0.01050037, + "balance_loss_clip": 1.05777359, + "balance_loss_mlp": 1.03337193, + "epoch": 0.22125356981812716, + "flos": 19962814569120.0, + "grad_norm": 2.6687615642021356, + "language_loss": 0.8639394, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.88601404, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 0.99707031, + "router_z_loss_mlp": 0.16650391, + "step": 3680, + "time_per_iteration": 2.6440584659576416 + }, + { + "auxiliary_loss_clip": 0.01153215, + "auxiliary_loss_mlp": 0.01045316, + "balance_loss_clip": 1.05861402, + "balance_loss_mlp": 1.02871609, + "epoch": 0.22131369307079513, + "flos": 26418970605600.0, + "grad_norm": 1.9484507360469285, + "language_loss": 0.81398869, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.83597398, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.16607666, + "step": 3681, + "time_per_iteration": 2.6500017642974854 + }, + { + "auxiliary_loss_clip": 0.01149422, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.05679071, + "balance_loss_mlp": 1.0198797, + "epoch": 0.2213738163234631, + "flos": 23927513508000.0, + "grad_norm": 2.683482067886511, + "language_loss": 0.79714072, + "learning_rate": 3.62795645623335e-06, + "loss": 0.81898648, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.15258789, + "step": 3682, + "time_per_iteration": 2.706110715866089 + }, + { + "auxiliary_loss_clip": 0.01153915, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.05581474, + "balance_loss_mlp": 1.02442443, + "epoch": 0.22143393957613106, + "flos": 28825516185120.0, + "grad_norm": 1.7736279375448032, + "language_loss": 0.77245736, + "learning_rate": 3.627730188876638e-06, + "loss": 0.79440838, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.16760254, + "step": 3683, + "time_per_iteration": 2.729708671569824 + }, + { + "auxiliary_loss_clip": 0.01156812, + "auxiliary_loss_mlp": 0.01042262, + "balance_loss_clip": 1.05693793, + "balance_loss_mlp": 1.02672946, + "epoch": 0.22149406282879905, + "flos": 31942552214880.0, + "grad_norm": 2.6300974921941696, + "language_loss": 0.72582316, + "learning_rate": 3.627503859796234e-06, + "loss": 0.74781394, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 0.99853516, + "router_z_loss_mlp": 0.15539551, + "step": 3684, + "time_per_iteration": 2.7056400775909424 + }, + { + "auxiliary_loss_clip": 0.01157762, + "auxiliary_loss_mlp": 0.01042971, + "balance_loss_clip": 1.05960834, + "balance_loss_mlp": 1.0263536, + "epoch": 0.221554186081467, + "flos": 17739705831840.0, + "grad_norm": 1.9939393253449158, + "language_loss": 0.80123323, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82324058, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 0.98144531, + "router_z_loss_mlp": 0.1661377, + "step": 3685, + "time_per_iteration": 4.109756708145142 + }, + { + "auxiliary_loss_clip": 0.01149543, + "auxiliary_loss_mlp": 0.01041949, + "balance_loss_clip": 1.05601895, + "balance_loss_mlp": 1.02700019, + "epoch": 0.22161430933413498, + "flos": 27135133475040.0, + "grad_norm": 2.4054705054675476, + "language_loss": 0.8715111, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89342594, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.14941406, + "step": 3686, + "time_per_iteration": 2.686652183532715 + }, + { + "auxiliary_loss_clip": 0.01153414, + "auxiliary_loss_mlp": 0.01040712, + "balance_loss_clip": 1.05740047, + "balance_loss_mlp": 1.02441573, + "epoch": 0.22167443258680294, + "flos": 28646374174560.0, + "grad_norm": 2.1496410268240402, + "language_loss": 0.77868152, + "learning_rate": 3.626824502298707e-06, + "loss": 0.80062276, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.16290283, + "step": 3687, + "time_per_iteration": 4.113537073135376 + }, + { + "auxiliary_loss_clip": 0.01162521, + "auxiliary_loss_mlp": 0.01048571, + "balance_loss_clip": 1.06109989, + "balance_loss_mlp": 1.03096354, + "epoch": 0.2217345558394709, + "flos": 28334719192320.0, + "grad_norm": 1.8262934960852208, + "language_loss": 0.84783304, + "learning_rate": 3.626597926409383e-06, + "loss": 0.86994392, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 1.01416016, + "router_z_loss_mlp": 0.17590332, + "step": 3688, + "time_per_iteration": 2.6901471614837646 + }, + { + "auxiliary_loss_clip": 0.01160619, + "auxiliary_loss_mlp": 0.01043743, + "balance_loss_clip": 1.0595293, + "balance_loss_mlp": 1.02713704, + "epoch": 0.22179467909213887, + "flos": 24417500155200.0, + "grad_norm": 2.058672723486836, + "language_loss": 0.81385577, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.83589941, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.1661377, + "step": 3689, + "time_per_iteration": 2.7010457515716553 + }, + { + "auxiliary_loss_clip": 0.01152648, + "auxiliary_loss_mlp": 0.01042814, + "balance_loss_clip": 1.05628848, + "balance_loss_mlp": 1.02724481, + "epoch": 0.22185480234480687, + "flos": 24017165547840.0, + "grad_norm": 1.8456556023357293, + "language_loss": 0.70038247, + "learning_rate": 3.626144589597061e-06, + "loss": 0.72233713, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.15588379, + "step": 3690, + "time_per_iteration": 4.061494588851929 + }, + { + "auxiliary_loss_clip": 0.0115839, + "auxiliary_loss_mlp": 0.01038994, + "balance_loss_clip": 1.05777276, + "balance_loss_mlp": 1.02180386, + "epoch": 0.22191492559747483, + "flos": 26821736249760.0, + "grad_norm": 1.9021482664480323, + "language_loss": 0.72047085, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.74244475, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 1.00488281, + "router_z_loss_mlp": 0.17175293, + "step": 3691, + "time_per_iteration": 4.141115665435791 + }, + { + "auxiliary_loss_clip": 0.0115631, + "auxiliary_loss_mlp": 0.01049682, + "balance_loss_clip": 1.06052411, + "balance_loss_mlp": 1.03319538, + "epoch": 0.2219750488501428, + "flos": 28335772641600.0, + "grad_norm": 2.0643949386821205, + "language_loss": 0.70965141, + "learning_rate": 3.625691006130477e-06, + "loss": 0.73171127, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.16491699, + "step": 3692, + "time_per_iteration": 2.6876022815704346 + }, + { + "auxiliary_loss_clip": 0.01160195, + "auxiliary_loss_mlp": 0.01043818, + "balance_loss_clip": 1.05890131, + "balance_loss_mlp": 1.02739096, + "epoch": 0.22203517210281076, + "flos": 27398336140800.0, + "grad_norm": 1.8025490255613685, + "language_loss": 0.87341416, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.89545429, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.16400146, + "step": 3693, + "time_per_iteration": 2.6875557899475098 + }, + { + "auxiliary_loss_clip": 0.01150297, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.05622578, + "balance_loss_mlp": 1.02349281, + "epoch": 0.22209529535547873, + "flos": 21432247786080.0, + "grad_norm": 2.0872748572336834, + "language_loss": 0.85313714, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.8750186, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.14355469, + "step": 3694, + "time_per_iteration": 2.67592716217041 + }, + { + "auxiliary_loss_clip": 0.01159936, + "auxiliary_loss_mlp": 0.01041119, + "balance_loss_clip": 1.05626488, + "balance_loss_mlp": 1.02402472, + "epoch": 0.2221554186081467, + "flos": 26196724559520.0, + "grad_norm": 3.5564027815432255, + "language_loss": 0.69215721, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.71416771, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 1.03808594, + "router_z_loss_mlp": 0.17102051, + "step": 3695, + "time_per_iteration": 2.700537919998169 + }, + { + "auxiliary_loss_clip": 0.01150879, + "auxiliary_loss_mlp": 0.01038839, + "balance_loss_clip": 1.05701613, + "balance_loss_mlp": 1.02441502, + "epoch": 0.22221554186081466, + "flos": 33767676347040.0, + "grad_norm": 1.6196049588221677, + "language_loss": 0.71651781, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.738415, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.14440918, + "step": 3696, + "time_per_iteration": 2.8371939659118652 + }, + { + "auxiliary_loss_clip": 0.01154313, + "auxiliary_loss_mlp": 0.01036647, + "balance_loss_clip": 1.05634725, + "balance_loss_mlp": 1.01989841, + "epoch": 0.22227566511348265, + "flos": 31675459890240.0, + "grad_norm": 1.7926575857582838, + "language_loss": 0.8721599, + "learning_rate": 3.624555968803217e-06, + "loss": 0.89406943, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.16748047, + "step": 3697, + "time_per_iteration": 2.7204689979553223 + }, + { + "auxiliary_loss_clip": 0.01148142, + "auxiliary_loss_mlp": 0.01039758, + "balance_loss_clip": 1.05583882, + "balance_loss_mlp": 1.0245111, + "epoch": 0.22233578836615062, + "flos": 47837685620160.0, + "grad_norm": 1.670689068480228, + "language_loss": 0.66135889, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68323791, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.15258789, + "step": 3698, + "time_per_iteration": 2.948075294494629 + }, + { + "auxiliary_loss_clip": 0.01153229, + "auxiliary_loss_mlp": 0.01038041, + "balance_loss_clip": 1.05426788, + "balance_loss_mlp": 1.02163756, + "epoch": 0.22239591161881858, + "flos": 44274698876160.0, + "grad_norm": 1.810531454311149, + "language_loss": 0.82592595, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.84783864, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 0.98925781, + "router_z_loss_mlp": 0.1640625, + "step": 3699, + "time_per_iteration": 2.7852039337158203 + }, + { + "auxiliary_loss_clip": 0.01152272, + "auxiliary_loss_mlp": 0.01040592, + "balance_loss_clip": 1.05586779, + "balance_loss_mlp": 1.02411771, + "epoch": 0.22245603487148655, + "flos": 24061768722720.0, + "grad_norm": 1.580153812077184, + "language_loss": 0.79284841, + "learning_rate": 3.62387420709809e-06, + "loss": 0.81477702, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.16479492, + "step": 3700, + "time_per_iteration": 2.6568901538848877 + }, + { + "auxiliary_loss_clip": 0.01161082, + "auxiliary_loss_mlp": 0.01045545, + "balance_loss_clip": 1.05979776, + "balance_loss_mlp": 1.02862895, + "epoch": 0.2225161581241545, + "flos": 56473441185600.0, + "grad_norm": 2.3561240389168874, + "language_loss": 0.7239188, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74598503, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.16918945, + "step": 3701, + "time_per_iteration": 2.8968639373779297 + }, + { + "auxiliary_loss_clip": 0.011507, + "auxiliary_loss_mlp": 0.01037639, + "balance_loss_clip": 1.05337238, + "balance_loss_mlp": 1.0224638, + "epoch": 0.22257628137682248, + "flos": 28914357879360.0, + "grad_norm": 2.31666961913316, + "language_loss": 0.79180658, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.81368995, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 0.97314453, + "router_z_loss_mlp": 0.15161133, + "step": 3702, + "time_per_iteration": 2.6668386459350586 + }, + { + "auxiliary_loss_clip": 0.01146025, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.05394161, + "balance_loss_mlp": 1.01888978, + "epoch": 0.22263640462949044, + "flos": 23615696456640.0, + "grad_norm": 2.2513527666579707, + "language_loss": 0.7768513, + "learning_rate": 3.623191891195723e-06, + "loss": 0.7986567, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 0.92041016, + "router_z_loss_mlp": 0.15637207, + "step": 3703, + "time_per_iteration": 2.667936325073242 + }, + { + "auxiliary_loss_clip": 0.01154434, + "auxiliary_loss_mlp": 0.01036865, + "balance_loss_clip": 1.0559001, + "balance_loss_mlp": 1.01934123, + "epoch": 0.22269652788215843, + "flos": 25351087514400.0, + "grad_norm": 2.085896699508242, + "language_loss": 0.74752009, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.76943308, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.17529297, + "step": 3704, + "time_per_iteration": 2.673799753189087 + }, + { + "auxiliary_loss_clip": 0.01149828, + "auxiliary_loss_mlp": 0.01036332, + "balance_loss_clip": 1.05762863, + "balance_loss_mlp": 1.02139556, + "epoch": 0.2227566511348264, + "flos": 58521540674880.0, + "grad_norm": 1.9297501866647226, + "language_loss": 0.64553976, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.66740137, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.14923096, + "step": 3705, + "time_per_iteration": 2.9196972846984863 + }, + { + "auxiliary_loss_clip": 0.01078316, + "auxiliary_loss_mlp": 0.00999856, + "balance_loss_clip": 1.04572296, + "balance_loss_mlp": 0.997971, + "epoch": 0.22281677438749437, + "flos": 80802342750240.0, + "grad_norm": 1.2392811442702856, + "language_loss": 0.65189123, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67267299, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 0.32568359, + "router_z_loss_mlp": 0.01881409, + "step": 3706, + "time_per_iteration": 3.173527240753174 + }, + { + "auxiliary_loss_clip": 0.01149848, + "auxiliary_loss_mlp": 0.01039231, + "balance_loss_clip": 1.0539161, + "balance_loss_mlp": 1.02393651, + "epoch": 0.22287689764016233, + "flos": 26687886207840.0, + "grad_norm": 2.1019435408567504, + "language_loss": 0.80594712, + "learning_rate": 3.622281274977141e-06, + "loss": 0.82783794, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.15283203, + "step": 3707, + "time_per_iteration": 2.7264366149902344 + }, + { + "auxiliary_loss_clip": 0.01152718, + "auxiliary_loss_mlp": 0.01037003, + "balance_loss_clip": 1.05768418, + "balance_loss_mlp": 1.02179217, + "epoch": 0.2229370208928303, + "flos": 33766582380480.0, + "grad_norm": 1.8968774584004038, + "language_loss": 0.78039432, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80229151, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.15197754, + "step": 3708, + "time_per_iteration": 2.773841142654419 + }, + { + "auxiliary_loss_clip": 0.0115573, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.0571506, + "balance_loss_mlp": 1.02142334, + "epoch": 0.22299714414549826, + "flos": 36795424992480.0, + "grad_norm": 2.6007264088344924, + "language_loss": 0.80243868, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.82436496, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 0.98486328, + "router_z_loss_mlp": 0.15466309, + "step": 3709, + "time_per_iteration": 2.8238375186920166 + }, + { + "auxiliary_loss_clip": 0.01152747, + "auxiliary_loss_mlp": 0.01039332, + "balance_loss_clip": 1.05476427, + "balance_loss_mlp": 1.02376294, + "epoch": 0.22305726739816625, + "flos": 28240002492480.0, + "grad_norm": 2.1023460727340195, + "language_loss": 0.69058567, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.71250641, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.15576172, + "step": 3710, + "time_per_iteration": 2.732966423034668 + }, + { + "auxiliary_loss_clip": 0.01157321, + "auxiliary_loss_mlp": 0.01044818, + "balance_loss_clip": 1.05711424, + "balance_loss_mlp": 1.02882004, + "epoch": 0.22311739065083422, + "flos": 23393652996960.0, + "grad_norm": 9.1077145181766, + "language_loss": 0.90807605, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.9300974, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 1.00292969, + "router_z_loss_mlp": 0.15997314, + "step": 3711, + "time_per_iteration": 2.6665821075439453 + }, + { + "auxiliary_loss_clip": 0.0115261, + "auxiliary_loss_mlp": 0.01051371, + "balance_loss_clip": 1.05536366, + "balance_loss_mlp": 1.03433561, + "epoch": 0.22317751390350218, + "flos": 16616644555680.0, + "grad_norm": 2.371202371358993, + "language_loss": 0.88946009, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.91149986, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 0.97314453, + "router_z_loss_mlp": 0.17028809, + "step": 3712, + "time_per_iteration": 2.608334541320801 + }, + { + "auxiliary_loss_clip": 0.01153024, + "auxiliary_loss_mlp": 0.01047019, + "balance_loss_clip": 1.05874693, + "balance_loss_mlp": 1.03105712, + "epoch": 0.22323763715617015, + "flos": 13457557939680.0, + "grad_norm": 4.5733114732646065, + "language_loss": 0.74413514, + "learning_rate": 3.620913505310117e-06, + "loss": 0.76613557, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.15979004, + "step": 3713, + "time_per_iteration": 2.6069414615631104 + }, + { + "auxiliary_loss_clip": 0.01151948, + "auxiliary_loss_mlp": 0.01042981, + "balance_loss_clip": 1.05632663, + "balance_loss_mlp": 1.02760303, + "epoch": 0.22329776040883811, + "flos": 50459103100800.0, + "grad_norm": 3.3103650619090006, + "language_loss": 0.62585694, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.64780629, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.15380859, + "step": 3714, + "time_per_iteration": 2.847365617752075 + }, + { + "auxiliary_loss_clip": 0.01151542, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.05698276, + "balance_loss_mlp": 1.01848376, + "epoch": 0.22335788366150608, + "flos": 30651126524640.0, + "grad_norm": 1.8705418876594015, + "language_loss": 0.79142344, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81327277, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.14916992, + "step": 3715, + "time_per_iteration": 2.7070748805999756 + }, + { + "auxiliary_loss_clip": 0.01155963, + "auxiliary_loss_mlp": 0.01051958, + "balance_loss_clip": 1.05915833, + "balance_loss_mlp": 1.03605509, + "epoch": 0.22341800691417404, + "flos": 20722810785120.0, + "grad_norm": 1.954381996000871, + "language_loss": 0.76997721, + "learning_rate": 3.620228790579645e-06, + "loss": 0.79205644, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.15917969, + "step": 3716, + "time_per_iteration": 2.680586338043213 + }, + { + "auxiliary_loss_clip": 0.011548, + "auxiliary_loss_mlp": 0.01041875, + "balance_loss_clip": 1.05790794, + "balance_loss_mlp": 1.02656221, + "epoch": 0.22347813016684204, + "flos": 17249192460000.0, + "grad_norm": 4.62857669895429, + "language_loss": 0.78830051, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.81026733, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.15301514, + "step": 3717, + "time_per_iteration": 2.637124538421631 + }, + { + "auxiliary_loss_clip": 0.01153773, + "auxiliary_loss_mlp": 0.01039621, + "balance_loss_clip": 1.05522966, + "balance_loss_mlp": 1.02357531, + "epoch": 0.22353825341951, + "flos": 28776739730400.0, + "grad_norm": 2.628933707991703, + "language_loss": 0.6791485, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.70108247, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.16027832, + "step": 3718, + "time_per_iteration": 2.6803226470947266 + }, + { + "auxiliary_loss_clip": 0.01155405, + "auxiliary_loss_mlp": 0.01042752, + "balance_loss_clip": 1.0566256, + "balance_loss_mlp": 1.02581191, + "epoch": 0.22359837667217797, + "flos": 36392375727360.0, + "grad_norm": 2.0781924786851245, + "language_loss": 0.80408698, + "learning_rate": 3.619543522896045e-06, + "loss": 0.82606852, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.16931152, + "step": 3719, + "time_per_iteration": 2.737826108932495 + }, + { + "auxiliary_loss_clip": 0.01154771, + "auxiliary_loss_mlp": 0.01051304, + "balance_loss_clip": 1.0534507, + "balance_loss_mlp": 1.03450727, + "epoch": 0.22365849992484593, + "flos": 21479079411360.0, + "grad_norm": 2.0564455359879794, + "language_loss": 0.86668646, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.88874722, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 1.01269531, + "router_z_loss_mlp": 0.16809082, + "step": 3720, + "time_per_iteration": 2.6830062866210938 + }, + { + "auxiliary_loss_clip": 0.01148683, + "auxiliary_loss_mlp": 0.0103629, + "balance_loss_clip": 1.05525899, + "balance_loss_mlp": 1.02135301, + "epoch": 0.2237186231775139, + "flos": 27712098021600.0, + "grad_norm": 1.6766014368510258, + "language_loss": 0.74673551, + "learning_rate": 3.619086370692945e-06, + "loss": 0.76858521, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.14941406, + "step": 3721, + "time_per_iteration": 2.6967854499816895 + }, + { + "auxiliary_loss_clip": 0.01155623, + "auxiliary_loss_mlp": 0.01040434, + "balance_loss_clip": 1.05644834, + "balance_loss_mlp": 1.0245676, + "epoch": 0.22377874643018186, + "flos": 16315321479840.0, + "grad_norm": 2.499555902869555, + "language_loss": 0.79473776, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.81669831, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 0.99169922, + "router_z_loss_mlp": 0.15856934, + "step": 3722, + "time_per_iteration": 2.765716314315796 + }, + { + "auxiliary_loss_clip": 0.01149812, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.05617559, + "balance_loss_mlp": 1.02229476, + "epoch": 0.22383886968284986, + "flos": 21835256533920.0, + "grad_norm": 2.18288728141455, + "language_loss": 0.82629859, + "learning_rate": 3.618628972906178e-06, + "loss": 0.84816778, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.14819336, + "step": 3723, + "time_per_iteration": 2.6685128211975098 + }, + { + "auxiliary_loss_clip": 0.01154174, + "auxiliary_loss_mlp": 0.01042525, + "balance_loss_clip": 1.05589044, + "balance_loss_mlp": 1.02627659, + "epoch": 0.22389899293551782, + "flos": 29137535822880.0, + "grad_norm": 1.898473287899304, + "language_loss": 0.8426373, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.86460435, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 0.98242188, + "router_z_loss_mlp": 0.16259766, + "step": 3724, + "time_per_iteration": 2.681966543197632 + }, + { + "auxiliary_loss_clip": 0.01150192, + "auxiliary_loss_mlp": 0.01036986, + "balance_loss_clip": 1.05497694, + "balance_loss_mlp": 1.02146506, + "epoch": 0.2239591161881858, + "flos": 33278297459040.0, + "grad_norm": 1.8467011405586014, + "language_loss": 0.79086661, + "learning_rate": 3.618171329605121e-06, + "loss": 0.81273842, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.1550293, + "step": 3725, + "time_per_iteration": 4.17107629776001 + }, + { + "auxiliary_loss_clip": 0.01150537, + "auxiliary_loss_mlp": 0.01038925, + "balance_loss_clip": 1.05553329, + "balance_loss_mlp": 1.02315402, + "epoch": 0.22401923944085375, + "flos": 27133553301120.0, + "grad_norm": 5.914674358201279, + "language_loss": 0.77400774, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79590237, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 0.94970703, + "router_z_loss_mlp": 0.15771484, + "step": 3726, + "time_per_iteration": 2.6385915279388428 + }, + { + "auxiliary_loss_clip": 0.0116328, + "auxiliary_loss_mlp": 0.01045018, + "balance_loss_clip": 1.05881381, + "balance_loss_mlp": 1.02712417, + "epoch": 0.22407936269352172, + "flos": 14707581320160.0, + "grad_norm": 2.6691371963260457, + "language_loss": 0.72482032, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.7469033, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 1.04589844, + "router_z_loss_mlp": 0.17895508, + "step": 3727, + "time_per_iteration": 4.0605244636535645 + }, + { + "auxiliary_loss_clip": 0.01155744, + "auxiliary_loss_mlp": 0.01040457, + "balance_loss_clip": 1.05499625, + "balance_loss_mlp": 1.0225395, + "epoch": 0.22413948594618968, + "flos": 23615210249280.0, + "grad_norm": 2.1831336443925973, + "language_loss": 0.86860478, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.89056677, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 1.00634766, + "router_z_loss_mlp": 0.17895508, + "step": 3728, + "time_per_iteration": 2.688971519470215 + }, + { + "auxiliary_loss_clip": 0.01154743, + "auxiliary_loss_mlp": 0.01042631, + "balance_loss_clip": 1.0587734, + "balance_loss_mlp": 1.02561986, + "epoch": 0.22419960919885765, + "flos": 29492578461600.0, + "grad_norm": 2.739733931089315, + "language_loss": 0.80084395, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82281768, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.17016602, + "step": 3729, + "time_per_iteration": 4.136222839355469 + }, + { + "auxiliary_loss_clip": 0.01150332, + "auxiliary_loss_mlp": 0.01043162, + "balance_loss_clip": 1.05682135, + "balance_loss_mlp": 1.02872014, + "epoch": 0.22425973245152564, + "flos": 33408581980320.0, + "grad_norm": 1.84171894048493, + "language_loss": 0.86703014, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.88896513, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.14447021, + "step": 3730, + "time_per_iteration": 4.102532625198364 + }, + { + "auxiliary_loss_clip": 0.01146758, + "auxiliary_loss_mlp": 0.01036686, + "balance_loss_clip": 1.05486083, + "balance_loss_mlp": 1.02146316, + "epoch": 0.2243198557041936, + "flos": 16759732537440.0, + "grad_norm": 2.574231489407076, + "language_loss": 0.7320134, + "learning_rate": 3.616796927310559e-06, + "loss": 0.75384784, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.15222168, + "step": 3731, + "time_per_iteration": 2.7237608432769775 + }, + { + "auxiliary_loss_clip": 0.01151612, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.055215, + "balance_loss_mlp": 1.0205853, + "epoch": 0.22437997895686157, + "flos": 23831864910720.0, + "grad_norm": 2.1485344212676067, + "language_loss": 0.75477779, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77665293, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.15332031, + "step": 3732, + "time_per_iteration": 2.7502524852752686 + }, + { + "auxiliary_loss_clip": 0.01152968, + "auxiliary_loss_mlp": 0.01053393, + "balance_loss_clip": 1.0567354, + "balance_loss_mlp": 1.03844428, + "epoch": 0.22444010220952954, + "flos": 28914398396640.0, + "grad_norm": 1.846201161603934, + "language_loss": 0.88090789, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90297151, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.14941406, + "step": 3733, + "time_per_iteration": 2.724348306655884 + }, + { + "auxiliary_loss_clip": 0.01149611, + "auxiliary_loss_mlp": 0.01041638, + "balance_loss_clip": 1.05407262, + "balance_loss_mlp": 1.02626002, + "epoch": 0.2245002254621975, + "flos": 27311763414240.0, + "grad_norm": 1.7659103836646952, + "language_loss": 0.84813589, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.8700484, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.15368652, + "step": 3734, + "time_per_iteration": 2.6859703063964844 + }, + { + "auxiliary_loss_clip": 0.01155063, + "auxiliary_loss_mlp": 0.01041951, + "balance_loss_clip": 1.05927348, + "balance_loss_mlp": 1.02653742, + "epoch": 0.22456034871486547, + "flos": 32875612849440.0, + "grad_norm": 1.6708876149362724, + "language_loss": 0.76819837, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.79016852, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.1541748, + "step": 3735, + "time_per_iteration": 2.7214386463165283 + }, + { + "auxiliary_loss_clip": 0.01147265, + "auxiliary_loss_mlp": 0.01045253, + "balance_loss_clip": 1.05683827, + "balance_loss_mlp": 1.03093576, + "epoch": 0.22462047196753343, + "flos": 35366664774240.0, + "grad_norm": 1.8325781335198799, + "language_loss": 0.84614789, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.86807311, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 0.90380859, + "router_z_loss_mlp": 0.14331055, + "step": 3736, + "time_per_iteration": 2.7451164722442627 + }, + { + "auxiliary_loss_clip": 0.01151987, + "auxiliary_loss_mlp": 0.01039337, + "balance_loss_clip": 1.05664861, + "balance_loss_mlp": 1.02439463, + "epoch": 0.22468059522020142, + "flos": 24417783776160.0, + "grad_norm": 2.276004926945405, + "language_loss": 0.86461782, + "learning_rate": 3.615420317888586e-06, + "loss": 0.88653105, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.1494751, + "step": 3737, + "time_per_iteration": 2.659790277481079 + }, + { + "auxiliary_loss_clip": 0.01151553, + "auxiliary_loss_mlp": 0.01045586, + "balance_loss_clip": 1.05433917, + "balance_loss_mlp": 1.02953994, + "epoch": 0.2247407184728694, + "flos": 35769997660320.0, + "grad_norm": 2.796377549846108, + "language_loss": 0.79277682, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81474817, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 0.97167969, + "router_z_loss_mlp": 0.16052246, + "step": 3738, + "time_per_iteration": 2.72959303855896 + }, + { + "auxiliary_loss_clip": 0.01150835, + "auxiliary_loss_mlp": 0.01034472, + "balance_loss_clip": 1.05571437, + "balance_loss_mlp": 1.01975608, + "epoch": 0.22480084172553735, + "flos": 27223205340960.0, + "grad_norm": 1.6251961635880217, + "language_loss": 0.76218289, + "learning_rate": 3.614960957933224e-06, + "loss": 0.78403592, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.1472168, + "step": 3739, + "time_per_iteration": 2.64334774017334 + }, + { + "auxiliary_loss_clip": 0.01148226, + "auxiliary_loss_mlp": 0.0104011, + "balance_loss_clip": 1.05306077, + "balance_loss_mlp": 1.02470839, + "epoch": 0.22486096497820532, + "flos": 31227078139200.0, + "grad_norm": 2.146622291467096, + "language_loss": 0.74164367, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76352704, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.15405273, + "step": 3740, + "time_per_iteration": 2.7149126529693604 + }, + { + "auxiliary_loss_clip": 0.0114814, + "auxiliary_loss_mlp": 0.01037182, + "balance_loss_clip": 1.05426121, + "balance_loss_mlp": 1.02197063, + "epoch": 0.22492108823087328, + "flos": 21523885172640.0, + "grad_norm": 2.151558589911907, + "language_loss": 0.75773203, + "learning_rate": 3.614501353019939e-06, + "loss": 0.7795853, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.15222168, + "step": 3741, + "time_per_iteration": 2.625237226486206 + }, + { + "auxiliary_loss_clip": 0.01150411, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.05698001, + "balance_loss_mlp": 1.01878703, + "epoch": 0.22498121148354125, + "flos": 19565113584960.0, + "grad_norm": 1.9420003401512218, + "language_loss": 0.8743192, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.8961581, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 0.93310547, + "router_z_loss_mlp": 0.14691162, + "step": 3742, + "time_per_iteration": 2.6905198097229004 + }, + { + "auxiliary_loss_clip": 0.01146368, + "auxiliary_loss_mlp": 0.01043217, + "balance_loss_clip": 1.05501842, + "balance_loss_mlp": 1.02760005, + "epoch": 0.22504133473620924, + "flos": 29314165762080.0, + "grad_norm": 1.7379945720116179, + "language_loss": 0.81772196, + "learning_rate": 3.614041503218444e-06, + "loss": 0.83961779, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.15612793, + "step": 3743, + "time_per_iteration": 2.6941566467285156 + }, + { + "auxiliary_loss_clip": 0.01148255, + "auxiliary_loss_mlp": 0.01034163, + "balance_loss_clip": 1.05309474, + "balance_loss_mlp": 1.01915455, + "epoch": 0.2251014579888772, + "flos": 20455110701280.0, + "grad_norm": 2.427281469573551, + "language_loss": 0.6339916, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.65581572, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.15014648, + "step": 3744, + "time_per_iteration": 2.646883010864258 + }, + { + "auxiliary_loss_clip": 0.01146267, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.05154538, + "balance_loss_mlp": 1.02154756, + "epoch": 0.22516158124154517, + "flos": 17071347002400.0, + "grad_norm": 2.8933287157080945, + "language_loss": 0.75676298, + "learning_rate": 3.613581408598489e-06, + "loss": 0.77859968, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.15856934, + "step": 3745, + "time_per_iteration": 2.801257610321045 + }, + { + "auxiliary_loss_clip": 0.01145668, + "auxiliary_loss_mlp": 0.01036712, + "balance_loss_clip": 1.05219877, + "balance_loss_mlp": 1.02209735, + "epoch": 0.22522170449421314, + "flos": 17558456922720.0, + "grad_norm": 2.3280704679767616, + "language_loss": 0.80715442, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.82897824, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.14605713, + "step": 3746, + "time_per_iteration": 2.6831271648406982 + }, + { + "auxiliary_loss_clip": 0.01148593, + "auxiliary_loss_mlp": 0.01044338, + "balance_loss_clip": 1.05226016, + "balance_loss_mlp": 1.0288527, + "epoch": 0.2252818277468811, + "flos": 29047802748480.0, + "grad_norm": 2.4482976389636346, + "language_loss": 0.86409974, + "learning_rate": 3.613121069229862e-06, + "loss": 0.88602912, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.15478516, + "step": 3747, + "time_per_iteration": 2.7114861011505127 + }, + { + "auxiliary_loss_clip": 0.01145636, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.05050409, + "balance_loss_mlp": 1.01612461, + "epoch": 0.22534195099954907, + "flos": 30161261429280.0, + "grad_norm": 1.6852966494425168, + "language_loss": 0.76330233, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78506476, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.14501953, + "step": 3748, + "time_per_iteration": 2.750354051589966 + }, + { + "auxiliary_loss_clip": 0.01151638, + "auxiliary_loss_mlp": 0.01039676, + "balance_loss_clip": 1.05564141, + "balance_loss_mlp": 1.02494216, + "epoch": 0.22540207425221703, + "flos": 25663755428640.0, + "grad_norm": 1.7872261841747552, + "language_loss": 0.79628354, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.81819671, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.1472168, + "step": 3749, + "time_per_iteration": 2.716283082962036 + }, + { + "auxiliary_loss_clip": 0.01142093, + "auxiliary_loss_mlp": 0.01039722, + "balance_loss_clip": 1.05119824, + "balance_loss_mlp": 1.02517247, + "epoch": 0.22546219750488503, + "flos": 23660097045120.0, + "grad_norm": 1.5648464844397831, + "language_loss": 0.79539645, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.81721455, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.14538574, + "step": 3750, + "time_per_iteration": 2.657318115234375 + }, + { + "auxiliary_loss_clip": 0.01148475, + "auxiliary_loss_mlp": 0.01040951, + "balance_loss_clip": 1.05326962, + "balance_loss_mlp": 1.02568078, + "epoch": 0.225522320757553, + "flos": 30740373391680.0, + "grad_norm": 1.822423800655338, + "language_loss": 0.82149303, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.84338737, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.15258789, + "step": 3751, + "time_per_iteration": 2.762723684310913 + }, + { + "auxiliary_loss_clip": 0.01150888, + "auxiliary_loss_mlp": 0.01040589, + "balance_loss_clip": 1.05465055, + "balance_loss_mlp": 1.02540183, + "epoch": 0.22558244401022096, + "flos": 20942382690720.0, + "grad_norm": 1.7691072430172656, + "language_loss": 0.83432138, + "learning_rate": 3.611969150491165e-06, + "loss": 0.85623616, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.15197754, + "step": 3752, + "time_per_iteration": 2.6391117572784424 + }, + { + "auxiliary_loss_clip": 0.01145627, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.05245626, + "balance_loss_mlp": 1.01751959, + "epoch": 0.22564256726288892, + "flos": 18584532531360.0, + "grad_norm": 2.28610928067414, + "language_loss": 0.7880137, + "learning_rate": 3.611738583330375e-06, + "loss": 0.80978793, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.14263916, + "step": 3753, + "time_per_iteration": 2.687501907348633 + }, + { + "auxiliary_loss_clip": 0.01144596, + "auxiliary_loss_mlp": 0.01041525, + "balance_loss_clip": 1.0517993, + "balance_loss_mlp": 1.02612901, + "epoch": 0.2257026905155569, + "flos": 42181793625600.0, + "grad_norm": 2.326973396732487, + "language_loss": 0.78175837, + "learning_rate": 3.611507955052295e-06, + "loss": 0.80361956, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.15405273, + "step": 3754, + "time_per_iteration": 2.8113417625427246 + }, + { + "auxiliary_loss_clip": 0.01147813, + "auxiliary_loss_mlp": 0.01039195, + "balance_loss_clip": 1.05640483, + "balance_loss_mlp": 1.02406788, + "epoch": 0.22576281376822485, + "flos": 24328820530080.0, + "grad_norm": 2.0615643583004224, + "language_loss": 0.70349872, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72536874, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.15118408, + "step": 3755, + "time_per_iteration": 2.7090883255004883 + }, + { + "auxiliary_loss_clip": 0.01154374, + "auxiliary_loss_mlp": 0.01053218, + "balance_loss_clip": 1.05766225, + "balance_loss_mlp": 1.03826952, + "epoch": 0.22582293702089282, + "flos": 30020280346080.0, + "grad_norm": 2.1777180861068306, + "language_loss": 0.77400333, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79607928, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.1496582, + "step": 3756, + "time_per_iteration": 2.696371078491211 + }, + { + "auxiliary_loss_clip": 0.01156801, + "auxiliary_loss_mlp": 0.01040828, + "balance_loss_clip": 1.05839002, + "balance_loss_mlp": 1.02481818, + "epoch": 0.2258830602735608, + "flos": 28108421418240.0, + "grad_norm": 1.9314365604798227, + "language_loss": 0.82033741, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.84231377, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.16003418, + "step": 3757, + "time_per_iteration": 2.653850793838501 + }, + { + "auxiliary_loss_clip": 0.01150665, + "auxiliary_loss_mlp": 0.0103801, + "balance_loss_clip": 1.05519843, + "balance_loss_mlp": 1.02215481, + "epoch": 0.22594318352622877, + "flos": 27038350393920.0, + "grad_norm": 1.78355214822739, + "language_loss": 0.73106641, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.75295317, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 0.95410156, + "router_z_loss_mlp": 0.15844727, + "step": 3758, + "time_per_iteration": 2.738405466079712 + }, + { + "auxiliary_loss_clip": 0.01149682, + "auxiliary_loss_mlp": 0.01045428, + "balance_loss_clip": 1.05358267, + "balance_loss_mlp": 1.02901244, + "epoch": 0.22600330677889674, + "flos": 25129246641120.0, + "grad_norm": 2.0648979783360213, + "language_loss": 0.77030373, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79225487, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.1640625, + "step": 3759, + "time_per_iteration": 2.6772570610046387 + }, + { + "auxiliary_loss_clip": 0.01150955, + "auxiliary_loss_mlp": 0.01037896, + "balance_loss_clip": 1.0542686, + "balance_loss_mlp": 1.0223515, + "epoch": 0.2260634300315647, + "flos": 43513608693600.0, + "grad_norm": 1.7029543694130476, + "language_loss": 0.78302544, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.804914, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.15551758, + "step": 3760, + "time_per_iteration": 2.8046317100524902 + }, + { + "auxiliary_loss_clip": 0.01069849, + "auxiliary_loss_mlp": 0.01008621, + "balance_loss_clip": 1.03671956, + "balance_loss_mlp": 1.00670314, + "epoch": 0.22612355328423267, + "flos": 87967449580320.0, + "grad_norm": 0.9533027344384198, + "language_loss": 0.60087627, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62166095, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 0.33154297, + "router_z_loss_mlp": 0.01914978, + "step": 3761, + "time_per_iteration": 3.2120778560638428 + }, + { + "auxiliary_loss_clip": 0.01154786, + "auxiliary_loss_mlp": 0.0104015, + "balance_loss_clip": 1.05401325, + "balance_loss_mlp": 1.02373528, + "epoch": 0.22618367653690064, + "flos": 27800980233120.0, + "grad_norm": 2.3273335228503647, + "language_loss": 0.77650899, + "learning_rate": 3.609660729655211e-06, + "loss": 0.79845834, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 1.00830078, + "router_z_loss_mlp": 0.16412354, + "step": 3762, + "time_per_iteration": 2.7506103515625 + }, + { + "auxiliary_loss_clip": 0.01152161, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.05522025, + "balance_loss_mlp": 1.01950598, + "epoch": 0.22624379978956863, + "flos": 24951036528000.0, + "grad_norm": 2.2047418369332314, + "language_loss": 0.78765231, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.8095274, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.15844727, + "step": 3763, + "time_per_iteration": 2.6467790603637695 + }, + { + "auxiliary_loss_clip": 0.011558, + "auxiliary_loss_mlp": 0.01048849, + "balance_loss_clip": 1.0558306, + "balance_loss_mlp": 1.0319097, + "epoch": 0.2263039230422366, + "flos": 21346282818720.0, + "grad_norm": 1.9336814412838004, + "language_loss": 0.90832269, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93036914, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 0.99902344, + "router_z_loss_mlp": 0.16955566, + "step": 3764, + "time_per_iteration": 4.250481367111206 + }, + { + "auxiliary_loss_clip": 0.01149522, + "auxiliary_loss_mlp": 0.01043462, + "balance_loss_clip": 1.05483866, + "balance_loss_mlp": 1.02816701, + "epoch": 0.22636404629490456, + "flos": 34568588665440.0, + "grad_norm": 1.807059067040207, + "language_loss": 0.75238806, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.77431786, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.1529541, + "step": 3765, + "time_per_iteration": 2.7761952877044678 + }, + { + "auxiliary_loss_clip": 0.01148129, + "auxiliary_loss_mlp": 0.01042271, + "balance_loss_clip": 1.05345809, + "balance_loss_mlp": 1.02626693, + "epoch": 0.22642416954757252, + "flos": 21340448330400.0, + "grad_norm": 2.1738514455572537, + "language_loss": 0.89295429, + "learning_rate": 3.608735651752494e-06, + "loss": 0.91485834, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.15997314, + "step": 3766, + "time_per_iteration": 4.165951728820801 + }, + { + "auxiliary_loss_clip": 0.01147741, + "auxiliary_loss_mlp": 0.01041388, + "balance_loss_clip": 1.05487943, + "balance_loss_mlp": 1.02544975, + "epoch": 0.2264842928002405, + "flos": 29754768195360.0, + "grad_norm": 1.7009359835677365, + "language_loss": 0.74650431, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.7683956, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.15942383, + "step": 3767, + "time_per_iteration": 2.7657270431518555 + }, + { + "auxiliary_loss_clip": 0.01150421, + "auxiliary_loss_mlp": 0.01039952, + "balance_loss_clip": 1.05375051, + "balance_loss_mlp": 1.0235486, + "epoch": 0.22654441605290845, + "flos": 24196996352160.0, + "grad_norm": 1.7763473219770474, + "language_loss": 0.71846759, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.74037135, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.1640625, + "step": 3768, + "time_per_iteration": 2.725879430770874 + }, + { + "auxiliary_loss_clip": 0.01155986, + "auxiliary_loss_mlp": 0.01053328, + "balance_loss_clip": 1.05987716, + "balance_loss_mlp": 1.03650713, + "epoch": 0.22660453930557642, + "flos": 33500584022400.0, + "grad_norm": 1.8624191748479044, + "language_loss": 0.78612739, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80822051, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.16821289, + "step": 3769, + "time_per_iteration": 4.236283302307129 + }, + { + "auxiliary_loss_clip": 0.01153982, + "auxiliary_loss_mlp": 0.01040773, + "balance_loss_clip": 1.05483651, + "balance_loss_mlp": 1.02447164, + "epoch": 0.2266646625582444, + "flos": 29270616036480.0, + "grad_norm": 2.3185179762757437, + "language_loss": 0.68478477, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.70673233, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.16314697, + "step": 3770, + "time_per_iteration": 4.085184574127197 + }, + { + "auxiliary_loss_clip": 0.01151513, + "auxiliary_loss_mlp": 0.01040546, + "balance_loss_clip": 1.05346155, + "balance_loss_mlp": 1.02488208, + "epoch": 0.22672478581091238, + "flos": 31760168821920.0, + "grad_norm": 1.977026421264606, + "language_loss": 0.80641854, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.8283391, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.15673828, + "step": 3771, + "time_per_iteration": 2.795757532119751 + }, + { + "auxiliary_loss_clip": 0.01145224, + "auxiliary_loss_mlp": 0.01046113, + "balance_loss_clip": 1.05348301, + "balance_loss_mlp": 1.03034139, + "epoch": 0.22678490906358034, + "flos": 29092081785120.0, + "grad_norm": 1.5872157979989951, + "language_loss": 0.78604269, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.80795604, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.15771484, + "step": 3772, + "time_per_iteration": 2.7599596977233887 + }, + { + "auxiliary_loss_clip": 0.01061242, + "auxiliary_loss_mlp": 0.01004556, + "balance_loss_clip": 1.02868378, + "balance_loss_mlp": 1.00266826, + "epoch": 0.2268450323162483, + "flos": 79370509569120.0, + "grad_norm": 0.6506963016161645, + "language_loss": 0.54373145, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56438941, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 0.32568359, + "router_z_loss_mlp": 0.0188446, + "step": 3773, + "time_per_iteration": 3.3510565757751465 + }, + { + "auxiliary_loss_clip": 0.01146877, + "auxiliary_loss_mlp": 0.01042237, + "balance_loss_clip": 1.05298674, + "balance_loss_mlp": 1.02595329, + "epoch": 0.22690515556891627, + "flos": 27486732144960.0, + "grad_norm": 1.951277568100895, + "language_loss": 0.70561457, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.72750568, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.16296387, + "step": 3774, + "time_per_iteration": 2.7520153522491455 + }, + { + "auxiliary_loss_clip": 0.01147451, + "auxiliary_loss_mlp": 0.01039163, + "balance_loss_clip": 1.05350399, + "balance_loss_mlp": 1.02299857, + "epoch": 0.22696527882158424, + "flos": 22236077348640.0, + "grad_norm": 3.6348412414123494, + "language_loss": 0.74349928, + "learning_rate": 3.606650658627658e-06, + "loss": 0.76536548, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.16162109, + "step": 3775, + "time_per_iteration": 2.640059232711792 + }, + { + "auxiliary_loss_clip": 0.01149427, + "auxiliary_loss_mlp": 0.01037442, + "balance_loss_clip": 1.05378604, + "balance_loss_mlp": 1.02239799, + "epoch": 0.22702540207425223, + "flos": 20766644131680.0, + "grad_norm": 2.1235954295310737, + "language_loss": 0.81646532, + "learning_rate": 3.606418687985928e-06, + "loss": 0.83833396, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.1505127, + "step": 3776, + "time_per_iteration": 2.7035977840423584 + }, + { + "auxiliary_loss_clip": 0.01150468, + "auxiliary_loss_mlp": 0.01039534, + "balance_loss_clip": 1.05277324, + "balance_loss_mlp": 1.02354801, + "epoch": 0.2270855253269202, + "flos": 26021634276960.0, + "grad_norm": 1.9613038198210617, + "language_loss": 0.82359385, + "learning_rate": 3.606186656428641e-06, + "loss": 0.84549379, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.15991211, + "step": 3777, + "time_per_iteration": 2.651326894760132 + }, + { + "auxiliary_loss_clip": 0.01151621, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.0545752, + "balance_loss_mlp": 1.02773905, + "epoch": 0.22714564857958816, + "flos": 28736512421760.0, + "grad_norm": 1.7922821297689229, + "language_loss": 0.724298, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.74625552, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.16394043, + "step": 3778, + "time_per_iteration": 2.708716869354248 + }, + { + "auxiliary_loss_clip": 0.01149631, + "auxiliary_loss_mlp": 0.01039416, + "balance_loss_clip": 1.05183041, + "balance_loss_mlp": 1.0230124, + "epoch": 0.22720577183225613, + "flos": 31713742369440.0, + "grad_norm": 2.5698313527366405, + "language_loss": 0.64327359, + "learning_rate": 3.605722410602591e-06, + "loss": 0.66516399, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.1640625, + "step": 3779, + "time_per_iteration": 2.665621280670166 + }, + { + "auxiliary_loss_clip": 0.01148909, + "auxiliary_loss_mlp": 0.01048424, + "balance_loss_clip": 1.05520296, + "balance_loss_mlp": 1.03303456, + "epoch": 0.2272658950849241, + "flos": 25397351897760.0, + "grad_norm": 1.643551893048714, + "language_loss": 0.70356405, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.72553742, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.15380859, + "step": 3780, + "time_per_iteration": 2.6842434406280518 + }, + { + "auxiliary_loss_clip": 0.01149544, + "auxiliary_loss_mlp": 0.01044919, + "balance_loss_clip": 1.05440462, + "balance_loss_mlp": 1.02807474, + "epoch": 0.22732601833759206, + "flos": 29174967439200.0, + "grad_norm": 1.7353600444924187, + "language_loss": 0.89505738, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.91700208, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.16845703, + "step": 3781, + "time_per_iteration": 2.7475528717041016 + }, + { + "auxiliary_loss_clip": 0.01149741, + "auxiliary_loss_mlp": 0.01043475, + "balance_loss_clip": 1.05159402, + "balance_loss_mlp": 1.02701211, + "epoch": 0.22738614159026002, + "flos": 19430655783840.0, + "grad_norm": 2.423853396845154, + "language_loss": 0.74307954, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.76501173, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.16467285, + "step": 3782, + "time_per_iteration": 2.6625208854675293 + }, + { + "auxiliary_loss_clip": 0.01149683, + "auxiliary_loss_mlp": 0.0104784, + "balance_loss_clip": 1.05283713, + "balance_loss_mlp": 1.03265274, + "epoch": 0.22744626484292801, + "flos": 29536006635360.0, + "grad_norm": 1.5321869128540544, + "language_loss": 0.82385397, + "learning_rate": 3.604793188351095e-06, + "loss": 0.84582919, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.15185547, + "step": 3783, + "time_per_iteration": 2.815988779067993 + }, + { + "auxiliary_loss_clip": 0.01148653, + "auxiliary_loss_mlp": 0.01041213, + "balance_loss_clip": 1.05286753, + "balance_loss_mlp": 1.02501833, + "epoch": 0.22750638809559598, + "flos": 30250994503680.0, + "grad_norm": 1.9657171554348376, + "language_loss": 0.75705403, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.7789526, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.1618042, + "step": 3784, + "time_per_iteration": 2.720553159713745 + }, + { + "auxiliary_loss_clip": 0.01146501, + "auxiliary_loss_mlp": 0.01044571, + "balance_loss_clip": 1.05111122, + "balance_loss_mlp": 1.02847791, + "epoch": 0.22756651134826394, + "flos": 27133148128320.0, + "grad_norm": 1.8616059832019902, + "language_loss": 0.70798743, + "learning_rate": 3.604328212066594e-06, + "loss": 0.72989815, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 0.95410156, + "router_z_loss_mlp": 0.1607666, + "step": 3785, + "time_per_iteration": 2.7026147842407227 + }, + { + "auxiliary_loss_clip": 0.01061669, + "auxiliary_loss_mlp": 0.01005091, + "balance_loss_clip": 1.02888775, + "balance_loss_mlp": 1.00325108, + "epoch": 0.2276266346009319, + "flos": 76514812410240.0, + "grad_norm": 0.8228002851493526, + "language_loss": 0.61925197, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.63991964, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 0.32763672, + "router_z_loss_mlp": 0.01837158, + "step": 3786, + "time_per_iteration": 3.24426531791687 + }, + { + "auxiliary_loss_clip": 0.0115691, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.05730152, + "balance_loss_mlp": 1.02623296, + "epoch": 0.22768675785359987, + "flos": 22713584673600.0, + "grad_norm": 2.722569893451661, + "language_loss": 0.86786175, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.88986343, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 0.99560547, + "router_z_loss_mlp": 0.17016602, + "step": 3787, + "time_per_iteration": 2.6847634315490723 + }, + { + "auxiliary_loss_clip": 0.01147993, + "auxiliary_loss_mlp": 0.01035302, + "balance_loss_clip": 1.05417502, + "balance_loss_mlp": 1.02007902, + "epoch": 0.22774688110626784, + "flos": 32787540983520.0, + "grad_norm": 1.3183502348600191, + "language_loss": 0.72546017, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.74729317, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.15209961, + "step": 3788, + "time_per_iteration": 2.763535737991333 + }, + { + "auxiliary_loss_clip": 0.0114773, + "auxiliary_loss_mlp": 0.01034976, + "balance_loss_clip": 1.05331957, + "balance_loss_mlp": 1.01897812, + "epoch": 0.2278070043589358, + "flos": 18978708512160.0, + "grad_norm": 2.3015726505673153, + "language_loss": 0.67523181, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.69705892, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.15991211, + "step": 3789, + "time_per_iteration": 2.635728120803833 + }, + { + "auxiliary_loss_clip": 0.01149643, + "auxiliary_loss_mlp": 0.01041269, + "balance_loss_clip": 1.05316424, + "balance_loss_mlp": 1.02434134, + "epoch": 0.2278671276116038, + "flos": 27353368310400.0, + "grad_norm": 2.4051631045586124, + "language_loss": 0.75954586, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.78145492, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 0.96533203, + "router_z_loss_mlp": 0.16931152, + "step": 3790, + "time_per_iteration": 2.7591097354888916 + }, + { + "auxiliary_loss_clip": 0.01146465, + "auxiliary_loss_mlp": 0.01040388, + "balance_loss_clip": 1.05218887, + "balance_loss_mlp": 1.0243187, + "epoch": 0.22792725086427176, + "flos": 25174417057920.0, + "grad_norm": 1.9981196971493773, + "language_loss": 0.90931988, + "learning_rate": 3.602931823424522e-06, + "loss": 0.93118846, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.16052246, + "step": 3791, + "time_per_iteration": 2.6616387367248535 + }, + { + "auxiliary_loss_clip": 0.01148268, + "auxiliary_loss_mlp": 0.01036666, + "balance_loss_clip": 1.05049491, + "balance_loss_mlp": 1.02071571, + "epoch": 0.22798737411693973, + "flos": 38350661107680.0, + "grad_norm": 1.9084477069728705, + "language_loss": 0.82640779, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.84825712, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.15948486, + "step": 3792, + "time_per_iteration": 2.773937463760376 + }, + { + "auxiliary_loss_clip": 0.01061331, + "auxiliary_loss_mlp": 0.01001862, + "balance_loss_clip": 1.02906203, + "balance_loss_mlp": 1.00000107, + "epoch": 0.2280474973696077, + "flos": 63934812781920.0, + "grad_norm": 1.1437092611646262, + "language_loss": 0.65687275, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67750466, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 0.32324219, + "router_z_loss_mlp": 0.01858521, + "step": 3793, + "time_per_iteration": 3.0335710048675537 + }, + { + "auxiliary_loss_clip": 0.01152938, + "auxiliary_loss_mlp": 0.01050019, + "balance_loss_clip": 1.05316508, + "balance_loss_mlp": 1.03292394, + "epoch": 0.22810762062227566, + "flos": 32208955745760.0, + "grad_norm": 2.1038922483986466, + "language_loss": 0.76912093, + "learning_rate": 3.602232808409293e-06, + "loss": 0.79115057, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 0.99804688, + "router_z_loss_mlp": 0.17102051, + "step": 3794, + "time_per_iteration": 2.814077615737915 + }, + { + "auxiliary_loss_clip": 0.01150956, + "auxiliary_loss_mlp": 0.01039545, + "balance_loss_clip": 1.05361593, + "balance_loss_mlp": 1.02405989, + "epoch": 0.22816774387494362, + "flos": 31274436489120.0, + "grad_norm": 1.8571826880732583, + "language_loss": 0.80447531, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.82638025, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.15484619, + "step": 3795, + "time_per_iteration": 2.7123615741729736 + }, + { + "auxiliary_loss_clip": 0.01146096, + "auxiliary_loss_mlp": 0.01048236, + "balance_loss_clip": 1.05019426, + "balance_loss_mlp": 1.03226233, + "epoch": 0.22822786712761162, + "flos": 27395986138560.0, + "grad_norm": 1.7270948810910864, + "language_loss": 0.76757991, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.78952318, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.15991211, + "step": 3796, + "time_per_iteration": 2.7104570865631104 + }, + { + "auxiliary_loss_clip": 0.0114582, + "auxiliary_loss_mlp": 0.010388, + "balance_loss_clip": 1.04933095, + "balance_loss_mlp": 1.02279067, + "epoch": 0.22828799038027958, + "flos": 14888992298400.0, + "grad_norm": 2.294160994788078, + "language_loss": 0.95981312, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.98165929, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.16015625, + "step": 3797, + "time_per_iteration": 2.6938986778259277 + }, + { + "auxiliary_loss_clip": 0.01149435, + "auxiliary_loss_mlp": 0.01038706, + "balance_loss_clip": 1.05332434, + "balance_loss_mlp": 1.02312517, + "epoch": 0.22834811363294755, + "flos": 26949589734240.0, + "grad_norm": 1.7116145193269117, + "language_loss": 0.81782472, + "learning_rate": 3.601299937834666e-06, + "loss": 0.83970606, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.15582275, + "step": 3798, + "time_per_iteration": 2.7100536823272705 + }, + { + "auxiliary_loss_clip": 0.01149048, + "auxiliary_loss_mlp": 0.01040934, + "balance_loss_clip": 1.05052543, + "balance_loss_mlp": 1.02381599, + "epoch": 0.2284082368856155, + "flos": 30335581883520.0, + "grad_norm": 2.691102205968239, + "language_loss": 0.79580498, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.8177048, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 0.984375, + "router_z_loss_mlp": 0.17114258, + "step": 3799, + "time_per_iteration": 2.739466667175293 + }, + { + "auxiliary_loss_clip": 0.01146479, + "auxiliary_loss_mlp": 0.0104238, + "balance_loss_clip": 1.05048621, + "balance_loss_mlp": 1.02639449, + "epoch": 0.22846836013828348, + "flos": 28422385885440.0, + "grad_norm": 1.6569664783160267, + "language_loss": 0.7533325, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.77522105, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.15979004, + "step": 3800, + "time_per_iteration": 2.6447126865386963 + }, + { + "auxiliary_loss_clip": 0.01147261, + "auxiliary_loss_mlp": 0.01040034, + "balance_loss_clip": 1.05287087, + "balance_loss_mlp": 1.02522826, + "epoch": 0.22852848339095144, + "flos": 33454036018080.0, + "grad_norm": 1.702807365621643, + "language_loss": 0.64062005, + "learning_rate": 3.600599647297484e-06, + "loss": 0.66249299, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.14788818, + "step": 3801, + "time_per_iteration": 2.7401793003082275 + }, + { + "auxiliary_loss_clip": 0.01145819, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.05296326, + "balance_loss_mlp": 1.0189836, + "epoch": 0.2285886066436194, + "flos": 32117723532000.0, + "grad_norm": 1.7432582571282829, + "language_loss": 0.81478876, + "learning_rate": 3.60036609571682e-06, + "loss": 0.83658063, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.1439209, + "step": 3802, + "time_per_iteration": 2.6646997928619385 + }, + { + "auxiliary_loss_clip": 0.01146663, + "auxiliary_loss_mlp": 0.01049669, + "balance_loss_clip": 1.05086315, + "balance_loss_mlp": 1.03377223, + "epoch": 0.2286487298962874, + "flos": 36260754135840.0, + "grad_norm": 2.240170276434083, + "language_loss": 0.78725648, + "learning_rate": 3.600132483450114e-06, + "loss": 0.80921984, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.15887451, + "step": 3803, + "time_per_iteration": 2.7390851974487305 + }, + { + "auxiliary_loss_clip": 0.01145932, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.0490278, + "balance_loss_mlp": 1.02162862, + "epoch": 0.22870885314895537, + "flos": 25974802651680.0, + "grad_norm": 1.622308080428386, + "language_loss": 0.85185844, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.87369525, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 0.96923828, + "router_z_loss_mlp": 0.16125488, + "step": 3804, + "time_per_iteration": 5.2923743724823 + }, + { + "auxiliary_loss_clip": 0.01150119, + "auxiliary_loss_mlp": 0.01041174, + "balance_loss_clip": 1.05121922, + "balance_loss_mlp": 1.02542615, + "epoch": 0.22876897640162333, + "flos": 18228720064320.0, + "grad_norm": 3.4579425757052547, + "language_loss": 0.76477373, + "learning_rate": 3.59966507689401e-06, + "loss": 0.78668666, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 0.98974609, + "router_z_loss_mlp": 0.1574707, + "step": 3805, + "time_per_iteration": 4.166781663894653 + }, + { + "auxiliary_loss_clip": 0.01149865, + "auxiliary_loss_mlp": 0.0104274, + "balance_loss_clip": 1.05107546, + "balance_loss_mlp": 1.02600336, + "epoch": 0.2288290996542913, + "flos": 22102429893120.0, + "grad_norm": 2.071882665123802, + "language_loss": 0.78953516, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.81146121, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 0.98730469, + "router_z_loss_mlp": 0.1673584, + "step": 3806, + "time_per_iteration": 2.652174711227417 + }, + { + "auxiliary_loss_clip": 0.01148739, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.05240726, + "balance_loss_mlp": 1.02958262, + "epoch": 0.22888922290695926, + "flos": 48633492761280.0, + "grad_norm": 2.428166393998369, + "language_loss": 0.70158309, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.72352171, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.15545654, + "step": 3807, + "time_per_iteration": 2.8347718715667725 + }, + { + "auxiliary_loss_clip": 0.01153289, + "auxiliary_loss_mlp": 0.01048232, + "balance_loss_clip": 1.05381465, + "balance_loss_mlp": 1.03145921, + "epoch": 0.22894934615962723, + "flos": 28557572997600.0, + "grad_norm": 2.2144282668773014, + "language_loss": 0.65801287, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.68002808, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 0.99462891, + "router_z_loss_mlp": 0.16760254, + "step": 3808, + "time_per_iteration": 4.237252950668335 + }, + { + "auxiliary_loss_clip": 0.01145505, + "auxiliary_loss_mlp": 0.01045711, + "balance_loss_clip": 1.0501554, + "balance_loss_mlp": 1.03027296, + "epoch": 0.22900946941229522, + "flos": 22992183905760.0, + "grad_norm": 1.9981466191850883, + "language_loss": 0.7502377, + "learning_rate": 3.598729535939222e-06, + "loss": 0.7721498, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.15441895, + "step": 3809, + "time_per_iteration": 4.127058506011963 + }, + { + "auxiliary_loss_clip": 0.01146934, + "auxiliary_loss_mlp": 0.01043288, + "balance_loss_clip": 1.0530045, + "balance_loss_mlp": 1.02799344, + "epoch": 0.22906959266496318, + "flos": 27978704138880.0, + "grad_norm": 1.531521509165812, + "language_loss": 0.81120139, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.8331036, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.15307617, + "step": 3810, + "time_per_iteration": 2.8549485206604004 + }, + { + "auxiliary_loss_clip": 0.0114526, + "auxiliary_loss_mlp": 0.01041163, + "balance_loss_clip": 1.05043256, + "balance_loss_mlp": 1.0261786, + "epoch": 0.22912971591763115, + "flos": 23616304215840.0, + "grad_norm": 2.0113382830491418, + "language_loss": 0.78396749, + "learning_rate": 3.598261401682441e-06, + "loss": 0.80583167, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.14990234, + "step": 3811, + "time_per_iteration": 2.716668128967285 + }, + { + "auxiliary_loss_clip": 0.01148068, + "auxiliary_loss_mlp": 0.01041371, + "balance_loss_clip": 1.05199587, + "balance_loss_mlp": 1.02544427, + "epoch": 0.22918983917029911, + "flos": 24323310180000.0, + "grad_norm": 1.9388828909886187, + "language_loss": 0.83110225, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.85299665, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.15917969, + "step": 3812, + "time_per_iteration": 2.6896443367004395 + }, + { + "auxiliary_loss_clip": 0.01154539, + "auxiliary_loss_mlp": 0.01050904, + "balance_loss_clip": 1.05416703, + "balance_loss_mlp": 1.03452432, + "epoch": 0.22924996242296708, + "flos": 20365944868800.0, + "grad_norm": 2.9083320169214555, + "language_loss": 0.82417619, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.84623063, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 1.00390625, + "router_z_loss_mlp": 0.16381836, + "step": 3813, + "time_per_iteration": 2.693192958831787 + }, + { + "auxiliary_loss_clip": 0.01149456, + "auxiliary_loss_mlp": 0.01043863, + "balance_loss_clip": 1.05294418, + "balance_loss_mlp": 1.02852106, + "epoch": 0.22931008567563504, + "flos": 40311742180320.0, + "grad_norm": 1.7227742679944187, + "language_loss": 0.69801199, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.71994519, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.15338135, + "step": 3814, + "time_per_iteration": 2.7852742671966553 + }, + { + "auxiliary_loss_clip": 0.01147383, + "auxiliary_loss_mlp": 0.01047629, + "balance_loss_clip": 1.05041075, + "balance_loss_mlp": 1.03107131, + "epoch": 0.229370208928303, + "flos": 28468407165120.0, + "grad_norm": 2.8040412323053228, + "language_loss": 0.66841435, + "learning_rate": 3.597324405965139e-06, + "loss": 0.69036442, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.16552734, + "step": 3815, + "time_per_iteration": 2.6726722717285156 + }, + { + "auxiliary_loss_clip": 0.01148456, + "auxiliary_loss_mlp": 0.01041839, + "balance_loss_clip": 1.05177391, + "balance_loss_mlp": 1.02660429, + "epoch": 0.229430332180971, + "flos": 34919782162560.0, + "grad_norm": 2.196197048171797, + "language_loss": 0.83734357, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85924649, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.15234375, + "step": 3816, + "time_per_iteration": 2.7161307334899902 + }, + { + "auxiliary_loss_clip": 0.01147521, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.05202508, + "balance_loss_mlp": 1.020262, + "epoch": 0.22949045543363897, + "flos": 21033817490880.0, + "grad_norm": 2.1409493891664653, + "language_loss": 0.86768913, + "learning_rate": 3.596855544646742e-06, + "loss": 0.8895247, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.15765381, + "step": 3817, + "time_per_iteration": 2.767199993133545 + }, + { + "auxiliary_loss_clip": 0.01152746, + "auxiliary_loss_mlp": 0.01043041, + "balance_loss_clip": 1.05506504, + "balance_loss_mlp": 1.02780008, + "epoch": 0.22955057868630693, + "flos": 33543282885120.0, + "grad_norm": 1.754949403597018, + "language_loss": 0.74743283, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.7693907, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.15228271, + "step": 3818, + "time_per_iteration": 2.7852492332458496 + }, + { + "auxiliary_loss_clip": 0.01150651, + "auxiliary_loss_mlp": 0.01041962, + "balance_loss_clip": 1.05392599, + "balance_loss_mlp": 1.02590442, + "epoch": 0.2296107019389749, + "flos": 28646252622720.0, + "grad_norm": 1.7106515403596805, + "language_loss": 0.74723065, + "learning_rate": 3.596386441116659e-06, + "loss": 0.76915675, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.16064453, + "step": 3819, + "time_per_iteration": 2.7186577320098877 + }, + { + "auxiliary_loss_clip": 0.01148187, + "auxiliary_loss_mlp": 0.01043286, + "balance_loss_clip": 1.05296445, + "balance_loss_mlp": 1.02762747, + "epoch": 0.22967082519164286, + "flos": 38175003583200.0, + "grad_norm": 4.928195045789073, + "language_loss": 0.80845141, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.83036613, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.15655518, + "step": 3820, + "time_per_iteration": 2.7442679405212402 + }, + { + "auxiliary_loss_clip": 0.01151177, + "auxiliary_loss_mlp": 0.01048228, + "balance_loss_clip": 1.05203629, + "balance_loss_mlp": 1.03147888, + "epoch": 0.22973094844431083, + "flos": 17867316212640.0, + "grad_norm": 2.487312322202814, + "language_loss": 0.69612581, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71811986, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 0.9921875, + "router_z_loss_mlp": 0.16748047, + "step": 3821, + "time_per_iteration": 2.6968681812286377 + }, + { + "auxiliary_loss_clip": 0.01149982, + "auxiliary_loss_mlp": 0.01035648, + "balance_loss_clip": 1.05410337, + "balance_loss_mlp": 1.01926875, + "epoch": 0.2297910716969788, + "flos": 27849554101440.0, + "grad_norm": 2.106170709148114, + "language_loss": 0.82928979, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.85114604, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.16381836, + "step": 3822, + "time_per_iteration": 2.7313544750213623 + }, + { + "auxiliary_loss_clip": 0.01148823, + "auxiliary_loss_mlp": 0.01044981, + "balance_loss_clip": 1.05203748, + "balance_loss_mlp": 1.02798223, + "epoch": 0.2298511949496468, + "flos": 28112473146240.0, + "grad_norm": 1.754906772261715, + "language_loss": 0.66352642, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.6854645, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 0.96777344, + "router_z_loss_mlp": 0.16992188, + "step": 3823, + "time_per_iteration": 2.711925983428955 + }, + { + "auxiliary_loss_clip": 0.01063289, + "auxiliary_loss_mlp": 0.01000441, + "balance_loss_clip": 1.03171825, + "balance_loss_mlp": 0.9985944, + "epoch": 0.22991131820231475, + "flos": 81620920602720.0, + "grad_norm": 0.8086412149194737, + "language_loss": 0.56765294, + "learning_rate": 3.595212623082357e-06, + "loss": 0.58829021, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 0.31591797, + "router_z_loss_mlp": 0.01843262, + "step": 3824, + "time_per_iteration": 3.403796911239624 + }, + { + "auxiliary_loss_clip": 0.01147543, + "auxiliary_loss_mlp": 0.01038629, + "balance_loss_clip": 1.05357313, + "balance_loss_mlp": 1.023489, + "epoch": 0.22997144145498272, + "flos": 21826545318720.0, + "grad_norm": 2.4777650738913213, + "language_loss": 0.73292816, + "learning_rate": 3.594977677968009e-06, + "loss": 0.75478989, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.15136719, + "step": 3825, + "time_per_iteration": 2.642954111099243 + }, + { + "auxiliary_loss_clip": 0.01151149, + "auxiliary_loss_mlp": 0.01044278, + "balance_loss_clip": 1.05403733, + "balance_loss_mlp": 1.02713537, + "epoch": 0.23003156470765068, + "flos": 30111431525280.0, + "grad_norm": 2.054249910777628, + "language_loss": 0.87809938, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.90005362, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 0.97167969, + "router_z_loss_mlp": 0.17150879, + "step": 3826, + "time_per_iteration": 2.7111892700195312 + }, + { + "auxiliary_loss_clip": 0.01153861, + "auxiliary_loss_mlp": 0.01042986, + "balance_loss_clip": 1.05252314, + "balance_loss_mlp": 1.02607012, + "epoch": 0.23009168796031865, + "flos": 19296157465440.0, + "grad_norm": 3.075760024514034, + "language_loss": 0.81957638, + "learning_rate": 3.594507606303083e-06, + "loss": 0.84154481, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 1.01367188, + "router_z_loss_mlp": 0.16918945, + "step": 3827, + "time_per_iteration": 2.6283106803894043 + }, + { + "auxiliary_loss_clip": 0.01148337, + "auxiliary_loss_mlp": 0.01043064, + "balance_loss_clip": 1.05247641, + "balance_loss_mlp": 1.02786469, + "epoch": 0.2301518112129866, + "flos": 19782943247520.0, + "grad_norm": 1.8215010946488477, + "language_loss": 0.86446971, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.88638365, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.15197754, + "step": 3828, + "time_per_iteration": 2.6638553142547607 + }, + { + "auxiliary_loss_clip": 0.01147613, + "auxiliary_loss_mlp": 0.01044759, + "balance_loss_clip": 1.0502665, + "balance_loss_mlp": 1.0278194, + "epoch": 0.2302119344656546, + "flos": 25130543194080.0, + "grad_norm": 2.4676588018155363, + "language_loss": 0.70808709, + "learning_rate": 3.594037292782607e-06, + "loss": 0.73001087, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 0.97363281, + "router_z_loss_mlp": 0.16931152, + "step": 3829, + "time_per_iteration": 2.7230942249298096 + }, + { + "auxiliary_loss_clip": 0.01149849, + "auxiliary_loss_mlp": 0.01036851, + "balance_loss_clip": 1.0562737, + "balance_loss_mlp": 1.02202117, + "epoch": 0.23027205771832257, + "flos": 32698132047360.0, + "grad_norm": 1.6791557908750416, + "language_loss": 0.84708303, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86895001, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.14825439, + "step": 3830, + "time_per_iteration": 2.7678146362304688 + }, + { + "auxiliary_loss_clip": 0.01146591, + "auxiliary_loss_mlp": 0.01044882, + "balance_loss_clip": 1.05101895, + "balance_loss_mlp": 1.02921796, + "epoch": 0.23033218097099054, + "flos": 53535790270080.0, + "grad_norm": 1.6910340252944935, + "language_loss": 0.66813868, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.6900534, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.15673828, + "step": 3831, + "time_per_iteration": 2.8867125511169434 + }, + { + "auxiliary_loss_clip": 0.01149619, + "auxiliary_loss_mlp": 0.01045923, + "balance_loss_clip": 1.05187047, + "balance_loss_mlp": 1.03005672, + "epoch": 0.2303923042236585, + "flos": 31808215965600.0, + "grad_norm": 2.3269983531227543, + "language_loss": 0.75124109, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.77319652, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.15881348, + "step": 3832, + "time_per_iteration": 2.7372682094573975 + }, + { + "auxiliary_loss_clip": 0.01149607, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.05245256, + "balance_loss_mlp": 1.02050292, + "epoch": 0.23045242747632647, + "flos": 22325081112000.0, + "grad_norm": 1.8278951017550724, + "language_loss": 0.87390149, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89576864, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 0.97021484, + "router_z_loss_mlp": 0.16589355, + "step": 3833, + "time_per_iteration": 2.748760461807251 + }, + { + "auxiliary_loss_clip": 0.01150765, + "auxiliary_loss_mlp": 0.01044573, + "balance_loss_clip": 1.05426109, + "balance_loss_mlp": 1.02790773, + "epoch": 0.23051255072899443, + "flos": 31140545929920.0, + "grad_norm": 1.9624619538777803, + "language_loss": 0.75026172, + "learning_rate": 3.592860451331624e-06, + "loss": 0.77221513, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 0.96679688, + "router_z_loss_mlp": 0.16674805, + "step": 3834, + "time_per_iteration": 2.7309136390686035 + }, + { + "auxiliary_loss_clip": 0.01148729, + "auxiliary_loss_mlp": 0.01049706, + "balance_loss_clip": 1.05354834, + "balance_loss_mlp": 1.03375542, + "epoch": 0.2305726739816624, + "flos": 25886852337600.0, + "grad_norm": 1.8947744095761272, + "language_loss": 0.86282915, + "learning_rate": 3.592624901801432e-06, + "loss": 0.88481355, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.15942383, + "step": 3835, + "time_per_iteration": 2.693955898284912 + }, + { + "auxiliary_loss_clip": 0.01157265, + "auxiliary_loss_mlp": 0.0104823, + "balance_loss_clip": 1.05473697, + "balance_loss_mlp": 1.03098035, + "epoch": 0.2306327972343304, + "flos": 28468893372480.0, + "grad_norm": 2.466203444228833, + "language_loss": 0.82625777, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.84831274, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 1.02539062, + "router_z_loss_mlp": 0.17236328, + "step": 3836, + "time_per_iteration": 2.7190420627593994 + }, + { + "auxiliary_loss_clip": 0.01152958, + "auxiliary_loss_mlp": 0.01040913, + "balance_loss_clip": 1.05598617, + "balance_loss_mlp": 1.0250169, + "epoch": 0.23069292048699835, + "flos": 25217683162560.0, + "grad_norm": 1.743192208094665, + "language_loss": 0.79267985, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.81461859, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.15887451, + "step": 3837, + "time_per_iteration": 2.7266128063201904 + }, + { + "auxiliary_loss_clip": 0.01055883, + "auxiliary_loss_mlp": 0.01005642, + "balance_loss_clip": 1.0242182, + "balance_loss_mlp": 1.00389683, + "epoch": 0.23075304373966632, + "flos": 85970962755360.0, + "grad_norm": 0.9292788022381138, + "language_loss": 0.65426695, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67488217, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 0.31665039, + "router_z_loss_mlp": 0.01748657, + "step": 3838, + "time_per_iteration": 3.256413459777832 + }, + { + "auxiliary_loss_clip": 0.01148058, + "auxiliary_loss_mlp": 0.01048828, + "balance_loss_clip": 1.052809, + "balance_loss_mlp": 1.03350937, + "epoch": 0.23081316699233428, + "flos": 20276981622720.0, + "grad_norm": 2.012345887373579, + "language_loss": 0.75494772, + "learning_rate": 3.591682099845058e-06, + "loss": 0.77691662, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 0.95361328, + "router_z_loss_mlp": 0.15319824, + "step": 3839, + "time_per_iteration": 2.7177181243896484 + }, + { + "auxiliary_loss_clip": 0.01153373, + "auxiliary_loss_mlp": 0.01042421, + "balance_loss_clip": 1.05464625, + "balance_loss_mlp": 1.02604783, + "epoch": 0.23087329024500225, + "flos": 16225750474560.0, + "grad_norm": 2.866388834350627, + "language_loss": 0.69169998, + "learning_rate": 3.591446248441752e-06, + "loss": 0.71365798, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.16375732, + "step": 3840, + "time_per_iteration": 2.6896450519561768 + }, + { + "auxiliary_loss_clip": 0.011536, + "auxiliary_loss_mlp": 0.01040079, + "balance_loss_clip": 1.05638933, + "balance_loss_mlp": 1.02210236, + "epoch": 0.23093341349767021, + "flos": 21701933216640.0, + "grad_norm": 2.352077239509364, + "language_loss": 0.79673856, + "learning_rate": 3.591210336690645e-06, + "loss": 0.81867534, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 0.97167969, + "router_z_loss_mlp": 0.17980957, + "step": 3841, + "time_per_iteration": 2.734238862991333 + }, + { + "auxiliary_loss_clip": 0.01150277, + "auxiliary_loss_mlp": 0.01040686, + "balance_loss_clip": 1.05392408, + "balance_loss_mlp": 1.02613592, + "epoch": 0.23099353675033818, + "flos": 29270535001920.0, + "grad_norm": 2.6601711059527133, + "language_loss": 0.83476311, + "learning_rate": 3.590974364600683e-06, + "loss": 0.8566727, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.14556885, + "step": 3842, + "time_per_iteration": 2.7463502883911133 + }, + { + "auxiliary_loss_clip": 0.01148589, + "auxiliary_loss_mlp": 0.01044735, + "balance_loss_clip": 1.05171871, + "balance_loss_mlp": 1.02794445, + "epoch": 0.23105366000300617, + "flos": 43922208826080.0, + "grad_norm": 1.7622569715926069, + "language_loss": 0.6648066, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.68673986, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.16778564, + "step": 3843, + "time_per_iteration": 2.84612774848938 + }, + { + "auxiliary_loss_clip": 0.01146699, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.05280495, + "balance_loss_mlp": 1.02388382, + "epoch": 0.23111378325567414, + "flos": 38126713335840.0, + "grad_norm": 1.8092050984417705, + "language_loss": 0.77023029, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79209125, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.1550293, + "step": 3844, + "time_per_iteration": 4.33416485786438 + }, + { + "auxiliary_loss_clip": 0.01151977, + "auxiliary_loss_mlp": 0.01044558, + "balance_loss_clip": 1.05444741, + "balance_loss_mlp": 1.02745104, + "epoch": 0.2311739065083421, + "flos": 23437810481760.0, + "grad_norm": 1.653983053662726, + "language_loss": 0.77999806, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80196345, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.17089844, + "step": 3845, + "time_per_iteration": 4.12018609046936 + }, + { + "auxiliary_loss_clip": 0.01142732, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.05173683, + "balance_loss_mlp": 1.02019572, + "epoch": 0.23123402976101007, + "flos": 28505028435840.0, + "grad_norm": 2.1594674760561214, + "language_loss": 0.76982796, + "learning_rate": 3.590029873031276e-06, + "loss": 0.79160476, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.14733887, + "step": 3846, + "time_per_iteration": 2.688274383544922 + }, + { + "auxiliary_loss_clip": 0.011499, + "auxiliary_loss_mlp": 0.01043961, + "balance_loss_clip": 1.05264187, + "balance_loss_mlp": 1.02801061, + "epoch": 0.23129415301367803, + "flos": 16759408399200.0, + "grad_norm": 2.084355490614489, + "language_loss": 0.69549751, + "learning_rate": 3.589793599381304e-06, + "loss": 0.71743619, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 0.97314453, + "router_z_loss_mlp": 0.15930176, + "step": 3847, + "time_per_iteration": 4.110933780670166 + }, + { + "auxiliary_loss_clip": 0.01049069, + "auxiliary_loss_mlp": 0.00999701, + "balance_loss_clip": 1.01737785, + "balance_loss_mlp": 0.9979977, + "epoch": 0.231354276266346, + "flos": 85093931168640.0, + "grad_norm": 0.7896119121614044, + "language_loss": 0.61010391, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63059163, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 0.31665039, + "router_z_loss_mlp": 0.01705933, + "step": 3848, + "time_per_iteration": 3.1851511001586914 + }, + { + "auxiliary_loss_clip": 0.01146919, + "auxiliary_loss_mlp": 0.01044854, + "balance_loss_clip": 1.0511899, + "balance_loss_mlp": 1.02840304, + "epoch": 0.231414399519014, + "flos": 22984161484320.0, + "grad_norm": 2.19691127228674, + "language_loss": 0.78154093, + "learning_rate": 3.589320871234923e-06, + "loss": 0.80345869, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.16442871, + "step": 3849, + "time_per_iteration": 4.087080478668213 + }, + { + "auxiliary_loss_clip": 0.01147595, + "auxiliary_loss_mlp": 0.01037189, + "balance_loss_clip": 1.05014801, + "balance_loss_mlp": 1.02078557, + "epoch": 0.23147452277168196, + "flos": 44093936174400.0, + "grad_norm": 1.941100709337638, + "language_loss": 0.71229577, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.73414361, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.1640625, + "step": 3850, + "time_per_iteration": 2.816694498062134 + }, + { + "auxiliary_loss_clip": 0.01144869, + "auxiliary_loss_mlp": 0.01034337, + "balance_loss_clip": 1.05008519, + "balance_loss_mlp": 1.01882827, + "epoch": 0.23153464602434992, + "flos": 25396055344800.0, + "grad_norm": 1.8778614451740399, + "language_loss": 0.76550663, + "learning_rate": 3.588847902019718e-06, + "loss": 0.78729868, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.15515137, + "step": 3851, + "time_per_iteration": 2.789278268814087 + }, + { + "auxiliary_loss_clip": 0.01145554, + "auxiliary_loss_mlp": 0.01041286, + "balance_loss_clip": 1.05122042, + "balance_loss_mlp": 1.02534807, + "epoch": 0.2315947692770179, + "flos": 24329752427520.0, + "grad_norm": 1.6655792095651212, + "language_loss": 0.69421357, + "learning_rate": 3.588611327033723e-06, + "loss": 0.71608198, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.15930176, + "step": 3852, + "time_per_iteration": 2.693389415740967 + }, + { + "auxiliary_loss_clip": 0.01150211, + "auxiliary_loss_mlp": 0.0103917, + "balance_loss_clip": 1.05269456, + "balance_loss_mlp": 1.02364874, + "epoch": 0.23165489252968585, + "flos": 15686177027040.0, + "grad_norm": 2.3070910541055882, + "language_loss": 0.6758213, + "learning_rate": 3.588374691807428e-06, + "loss": 0.69771516, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 0.97509766, + "router_z_loss_mlp": 0.15527344, + "step": 3853, + "time_per_iteration": 2.655402660369873 + }, + { + "auxiliary_loss_clip": 0.01150488, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.05325091, + "balance_loss_mlp": 1.01818871, + "epoch": 0.23171501578235382, + "flos": 37373442988320.0, + "grad_norm": 1.7147935260393992, + "language_loss": 0.7964052, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.81825864, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.16650391, + "step": 3854, + "time_per_iteration": 2.737234592437744 + }, + { + "auxiliary_loss_clip": 0.01155924, + "auxiliary_loss_mlp": 0.01041366, + "balance_loss_clip": 1.05313444, + "balance_loss_mlp": 1.02459347, + "epoch": 0.23177513903502178, + "flos": 29093175751680.0, + "grad_norm": 2.368381963603461, + "language_loss": 0.65611577, + "learning_rate": 3.587901240669831e-06, + "loss": 0.67808867, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 1.02636719, + "router_z_loss_mlp": 0.16772461, + "step": 3855, + "time_per_iteration": 2.925415515899658 + }, + { + "auxiliary_loss_clip": 0.01150457, + "auxiliary_loss_mlp": 0.01044019, + "balance_loss_clip": 1.05243468, + "balance_loss_mlp": 1.02839088, + "epoch": 0.23183526228768978, + "flos": 36082381953600.0, + "grad_norm": 2.0094462602378864, + "language_loss": 0.70989239, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.73183715, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.15618896, + "step": 3856, + "time_per_iteration": 2.762336492538452 + }, + { + "auxiliary_loss_clip": 0.01146629, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.0520761, + "balance_loss_mlp": 1.02210832, + "epoch": 0.23189538554035774, + "flos": 42047659962720.0, + "grad_norm": 2.6383036434598717, + "language_loss": 0.76975328, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.79158384, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.14312744, + "step": 3857, + "time_per_iteration": 2.774035930633545 + }, + { + "auxiliary_loss_clip": 0.01152497, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_clip": 1.05311453, + "balance_loss_mlp": 1.0347122, + "epoch": 0.2319555087930257, + "flos": 21968134161120.0, + "grad_norm": 2.408423771301589, + "language_loss": 0.91709936, + "learning_rate": 3.587190612385584e-06, + "loss": 0.93914646, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 0.99316406, + "router_z_loss_mlp": 0.1751709, + "step": 3858, + "time_per_iteration": 2.681882858276367 + }, + { + "auxiliary_loss_clip": 0.01144632, + "auxiliary_loss_mlp": 0.0104027, + "balance_loss_clip": 1.0525378, + "balance_loss_mlp": 1.02464211, + "epoch": 0.23201563204569367, + "flos": 28240812838080.0, + "grad_norm": 2.237280501651783, + "language_loss": 0.76634347, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.78819251, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.15618896, + "step": 3859, + "time_per_iteration": 2.6772189140319824 + }, + { + "auxiliary_loss_clip": 0.01144999, + "auxiliary_loss_mlp": 0.01035497, + "balance_loss_clip": 1.04950178, + "balance_loss_mlp": 1.02010083, + "epoch": 0.23207575529836164, + "flos": 25219101267360.0, + "grad_norm": 2.7950619669446053, + "language_loss": 0.8423537, + "learning_rate": 3.58671655924898e-06, + "loss": 0.86415863, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.15380859, + "step": 3860, + "time_per_iteration": 2.651989698410034 + }, + { + "auxiliary_loss_clip": 0.01145832, + "auxiliary_loss_mlp": 0.01037268, + "balance_loss_clip": 1.05155921, + "balance_loss_mlp": 1.02123404, + "epoch": 0.2321358785510296, + "flos": 20099055130560.0, + "grad_norm": 2.0521500698098523, + "language_loss": 0.83172613, + "learning_rate": 3.586479442423508e-06, + "loss": 0.85355711, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.16015625, + "step": 3861, + "time_per_iteration": 2.692119836807251 + }, + { + "auxiliary_loss_clip": 0.01149871, + "auxiliary_loss_mlp": 0.01044738, + "balance_loss_clip": 1.05305004, + "balance_loss_mlp": 1.02913344, + "epoch": 0.2321960018036976, + "flos": 26376352777440.0, + "grad_norm": 2.023120500495195, + "language_loss": 0.85536039, + "learning_rate": 3.586242265438576e-06, + "loss": 0.87730646, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 0.96679688, + "router_z_loss_mlp": 0.15612793, + "step": 3862, + "time_per_iteration": 2.6766374111175537 + }, + { + "auxiliary_loss_clip": 0.01144533, + "auxiliary_loss_mlp": 0.01042334, + "balance_loss_clip": 1.05172575, + "balance_loss_mlp": 1.0279876, + "epoch": 0.23225612505636556, + "flos": 27175968542880.0, + "grad_norm": 2.388730884565954, + "language_loss": 0.74817669, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.7700454, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.14349365, + "step": 3863, + "time_per_iteration": 2.732469320297241 + }, + { + "auxiliary_loss_clip": 0.0114633, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.05512595, + "balance_loss_mlp": 1.02789068, + "epoch": 0.23231624830903352, + "flos": 20807317130400.0, + "grad_norm": 1.7651073195315945, + "language_loss": 0.74681538, + "learning_rate": 3.58576773102631e-06, + "loss": 0.7687031, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.14556885, + "step": 3864, + "time_per_iteration": 2.6697170734405518 + }, + { + "auxiliary_loss_clip": 0.01145032, + "auxiliary_loss_mlp": 0.01034876, + "balance_loss_clip": 1.05075145, + "balance_loss_mlp": 1.01977217, + "epoch": 0.2323763715617015, + "flos": 42269055145920.0, + "grad_norm": 4.353690297706963, + "language_loss": 0.70671594, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.72851503, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.15112305, + "step": 3865, + "time_per_iteration": 2.7617475986480713 + }, + { + "auxiliary_loss_clip": 0.01157511, + "auxiliary_loss_mlp": 0.01043528, + "balance_loss_clip": 1.05594015, + "balance_loss_mlp": 1.0265286, + "epoch": 0.23243649481436945, + "flos": 31179071512800.0, + "grad_norm": 1.8658975018783095, + "language_loss": 0.94448578, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.96649623, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 1.01513672, + "router_z_loss_mlp": 0.17004395, + "step": 3866, + "time_per_iteration": 2.7161080837249756 + }, + { + "auxiliary_loss_clip": 0.01147415, + "auxiliary_loss_mlp": 0.010392, + "balance_loss_clip": 1.0534668, + "balance_loss_mlp": 1.02367878, + "epoch": 0.23249661806703742, + "flos": 24994140563520.0, + "grad_norm": 2.492030591473252, + "language_loss": 0.73028588, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.75215203, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.15515137, + "step": 3867, + "time_per_iteration": 2.7262628078460693 + }, + { + "auxiliary_loss_clip": 0.01145199, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.05021226, + "balance_loss_mlp": 1.02066135, + "epoch": 0.23255674131970538, + "flos": 24863491386720.0, + "grad_norm": 1.7390619103292762, + "language_loss": 0.81972682, + "learning_rate": 3.584817940684145e-06, + "loss": 0.84154207, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.15661621, + "step": 3868, + "time_per_iteration": 2.65502667427063 + }, + { + "auxiliary_loss_clip": 0.01144532, + "auxiliary_loss_mlp": 0.01034838, + "balance_loss_clip": 1.05235577, + "balance_loss_mlp": 1.0200088, + "epoch": 0.23261686457237338, + "flos": 20811571444800.0, + "grad_norm": 2.4608980211246867, + "language_loss": 0.73076904, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75256276, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.14831543, + "step": 3869, + "time_per_iteration": 2.6476778984069824 + }, + { + "auxiliary_loss_clip": 0.01145728, + "auxiliary_loss_mlp": 0.01049428, + "balance_loss_clip": 1.0534445, + "balance_loss_mlp": 1.03430009, + "epoch": 0.23267698782504134, + "flos": 35593691859360.0, + "grad_norm": 2.000962577262632, + "language_loss": 0.79463005, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.81658155, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.15124512, + "step": 3870, + "time_per_iteration": 2.714207172393799 + }, + { + "auxiliary_loss_clip": 0.01152842, + "auxiliary_loss_mlp": 0.01041196, + "balance_loss_clip": 1.05476165, + "balance_loss_mlp": 1.02503133, + "epoch": 0.2327371110777093, + "flos": 25837751744640.0, + "grad_norm": 1.961728787187391, + "language_loss": 0.7085188, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.73045921, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.16149902, + "step": 3871, + "time_per_iteration": 2.6824750900268555 + }, + { + "auxiliary_loss_clip": 0.0115189, + "auxiliary_loss_mlp": 0.010537, + "balance_loss_clip": 1.05416369, + "balance_loss_mlp": 1.03698075, + "epoch": 0.23279723433037727, + "flos": 30338377575840.0, + "grad_norm": 2.3945580362269254, + "language_loss": 0.68800914, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71006501, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 0.97851562, + "router_z_loss_mlp": 0.16705322, + "step": 3872, + "time_per_iteration": 2.6836249828338623 + }, + { + "auxiliary_loss_clip": 0.01153351, + "auxiliary_loss_mlp": 0.01041347, + "balance_loss_clip": 1.05397034, + "balance_loss_mlp": 1.02494347, + "epoch": 0.23285735758304524, + "flos": 47345997247200.0, + "grad_norm": 1.5150561175870412, + "language_loss": 0.77636933, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.7983163, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 0.99511719, + "router_z_loss_mlp": 0.16394043, + "step": 3873, + "time_per_iteration": 2.799799919128418 + }, + { + "auxiliary_loss_clip": 0.01055598, + "auxiliary_loss_mlp": 0.01013209, + "balance_loss_clip": 1.02321172, + "balance_loss_mlp": 1.01153409, + "epoch": 0.2329174808357132, + "flos": 65821719415680.0, + "grad_norm": 0.8590097159386764, + "language_loss": 0.60523206, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.62592018, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 0.32373047, + "router_z_loss_mlp": 0.01678467, + "step": 3874, + "time_per_iteration": 3.181321620941162 + }, + { + "auxiliary_loss_clip": 0.01147796, + "auxiliary_loss_mlp": 0.01042918, + "balance_loss_clip": 1.0539999, + "balance_loss_mlp": 1.02675331, + "epoch": 0.23297760408838117, + "flos": 25887784235040.0, + "grad_norm": 2.5522407293216416, + "language_loss": 0.80985558, + "learning_rate": 3.583153494218927e-06, + "loss": 0.83176279, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.16174316, + "step": 3875, + "time_per_iteration": 2.6680045127868652 + }, + { + "auxiliary_loss_clip": 0.0114658, + "auxiliary_loss_mlp": 0.01034814, + "balance_loss_clip": 1.05404449, + "balance_loss_mlp": 1.02053285, + "epoch": 0.23303772734104916, + "flos": 34657957084320.0, + "grad_norm": 1.7856063681169079, + "language_loss": 0.61572468, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.63753867, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.14282227, + "step": 3876, + "time_per_iteration": 2.724026918411255 + }, + { + "auxiliary_loss_clip": 0.01152118, + "auxiliary_loss_mlp": 0.01048056, + "balance_loss_clip": 1.0567522, + "balance_loss_mlp": 1.03190362, + "epoch": 0.23309785059371713, + "flos": 29669289435360.0, + "grad_norm": 1.8831752002176807, + "language_loss": 0.70636654, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.72836834, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.16149902, + "step": 3877, + "time_per_iteration": 2.7965309619903564 + }, + { + "auxiliary_loss_clip": 0.01150348, + "auxiliary_loss_mlp": 0.01048498, + "balance_loss_clip": 1.05446529, + "balance_loss_mlp": 1.03194559, + "epoch": 0.2331579738463851, + "flos": 19514675921760.0, + "grad_norm": 3.254479411024824, + "language_loss": 0.80892384, + "learning_rate": 3.582439259339073e-06, + "loss": 0.83091229, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.16546631, + "step": 3878, + "time_per_iteration": 2.7363669872283936 + }, + { + "auxiliary_loss_clip": 0.01153182, + "auxiliary_loss_mlp": 0.01043747, + "balance_loss_clip": 1.05436349, + "balance_loss_mlp": 1.02635396, + "epoch": 0.23321809709905306, + "flos": 44449667606880.0, + "grad_norm": 1.995144687756534, + "language_loss": 0.74727595, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.76924521, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.1739502, + "step": 3879, + "time_per_iteration": 2.780118227005005 + }, + { + "auxiliary_loss_clip": 0.01145885, + "auxiliary_loss_mlp": 0.0104118, + "balance_loss_clip": 1.05072737, + "balance_loss_mlp": 1.02556336, + "epoch": 0.23327822035172102, + "flos": 26020540310400.0, + "grad_norm": 4.577500049286483, + "language_loss": 0.90024865, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.92211932, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.15625, + "step": 3880, + "time_per_iteration": 2.722311019897461 + }, + { + "auxiliary_loss_clip": 0.01149937, + "auxiliary_loss_mlp": 0.01050697, + "balance_loss_clip": 1.05390227, + "balance_loss_mlp": 1.03496194, + "epoch": 0.233338343604389, + "flos": 23393450410560.0, + "grad_norm": 1.6752274873191189, + "language_loss": 0.72172242, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.74372876, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 0.95996094, + "router_z_loss_mlp": 0.15734863, + "step": 3881, + "time_per_iteration": 2.7276768684387207 + }, + { + "auxiliary_loss_clip": 0.01147476, + "auxiliary_loss_mlp": 0.01038152, + "balance_loss_clip": 1.0531683, + "balance_loss_mlp": 1.02265501, + "epoch": 0.23339846685705698, + "flos": 32833967436000.0, + "grad_norm": 1.7343523579395335, + "language_loss": 0.67826724, + "learning_rate": 3.581486106120537e-06, + "loss": 0.70012355, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.1550293, + "step": 3882, + "time_per_iteration": 2.7633585929870605 + }, + { + "auxiliary_loss_clip": 0.01150726, + "auxiliary_loss_mlp": 0.01048095, + "balance_loss_clip": 1.05281091, + "balance_loss_mlp": 1.03237128, + "epoch": 0.23345859010972494, + "flos": 39466064617920.0, + "grad_norm": 2.075200999798526, + "language_loss": 0.76927119, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.79125941, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 0.98046875, + "router_z_loss_mlp": 0.15734863, + "step": 3883, + "time_per_iteration": 2.7607638835906982 + }, + { + "auxiliary_loss_clip": 0.01052682, + "auxiliary_loss_mlp": 0.01003477, + "balance_loss_clip": 1.02098811, + "balance_loss_mlp": 1.00165153, + "epoch": 0.2335187133623929, + "flos": 71360454487680.0, + "grad_norm": 0.7834342175314443, + "language_loss": 0.59077817, + "learning_rate": 3.58100916965445e-06, + "loss": 0.61133975, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 0.31713867, + "router_z_loss_mlp": 0.01823425, + "step": 3884, + "time_per_iteration": 7.093369007110596 + }, + { + "auxiliary_loss_clip": 0.01146789, + "auxiliary_loss_mlp": 0.01037426, + "balance_loss_clip": 1.0522213, + "balance_loss_mlp": 1.02196431, + "epoch": 0.23357883661506088, + "flos": 29897734625280.0, + "grad_norm": 2.1601110260071725, + "language_loss": 0.80228019, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82412231, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.15441895, + "step": 3885, + "time_per_iteration": 2.7374603748321533 + }, + { + "auxiliary_loss_clip": 0.01146806, + "auxiliary_loss_mlp": 0.01035906, + "balance_loss_clip": 1.05247128, + "balance_loss_mlp": 1.02044499, + "epoch": 0.23363895986772884, + "flos": 23121293425920.0, + "grad_norm": 4.362802816612345, + "language_loss": 0.88284242, + "learning_rate": 3.580531993380261e-06, + "loss": 0.90466946, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 0.94335938, + "router_z_loss_mlp": 0.15460205, + "step": 3886, + "time_per_iteration": 2.664602756500244 + }, + { + "auxiliary_loss_clip": 0.01149147, + "auxiliary_loss_mlp": 0.01036857, + "balance_loss_clip": 1.05377412, + "balance_loss_mlp": 1.02121687, + "epoch": 0.2336990831203968, + "flos": 38665111782240.0, + "grad_norm": 2.4115384924674617, + "language_loss": 0.73447025, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75633031, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.15637207, + "step": 3887, + "time_per_iteration": 4.316988945007324 + }, + { + "auxiliary_loss_clip": 0.01145805, + "auxiliary_loss_mlp": 0.01039988, + "balance_loss_clip": 1.05115509, + "balance_loss_mlp": 1.02496791, + "epoch": 0.23375920637306477, + "flos": 33811995900960.0, + "grad_norm": 1.8449024702070178, + "language_loss": 0.83797896, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.85983688, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.15026855, + "step": 3888, + "time_per_iteration": 4.17170262336731 + }, + { + "auxiliary_loss_clip": 0.01146954, + "auxiliary_loss_mlp": 0.01049462, + "balance_loss_clip": 1.05345428, + "balance_loss_mlp": 1.03388143, + "epoch": 0.23381932962573276, + "flos": 21567637484640.0, + "grad_norm": 2.2992579827461714, + "language_loss": 0.87422478, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89618897, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.15570068, + "step": 3889, + "time_per_iteration": 2.691225528717041 + }, + { + "auxiliary_loss_clip": 0.01151147, + "auxiliary_loss_mlp": 0.01038645, + "balance_loss_clip": 1.05353558, + "balance_loss_mlp": 1.02327919, + "epoch": 0.23387945287840073, + "flos": 17559388820160.0, + "grad_norm": 3.0527256058419865, + "language_loss": 0.77022272, + "learning_rate": 3.579576921697125e-06, + "loss": 0.79212064, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 0.97607422, + "router_z_loss_mlp": 0.15380859, + "step": 3890, + "time_per_iteration": 2.641549825668335 + }, + { + "auxiliary_loss_clip": 0.01148133, + "auxiliary_loss_mlp": 0.01041192, + "balance_loss_clip": 1.05334711, + "balance_loss_mlp": 1.02575421, + "epoch": 0.2339395761310687, + "flos": 56250789966720.0, + "grad_norm": 1.999958200767432, + "language_loss": 0.73414898, + "learning_rate": 3.579338004009412e-06, + "loss": 0.75604224, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.15441895, + "step": 3891, + "time_per_iteration": 2.8712966442108154 + }, + { + "auxiliary_loss_clip": 0.01144337, + "auxiliary_loss_mlp": 0.01034256, + "balance_loss_clip": 1.05249572, + "balance_loss_mlp": 1.01902711, + "epoch": 0.23399969938373666, + "flos": 27310790999520.0, + "grad_norm": 1.8991869205709968, + "language_loss": 0.82425475, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.84604073, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.15234375, + "step": 3892, + "time_per_iteration": 2.7143003940582275 + }, + { + "auxiliary_loss_clip": 0.01149048, + "auxiliary_loss_mlp": 0.01036214, + "balance_loss_clip": 1.05286169, + "balance_loss_mlp": 1.01991773, + "epoch": 0.23405982263640462, + "flos": 53090852487840.0, + "grad_norm": 1.6786326850893492, + "language_loss": 0.6472739, + "learning_rate": 3.578859988977082e-06, + "loss": 0.66912651, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.16278076, + "step": 3893, + "time_per_iteration": 2.857736587524414 + }, + { + "auxiliary_loss_clip": 0.01145645, + "auxiliary_loss_mlp": 0.01033164, + "balance_loss_clip": 1.05396676, + "balance_loss_mlp": 1.01704681, + "epoch": 0.2341199458890726, + "flos": 27533644804800.0, + "grad_norm": 2.0556644366418393, + "language_loss": 0.79063368, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.8124218, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 0.91748047, + "router_z_loss_mlp": 0.16125488, + "step": 3894, + "time_per_iteration": 2.728464365005493 + }, + { + "auxiliary_loss_clip": 0.01144834, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.05099356, + "balance_loss_mlp": 1.01980102, + "epoch": 0.23418006914174055, + "flos": 31278609768960.0, + "grad_norm": 1.5700358957638425, + "language_loss": 0.81737018, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.83916152, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.14508057, + "step": 3895, + "time_per_iteration": 2.741349697113037 + }, + { + "auxiliary_loss_clip": 0.0114761, + "auxiliary_loss_mlp": 0.01043136, + "balance_loss_clip": 1.05302918, + "balance_loss_mlp": 1.02741182, + "epoch": 0.23424019239440855, + "flos": 16528896828000.0, + "grad_norm": 1.8934805514102557, + "language_loss": 0.79791492, + "learning_rate": 3.578142517422292e-06, + "loss": 0.81982243, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.15734863, + "step": 3896, + "time_per_iteration": 2.65755558013916 + }, + { + "auxiliary_loss_clip": 0.01149077, + "auxiliary_loss_mlp": 0.01040257, + "balance_loss_clip": 1.05292904, + "balance_loss_mlp": 1.02453327, + "epoch": 0.2343003156470765, + "flos": 27356407106400.0, + "grad_norm": 1.6397800590715812, + "language_loss": 0.83027673, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85217011, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.15710449, + "step": 3897, + "time_per_iteration": 2.667942762374878 + }, + { + "auxiliary_loss_clip": 0.01149388, + "auxiliary_loss_mlp": 0.01045387, + "balance_loss_clip": 1.0520401, + "balance_loss_mlp": 1.02944839, + "epoch": 0.23436043889974448, + "flos": 18047592707040.0, + "grad_norm": 1.6351996993656528, + "language_loss": 0.78836215, + "learning_rate": 3.577663903820705e-06, + "loss": 0.81030989, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.15942383, + "step": 3898, + "time_per_iteration": 2.8046305179595947 + }, + { + "auxiliary_loss_clip": 0.0114503, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_clip": 1.05428696, + "balance_loss_mlp": 1.03034401, + "epoch": 0.23442056215241244, + "flos": 28023023692800.0, + "grad_norm": 1.9291700707149155, + "language_loss": 0.73480058, + "learning_rate": 3.577424507277614e-06, + "loss": 0.75670725, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.15307617, + "step": 3899, + "time_per_iteration": 2.68171763420105 + }, + { + "auxiliary_loss_clip": 0.01146824, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_clip": 1.05230427, + "balance_loss_mlp": 1.02743888, + "epoch": 0.2344806854050804, + "flos": 28152903041280.0, + "grad_norm": 2.11173664558176, + "language_loss": 0.75418341, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77608669, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.16064453, + "step": 3900, + "time_per_iteration": 2.8440628051757812 + }, + { + "auxiliary_loss_clip": 0.01147465, + "auxiliary_loss_mlp": 0.01047126, + "balance_loss_clip": 1.05318093, + "balance_loss_mlp": 1.03154588, + "epoch": 0.23454080865774837, + "flos": 19921898466720.0, + "grad_norm": 1.915663859974847, + "language_loss": 0.67169261, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.69363856, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.15588379, + "step": 3901, + "time_per_iteration": 2.7402656078338623 + }, + { + "auxiliary_loss_clip": 0.01053984, + "auxiliary_loss_mlp": 0.01004699, + "balance_loss_clip": 1.02195358, + "balance_loss_mlp": 1.0029763, + "epoch": 0.23460093191041637, + "flos": 82683131274720.0, + "grad_norm": 0.7628766192534039, + "language_loss": 0.58152229, + "learning_rate": 3.576705958788091e-06, + "loss": 0.60210907, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 0.32055664, + "router_z_loss_mlp": 0.01725769, + "step": 3902, + "time_per_iteration": 3.215385913848877 + }, + { + "auxiliary_loss_clip": 0.01148448, + "auxiliary_loss_mlp": 0.0104449, + "balance_loss_clip": 1.05438256, + "balance_loss_mlp": 1.02836657, + "epoch": 0.23466105516308433, + "flos": 24499291842720.0, + "grad_norm": 2.4574199050232055, + "language_loss": 0.8001377, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82206708, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.16119385, + "step": 3903, + "time_per_iteration": 2.6523122787475586 + }, + { + "auxiliary_loss_clip": 0.01145587, + "auxiliary_loss_mlp": 0.01039626, + "balance_loss_clip": 1.04977596, + "balance_loss_mlp": 1.0239377, + "epoch": 0.2347211784157523, + "flos": 29938772279520.0, + "grad_norm": 2.124153321473898, + "language_loss": 0.82027924, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.84213138, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 0.95849609, + "router_z_loss_mlp": 0.15698242, + "step": 3904, + "time_per_iteration": 2.668076276779175 + }, + { + "auxiliary_loss_clip": 0.01146312, + "auxiliary_loss_mlp": 0.01039065, + "balance_loss_clip": 1.05363274, + "balance_loss_mlp": 1.02405667, + "epoch": 0.23478130166842026, + "flos": 29047235506560.0, + "grad_norm": 2.0685014675815308, + "language_loss": 0.7106365, + "learning_rate": 3.57598687219895e-06, + "loss": 0.7324903, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.15002441, + "step": 3905, + "time_per_iteration": 2.691004514694214 + }, + { + "auxiliary_loss_clip": 0.01144885, + "auxiliary_loss_mlp": 0.0103266, + "balance_loss_clip": 1.05309057, + "balance_loss_mlp": 1.01785398, + "epoch": 0.23484142492108823, + "flos": 29399685039360.0, + "grad_norm": 1.8026834427489569, + "language_loss": 0.71128577, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.73306119, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.14813232, + "step": 3906, + "time_per_iteration": 2.6755828857421875 + }, + { + "auxiliary_loss_clip": 0.01152187, + "auxiliary_loss_mlp": 0.01036499, + "balance_loss_clip": 1.05140686, + "balance_loss_mlp": 1.01916599, + "epoch": 0.2349015481737562, + "flos": 35503432060320.0, + "grad_norm": 2.1073101404979724, + "language_loss": 0.73630965, + "learning_rate": 3.575507182316473e-06, + "loss": 0.75819647, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 1.00878906, + "router_z_loss_mlp": 0.17333984, + "step": 3907, + "time_per_iteration": 2.7636423110961914 + }, + { + "auxiliary_loss_clip": 0.01146475, + "auxiliary_loss_mlp": 0.01043346, + "balance_loss_clip": 1.05179715, + "balance_loss_mlp": 1.02774143, + "epoch": 0.23496167142642416, + "flos": 23081592841920.0, + "grad_norm": 1.6678675557932283, + "language_loss": 0.72551388, + "learning_rate": 3.575267247755601e-06, + "loss": 0.74741209, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.15600586, + "step": 3908, + "time_per_iteration": 2.6561031341552734 + }, + { + "auxiliary_loss_clip": 0.01055703, + "auxiliary_loss_mlp": 0.01001491, + "balance_loss_clip": 1.02372921, + "balance_loss_mlp": 0.99957192, + "epoch": 0.23502179467909215, + "flos": 68170372152480.0, + "grad_norm": 1.02064882240048, + "language_loss": 0.73315305, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75372505, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 0.31982422, + "router_z_loss_mlp": 0.01916504, + "step": 3909, + "time_per_iteration": 3.0745744705200195 + }, + { + "auxiliary_loss_clip": 0.01147993, + "auxiliary_loss_mlp": 0.01039539, + "balance_loss_clip": 1.0541122, + "balance_loss_mlp": 1.02387488, + "epoch": 0.23508191793176011, + "flos": 28555304029920.0, + "grad_norm": 1.701783934765713, + "language_loss": 0.88146561, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.90334094, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.15673828, + "step": 3910, + "time_per_iteration": 2.7602155208587646 + }, + { + "auxiliary_loss_clip": 0.01149645, + "auxiliary_loss_mlp": 0.01040766, + "balance_loss_clip": 1.05415606, + "balance_loss_mlp": 1.02510178, + "epoch": 0.23514204118442808, + "flos": 24462143847360.0, + "grad_norm": 2.6220934144528103, + "language_loss": 0.75969684, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.78160095, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.15673828, + "step": 3911, + "time_per_iteration": 2.6408610343933105 + }, + { + "auxiliary_loss_clip": 0.01144225, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.05267453, + "balance_loss_mlp": 1.02364278, + "epoch": 0.23520216443709605, + "flos": 26332681500000.0, + "grad_norm": 1.6741387824283138, + "language_loss": 0.81611276, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.83793455, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.14312744, + "step": 3912, + "time_per_iteration": 2.6548562049865723 + }, + { + "auxiliary_loss_clip": 0.01145412, + "auxiliary_loss_mlp": 0.01049175, + "balance_loss_clip": 1.0533402, + "balance_loss_mlp": 1.03315902, + "epoch": 0.235262287689764, + "flos": 28291493604960.0, + "grad_norm": 1.8511325224975756, + "language_loss": 0.71469694, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73664284, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 0.92089844, + "router_z_loss_mlp": 0.16003418, + "step": 3913, + "time_per_iteration": 2.649200439453125 + }, + { + "auxiliary_loss_clip": 0.01158659, + "auxiliary_loss_mlp": 0.01045519, + "balance_loss_clip": 1.05832374, + "balance_loss_mlp": 1.02848411, + "epoch": 0.23532241094243198, + "flos": 28290845328480.0, + "grad_norm": 2.0734389008907557, + "language_loss": 0.76280725, + "learning_rate": 3.57382638628884e-06, + "loss": 0.78484905, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.17028809, + "step": 3914, + "time_per_iteration": 2.689628839492798 + }, + { + "auxiliary_loss_clip": 0.01147387, + "auxiliary_loss_mlp": 0.01039442, + "balance_loss_clip": 1.05292416, + "balance_loss_mlp": 1.02364647, + "epoch": 0.23538253419509997, + "flos": 20766725166240.0, + "grad_norm": 2.41275387512308, + "language_loss": 0.89690506, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.91877335, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.15795898, + "step": 3915, + "time_per_iteration": 2.6149280071258545 + }, + { + "auxiliary_loss_clip": 0.01052474, + "auxiliary_loss_mlp": 0.01002117, + "balance_loss_clip": 1.02061164, + "balance_loss_mlp": 1.00021267, + "epoch": 0.23544265744776793, + "flos": 77421496295520.0, + "grad_norm": 0.8180108098391691, + "language_loss": 0.59402484, + "learning_rate": 3.573345621598854e-06, + "loss": 0.6145708, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 0.31860352, + "router_z_loss_mlp": 0.01901245, + "step": 3916, + "time_per_iteration": 3.250190019607544 + }, + { + "auxiliary_loss_clip": 0.01051026, + "auxiliary_loss_mlp": 0.01002224, + "balance_loss_clip": 1.01913536, + "balance_loss_mlp": 1.00054288, + "epoch": 0.2355027807004359, + "flos": 86042590956000.0, + "grad_norm": 0.7621410863030778, + "language_loss": 0.49430618, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51483864, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 0.31982422, + "router_z_loss_mlp": 0.0168457, + "step": 3917, + "time_per_iteration": 3.271944046020508 + }, + { + "auxiliary_loss_clip": 0.01150065, + "auxiliary_loss_mlp": 0.01053133, + "balance_loss_clip": 1.05331159, + "balance_loss_mlp": 1.03771889, + "epoch": 0.23556290395310386, + "flos": 26154187765920.0, + "grad_norm": 3.0225918908525404, + "language_loss": 0.76519823, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.78723025, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.1541748, + "step": 3918, + "time_per_iteration": 2.755047082901001 + }, + { + "auxiliary_loss_clip": 0.01153044, + "auxiliary_loss_mlp": 0.01043888, + "balance_loss_clip": 1.05429626, + "balance_loss_mlp": 1.02840304, + "epoch": 0.23562302720577183, + "flos": 22190947449120.0, + "grad_norm": 1.9807993183540111, + "language_loss": 0.68965864, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.71162796, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 0.98779297, + "router_z_loss_mlp": 0.15478516, + "step": 3919, + "time_per_iteration": 2.654099702835083 + }, + { + "auxiliary_loss_clip": 0.01144021, + "auxiliary_loss_mlp": 0.01035903, + "balance_loss_clip": 1.05236602, + "balance_loss_mlp": 1.02093029, + "epoch": 0.2356831504584398, + "flos": 41158918882080.0, + "grad_norm": 1.725533275849243, + "language_loss": 0.70498633, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.7267856, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.14953613, + "step": 3920, + "time_per_iteration": 2.786074161529541 + }, + { + "auxiliary_loss_clip": 0.01143849, + "auxiliary_loss_mlp": 0.01046068, + "balance_loss_clip": 1.05197263, + "balance_loss_mlp": 1.03121459, + "epoch": 0.23574327371110776, + "flos": 30423694266720.0, + "grad_norm": 2.153550937080782, + "language_loss": 0.77252585, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79442507, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.1484375, + "step": 3921, + "time_per_iteration": 2.726545810699463 + }, + { + "auxiliary_loss_clip": 0.01148243, + "auxiliary_loss_mlp": 0.01043426, + "balance_loss_clip": 1.053496, + "balance_loss_mlp": 1.02727365, + "epoch": 0.23580339696377575, + "flos": 21746333805120.0, + "grad_norm": 2.4205999302629846, + "language_loss": 0.75265491, + "learning_rate": 3.571901895946612e-06, + "loss": 0.77457166, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.16162109, + "step": 3922, + "time_per_iteration": 2.6719508171081543 + }, + { + "auxiliary_loss_clip": 0.01145266, + "auxiliary_loss_mlp": 0.01041896, + "balance_loss_clip": 1.05179548, + "balance_loss_mlp": 1.02691174, + "epoch": 0.23586352021644372, + "flos": 32075389324800.0, + "grad_norm": 3.0102954891287177, + "language_loss": 0.80135232, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82322395, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.14978027, + "step": 3923, + "time_per_iteration": 4.38702654838562 + }, + { + "auxiliary_loss_clip": 0.01144159, + "auxiliary_loss_mlp": 0.01045565, + "balance_loss_clip": 1.0518837, + "balance_loss_mlp": 1.03076565, + "epoch": 0.23592364346911168, + "flos": 17383326122880.0, + "grad_norm": 1.7309802371721945, + "language_loss": 0.74486035, + "learning_rate": 3.571420177111754e-06, + "loss": 0.76675761, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.14807129, + "step": 3924, + "time_per_iteration": 4.1375885009765625 + }, + { + "auxiliary_loss_clip": 0.01147293, + "auxiliary_loss_mlp": 0.01043934, + "balance_loss_clip": 1.05455697, + "balance_loss_mlp": 1.0291996, + "epoch": 0.23598376672177965, + "flos": 22680893579040.0, + "grad_norm": 2.215636770680996, + "language_loss": 0.82477713, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.84668934, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.14733887, + "step": 3925, + "time_per_iteration": 2.6641993522644043 + }, + { + "auxiliary_loss_clip": 0.01149221, + "auxiliary_loss_mlp": 0.01046842, + "balance_loss_clip": 1.05417132, + "balance_loss_mlp": 1.03066516, + "epoch": 0.2360438899744476, + "flos": 27668791399680.0, + "grad_norm": 1.803226236789561, + "language_loss": 0.59968638, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.621647, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.16186523, + "step": 3926, + "time_per_iteration": 4.105268955230713 + }, + { + "auxiliary_loss_clip": 0.01140568, + "auxiliary_loss_mlp": 0.01040246, + "balance_loss_clip": 1.05017567, + "balance_loss_mlp": 1.02579808, + "epoch": 0.23610401322711558, + "flos": 36082219884480.0, + "grad_norm": 1.8028975922754968, + "language_loss": 0.71556282, + "learning_rate": 3.570697151969235e-06, + "loss": 0.73737091, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.14453125, + "step": 3927, + "time_per_iteration": 2.8559563159942627 + }, + { + "auxiliary_loss_clip": 0.01145411, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_clip": 1.05299878, + "balance_loss_mlp": 1.0321331, + "epoch": 0.23616413647978354, + "flos": 21790774910880.0, + "grad_norm": 3.016286651574217, + "language_loss": 0.747473, + "learning_rate": 3.570456024454221e-06, + "loss": 0.76938999, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 0.92333984, + "router_z_loss_mlp": 0.14160156, + "step": 3928, + "time_per_iteration": 4.109726190567017 + }, + { + "auxiliary_loss_clip": 0.01147691, + "auxiliary_loss_mlp": 0.01045009, + "balance_loss_clip": 1.05383539, + "balance_loss_mlp": 1.02835608, + "epoch": 0.23622425973245154, + "flos": 13464081221760.0, + "grad_norm": 2.4381165658193407, + "language_loss": 0.80953991, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.83146703, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.16638184, + "step": 3929, + "time_per_iteration": 2.6981194019317627 + }, + { + "auxiliary_loss_clip": 0.01156215, + "auxiliary_loss_mlp": 0.01053044, + "balance_loss_clip": 1.05596924, + "balance_loss_mlp": 1.03561544, + "epoch": 0.2362843829851195, + "flos": 28557897135840.0, + "grad_norm": 1.8319775733463095, + "language_loss": 0.7192024, + "learning_rate": 3.569973590777789e-06, + "loss": 0.74129498, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 1.00146484, + "router_z_loss_mlp": 0.17443848, + "step": 3930, + "time_per_iteration": 2.7940711975097656 + }, + { + "auxiliary_loss_clip": 0.01148804, + "auxiliary_loss_mlp": 0.01036164, + "balance_loss_clip": 1.0537523, + "balance_loss_mlp": 1.0203805, + "epoch": 0.23634450623778747, + "flos": 48235913328960.0, + "grad_norm": 1.8416762875591906, + "language_loss": 0.74222225, + "learning_rate": 3.569732284634665e-06, + "loss": 0.76407194, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.15783691, + "step": 3931, + "time_per_iteration": 2.89750599861145 + }, + { + "auxiliary_loss_clip": 0.0115017, + "auxiliary_loss_mlp": 0.01045453, + "balance_loss_clip": 1.05586052, + "balance_loss_mlp": 1.02798879, + "epoch": 0.23640462949045543, + "flos": 29537870430240.0, + "grad_norm": 2.5826830970609294, + "language_loss": 0.80660641, + "learning_rate": 3.569490918967136e-06, + "loss": 0.82856262, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.17468262, + "step": 3932, + "time_per_iteration": 2.679919958114624 + }, + { + "auxiliary_loss_clip": 0.01145495, + "auxiliary_loss_mlp": 0.01036226, + "balance_loss_clip": 1.05321574, + "balance_loss_mlp": 1.0224272, + "epoch": 0.2364647527431234, + "flos": 31942349628480.0, + "grad_norm": 4.101415982522932, + "language_loss": 0.85491771, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.87673491, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.13806152, + "step": 3933, + "time_per_iteration": 2.70910906791687 + }, + { + "auxiliary_loss_clip": 0.01152579, + "auxiliary_loss_mlp": 0.0104189, + "balance_loss_clip": 1.05701566, + "balance_loss_mlp": 1.02514088, + "epoch": 0.23652487599579136, + "flos": 27621959774400.0, + "grad_norm": 2.2849290000577542, + "language_loss": 0.83029139, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.85223603, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.16748047, + "step": 3934, + "time_per_iteration": 2.6615376472473145 + }, + { + "auxiliary_loss_clip": 0.01149953, + "auxiliary_loss_mlp": 0.0104161, + "balance_loss_clip": 1.05420089, + "balance_loss_mlp": 1.02511144, + "epoch": 0.23658499924845935, + "flos": 26554441338720.0, + "grad_norm": 1.7124199787078873, + "language_loss": 0.78458613, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.80650175, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.16503906, + "step": 3935, + "time_per_iteration": 2.6934075355529785 + }, + { + "auxiliary_loss_clip": 0.01146346, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.05441856, + "balance_loss_mlp": 1.01996338, + "epoch": 0.23664512250112732, + "flos": 26599247100000.0, + "grad_norm": 1.643789764204548, + "language_loss": 0.79257154, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.81438756, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 0.91894531, + "router_z_loss_mlp": 0.15307617, + "step": 3936, + "time_per_iteration": 2.639742374420166 + }, + { + "auxiliary_loss_clip": 0.01146656, + "auxiliary_loss_mlp": 0.01040674, + "balance_loss_clip": 1.0520606, + "balance_loss_mlp": 1.02429509, + "epoch": 0.23670524575379528, + "flos": 27622932189120.0, + "grad_norm": 1.4745826703694844, + "language_loss": 0.78867209, + "learning_rate": 3.568283198083826e-06, + "loss": 0.81054544, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.16369629, + "step": 3937, + "time_per_iteration": 2.730799913406372 + }, + { + "auxiliary_loss_clip": 0.01143775, + "auxiliary_loss_mlp": 0.01039159, + "balance_loss_clip": 1.0536164, + "balance_loss_mlp": 1.02404904, + "epoch": 0.23676536900646325, + "flos": 20407792868640.0, + "grad_norm": 2.04016146188898, + "language_loss": 0.85236049, + "learning_rate": 3.568041475462147e-06, + "loss": 0.87418991, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.15112305, + "step": 3938, + "time_per_iteration": 2.671109437942505 + }, + { + "auxiliary_loss_clip": 0.01143786, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_clip": 1.05223346, + "balance_loss_mlp": 1.0284853, + "epoch": 0.23682549225913122, + "flos": 13587842460960.0, + "grad_norm": 2.362345869582214, + "language_loss": 0.93412626, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.95600384, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.15490723, + "step": 3939, + "time_per_iteration": 2.6620705127716064 + }, + { + "auxiliary_loss_clip": 0.01150087, + "auxiliary_loss_mlp": 0.01044787, + "balance_loss_clip": 1.05339289, + "balance_loss_mlp": 1.02757275, + "epoch": 0.23688561551179918, + "flos": 27527486178240.0, + "grad_norm": 2.7162740849333784, + "language_loss": 0.82168192, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84363061, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.17199707, + "step": 3940, + "time_per_iteration": 2.698158025741577 + }, + { + "auxiliary_loss_clip": 0.01154059, + "auxiliary_loss_mlp": 0.01045044, + "balance_loss_clip": 1.0549289, + "balance_loss_mlp": 1.02817631, + "epoch": 0.23694573876446715, + "flos": 22591768263840.0, + "grad_norm": 2.358551518828182, + "language_loss": 0.89082038, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.9128114, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 0.99267578, + "router_z_loss_mlp": 0.16870117, + "step": 3941, + "time_per_iteration": 2.658719778060913 + }, + { + "auxiliary_loss_clip": 0.01147518, + "auxiliary_loss_mlp": 0.01041976, + "balance_loss_clip": 1.05078351, + "balance_loss_mlp": 1.02442884, + "epoch": 0.23700586201713514, + "flos": 18712750671360.0, + "grad_norm": 1.9322881639176106, + "language_loss": 0.84805954, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.86995441, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.17541504, + "step": 3942, + "time_per_iteration": 2.605778932571411 + }, + { + "auxiliary_loss_clip": 0.01150958, + "auxiliary_loss_mlp": 0.01041635, + "balance_loss_clip": 1.05395985, + "balance_loss_mlp": 1.02466023, + "epoch": 0.2370659852698031, + "flos": 29221515443520.0, + "grad_norm": 2.0696898025905486, + "language_loss": 0.81060207, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.832528, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 0.97119141, + "router_z_loss_mlp": 0.1697998, + "step": 3943, + "time_per_iteration": 2.8062353134155273 + }, + { + "auxiliary_loss_clip": 0.01152303, + "auxiliary_loss_mlp": 0.01041915, + "balance_loss_clip": 1.05110419, + "balance_loss_mlp": 1.02392602, + "epoch": 0.23712610852247107, + "flos": 18709144633440.0, + "grad_norm": 2.2615330784182848, + "language_loss": 0.67498481, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69692695, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 1.01171875, + "router_z_loss_mlp": 0.17993164, + "step": 3944, + "time_per_iteration": 2.7331998348236084 + }, + { + "auxiliary_loss_clip": 0.011508, + "auxiliary_loss_mlp": 0.01039757, + "balance_loss_clip": 1.05390167, + "balance_loss_mlp": 1.02302027, + "epoch": 0.23718623177513903, + "flos": 24016638823200.0, + "grad_norm": 1.8270279675111536, + "language_loss": 0.75426894, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77617455, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.1673584, + "step": 3945, + "time_per_iteration": 2.6499037742614746 + }, + { + "auxiliary_loss_clip": 0.01149219, + "auxiliary_loss_mlp": 0.01040258, + "balance_loss_clip": 1.05476499, + "balance_loss_mlp": 1.02474844, + "epoch": 0.237246355027807, + "flos": 30472794859680.0, + "grad_norm": 1.8987033771023427, + "language_loss": 0.63385522, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.65575004, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.15515137, + "step": 3946, + "time_per_iteration": 2.8137094974517822 + }, + { + "auxiliary_loss_clip": 0.01148253, + "auxiliary_loss_mlp": 0.01039396, + "balance_loss_clip": 1.05200386, + "balance_loss_mlp": 1.02295709, + "epoch": 0.23730647828047496, + "flos": 18763147817280.0, + "grad_norm": 2.208655981057857, + "language_loss": 0.77076566, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79264212, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.16455078, + "step": 3947, + "time_per_iteration": 2.5976715087890625 + }, + { + "auxiliary_loss_clip": 0.01151937, + "auxiliary_loss_mlp": 0.01041461, + "balance_loss_clip": 1.055619, + "balance_loss_mlp": 1.02439046, + "epoch": 0.23736660153314296, + "flos": 34349746070880.0, + "grad_norm": 1.5205118272977374, + "language_loss": 0.80585968, + "learning_rate": 3.565620980442944e-06, + "loss": 0.82779372, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.17077637, + "step": 3948, + "time_per_iteration": 2.721524238586426 + }, + { + "auxiliary_loss_clip": 0.01149972, + "auxiliary_loss_mlp": 0.01044181, + "balance_loss_clip": 1.05322182, + "balance_loss_mlp": 1.02819514, + "epoch": 0.23742672478581092, + "flos": 26950562148960.0, + "grad_norm": 2.0055775288559836, + "language_loss": 0.80272263, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.82466412, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 0.96679688, + "router_z_loss_mlp": 0.15991211, + "step": 3949, + "time_per_iteration": 2.6270015239715576 + }, + { + "auxiliary_loss_clip": 0.01150454, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.05175471, + "balance_loss_mlp": 1.0199908, + "epoch": 0.2374868480384789, + "flos": 23839076986560.0, + "grad_norm": 1.7029287812056633, + "language_loss": 0.72951698, + "learning_rate": 3.565136168723163e-06, + "loss": 0.75138372, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.16223145, + "step": 3950, + "time_per_iteration": 2.643173933029175 + }, + { + "auxiliary_loss_clip": 0.01144617, + "auxiliary_loss_mlp": 0.01035421, + "balance_loss_clip": 1.05102229, + "balance_loss_mlp": 1.02121115, + "epoch": 0.23754697129114685, + "flos": 23699716594560.0, + "grad_norm": 2.010434969609176, + "language_loss": 0.73224187, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75404227, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.14202881, + "step": 3951, + "time_per_iteration": 2.609771490097046 + }, + { + "auxiliary_loss_clip": 0.01151717, + "auxiliary_loss_mlp": 0.01040498, + "balance_loss_clip": 1.05490351, + "balance_loss_mlp": 1.02357054, + "epoch": 0.23760709454381482, + "flos": 23795000536320.0, + "grad_norm": 1.788875123805153, + "language_loss": 0.73661757, + "learning_rate": 3.564651119602903e-06, + "loss": 0.75853974, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 0.96826172, + "router_z_loss_mlp": 0.16931152, + "step": 3952, + "time_per_iteration": 2.6843974590301514 + }, + { + "auxiliary_loss_clip": 0.01147896, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.05176032, + "balance_loss_mlp": 1.02171481, + "epoch": 0.23766721779648278, + "flos": 33722749033920.0, + "grad_norm": 1.9172228825777045, + "language_loss": 0.71253264, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73438203, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.15344238, + "step": 3953, + "time_per_iteration": 2.7032155990600586 + }, + { + "auxiliary_loss_clip": 0.01151702, + "auxiliary_loss_mlp": 0.01041988, + "balance_loss_clip": 1.05393302, + "balance_loss_mlp": 1.02467906, + "epoch": 0.23772734104915075, + "flos": 28558221274080.0, + "grad_norm": 1.9702276691534346, + "language_loss": 0.81815004, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.84008694, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.1730957, + "step": 3954, + "time_per_iteration": 2.6622140407562256 + }, + { + "auxiliary_loss_clip": 0.01150569, + "auxiliary_loss_mlp": 0.01044698, + "balance_loss_clip": 1.05314445, + "balance_loss_mlp": 1.02744842, + "epoch": 0.23778746430181874, + "flos": 19163522941920.0, + "grad_norm": 2.298806464105781, + "language_loss": 0.65771896, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.67967165, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 0.97314453, + "router_z_loss_mlp": 0.17236328, + "step": 3955, + "time_per_iteration": 2.593841075897217 + }, + { + "auxiliary_loss_clip": 0.01146229, + "auxiliary_loss_mlp": 0.01049015, + "balance_loss_clip": 1.05199003, + "balance_loss_mlp": 1.033566, + "epoch": 0.2378475875544867, + "flos": 23704173495360.0, + "grad_norm": 1.6275539371389345, + "language_loss": 0.83797967, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.85993212, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 0.94335938, + "router_z_loss_mlp": 0.15441895, + "step": 3956, + "time_per_iteration": 2.69278883934021 + }, + { + "auxiliary_loss_clip": 0.01144573, + "auxiliary_loss_mlp": 0.01038972, + "balance_loss_clip": 1.05277014, + "balance_loss_mlp": 1.02322507, + "epoch": 0.23790771080715467, + "flos": 27172686643200.0, + "grad_norm": 2.009466652967665, + "language_loss": 0.84913254, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.87096798, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 0.91748047, + "router_z_loss_mlp": 0.1574707, + "step": 3957, + "time_per_iteration": 2.639230251312256 + }, + { + "auxiliary_loss_clip": 0.01145974, + "auxiliary_loss_mlp": 0.01044698, + "balance_loss_clip": 1.05141509, + "balance_loss_mlp": 1.02948642, + "epoch": 0.23796783405982264, + "flos": 24461738674560.0, + "grad_norm": 2.1871269070767734, + "language_loss": 0.70352769, + "learning_rate": 3.563194548575151e-06, + "loss": 0.7254343, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.15197754, + "step": 3958, + "time_per_iteration": 2.6240484714508057 + }, + { + "auxiliary_loss_clip": 0.01149641, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.05417061, + "balance_loss_mlp": 1.02076674, + "epoch": 0.2380279573124906, + "flos": 17382475260000.0, + "grad_norm": 2.83471705294584, + "language_loss": 0.6607883, + "learning_rate": 3.562951579215745e-06, + "loss": 0.68265802, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 0.95361328, + "router_z_loss_mlp": 0.16552734, + "step": 3959, + "time_per_iteration": 2.649353265762329 + }, + { + "auxiliary_loss_clip": 0.01147767, + "auxiliary_loss_mlp": 0.01038898, + "balance_loss_clip": 1.05265522, + "balance_loss_mlp": 1.0236392, + "epoch": 0.23808808056515857, + "flos": 25842856921920.0, + "grad_norm": 1.7755011435776595, + "language_loss": 0.72545522, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.7473219, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.15246582, + "step": 3960, + "time_per_iteration": 2.656559705734253 + }, + { + "auxiliary_loss_clip": 0.01145486, + "auxiliary_loss_mlp": 0.01039459, + "balance_loss_clip": 1.05189419, + "balance_loss_mlp": 1.02290118, + "epoch": 0.23814820381782653, + "flos": 27489244216320.0, + "grad_norm": 1.8980890064536302, + "language_loss": 0.74248785, + "learning_rate": 3.562465462704307e-06, + "loss": 0.7643373, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.16552734, + "step": 3961, + "time_per_iteration": 2.659947633743286 + }, + { + "auxiliary_loss_clip": 0.01148761, + "auxiliary_loss_mlp": 0.01044875, + "balance_loss_clip": 1.05112624, + "balance_loss_mlp": 1.02737534, + "epoch": 0.23820832707049452, + "flos": 27216398437920.0, + "grad_norm": 1.8249792236949165, + "language_loss": 0.65740871, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.67934507, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.17492676, + "step": 3962, + "time_per_iteration": 4.27950644493103 + }, + { + "auxiliary_loss_clip": 0.01147506, + "auxiliary_loss_mlp": 0.01046553, + "balance_loss_clip": 1.05259669, + "balance_loss_mlp": 1.03098464, + "epoch": 0.2382684503231625, + "flos": 30339633611520.0, + "grad_norm": 1.7895260697263782, + "language_loss": 0.74665654, + "learning_rate": 3.561979109197483e-06, + "loss": 0.76859713, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 0.94970703, + "router_z_loss_mlp": 0.15557861, + "step": 3963, + "time_per_iteration": 4.287666082382202 + }, + { + "auxiliary_loss_clip": 0.01152388, + "auxiliary_loss_mlp": 0.01041524, + "balance_loss_clip": 1.05562115, + "balance_loss_mlp": 1.02451301, + "epoch": 0.23832857357583045, + "flos": 26687886207840.0, + "grad_norm": 2.013675129746364, + "language_loss": 0.77091676, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79285592, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 0.96875, + "router_z_loss_mlp": 0.17004395, + "step": 3964, + "time_per_iteration": 2.6701130867004395 + }, + { + "auxiliary_loss_clip": 0.0114607, + "auxiliary_loss_mlp": 0.01039632, + "balance_loss_clip": 1.05203879, + "balance_loss_mlp": 1.02423048, + "epoch": 0.23838869682849842, + "flos": 25975775066400.0, + "grad_norm": 1.972417242712152, + "language_loss": 0.71588808, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73774511, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.15405273, + "step": 3965, + "time_per_iteration": 2.6228244304656982 + }, + { + "auxiliary_loss_clip": 0.01143355, + "auxiliary_loss_mlp": 0.01039731, + "balance_loss_clip": 1.05219746, + "balance_loss_mlp": 1.02441812, + "epoch": 0.23844882008116638, + "flos": 19743202146240.0, + "grad_norm": 2.0106671327346706, + "language_loss": 0.7828325, + "learning_rate": 3.561249134732282e-06, + "loss": 0.80466336, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.15289307, + "step": 3966, + "time_per_iteration": 4.063939571380615 + }, + { + "auxiliary_loss_clip": 0.01144843, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.05201757, + "balance_loss_mlp": 1.02303243, + "epoch": 0.23850894333383435, + "flos": 25709290500960.0, + "grad_norm": 2.1563341554581044, + "language_loss": 0.68824315, + "learning_rate": 3.561005691492797e-06, + "loss": 0.71007848, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.15643311, + "step": 3967, + "time_per_iteration": 4.115229845046997 + }, + { + "auxiliary_loss_clip": 0.01147351, + "auxiliary_loss_mlp": 0.01052554, + "balance_loss_clip": 1.05300415, + "balance_loss_mlp": 1.03603137, + "epoch": 0.23856906658650234, + "flos": 20989173798720.0, + "grad_norm": 2.3116939646044226, + "language_loss": 0.67779851, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.69979757, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.16516113, + "step": 3968, + "time_per_iteration": 2.641462802886963 + }, + { + "auxiliary_loss_clip": 0.01144781, + "auxiliary_loss_mlp": 0.01042069, + "balance_loss_clip": 1.05052948, + "balance_loss_mlp": 1.02582026, + "epoch": 0.2386291898391703, + "flos": 35989853186880.0, + "grad_norm": 1.8731922995184849, + "language_loss": 0.76458216, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.78645068, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.16241455, + "step": 3969, + "time_per_iteration": 2.7083520889282227 + }, + { + "auxiliary_loss_clip": 0.01141563, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.05019927, + "balance_loss_mlp": 1.02149963, + "epoch": 0.23868931309183827, + "flos": 25798051160640.0, + "grad_norm": 2.1953591469521028, + "language_loss": 0.76508892, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.78687012, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.1505127, + "step": 3970, + "time_per_iteration": 2.6469225883483887 + }, + { + "auxiliary_loss_clip": 0.01143049, + "auxiliary_loss_mlp": 0.01043625, + "balance_loss_clip": 1.0482285, + "balance_loss_mlp": 1.02687621, + "epoch": 0.23874943634450624, + "flos": 31310652587040.0, + "grad_norm": 2.2958758903505396, + "language_loss": 0.85276687, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.87463355, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.16760254, + "step": 3971, + "time_per_iteration": 2.65158748626709 + }, + { + "auxiliary_loss_clip": 0.0106007, + "auxiliary_loss_mlp": 0.01009135, + "balance_loss_clip": 1.02821779, + "balance_loss_mlp": 1.00717878, + "epoch": 0.2388095595971742, + "flos": 71974283408640.0, + "grad_norm": 0.7497981111251417, + "language_loss": 0.62997472, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.65066677, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 0.31860352, + "router_z_loss_mlp": 0.01954651, + "step": 3972, + "time_per_iteration": 3.366840362548828 + }, + { + "auxiliary_loss_clip": 0.01144182, + "auxiliary_loss_mlp": 0.0103691, + "balance_loss_clip": 1.05022955, + "balance_loss_mlp": 1.02204514, + "epoch": 0.23886968284984217, + "flos": 20499511289760.0, + "grad_norm": 2.244973153919385, + "language_loss": 0.81642735, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.83823824, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.14868164, + "step": 3973, + "time_per_iteration": 2.621389865875244 + }, + { + "auxiliary_loss_clip": 0.01144686, + "auxiliary_loss_mlp": 0.01043214, + "balance_loss_clip": 1.05140615, + "balance_loss_mlp": 1.02728808, + "epoch": 0.23892980610251013, + "flos": 27311479793280.0, + "grad_norm": 1.5555743359606995, + "language_loss": 0.79356658, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.8154456, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.15930176, + "step": 3974, + "time_per_iteration": 2.6584739685058594 + }, + { + "auxiliary_loss_clip": 0.01144497, + "auxiliary_loss_mlp": 0.01048022, + "balance_loss_clip": 1.05001998, + "balance_loss_mlp": 1.03186929, + "epoch": 0.23898992935517813, + "flos": 15646314373920.0, + "grad_norm": 1.929599280724271, + "language_loss": 0.84768665, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.86961186, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.16149902, + "step": 3975, + "time_per_iteration": 2.605910539627075 + }, + { + "auxiliary_loss_clip": 0.01139997, + "auxiliary_loss_mlp": 0.01038192, + "balance_loss_clip": 1.04859018, + "balance_loss_mlp": 1.02346921, + "epoch": 0.2390500526078461, + "flos": 27266390411040.0, + "grad_norm": 2.3033379600250674, + "language_loss": 0.83675373, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.85853553, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.1472168, + "step": 3976, + "time_per_iteration": 2.669680118560791 + }, + { + "auxiliary_loss_clip": 0.01138314, + "auxiliary_loss_mlp": 0.01034588, + "balance_loss_clip": 1.04739892, + "balance_loss_mlp": 1.02053356, + "epoch": 0.23911017586051406, + "flos": 27619852875840.0, + "grad_norm": 1.732960503455735, + "language_loss": 0.74346089, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.76518995, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.14074707, + "step": 3977, + "time_per_iteration": 2.646571159362793 + }, + { + "auxiliary_loss_clip": 0.01144901, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_clip": 1.05090797, + "balance_loss_mlp": 1.02937722, + "epoch": 0.23917029911318202, + "flos": 28862583145920.0, + "grad_norm": 1.7564036092524573, + "language_loss": 0.72078794, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.74269307, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.16235352, + "step": 3978, + "time_per_iteration": 2.6770153045654297 + }, + { + "auxiliary_loss_clip": 0.01149272, + "auxiliary_loss_mlp": 0.01041227, + "balance_loss_clip": 1.05414701, + "balance_loss_mlp": 1.02514589, + "epoch": 0.23923042236585, + "flos": 27800777646720.0, + "grad_norm": 2.0539308152971234, + "language_loss": 0.78678495, + "learning_rate": 3.558079758168997e-06, + "loss": 0.80868995, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.1607666, + "step": 3979, + "time_per_iteration": 2.741199016571045 + }, + { + "auxiliary_loss_clip": 0.01139905, + "auxiliary_loss_mlp": 0.01044708, + "balance_loss_clip": 1.04815161, + "balance_loss_mlp": 1.02850783, + "epoch": 0.23929054561851795, + "flos": 34346423653920.0, + "grad_norm": 1.8167158311674336, + "language_loss": 0.81497288, + "learning_rate": 3.557835546134977e-06, + "loss": 0.83681905, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 0.91748047, + "router_z_loss_mlp": 0.1619873, + "step": 3980, + "time_per_iteration": 2.7969894409179688 + }, + { + "auxiliary_loss_clip": 0.01141257, + "auxiliary_loss_mlp": 0.01035698, + "balance_loss_clip": 1.04982567, + "balance_loss_mlp": 1.02000976, + "epoch": 0.23935066887118592, + "flos": 26462277227520.0, + "grad_norm": 1.7744703576811942, + "language_loss": 0.83497012, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.8567397, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.15698242, + "step": 3981, + "time_per_iteration": 2.7208282947540283 + }, + { + "auxiliary_loss_clip": 0.01146661, + "auxiliary_loss_mlp": 0.01039802, + "balance_loss_clip": 1.05122983, + "balance_loss_mlp": 1.02360153, + "epoch": 0.2394107921238539, + "flos": 39196703325600.0, + "grad_norm": 1.8727315572297003, + "language_loss": 0.77016222, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.79202682, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 0.95361328, + "router_z_loss_mlp": 0.1619873, + "step": 3982, + "time_per_iteration": 2.7108070850372314 + }, + { + "auxiliary_loss_clip": 0.01139481, + "auxiliary_loss_mlp": 0.01040212, + "balance_loss_clip": 1.04894996, + "balance_loss_mlp": 1.02554309, + "epoch": 0.23947091537652188, + "flos": 20766563097120.0, + "grad_norm": 1.9795257366052987, + "language_loss": 0.78158474, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.80338168, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 0.90576172, + "router_z_loss_mlp": 0.14666748, + "step": 3983, + "time_per_iteration": 2.680319309234619 + }, + { + "auxiliary_loss_clip": 0.01142342, + "auxiliary_loss_mlp": 0.01044852, + "balance_loss_clip": 1.04962039, + "balance_loss_mlp": 1.02908099, + "epoch": 0.23953103862918984, + "flos": 25128436295520.0, + "grad_norm": 1.8097364130489326, + "language_loss": 0.73015368, + "learning_rate": 3.556858107358737e-06, + "loss": 0.7520256, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.15777588, + "step": 3984, + "time_per_iteration": 2.731048583984375 + }, + { + "auxiliary_loss_clip": 0.01144472, + "auxiliary_loss_mlp": 0.01038125, + "balance_loss_clip": 1.04913926, + "balance_loss_mlp": 1.02232957, + "epoch": 0.2395911618818578, + "flos": 25263542373120.0, + "grad_norm": 2.124141364823811, + "language_loss": 0.78933382, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.81115979, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 0.95361328, + "router_z_loss_mlp": 0.15795898, + "step": 3985, + "time_per_iteration": 2.7022898197174072 + }, + { + "auxiliary_loss_clip": 0.01146382, + "auxiliary_loss_mlp": 0.01043015, + "balance_loss_clip": 1.05279183, + "balance_loss_mlp": 1.0271244, + "epoch": 0.23965128513452577, + "flos": 33011164617120.0, + "grad_norm": 1.8889590470033297, + "language_loss": 0.73375082, + "learning_rate": 3.556369033716254e-06, + "loss": 0.75564486, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.15887451, + "step": 3986, + "time_per_iteration": 2.7710416316986084 + }, + { + "auxiliary_loss_clip": 0.01147432, + "auxiliary_loss_mlp": 0.01045924, + "balance_loss_clip": 1.05024099, + "balance_loss_mlp": 1.03097534, + "epoch": 0.23971140838719374, + "flos": 28241339562720.0, + "grad_norm": 1.8491825428409614, + "language_loss": 0.87514925, + "learning_rate": 3.556124408363871e-06, + "loss": 0.89708281, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.14929199, + "step": 3987, + "time_per_iteration": 2.6672863960266113 + }, + { + "auxiliary_loss_clip": 0.01135422, + "auxiliary_loss_mlp": 0.01039765, + "balance_loss_clip": 1.04850304, + "balance_loss_mlp": 1.02563882, + "epoch": 0.23977153163986173, + "flos": 22006173536640.0, + "grad_norm": 2.254962648402722, + "language_loss": 0.83384895, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.85560083, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 0.86914062, + "router_z_loss_mlp": 0.14129639, + "step": 3988, + "time_per_iteration": 2.5708415508270264 + }, + { + "auxiliary_loss_clip": 0.01140126, + "auxiliary_loss_mlp": 0.01038409, + "balance_loss_clip": 1.04834247, + "balance_loss_mlp": 1.02296507, + "epoch": 0.2398316548925297, + "flos": 22101822133920.0, + "grad_norm": 1.9118512615521113, + "language_loss": 0.85205144, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.87383676, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.15441895, + "step": 3989, + "time_per_iteration": 2.6224279403686523 + }, + { + "auxiliary_loss_clip": 0.01138432, + "auxiliary_loss_mlp": 0.01038105, + "balance_loss_clip": 1.0476017, + "balance_loss_mlp": 1.0229063, + "epoch": 0.23989177814519766, + "flos": 15334578357120.0, + "grad_norm": 2.556842886361537, + "language_loss": 0.84353018, + "learning_rate": 3.555390178293477e-06, + "loss": 0.86529559, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.15197754, + "step": 3990, + "time_per_iteration": 2.6501598358154297 + }, + { + "auxiliary_loss_clip": 0.01139296, + "auxiliary_loss_mlp": 0.01038439, + "balance_loss_clip": 1.04845798, + "balance_loss_mlp": 1.02385974, + "epoch": 0.23995190139786562, + "flos": 30828283188480.0, + "grad_norm": 1.5209594026986955, + "language_loss": 0.76009154, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.78186893, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.14587402, + "step": 3991, + "time_per_iteration": 2.6797053813934326 + }, + { + "auxiliary_loss_clip": 0.01054257, + "auxiliary_loss_mlp": 0.01003631, + "balance_loss_clip": 1.02263832, + "balance_loss_mlp": 1.00149894, + "epoch": 0.2400120246505336, + "flos": 75604759240320.0, + "grad_norm": 1.0840739068516658, + "language_loss": 0.63821048, + "learning_rate": 3.554900396661656e-06, + "loss": 0.6587894, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.02134705, + "step": 3992, + "time_per_iteration": 3.245239019393921 + }, + { + "auxiliary_loss_clip": 0.01053953, + "auxiliary_loss_mlp": 0.01001653, + "balance_loss_clip": 1.02232087, + "balance_loss_mlp": 0.99952203, + "epoch": 0.24007214790320155, + "flos": 81398390935680.0, + "grad_norm": 0.7584297830769846, + "language_loss": 0.6298058, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65036184, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 0.31689453, + "router_z_loss_mlp": 0.02133179, + "step": 3993, + "time_per_iteration": 3.3657331466674805 + }, + { + "auxiliary_loss_clip": 0.01148207, + "auxiliary_loss_mlp": 0.01042481, + "balance_loss_clip": 1.05359995, + "balance_loss_mlp": 1.02632833, + "epoch": 0.24013227115586952, + "flos": 31492954945440.0, + "grad_norm": 1.7718106646409617, + "language_loss": 0.76691723, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.78882408, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.16162109, + "step": 3994, + "time_per_iteration": 2.677781343460083 + }, + { + "auxiliary_loss_clip": 0.0114531, + "auxiliary_loss_mlp": 0.01045396, + "balance_loss_clip": 1.05107045, + "balance_loss_mlp": 1.02803874, + "epoch": 0.2401923944085375, + "flos": 31185594794880.0, + "grad_norm": 1.679218096768048, + "language_loss": 0.78468478, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80659175, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.17370605, + "step": 3995, + "time_per_iteration": 2.7013511657714844 + }, + { + "auxiliary_loss_clip": 0.01052689, + "auxiliary_loss_mlp": 0.01001005, + "balance_loss_clip": 1.02109051, + "balance_loss_mlp": 0.99901152, + "epoch": 0.24025251766120548, + "flos": 67037268941280.0, + "grad_norm": 0.8974046192875081, + "language_loss": 0.63432896, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65486586, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 0.01992798, + "step": 3996, + "time_per_iteration": 3.3305156230926514 + }, + { + "auxiliary_loss_clip": 0.01146116, + "auxiliary_loss_mlp": 0.01036891, + "balance_loss_clip": 1.05136299, + "balance_loss_mlp": 1.02125037, + "epoch": 0.24031264091387344, + "flos": 25174092919680.0, + "grad_norm": 2.767501744692996, + "language_loss": 0.6987226, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.72055268, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.15637207, + "step": 3997, + "time_per_iteration": 2.750796318054199 + }, + { + "auxiliary_loss_clip": 0.01142123, + "auxiliary_loss_mlp": 0.01039297, + "balance_loss_clip": 1.05032873, + "balance_loss_mlp": 1.02366889, + "epoch": 0.2403727641665414, + "flos": 25485504798240.0, + "grad_norm": 1.7208385587889388, + "language_loss": 0.87166965, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89348382, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.15625, + "step": 3998, + "time_per_iteration": 2.6654767990112305 + }, + { + "auxiliary_loss_clip": 0.01146815, + "auxiliary_loss_mlp": 0.01034409, + "balance_loss_clip": 1.04919934, + "balance_loss_mlp": 1.01924551, + "epoch": 0.24043288741920937, + "flos": 27845259269760.0, + "grad_norm": 1.6428042853455345, + "language_loss": 0.75812751, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.77993977, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.15161133, + "step": 3999, + "time_per_iteration": 2.720470666885376 + }, + { + "auxiliary_loss_clip": 0.01144648, + "auxiliary_loss_mlp": 0.01036164, + "balance_loss_clip": 1.05334949, + "balance_loss_mlp": 1.02159643, + "epoch": 0.24049301067187734, + "flos": 34117451739360.0, + "grad_norm": 2.984966547463666, + "language_loss": 0.72649932, + "learning_rate": 3.552938912398679e-06, + "loss": 0.74830747, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.14569092, + "step": 4000, + "time_per_iteration": 2.7214112281799316 + }, + { + "auxiliary_loss_clip": 0.01151158, + "auxiliary_loss_mlp": 0.01038024, + "balance_loss_clip": 1.05346155, + "balance_loss_mlp": 1.02203798, + "epoch": 0.24055313392454533, + "flos": 33497018501760.0, + "grad_norm": 4.1009396853862965, + "language_loss": 0.66783172, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.68972355, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.15979004, + "step": 4001, + "time_per_iteration": 2.7489991188049316 + }, + { + "auxiliary_loss_clip": 0.01144443, + "auxiliary_loss_mlp": 0.01037624, + "balance_loss_clip": 1.05066895, + "balance_loss_mlp": 1.02131665, + "epoch": 0.2406132571772133, + "flos": 31184622380160.0, + "grad_norm": 4.944788800470624, + "language_loss": 0.82485056, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.84667122, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.16320801, + "step": 4002, + "time_per_iteration": 4.269405126571655 + }, + { + "auxiliary_loss_clip": 0.0114397, + "auxiliary_loss_mlp": 0.0103834, + "balance_loss_clip": 1.05078411, + "balance_loss_mlp": 1.02343917, + "epoch": 0.24067338042988126, + "flos": 30250751400000.0, + "grad_norm": 1.8132334641242274, + "language_loss": 0.83018482, + "learning_rate": 3.552202383898897e-06, + "loss": 0.85200787, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.14916992, + "step": 4003, + "time_per_iteration": 4.297792911529541 + }, + { + "auxiliary_loss_clip": 0.01143949, + "auxiliary_loss_mlp": 0.01037391, + "balance_loss_clip": 1.05142689, + "balance_loss_mlp": 1.02194178, + "epoch": 0.24073350368254923, + "flos": 25841438817120.0, + "grad_norm": 2.936964774007025, + "language_loss": 0.87209189, + "learning_rate": 3.551956756667215e-06, + "loss": 0.89390528, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.15454102, + "step": 4004, + "time_per_iteration": 2.6525392532348633 + }, + { + "auxiliary_loss_clip": 0.0114621, + "auxiliary_loss_mlp": 0.01046919, + "balance_loss_clip": 1.05019152, + "balance_loss_mlp": 1.03151727, + "epoch": 0.2407936269352172, + "flos": 27447517768320.0, + "grad_norm": 2.0600452075647033, + "language_loss": 0.78270876, + "learning_rate": 3.551711070585177e-06, + "loss": 0.80464005, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.15405273, + "step": 4005, + "time_per_iteration": 4.174415349960327 + }, + { + "auxiliary_loss_clip": 0.0114142, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.05078185, + "balance_loss_mlp": 1.02010751, + "epoch": 0.24085375018788516, + "flos": 22636006783200.0, + "grad_norm": 1.6284346466802184, + "language_loss": 0.79096282, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81273359, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 0.90673828, + "router_z_loss_mlp": 0.15539551, + "step": 4006, + "time_per_iteration": 2.647732973098755 + }, + { + "auxiliary_loss_clip": 0.01149463, + "auxiliary_loss_mlp": 0.01042756, + "balance_loss_clip": 1.05092943, + "balance_loss_mlp": 1.02576876, + "epoch": 0.24091387344055312, + "flos": 29493186220800.0, + "grad_norm": 1.8528893370795203, + "language_loss": 0.71345901, + "learning_rate": 3.551219521907302e-06, + "loss": 0.73538113, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 0.98583984, + "router_z_loss_mlp": 0.17004395, + "step": 4007, + "time_per_iteration": 4.152093887329102 + }, + { + "auxiliary_loss_clip": 0.01140059, + "auxiliary_loss_mlp": 0.01040041, + "balance_loss_clip": 1.04912806, + "balance_loss_mlp": 1.02533054, + "epoch": 0.24097399669322112, + "flos": 13465742430240.0, + "grad_norm": 1.6844680226059559, + "language_loss": 0.7589007, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.7807017, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.14709473, + "step": 4008, + "time_per_iteration": 2.7200276851654053 + }, + { + "auxiliary_loss_clip": 0.01142376, + "auxiliary_loss_mlp": 0.01036332, + "balance_loss_clip": 1.05008578, + "balance_loss_mlp": 1.020787, + "epoch": 0.24103411994588908, + "flos": 20944611141120.0, + "grad_norm": 2.562743512242146, + "language_loss": 0.74964368, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.77143073, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 0.92333984, + "router_z_loss_mlp": 0.15539551, + "step": 4009, + "time_per_iteration": 2.613784074783325 + }, + { + "auxiliary_loss_clip": 0.01144227, + "auxiliary_loss_mlp": 0.01041088, + "balance_loss_clip": 1.052899, + "balance_loss_mlp": 1.02591276, + "epoch": 0.24109424319855705, + "flos": 25219060750080.0, + "grad_norm": 1.9172153485623689, + "language_loss": 0.79772866, + "learning_rate": 3.550481757745804e-06, + "loss": 0.81958181, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.15167236, + "step": 4010, + "time_per_iteration": 2.734992027282715 + }, + { + "auxiliary_loss_clip": 0.01143921, + "auxiliary_loss_mlp": 0.01049454, + "balance_loss_clip": 1.04898524, + "balance_loss_mlp": 1.03172743, + "epoch": 0.241154366451225, + "flos": 34388231136480.0, + "grad_norm": 2.0274093611804274, + "language_loss": 0.70777094, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.72970462, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.17724609, + "step": 4011, + "time_per_iteration": 2.708051919937134 + }, + { + "auxiliary_loss_clip": 0.01141439, + "auxiliary_loss_mlp": 0.01038136, + "balance_loss_clip": 1.04851675, + "balance_loss_mlp": 1.02327073, + "epoch": 0.24121448970389298, + "flos": 26465883265440.0, + "grad_norm": 1.9066656327549665, + "language_loss": 0.69269764, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71449339, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.1484375, + "step": 4012, + "time_per_iteration": 2.6865828037261963 + }, + { + "auxiliary_loss_clip": 0.01143902, + "auxiliary_loss_mlp": 0.01039149, + "balance_loss_clip": 1.04997885, + "balance_loss_mlp": 1.02240014, + "epoch": 0.24127461295656094, + "flos": 48414447580320.0, + "grad_norm": 1.6967095177464653, + "language_loss": 0.73797405, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.75980461, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.16760254, + "step": 4013, + "time_per_iteration": 2.8348300457000732 + }, + { + "auxiliary_loss_clip": 0.01144225, + "auxiliary_loss_mlp": 0.01034625, + "balance_loss_clip": 1.05085695, + "balance_loss_mlp": 1.01912761, + "epoch": 0.2413347362092289, + "flos": 23349454994880.0, + "grad_norm": 1.8189333975018873, + "language_loss": 0.87903678, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.90082526, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 0.93408203, + "router_z_loss_mlp": 0.1550293, + "step": 4014, + "time_per_iteration": 2.6471035480499268 + }, + { + "auxiliary_loss_clip": 0.01150171, + "auxiliary_loss_mlp": 0.01043539, + "balance_loss_clip": 1.05230451, + "balance_loss_mlp": 1.0277195, + "epoch": 0.2413948594618969, + "flos": 32872371467040.0, + "grad_norm": 2.1814001718412936, + "language_loss": 0.94907945, + "learning_rate": 3.549250975045952e-06, + "loss": 0.97101653, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.15820312, + "step": 4015, + "time_per_iteration": 2.7190186977386475 + }, + { + "auxiliary_loss_clip": 0.01147263, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.05190206, + "balance_loss_mlp": 1.01938009, + "epoch": 0.24145498271456486, + "flos": 30783761048160.0, + "grad_norm": 3.9354742794378432, + "language_loss": 0.82536089, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.84718347, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.15588379, + "step": 4016, + "time_per_iteration": 2.6756832599639893 + }, + { + "auxiliary_loss_clip": 0.01141507, + "auxiliary_loss_mlp": 0.01043574, + "balance_loss_clip": 1.05124736, + "balance_loss_mlp": 1.02777886, + "epoch": 0.24151510596723283, + "flos": 49617072093600.0, + "grad_norm": 3.266173972414059, + "language_loss": 0.6925506, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71440136, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.15808105, + "step": 4017, + "time_per_iteration": 2.867443799972534 + }, + { + "auxiliary_loss_clip": 0.01147824, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.05135357, + "balance_loss_mlp": 1.03231943, + "epoch": 0.2415752292199008, + "flos": 22142738236320.0, + "grad_norm": 1.8560761254013458, + "language_loss": 0.84795964, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.86992085, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.159729, + "step": 4018, + "time_per_iteration": 2.594790458679199 + }, + { + "auxiliary_loss_clip": 0.01071279, + "auxiliary_loss_mlp": 0.01044057, + "balance_loss_clip": 1.04003549, + "balance_loss_mlp": 1.04205716, + "epoch": 0.24163535247256876, + "flos": 82106328797280.0, + "grad_norm": 0.830793408807889, + "language_loss": 0.60628873, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62744212, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 0.31225586, + "router_z_loss_mlp": 0.01998901, + "step": 4019, + "time_per_iteration": 3.3992748260498047 + }, + { + "auxiliary_loss_clip": 0.01142419, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.04922485, + "balance_loss_mlp": 1.02110314, + "epoch": 0.24169547572523672, + "flos": 30420493401600.0, + "grad_norm": 2.0302707677023277, + "language_loss": 0.73290986, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.7546916, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.14642334, + "step": 4020, + "time_per_iteration": 2.645956039428711 + }, + { + "auxiliary_loss_clip": 0.01147241, + "auxiliary_loss_mlp": 0.01036956, + "balance_loss_clip": 1.05461872, + "balance_loss_mlp": 1.02106524, + "epoch": 0.24175559897790472, + "flos": 22852701961920.0, + "grad_norm": 1.8857986760778223, + "language_loss": 0.81930268, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.84114462, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.15881348, + "step": 4021, + "time_per_iteration": 2.65197491645813 + }, + { + "auxiliary_loss_clip": 0.01149839, + "auxiliary_loss_mlp": 0.01045079, + "balance_loss_clip": 1.05322874, + "balance_loss_mlp": 1.02784169, + "epoch": 0.24181572223057268, + "flos": 28112756767200.0, + "grad_norm": 2.019813975686559, + "language_loss": 0.76264948, + "learning_rate": 3.547525412122378e-06, + "loss": 0.78459871, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 0.96728516, + "router_z_loss_mlp": 0.17248535, + "step": 4022, + "time_per_iteration": 2.7058162689208984 + }, + { + "auxiliary_loss_clip": 0.01150601, + "auxiliary_loss_mlp": 0.01037065, + "balance_loss_clip": 1.05166543, + "balance_loss_mlp": 1.01988697, + "epoch": 0.24187584548324065, + "flos": 24862964662080.0, + "grad_norm": 1.8013651940179887, + "language_loss": 0.75591141, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.7777881, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 0.99072266, + "router_z_loss_mlp": 0.171875, + "step": 4023, + "time_per_iteration": 2.6881330013275146 + }, + { + "auxiliary_loss_clip": 0.01145771, + "auxiliary_loss_mlp": 0.01048212, + "balance_loss_clip": 1.05298483, + "balance_loss_mlp": 1.03229773, + "epoch": 0.2419359687359086, + "flos": 26109341487360.0, + "grad_norm": 2.1917992860123685, + "language_loss": 0.81994843, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.84188831, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.15917969, + "step": 4024, + "time_per_iteration": 2.73486590385437 + }, + { + "auxiliary_loss_clip": 0.01144306, + "auxiliary_loss_mlp": 0.01045874, + "balance_loss_clip": 1.05251992, + "balance_loss_mlp": 1.02979279, + "epoch": 0.24199609198857658, + "flos": 22414165909920.0, + "grad_norm": 2.1570684547626167, + "language_loss": 0.85922313, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.88112485, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.1607666, + "step": 4025, + "time_per_iteration": 2.6281635761260986 + }, + { + "auxiliary_loss_clip": 0.01145542, + "auxiliary_loss_mlp": 0.01045428, + "balance_loss_clip": 1.04985702, + "balance_loss_mlp": 1.02987123, + "epoch": 0.24205621524124454, + "flos": 23749749084960.0, + "grad_norm": 2.338473470231273, + "language_loss": 0.72100544, + "learning_rate": 3.546538084949365e-06, + "loss": 0.74291515, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 0.95800781, + "router_z_loss_mlp": 0.15551758, + "step": 4026, + "time_per_iteration": 2.7747716903686523 + }, + { + "auxiliary_loss_clip": 0.0114032, + "auxiliary_loss_mlp": 0.01043995, + "balance_loss_clip": 1.04888797, + "balance_loss_mlp": 1.02860522, + "epoch": 0.2421163384939125, + "flos": 18273687894720.0, + "grad_norm": 1.8597593492915285, + "language_loss": 0.64760464, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66944778, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.15393066, + "step": 4027, + "time_per_iteration": 2.64471173286438 + }, + { + "auxiliary_loss_clip": 0.01149429, + "auxiliary_loss_mlp": 0.01039359, + "balance_loss_clip": 1.0543859, + "balance_loss_mlp": 1.02449322, + "epoch": 0.2421764617465805, + "flos": 22771234412640.0, + "grad_norm": 2.1972233018745086, + "language_loss": 0.70517814, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.72706604, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.14855957, + "step": 4028, + "time_per_iteration": 2.6965737342834473 + }, + { + "auxiliary_loss_clip": 0.01055429, + "auxiliary_loss_mlp": 0.01000792, + "balance_loss_clip": 1.02419972, + "balance_loss_mlp": 0.99874556, + "epoch": 0.24223658499924847, + "flos": 78516080274240.0, + "grad_norm": 0.8550606998076519, + "language_loss": 0.55347097, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57403314, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 0.31298828, + "router_z_loss_mlp": 0.02046204, + "step": 4029, + "time_per_iteration": 3.30249285697937 + }, + { + "auxiliary_loss_clip": 0.01148054, + "auxiliary_loss_mlp": 0.01045752, + "balance_loss_clip": 1.05373716, + "balance_loss_mlp": 1.02865744, + "epoch": 0.24229670825191643, + "flos": 31451431083840.0, + "grad_norm": 1.827707777988767, + "language_loss": 0.74286222, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76480031, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.17089844, + "step": 4030, + "time_per_iteration": 2.794312000274658 + }, + { + "auxiliary_loss_clip": 0.01146012, + "auxiliary_loss_mlp": 0.01055453, + "balance_loss_clip": 1.05116105, + "balance_loss_mlp": 1.03860879, + "epoch": 0.2423568315045844, + "flos": 25218088335360.0, + "grad_norm": 3.2269380435081487, + "language_loss": 0.76869726, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.79071194, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.1685791, + "step": 4031, + "time_per_iteration": 2.760343551635742 + }, + { + "auxiliary_loss_clip": 0.01152692, + "auxiliary_loss_mlp": 0.01050679, + "balance_loss_clip": 1.05452001, + "balance_loss_mlp": 1.034621, + "epoch": 0.24241695475725236, + "flos": 27353368310400.0, + "grad_norm": 3.524920132802355, + "language_loss": 0.66102207, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.68305576, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 0.98095703, + "router_z_loss_mlp": 0.16052246, + "step": 4032, + "time_per_iteration": 2.683208465576172 + }, + { + "auxiliary_loss_clip": 0.01145475, + "auxiliary_loss_mlp": 0.01048396, + "balance_loss_clip": 1.05120623, + "balance_loss_mlp": 1.03201699, + "epoch": 0.24247707800992033, + "flos": 20900534690880.0, + "grad_norm": 2.074328697349513, + "language_loss": 0.81693107, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.83886975, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.16381836, + "step": 4033, + "time_per_iteration": 2.646973133087158 + }, + { + "auxiliary_loss_clip": 0.01142506, + "auxiliary_loss_mlp": 0.01039742, + "balance_loss_clip": 1.05123913, + "balance_loss_mlp": 1.02359498, + "epoch": 0.2425372012625883, + "flos": 38575945949760.0, + "grad_norm": 1.9532794179487087, + "language_loss": 0.68964076, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.71146321, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.16131592, + "step": 4034, + "time_per_iteration": 2.740650177001953 + }, + { + "auxiliary_loss_clip": 0.01145257, + "auxiliary_loss_mlp": 0.01036583, + "balance_loss_clip": 1.05119109, + "balance_loss_mlp": 1.02043021, + "epoch": 0.24259732451525629, + "flos": 19920723465600.0, + "grad_norm": 1.9869259441607987, + "language_loss": 0.9603833, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.9822017, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.16149902, + "step": 4035, + "time_per_iteration": 2.7063920497894287 + }, + { + "auxiliary_loss_clip": 0.01138601, + "auxiliary_loss_mlp": 0.01049008, + "balance_loss_clip": 1.04903495, + "balance_loss_mlp": 1.03379679, + "epoch": 0.24265744776792425, + "flos": 27889902961920.0, + "grad_norm": 2.1846029954791715, + "language_loss": 0.77641296, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.79828906, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.15203857, + "step": 4036, + "time_per_iteration": 2.7207295894622803 + }, + { + "auxiliary_loss_clip": 0.01149056, + "auxiliary_loss_mlp": 0.01049695, + "balance_loss_clip": 1.05327952, + "balance_loss_mlp": 1.03293395, + "epoch": 0.24271757102059222, + "flos": 26687845690560.0, + "grad_norm": 2.4856243375426703, + "language_loss": 0.74674809, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.76873565, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.16772461, + "step": 4037, + "time_per_iteration": 2.7380688190460205 + }, + { + "auxiliary_loss_clip": 0.01145829, + "auxiliary_loss_mlp": 0.01041609, + "balance_loss_clip": 1.05093932, + "balance_loss_mlp": 1.02570641, + "epoch": 0.24277769427326018, + "flos": 23438215654560.0, + "grad_norm": 2.4097514333926653, + "language_loss": 0.76726091, + "learning_rate": 3.543570475921171e-06, + "loss": 0.78913534, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.15893555, + "step": 4038, + "time_per_iteration": 2.679527521133423 + }, + { + "auxiliary_loss_clip": 0.01146663, + "auxiliary_loss_mlp": 0.01048795, + "balance_loss_clip": 1.05219126, + "balance_loss_mlp": 1.03166437, + "epoch": 0.24283781752592815, + "flos": 23793744500640.0, + "grad_norm": 2.3563259586171568, + "language_loss": 0.71973467, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74168926, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.17126465, + "step": 4039, + "time_per_iteration": 2.7361083030700684 + }, + { + "auxiliary_loss_clip": 0.01146628, + "auxiliary_loss_mlp": 0.01041114, + "balance_loss_clip": 1.05335999, + "balance_loss_mlp": 1.02512836, + "epoch": 0.2428979407785961, + "flos": 24284582010720.0, + "grad_norm": 1.9397552585859599, + "language_loss": 0.78255165, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80442905, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.15979004, + "step": 4040, + "time_per_iteration": 2.636193037033081 + }, + { + "auxiliary_loss_clip": 0.01140602, + "auxiliary_loss_mlp": 0.01039376, + "balance_loss_clip": 1.05051279, + "balance_loss_mlp": 1.02510619, + "epoch": 0.2429580640312641, + "flos": 30158830392480.0, + "grad_norm": 1.623627128357152, + "language_loss": 0.80228597, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.82408571, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.14257812, + "step": 4041, + "time_per_iteration": 5.141633749008179 + }, + { + "auxiliary_loss_clip": 0.01144656, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_clip": 1.05267227, + "balance_loss_mlp": 1.02737796, + "epoch": 0.24301818728393207, + "flos": 31274517523680.0, + "grad_norm": 2.905132669580739, + "language_loss": 0.76165926, + "learning_rate": 3.542579399075957e-06, + "loss": 0.78353214, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.15258789, + "step": 4042, + "time_per_iteration": 2.739748477935791 + }, + { + "auxiliary_loss_clip": 0.01142065, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.05161119, + "balance_loss_mlp": 1.0188911, + "epoch": 0.24307831053660003, + "flos": 31898111109120.0, + "grad_norm": 1.9842261780121377, + "language_loss": 0.81450081, + "learning_rate": 3.542331483604246e-06, + "loss": 0.83625126, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 0.90380859, + "router_z_loss_mlp": 0.14093018, + "step": 4043, + "time_per_iteration": 4.118134260177612 + }, + { + "auxiliary_loss_clip": 0.01145091, + "auxiliary_loss_mlp": 0.01035249, + "balance_loss_clip": 1.04864323, + "balance_loss_mlp": 1.01903677, + "epoch": 0.243138433789268, + "flos": 18269271511200.0, + "grad_norm": 2.5830114705609133, + "language_loss": 0.73421556, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.756019, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.16210938, + "step": 4044, + "time_per_iteration": 4.0871946811676025 + }, + { + "auxiliary_loss_clip": 0.01144531, + "auxiliary_loss_mlp": 0.01040366, + "balance_loss_clip": 1.05162966, + "balance_loss_mlp": 1.02424896, + "epoch": 0.24319855704193596, + "flos": 30739319942400.0, + "grad_norm": 1.987779524691046, + "language_loss": 0.83293724, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85478628, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.16113281, + "step": 4045, + "time_per_iteration": 4.115273475646973 + }, + { + "auxiliary_loss_clip": 0.01144872, + "auxiliary_loss_mlp": 0.01045528, + "balance_loss_clip": 1.05208564, + "balance_loss_mlp": 1.0304836, + "epoch": 0.24325868029460393, + "flos": 26999784293760.0, + "grad_norm": 1.6122886461609192, + "language_loss": 0.86812156, + "learning_rate": 3.541587386314541e-06, + "loss": 0.89002562, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.15057373, + "step": 4046, + "time_per_iteration": 2.7458059787750244 + }, + { + "auxiliary_loss_clip": 0.01139609, + "auxiliary_loss_mlp": 0.01038603, + "balance_loss_clip": 1.04937959, + "balance_loss_mlp": 1.02311826, + "epoch": 0.2433188035472719, + "flos": 28775564729280.0, + "grad_norm": 2.0706836746162103, + "language_loss": 0.72790432, + "learning_rate": 3.5413392369578e-06, + "loss": 0.74968648, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.15490723, + "step": 4047, + "time_per_iteration": 2.7404074668884277 + }, + { + "auxiliary_loss_clip": 0.0114648, + "auxiliary_loss_mlp": 0.01042201, + "balance_loss_clip": 1.05157638, + "balance_loss_mlp": 1.02546382, + "epoch": 0.2433789267999399, + "flos": 29849809033440.0, + "grad_norm": 6.059775611303685, + "language_loss": 0.72812808, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.7500149, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.1673584, + "step": 4048, + "time_per_iteration": 2.7238972187042236 + }, + { + "auxiliary_loss_clip": 0.01142589, + "auxiliary_loss_mlp": 0.01042118, + "balance_loss_clip": 1.05116892, + "balance_loss_mlp": 1.02720511, + "epoch": 0.24343905005260785, + "flos": 20411074768320.0, + "grad_norm": 1.9846303150743727, + "language_loss": 0.7291128, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.75095987, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.14904785, + "step": 4049, + "time_per_iteration": 2.705550193786621 + }, + { + "auxiliary_loss_clip": 0.01139862, + "auxiliary_loss_mlp": 0.01040611, + "balance_loss_clip": 1.04927444, + "balance_loss_mlp": 1.0256505, + "epoch": 0.24349917330527582, + "flos": 24457727463840.0, + "grad_norm": 1.6223704581552651, + "language_loss": 0.73490167, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.75670642, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.1496582, + "step": 4050, + "time_per_iteration": 2.6341452598571777 + }, + { + "auxiliary_loss_clip": 0.01138494, + "auxiliary_loss_mlp": 0.01042455, + "balance_loss_clip": 1.0495801, + "balance_loss_mlp": 1.02840614, + "epoch": 0.24355929655794378, + "flos": 21256347157920.0, + "grad_norm": 2.7171094861012994, + "language_loss": 0.74858385, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.77039337, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.14044189, + "step": 4051, + "time_per_iteration": 2.658430576324463 + }, + { + "auxiliary_loss_clip": 0.01140401, + "auxiliary_loss_mlp": 0.01040887, + "balance_loss_clip": 1.0496664, + "balance_loss_mlp": 1.02567625, + "epoch": 0.24361941981061175, + "flos": 31006817439840.0, + "grad_norm": 4.312179629715174, + "language_loss": 0.706375, + "learning_rate": 3.540097613646296e-06, + "loss": 0.72818792, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.15203857, + "step": 4052, + "time_per_iteration": 2.668712615966797 + }, + { + "auxiliary_loss_clip": 0.01140434, + "auxiliary_loss_mlp": 0.01042067, + "balance_loss_clip": 1.04887247, + "balance_loss_mlp": 1.02647448, + "epoch": 0.2436795430632797, + "flos": 27846110132640.0, + "grad_norm": 1.552945692012243, + "language_loss": 0.81170487, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83352989, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.15576172, + "step": 4053, + "time_per_iteration": 2.7022202014923096 + }, + { + "auxiliary_loss_clip": 0.01144652, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.05107522, + "balance_loss_mlp": 1.02013779, + "epoch": 0.2437396663159477, + "flos": 18495204629760.0, + "grad_norm": 1.5470549494935066, + "language_loss": 0.77760243, + "learning_rate": 3.539600555451172e-06, + "loss": 0.7994054, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.15515137, + "step": 4054, + "time_per_iteration": 2.6560230255126953 + }, + { + "auxiliary_loss_clip": 0.01137274, + "auxiliary_loss_mlp": 0.01046568, + "balance_loss_clip": 1.04602432, + "balance_loss_mlp": 1.03184533, + "epoch": 0.24379978956861567, + "flos": 26955707843520.0, + "grad_norm": 1.6667867907114906, + "language_loss": 0.84246814, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.86430657, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.1472168, + "step": 4055, + "time_per_iteration": 2.7034831047058105 + }, + { + "auxiliary_loss_clip": 0.01146303, + "auxiliary_loss_mlp": 0.01039698, + "balance_loss_clip": 1.05001044, + "balance_loss_mlp": 1.02323544, + "epoch": 0.24385991282128364, + "flos": 38397816871200.0, + "grad_norm": 3.1398073124555435, + "language_loss": 0.55446064, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.57632065, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 0.96240234, + "router_z_loss_mlp": 0.16442871, + "step": 4056, + "time_per_iteration": 2.701866388320923 + }, + { + "auxiliary_loss_clip": 0.01143187, + "auxiliary_loss_mlp": 0.01040834, + "balance_loss_clip": 1.04913855, + "balance_loss_mlp": 1.02512801, + "epoch": 0.2439200360739516, + "flos": 29088678333600.0, + "grad_norm": 2.9838248710127218, + "language_loss": 0.79984331, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82168353, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.15692139, + "step": 4057, + "time_per_iteration": 2.7211251258850098 + }, + { + "auxiliary_loss_clip": 0.01137593, + "auxiliary_loss_mlp": 0.01038454, + "balance_loss_clip": 1.04747939, + "balance_loss_mlp": 1.02331448, + "epoch": 0.24398015932661957, + "flos": 23391302994720.0, + "grad_norm": 1.8271335799162947, + "language_loss": 0.79423028, + "learning_rate": 3.538605738554673e-06, + "loss": 0.81599081, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.15124512, + "step": 4058, + "time_per_iteration": 2.6121439933776855 + }, + { + "auxiliary_loss_clip": 0.0114197, + "auxiliary_loss_mlp": 0.01034475, + "balance_loss_clip": 1.0472362, + "balance_loss_mlp": 1.01986027, + "epoch": 0.24404028257928753, + "flos": 30825892668960.0, + "grad_norm": 1.6194639183103476, + "language_loss": 0.85720706, + "learning_rate": 3.538356888446756e-06, + "loss": 0.87897152, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.14611816, + "step": 4059, + "time_per_iteration": 2.7222864627838135 + }, + { + "auxiliary_loss_clip": 0.01137866, + "auxiliary_loss_mlp": 0.01035913, + "balance_loss_clip": 1.04867887, + "balance_loss_mlp": 1.02160788, + "epoch": 0.2441004058319555, + "flos": 32296379335200.0, + "grad_norm": 1.63012425204373, + "language_loss": 0.74614179, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.76787961, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.14318848, + "step": 4060, + "time_per_iteration": 2.7113358974456787 + }, + { + "auxiliary_loss_clip": 0.01145997, + "auxiliary_loss_mlp": 0.01046662, + "balance_loss_clip": 1.04891586, + "balance_loss_mlp": 1.02898359, + "epoch": 0.2441605290846235, + "flos": 32653731458880.0, + "grad_norm": 2.9261417971716583, + "language_loss": 0.73092055, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.75284719, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 0.96972656, + "router_z_loss_mlp": 0.17675781, + "step": 4061, + "time_per_iteration": 2.745373487472534 + }, + { + "auxiliary_loss_clip": 0.01138003, + "auxiliary_loss_mlp": 0.01037346, + "balance_loss_clip": 1.04881907, + "balance_loss_mlp": 1.02366102, + "epoch": 0.24422065233729146, + "flos": 25753407468480.0, + "grad_norm": 1.9835094091914889, + "language_loss": 0.76023388, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.78198731, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 0.89111328, + "router_z_loss_mlp": 0.13696289, + "step": 4062, + "time_per_iteration": 2.7002923488616943 + }, + { + "auxiliary_loss_clip": 0.01135102, + "auxiliary_loss_mlp": 0.01036078, + "balance_loss_clip": 1.04804993, + "balance_loss_mlp": 1.02105784, + "epoch": 0.24428077558995942, + "flos": 30826662497280.0, + "grad_norm": 1.9012071524289575, + "language_loss": 0.85066575, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87237757, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.15002441, + "step": 4063, + "time_per_iteration": 2.6816375255584717 + }, + { + "auxiliary_loss_clip": 0.01143486, + "auxiliary_loss_mlp": 0.01040299, + "balance_loss_clip": 1.04925084, + "balance_loss_mlp": 1.02484953, + "epoch": 0.24434089884262739, + "flos": 24996166427520.0, + "grad_norm": 2.8269287559243264, + "language_loss": 0.68643498, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70827287, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 0.94238281, + "router_z_loss_mlp": 0.15441895, + "step": 4064, + "time_per_iteration": 2.6859636306762695 + }, + { + "auxiliary_loss_clip": 0.01143729, + "auxiliary_loss_mlp": 0.01038306, + "balance_loss_clip": 1.04846263, + "balance_loss_mlp": 1.02245164, + "epoch": 0.24440102209529535, + "flos": 28823814459360.0, + "grad_norm": 9.423448114029592, + "language_loss": 0.7024315, + "learning_rate": 3.536862563102088e-06, + "loss": 0.72425187, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 0.95263672, + "router_z_loss_mlp": 0.15856934, + "step": 4065, + "time_per_iteration": 2.692599058151245 + }, + { + "auxiliary_loss_clip": 0.01145701, + "auxiliary_loss_mlp": 0.0104934, + "balance_loss_clip": 1.05002749, + "balance_loss_mlp": 1.03200662, + "epoch": 0.24446114534796332, + "flos": 25081240014720.0, + "grad_norm": 2.3314107116899985, + "language_loss": 0.84185451, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.86380494, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.17346191, + "step": 4066, + "time_per_iteration": 2.682647943496704 + }, + { + "auxiliary_loss_clip": 0.01060449, + "auxiliary_loss_mlp": 0.01006839, + "balance_loss_clip": 1.02921939, + "balance_loss_mlp": 1.00498962, + "epoch": 0.24452126860063128, + "flos": 73688119273440.0, + "grad_norm": 0.7436285161484905, + "language_loss": 0.5230791, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54375196, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 0.31201172, + "router_z_loss_mlp": 0.01846313, + "step": 4067, + "time_per_iteration": 3.1990487575531006 + }, + { + "auxiliary_loss_clip": 0.01142789, + "auxiliary_loss_mlp": 0.01040529, + "balance_loss_clip": 1.04873419, + "balance_loss_mlp": 1.02506137, + "epoch": 0.24458139185329927, + "flos": 18451087662240.0, + "grad_norm": 3.223572348550063, + "language_loss": 0.72809792, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.7499311, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.15460205, + "step": 4068, + "time_per_iteration": 2.7368078231811523 + }, + { + "auxiliary_loss_clip": 0.011408, + "auxiliary_loss_mlp": 0.0104064, + "balance_loss_clip": 1.04999638, + "balance_loss_mlp": 1.02499986, + "epoch": 0.24464151510596724, + "flos": 34164891123840.0, + "grad_norm": 1.4822747235975098, + "language_loss": 0.77775884, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.79957318, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.15625, + "step": 4069, + "time_per_iteration": 2.748608112335205 + }, + { + "auxiliary_loss_clip": 0.01139807, + "auxiliary_loss_mlp": 0.01042087, + "balance_loss_clip": 1.04950333, + "balance_loss_mlp": 1.02610111, + "epoch": 0.2447016383586352, + "flos": 24151177658880.0, + "grad_norm": 2.8634670859257736, + "language_loss": 0.80397248, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.82579136, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.15985107, + "step": 4070, + "time_per_iteration": 2.6685237884521484 + }, + { + "auxiliary_loss_clip": 0.0113762, + "auxiliary_loss_mlp": 0.0104027, + "balance_loss_clip": 1.04627347, + "balance_loss_mlp": 1.02548838, + "epoch": 0.24476176161130317, + "flos": 31807851310080.0, + "grad_norm": 1.6699499854449324, + "language_loss": 0.84527171, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.86705065, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.14770508, + "step": 4071, + "time_per_iteration": 2.786303997039795 + }, + { + "auxiliary_loss_clip": 0.01143704, + "auxiliary_loss_mlp": 0.01040809, + "balance_loss_clip": 1.04799628, + "balance_loss_mlp": 1.0241673, + "epoch": 0.24482188486397113, + "flos": 22992791664960.0, + "grad_norm": 2.090011306287829, + "language_loss": 0.80192858, + "learning_rate": 3.535116532028798e-06, + "loss": 0.82377374, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.16650391, + "step": 4072, + "time_per_iteration": 2.7339515686035156 + }, + { + "auxiliary_loss_clip": 0.01139224, + "auxiliary_loss_mlp": 0.01036557, + "balance_loss_clip": 1.04981518, + "balance_loss_mlp": 1.02228785, + "epoch": 0.2448820081166391, + "flos": 25931496029760.0, + "grad_norm": 1.553473662269443, + "language_loss": 0.70095432, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.72271204, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 0.89404297, + "router_z_loss_mlp": 0.14263916, + "step": 4073, + "time_per_iteration": 2.791788101196289 + }, + { + "auxiliary_loss_clip": 0.01137359, + "auxiliary_loss_mlp": 0.01039483, + "balance_loss_clip": 1.04912066, + "balance_loss_mlp": 1.02542269, + "epoch": 0.2449421313693071, + "flos": 29224878377760.0, + "grad_norm": 2.2787440032536375, + "language_loss": 0.67333752, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.69510591, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.140625, + "step": 4074, + "time_per_iteration": 2.736109495162964 + }, + { + "auxiliary_loss_clip": 0.01054082, + "auxiliary_loss_mlp": 0.01006906, + "balance_loss_clip": 1.02279377, + "balance_loss_mlp": 1.00507271, + "epoch": 0.24500225462197506, + "flos": 74051305885440.0, + "grad_norm": 0.8957243842625163, + "language_loss": 0.68735838, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70796824, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.01829529, + "step": 4075, + "time_per_iteration": 3.425645351409912 + }, + { + "auxiliary_loss_clip": 0.01138009, + "auxiliary_loss_mlp": 0.01039318, + "balance_loss_clip": 1.0481199, + "balance_loss_mlp": 1.02470338, + "epoch": 0.24506237787464302, + "flos": 32072674667040.0, + "grad_norm": 1.8933131051080307, + "language_loss": 0.79050148, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.81227481, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.14611816, + "step": 4076, + "time_per_iteration": 2.7467429637908936 + }, + { + "auxiliary_loss_clip": 0.01143938, + "auxiliary_loss_mlp": 0.01040016, + "balance_loss_clip": 1.04889202, + "balance_loss_mlp": 1.02330303, + "epoch": 0.245122501127311, + "flos": 25078484839680.0, + "grad_norm": 1.842551997068219, + "language_loss": 0.81923866, + "learning_rate": 3.533867620434151e-06, + "loss": 0.84107828, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.16723633, + "step": 4077, + "time_per_iteration": 2.846773624420166 + }, + { + "auxiliary_loss_clip": 0.01142125, + "auxiliary_loss_mlp": 0.01048342, + "balance_loss_clip": 1.04863238, + "balance_loss_mlp": 1.03154564, + "epoch": 0.24518262437997895, + "flos": 35548278338880.0, + "grad_norm": 2.095046741653732, + "language_loss": 0.62587148, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64777613, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.16784668, + "step": 4078, + "time_per_iteration": 2.755817413330078 + }, + { + "auxiliary_loss_clip": 0.01135598, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.04700661, + "balance_loss_mlp": 1.01783311, + "epoch": 0.24524274763264692, + "flos": 28646819864640.0, + "grad_norm": 1.737467074189694, + "language_loss": 0.7544049, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.77608824, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.14892578, + "step": 4079, + "time_per_iteration": 2.731647253036499 + }, + { + "auxiliary_loss_clip": 0.01137143, + "auxiliary_loss_mlp": 0.01038782, + "balance_loss_clip": 1.04735279, + "balance_loss_mlp": 1.02240229, + "epoch": 0.24530287088531488, + "flos": 20988687591360.0, + "grad_norm": 1.7256492951751086, + "language_loss": 0.74945527, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.7712146, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.16381836, + "step": 4080, + "time_per_iteration": 2.7012760639190674 + }, + { + "auxiliary_loss_clip": 0.01131926, + "auxiliary_loss_mlp": 0.01035411, + "balance_loss_clip": 1.04492819, + "balance_loss_mlp": 1.02056944, + "epoch": 0.24536299413798288, + "flos": 18140769750240.0, + "grad_norm": 1.8275419551852377, + "language_loss": 0.8309747, + "learning_rate": 3.532867444142186e-06, + "loss": 0.85264814, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.14831543, + "step": 4081, + "time_per_iteration": 5.677090167999268 + }, + { + "auxiliary_loss_clip": 0.01137205, + "auxiliary_loss_mlp": 0.01037348, + "balance_loss_clip": 1.04759157, + "balance_loss_mlp": 1.02281594, + "epoch": 0.24542311739065084, + "flos": 43027916878080.0, + "grad_norm": 2.8816834927258475, + "language_loss": 0.73441511, + "learning_rate": 3.532617254729267e-06, + "loss": 0.75616068, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.14526367, + "step": 4082, + "time_per_iteration": 4.325042724609375 + }, + { + "auxiliary_loss_clip": 0.01135293, + "auxiliary_loss_mlp": 0.01040699, + "balance_loss_clip": 1.04736078, + "balance_loss_mlp": 1.02749681, + "epoch": 0.2454832406433188, + "flos": 26238572559360.0, + "grad_norm": 2.025702679886641, + "language_loss": 0.71808612, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.73984611, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.13214111, + "step": 4083, + "time_per_iteration": 4.27691125869751 + }, + { + "auxiliary_loss_clip": 0.01140643, + "auxiliary_loss_mlp": 0.01042359, + "balance_loss_clip": 1.04736423, + "balance_loss_mlp": 1.02576494, + "epoch": 0.24554336389598677, + "flos": 18005704189920.0, + "grad_norm": 1.8571843134826695, + "language_loss": 0.74347329, + "learning_rate": 3.532116701561919e-06, + "loss": 0.76530331, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.16577148, + "step": 4084, + "time_per_iteration": 2.74638295173645 + }, + { + "auxiliary_loss_clip": 0.01136314, + "auxiliary_loss_mlp": 0.0103674, + "balance_loss_clip": 1.04712844, + "balance_loss_mlp": 1.02174306, + "epoch": 0.24560348714865474, + "flos": 18273768929280.0, + "grad_norm": 1.8288177698301873, + "language_loss": 0.85222435, + "learning_rate": 3.531866337826471e-06, + "loss": 0.87395489, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.14971924, + "step": 4085, + "time_per_iteration": 4.136972188949585 + }, + { + "auxiliary_loss_clip": 0.01140854, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.04905224, + "balance_loss_mlp": 1.02904856, + "epoch": 0.2456636104013227, + "flos": 27668548296000.0, + "grad_norm": 1.7189870296069203, + "language_loss": 0.78980333, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.8116551, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 0.91894531, + "router_z_loss_mlp": 0.15264893, + "step": 4086, + "time_per_iteration": 2.673024892807007 + }, + { + "auxiliary_loss_clip": 0.01135724, + "auxiliary_loss_mlp": 0.0104187, + "balance_loss_clip": 1.04842854, + "balance_loss_mlp": 1.0272311, + "epoch": 0.2457237336539907, + "flos": 33455818778400.0, + "grad_norm": 1.5910324065052772, + "language_loss": 0.75328016, + "learning_rate": 3.531365436099496e-06, + "loss": 0.77505612, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.14648438, + "step": 4087, + "time_per_iteration": 2.76444673538208 + }, + { + "auxiliary_loss_clip": 0.01143226, + "auxiliary_loss_mlp": 0.01044618, + "balance_loss_clip": 1.05280697, + "balance_loss_mlp": 1.02895415, + "epoch": 0.24578385690665866, + "flos": 24907203181440.0, + "grad_norm": 2.393725128448395, + "language_loss": 0.79258239, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.81446081, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 0.90380859, + "router_z_loss_mlp": 0.15661621, + "step": 4088, + "time_per_iteration": 2.6916942596435547 + }, + { + "auxiliary_loss_clip": 0.01134595, + "auxiliary_loss_mlp": 0.01033847, + "balance_loss_clip": 1.04811144, + "balance_loss_mlp": 1.02000117, + "epoch": 0.24584398015932662, + "flos": 29181814859520.0, + "grad_norm": 1.6416512351671544, + "language_loss": 0.77185595, + "learning_rate": 3.5308643020944e-06, + "loss": 0.79354036, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.1385498, + "step": 4089, + "time_per_iteration": 2.6788768768310547 + }, + { + "auxiliary_loss_clip": 0.01137479, + "auxiliary_loss_mlp": 0.01044142, + "balance_loss_clip": 1.04662931, + "balance_loss_mlp": 1.02904391, + "epoch": 0.2459041034119946, + "flos": 50635935626400.0, + "grad_norm": 1.8152811072951032, + "language_loss": 0.8095063, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83132243, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.15106201, + "step": 4090, + "time_per_iteration": 2.8122386932373047 + }, + { + "auxiliary_loss_clip": 0.01140597, + "auxiliary_loss_mlp": 0.01045497, + "balance_loss_clip": 1.04874241, + "balance_loss_mlp": 1.02930248, + "epoch": 0.24596422666466256, + "flos": 24326997252480.0, + "grad_norm": 1.8272637729925625, + "language_loss": 0.72865164, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.7505126, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 0.91894531, + "router_z_loss_mlp": 0.16192627, + "step": 4091, + "time_per_iteration": 2.6498892307281494 + }, + { + "auxiliary_loss_clip": 0.01142875, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_clip": 1.05372882, + "balance_loss_mlp": 1.02861762, + "epoch": 0.24602434991733052, + "flos": 26288483497920.0, + "grad_norm": 1.9260344578808335, + "language_loss": 0.76876432, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.79062128, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.14221191, + "step": 4092, + "time_per_iteration": 2.6643123626708984 + }, + { + "auxiliary_loss_clip": 0.01144393, + "auxiliary_loss_mlp": 0.01038029, + "balance_loss_clip": 1.05038643, + "balance_loss_mlp": 1.02300286, + "epoch": 0.24608447316999849, + "flos": 28290683259360.0, + "grad_norm": 3.4713445601380513, + "language_loss": 0.81711805, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83894223, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.15032959, + "step": 4093, + "time_per_iteration": 2.6737051010131836 + }, + { + "auxiliary_loss_clip": 0.01143466, + "auxiliary_loss_mlp": 0.01039661, + "balance_loss_clip": 1.0493784, + "balance_loss_mlp": 1.02394915, + "epoch": 0.24614459642266648, + "flos": 23967457195680.0, + "grad_norm": 2.096148698500266, + "language_loss": 0.87000442, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89183569, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.15716553, + "step": 4094, + "time_per_iteration": 2.6828701496124268 + }, + { + "auxiliary_loss_clip": 0.01052443, + "auxiliary_loss_mlp": 0.01006702, + "balance_loss_clip": 1.02137208, + "balance_loss_mlp": 1.0048424, + "epoch": 0.24620471967533444, + "flos": 75335762603520.0, + "grad_norm": 0.7511818151144533, + "language_loss": 0.57521343, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59580487, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 0.31103516, + "router_z_loss_mlp": 0.01855469, + "step": 4095, + "time_per_iteration": 3.31715989112854 + }, + { + "auxiliary_loss_clip": 0.01052483, + "auxiliary_loss_mlp": 0.01004428, + "balance_loss_clip": 1.02121508, + "balance_loss_mlp": 1.00252688, + "epoch": 0.2462648429280024, + "flos": 84382913993760.0, + "grad_norm": 0.7847267020956189, + "language_loss": 0.56286228, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58343142, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 0.31274414, + "router_z_loss_mlp": 0.01898193, + "step": 4096, + "time_per_iteration": 3.3825244903564453 + }, + { + "auxiliary_loss_clip": 0.01142315, + "auxiliary_loss_mlp": 0.01037198, + "balance_loss_clip": 1.05153275, + "balance_loss_mlp": 1.02260721, + "epoch": 0.24632496618067037, + "flos": 35947437945120.0, + "grad_norm": 1.7216234195258038, + "language_loss": 0.77173877, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79353386, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.14605713, + "step": 4097, + "time_per_iteration": 2.7435142993927 + }, + { + "auxiliary_loss_clip": 0.01142533, + "auxiliary_loss_mlp": 0.01042556, + "balance_loss_clip": 1.04878473, + "balance_loss_mlp": 1.02535367, + "epoch": 0.24638508943333834, + "flos": 29671234264800.0, + "grad_norm": 2.21465686778946, + "language_loss": 0.76925743, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.79110825, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.17211914, + "step": 4098, + "time_per_iteration": 2.669893264770508 + }, + { + "auxiliary_loss_clip": 0.01141736, + "auxiliary_loss_mlp": 0.01036314, + "balance_loss_clip": 1.04885328, + "balance_loss_mlp": 1.02218807, + "epoch": 0.2464452126860063, + "flos": 32474143758240.0, + "grad_norm": 2.2736105181714303, + "language_loss": 0.6851815, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70696199, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.14111328, + "step": 4099, + "time_per_iteration": 2.7068653106689453 + }, + { + "auxiliary_loss_clip": 0.01135848, + "auxiliary_loss_mlp": 0.01039887, + "balance_loss_clip": 1.04898429, + "balance_loss_mlp": 1.025594, + "epoch": 0.24650533593867427, + "flos": 38086202406240.0, + "grad_norm": 2.202882351483153, + "language_loss": 0.66527295, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.68703032, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.14306641, + "step": 4100, + "time_per_iteration": 2.7513976097106934 + }, + { + "auxiliary_loss_clip": 0.01049863, + "auxiliary_loss_mlp": 0.0100209, + "balance_loss_clip": 1.01864016, + "balance_loss_mlp": 1.0002439, + "epoch": 0.24656545919134226, + "flos": 83577342188160.0, + "grad_norm": 0.7132787728164929, + "language_loss": 0.61527193, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63579148, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 0.01843262, + "step": 4101, + "time_per_iteration": 3.3400630950927734 + }, + { + "auxiliary_loss_clip": 0.01135664, + "auxiliary_loss_mlp": 0.01037006, + "balance_loss_clip": 1.04672408, + "balance_loss_mlp": 1.02164018, + "epoch": 0.24662558244401023, + "flos": 24506706504960.0, + "grad_norm": 1.676138555000306, + "language_loss": 0.73055738, + "learning_rate": 3.527601274535012e-06, + "loss": 0.75228405, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.15362549, + "step": 4102, + "time_per_iteration": 2.6487154960632324 + }, + { + "auxiliary_loss_clip": 0.01141684, + "auxiliary_loss_mlp": 0.01038913, + "balance_loss_clip": 1.04854786, + "balance_loss_mlp": 1.02395201, + "epoch": 0.2466857056966782, + "flos": 37462608820800.0, + "grad_norm": 2.055142643983992, + "language_loss": 0.76158881, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.78339475, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.14941406, + "step": 4103, + "time_per_iteration": 2.695000410079956 + }, + { + "auxiliary_loss_clip": 0.01138246, + "auxiliary_loss_mlp": 0.01040884, + "balance_loss_clip": 1.04649925, + "balance_loss_mlp": 1.02476716, + "epoch": 0.24674582894934616, + "flos": 27489487320000.0, + "grad_norm": 2.0822405866758893, + "language_loss": 0.782058, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.80384934, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 0.91894531, + "router_z_loss_mlp": 0.16113281, + "step": 4104, + "time_per_iteration": 2.6593480110168457 + }, + { + "auxiliary_loss_clip": 0.01137549, + "auxiliary_loss_mlp": 0.01034921, + "balance_loss_clip": 1.04737997, + "balance_loss_mlp": 1.01829708, + "epoch": 0.24680595220201412, + "flos": 25263420821280.0, + "grad_norm": 1.7525878217930428, + "language_loss": 0.83281124, + "learning_rate": 3.526846877170133e-06, + "loss": 0.85453594, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.1663208, + "step": 4105, + "time_per_iteration": 2.629559278488159 + }, + { + "auxiliary_loss_clip": 0.01143252, + "auxiliary_loss_mlp": 0.01042935, + "balance_loss_clip": 1.05203569, + "balance_loss_mlp": 1.02830195, + "epoch": 0.2468660754546821, + "flos": 26638907166720.0, + "grad_norm": 2.010491369332014, + "language_loss": 0.76605618, + "learning_rate": 3.52659529557275e-06, + "loss": 0.78791809, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.1461792, + "step": 4106, + "time_per_iteration": 2.667839765548706 + }, + { + "auxiliary_loss_clip": 0.01135763, + "auxiliary_loss_mlp": 0.01038282, + "balance_loss_clip": 1.04611325, + "balance_loss_mlp": 1.02261782, + "epoch": 0.24692619870735008, + "flos": 18629297775360.0, + "grad_norm": 3.3421677313627063, + "language_loss": 0.72532767, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.74706811, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.15649414, + "step": 4107, + "time_per_iteration": 2.5993547439575195 + }, + { + "auxiliary_loss_clip": 0.01140292, + "auxiliary_loss_mlp": 0.01043176, + "balance_loss_clip": 1.04930758, + "balance_loss_mlp": 1.02754748, + "epoch": 0.24698632196001805, + "flos": 36216353547360.0, + "grad_norm": 2.303040851328973, + "language_loss": 0.65182513, + "learning_rate": 3.526091958721587e-06, + "loss": 0.6736598, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.15637207, + "step": 4108, + "time_per_iteration": 2.7804036140441895 + }, + { + "auxiliary_loss_clip": 0.01139764, + "auxiliary_loss_mlp": 0.01038046, + "balance_loss_clip": 1.04770303, + "balance_loss_mlp": 1.02232194, + "epoch": 0.247046445212686, + "flos": 47791218650400.0, + "grad_norm": 2.5448381657517354, + "language_loss": 0.72706562, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.74884373, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 0.92041016, + "router_z_loss_mlp": 0.15734863, + "step": 4109, + "time_per_iteration": 2.8030669689178467 + }, + { + "auxiliary_loss_clip": 0.01141309, + "auxiliary_loss_mlp": 0.01042954, + "balance_loss_clip": 1.04872584, + "balance_loss_mlp": 1.02720594, + "epoch": 0.24710656846535398, + "flos": 28063777726080.0, + "grad_norm": 1.7844076360550898, + "language_loss": 0.79117817, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.81302083, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 0.92480469, + "router_z_loss_mlp": 0.15759277, + "step": 4110, + "time_per_iteration": 2.700403928756714 + }, + { + "auxiliary_loss_clip": 0.01142273, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.0496316, + "balance_loss_mlp": 1.01821303, + "epoch": 0.24716669171802194, + "flos": 32253761507040.0, + "grad_norm": 2.117470914486667, + "language_loss": 0.80924219, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.83100545, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.15844727, + "step": 4111, + "time_per_iteration": 2.67077898979187 + }, + { + "auxiliary_loss_clip": 0.01137441, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.04670691, + "balance_loss_mlp": 1.02216911, + "epoch": 0.2472268149706899, + "flos": 28467475267680.0, + "grad_norm": 2.0298997154749476, + "language_loss": 0.75104964, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.77278721, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.14154053, + "step": 4112, + "time_per_iteration": 2.656715154647827 + }, + { + "auxiliary_loss_clip": 0.01139223, + "auxiliary_loss_mlp": 0.01040759, + "balance_loss_clip": 1.04751921, + "balance_loss_mlp": 1.02531528, + "epoch": 0.24728693822335787, + "flos": 29002915952640.0, + "grad_norm": 1.8412926302142616, + "language_loss": 0.82754868, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.84934849, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.15441895, + "step": 4113, + "time_per_iteration": 2.6814444065093994 + }, + { + "auxiliary_loss_clip": 0.01138656, + "auxiliary_loss_mlp": 0.01035972, + "balance_loss_clip": 1.04673553, + "balance_loss_mlp": 1.01995015, + "epoch": 0.24734706147602586, + "flos": 23571174316320.0, + "grad_norm": 4.526067781418184, + "language_loss": 0.86935371, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.89109999, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 0.92041016, + "router_z_loss_mlp": 0.16027832, + "step": 4114, + "time_per_iteration": 2.663858413696289 + }, + { + "auxiliary_loss_clip": 0.01138096, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.04756761, + "balance_loss_mlp": 1.02046132, + "epoch": 0.24740718472869383, + "flos": 34211358093600.0, + "grad_norm": 1.605139305406212, + "language_loss": 0.75218344, + "learning_rate": 3.524328457352734e-06, + "loss": 0.77391469, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.14575195, + "step": 4115, + "time_per_iteration": 2.702887773513794 + }, + { + "auxiliary_loss_clip": 0.01051181, + "auxiliary_loss_mlp": 0.00999953, + "balance_loss_clip": 1.02009213, + "balance_loss_mlp": 0.9982571, + "epoch": 0.2474673079813618, + "flos": 83104413315840.0, + "grad_norm": 1.795121206318956, + "language_loss": 0.58154964, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60206097, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 0.3112793, + "router_z_loss_mlp": 0.01699829, + "step": 4116, + "time_per_iteration": 3.3437342643737793 + }, + { + "auxiliary_loss_clip": 0.01137697, + "auxiliary_loss_mlp": 0.01033199, + "balance_loss_clip": 1.04771185, + "balance_loss_mlp": 1.01875091, + "epoch": 0.24752743123402976, + "flos": 35950274154720.0, + "grad_norm": 1.428611857538679, + "language_loss": 0.83521056, + "learning_rate": 3.523824079451235e-06, + "loss": 0.85691953, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.14447021, + "step": 4117, + "time_per_iteration": 2.69827938079834 + }, + { + "auxiliary_loss_clip": 0.01050774, + "auxiliary_loss_mlp": 0.00999906, + "balance_loss_clip": 1.01964283, + "balance_loss_mlp": 0.99822372, + "epoch": 0.24758755448669773, + "flos": 71199457868160.0, + "grad_norm": 0.9073827440221167, + "language_loss": 0.63473558, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65524232, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 0.31152344, + "router_z_loss_mlp": 0.0168457, + "step": 4118, + "time_per_iteration": 3.087813377380371 + }, + { + "auxiliary_loss_clip": 0.01135794, + "auxiliary_loss_mlp": 0.010429, + "balance_loss_clip": 1.04642963, + "balance_loss_mlp": 1.02816617, + "epoch": 0.2476476777393657, + "flos": 24995194012800.0, + "grad_norm": 1.7596684357255523, + "language_loss": 0.7942515, + "learning_rate": 3.523319470415491e-06, + "loss": 0.81603849, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.14733887, + "step": 4119, + "time_per_iteration": 2.6336042881011963 + }, + { + "auxiliary_loss_clip": 0.01139039, + "auxiliary_loss_mlp": 0.01036874, + "balance_loss_clip": 1.0490551, + "balance_loss_mlp": 1.0222708, + "epoch": 0.24770780099203366, + "flos": 24996814704000.0, + "grad_norm": 1.7321280353290271, + "language_loss": 0.74860585, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.770365, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.14593506, + "step": 4120, + "time_per_iteration": 4.831719398498535 + }, + { + "auxiliary_loss_clip": 0.01137827, + "auxiliary_loss_mlp": 0.01046453, + "balance_loss_clip": 1.04707801, + "balance_loss_mlp": 1.03103328, + "epoch": 0.24776792424470165, + "flos": 18490261521600.0, + "grad_norm": 2.646356841662925, + "language_loss": 0.87961614, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90145898, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.1541748, + "step": 4121, + "time_per_iteration": 2.606320381164551 + }, + { + "auxiliary_loss_clip": 0.01138564, + "auxiliary_loss_mlp": 0.01033683, + "balance_loss_clip": 1.04618287, + "balance_loss_mlp": 1.01828074, + "epoch": 0.2478280474973696, + "flos": 26509554542880.0, + "grad_norm": 2.1008415949596886, + "language_loss": 0.69154894, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.71327138, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.1541748, + "step": 4122, + "time_per_iteration": 4.040687561035156 + }, + { + "auxiliary_loss_clip": 0.01139997, + "auxiliary_loss_mlp": 0.01035454, + "balance_loss_clip": 1.04698038, + "balance_loss_mlp": 1.01903856, + "epoch": 0.24788817075003758, + "flos": 24907000595040.0, + "grad_norm": 2.076250539187766, + "language_loss": 0.80073476, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82248926, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.16430664, + "step": 4123, + "time_per_iteration": 4.040777683258057 + }, + { + "auxiliary_loss_clip": 0.01138232, + "auxiliary_loss_mlp": 0.01038415, + "balance_loss_clip": 1.04828167, + "balance_loss_mlp": 1.02415121, + "epoch": 0.24794829400270554, + "flos": 27570590213760.0, + "grad_norm": 3.193663862983495, + "language_loss": 0.74820006, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.7699666, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.14257812, + "step": 4124, + "time_per_iteration": 2.62337327003479 + }, + { + "auxiliary_loss_clip": 0.01135295, + "auxiliary_loss_mlp": 0.01032032, + "balance_loss_clip": 1.04763031, + "balance_loss_mlp": 1.01816189, + "epoch": 0.2480084172553735, + "flos": 48414285511200.0, + "grad_norm": 1.5925678523722546, + "language_loss": 0.74075997, + "learning_rate": 3.521804257268357e-06, + "loss": 0.76243323, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.13873291, + "step": 4125, + "time_per_iteration": 4.369304656982422 + }, + { + "auxiliary_loss_clip": 0.01142044, + "auxiliary_loss_mlp": 0.01044128, + "balance_loss_clip": 1.04735756, + "balance_loss_mlp": 1.0286901, + "epoch": 0.24806854050804147, + "flos": 26910132253920.0, + "grad_norm": 1.875943831912057, + "language_loss": 0.69807756, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.71993923, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 0.94677734, + "router_z_loss_mlp": 0.15423584, + "step": 4126, + "time_per_iteration": 2.6828627586364746 + }, + { + "auxiliary_loss_clip": 0.01138449, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.04716229, + "balance_loss_mlp": 1.02840185, + "epoch": 0.24812866376070947, + "flos": 18896592686400.0, + "grad_norm": 2.3545535413537375, + "language_loss": 0.8123405, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83415556, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.14660645, + "step": 4127, + "time_per_iteration": 2.633180856704712 + }, + { + "auxiliary_loss_clip": 0.01140684, + "auxiliary_loss_mlp": 0.01041718, + "balance_loss_clip": 1.04825866, + "balance_loss_mlp": 1.02720451, + "epoch": 0.24818878701337743, + "flos": 18006555052800.0, + "grad_norm": 2.5888619257718166, + "language_loss": 0.8397814, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.86160535, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.14508057, + "step": 4128, + "time_per_iteration": 2.707275629043579 + }, + { + "auxiliary_loss_clip": 0.01138285, + "auxiliary_loss_mlp": 0.01050031, + "balance_loss_clip": 1.04684436, + "balance_loss_mlp": 1.03533888, + "epoch": 0.2482489102660454, + "flos": 33055119515520.0, + "grad_norm": 3.6859003788040026, + "language_loss": 0.65227717, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67416036, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.14697266, + "step": 4129, + "time_per_iteration": 2.7229392528533936 + }, + { + "auxiliary_loss_clip": 0.01138883, + "auxiliary_loss_mlp": 0.01038374, + "balance_loss_clip": 1.04716909, + "balance_loss_mlp": 1.02241206, + "epoch": 0.24830903351871336, + "flos": 32296298300640.0, + "grad_norm": 1.584261902965354, + "language_loss": 0.75190914, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.7736817, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 0.91748047, + "router_z_loss_mlp": 0.1595459, + "step": 4130, + "time_per_iteration": 2.8524434566497803 + }, + { + "auxiliary_loss_clip": 0.01140749, + "auxiliary_loss_mlp": 0.01045229, + "balance_loss_clip": 1.0475353, + "balance_loss_mlp": 1.02948153, + "epoch": 0.24836915677138133, + "flos": 12480380337600.0, + "grad_norm": 2.129706917265225, + "language_loss": 0.76748741, + "learning_rate": 3.520286966670535e-06, + "loss": 0.78934717, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.15759277, + "step": 4131, + "time_per_iteration": 2.650369644165039 + }, + { + "auxiliary_loss_clip": 0.01136798, + "auxiliary_loss_mlp": 0.0104478, + "balance_loss_clip": 1.04729939, + "balance_loss_mlp": 1.03069556, + "epoch": 0.2484292800240493, + "flos": 36704962607040.0, + "grad_norm": 1.718012991749995, + "language_loss": 0.83759558, + "learning_rate": 3.520033883075255e-06, + "loss": 0.85941142, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.14086914, + "step": 4132, + "time_per_iteration": 2.7529265880584717 + }, + { + "auxiliary_loss_clip": 0.0113899, + "auxiliary_loss_mlp": 0.01040387, + "balance_loss_clip": 1.04697037, + "balance_loss_mlp": 1.02447283, + "epoch": 0.24848940327671726, + "flos": 16402623517440.0, + "grad_norm": 1.9292801628114562, + "language_loss": 0.71081108, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73260486, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.15917969, + "step": 4133, + "time_per_iteration": 2.7298190593719482 + }, + { + "auxiliary_loss_clip": 0.01145767, + "auxiliary_loss_mlp": 0.01044414, + "balance_loss_clip": 1.04744649, + "balance_loss_mlp": 1.02686656, + "epoch": 0.24854952652938525, + "flos": 24368075424000.0, + "grad_norm": 2.6845759794258433, + "language_loss": 0.61806536, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.63996714, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.17565918, + "step": 4134, + "time_per_iteration": 2.6879923343658447 + }, + { + "auxiliary_loss_clip": 0.01140949, + "auxiliary_loss_mlp": 0.01036724, + "balance_loss_clip": 1.04772544, + "balance_loss_mlp": 1.02186418, + "epoch": 0.24860964978205322, + "flos": 22146708929760.0, + "grad_norm": 1.8991404373317056, + "language_loss": 0.78143048, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.80320716, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.14849854, + "step": 4135, + "time_per_iteration": 2.694486618041992 + }, + { + "auxiliary_loss_clip": 0.01142214, + "auxiliary_loss_mlp": 0.01031349, + "balance_loss_clip": 1.04968667, + "balance_loss_mlp": 1.01710355, + "epoch": 0.24866977303472118, + "flos": 14310650164320.0, + "grad_norm": 2.0528882893556406, + "language_loss": 0.82628882, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.84802449, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.14239502, + "step": 4136, + "time_per_iteration": 2.6763951778411865 + }, + { + "auxiliary_loss_clip": 0.01141446, + "auxiliary_loss_mlp": 0.01043017, + "balance_loss_clip": 1.04784226, + "balance_loss_mlp": 1.02806234, + "epoch": 0.24872989628738915, + "flos": 42492921883200.0, + "grad_norm": 2.4182929915588707, + "language_loss": 0.70340121, + "learning_rate": 3.518767600693314e-06, + "loss": 0.72524583, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.1496582, + "step": 4137, + "time_per_iteration": 2.8175394535064697 + }, + { + "auxiliary_loss_clip": 0.01142358, + "auxiliary_loss_mlp": 0.01043299, + "balance_loss_clip": 1.04635882, + "balance_loss_mlp": 1.02806413, + "epoch": 0.2487900195400571, + "flos": 16715412983520.0, + "grad_norm": 2.2214756339872537, + "language_loss": 0.66845953, + "learning_rate": 3.518514171403042e-06, + "loss": 0.69031608, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.15234375, + "step": 4138, + "time_per_iteration": 2.9301235675811768 + }, + { + "auxiliary_loss_clip": 0.01135706, + "auxiliary_loss_mlp": 0.01037503, + "balance_loss_clip": 1.0460335, + "balance_loss_mlp": 1.02342415, + "epoch": 0.24885014279272508, + "flos": 30917611090080.0, + "grad_norm": 1.8409558863695057, + "language_loss": 0.83689952, + "learning_rate": 3.51826068453056e-06, + "loss": 0.85863161, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 0.89501953, + "router_z_loss_mlp": 0.14093018, + "step": 4139, + "time_per_iteration": 2.7498550415039062 + }, + { + "auxiliary_loss_clip": 0.01142979, + "auxiliary_loss_mlp": 0.0104062, + "balance_loss_clip": 1.04802215, + "balance_loss_mlp": 1.02465773, + "epoch": 0.24891026604539307, + "flos": 25174498092480.0, + "grad_norm": 1.8356664246844643, + "language_loss": 0.79076761, + "learning_rate": 3.518007140085481e-06, + "loss": 0.81260359, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.15979004, + "step": 4140, + "time_per_iteration": 2.9221603870391846 + }, + { + "auxiliary_loss_clip": 0.01048485, + "auxiliary_loss_mlp": 0.01021879, + "balance_loss_clip": 1.01748729, + "balance_loss_mlp": 1.02018583, + "epoch": 0.24897038929806103, + "flos": 81705913155360.0, + "grad_norm": 0.8249260143389117, + "language_loss": 0.60969424, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63039792, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 0.31030273, + "router_z_loss_mlp": 0.01695251, + "step": 4141, + "time_per_iteration": 3.386162042617798 + }, + { + "auxiliary_loss_clip": 0.01143654, + "auxiliary_loss_mlp": 0.01041739, + "balance_loss_clip": 1.04865718, + "balance_loss_mlp": 1.02602673, + "epoch": 0.249030512550729, + "flos": 44407495468800.0, + "grad_norm": 1.8373633903948257, + "language_loss": 0.72847581, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75032973, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 0.94970703, + "router_z_loss_mlp": 0.15722656, + "step": 4142, + "time_per_iteration": 2.865710496902466 + }, + { + "auxiliary_loss_clip": 0.01138847, + "auxiliary_loss_mlp": 0.01041121, + "balance_loss_clip": 1.04746926, + "balance_loss_mlp": 1.02635098, + "epoch": 0.24909063580339696, + "flos": 24592833541440.0, + "grad_norm": 1.8983971699109368, + "language_loss": 0.80941653, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83121622, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.14794922, + "step": 4143, + "time_per_iteration": 2.6508097648620605 + }, + { + "auxiliary_loss_clip": 0.01134154, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.04496253, + "balance_loss_mlp": 1.01825309, + "epoch": 0.24915075905606493, + "flos": 32209522987680.0, + "grad_norm": 1.9009378465317617, + "language_loss": 0.59147906, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.61314106, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 0.89111328, + "router_z_loss_mlp": 0.13787842, + "step": 4144, + "time_per_iteration": 2.7284305095672607 + }, + { + "auxiliary_loss_clip": 0.01136722, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.04519701, + "balance_loss_mlp": 1.01936221, + "epoch": 0.2492108823087329, + "flos": 33589263647520.0, + "grad_norm": 1.9640551933485102, + "language_loss": 0.78263712, + "learning_rate": 3.516738554607708e-06, + "loss": 0.80434978, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.15148926, + "step": 4145, + "time_per_iteration": 2.872182607650757 + }, + { + "auxiliary_loss_clip": 0.01148268, + "auxiliary_loss_mlp": 0.010454, + "balance_loss_clip": 1.04818475, + "balance_loss_mlp": 1.02692258, + "epoch": 0.24927100556140086, + "flos": 20366512110720.0, + "grad_norm": 1.895306562604429, + "language_loss": 0.65041935, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.67235601, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 0.99951172, + "router_z_loss_mlp": 0.18469238, + "step": 4146, + "time_per_iteration": 2.6893975734710693 + }, + { + "auxiliary_loss_clip": 0.01046772, + "auxiliary_loss_mlp": 0.01000078, + "balance_loss_clip": 1.01542735, + "balance_loss_mlp": 0.99830604, + "epoch": 0.24933112881406885, + "flos": 76594861854720.0, + "grad_norm": 0.944779750453674, + "language_loss": 0.67238355, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69285202, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 0.31323242, + "router_z_loss_mlp": 0.01771545, + "step": 4147, + "time_per_iteration": 3.404451847076416 + }, + { + "auxiliary_loss_clip": 0.01141849, + "auxiliary_loss_mlp": 0.01042499, + "balance_loss_clip": 1.04871047, + "balance_loss_mlp": 1.0268048, + "epoch": 0.24939125206673682, + "flos": 32521096935360.0, + "grad_norm": 1.7169164749484838, + "language_loss": 0.89024794, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.91209137, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.15692139, + "step": 4148, + "time_per_iteration": 2.754166841506958 + }, + { + "auxiliary_loss_clip": 0.01148027, + "auxiliary_loss_mlp": 0.01045117, + "balance_loss_clip": 1.05076516, + "balance_loss_mlp": 1.0266993, + "epoch": 0.24945137531940478, + "flos": 25263461338560.0, + "grad_norm": 17.135562355191013, + "language_loss": 0.68344373, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.70537508, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 0.97265625, + "router_z_loss_mlp": 0.18408203, + "step": 4149, + "time_per_iteration": 2.699206590652466 + }, + { + "auxiliary_loss_clip": 0.0113754, + "auxiliary_loss_mlp": 0.01036005, + "balance_loss_clip": 1.047822, + "balance_loss_mlp": 1.02076411, + "epoch": 0.24951149857207275, + "flos": 29002834918080.0, + "grad_norm": 2.0122848738790178, + "language_loss": 0.71116966, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73290509, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.15252686, + "step": 4150, + "time_per_iteration": 2.729863405227661 + }, + { + "auxiliary_loss_clip": 0.01142111, + "auxiliary_loss_mlp": 0.01041739, + "balance_loss_clip": 1.04750192, + "balance_loss_mlp": 1.0259316, + "epoch": 0.2495716218247407, + "flos": 19114341314400.0, + "grad_norm": 1.861899507968988, + "language_loss": 0.72996843, + "learning_rate": 3.515214354149478e-06, + "loss": 0.75180686, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.15826416, + "step": 4151, + "time_per_iteration": 2.692368984222412 + }, + { + "auxiliary_loss_clip": 0.01146069, + "auxiliary_loss_mlp": 0.01045156, + "balance_loss_clip": 1.04803491, + "balance_loss_mlp": 1.02903914, + "epoch": 0.24963174507740868, + "flos": 29349936169920.0, + "grad_norm": 3.063213980975614, + "language_loss": 0.62555319, + "learning_rate": 3.514960119583781e-06, + "loss": 0.64746541, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 0.97802734, + "router_z_loss_mlp": 0.16113281, + "step": 4152, + "time_per_iteration": 2.724506378173828 + }, + { + "auxiliary_loss_clip": 0.01139109, + "auxiliary_loss_mlp": 0.01036789, + "balance_loss_clip": 1.04859114, + "balance_loss_mlp": 1.02142286, + "epoch": 0.24969186833007664, + "flos": 26599814341920.0, + "grad_norm": 1.9806454664545334, + "language_loss": 0.77259541, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79435444, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.15356445, + "step": 4153, + "time_per_iteration": 2.6816534996032715 + }, + { + "auxiliary_loss_clip": 0.01140749, + "auxiliary_loss_mlp": 0.01033332, + "balance_loss_clip": 1.04945445, + "balance_loss_mlp": 1.01800191, + "epoch": 0.24975199158274464, + "flos": 24328496391840.0, + "grad_norm": 1.8245501757070492, + "language_loss": 0.76645935, + "learning_rate": 3.514451478119711e-06, + "loss": 0.78820014, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.15332031, + "step": 4154, + "time_per_iteration": 2.7391440868377686 + }, + { + "auxiliary_loss_clip": 0.01144054, + "auxiliary_loss_mlp": 0.0104247, + "balance_loss_clip": 1.04506993, + "balance_loss_mlp": 1.02492249, + "epoch": 0.2498121148354126, + "flos": 30918259366560.0, + "grad_norm": 2.041614701810784, + "language_loss": 0.70916915, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.73103434, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 0.99023438, + "router_z_loss_mlp": 0.17553711, + "step": 4155, + "time_per_iteration": 2.713301420211792 + }, + { + "auxiliary_loss_clip": 0.01144589, + "auxiliary_loss_mlp": 0.01042921, + "balance_loss_clip": 1.04892993, + "balance_loss_mlp": 1.02712536, + "epoch": 0.24987223808808057, + "flos": 25085818467360.0, + "grad_norm": 1.7357375314878223, + "language_loss": 0.75053763, + "learning_rate": 3.513942606943036e-06, + "loss": 0.77241272, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.15783691, + "step": 4156, + "time_per_iteration": 2.7176148891448975 + }, + { + "auxiliary_loss_clip": 0.01138748, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.04642737, + "balance_loss_mlp": 1.02245617, + "epoch": 0.24993236134074853, + "flos": 24098349476160.0, + "grad_norm": 1.908581402883896, + "language_loss": 0.77861297, + "learning_rate": 3.513688085236591e-06, + "loss": 0.80037683, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 0.92480469, + "router_z_loss_mlp": 0.1517334, + "step": 4157, + "time_per_iteration": 2.7035820484161377 + }, + { + "auxiliary_loss_clip": 0.01143027, + "auxiliary_loss_mlp": 0.01043573, + "balance_loss_clip": 1.04815543, + "balance_loss_mlp": 1.02864778, + "epoch": 0.2499924845934165, + "flos": 22903423246080.0, + "grad_norm": 1.6583315399387675, + "language_loss": 0.81457174, + "learning_rate": 3.513433506130942e-06, + "loss": 0.83643776, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.14935303, + "step": 4158, + "time_per_iteration": 2.7043302059173584 + }, + { + "auxiliary_loss_clip": 0.01142252, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.04776418, + "balance_loss_mlp": 1.01844716, + "epoch": 0.25005260784608446, + "flos": 20587299534720.0, + "grad_norm": 1.815079282747176, + "language_loss": 0.75579417, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.77755427, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.15319824, + "step": 4159, + "time_per_iteration": 2.708036422729492 + }, + { + "auxiliary_loss_clip": 0.01143895, + "auxiliary_loss_mlp": 0.01039182, + "balance_loss_clip": 1.0478363, + "balance_loss_mlp": 1.02285016, + "epoch": 0.2501127310987524, + "flos": 26997191187840.0, + "grad_norm": 2.258304050631511, + "language_loss": 0.71351033, + "learning_rate": 3.512924175760649e-06, + "loss": 0.73534107, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 0.96044922, + "router_z_loss_mlp": 0.16345215, + "step": 4160, + "time_per_iteration": 4.93120551109314 + }, + { + "auxiliary_loss_clip": 0.01047882, + "auxiliary_loss_mlp": 0.01003601, + "balance_loss_clip": 1.01700258, + "balance_loss_mlp": 1.00195146, + "epoch": 0.2501728543514204, + "flos": 84752340266880.0, + "grad_norm": 0.7461813635485791, + "language_loss": 0.56761038, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.58812523, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 0.30883789, + "router_z_loss_mlp": 0.01652527, + "step": 4161, + "time_per_iteration": 4.703098773956299 + }, + { + "auxiliary_loss_clip": 0.01150984, + "auxiliary_loss_mlp": 0.01050175, + "balance_loss_clip": 1.05064511, + "balance_loss_mlp": 1.03346181, + "epoch": 0.25023297760408836, + "flos": 19876687532640.0, + "grad_norm": 3.7311149387936924, + "language_loss": 0.81329024, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.83530188, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 1.00341797, + "router_z_loss_mlp": 0.16711426, + "step": 4162, + "time_per_iteration": 4.034192800521851 + }, + { + "auxiliary_loss_clip": 0.01141241, + "auxiliary_loss_mlp": 0.01040674, + "balance_loss_clip": 1.04395187, + "balance_loss_mlp": 1.02474737, + "epoch": 0.2502931008567563, + "flos": 14932339437600.0, + "grad_norm": 2.568795280146556, + "language_loss": 0.87262738, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.89444655, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.15930176, + "step": 4163, + "time_per_iteration": 2.631324529647827 + }, + { + "auxiliary_loss_clip": 0.01142902, + "auxiliary_loss_mlp": 0.01039408, + "balance_loss_clip": 1.04737484, + "balance_loss_mlp": 1.02349401, + "epoch": 0.25035322410942434, + "flos": 28287077221440.0, + "grad_norm": 1.7397542309925518, + "language_loss": 0.83308744, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.85491049, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.15893555, + "step": 4164, + "time_per_iteration": 2.6572206020355225 + }, + { + "auxiliary_loss_clip": 0.01141954, + "auxiliary_loss_mlp": 0.01042177, + "balance_loss_clip": 1.05215883, + "balance_loss_mlp": 1.02764571, + "epoch": 0.2504133473620923, + "flos": 25524151932960.0, + "grad_norm": 1.9983082743845484, + "language_loss": 0.74522007, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76706141, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.1451416, + "step": 4165, + "time_per_iteration": 4.20706033706665 + }, + { + "auxiliary_loss_clip": 0.01145223, + "auxiliary_loss_mlp": 0.01042944, + "balance_loss_clip": 1.04848444, + "balance_loss_mlp": 1.02662373, + "epoch": 0.2504734706147603, + "flos": 25351654756320.0, + "grad_norm": 1.827087642152704, + "language_loss": 0.7463131, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.76819474, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 0.96630859, + "router_z_loss_mlp": 0.1630249, + "step": 4166, + "time_per_iteration": 2.642115831375122 + }, + { + "auxiliary_loss_clip": 0.01139434, + "auxiliary_loss_mlp": 0.01038534, + "balance_loss_clip": 1.04758251, + "balance_loss_mlp": 1.02422905, + "epoch": 0.25053359386742824, + "flos": 29711745194400.0, + "grad_norm": 1.6754050211234035, + "language_loss": 0.81577462, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.83755434, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.1428833, + "step": 4167, + "time_per_iteration": 2.693591356277466 + }, + { + "auxiliary_loss_clip": 0.01141712, + "auxiliary_loss_mlp": 0.01044196, + "balance_loss_clip": 1.04905069, + "balance_loss_mlp": 1.02863932, + "epoch": 0.2505937171200962, + "flos": 25885879922880.0, + "grad_norm": 2.23049893332861, + "language_loss": 0.79482567, + "learning_rate": 3.51088456024312e-06, + "loss": 0.81668472, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.15545654, + "step": 4168, + "time_per_iteration": 2.6684372425079346 + }, + { + "auxiliary_loss_clip": 0.0114674, + "auxiliary_loss_mlp": 0.01040414, + "balance_loss_clip": 1.047925, + "balance_loss_mlp": 1.02240133, + "epoch": 0.25065384037276417, + "flos": 50551226694720.0, + "grad_norm": 2.412419331196359, + "language_loss": 0.69994473, + "learning_rate": 3.510629350383849e-06, + "loss": 0.7218163, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 0.98681641, + "router_z_loss_mlp": 0.18017578, + "step": 4169, + "time_per_iteration": 2.885366439819336 + }, + { + "auxiliary_loss_clip": 0.0113765, + "auxiliary_loss_mlp": 0.01042046, + "balance_loss_clip": 1.04667091, + "balance_loss_mlp": 1.02678704, + "epoch": 0.25071396362543213, + "flos": 31850752759200.0, + "grad_norm": 1.6917726560811046, + "language_loss": 0.77875888, + "learning_rate": 3.510374083241361e-06, + "loss": 0.80055583, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.15246582, + "step": 4170, + "time_per_iteration": 2.86810564994812 + }, + { + "auxiliary_loss_clip": 0.01142063, + "auxiliary_loss_mlp": 0.01041075, + "balance_loss_clip": 1.04765391, + "balance_loss_mlp": 1.02562523, + "epoch": 0.2507740868781001, + "flos": 23304487164480.0, + "grad_norm": 2.476737924119252, + "language_loss": 0.76103103, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.78286242, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.15447998, + "step": 4171, + "time_per_iteration": 2.66709303855896 + }, + { + "auxiliary_loss_clip": 0.01050233, + "auxiliary_loss_mlp": 0.01000083, + "balance_loss_clip": 1.01912713, + "balance_loss_mlp": 0.99830866, + "epoch": 0.25083421013076806, + "flos": 78509921647680.0, + "grad_norm": 0.8337182695112703, + "language_loss": 0.60045141, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62095463, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 0.31103516, + "router_z_loss_mlp": 0.01773071, + "step": 4172, + "time_per_iteration": 3.319396495819092 + }, + { + "auxiliary_loss_clip": 0.01142154, + "auxiliary_loss_mlp": 0.01046958, + "balance_loss_clip": 1.04750967, + "balance_loss_mlp": 1.03060222, + "epoch": 0.25089433338343603, + "flos": 29626306951680.0, + "grad_norm": 1.6307902460198944, + "language_loss": 0.7855823, + "learning_rate": 3.509607938211409e-06, + "loss": 0.80747342, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.16357422, + "step": 4173, + "time_per_iteration": 2.7046995162963867 + }, + { + "auxiliary_loss_clip": 0.01141188, + "auxiliary_loss_mlp": 0.01044914, + "balance_loss_clip": 1.04794145, + "balance_loss_mlp": 1.02941656, + "epoch": 0.250954456636104, + "flos": 17962640671680.0, + "grad_norm": 1.9574904242186293, + "language_loss": 0.83578181, + "learning_rate": 3.509352442032875e-06, + "loss": 0.85764277, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.1550293, + "step": 4174, + "time_per_iteration": 2.865453004837036 + }, + { + "auxiliary_loss_clip": 0.01144015, + "auxiliary_loss_mlp": 0.01038247, + "balance_loss_clip": 1.04844379, + "balance_loss_mlp": 1.02210617, + "epoch": 0.25101457988877196, + "flos": 27178359062400.0, + "grad_norm": 2.1275026072772056, + "language_loss": 0.71573287, + "learning_rate": 3.509096888619545e-06, + "loss": 0.7375555, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.16137695, + "step": 4175, + "time_per_iteration": 2.720151424407959 + }, + { + "auxiliary_loss_clip": 0.01141208, + "auxiliary_loss_mlp": 0.01036137, + "balance_loss_clip": 1.04487872, + "balance_loss_mlp": 1.01999032, + "epoch": 0.2510747031414399, + "flos": 30736200111840.0, + "grad_norm": 1.8481695619998584, + "language_loss": 0.8074106, + "learning_rate": 3.50884127798111e-06, + "loss": 0.82918406, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.16143799, + "step": 4176, + "time_per_iteration": 2.6812119483947754 + }, + { + "auxiliary_loss_clip": 0.01143028, + "auxiliary_loss_mlp": 0.01039472, + "balance_loss_clip": 1.04848433, + "balance_loss_mlp": 1.02312851, + "epoch": 0.25113482639410795, + "flos": 25263299269440.0, + "grad_norm": 3.535079765260622, + "language_loss": 0.82953984, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.85136485, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.16333008, + "step": 4177, + "time_per_iteration": 2.778883218765259 + }, + { + "auxiliary_loss_clip": 0.01144486, + "auxiliary_loss_mlp": 0.01044163, + "balance_loss_clip": 1.05131102, + "balance_loss_mlp": 1.02778351, + "epoch": 0.2511949496467759, + "flos": 26242867391040.0, + "grad_norm": 2.1495731768161437, + "language_loss": 0.82772505, + "learning_rate": 3.508329885067698e-06, + "loss": 0.84961152, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 0.93164062, + "router_z_loss_mlp": 0.16369629, + "step": 4178, + "time_per_iteration": 2.6557235717773438 + }, + { + "auxiliary_loss_clip": 0.01135328, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.04507113, + "balance_loss_mlp": 1.02411509, + "epoch": 0.2512550728994439, + "flos": 25260827715360.0, + "grad_norm": 2.515001694238476, + "language_loss": 0.75573874, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77747631, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.14312744, + "step": 4179, + "time_per_iteration": 2.635282516479492 + }, + { + "auxiliary_loss_clip": 0.01144895, + "auxiliary_loss_mlp": 0.0104553, + "balance_loss_clip": 1.04903424, + "balance_loss_mlp": 1.02907908, + "epoch": 0.25131519615211184, + "flos": 22547489227200.0, + "grad_norm": 2.070907876889013, + "language_loss": 0.70291293, + "learning_rate": 3.507818263370206e-06, + "loss": 0.72481716, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.16442871, + "step": 4180, + "time_per_iteration": 2.6313650608062744 + }, + { + "auxiliary_loss_clip": 0.01140639, + "auxiliary_loss_mlp": 0.01042442, + "balance_loss_clip": 1.04834783, + "balance_loss_mlp": 1.02704024, + "epoch": 0.2513753194047798, + "flos": 24995801772000.0, + "grad_norm": 1.7339245996532646, + "language_loss": 0.86153197, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.88336283, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 0.92333984, + "router_z_loss_mlp": 0.1541748, + "step": 4181, + "time_per_iteration": 2.667239189147949 + }, + { + "auxiliary_loss_clip": 0.01140951, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.04816961, + "balance_loss_mlp": 1.03033996, + "epoch": 0.25143544265744777, + "flos": 45966378139200.0, + "grad_norm": 2.1928454113630034, + "language_loss": 0.67906487, + "learning_rate": 3.507306412966238e-06, + "loss": 0.70093608, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.1583252, + "step": 4182, + "time_per_iteration": 2.7995524406433105 + }, + { + "auxiliary_loss_clip": 0.01045511, + "auxiliary_loss_mlp": 0.01011622, + "balance_loss_clip": 1.01465857, + "balance_loss_mlp": 1.01007748, + "epoch": 0.25149556591011574, + "flos": 84643402001760.0, + "grad_norm": 0.8498799212128576, + "language_loss": 0.7005403, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72111166, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.01543427, + "step": 4183, + "time_per_iteration": 3.352957248687744 + }, + { + "auxiliary_loss_clip": 0.01136411, + "auxiliary_loss_mlp": 0.0104223, + "balance_loss_clip": 1.0430913, + "balance_loss_mlp": 1.02614844, + "epoch": 0.2515556891627837, + "flos": 17070658208640.0, + "grad_norm": 1.6781518854193442, + "language_loss": 0.74425602, + "learning_rate": 3.506794333933431e-06, + "loss": 0.76604235, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.16088867, + "step": 4184, + "time_per_iteration": 2.7235448360443115 + }, + { + "auxiliary_loss_clip": 0.0114306, + "auxiliary_loss_mlp": 0.0104733, + "balance_loss_clip": 1.05050409, + "balance_loss_mlp": 1.03227353, + "epoch": 0.25161581241545167, + "flos": 27044103847680.0, + "grad_norm": 1.7707113231849305, + "language_loss": 0.83256674, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85447067, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.15063477, + "step": 4185, + "time_per_iteration": 2.6522040367126465 + }, + { + "auxiliary_loss_clip": 0.01046388, + "auxiliary_loss_mlp": 0.0100338, + "balance_loss_clip": 1.015625, + "balance_loss_mlp": 1.00178218, + "epoch": 0.25167593566811963, + "flos": 84633191647200.0, + "grad_norm": 0.7913058789470663, + "language_loss": 0.61561966, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63611734, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 0.30834961, + "router_z_loss_mlp": 0.01598358, + "step": 4186, + "time_per_iteration": 3.1414196491241455 + }, + { + "auxiliary_loss_clip": 0.01140196, + "auxiliary_loss_mlp": 0.01039677, + "balance_loss_clip": 1.04725885, + "balance_loss_mlp": 1.02353668, + "epoch": 0.2517360589207876, + "flos": 16181066265120.0, + "grad_norm": 2.3834105058230293, + "language_loss": 0.79251981, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.81431854, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.16149902, + "step": 4187, + "time_per_iteration": 2.611109495162964 + }, + { + "auxiliary_loss_clip": 0.01140083, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_clip": 1.04866433, + "balance_loss_mlp": 1.03549862, + "epoch": 0.25179618217345556, + "flos": 24863734490400.0, + "grad_norm": 1.4645056991350223, + "language_loss": 0.79788899, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.81980252, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.15771484, + "step": 4188, + "time_per_iteration": 2.6701908111572266 + }, + { + "auxiliary_loss_clip": 0.01138407, + "auxiliary_loss_mlp": 0.01044729, + "balance_loss_clip": 1.04564381, + "balance_loss_mlp": 1.02992296, + "epoch": 0.25185630542612353, + "flos": 33757830648000.0, + "grad_norm": 2.0270821712845635, + "language_loss": 0.74133891, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76317024, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.14807129, + "step": 4189, + "time_per_iteration": 2.707303047180176 + }, + { + "auxiliary_loss_clip": 0.01135816, + "auxiliary_loss_mlp": 0.01036956, + "balance_loss_clip": 1.04687142, + "balance_loss_mlp": 1.02265692, + "epoch": 0.25191642867879155, + "flos": 25619881564800.0, + "grad_norm": 1.9164889781543686, + "language_loss": 0.84356838, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.86529613, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.14294434, + "step": 4190, + "time_per_iteration": 2.7912425994873047 + }, + { + "auxiliary_loss_clip": 0.01139901, + "auxiliary_loss_mlp": 0.01043503, + "balance_loss_clip": 1.04730701, + "balance_loss_mlp": 1.02659893, + "epoch": 0.2519765519314595, + "flos": 25753204882080.0, + "grad_norm": 1.8745437489469265, + "language_loss": 0.75244641, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77428049, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.16918945, + "step": 4191, + "time_per_iteration": 2.637235641479492 + }, + { + "auxiliary_loss_clip": 0.01046992, + "auxiliary_loss_mlp": 0.0099934, + "balance_loss_clip": 1.01637149, + "balance_loss_mlp": 0.99773592, + "epoch": 0.2520366751841275, + "flos": 76560509551680.0, + "grad_norm": 0.7194341591007893, + "language_loss": 0.57109332, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59155667, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 0.30688477, + "router_z_loss_mlp": 0.01605988, + "step": 4192, + "time_per_iteration": 3.3926124572753906 + }, + { + "auxiliary_loss_clip": 0.0113783, + "auxiliary_loss_mlp": 0.01036677, + "balance_loss_clip": 1.0476613, + "balance_loss_mlp": 1.02085805, + "epoch": 0.25209679843679544, + "flos": 27126624846240.0, + "grad_norm": 1.8805389277830675, + "language_loss": 0.75929332, + "learning_rate": 3.504487151087323e-06, + "loss": 0.7810384, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.15808105, + "step": 4193, + "time_per_iteration": 2.671358823776245 + }, + { + "auxiliary_loss_clip": 0.01140926, + "auxiliary_loss_mlp": 0.01046828, + "balance_loss_clip": 1.04637575, + "balance_loss_mlp": 1.03142667, + "epoch": 0.2521569216894634, + "flos": 14845118434560.0, + "grad_norm": 3.307819219524239, + "language_loss": 0.84103453, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.862912, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.15386963, + "step": 4194, + "time_per_iteration": 2.6648342609405518 + }, + { + "auxiliary_loss_clip": 0.01140272, + "auxiliary_loss_mlp": 0.01046878, + "balance_loss_clip": 1.04584908, + "balance_loss_mlp": 1.03281164, + "epoch": 0.2522170449421314, + "flos": 28918409607360.0, + "grad_norm": 1.5676739085422302, + "language_loss": 0.88149583, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90336734, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.140625, + "step": 4195, + "time_per_iteration": 2.6804051399230957 + }, + { + "auxiliary_loss_clip": 0.01140414, + "auxiliary_loss_mlp": 0.01040286, + "balance_loss_clip": 1.04618335, + "balance_loss_mlp": 1.02248764, + "epoch": 0.25227716819479934, + "flos": 25570294764480.0, + "grad_norm": 3.2139269642748074, + "language_loss": 0.85508329, + "learning_rate": 3.503717062883053e-06, + "loss": 0.8768903, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.17785645, + "step": 4196, + "time_per_iteration": 2.730095148086548 + }, + { + "auxiliary_loss_clip": 0.01140087, + "auxiliary_loss_mlp": 0.01040225, + "balance_loss_clip": 1.04640579, + "balance_loss_mlp": 1.02513289, + "epoch": 0.2523372914474673, + "flos": 28469136476160.0, + "grad_norm": 1.9399792778285443, + "language_loss": 0.83579165, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.85759479, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.15100098, + "step": 4197, + "time_per_iteration": 2.7721869945526123 + }, + { + "auxiliary_loss_clip": 0.01142915, + "auxiliary_loss_mlp": 0.01048261, + "balance_loss_clip": 1.04786301, + "balance_loss_mlp": 1.03104734, + "epoch": 0.25239741470013527, + "flos": 45113083328160.0, + "grad_norm": 1.8386525940629859, + "language_loss": 0.725227, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.74713874, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.17224121, + "step": 4198, + "time_per_iteration": 2.8199405670166016 + }, + { + "auxiliary_loss_clip": 0.01143893, + "auxiliary_loss_mlp": 0.01045665, + "balance_loss_clip": 1.04693317, + "balance_loss_mlp": 1.02902353, + "epoch": 0.25245753795280323, + "flos": 22592740678560.0, + "grad_norm": 1.783796548537948, + "language_loss": 0.76818156, + "learning_rate": 3.50294646148888e-06, + "loss": 0.79007709, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 0.97021484, + "router_z_loss_mlp": 0.16638184, + "step": 4199, + "time_per_iteration": 2.700176477432251 + }, + { + "auxiliary_loss_clip": 0.01143205, + "auxiliary_loss_mlp": 0.01041447, + "balance_loss_clip": 1.04797947, + "balance_loss_mlp": 1.02691507, + "epoch": 0.2525176612054712, + "flos": 39468009447360.0, + "grad_norm": 2.375366678284751, + "language_loss": 0.73508942, + "learning_rate": 3.502689480360739e-06, + "loss": 0.75693595, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.14538574, + "step": 4200, + "time_per_iteration": 5.606541633605957 + }, + { + "auxiliary_loss_clip": 0.01141423, + "auxiliary_loss_mlp": 0.0104406, + "balance_loss_clip": 1.04690647, + "balance_loss_mlp": 1.029737, + "epoch": 0.25257778445813917, + "flos": 55226091945600.0, + "grad_norm": 1.5659064152311233, + "language_loss": 0.8259182, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.84777302, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.14312744, + "step": 4201, + "time_per_iteration": 4.325026035308838 + }, + { + "auxiliary_loss_clip": 0.01144028, + "auxiliary_loss_mlp": 0.01044388, + "balance_loss_clip": 1.0478425, + "balance_loss_mlp": 1.02925456, + "epoch": 0.25263790771080713, + "flos": 28513618099200.0, + "grad_norm": 1.7688861433246794, + "language_loss": 0.75224906, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77413321, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.15148926, + "step": 4202, + "time_per_iteration": 2.6492860317230225 + }, + { + "auxiliary_loss_clip": 0.01138251, + "auxiliary_loss_mlp": 0.01035705, + "balance_loss_clip": 1.04686809, + "balance_loss_mlp": 1.02097082, + "epoch": 0.25269803096347515, + "flos": 22591930332960.0, + "grad_norm": 1.9506402517831878, + "language_loss": 0.73665702, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75839663, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.14733887, + "step": 4203, + "time_per_iteration": 2.6725871562957764 + }, + { + "auxiliary_loss_clip": 0.01139441, + "auxiliary_loss_mlp": 0.01041267, + "balance_loss_clip": 1.0457201, + "balance_loss_mlp": 1.02581739, + "epoch": 0.2527581542161431, + "flos": 30029923458720.0, + "grad_norm": 1.5310307334413387, + "language_loss": 0.77673793, + "learning_rate": 3.501660986124297e-06, + "loss": 0.798545, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.15460205, + "step": 4204, + "time_per_iteration": 4.2410054206848145 + }, + { + "auxiliary_loss_clip": 0.01141355, + "auxiliary_loss_mlp": 0.01045093, + "balance_loss_clip": 1.04621077, + "balance_loss_mlp": 1.03001285, + "epoch": 0.2528182774688111, + "flos": 15424270914240.0, + "grad_norm": 1.9960217327290875, + "language_loss": 0.7253536, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74721807, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 0.95214844, + "router_z_loss_mlp": 0.15081787, + "step": 4205, + "time_per_iteration": 2.5942413806915283 + }, + { + "auxiliary_loss_clip": 0.01138093, + "auxiliary_loss_mlp": 0.01036316, + "balance_loss_clip": 1.04894614, + "balance_loss_mlp": 1.02260673, + "epoch": 0.25287840072147905, + "flos": 57274394021280.0, + "grad_norm": 1.716282415414355, + "language_loss": 0.75703013, + "learning_rate": 3.50114639730826e-06, + "loss": 0.77877414, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.137146, + "step": 4206, + "time_per_iteration": 2.861661911010742 + }, + { + "auxiliary_loss_clip": 0.01139769, + "auxiliary_loss_mlp": 0.01037281, + "balance_loss_clip": 1.04644835, + "balance_loss_mlp": 1.02236772, + "epoch": 0.252938523974147, + "flos": 23036949149760.0, + "grad_norm": 1.7538569816767682, + "language_loss": 0.79033434, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.81210488, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 0.93310547, + "router_z_loss_mlp": 0.14916992, + "step": 4207, + "time_per_iteration": 2.6008973121643066 + }, + { + "auxiliary_loss_clip": 0.01134667, + "auxiliary_loss_mlp": 0.01042607, + "balance_loss_clip": 1.04569387, + "balance_loss_mlp": 1.02755094, + "epoch": 0.252998647226815, + "flos": 26154673973280.0, + "grad_norm": 1.4468667115938671, + "language_loss": 0.76362705, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78539979, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.15063477, + "step": 4208, + "time_per_iteration": 2.6975536346435547 + }, + { + "auxiliary_loss_clip": 0.01136402, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.04598272, + "balance_loss_mlp": 1.01780665, + "epoch": 0.25305877047948294, + "flos": 31045424057280.0, + "grad_norm": 1.9255922421222438, + "language_loss": 0.69899225, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.72068143, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.14709473, + "step": 4209, + "time_per_iteration": 2.7175393104553223 + }, + { + "auxiliary_loss_clip": 0.01048947, + "auxiliary_loss_mlp": 0.01009239, + "balance_loss_clip": 1.01856089, + "balance_loss_mlp": 1.0076468, + "epoch": 0.2531188937321509, + "flos": 73437841620000.0, + "grad_norm": 0.7784177056783786, + "language_loss": 0.55141163, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57199347, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 0.30419922, + "router_z_loss_mlp": 0.01593781, + "step": 4210, + "time_per_iteration": 3.3907623291015625 + }, + { + "auxiliary_loss_clip": 0.01140075, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.04622519, + "balance_loss_mlp": 1.02038145, + "epoch": 0.25317901698481887, + "flos": 24016436236800.0, + "grad_norm": 1.9683618296202985, + "language_loss": 0.8013351, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.82308739, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.14764404, + "step": 4211, + "time_per_iteration": 2.698706865310669 + }, + { + "auxiliary_loss_clip": 0.01134941, + "auxiliary_loss_mlp": 0.01040024, + "balance_loss_clip": 1.04604816, + "balance_loss_mlp": 1.02601051, + "epoch": 0.25323914023748684, + "flos": 29803058442720.0, + "grad_norm": 1.53750278810872, + "language_loss": 0.78292191, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80467153, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.14001465, + "step": 4212, + "time_per_iteration": 2.7725510597229004 + }, + { + "auxiliary_loss_clip": 0.01137537, + "auxiliary_loss_mlp": 0.01036067, + "balance_loss_clip": 1.04510212, + "balance_loss_mlp": 1.02028942, + "epoch": 0.2532992634901548, + "flos": 31051380097440.0, + "grad_norm": 2.1391213025310445, + "language_loss": 0.53470021, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.55643624, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.15789795, + "step": 4213, + "time_per_iteration": 2.7667651176452637 + }, + { + "auxiliary_loss_clip": 0.01142329, + "auxiliary_loss_mlp": 0.01037865, + "balance_loss_clip": 1.04944897, + "balance_loss_mlp": 1.02231455, + "epoch": 0.25335938674282277, + "flos": 23037881047200.0, + "grad_norm": 2.9179029646904113, + "language_loss": 0.65218788, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67398983, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.15557861, + "step": 4214, + "time_per_iteration": 2.7298669815063477 + }, + { + "auxiliary_loss_clip": 0.01050127, + "auxiliary_loss_mlp": 0.00999881, + "balance_loss_clip": 1.01946425, + "balance_loss_mlp": 0.99823284, + "epoch": 0.25341950999549073, + "flos": 64746624248640.0, + "grad_norm": 0.8518254049920098, + "language_loss": 0.58100367, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60150373, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 0.30761719, + "router_z_loss_mlp": 0.01649475, + "step": 4215, + "time_per_iteration": 3.0823585987091064 + }, + { + "auxiliary_loss_clip": 0.0113825, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.04796362, + "balance_loss_mlp": 1.02115202, + "epoch": 0.2534796332481587, + "flos": 47613940434720.0, + "grad_norm": 1.7550424590464722, + "language_loss": 0.83505744, + "learning_rate": 3.498570039373066e-06, + "loss": 0.85680437, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.1529541, + "step": 4216, + "time_per_iteration": 2.840589761734009 + }, + { + "auxiliary_loss_clip": 0.0113968, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.04809368, + "balance_loss_mlp": 1.01764655, + "epoch": 0.2535397565008267, + "flos": 28781075079360.0, + "grad_norm": 1.9522185271029195, + "language_loss": 0.80370724, + "learning_rate": 3.498312090875666e-06, + "loss": 0.82543385, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.15344238, + "step": 4217, + "time_per_iteration": 2.6733877658843994 + }, + { + "auxiliary_loss_clip": 0.01134205, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.04369807, + "balance_loss_mlp": 1.01880586, + "epoch": 0.2535998797534947, + "flos": 23527057348800.0, + "grad_norm": 2.6108599876730723, + "language_loss": 0.75402725, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.77569532, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.13824463, + "step": 4218, + "time_per_iteration": 2.649740695953369 + }, + { + "auxiliary_loss_clip": 0.0114035, + "auxiliary_loss_mlp": 0.01035496, + "balance_loss_clip": 1.04588342, + "balance_loss_mlp": 1.01990366, + "epoch": 0.25366000300616265, + "flos": 29314854555840.0, + "grad_norm": 2.1887993009695026, + "language_loss": 0.74664342, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.76840186, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.15600586, + "step": 4219, + "time_per_iteration": 2.658738374710083 + }, + { + "auxiliary_loss_clip": 0.01143745, + "auxiliary_loss_mlp": 0.01044028, + "balance_loss_clip": 1.04971802, + "balance_loss_mlp": 1.02862668, + "epoch": 0.2537201262588306, + "flos": 19875998738880.0, + "grad_norm": 2.378487789809895, + "language_loss": 0.81148708, + "learning_rate": 3.497537904525736e-06, + "loss": 0.83336484, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.15393066, + "step": 4220, + "time_per_iteration": 2.663456678390503 + }, + { + "auxiliary_loss_clip": 0.0114221, + "auxiliary_loss_mlp": 0.01042436, + "balance_loss_clip": 1.04850817, + "balance_loss_mlp": 1.02651, + "epoch": 0.2537802495114986, + "flos": 28424411749440.0, + "grad_norm": 2.7656992494274455, + "language_loss": 0.70727968, + "learning_rate": 3.497279728822468e-06, + "loss": 0.72912616, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.15930176, + "step": 4221, + "time_per_iteration": 2.6534571647644043 + }, + { + "auxiliary_loss_clip": 0.01139766, + "auxiliary_loss_mlp": 0.01037678, + "balance_loss_clip": 1.04647946, + "balance_loss_mlp": 1.02266943, + "epoch": 0.25384037276416654, + "flos": 21523115344320.0, + "grad_norm": 1.791570126541308, + "language_loss": 0.61681497, + "learning_rate": 3.497021496342202e-06, + "loss": 0.63858938, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.15008545, + "step": 4222, + "time_per_iteration": 2.6988778114318848 + }, + { + "auxiliary_loss_clip": 0.01142973, + "auxiliary_loss_mlp": 0.01048002, + "balance_loss_clip": 1.047562, + "balance_loss_mlp": 1.03263581, + "epoch": 0.2539004960168345, + "flos": 26242988942880.0, + "grad_norm": 1.696626919222426, + "language_loss": 0.74849653, + "learning_rate": 3.496763207094731e-06, + "loss": 0.77040637, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.15368652, + "step": 4223, + "time_per_iteration": 2.6730775833129883 + }, + { + "auxiliary_loss_clip": 0.01137726, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.04775095, + "balance_loss_mlp": 1.01873791, + "epoch": 0.2539606192695025, + "flos": 29225243033280.0, + "grad_norm": 1.5844854941823037, + "language_loss": 0.8000195, + "learning_rate": 3.49650486108985e-06, + "loss": 0.82173204, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.14770508, + "step": 4224, + "time_per_iteration": 2.7046589851379395 + }, + { + "auxiliary_loss_clip": 0.01135397, + "auxiliary_loss_mlp": 0.0103904, + "balance_loss_clip": 1.04422021, + "balance_loss_mlp": 1.02425158, + "epoch": 0.25402074252217044, + "flos": 29492335357920.0, + "grad_norm": 1.6539220971824327, + "language_loss": 0.77501512, + "learning_rate": 3.496246458337354e-06, + "loss": 0.79675949, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.14794922, + "step": 4225, + "time_per_iteration": 2.7865757942199707 + }, + { + "auxiliary_loss_clip": 0.01137932, + "auxiliary_loss_mlp": 0.01047043, + "balance_loss_clip": 1.04489017, + "balance_loss_mlp": 1.03174806, + "epoch": 0.2540808657748384, + "flos": 27215750161440.0, + "grad_norm": 1.7154951086567116, + "language_loss": 0.84842175, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.87027156, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.1529541, + "step": 4226, + "time_per_iteration": 2.683586835861206 + }, + { + "auxiliary_loss_clip": 0.0113552, + "auxiliary_loss_mlp": 0.01040334, + "balance_loss_clip": 1.04456234, + "balance_loss_mlp": 1.02437162, + "epoch": 0.25414098902750637, + "flos": 33677578617120.0, + "grad_norm": 1.631611046876871, + "language_loss": 0.71233529, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.73409379, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.15979004, + "step": 4227, + "time_per_iteration": 2.7268764972686768 + }, + { + "auxiliary_loss_clip": 0.01043332, + "auxiliary_loss_mlp": 0.01010491, + "balance_loss_clip": 1.01281357, + "balance_loss_mlp": 1.00895333, + "epoch": 0.25420111228017434, + "flos": 70980007514400.0, + "grad_norm": 0.9837859091555511, + "language_loss": 0.61966503, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.64020324, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 0.30566406, + "router_z_loss_mlp": 0.01537323, + "step": 4228, + "time_per_iteration": 3.1751668453216553 + }, + { + "auxiliary_loss_clip": 0.01139402, + "auxiliary_loss_mlp": 0.01038085, + "balance_loss_clip": 1.04610801, + "balance_loss_mlp": 1.02262342, + "epoch": 0.2542612355328423, + "flos": 13989027931200.0, + "grad_norm": 5.0060681499190975, + "language_loss": 0.86328566, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.88506049, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 0.93310547, + "router_z_loss_mlp": 0.15466309, + "step": 4229, + "time_per_iteration": 2.6466689109802246 + }, + { + "auxiliary_loss_clip": 0.01140254, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_clip": 1.04762995, + "balance_loss_mlp": 1.02638268, + "epoch": 0.2543213587855103, + "flos": 28023226279200.0, + "grad_norm": 2.230889450621933, + "language_loss": 0.77381516, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79563844, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.15686035, + "step": 4230, + "time_per_iteration": 2.7146146297454834 + }, + { + "auxiliary_loss_clip": 0.01137913, + "auxiliary_loss_mlp": 0.0103971, + "balance_loss_clip": 1.0470103, + "balance_loss_mlp": 1.02409327, + "epoch": 0.2543814820381783, + "flos": 22272982240320.0, + "grad_norm": 2.455454256465768, + "language_loss": 0.74818617, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.76996237, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.15612793, + "step": 4231, + "time_per_iteration": 2.696460008621216 + }, + { + "auxiliary_loss_clip": 0.01136519, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.04619479, + "balance_loss_mlp": 1.0202806, + "epoch": 0.25444160529084625, + "flos": 19074681247680.0, + "grad_norm": 2.0251347212343456, + "language_loss": 0.74040771, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.76212215, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.1463623, + "step": 4232, + "time_per_iteration": 2.708965539932251 + }, + { + "auxiliary_loss_clip": 0.01139032, + "auxiliary_loss_mlp": 0.01039349, + "balance_loss_clip": 1.04798901, + "balance_loss_mlp": 1.02344096, + "epoch": 0.2545017285435142, + "flos": 30019834656000.0, + "grad_norm": 1.9419509067883582, + "language_loss": 0.86537957, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.8871634, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.15887451, + "step": 4233, + "time_per_iteration": 2.710907220840454 + }, + { + "auxiliary_loss_clip": 0.01128919, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.04234183, + "balance_loss_mlp": 1.02317715, + "epoch": 0.2545618517961822, + "flos": 30113943596640.0, + "grad_norm": 1.5049002871056274, + "language_loss": 0.74814224, + "learning_rate": 3.493918281539737e-06, + "loss": 0.76979506, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.1317749, + "step": 4234, + "time_per_iteration": 2.7236382961273193 + }, + { + "auxiliary_loss_clip": 0.01136749, + "auxiliary_loss_mlp": 0.01038502, + "balance_loss_clip": 1.04353368, + "balance_loss_mlp": 1.02406597, + "epoch": 0.25462197504885015, + "flos": 29181490721280.0, + "grad_norm": 1.5526750231311146, + "language_loss": 0.74818337, + "learning_rate": 3.493659311850379e-06, + "loss": 0.76993585, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.14453125, + "step": 4235, + "time_per_iteration": 2.824867010116577 + }, + { + "auxiliary_loss_clip": 0.01145016, + "auxiliary_loss_mlp": 0.01042403, + "balance_loss_clip": 1.04673243, + "balance_loss_mlp": 1.02571416, + "epoch": 0.2546820983015181, + "flos": 30248482432320.0, + "grad_norm": 2.13559027660286, + "language_loss": 0.64692163, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.66879582, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 0.98193359, + "router_z_loss_mlp": 0.16687012, + "step": 4236, + "time_per_iteration": 2.758321762084961 + }, + { + "auxiliary_loss_clip": 0.01134149, + "auxiliary_loss_mlp": 0.0103518, + "balance_loss_clip": 1.04391873, + "balance_loss_mlp": 1.02183449, + "epoch": 0.2547422215541861, + "flos": 22859265761280.0, + "grad_norm": 1.5351534429745584, + "language_loss": 0.6668098, + "learning_rate": 3.493141202562354e-06, + "loss": 0.68850309, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.13348389, + "step": 4237, + "time_per_iteration": 2.7223622798919678 + }, + { + "auxiliary_loss_clip": 0.01137134, + "auxiliary_loss_mlp": 0.01042802, + "balance_loss_clip": 1.04451811, + "balance_loss_mlp": 1.02825284, + "epoch": 0.25480234480685404, + "flos": 25663876980480.0, + "grad_norm": 2.030601049963998, + "language_loss": 0.74710011, + "learning_rate": 3.492882062983333e-06, + "loss": 0.76889944, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.14562988, + "step": 4238, + "time_per_iteration": 2.745126485824585 + }, + { + "auxiliary_loss_clip": 0.01141333, + "auxiliary_loss_mlp": 0.01041993, + "balance_loss_clip": 1.04875112, + "balance_loss_mlp": 1.02624607, + "epoch": 0.254862468059522, + "flos": 30605672486880.0, + "grad_norm": 1.788383772469148, + "language_loss": 0.80328202, + "learning_rate": 3.492622866794074e-06, + "loss": 0.82511532, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.1574707, + "step": 4239, + "time_per_iteration": 4.243969917297363 + }, + { + "auxiliary_loss_clip": 0.01135794, + "auxiliary_loss_mlp": 0.01036378, + "balance_loss_clip": 1.04695868, + "balance_loss_mlp": 1.02116704, + "epoch": 0.25492259131219, + "flos": 25085372777280.0, + "grad_norm": 1.7128508691904676, + "language_loss": 0.77057868, + "learning_rate": 3.492363614004407e-06, + "loss": 0.7923004, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.15222168, + "step": 4240, + "time_per_iteration": 4.083385705947876 + }, + { + "auxiliary_loss_clip": 0.01142856, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.04622912, + "balance_loss_mlp": 1.01992667, + "epoch": 0.25498271456485794, + "flos": 30557544308640.0, + "grad_norm": 2.1739824435383577, + "language_loss": 0.83245122, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85423613, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 0.96484375, + "router_z_loss_mlp": 0.15722656, + "step": 4241, + "time_per_iteration": 4.191491603851318 + }, + { + "auxiliary_loss_clip": 0.01138043, + "auxiliary_loss_mlp": 0.01040889, + "balance_loss_clip": 1.04564095, + "balance_loss_mlp": 1.02566624, + "epoch": 0.2550428378175259, + "flos": 31943241008640.0, + "grad_norm": 1.5509416513559555, + "language_loss": 0.73474765, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.75653696, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.15222168, + "step": 4242, + "time_per_iteration": 2.7082390785217285 + }, + { + "auxiliary_loss_clip": 0.01139229, + "auxiliary_loss_mlp": 0.01037668, + "balance_loss_clip": 1.04615355, + "balance_loss_mlp": 1.02249217, + "epoch": 0.2551029610701939, + "flos": 18628690016160.0, + "grad_norm": 2.6376045719441104, + "language_loss": 0.72574788, + "learning_rate": 3.491585516131273e-06, + "loss": 0.74751681, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.1517334, + "step": 4243, + "time_per_iteration": 4.091249704360962 + }, + { + "auxiliary_loss_clip": 0.01139275, + "auxiliary_loss_mlp": 0.01040853, + "balance_loss_clip": 1.04667568, + "balance_loss_mlp": 1.02533197, + "epoch": 0.2551630843228619, + "flos": 22100485063680.0, + "grad_norm": 3.017243494061696, + "language_loss": 0.81855893, + "learning_rate": 3.491326037038301e-06, + "loss": 0.84036022, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.15527344, + "step": 4244, + "time_per_iteration": 2.6748838424682617 + }, + { + "auxiliary_loss_clip": 0.01041795, + "auxiliary_loss_mlp": 0.01004269, + "balance_loss_clip": 1.01132727, + "balance_loss_mlp": 1.0026629, + "epoch": 0.25522320757552985, + "flos": 86048060788800.0, + "grad_norm": 0.6899094342494531, + "language_loss": 0.57722831, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.59768891, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 0.30493164, + "router_z_loss_mlp": 0.0160675, + "step": 4245, + "time_per_iteration": 3.3970651626586914 + }, + { + "auxiliary_loss_clip": 0.01137953, + "auxiliary_loss_mlp": 0.01045532, + "balance_loss_clip": 1.0437541, + "balance_loss_mlp": 1.03067875, + "epoch": 0.2552833308281978, + "flos": 27934384584960.0, + "grad_norm": 2.5079826438117117, + "language_loss": 0.65442836, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.67626321, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.14855957, + "step": 4246, + "time_per_iteration": 2.6716055870056152 + }, + { + "auxiliary_loss_clip": 0.01130788, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.04343557, + "balance_loss_mlp": 1.02355039, + "epoch": 0.2553434540808658, + "flos": 26910010702080.0, + "grad_norm": 1.872632306097809, + "language_loss": 0.81327128, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.83495069, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.13592529, + "step": 4247, + "time_per_iteration": 2.679960012435913 + }, + { + "auxiliary_loss_clip": 0.01143624, + "auxiliary_loss_mlp": 0.01040768, + "balance_loss_clip": 1.04575825, + "balance_loss_mlp": 1.02435231, + "epoch": 0.25540357733353375, + "flos": 20187005444640.0, + "grad_norm": 4.242428398713562, + "language_loss": 0.8366254, + "learning_rate": 3.490287555252514e-06, + "loss": 0.85846931, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 0.97705078, + "router_z_loss_mlp": 0.1640625, + "step": 4248, + "time_per_iteration": 2.715606451034546 + }, + { + "auxiliary_loss_clip": 0.01140328, + "auxiliary_loss_mlp": 0.01042604, + "balance_loss_clip": 1.04730105, + "balance_loss_mlp": 1.02709436, + "epoch": 0.2554637005862017, + "flos": 21432774510720.0, + "grad_norm": 2.170897257975659, + "language_loss": 0.83830392, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.86013323, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.1550293, + "step": 4249, + "time_per_iteration": 2.6546518802642822 + }, + { + "auxiliary_loss_clip": 0.01044315, + "auxiliary_loss_mlp": 0.01002521, + "balance_loss_clip": 1.01377344, + "balance_loss_mlp": 1.00089335, + "epoch": 0.2555238238388697, + "flos": 88154978667840.0, + "grad_norm": 0.7498725779119852, + "language_loss": 0.56295055, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58341891, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 0.30541992, + "router_z_loss_mlp": 0.01629639, + "step": 4250, + "time_per_iteration": 3.231719970703125 + }, + { + "auxiliary_loss_clip": 0.0113828, + "auxiliary_loss_mlp": 0.01034971, + "balance_loss_clip": 1.04533982, + "balance_loss_mlp": 1.01930737, + "epoch": 0.25558394709153764, + "flos": 29760278545440.0, + "grad_norm": 1.889962265994187, + "language_loss": 0.80942619, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.83115864, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.15661621, + "step": 4251, + "time_per_iteration": 2.7463321685791016 + }, + { + "auxiliary_loss_clip": 0.01042708, + "auxiliary_loss_mlp": 0.01000629, + "balance_loss_clip": 1.01196337, + "balance_loss_mlp": 0.99897623, + "epoch": 0.2556440703442056, + "flos": 84476253106080.0, + "grad_norm": 0.7990423896798733, + "language_loss": 0.66134763, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.68178099, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 0.30810547, + "router_z_loss_mlp": 0.01655579, + "step": 4252, + "time_per_iteration": 3.318183422088623 + }, + { + "auxiliary_loss_clip": 0.01136682, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.04565871, + "balance_loss_mlp": 1.01871943, + "epoch": 0.2557041935968736, + "flos": 30339268956000.0, + "grad_norm": 2.076675467044785, + "language_loss": 0.7360689, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.75776446, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.14160156, + "step": 4253, + "time_per_iteration": 2.761489152908325 + }, + { + "auxiliary_loss_clip": 0.01139034, + "auxiliary_loss_mlp": 0.01037375, + "balance_loss_clip": 1.04746151, + "balance_loss_mlp": 1.0229094, + "epoch": 0.25576431684954154, + "flos": 27445046214240.0, + "grad_norm": 2.1399578832701436, + "language_loss": 0.73544228, + "learning_rate": 3.488728137415357e-06, + "loss": 0.75720632, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.14453125, + "step": 4254, + "time_per_iteration": 2.7552642822265625 + }, + { + "auxiliary_loss_clip": 0.01139283, + "auxiliary_loss_mlp": 0.01038468, + "balance_loss_clip": 1.0462687, + "balance_loss_mlp": 1.02268434, + "epoch": 0.2558244401022095, + "flos": 24192742037760.0, + "grad_norm": 1.6951726992529774, + "language_loss": 0.81291711, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.83469462, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.15783691, + "step": 4255, + "time_per_iteration": 2.6849615573883057 + }, + { + "auxiliary_loss_clip": 0.01138438, + "auxiliary_loss_mlp": 0.0103802, + "balance_loss_clip": 1.04731917, + "balance_loss_mlp": 1.02269018, + "epoch": 0.2558845633548775, + "flos": 28333989881280.0, + "grad_norm": 1.6928154326515263, + "language_loss": 0.85443866, + "learning_rate": 3.488207879742721e-06, + "loss": 0.8762033, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.15332031, + "step": 4256, + "time_per_iteration": 2.8538801670074463 + }, + { + "auxiliary_loss_clip": 0.01143813, + "auxiliary_loss_mlp": 0.01044195, + "balance_loss_clip": 1.04772115, + "balance_loss_mlp": 1.02797055, + "epoch": 0.2559446866075455, + "flos": 20545208431200.0, + "grad_norm": 1.6452657082662623, + "language_loss": 0.7469151, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.76879513, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 0.96142578, + "router_z_loss_mlp": 0.16235352, + "step": 4257, + "time_per_iteration": 2.6539437770843506 + }, + { + "auxiliary_loss_clip": 0.01043945, + "auxiliary_loss_mlp": 0.01001474, + "balance_loss_clip": 1.01343977, + "balance_loss_mlp": 0.99983668, + "epoch": 0.25600480986021346, + "flos": 70275230000640.0, + "grad_norm": 0.7982315963761981, + "language_loss": 0.65279049, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67324471, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 0.30493164, + "router_z_loss_mlp": 0.0164032, + "step": 4258, + "time_per_iteration": 3.264585018157959 + }, + { + "auxiliary_loss_clip": 0.01137753, + "auxiliary_loss_mlp": 0.01036253, + "balance_loss_clip": 1.04790592, + "balance_loss_mlp": 1.01983833, + "epoch": 0.2560649331128814, + "flos": 33945926977440.0, + "grad_norm": 1.6657854930797855, + "language_loss": 0.76509607, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.78683609, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 0.89794922, + "router_z_loss_mlp": 0.16418457, + "step": 4259, + "time_per_iteration": 2.7958719730377197 + }, + { + "auxiliary_loss_clip": 0.01042308, + "auxiliary_loss_mlp": 0.01003193, + "balance_loss_clip": 1.011868, + "balance_loss_mlp": 1.0015763, + "epoch": 0.2561250563655494, + "flos": 86573858361120.0, + "grad_norm": 0.7980388723294218, + "language_loss": 0.58519304, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60564804, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 0.30444336, + "router_z_loss_mlp": 0.01618195, + "step": 4260, + "time_per_iteration": 3.4262709617614746 + }, + { + "auxiliary_loss_clip": 0.01138486, + "auxiliary_loss_mlp": 0.01043263, + "balance_loss_clip": 1.04524076, + "balance_loss_mlp": 1.02731323, + "epoch": 0.25618517961821735, + "flos": 32961334713120.0, + "grad_norm": 1.8573307922798825, + "language_loss": 0.7685982, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.79041564, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 0.93164062, + "router_z_loss_mlp": 0.15942383, + "step": 4261, + "time_per_iteration": 2.7741684913635254 + }, + { + "auxiliary_loss_clip": 0.01138738, + "auxiliary_loss_mlp": 0.01037082, + "balance_loss_clip": 1.0475471, + "balance_loss_mlp": 1.02308047, + "epoch": 0.2562453028708853, + "flos": 28148851313280.0, + "grad_norm": 1.5370856483173294, + "language_loss": 0.83004797, + "learning_rate": 3.486645752648842e-06, + "loss": 0.85180616, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.13983154, + "step": 4262, + "time_per_iteration": 2.7116873264312744 + }, + { + "auxiliary_loss_clip": 0.01144676, + "auxiliary_loss_mlp": 0.01043662, + "balance_loss_clip": 1.04679394, + "balance_loss_mlp": 1.02797389, + "epoch": 0.2563054261235533, + "flos": 18450277316640.0, + "grad_norm": 3.4108926293725426, + "language_loss": 0.74000162, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.76188499, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 0.97900391, + "router_z_loss_mlp": 0.15686035, + "step": 4263, + "time_per_iteration": 2.688589096069336 + }, + { + "auxiliary_loss_clip": 0.01141265, + "auxiliary_loss_mlp": 0.01049454, + "balance_loss_clip": 1.05037618, + "balance_loss_mlp": 1.03434992, + "epoch": 0.25636554937622125, + "flos": 33989152564800.0, + "grad_norm": 1.9318403592447926, + "language_loss": 0.82633203, + "learning_rate": 3.486124592522163e-06, + "loss": 0.84823918, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.15100098, + "step": 4264, + "time_per_iteration": 2.703906774520874 + }, + { + "auxiliary_loss_clip": 0.01143441, + "auxiliary_loss_mlp": 0.01043061, + "balance_loss_clip": 1.04897022, + "balance_loss_mlp": 1.02768278, + "epoch": 0.2564256726288892, + "flos": 35273042040960.0, + "grad_norm": 1.854122417085423, + "language_loss": 0.74431348, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76617849, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.15374756, + "step": 4265, + "time_per_iteration": 2.7431390285491943 + }, + { + "auxiliary_loss_clip": 0.01138607, + "auxiliary_loss_mlp": 0.01037447, + "balance_loss_clip": 1.04601908, + "balance_loss_mlp": 1.02279019, + "epoch": 0.2564857958815572, + "flos": 22591808781120.0, + "grad_norm": 1.741774382555772, + "language_loss": 0.81645572, + "learning_rate": 3.485603206979513e-06, + "loss": 0.83821625, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.14660645, + "step": 4266, + "time_per_iteration": 2.692113161087036 + }, + { + "auxiliary_loss_clip": 0.0113431, + "auxiliary_loss_mlp": 0.01038919, + "balance_loss_clip": 1.0441277, + "balance_loss_mlp": 1.02422047, + "epoch": 0.25654591913422514, + "flos": 31492063565280.0, + "grad_norm": 1.5978647752669755, + "language_loss": 0.79216266, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81389487, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.14709473, + "step": 4267, + "time_per_iteration": 2.6916322708129883 + }, + { + "auxiliary_loss_clip": 0.01134412, + "auxiliary_loss_mlp": 0.01041585, + "balance_loss_clip": 1.04653001, + "balance_loss_mlp": 1.02694631, + "epoch": 0.2566060423868931, + "flos": 23305256992800.0, + "grad_norm": 1.6585236074719856, + "language_loss": 0.79076248, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.81252241, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.1463623, + "step": 4268, + "time_per_iteration": 2.736217975616455 + }, + { + "auxiliary_loss_clip": 0.01137176, + "auxiliary_loss_mlp": 0.01039661, + "balance_loss_clip": 1.04593325, + "balance_loss_mlp": 1.02514672, + "epoch": 0.25666616563956113, + "flos": 29093054199840.0, + "grad_norm": 1.7000273695953674, + "language_loss": 0.67838264, + "learning_rate": 3.484820706183595e-06, + "loss": 0.70015103, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.14520264, + "step": 4269, + "time_per_iteration": 2.705754280090332 + }, + { + "auxiliary_loss_clip": 0.01141693, + "auxiliary_loss_mlp": 0.01041606, + "balance_loss_clip": 1.04815757, + "balance_loss_mlp": 1.02587032, + "epoch": 0.2567262888922291, + "flos": 17820322518240.0, + "grad_norm": 3.3023456761901286, + "language_loss": 0.79319733, + "learning_rate": 3.484559759962666e-06, + "loss": 0.81503034, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.15710449, + "step": 4270, + "time_per_iteration": 2.68200945854187 + }, + { + "auxiliary_loss_clip": 0.01145261, + "auxiliary_loss_mlp": 0.01036868, + "balance_loss_clip": 1.04781318, + "balance_loss_mlp": 1.01957035, + "epoch": 0.25678641214489706, + "flos": 40174124031360.0, + "grad_norm": 2.1506621888428903, + "language_loss": 0.67883098, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.7006523, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.17297363, + "step": 4271, + "time_per_iteration": 2.820216655731201 + }, + { + "auxiliary_loss_clip": 0.01141854, + "auxiliary_loss_mlp": 0.01040097, + "balance_loss_clip": 1.04684687, + "balance_loss_mlp": 1.02448654, + "epoch": 0.256846535397565, + "flos": 29405681596800.0, + "grad_norm": 1.8581711724789838, + "language_loss": 0.87586504, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.89768451, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 0.95166016, + "router_z_loss_mlp": 0.15606689, + "step": 4272, + "time_per_iteration": 2.789872646331787 + }, + { + "auxiliary_loss_clip": 0.01143348, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.04853702, + "balance_loss_mlp": 1.02457237, + "epoch": 0.256906658650233, + "flos": 24060472169760.0, + "grad_norm": 1.8139167250641246, + "language_loss": 0.81498748, + "learning_rate": 3.483776583571541e-06, + "loss": 0.8368268, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.16003418, + "step": 4273, + "time_per_iteration": 2.744622230529785 + }, + { + "auxiliary_loss_clip": 0.01136036, + "auxiliary_loss_mlp": 0.0103957, + "balance_loss_clip": 1.04735613, + "balance_loss_mlp": 1.02476466, + "epoch": 0.25696678190290095, + "flos": 27974895514560.0, + "grad_norm": 1.6583750842920977, + "language_loss": 0.7724067, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79416275, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.14819336, + "step": 4274, + "time_per_iteration": 2.7337160110473633 + }, + { + "auxiliary_loss_clip": 0.01133914, + "auxiliary_loss_mlp": 0.01033553, + "balance_loss_clip": 1.04367924, + "balance_loss_mlp": 1.01784086, + "epoch": 0.2570269051555689, + "flos": 33321604080960.0, + "grad_norm": 1.6289996503008115, + "language_loss": 0.84082484, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.86249948, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.15722656, + "step": 4275, + "time_per_iteration": 2.789067506790161 + }, + { + "auxiliary_loss_clip": 0.01140212, + "auxiliary_loss_mlp": 0.01035464, + "balance_loss_clip": 1.04669952, + "balance_loss_mlp": 1.01949012, + "epoch": 0.2570870284082369, + "flos": 33633461649600.0, + "grad_norm": 2.1040924870715303, + "language_loss": 0.7860744, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.80783117, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 0.93408203, + "router_z_loss_mlp": 0.15979004, + "step": 4276, + "time_per_iteration": 2.735287666320801 + }, + { + "auxiliary_loss_clip": 0.01138521, + "auxiliary_loss_mlp": 0.0104196, + "balance_loss_clip": 1.04791367, + "balance_loss_mlp": 1.02733874, + "epoch": 0.25714715166090485, + "flos": 35058048588000.0, + "grad_norm": 1.6069857332980593, + "language_loss": 0.79130757, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81311238, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.1461792, + "step": 4277, + "time_per_iteration": 2.7643203735351562 + }, + { + "auxiliary_loss_clip": 0.01139047, + "auxiliary_loss_mlp": 0.01034929, + "balance_loss_clip": 1.04662681, + "balance_loss_mlp": 1.02074313, + "epoch": 0.2572072749135728, + "flos": 24546123468000.0, + "grad_norm": 2.38751010083889, + "language_loss": 0.78478533, + "learning_rate": 3.482470164419295e-06, + "loss": 0.80652505, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.1418457, + "step": 4278, + "time_per_iteration": 2.7509613037109375 + }, + { + "auxiliary_loss_clip": 0.01140414, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.04694271, + "balance_loss_mlp": 1.02129102, + "epoch": 0.2572673981662408, + "flos": 31763329169760.0, + "grad_norm": 1.8091375014257323, + "language_loss": 0.7472831, + "learning_rate": 3.482208711902952e-06, + "loss": 0.76905107, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.15106201, + "step": 4279, + "time_per_iteration": 4.258676052093506 + }, + { + "auxiliary_loss_clip": 0.01137588, + "auxiliary_loss_mlp": 0.01046536, + "balance_loss_clip": 1.04449534, + "balance_loss_mlp": 1.03146815, + "epoch": 0.25732752141890874, + "flos": 19653023381760.0, + "grad_norm": 3.550123209542615, + "language_loss": 0.85300398, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.87484515, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.15075684, + "step": 4280, + "time_per_iteration": 4.13556694984436 + }, + { + "auxiliary_loss_clip": 0.01140655, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.04632854, + "balance_loss_mlp": 1.02146125, + "epoch": 0.2573876446715767, + "flos": 27484868350080.0, + "grad_norm": 27.885640469188814, + "language_loss": 0.79065716, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.81243289, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.15454102, + "step": 4281, + "time_per_iteration": 2.714900255203247 + }, + { + "auxiliary_loss_clip": 0.01139096, + "auxiliary_loss_mlp": 0.01037052, + "balance_loss_clip": 1.04709387, + "balance_loss_mlp": 1.02184713, + "epoch": 0.2574477679242447, + "flos": 29225161998720.0, + "grad_norm": 2.3290551959443904, + "language_loss": 0.87167454, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89343601, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 0.91992188, + "router_z_loss_mlp": 0.15209961, + "step": 4282, + "time_per_iteration": 2.7313640117645264 + }, + { + "auxiliary_loss_clip": 0.01139717, + "auxiliary_loss_mlp": 0.01039749, + "balance_loss_clip": 1.04597044, + "balance_loss_mlp": 1.02496696, + "epoch": 0.2575078911769127, + "flos": 26822181939840.0, + "grad_norm": 1.488719529287462, + "language_loss": 0.70103383, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.72282845, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.14794922, + "step": 4283, + "time_per_iteration": 4.323098421096802 + }, + { + "auxiliary_loss_clip": 0.0113464, + "auxiliary_loss_mlp": 0.01039994, + "balance_loss_clip": 1.04736853, + "balance_loss_mlp": 1.02517653, + "epoch": 0.25756801442958066, + "flos": 26777862385920.0, + "grad_norm": 2.1396884599684753, + "language_loss": 0.80053425, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.82228059, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.14831543, + "step": 4284, + "time_per_iteration": 2.736100435256958 + }, + { + "auxiliary_loss_clip": 0.01138514, + "auxiliary_loss_mlp": 0.01036347, + "balance_loss_clip": 1.04695499, + "balance_loss_mlp": 1.02287614, + "epoch": 0.2576281376822486, + "flos": 43027835843520.0, + "grad_norm": 2.0974279787748826, + "language_loss": 0.7055468, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.7272954, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.13476562, + "step": 4285, + "time_per_iteration": 2.8041040897369385 + }, + { + "auxiliary_loss_clip": 0.01140581, + "auxiliary_loss_mlp": 0.01034202, + "balance_loss_clip": 1.04924393, + "balance_loss_mlp": 1.02041578, + "epoch": 0.2576882609349166, + "flos": 17244168317280.0, + "grad_norm": 2.018335397987333, + "language_loss": 0.58242655, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.60417438, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.13787842, + "step": 4286, + "time_per_iteration": 2.6480016708374023 + }, + { + "auxiliary_loss_clip": 0.01143447, + "auxiliary_loss_mlp": 0.01045062, + "balance_loss_clip": 1.04819107, + "balance_loss_mlp": 1.02979147, + "epoch": 0.25774838418758456, + "flos": 28380051678240.0, + "grad_norm": 1.7509009267447266, + "language_loss": 0.63715208, + "learning_rate": 3.480115069207354e-06, + "loss": 0.65903717, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.15258789, + "step": 4287, + "time_per_iteration": 2.731602191925049 + }, + { + "auxiliary_loss_clip": 0.01142302, + "auxiliary_loss_mlp": 0.01040152, + "balance_loss_clip": 1.04697311, + "balance_loss_mlp": 1.0245353, + "epoch": 0.2578085074402525, + "flos": 27578815221600.0, + "grad_norm": 2.428807502041592, + "language_loss": 0.71023601, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.73206055, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.15612793, + "step": 4288, + "time_per_iteration": 2.698800563812256 + }, + { + "auxiliary_loss_clip": 0.01136773, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.04640496, + "balance_loss_mlp": 1.0195601, + "epoch": 0.2578686306929205, + "flos": 29982889247040.0, + "grad_norm": 1.4928496962439997, + "language_loss": 0.77377796, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79547989, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 0.90380859, + "router_z_loss_mlp": 0.13848877, + "step": 4289, + "time_per_iteration": 2.720360040664673 + }, + { + "auxiliary_loss_clip": 0.01138327, + "auxiliary_loss_mlp": 0.01036585, + "balance_loss_clip": 1.04600954, + "balance_loss_mlp": 1.0215174, + "epoch": 0.25792875394558845, + "flos": 22102470410400.0, + "grad_norm": 2.147778411684397, + "language_loss": 0.85566998, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.87741911, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.1505127, + "step": 4290, + "time_per_iteration": 2.665475845336914 + }, + { + "auxiliary_loss_clip": 0.01144219, + "auxiliary_loss_mlp": 0.01045451, + "balance_loss_clip": 1.05035412, + "balance_loss_mlp": 1.02945352, + "epoch": 0.2579888771982564, + "flos": 21613051005120.0, + "grad_norm": 1.9311479053961256, + "language_loss": 0.72322083, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.74511749, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.15991211, + "step": 4291, + "time_per_iteration": 2.6616218090057373 + }, + { + "auxiliary_loss_clip": 0.01143282, + "auxiliary_loss_mlp": 0.01034117, + "balance_loss_clip": 1.04878497, + "balance_loss_mlp": 1.01907897, + "epoch": 0.2580490004509244, + "flos": 20053601092800.0, + "grad_norm": 2.575005782280905, + "language_loss": 0.80830926, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.83008325, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.15026855, + "step": 4292, + "time_per_iteration": 2.6706883907318115 + }, + { + "auxiliary_loss_clip": 0.01142142, + "auxiliary_loss_mlp": 0.0103958, + "balance_loss_clip": 1.04952049, + "balance_loss_mlp": 1.02393961, + "epoch": 0.25810912370359235, + "flos": 41287055987520.0, + "grad_norm": 1.9956077161734644, + "language_loss": 0.67578924, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.69760644, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.15637207, + "step": 4293, + "time_per_iteration": 2.7978532314300537 + }, + { + "auxiliary_loss_clip": 0.01135728, + "auxiliary_loss_mlp": 0.01042062, + "balance_loss_clip": 1.04754567, + "balance_loss_mlp": 1.02799559, + "epoch": 0.2581692469562603, + "flos": 30739887184320.0, + "grad_norm": 3.480995370237483, + "language_loss": 0.75411898, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77589691, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.140625, + "step": 4294, + "time_per_iteration": 2.7444441318511963 + }, + { + "auxiliary_loss_clip": 0.01140176, + "auxiliary_loss_mlp": 0.01042537, + "balance_loss_clip": 1.04908895, + "balance_loss_mlp": 1.02744484, + "epoch": 0.2582293702089283, + "flos": 42625556406720.0, + "grad_norm": 2.098343065384115, + "language_loss": 0.80962604, + "learning_rate": 3.478017834441318e-06, + "loss": 0.8314532, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.15087891, + "step": 4295, + "time_per_iteration": 2.7724106311798096 + }, + { + "auxiliary_loss_clip": 0.01144505, + "auxiliary_loss_mlp": 0.01042215, + "balance_loss_clip": 1.0494709, + "balance_loss_mlp": 1.02645504, + "epoch": 0.2582894934615963, + "flos": 32743059360480.0, + "grad_norm": 1.7901530372644534, + "language_loss": 0.72684944, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.74871659, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 0.94970703, + "router_z_loss_mlp": 0.15771484, + "step": 4296, + "time_per_iteration": 2.786663293838501 + }, + { + "auxiliary_loss_clip": 0.01144329, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.05169463, + "balance_loss_mlp": 1.02039361, + "epoch": 0.25834961671426426, + "flos": 28693408386240.0, + "grad_norm": 1.8751307075772439, + "language_loss": 0.86906832, + "learning_rate": 3.477492965085067e-06, + "loss": 0.89086854, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.15283203, + "step": 4297, + "time_per_iteration": 2.8078858852386475 + }, + { + "auxiliary_loss_clip": 0.01140833, + "auxiliary_loss_mlp": 0.0104701, + "balance_loss_clip": 1.04862952, + "balance_loss_mlp": 1.0325197, + "epoch": 0.25840973996693223, + "flos": 27395337862080.0, + "grad_norm": 1.9106947756274189, + "language_loss": 0.84814781, + "learning_rate": 3.477230446361943e-06, + "loss": 0.87002623, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.14477539, + "step": 4298, + "time_per_iteration": 2.7361714839935303 + }, + { + "auxiliary_loss_clip": 0.01142304, + "auxiliary_loss_mlp": 0.01033559, + "balance_loss_clip": 1.04950619, + "balance_loss_mlp": 1.01799059, + "epoch": 0.2584698632196002, + "flos": 13776424997760.0, + "grad_norm": 2.6395310141897754, + "language_loss": 0.83722079, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.85897934, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.15551758, + "step": 4299, + "time_per_iteration": 2.8116374015808105 + }, + { + "auxiliary_loss_clip": 0.01139178, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.0496875, + "balance_loss_mlp": 1.01856267, + "epoch": 0.25852998647226816, + "flos": 21878117465760.0, + "grad_norm": 2.5495039153986063, + "language_loss": 0.82829118, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.85000658, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 0.89501953, + "router_z_loss_mlp": 0.13793945, + "step": 4300, + "time_per_iteration": 2.664691686630249 + }, + { + "auxiliary_loss_clip": 0.01141865, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.04786348, + "balance_loss_mlp": 1.01909542, + "epoch": 0.2585901097249361, + "flos": 40580050023360.0, + "grad_norm": 2.2445222656031376, + "language_loss": 0.67364538, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69540316, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.14831543, + "step": 4301, + "time_per_iteration": 2.826435089111328 + }, + { + "auxiliary_loss_clip": 0.01144414, + "auxiliary_loss_mlp": 0.01038452, + "balance_loss_clip": 1.0482626, + "balance_loss_mlp": 1.02216792, + "epoch": 0.2586502329776041, + "flos": 22501994672160.0, + "grad_norm": 2.4377598736494086, + "language_loss": 0.81125009, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.83307874, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.16308594, + "step": 4302, + "time_per_iteration": 2.6813228130340576 + }, + { + "auxiliary_loss_clip": 0.01140689, + "auxiliary_loss_mlp": 0.01038969, + "balance_loss_clip": 1.04804659, + "balance_loss_mlp": 1.02404439, + "epoch": 0.25871035623027205, + "flos": 21924503400960.0, + "grad_norm": 1.9876575787029196, + "language_loss": 0.92259514, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94439173, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.14929199, + "step": 4303, + "time_per_iteration": 2.7119839191436768 + }, + { + "auxiliary_loss_clip": 0.01143456, + "auxiliary_loss_mlp": 0.01036195, + "balance_loss_clip": 1.05105519, + "balance_loss_mlp": 1.02107906, + "epoch": 0.25877047948294, + "flos": 33895408279680.0, + "grad_norm": 1.801986017039165, + "language_loss": 0.67481673, + "learning_rate": 3.475654158020507e-06, + "loss": 0.69661325, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.15106201, + "step": 4304, + "time_per_iteration": 2.7305073738098145 + }, + { + "auxiliary_loss_clip": 0.0114257, + "auxiliary_loss_mlp": 0.01045501, + "balance_loss_clip": 1.04853344, + "balance_loss_mlp": 1.02988434, + "epoch": 0.258830602735608, + "flos": 33099479586720.0, + "grad_norm": 2.842402232025649, + "language_loss": 0.71932, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.74120069, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.15588379, + "step": 4305, + "time_per_iteration": 2.760956048965454 + }, + { + "auxiliary_loss_clip": 0.0114386, + "auxiliary_loss_mlp": 0.01044691, + "balance_loss_clip": 1.0486722, + "balance_loss_mlp": 1.0289315, + "epoch": 0.25889072598827595, + "flos": 21831326357760.0, + "grad_norm": 2.112836900662131, + "language_loss": 0.75419819, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.77608377, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.15759277, + "step": 4306, + "time_per_iteration": 2.716679334640503 + }, + { + "auxiliary_loss_clip": 0.010501, + "auxiliary_loss_mlp": 0.01001652, + "balance_loss_clip": 1.0198431, + "balance_loss_mlp": 0.99987835, + "epoch": 0.2589508492409439, + "flos": 65811387509280.0, + "grad_norm": 0.8454172057669437, + "language_loss": 0.57139647, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59191394, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 0.30249023, + "router_z_loss_mlp": 0.01771545, + "step": 4307, + "time_per_iteration": 3.21419095993042 + }, + { + "auxiliary_loss_clip": 0.01140135, + "auxiliary_loss_mlp": 0.01033605, + "balance_loss_clip": 1.04915547, + "balance_loss_mlp": 1.01885867, + "epoch": 0.2590109724936119, + "flos": 26998163602560.0, + "grad_norm": 1.7237413303841589, + "language_loss": 0.71635151, + "learning_rate": 3.474602179854327e-06, + "loss": 0.73808897, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.14758301, + "step": 4308, + "time_per_iteration": 2.733635663986206 + }, + { + "auxiliary_loss_clip": 0.01144951, + "auxiliary_loss_mlp": 0.01041135, + "balance_loss_clip": 1.05034256, + "balance_loss_mlp": 1.02563739, + "epoch": 0.2590710957462799, + "flos": 16441351686720.0, + "grad_norm": 2.440227191326238, + "language_loss": 0.83973992, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.86160076, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.15496826, + "step": 4309, + "time_per_iteration": 2.7077438831329346 + }, + { + "auxiliary_loss_clip": 0.01144, + "auxiliary_loss_mlp": 0.01040902, + "balance_loss_clip": 1.05298078, + "balance_loss_mlp": 1.02685332, + "epoch": 0.25913121899894787, + "flos": 27219518268480.0, + "grad_norm": 1.5604345658824186, + "language_loss": 0.84629679, + "learning_rate": 3.474075855228966e-06, + "loss": 0.86814582, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.14050293, + "step": 4310, + "time_per_iteration": 2.7693533897399902 + }, + { + "auxiliary_loss_clip": 0.01147846, + "auxiliary_loss_mlp": 0.0103926, + "balance_loss_clip": 1.05350816, + "balance_loss_mlp": 1.0244298, + "epoch": 0.25919134225161583, + "flos": 31495953224160.0, + "grad_norm": 2.0828738445916484, + "language_loss": 0.77351707, + "learning_rate": 3.473812609065639e-06, + "loss": 0.7953881, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.14837646, + "step": 4311, + "time_per_iteration": 2.7395007610321045 + }, + { + "auxiliary_loss_clip": 0.01142914, + "auxiliary_loss_mlp": 0.01038861, + "balance_loss_clip": 1.04959106, + "balance_loss_mlp": 1.02398396, + "epoch": 0.2592514655042838, + "flos": 38086040337120.0, + "grad_norm": 2.091144186232533, + "language_loss": 0.72189033, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.74370807, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.14880371, + "step": 4312, + "time_per_iteration": 2.821950674057007 + }, + { + "auxiliary_loss_clip": 0.01142258, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.04961658, + "balance_loss_mlp": 1.02121937, + "epoch": 0.25931158875695176, + "flos": 22543396981920.0, + "grad_norm": 1.7981649806854862, + "language_loss": 0.69637203, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.71815556, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.14880371, + "step": 4313, + "time_per_iteration": 2.648192882537842 + }, + { + "auxiliary_loss_clip": 0.01142464, + "auxiliary_loss_mlp": 0.01038382, + "balance_loss_clip": 1.0508498, + "balance_loss_mlp": 1.02447045, + "epoch": 0.2593717120096197, + "flos": 23437486343520.0, + "grad_norm": 2.21290821164394, + "language_loss": 0.80579638, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82760483, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.13928223, + "step": 4314, + "time_per_iteration": 2.731823444366455 + }, + { + "auxiliary_loss_clip": 0.01144214, + "auxiliary_loss_mlp": 0.0104411, + "balance_loss_clip": 1.04951024, + "balance_loss_mlp": 1.0279572, + "epoch": 0.2594318352622877, + "flos": 38129833166400.0, + "grad_norm": 2.0367185337132967, + "language_loss": 0.67197937, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69386262, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.16149902, + "step": 4315, + "time_per_iteration": 2.8053019046783447 + }, + { + "auxiliary_loss_clip": 0.01139959, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.04867792, + "balance_loss_mlp": 1.02152133, + "epoch": 0.25949195851495566, + "flos": 27133877439360.0, + "grad_norm": 1.5343181681873748, + "language_loss": 0.79339451, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.81514955, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.14025879, + "step": 4316, + "time_per_iteration": 2.7625958919525146 + }, + { + "auxiliary_loss_clip": 0.01140113, + "auxiliary_loss_mlp": 0.01038902, + "balance_loss_clip": 1.04648876, + "balance_loss_mlp": 1.02345288, + "epoch": 0.2595520817676236, + "flos": 34258473339840.0, + "grad_norm": 1.854110057480375, + "language_loss": 0.77939045, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.80118072, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.15441895, + "step": 4317, + "time_per_iteration": 2.7630715370178223 + }, + { + "auxiliary_loss_clip": 0.01144353, + "auxiliary_loss_mlp": 0.01049511, + "balance_loss_clip": 1.05165267, + "balance_loss_mlp": 1.03383517, + "epoch": 0.2596122050202916, + "flos": 24637234129920.0, + "grad_norm": 2.529703438026356, + "language_loss": 0.78321731, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.80515599, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.15686035, + "step": 4318, + "time_per_iteration": 5.584358215332031 + }, + { + "auxiliary_loss_clip": 0.01138899, + "auxiliary_loss_mlp": 0.01037321, + "balance_loss_clip": 1.04854226, + "balance_loss_mlp": 1.02138233, + "epoch": 0.25967232827295955, + "flos": 27488839043520.0, + "grad_norm": 1.7000977027065207, + "language_loss": 0.76466107, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78642333, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.15942383, + "step": 4319, + "time_per_iteration": 2.7259905338287354 + }, + { + "auxiliary_loss_clip": 0.01138517, + "auxiliary_loss_mlp": 0.01038694, + "balance_loss_clip": 1.04867494, + "balance_loss_mlp": 1.02403116, + "epoch": 0.2597324515256275, + "flos": 25707629292480.0, + "grad_norm": 2.767745010482956, + "language_loss": 0.76558518, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.78735727, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 0.89794922, + "router_z_loss_mlp": 0.14666748, + "step": 4320, + "time_per_iteration": 4.267667055130005 + }, + { + "auxiliary_loss_clip": 0.01141382, + "auxiliary_loss_mlp": 0.01039597, + "balance_loss_clip": 1.0489291, + "balance_loss_mlp": 1.02430284, + "epoch": 0.2597925747782955, + "flos": 26905958974080.0, + "grad_norm": 1.6233420652497754, + "language_loss": 0.71052897, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73233879, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 0.92480469, + "router_z_loss_mlp": 0.1529541, + "step": 4321, + "time_per_iteration": 2.7355926036834717 + }, + { + "auxiliary_loss_clip": 0.01142396, + "auxiliary_loss_mlp": 0.01040362, + "balance_loss_clip": 1.04868209, + "balance_loss_mlp": 1.02488315, + "epoch": 0.2598526980309635, + "flos": 23838914917440.0, + "grad_norm": 1.9749012001880366, + "language_loss": 0.74675679, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.76858443, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.15472412, + "step": 4322, + "time_per_iteration": 4.207697153091431 + }, + { + "auxiliary_loss_clip": 0.01140638, + "auxiliary_loss_mlp": 0.01039312, + "balance_loss_clip": 1.04808736, + "balance_loss_mlp": 1.02407134, + "epoch": 0.25991282128363147, + "flos": 29889712203840.0, + "grad_norm": 4.194048015492246, + "language_loss": 0.73526317, + "learning_rate": 3.470649298767278e-06, + "loss": 0.75706261, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.15240479, + "step": 4323, + "time_per_iteration": 2.775909423828125 + }, + { + "auxiliary_loss_clip": 0.01146897, + "auxiliary_loss_mlp": 0.0103739, + "balance_loss_clip": 1.04893696, + "balance_loss_mlp": 1.02159476, + "epoch": 0.25997294453629943, + "flos": 29531428182720.0, + "grad_norm": 1.9974388755946504, + "language_loss": 0.66664559, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.68848848, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 0.97949219, + "router_z_loss_mlp": 0.15777588, + "step": 4324, + "time_per_iteration": 2.757286310195923 + }, + { + "auxiliary_loss_clip": 0.01140157, + "auxiliary_loss_mlp": 0.0103727, + "balance_loss_clip": 1.04894948, + "balance_loss_mlp": 1.02340567, + "epoch": 0.2600330677889674, + "flos": 38353497317280.0, + "grad_norm": 2.3168923688979044, + "language_loss": 0.71312243, + "learning_rate": 3.470121299177082e-06, + "loss": 0.73489666, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.1385498, + "step": 4325, + "time_per_iteration": 2.7272632122039795 + }, + { + "auxiliary_loss_clip": 0.01139438, + "auxiliary_loss_mlp": 0.01034043, + "balance_loss_clip": 1.04639113, + "balance_loss_mlp": 1.01837897, + "epoch": 0.26009319104163536, + "flos": 39372887574720.0, + "grad_norm": 1.978514440248308, + "language_loss": 0.72848511, + "learning_rate": 3.469857215756257e-06, + "loss": 0.75021994, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.15649414, + "step": 4326, + "time_per_iteration": 2.810245990753174 + }, + { + "auxiliary_loss_clip": 0.01134107, + "auxiliary_loss_mlp": 0.01035692, + "balance_loss_clip": 1.04699874, + "balance_loss_mlp": 1.02147615, + "epoch": 0.26015331429430333, + "flos": 32075551393920.0, + "grad_norm": 1.803155410212022, + "language_loss": 0.87024611, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.89194411, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.14227295, + "step": 4327, + "time_per_iteration": 2.7092437744140625 + }, + { + "auxiliary_loss_clip": 0.0114494, + "auxiliary_loss_mlp": 0.01047079, + "balance_loss_clip": 1.05094779, + "balance_loss_mlp": 1.03049731, + "epoch": 0.2602134375469713, + "flos": 25797929608800.0, + "grad_norm": 1.5988870287126202, + "language_loss": 0.80489194, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.82681215, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.16577148, + "step": 4328, + "time_per_iteration": 2.754281997680664 + }, + { + "auxiliary_loss_clip": 0.01135729, + "auxiliary_loss_mlp": 0.01036147, + "balance_loss_clip": 1.04575014, + "balance_loss_mlp": 1.02145481, + "epoch": 0.26027356079963926, + "flos": 31629681714240.0, + "grad_norm": 1.5794809513052264, + "language_loss": 0.87542015, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.89713889, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.14678955, + "step": 4329, + "time_per_iteration": 2.7184879779815674 + }, + { + "auxiliary_loss_clip": 0.01138597, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.04894114, + "balance_loss_mlp": 1.02335203, + "epoch": 0.2603336840523072, + "flos": 32164028432640.0, + "grad_norm": 1.9154351922747814, + "language_loss": 0.78041816, + "learning_rate": 3.468800324801802e-06, + "loss": 0.8021825, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.14483643, + "step": 4330, + "time_per_iteration": 2.7281994819641113 + }, + { + "auxiliary_loss_clip": 0.01143353, + "auxiliary_loss_mlp": 0.01048025, + "balance_loss_clip": 1.0495553, + "balance_loss_mlp": 1.03285003, + "epoch": 0.2603938073049752, + "flos": 28691625625920.0, + "grad_norm": 1.519171103744975, + "language_loss": 0.75482309, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.77673692, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.15155029, + "step": 4331, + "time_per_iteration": 2.6982572078704834 + }, + { + "auxiliary_loss_clip": 0.01140462, + "auxiliary_loss_mlp": 0.01042528, + "balance_loss_clip": 1.05054498, + "balance_loss_mlp": 1.02828848, + "epoch": 0.26045393055764315, + "flos": 30962133230400.0, + "grad_norm": 1.4418932505664206, + "language_loss": 0.69206095, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.71389091, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.14239502, + "step": 4332, + "time_per_iteration": 2.7706069946289062 + }, + { + "auxiliary_loss_clip": 0.01143663, + "auxiliary_loss_mlp": 0.01042908, + "balance_loss_clip": 1.04969978, + "balance_loss_mlp": 1.02768481, + "epoch": 0.2605140538103111, + "flos": 33721249894560.0, + "grad_norm": 2.0628112795495417, + "language_loss": 0.79641402, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.81827968, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.15246582, + "step": 4333, + "time_per_iteration": 2.744445323944092 + }, + { + "auxiliary_loss_clip": 0.01136011, + "auxiliary_loss_mlp": 0.01043229, + "balance_loss_clip": 1.04769409, + "balance_loss_mlp": 1.02909064, + "epoch": 0.2605741770629791, + "flos": 16801661571840.0, + "grad_norm": 1.6705150798957524, + "language_loss": 0.80822867, + "learning_rate": 3.467742542694501e-06, + "loss": 0.83002108, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.14135742, + "step": 4334, + "time_per_iteration": 2.6741247177124023 + }, + { + "auxiliary_loss_clip": 0.01140491, + "auxiliary_loss_mlp": 0.01041578, + "balance_loss_clip": 1.04956245, + "balance_loss_mlp": 1.02670062, + "epoch": 0.26063430031564705, + "flos": 31764261067200.0, + "grad_norm": 1.8948643183924259, + "language_loss": 0.79916137, + "learning_rate": 3.46747795800024e-06, + "loss": 0.8209821, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.14868164, + "step": 4335, + "time_per_iteration": 2.837334156036377 + }, + { + "auxiliary_loss_clip": 0.01052041, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.02115083, + "balance_loss_mlp": 1.02995098, + "epoch": 0.26069442356831507, + "flos": 76193838453600.0, + "grad_norm": 0.857835241592866, + "language_loss": 0.60838163, + "learning_rate": 3.467213317659068e-06, + "loss": 0.62921613, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 0.30932617, + "router_z_loss_mlp": 0.01457977, + "step": 4336, + "time_per_iteration": 3.267587423324585 + }, + { + "auxiliary_loss_clip": 0.01139939, + "auxiliary_loss_mlp": 0.01047894, + "balance_loss_clip": 1.04830158, + "balance_loss_mlp": 1.03293347, + "epoch": 0.26075454682098304, + "flos": 16626368702880.0, + "grad_norm": 2.053610413076621, + "language_loss": 0.76846933, + "learning_rate": 3.46694862168102e-06, + "loss": 0.7903477, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.1496582, + "step": 4337, + "time_per_iteration": 2.6976044178009033 + }, + { + "auxiliary_loss_clip": 0.01140921, + "auxiliary_loss_mlp": 0.01043307, + "balance_loss_clip": 1.04797196, + "balance_loss_mlp": 1.02735734, + "epoch": 0.260814670073651, + "flos": 14796504048960.0, + "grad_norm": 2.0845839682341607, + "language_loss": 0.74371636, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.7655586, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.15948486, + "step": 4338, + "time_per_iteration": 2.6867733001708984 + }, + { + "auxiliary_loss_clip": 0.01144174, + "auxiliary_loss_mlp": 0.01039021, + "balance_loss_clip": 1.04981732, + "balance_loss_mlp": 1.02369046, + "epoch": 0.26087479332631897, + "flos": 18452060076960.0, + "grad_norm": 2.3810050285030617, + "language_loss": 0.80302203, + "learning_rate": 3.466419062854447e-06, + "loss": 0.82485396, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.15332031, + "step": 4339, + "time_per_iteration": 2.694573163986206 + }, + { + "auxiliary_loss_clip": 0.01138865, + "auxiliary_loss_mlp": 0.01038474, + "balance_loss_clip": 1.04823446, + "balance_loss_mlp": 1.02412105, + "epoch": 0.26093491657898693, + "flos": 30116050495200.0, + "grad_norm": 1.7775090742410065, + "language_loss": 0.76676732, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.78854066, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.14343262, + "step": 4340, + "time_per_iteration": 2.678332805633545 + }, + { + "auxiliary_loss_clip": 0.01141578, + "auxiliary_loss_mlp": 0.0103735, + "balance_loss_clip": 1.04893732, + "balance_loss_mlp": 1.02320027, + "epoch": 0.2609950398316549, + "flos": 30647358417600.0, + "grad_norm": 1.5311638141959472, + "language_loss": 0.82573599, + "learning_rate": 3.465889281600845e-06, + "loss": 0.84752536, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.14147949, + "step": 4341, + "time_per_iteration": 2.822995185852051 + }, + { + "auxiliary_loss_clip": 0.0113903, + "auxiliary_loss_mlp": 0.01040002, + "balance_loss_clip": 1.04755628, + "balance_loss_mlp": 1.02449322, + "epoch": 0.26105516308432286, + "flos": 34835640472800.0, + "grad_norm": 1.8090954441182483, + "language_loss": 0.76547539, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.78726578, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.15515137, + "step": 4342, + "time_per_iteration": 2.7933971881866455 + }, + { + "auxiliary_loss_clip": 0.01137498, + "auxiliary_loss_mlp": 0.01032429, + "balance_loss_clip": 1.04562831, + "balance_loss_mlp": 1.0169791, + "epoch": 0.2611152863369908, + "flos": 48236764191840.0, + "grad_norm": 2.3512739764962016, + "language_loss": 0.65941578, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.68111503, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 0.91992188, + "router_z_loss_mlp": 0.15454102, + "step": 4343, + "time_per_iteration": 2.8719089031219482 + }, + { + "auxiliary_loss_clip": 0.01140707, + "auxiliary_loss_mlp": 0.0104223, + "balance_loss_clip": 1.04694009, + "balance_loss_mlp": 1.02660155, + "epoch": 0.2611754095896588, + "flos": 16759367881920.0, + "grad_norm": 2.127294587759326, + "language_loss": 0.73622513, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75805444, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.15625, + "step": 4344, + "time_per_iteration": 2.7402937412261963 + }, + { + "auxiliary_loss_clip": 0.01141867, + "auxiliary_loss_mlp": 0.01037356, + "balance_loss_clip": 1.04957557, + "balance_loss_mlp": 1.0225203, + "epoch": 0.26123553284232676, + "flos": 26242543252800.0, + "grad_norm": 2.103734953365562, + "language_loss": 0.86438471, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.88617688, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.14837646, + "step": 4345, + "time_per_iteration": 2.751372814178467 + }, + { + "auxiliary_loss_clip": 0.01135384, + "auxiliary_loss_mlp": 0.01037113, + "balance_loss_clip": 1.04657364, + "balance_loss_mlp": 1.02197957, + "epoch": 0.2612956560949947, + "flos": 25795052881920.0, + "grad_norm": 1.9190594801688714, + "language_loss": 0.76195145, + "learning_rate": 3.464563855876015e-06, + "loss": 0.78367639, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.15142822, + "step": 4346, + "time_per_iteration": 2.743720293045044 + }, + { + "auxiliary_loss_clip": 0.01138892, + "auxiliary_loss_mlp": 0.01036727, + "balance_loss_clip": 1.04654086, + "balance_loss_mlp": 1.02196872, + "epoch": 0.2613557793476627, + "flos": 30650599800000.0, + "grad_norm": 2.555821737160623, + "language_loss": 0.75772363, + "learning_rate": 3.464298604081606e-06, + "loss": 0.7794798, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 0.92333984, + "router_z_loss_mlp": 0.14752197, + "step": 4347, + "time_per_iteration": 2.7308359146118164 + }, + { + "auxiliary_loss_clip": 0.01137822, + "auxiliary_loss_mlp": 0.01032426, + "balance_loss_clip": 1.04704869, + "balance_loss_mlp": 1.01735187, + "epoch": 0.26141590260033065, + "flos": 31808256482880.0, + "grad_norm": 1.7339022797353407, + "language_loss": 0.73244226, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.75414467, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 0.90673828, + "router_z_loss_mlp": 0.1506958, + "step": 4348, + "time_per_iteration": 2.7441344261169434 + }, + { + "auxiliary_loss_clip": 0.01141088, + "auxiliary_loss_mlp": 0.01042735, + "balance_loss_clip": 1.0478977, + "balance_loss_mlp": 1.02739894, + "epoch": 0.2614760258529987, + "flos": 31361535940320.0, + "grad_norm": 1.827675292762506, + "language_loss": 0.90725696, + "learning_rate": 3.463767933923799e-06, + "loss": 0.92909527, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.15338135, + "step": 4349, + "time_per_iteration": 2.699199676513672 + }, + { + "auxiliary_loss_clip": 0.0113875, + "auxiliary_loss_mlp": 0.01042568, + "balance_loss_clip": 1.04939187, + "balance_loss_mlp": 1.0284065, + "epoch": 0.26153614910566664, + "flos": 21300990850080.0, + "grad_norm": 2.0533344521486208, + "language_loss": 0.805511, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82732415, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.1418457, + "step": 4350, + "time_per_iteration": 2.677314519882202 + }, + { + "auxiliary_loss_clip": 0.01134599, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.04663324, + "balance_loss_mlp": 1.02344298, + "epoch": 0.2615962723583346, + "flos": 21612969970560.0, + "grad_norm": 2.073182290881889, + "language_loss": 0.62129897, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.64302826, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.14874268, + "step": 4351, + "time_per_iteration": 2.7430994510650635 + }, + { + "auxiliary_loss_clip": 0.01139513, + "auxiliary_loss_mlp": 0.01036066, + "balance_loss_clip": 1.04665399, + "balance_loss_mlp": 1.02065229, + "epoch": 0.26165639561100257, + "flos": 28379362884480.0, + "grad_norm": 1.8130778325502606, + "language_loss": 0.83622497, + "learning_rate": 3.462971512415555e-06, + "loss": 0.85798073, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.1541748, + "step": 4352, + "time_per_iteration": 2.695058822631836 + }, + { + "auxiliary_loss_clip": 0.01052163, + "auxiliary_loss_mlp": 0.01014155, + "balance_loss_clip": 1.02110958, + "balance_loss_mlp": 1.01259077, + "epoch": 0.26171651886367053, + "flos": 81434931171840.0, + "grad_norm": 0.8052843983895522, + "language_loss": 0.70556664, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72622985, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 0.01565552, + "step": 4353, + "time_per_iteration": 3.164608955383301 + }, + { + "auxiliary_loss_clip": 0.01138707, + "auxiliary_loss_mlp": 0.01044505, + "balance_loss_clip": 1.04821503, + "balance_loss_mlp": 1.02849567, + "epoch": 0.2617766421163385, + "flos": 27270806794560.0, + "grad_norm": 7.128500515584396, + "language_loss": 0.77706873, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.79890084, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.16015625, + "step": 4354, + "time_per_iteration": 2.739865779876709 + }, + { + "auxiliary_loss_clip": 0.01144914, + "auxiliary_loss_mlp": 0.01044675, + "balance_loss_clip": 1.04928088, + "balance_loss_mlp": 1.02977395, + "epoch": 0.26183676536900646, + "flos": 31807608206400.0, + "grad_norm": 1.9740644485104548, + "language_loss": 0.67644554, + "learning_rate": 3.462174591623085e-06, + "loss": 0.69834149, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.14910889, + "step": 4355, + "time_per_iteration": 2.7243030071258545 + }, + { + "auxiliary_loss_clip": 0.01140952, + "auxiliary_loss_mlp": 0.0103802, + "balance_loss_clip": 1.04929638, + "balance_loss_mlp": 1.02210009, + "epoch": 0.26189688862167443, + "flos": 25619922082080.0, + "grad_norm": 1.9778250489252687, + "language_loss": 0.67563999, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69742972, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.15924072, + "step": 4356, + "time_per_iteration": 2.694678544998169 + }, + { + "auxiliary_loss_clip": 0.01053046, + "auxiliary_loss_mlp": 0.01001606, + "balance_loss_clip": 1.02190614, + "balance_loss_mlp": 1.00009465, + "epoch": 0.2619570118743424, + "flos": 80287403808960.0, + "grad_norm": 0.6898856829324833, + "language_loss": 0.53143966, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55198622, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 0.31103516, + "router_z_loss_mlp": 0.01509094, + "step": 4357, + "time_per_iteration": 6.174126863479614 + }, + { + "auxiliary_loss_clip": 0.01144843, + "auxiliary_loss_mlp": 0.01046052, + "balance_loss_clip": 1.04912615, + "balance_loss_mlp": 1.0312407, + "epoch": 0.26201713512701036, + "flos": 35102003486400.0, + "grad_norm": 2.005480898159129, + "language_loss": 0.84091419, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86282313, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 0.95703125, + "router_z_loss_mlp": 0.14801025, + "step": 4358, + "time_per_iteration": 2.839905023574829 + }, + { + "auxiliary_loss_clip": 0.01149066, + "auxiliary_loss_mlp": 0.01045997, + "balance_loss_clip": 1.05059123, + "balance_loss_mlp": 1.02921188, + "epoch": 0.2620772583796783, + "flos": 32253437368800.0, + "grad_norm": 2.3644369898341178, + "language_loss": 0.66852462, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.69047534, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 0.98388672, + "router_z_loss_mlp": 0.16796875, + "step": 4359, + "time_per_iteration": 4.207553148269653 + }, + { + "auxiliary_loss_clip": 0.01141865, + "auxiliary_loss_mlp": 0.01051169, + "balance_loss_clip": 1.04799294, + "balance_loss_mlp": 1.03632736, + "epoch": 0.2621373816323463, + "flos": 24595061991840.0, + "grad_norm": 2.1192952644397565, + "language_loss": 0.78356189, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80549216, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.14831543, + "step": 4360, + "time_per_iteration": 2.7061331272125244 + }, + { + "auxiliary_loss_clip": 0.0113791, + "auxiliary_loss_mlp": 0.01046502, + "balance_loss_clip": 1.04691744, + "balance_loss_mlp": 1.03276896, + "epoch": 0.26219750488501425, + "flos": 34924320097920.0, + "grad_norm": 2.3341592960859043, + "language_loss": 0.68183589, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70367992, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.1373291, + "step": 4361, + "time_per_iteration": 2.7432405948638916 + }, + { + "auxiliary_loss_clip": 0.01143617, + "auxiliary_loss_mlp": 0.01061954, + "balance_loss_clip": 1.04887295, + "balance_loss_mlp": 1.04667091, + "epoch": 0.2622576281376823, + "flos": 18355114926720.0, + "grad_norm": 1.776802278590264, + "language_loss": 0.83939946, + "learning_rate": 3.46031316964119e-06, + "loss": 0.8614552, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 0.94677734, + "router_z_loss_mlp": 0.152771, + "step": 4362, + "time_per_iteration": 4.121882200241089 + }, + { + "auxiliary_loss_clip": 0.01143252, + "auxiliary_loss_mlp": 0.01051217, + "balance_loss_clip": 1.05149674, + "balance_loss_mlp": 1.03551722, + "epoch": 0.26231775139035024, + "flos": 32209077297600.0, + "grad_norm": 1.7872932976597793, + "language_loss": 0.65466011, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.67660475, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.15698242, + "step": 4363, + "time_per_iteration": 2.7035717964172363 + }, + { + "auxiliary_loss_clip": 0.0105668, + "auxiliary_loss_mlp": 0.01032256, + "balance_loss_clip": 1.0251075, + "balance_loss_mlp": 1.0307281, + "epoch": 0.2623778746430182, + "flos": 79811193036960.0, + "grad_norm": 0.8977676094629388, + "language_loss": 0.61119354, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63208288, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 0.31567383, + "router_z_loss_mlp": 0.01526642, + "step": 4364, + "time_per_iteration": 3.386323928833008 + }, + { + "auxiliary_loss_clip": 0.01146892, + "auxiliary_loss_mlp": 0.01050176, + "balance_loss_clip": 1.05195808, + "balance_loss_mlp": 1.03399968, + "epoch": 0.26243799789568617, + "flos": 15379100497440.0, + "grad_norm": 2.495814496990789, + "language_loss": 0.71961939, + "learning_rate": 3.459514586533184e-06, + "loss": 0.74159008, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.16162109, + "step": 4365, + "time_per_iteration": 2.657160997390747 + }, + { + "auxiliary_loss_clip": 0.01144749, + "auxiliary_loss_mlp": 0.01046816, + "balance_loss_clip": 1.05219507, + "balance_loss_mlp": 1.03268397, + "epoch": 0.26249812114835414, + "flos": 34927237342080.0, + "grad_norm": 1.9897093062664437, + "language_loss": 0.77558678, + "learning_rate": 3.459248281460509e-06, + "loss": 0.79750246, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.14117432, + "step": 4366, + "time_per_iteration": 2.7314717769622803 + }, + { + "auxiliary_loss_clip": 0.0114636, + "auxiliary_loss_mlp": 0.01046458, + "balance_loss_clip": 1.05337739, + "balance_loss_mlp": 1.03185511, + "epoch": 0.2625582444010221, + "flos": 17650215861120.0, + "grad_norm": 2.765110220802305, + "language_loss": 0.75864875, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.78057694, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.14611816, + "step": 4367, + "time_per_iteration": 2.6796493530273438 + }, + { + "auxiliary_loss_clip": 0.01143244, + "auxiliary_loss_mlp": 0.01045162, + "balance_loss_clip": 1.05215788, + "balance_loss_mlp": 1.03092885, + "epoch": 0.26261836765369007, + "flos": 20271552307200.0, + "grad_norm": 1.606675004162574, + "language_loss": 0.69673848, + "learning_rate": 3.458715505320736e-06, + "loss": 0.71862257, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.14233398, + "step": 4368, + "time_per_iteration": 2.742415189743042 + }, + { + "auxiliary_loss_clip": 0.011413, + "auxiliary_loss_mlp": 0.01043009, + "balance_loss_clip": 1.05025935, + "balance_loss_mlp": 1.02749395, + "epoch": 0.26267849090635803, + "flos": 25038622186560.0, + "grad_norm": 1.7602128775510941, + "language_loss": 0.78862786, + "learning_rate": 3.458449034273841e-06, + "loss": 0.81047094, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.1550293, + "step": 4369, + "time_per_iteration": 2.6954739093780518 + }, + { + "auxiliary_loss_clip": 0.01143249, + "auxiliary_loss_mlp": 0.01038783, + "balance_loss_clip": 1.05175245, + "balance_loss_mlp": 1.02442384, + "epoch": 0.262738614159026, + "flos": 26019608412960.0, + "grad_norm": 1.9795711516036902, + "language_loss": 0.83542717, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.85724747, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.14355469, + "step": 4370, + "time_per_iteration": 2.6893904209136963 + }, + { + "auxiliary_loss_clip": 0.01148173, + "auxiliary_loss_mlp": 0.01052236, + "balance_loss_clip": 1.05244684, + "balance_loss_mlp": 1.03384209, + "epoch": 0.26279873741169396, + "flos": 21479646653280.0, + "grad_norm": 2.2187135840536776, + "language_loss": 0.71390533, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73590946, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.18395996, + "step": 4371, + "time_per_iteration": 2.6882126331329346 + }, + { + "auxiliary_loss_clip": 0.010552, + "auxiliary_loss_mlp": 0.00998947, + "balance_loss_clip": 1.02406812, + "balance_loss_mlp": 0.99727762, + "epoch": 0.2628588606643619, + "flos": 74370497081760.0, + "grad_norm": 0.690706309919989, + "language_loss": 0.56419241, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58473384, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 0.31103516, + "router_z_loss_mlp": 0.01672363, + "step": 4372, + "time_per_iteration": 3.447277784347534 + }, + { + "auxiliary_loss_clip": 0.01141179, + "auxiliary_loss_mlp": 0.01035408, + "balance_loss_clip": 1.05089486, + "balance_loss_mlp": 1.02074528, + "epoch": 0.2629189839170299, + "flos": 32965832131200.0, + "grad_norm": 1.6505629994994306, + "language_loss": 0.77783495, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.79960084, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.14672852, + "step": 4373, + "time_per_iteration": 2.742600440979004 + }, + { + "auxiliary_loss_clip": 0.01139162, + "auxiliary_loss_mlp": 0.01035229, + "balance_loss_clip": 1.04914129, + "balance_loss_mlp": 1.02085257, + "epoch": 0.26297910716969786, + "flos": 20767332925440.0, + "grad_norm": 2.5419090691459947, + "language_loss": 0.71269822, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.73444211, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.14373779, + "step": 4374, + "time_per_iteration": 2.6607582569122314 + }, + { + "auxiliary_loss_clip": 0.01141738, + "auxiliary_loss_mlp": 0.0103803, + "balance_loss_clip": 1.05061984, + "balance_loss_mlp": 1.02273512, + "epoch": 0.2630392304223659, + "flos": 30380387644800.0, + "grad_norm": 1.8224661780896487, + "language_loss": 0.81091344, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83271116, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.15283203, + "step": 4375, + "time_per_iteration": 2.690706968307495 + }, + { + "auxiliary_loss_clip": 0.01139826, + "auxiliary_loss_mlp": 0.01036567, + "balance_loss_clip": 1.04928493, + "balance_loss_mlp": 1.02240539, + "epoch": 0.26309935367503384, + "flos": 40089455616960.0, + "grad_norm": 2.1583144422410694, + "language_loss": 0.65945232, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.68121624, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.1416626, + "step": 4376, + "time_per_iteration": 2.8039560317993164 + }, + { + "auxiliary_loss_clip": 0.01141769, + "auxiliary_loss_mlp": 0.01041056, + "balance_loss_clip": 1.04908514, + "balance_loss_mlp": 1.02645278, + "epoch": 0.2631594769277018, + "flos": 19386174160800.0, + "grad_norm": 2.0537033024838602, + "language_loss": 0.69019949, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71202773, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.14599609, + "step": 4377, + "time_per_iteration": 2.7062630653381348 + }, + { + "auxiliary_loss_clip": 0.01140695, + "auxiliary_loss_mlp": 0.01037937, + "balance_loss_clip": 1.04898047, + "balance_loss_mlp": 1.02320278, + "epoch": 0.2632196001803698, + "flos": 61994551240800.0, + "grad_norm": 1.7808159274468942, + "language_loss": 0.78992724, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.81171358, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.14746094, + "step": 4378, + "time_per_iteration": 2.9106247425079346 + }, + { + "auxiliary_loss_clip": 0.01141221, + "auxiliary_loss_mlp": 0.01042544, + "balance_loss_clip": 1.05073822, + "balance_loss_mlp": 1.02879357, + "epoch": 0.26327972343303774, + "flos": 16756774776000.0, + "grad_norm": 2.0261199618261814, + "language_loss": 0.76665163, + "learning_rate": 3.455781283723846e-06, + "loss": 0.78848934, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.13763428, + "step": 4379, + "time_per_iteration": 2.65075945854187 + }, + { + "auxiliary_loss_clip": 0.01145754, + "auxiliary_loss_mlp": 0.01041711, + "balance_loss_clip": 1.0510273, + "balance_loss_mlp": 1.02536738, + "epoch": 0.2633398466857057, + "flos": 29003280608160.0, + "grad_norm": 2.280288043706954, + "language_loss": 0.78083563, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.80271029, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 0.94677734, + "router_z_loss_mlp": 0.16357422, + "step": 4380, + "time_per_iteration": 2.7548606395721436 + }, + { + "auxiliary_loss_clip": 0.01142351, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.04804265, + "balance_loss_mlp": 1.02036309, + "epoch": 0.26339996993837367, + "flos": 33678348445440.0, + "grad_norm": 2.5393157249419116, + "language_loss": 0.63831115, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.66009206, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.15368652, + "step": 4381, + "time_per_iteration": 2.692948341369629 + }, + { + "auxiliary_loss_clip": 0.0114045, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.04832804, + "balance_loss_mlp": 1.02012396, + "epoch": 0.26346009319104163, + "flos": 20677640368320.0, + "grad_norm": 1.8460670950922027, + "language_loss": 0.82663006, + "learning_rate": 3.454979881632595e-06, + "loss": 0.84837854, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.1427002, + "step": 4382, + "time_per_iteration": 2.653306007385254 + }, + { + "auxiliary_loss_clip": 0.01145115, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.04859543, + "balance_loss_mlp": 1.02781618, + "epoch": 0.2635202164437096, + "flos": 45432071938080.0, + "grad_norm": 2.0596878698946783, + "language_loss": 0.70037085, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.72226417, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.1640625, + "step": 4383, + "time_per_iteration": 2.82993745803833 + }, + { + "auxiliary_loss_clip": 0.01144354, + "auxiliary_loss_mlp": 0.01041658, + "balance_loss_clip": 1.05122757, + "balance_loss_mlp": 1.02806175, + "epoch": 0.26358033969637756, + "flos": 25619800530240.0, + "grad_norm": 1.7239986933610048, + "language_loss": 0.69595236, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71781242, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.13586426, + "step": 4384, + "time_per_iteration": 2.692131757736206 + }, + { + "auxiliary_loss_clip": 0.01141631, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.0507164, + "balance_loss_mlp": 1.01903605, + "epoch": 0.26364046294904553, + "flos": 33856355972160.0, + "grad_norm": 2.5803383931524424, + "language_loss": 0.70610964, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72786117, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.1449585, + "step": 4385, + "time_per_iteration": 2.751405715942383 + }, + { + "auxiliary_loss_clip": 0.01145023, + "auxiliary_loss_mlp": 0.01042187, + "balance_loss_clip": 1.0538106, + "balance_loss_mlp": 1.02730954, + "epoch": 0.2637005862017135, + "flos": 27934384584960.0, + "grad_norm": 2.016962047666797, + "language_loss": 0.85633743, + "learning_rate": 3.453910573136482e-06, + "loss": 0.87820959, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.14892578, + "step": 4386, + "time_per_iteration": 2.7236788272857666 + }, + { + "auxiliary_loss_clip": 0.01145879, + "auxiliary_loss_mlp": 0.010381, + "balance_loss_clip": 1.05317187, + "balance_loss_mlp": 1.02391422, + "epoch": 0.26376070945438146, + "flos": 18362205450720.0, + "grad_norm": 2.7154030012733466, + "language_loss": 0.76763296, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.78947276, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.14196777, + "step": 4387, + "time_per_iteration": 2.6627776622772217 + }, + { + "auxiliary_loss_clip": 0.01144969, + "auxiliary_loss_mlp": 0.01041635, + "balance_loss_clip": 1.05451119, + "balance_loss_mlp": 1.02758598, + "epoch": 0.2638208327070494, + "flos": 25797362366880.0, + "grad_norm": 2.500476399852604, + "language_loss": 0.75620329, + "learning_rate": 3.453375588053264e-06, + "loss": 0.77806938, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.14044189, + "step": 4388, + "time_per_iteration": 2.6555330753326416 + }, + { + "auxiliary_loss_clip": 0.01142811, + "auxiliary_loss_mlp": 0.01036434, + "balance_loss_clip": 1.05022979, + "balance_loss_mlp": 1.02147365, + "epoch": 0.26388095595971744, + "flos": 26509716612000.0, + "grad_norm": 2.1080160097577307, + "language_loss": 0.86222446, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88401687, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.14953613, + "step": 4389, + "time_per_iteration": 2.6815125942230225 + }, + { + "auxiliary_loss_clip": 0.01054266, + "auxiliary_loss_mlp": 0.01010869, + "balance_loss_clip": 1.02353358, + "balance_loss_mlp": 1.00928962, + "epoch": 0.2639410792123854, + "flos": 79941558592800.0, + "grad_norm": 0.8055208104510488, + "language_loss": 0.60229051, + "learning_rate": 3.452840382521457e-06, + "loss": 0.62294185, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 0.30688477, + "router_z_loss_mlp": 0.01580048, + "step": 4390, + "time_per_iteration": 3.282696008682251 + }, + { + "auxiliary_loss_clip": 0.01144646, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.04996681, + "balance_loss_mlp": 1.0210669, + "epoch": 0.2640012024650534, + "flos": 29222609410080.0, + "grad_norm": 1.6503757783325448, + "language_loss": 0.77368969, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79550433, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.15740967, + "step": 4391, + "time_per_iteration": 2.6820194721221924 + }, + { + "auxiliary_loss_clip": 0.01053714, + "auxiliary_loss_mlp": 0.01008378, + "balance_loss_clip": 1.02280354, + "balance_loss_mlp": 1.00676727, + "epoch": 0.26406132571772134, + "flos": 68867336181600.0, + "grad_norm": 0.8548824657349318, + "language_loss": 0.58742905, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60804999, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 0.30908203, + "router_z_loss_mlp": 0.01612091, + "step": 4392, + "time_per_iteration": 3.2143869400024414 + }, + { + "auxiliary_loss_clip": 0.01146918, + "auxiliary_loss_mlp": 0.01045828, + "balance_loss_clip": 1.05282712, + "balance_loss_mlp": 1.03061724, + "epoch": 0.2641214489703893, + "flos": 26955059567040.0, + "grad_norm": 1.7458954189112594, + "language_loss": 0.68653917, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.70846659, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.15216064, + "step": 4393, + "time_per_iteration": 2.717827796936035 + }, + { + "auxiliary_loss_clip": 0.01146483, + "auxiliary_loss_mlp": 0.01045289, + "balance_loss_clip": 1.05003834, + "balance_loss_mlp": 1.02954173, + "epoch": 0.26418157222305727, + "flos": 20187653721120.0, + "grad_norm": 2.205109586443339, + "language_loss": 0.84016359, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.86208129, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.1574707, + "step": 4394, + "time_per_iteration": 2.676798105239868 + }, + { + "auxiliary_loss_clip": 0.01152455, + "auxiliary_loss_mlp": 0.01041898, + "balance_loss_clip": 1.05406725, + "balance_loss_mlp": 1.0250535, + "epoch": 0.26424169547572524, + "flos": 21966635021760.0, + "grad_norm": 2.280311127994769, + "language_loss": 0.70218563, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.72412914, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 0.98339844, + "router_z_loss_mlp": 0.16833496, + "step": 4395, + "time_per_iteration": 2.67104434967041 + }, + { + "auxiliary_loss_clip": 0.01143959, + "auxiliary_loss_mlp": 0.01035095, + "balance_loss_clip": 1.05213714, + "balance_loss_mlp": 1.0198127, + "epoch": 0.2643018187283932, + "flos": 20722567681440.0, + "grad_norm": 1.7823098841121783, + "language_loss": 0.86758369, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.88937426, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.15283203, + "step": 4396, + "time_per_iteration": 2.781430959701538 + }, + { + "auxiliary_loss_clip": 0.01053821, + "auxiliary_loss_mlp": 0.00999417, + "balance_loss_clip": 1.02272367, + "balance_loss_mlp": 0.99787718, + "epoch": 0.26436194198106117, + "flos": 72802741127040.0, + "grad_norm": 0.7887931484371723, + "language_loss": 0.55076444, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57129681, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 0.31103516, + "router_z_loss_mlp": 0.01540375, + "step": 4397, + "time_per_iteration": 4.494189023971558 + }, + { + "auxiliary_loss_clip": 0.01143694, + "auxiliary_loss_mlp": 0.01050661, + "balance_loss_clip": 1.05165792, + "balance_loss_mlp": 1.03549743, + "epoch": 0.26442206523372913, + "flos": 40172098167360.0, + "grad_norm": 2.067217224491935, + "language_loss": 0.77906573, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80100924, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 0.91992188, + "router_z_loss_mlp": 0.15155029, + "step": 4398, + "time_per_iteration": 2.771089553833008 + }, + { + "auxiliary_loss_clip": 0.01150504, + "auxiliary_loss_mlp": 0.01036828, + "balance_loss_clip": 1.0558579, + "balance_loss_mlp": 1.02096152, + "epoch": 0.2644821884863971, + "flos": 25662580427520.0, + "grad_norm": 1.7481933896993982, + "language_loss": 0.67162395, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.6934973, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.15869141, + "step": 4399, + "time_per_iteration": 4.1792919635772705 + }, + { + "auxiliary_loss_clip": 0.01140263, + "auxiliary_loss_mlp": 0.01039474, + "balance_loss_clip": 1.05243587, + "balance_loss_mlp": 1.02539563, + "epoch": 0.26454231173906506, + "flos": 25351938377280.0, + "grad_norm": 1.628253333890484, + "language_loss": 0.86296129, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88475865, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 0.87792969, + "router_z_loss_mlp": 0.14086914, + "step": 4400, + "time_per_iteration": 2.710552453994751 + }, + { + "auxiliary_loss_clip": 0.01146938, + "auxiliary_loss_mlp": 0.01036502, + "balance_loss_clip": 1.05259573, + "balance_loss_mlp": 1.0207305, + "epoch": 0.264602434991733, + "flos": 20277751451040.0, + "grad_norm": 1.9117916015855116, + "language_loss": 0.75554854, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.77738297, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 0.94335938, + "router_z_loss_mlp": 0.15771484, + "step": 4401, + "time_per_iteration": 4.129625082015991 + }, + { + "auxiliary_loss_clip": 0.01148406, + "auxiliary_loss_mlp": 0.01048833, + "balance_loss_clip": 1.05235434, + "balance_loss_mlp": 1.03262019, + "epoch": 0.26466255824440105, + "flos": 23260167610560.0, + "grad_norm": 1.6936363996367665, + "language_loss": 0.88291353, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90488589, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.16235352, + "step": 4402, + "time_per_iteration": 2.674074411392212 + }, + { + "auxiliary_loss_clip": 0.0114546, + "auxiliary_loss_mlp": 0.01040846, + "balance_loss_clip": 1.05183756, + "balance_loss_mlp": 1.02514637, + "epoch": 0.264722681497069, + "flos": 27623702017440.0, + "grad_norm": 1.5337634823244146, + "language_loss": 0.78264916, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.80451226, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.15673828, + "step": 4403, + "time_per_iteration": 2.705122709274292 + }, + { + "auxiliary_loss_clip": 0.01145525, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.05072951, + "balance_loss_mlp": 1.01946831, + "epoch": 0.264782804749737, + "flos": 27445208283360.0, + "grad_norm": 1.7992197089746809, + "language_loss": 0.88135028, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.90314949, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.14929199, + "step": 4404, + "time_per_iteration": 2.7798147201538086 + }, + { + "auxiliary_loss_clip": 0.01144541, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.04984617, + "balance_loss_mlp": 1.02233303, + "epoch": 0.26484292800240494, + "flos": 20499430255200.0, + "grad_norm": 1.7482427226110218, + "language_loss": 0.76011372, + "learning_rate": 3.448819322433709e-06, + "loss": 0.7819289, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.14654541, + "step": 4405, + "time_per_iteration": 2.6860194206237793 + }, + { + "auxiliary_loss_clip": 0.01148418, + "auxiliary_loss_mlp": 0.01037644, + "balance_loss_clip": 1.05463743, + "balance_loss_mlp": 1.02208734, + "epoch": 0.2649030512550729, + "flos": 24952535667360.0, + "grad_norm": 1.841830824610239, + "language_loss": 0.70142525, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72328585, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.15563965, + "step": 4406, + "time_per_iteration": 2.7005553245544434 + }, + { + "auxiliary_loss_clip": 0.01144083, + "auxiliary_loss_mlp": 0.01043421, + "balance_loss_clip": 1.05262184, + "balance_loss_mlp": 1.02894926, + "epoch": 0.2649631745077409, + "flos": 27352598482080.0, + "grad_norm": 1.6118251633848175, + "language_loss": 0.83786261, + "learning_rate": 3.448282246369912e-06, + "loss": 0.85973763, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.14477539, + "step": 4407, + "time_per_iteration": 2.7186405658721924 + }, + { + "auxiliary_loss_clip": 0.01145025, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.05221248, + "balance_loss_mlp": 1.01837993, + "epoch": 0.26502329776040884, + "flos": 42849828316800.0, + "grad_norm": 1.800482057873689, + "language_loss": 0.75963521, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.78142267, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.15356445, + "step": 4408, + "time_per_iteration": 2.8120486736297607 + }, + { + "auxiliary_loss_clip": 0.01142637, + "auxiliary_loss_mlp": 0.01033603, + "balance_loss_clip": 1.05168605, + "balance_loss_mlp": 1.01869011, + "epoch": 0.2650834210130768, + "flos": 47208298063680.0, + "grad_norm": 1.9323882211166385, + "language_loss": 0.70702404, + "learning_rate": 3.447744950630084e-06, + "loss": 0.72878647, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.14910889, + "step": 4409, + "time_per_iteration": 2.818617105484009 + }, + { + "auxiliary_loss_clip": 0.01146425, + "auxiliary_loss_mlp": 0.01038121, + "balance_loss_clip": 1.05190396, + "balance_loss_mlp": 1.02227759, + "epoch": 0.26514354426574477, + "flos": 30159924359040.0, + "grad_norm": 1.6567562758843009, + "language_loss": 0.7346617, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.75650716, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.15844727, + "step": 4410, + "time_per_iteration": 2.6858770847320557 + }, + { + "auxiliary_loss_clip": 0.01151268, + "auxiliary_loss_mlp": 0.01046906, + "balance_loss_clip": 1.05475926, + "balance_loss_mlp": 1.03121829, + "epoch": 0.26520366751841273, + "flos": 24818888211840.0, + "grad_norm": 1.9489746660825438, + "language_loss": 0.73271894, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.75470072, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 0.96386719, + "router_z_loss_mlp": 0.15686035, + "step": 4411, + "time_per_iteration": 2.699808120727539 + }, + { + "auxiliary_loss_clip": 0.0114769, + "auxiliary_loss_mlp": 0.01037583, + "balance_loss_clip": 1.05470979, + "balance_loss_mlp": 1.02284288, + "epoch": 0.2652637907710807, + "flos": 27264364547040.0, + "grad_norm": 1.9644827834999747, + "language_loss": 0.82500637, + "learning_rate": 3.446938595306071e-06, + "loss": 0.8468591, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.14758301, + "step": 4412, + "time_per_iteration": 2.6976499557495117 + }, + { + "auxiliary_loss_clip": 0.01147443, + "auxiliary_loss_mlp": 0.01050514, + "balance_loss_clip": 1.05421984, + "balance_loss_mlp": 1.03539181, + "epoch": 0.26532391402374866, + "flos": 23615980077600.0, + "grad_norm": 1.970826482170617, + "language_loss": 0.73827863, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.7602582, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.15130615, + "step": 4413, + "time_per_iteration": 2.68373966217041 + }, + { + "auxiliary_loss_clip": 0.01059145, + "auxiliary_loss_mlp": 0.01000061, + "balance_loss_clip": 1.02868056, + "balance_loss_mlp": 0.99842775, + "epoch": 0.26538403727641663, + "flos": 54648120817440.0, + "grad_norm": 0.8768740475987223, + "language_loss": 0.56895179, + "learning_rate": 3.446400750732793e-06, + "loss": 0.58954382, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.01635742, + "step": 4414, + "time_per_iteration": 3.270845890045166 + }, + { + "auxiliary_loss_clip": 0.01143529, + "auxiliary_loss_mlp": 0.01041595, + "balance_loss_clip": 1.05390501, + "balance_loss_mlp": 1.02746868, + "epoch": 0.26544416052908465, + "flos": 34388595792000.0, + "grad_norm": 2.05388517036879, + "language_loss": 0.74093568, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.76278692, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.14135742, + "step": 4415, + "time_per_iteration": 2.7482380867004395 + }, + { + "auxiliary_loss_clip": 0.01149697, + "auxiliary_loss_mlp": 0.01041751, + "balance_loss_clip": 1.05314589, + "balance_loss_mlp": 1.02476406, + "epoch": 0.2655042837817526, + "flos": 21433058131680.0, + "grad_norm": 2.6226705845088345, + "language_loss": 0.86453253, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.88644707, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 0.96337891, + "router_z_loss_mlp": 0.16967773, + "step": 4416, + "time_per_iteration": 2.744633913040161 + }, + { + "auxiliary_loss_clip": 0.01147572, + "auxiliary_loss_mlp": 0.01040466, + "balance_loss_clip": 1.05324817, + "balance_loss_mlp": 1.02439666, + "epoch": 0.2655644070344206, + "flos": 28558545412320.0, + "grad_norm": 1.6251887008108656, + "language_loss": 0.7582379, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78011835, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.16064453, + "step": 4417, + "time_per_iteration": 2.681412935256958 + }, + { + "auxiliary_loss_clip": 0.011435, + "auxiliary_loss_mlp": 0.01039998, + "balance_loss_clip": 1.05260754, + "balance_loss_mlp": 1.02326107, + "epoch": 0.26562453028708854, + "flos": 32297351749920.0, + "grad_norm": 1.5304969769988546, + "language_loss": 0.79836154, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.82019651, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.16748047, + "step": 4418, + "time_per_iteration": 2.733137369155884 + }, + { + "auxiliary_loss_clip": 0.01147691, + "auxiliary_loss_mlp": 0.0105054, + "balance_loss_clip": 1.05377924, + "balance_loss_mlp": 1.03539991, + "epoch": 0.2656846535397565, + "flos": 23437081170720.0, + "grad_norm": 2.1899418852565034, + "language_loss": 0.6672529, + "learning_rate": 3.445055179644071e-06, + "loss": 0.68923515, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.15136719, + "step": 4419, + "time_per_iteration": 2.668992757797241 + }, + { + "auxiliary_loss_clip": 0.01147942, + "auxiliary_loss_mlp": 0.01042409, + "balance_loss_clip": 1.05378819, + "balance_loss_mlp": 1.02581513, + "epoch": 0.2657447767924245, + "flos": 37279901289600.0, + "grad_norm": 2.4075011722418784, + "language_loss": 0.7906726, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81257606, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 0.94189453, + "router_z_loss_mlp": 0.16589355, + "step": 4420, + "time_per_iteration": 2.773806571960449 + }, + { + "auxiliary_loss_clip": 0.01151729, + "auxiliary_loss_mlp": 0.01047946, + "balance_loss_clip": 1.05441046, + "balance_loss_mlp": 1.03033304, + "epoch": 0.26580490004509244, + "flos": 25530026938560.0, + "grad_norm": 1.9897296164694884, + "language_loss": 0.81426543, + "learning_rate": 3.444516567560673e-06, + "loss": 0.83626217, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 0.97412109, + "router_z_loss_mlp": 0.17608643, + "step": 4421, + "time_per_iteration": 2.690554141998291 + }, + { + "auxiliary_loss_clip": 0.01143591, + "auxiliary_loss_mlp": 0.01038376, + "balance_loss_clip": 1.05282629, + "balance_loss_mlp": 1.0230819, + "epoch": 0.2658650232977604, + "flos": 53624348343360.0, + "grad_norm": 1.5433692220815618, + "language_loss": 0.65273619, + "learning_rate": 3.444247179349548e-06, + "loss": 0.6745559, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.15307617, + "step": 4422, + "time_per_iteration": 2.873310089111328 + }, + { + "auxiliary_loss_clip": 0.01148441, + "auxiliary_loss_mlp": 0.01041804, + "balance_loss_clip": 1.0534904, + "balance_loss_mlp": 1.02705133, + "epoch": 0.26592514655042837, + "flos": 36260713618560.0, + "grad_norm": 3.037753036999763, + "language_loss": 0.74764085, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76954329, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.14758301, + "step": 4423, + "time_per_iteration": 2.7359912395477295 + }, + { + "auxiliary_loss_clip": 0.01145624, + "auxiliary_loss_mlp": 0.01044409, + "balance_loss_clip": 1.05104876, + "balance_loss_mlp": 1.02975821, + "epoch": 0.26598526980309634, + "flos": 56958606276480.0, + "grad_norm": 2.020230245596906, + "language_loss": 0.77815312, + "learning_rate": 3.443708238639522e-06, + "loss": 0.80005348, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.14654541, + "step": 4424, + "time_per_iteration": 2.8679134845733643 + }, + { + "auxiliary_loss_clip": 0.01147001, + "auxiliary_loss_mlp": 0.01047405, + "balance_loss_clip": 1.05324507, + "balance_loss_mlp": 1.03283191, + "epoch": 0.2660453930557643, + "flos": 14043193184160.0, + "grad_norm": 2.3696339658347463, + "language_loss": 0.79416412, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.81610817, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.14569092, + "step": 4425, + "time_per_iteration": 2.675227642059326 + }, + { + "auxiliary_loss_clip": 0.01143582, + "auxiliary_loss_mlp": 0.01040178, + "balance_loss_clip": 1.0533433, + "balance_loss_mlp": 1.02581382, + "epoch": 0.26610551630843227, + "flos": 30250953986400.0, + "grad_norm": 1.6558782942906176, + "language_loss": 0.80382997, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.82566756, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.14361572, + "step": 4426, + "time_per_iteration": 2.6995902061462402 + }, + { + "auxiliary_loss_clip": 0.01147524, + "auxiliary_loss_mlp": 0.01044549, + "balance_loss_clip": 1.05459237, + "balance_loss_mlp": 1.02935028, + "epoch": 0.26616563956110023, + "flos": 33722505930240.0, + "grad_norm": 1.936055739217352, + "language_loss": 0.77466249, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79658324, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.15197754, + "step": 4427, + "time_per_iteration": 2.7330493927001953 + }, + { + "auxiliary_loss_clip": 0.01141371, + "auxiliary_loss_mlp": 0.01031508, + "balance_loss_clip": 1.05177927, + "balance_loss_mlp": 1.01722026, + "epoch": 0.26622576281376825, + "flos": 34609342698720.0, + "grad_norm": 1.735387195843516, + "language_loss": 0.76921165, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.7909404, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.14276123, + "step": 4428, + "time_per_iteration": 2.715632915496826 + }, + { + "auxiliary_loss_clip": 0.0114659, + "auxiliary_loss_mlp": 0.01033725, + "balance_loss_clip": 1.05223215, + "balance_loss_mlp": 1.01977766, + "epoch": 0.2662858860664362, + "flos": 22013831302560.0, + "grad_norm": 1.9397221115099932, + "language_loss": 0.83097631, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.85277951, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.13952637, + "step": 4429, + "time_per_iteration": 2.748159646987915 + }, + { + "auxiliary_loss_clip": 0.01143516, + "auxiliary_loss_mlp": 0.01036592, + "balance_loss_clip": 1.05174017, + "balance_loss_mlp": 1.02113616, + "epoch": 0.2663460093191042, + "flos": 27754796884320.0, + "grad_norm": 1.6904690145648147, + "language_loss": 0.71648127, + "learning_rate": 3.442090102943143e-06, + "loss": 0.73828232, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.15460205, + "step": 4430, + "time_per_iteration": 2.689971923828125 + }, + { + "auxiliary_loss_clip": 0.01146755, + "auxiliary_loss_mlp": 0.01045364, + "balance_loss_clip": 1.05355167, + "balance_loss_mlp": 1.02949715, + "epoch": 0.26640613257177215, + "flos": 20143698822720.0, + "grad_norm": 1.9929678223723248, + "language_loss": 0.82173657, + "learning_rate": 3.441820222206035e-06, + "loss": 0.84365773, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.15881348, + "step": 4431, + "time_per_iteration": 2.678086519241333 + }, + { + "auxiliary_loss_clip": 0.01153917, + "auxiliary_loss_mlp": 0.01046506, + "balance_loss_clip": 1.05499268, + "balance_loss_mlp": 1.03034091, + "epoch": 0.2664662558244401, + "flos": 28157481493920.0, + "grad_norm": 2.5097520358043637, + "language_loss": 0.76280296, + "learning_rate": 3.44155028679496e-06, + "loss": 0.78480715, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 0.98876953, + "router_z_loss_mlp": 0.16162109, + "step": 4432, + "time_per_iteration": 2.677870988845825 + }, + { + "auxiliary_loss_clip": 0.01148155, + "auxiliary_loss_mlp": 0.01037971, + "balance_loss_clip": 1.05336058, + "balance_loss_mlp": 1.02210438, + "epoch": 0.2665263790771081, + "flos": 29003523711840.0, + "grad_norm": 2.2294269467209027, + "language_loss": 0.82824415, + "learning_rate": 3.441280296720154e-06, + "loss": 0.8501054, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.15869141, + "step": 4433, + "time_per_iteration": 2.692917823791504 + }, + { + "auxiliary_loss_clip": 0.01145499, + "auxiliary_loss_mlp": 0.01040944, + "balance_loss_clip": 1.053406, + "balance_loss_mlp": 1.02460074, + "epoch": 0.26658650232977604, + "flos": 34167767850720.0, + "grad_norm": 2.631288250371135, + "language_loss": 0.76716733, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78903186, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 0.92041016, + "router_z_loss_mlp": 0.16333008, + "step": 4434, + "time_per_iteration": 2.7802722454071045 + }, + { + "auxiliary_loss_clip": 0.01144872, + "auxiliary_loss_mlp": 0.01042197, + "balance_loss_clip": 1.05282688, + "balance_loss_mlp": 1.02741492, + "epoch": 0.266646625582444, + "flos": 27044225399520.0, + "grad_norm": 1.9696012101248825, + "language_loss": 0.82504785, + "learning_rate": 3.440740152620301e-06, + "loss": 0.84691858, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.14794922, + "step": 4435, + "time_per_iteration": 2.685058832168579 + }, + { + "auxiliary_loss_clip": 0.01149632, + "auxiliary_loss_mlp": 0.01052592, + "balance_loss_clip": 1.05254841, + "balance_loss_mlp": 1.0363791, + "epoch": 0.266706748835112, + "flos": 33987653425440.0, + "grad_norm": 1.9817020719290537, + "language_loss": 0.87646002, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.89848226, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 0.97070312, + "router_z_loss_mlp": 0.16210938, + "step": 4436, + "time_per_iteration": 5.584449529647827 + }, + { + "auxiliary_loss_clip": 0.01150409, + "auxiliary_loss_mlp": 0.01044028, + "balance_loss_clip": 1.05456626, + "balance_loss_mlp": 1.02826822, + "epoch": 0.26676687208777994, + "flos": 31496398914240.0, + "grad_norm": 2.0287090630296216, + "language_loss": 0.78899634, + "learning_rate": 3.440199789988407e-06, + "loss": 0.81094062, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.15759277, + "step": 4437, + "time_per_iteration": 2.7303826808929443 + }, + { + "auxiliary_loss_clip": 0.0114695, + "auxiliary_loss_mlp": 0.01039646, + "balance_loss_clip": 1.05347133, + "balance_loss_mlp": 1.02456605, + "epoch": 0.2668269953404479, + "flos": 44007241896000.0, + "grad_norm": 3.0365654381170732, + "language_loss": 0.6382544, + "learning_rate": 3.439929526748556e-06, + "loss": 0.66012037, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.1506958, + "step": 4438, + "time_per_iteration": 2.772907018661499 + }, + { + "auxiliary_loss_clip": 0.01148128, + "auxiliary_loss_mlp": 0.01037035, + "balance_loss_clip": 1.05379748, + "balance_loss_mlp": 1.022205, + "epoch": 0.26688711859311587, + "flos": 32423098335840.0, + "grad_norm": 2.3736813409404127, + "language_loss": 0.75682545, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.77867711, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.1484375, + "step": 4439, + "time_per_iteration": 4.114533185958862 + }, + { + "auxiliary_loss_clip": 0.01150366, + "auxiliary_loss_mlp": 0.01035003, + "balance_loss_clip": 1.05383933, + "balance_loss_mlp": 1.01818216, + "epoch": 0.26694724184578383, + "flos": 32654906460000.0, + "grad_norm": 3.9073634930835826, + "language_loss": 0.71776032, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.73961401, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 0.96582031, + "router_z_loss_mlp": 0.16821289, + "step": 4440, + "time_per_iteration": 2.789236307144165 + }, + { + "auxiliary_loss_clip": 0.01146594, + "auxiliary_loss_mlp": 0.01041439, + "balance_loss_clip": 1.05110276, + "balance_loss_mlp": 1.0250001, + "epoch": 0.2670073650984518, + "flos": 25574954251680.0, + "grad_norm": 3.1957089742021405, + "language_loss": 0.66521966, + "learning_rate": 3.439118409456376e-06, + "loss": 0.68709993, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.16430664, + "step": 4441, + "time_per_iteration": 4.178623199462891 + }, + { + "auxiliary_loss_clip": 0.01148053, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.05379367, + "balance_loss_mlp": 1.02079797, + "epoch": 0.2670674883511198, + "flos": 34612908219360.0, + "grad_norm": 1.5557463110259337, + "language_loss": 0.76199019, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78384, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.16125488, + "step": 4442, + "time_per_iteration": 2.7540345191955566 + }, + { + "auxiliary_loss_clip": 0.01058427, + "auxiliary_loss_mlp": 0.01011022, + "balance_loss_clip": 1.02686787, + "balance_loss_mlp": 1.00920844, + "epoch": 0.2671276116037878, + "flos": 71957023047360.0, + "grad_norm": 0.927815365352977, + "language_loss": 0.61194068, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63263518, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 0.31591797, + "router_z_loss_mlp": 0.01812744, + "step": 4443, + "time_per_iteration": 3.230229616165161 + }, + { + "auxiliary_loss_clip": 0.01149151, + "auxiliary_loss_mlp": 0.01036968, + "balance_loss_clip": 1.05330527, + "balance_loss_mlp": 1.02197695, + "epoch": 0.26718773485645575, + "flos": 53623659549600.0, + "grad_norm": 1.4648545106056876, + "language_loss": 0.76244032, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78430152, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.14984131, + "step": 4444, + "time_per_iteration": 2.8811964988708496 + }, + { + "auxiliary_loss_clip": 0.01147415, + "auxiliary_loss_mlp": 0.0103589, + "balance_loss_clip": 1.05256045, + "balance_loss_mlp": 1.0196532, + "epoch": 0.2672478581091237, + "flos": 30784004151840.0, + "grad_norm": 2.1002224342417293, + "language_loss": 0.80761552, + "learning_rate": 3.438036155780158e-06, + "loss": 0.82944858, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 0.94824219, + "router_z_loss_mlp": 0.16241455, + "step": 4445, + "time_per_iteration": 2.7485804557800293 + }, + { + "auxiliary_loss_clip": 0.01147359, + "auxiliary_loss_mlp": 0.01039588, + "balance_loss_clip": 1.05102873, + "balance_loss_mlp": 1.02316129, + "epoch": 0.2673079813617917, + "flos": 18629824500000.0, + "grad_norm": 1.9077809829004126, + "language_loss": 0.88991737, + "learning_rate": 3.43776545600926e-06, + "loss": 0.91178679, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.16430664, + "step": 4446, + "time_per_iteration": 2.710273027420044 + }, + { + "auxiliary_loss_clip": 0.01147914, + "auxiliary_loss_mlp": 0.0103942, + "balance_loss_clip": 1.05315685, + "balance_loss_mlp": 1.02438164, + "epoch": 0.26736810461445965, + "flos": 31497087708000.0, + "grad_norm": 1.7349367277386287, + "language_loss": 0.67750859, + "learning_rate": 3.437494701718153e-06, + "loss": 0.69938195, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.15045166, + "step": 4447, + "time_per_iteration": 2.7429141998291016 + }, + { + "auxiliary_loss_clip": 0.01149032, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.05298471, + "balance_loss_mlp": 1.01844692, + "epoch": 0.2674282278671276, + "flos": 29667020467680.0, + "grad_norm": 2.15826592949817, + "language_loss": 0.82978445, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85161459, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 0.9609375, + "router_z_loss_mlp": 0.15539551, + "step": 4448, + "time_per_iteration": 2.740795612335205 + }, + { + "auxiliary_loss_clip": 0.01144734, + "auxiliary_loss_mlp": 0.01042518, + "balance_loss_clip": 1.05250013, + "balance_loss_mlp": 1.02721143, + "epoch": 0.2674883511197956, + "flos": 27840154092480.0, + "grad_norm": 1.6191268532529055, + "language_loss": 0.8418932, + "learning_rate": 3.436953029616378e-06, + "loss": 0.86376572, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.15307617, + "step": 4449, + "time_per_iteration": 2.6815502643585205 + }, + { + "auxiliary_loss_clip": 0.011536, + "auxiliary_loss_mlp": 0.01041095, + "balance_loss_clip": 1.05162895, + "balance_loss_mlp": 1.02441728, + "epoch": 0.26754847437246354, + "flos": 30957473743200.0, + "grad_norm": 1.6875119455038239, + "language_loss": 0.84280425, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86475122, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 1.01953125, + "router_z_loss_mlp": 0.16674805, + "step": 4450, + "time_per_iteration": 2.713178873062134 + }, + { + "auxiliary_loss_clip": 0.01141268, + "auxiliary_loss_mlp": 0.01040173, + "balance_loss_clip": 1.04986835, + "balance_loss_mlp": 1.02553463, + "epoch": 0.2676085976251315, + "flos": 24685119204480.0, + "grad_norm": 1.9062643061232216, + "language_loss": 0.80988836, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83170283, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.14660645, + "step": 4451, + "time_per_iteration": 2.641119956970215 + }, + { + "auxiliary_loss_clip": 0.01146273, + "auxiliary_loss_mlp": 0.01036825, + "balance_loss_clip": 1.05309784, + "balance_loss_mlp": 1.02282429, + "epoch": 0.26766872087779947, + "flos": 34212816715680.0, + "grad_norm": 1.6575560289336813, + "language_loss": 0.86240178, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88423276, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.14013672, + "step": 4452, + "time_per_iteration": 2.7658791542053223 + }, + { + "auxiliary_loss_clip": 0.01150396, + "auxiliary_loss_mlp": 0.01036879, + "balance_loss_clip": 1.05399513, + "balance_loss_mlp": 1.02141762, + "epoch": 0.26772884413046744, + "flos": 22360932554400.0, + "grad_norm": 2.3560535671509055, + "language_loss": 0.83146578, + "learning_rate": 3.435869031622194e-06, + "loss": 0.85333854, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.15466309, + "step": 4453, + "time_per_iteration": 2.6086909770965576 + }, + { + "auxiliary_loss_clip": 0.01145519, + "auxiliary_loss_mlp": 0.01046066, + "balance_loss_clip": 1.05172133, + "balance_loss_mlp": 1.03013968, + "epoch": 0.2677889673831354, + "flos": 27000108432000.0, + "grad_norm": 2.383203519431809, + "language_loss": 0.79683745, + "learning_rate": 3.435597895977208e-06, + "loss": 0.81875336, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.15930176, + "step": 4454, + "time_per_iteration": 2.7047173976898193 + }, + { + "auxiliary_loss_clip": 0.01147124, + "auxiliary_loss_mlp": 0.01036831, + "balance_loss_clip": 1.05132198, + "balance_loss_mlp": 1.02241814, + "epoch": 0.2678490906358034, + "flos": 28955517085440.0, + "grad_norm": 1.574936565629602, + "language_loss": 0.72438216, + "learning_rate": 3.435326705894206e-06, + "loss": 0.74622178, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.144104, + "step": 4455, + "time_per_iteration": 2.677429437637329 + }, + { + "auxiliary_loss_clip": 0.01142813, + "auxiliary_loss_mlp": 0.01039161, + "balance_loss_clip": 1.05226922, + "balance_loss_mlp": 1.02450979, + "epoch": 0.2679092138884714, + "flos": 26555130132480.0, + "grad_norm": 1.5446955683302104, + "language_loss": 0.74115157, + "learning_rate": 3.435055461383471e-06, + "loss": 0.76297134, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.14642334, + "step": 4456, + "time_per_iteration": 2.7004237174987793 + }, + { + "auxiliary_loss_clip": 0.01148258, + "auxiliary_loss_mlp": 0.01039967, + "balance_loss_clip": 1.05175078, + "balance_loss_mlp": 1.02452934, + "epoch": 0.26796933714113935, + "flos": 24234954693120.0, + "grad_norm": 2.332721464769679, + "language_loss": 0.70892274, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.73080498, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 0.96435547, + "router_z_loss_mlp": 0.1541748, + "step": 4457, + "time_per_iteration": 2.6756997108459473 + }, + { + "auxiliary_loss_clip": 0.0114893, + "auxiliary_loss_mlp": 0.01044571, + "balance_loss_clip": 1.05416918, + "balance_loss_mlp": 1.0294311, + "epoch": 0.2680294603938073, + "flos": 24462386951040.0, + "grad_norm": 2.337915381353936, + "language_loss": 0.79261774, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81455278, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 0.94921875, + "router_z_loss_mlp": 0.15112305, + "step": 4458, + "time_per_iteration": 2.7035138607025146 + }, + { + "auxiliary_loss_clip": 0.01055946, + "auxiliary_loss_mlp": 0.01018134, + "balance_loss_clip": 1.02447772, + "balance_loss_mlp": 1.01679647, + "epoch": 0.2680895836464753, + "flos": 87993745295040.0, + "grad_norm": 0.8566122417880921, + "language_loss": 0.58629012, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60703099, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 0.01337433, + "step": 4459, + "time_per_iteration": 3.3053932189941406 + }, + { + "auxiliary_loss_clip": 0.01141724, + "auxiliary_loss_mlp": 0.01040038, + "balance_loss_clip": 1.0490346, + "balance_loss_mlp": 1.02548814, + "epoch": 0.26814970689914325, + "flos": 24951927908160.0, + "grad_norm": 1.8441172696872712, + "language_loss": 0.85151136, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87332898, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.14544678, + "step": 4460, + "time_per_iteration": 2.7781553268432617 + }, + { + "auxiliary_loss_clip": 0.0114321, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.05064595, + "balance_loss_mlp": 1.02505088, + "epoch": 0.2682098301518112, + "flos": 21434638305600.0, + "grad_norm": 1.8772120710602553, + "language_loss": 0.67504841, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.69688076, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.14971924, + "step": 4461, + "time_per_iteration": 2.671318292617798 + }, + { + "auxiliary_loss_clip": 0.01143003, + "auxiliary_loss_mlp": 0.01048397, + "balance_loss_clip": 1.05079317, + "balance_loss_mlp": 1.03343582, + "epoch": 0.2682699534044792, + "flos": 22370048942400.0, + "grad_norm": 1.6425723474928537, + "language_loss": 0.66946727, + "learning_rate": 3.43342685191282e-06, + "loss": 0.69138128, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.14953613, + "step": 4462, + "time_per_iteration": 2.765071153640747 + }, + { + "auxiliary_loss_clip": 0.01143946, + "auxiliary_loss_mlp": 0.01035407, + "balance_loss_clip": 1.0513804, + "balance_loss_mlp": 1.01923001, + "epoch": 0.26833007665714714, + "flos": 30873048432480.0, + "grad_norm": 1.765687797381976, + "language_loss": 0.69445002, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71624351, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.16174316, + "step": 4463, + "time_per_iteration": 2.7434566020965576 + }, + { + "auxiliary_loss_clip": 0.01146138, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.05099535, + "balance_loss_mlp": 1.01987052, + "epoch": 0.2683901999098151, + "flos": 19645284581280.0, + "grad_norm": 2.8225521487964533, + "language_loss": 0.77978253, + "learning_rate": 3.432883547133931e-06, + "loss": 0.80159384, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.15112305, + "step": 4464, + "time_per_iteration": 2.6245250701904297 + }, + { + "auxiliary_loss_clip": 0.01141416, + "auxiliary_loss_mlp": 0.01039621, + "balance_loss_clip": 1.04915059, + "balance_loss_mlp": 1.0237546, + "epoch": 0.2684503231624831, + "flos": 33321847184640.0, + "grad_norm": 1.8593676754327726, + "language_loss": 0.71011823, + "learning_rate": 3.432611813236704e-06, + "loss": 0.73192859, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.15844727, + "step": 4465, + "time_per_iteration": 2.995826482772827 + }, + { + "auxiliary_loss_clip": 0.01055931, + "auxiliary_loss_mlp": 0.0100319, + "balance_loss_clip": 1.02456903, + "balance_loss_mlp": 1.00166368, + "epoch": 0.26851044641515104, + "flos": 87681685140000.0, + "grad_norm": 0.7230128636325053, + "language_loss": 0.53097701, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55156821, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 0.31396484, + "router_z_loss_mlp": 0.01525116, + "step": 4466, + "time_per_iteration": 3.4586448669433594 + }, + { + "auxiliary_loss_clip": 0.01140566, + "auxiliary_loss_mlp": 0.01043686, + "balance_loss_clip": 1.04799426, + "balance_loss_mlp": 1.02843964, + "epoch": 0.268570569667819, + "flos": 22858860588480.0, + "grad_norm": 1.8224304772914193, + "language_loss": 0.73993337, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.76177591, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 0.92480469, + "router_z_loss_mlp": 0.15246582, + "step": 4467, + "time_per_iteration": 2.6662192344665527 + }, + { + "auxiliary_loss_clip": 0.01146605, + "auxiliary_loss_mlp": 0.01043641, + "balance_loss_clip": 1.05115402, + "balance_loss_mlp": 1.02782178, + "epoch": 0.268630692920487, + "flos": 22181871578400.0, + "grad_norm": 2.8255423697474766, + "language_loss": 0.80736667, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.82926917, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.15820312, + "step": 4468, + "time_per_iteration": 2.7348763942718506 + }, + { + "auxiliary_loss_clip": 0.01052662, + "auxiliary_loss_mlp": 0.01001529, + "balance_loss_clip": 1.02143013, + "balance_loss_mlp": 1.00004292, + "epoch": 0.268690816173155, + "flos": 83865746602080.0, + "grad_norm": 0.8478863707080003, + "language_loss": 0.59588504, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.616427, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 0.31201172, + "router_z_loss_mlp": 0.01485443, + "step": 4469, + "time_per_iteration": 3.3108017444610596 + }, + { + "auxiliary_loss_clip": 0.0114595, + "auxiliary_loss_mlp": 0.01044075, + "balance_loss_clip": 1.05102241, + "balance_loss_mlp": 1.02770209, + "epoch": 0.26875093942582295, + "flos": 28423236748320.0, + "grad_norm": 2.1835710425972064, + "language_loss": 0.81494004, + "learning_rate": 3.431252329084972e-06, + "loss": 0.83684027, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 0.95019531, + "router_z_loss_mlp": 0.16363525, + "step": 4470, + "time_per_iteration": 2.716367721557617 + }, + { + "auxiliary_loss_clip": 0.0113666, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.04813242, + "balance_loss_mlp": 1.01840913, + "epoch": 0.2688110626784909, + "flos": 26287430048640.0, + "grad_norm": 1.6107889378929658, + "language_loss": 0.82794631, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.84964728, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.15032959, + "step": 4471, + "time_per_iteration": 2.7724215984344482 + }, + { + "auxiliary_loss_clip": 0.01139315, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.0496887, + "balance_loss_mlp": 1.01948452, + "epoch": 0.2688711859311589, + "flos": 34655242426560.0, + "grad_norm": 5.260739668984383, + "language_loss": 0.69885391, + "learning_rate": 3.43070815543947e-06, + "loss": 0.7205866, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.14477539, + "step": 4472, + "time_per_iteration": 2.7299602031707764 + }, + { + "auxiliary_loss_clip": 0.01143029, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.0513258, + "balance_loss_mlp": 1.02299774, + "epoch": 0.26893130918382685, + "flos": 31719455305920.0, + "grad_norm": 1.5816093939202611, + "language_loss": 0.67737436, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.69917977, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 0.91748047, + "router_z_loss_mlp": 0.1451416, + "step": 4473, + "time_per_iteration": 2.744372844696045 + }, + { + "auxiliary_loss_clip": 0.01142041, + "auxiliary_loss_mlp": 0.0104144, + "balance_loss_clip": 1.05166376, + "balance_loss_mlp": 1.02693236, + "epoch": 0.2689914324364948, + "flos": 24818402004480.0, + "grad_norm": 1.7058204313083443, + "language_loss": 0.82954794, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.85138273, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.14501953, + "step": 4474, + "time_per_iteration": 2.6853508949279785 + }, + { + "auxiliary_loss_clip": 0.01141261, + "auxiliary_loss_mlp": 0.01041114, + "balance_loss_clip": 1.05160069, + "balance_loss_mlp": 1.02626061, + "epoch": 0.2690515556891628, + "flos": 23750518913280.0, + "grad_norm": 1.9872947401916705, + "language_loss": 0.70730329, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.72912705, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.14855957, + "step": 4475, + "time_per_iteration": 2.6576590538024902 + }, + { + "auxiliary_loss_clip": 0.0114217, + "auxiliary_loss_mlp": 0.01040318, + "balance_loss_clip": 1.04886532, + "balance_loss_mlp": 1.02561915, + "epoch": 0.26911167894183075, + "flos": 22142697719040.0, + "grad_norm": 1.8728883989413818, + "language_loss": 0.73281932, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75464422, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.14709473, + "step": 4476, + "time_per_iteration": 5.531100511550903 + }, + { + "auxiliary_loss_clip": 0.01138427, + "auxiliary_loss_mlp": 0.01034879, + "balance_loss_clip": 1.04804182, + "balance_loss_mlp": 1.02071691, + "epoch": 0.2691718021944987, + "flos": 24373585774080.0, + "grad_norm": 1.6080404771000933, + "language_loss": 0.806687, + "learning_rate": 3.429346772085922e-06, + "loss": 0.8284201, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 0.90380859, + "router_z_loss_mlp": 0.1416626, + "step": 4477, + "time_per_iteration": 2.669309377670288 + }, + { + "auxiliary_loss_clip": 0.01141268, + "auxiliary_loss_mlp": 0.01038754, + "balance_loss_clip": 1.04797602, + "balance_loss_mlp": 1.02369738, + "epoch": 0.2692319254471667, + "flos": 45694788396480.0, + "grad_norm": 1.6704682767119434, + "language_loss": 0.65058678, + "learning_rate": 3.429074332770984e-06, + "loss": 0.672387, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 0.93310547, + "router_z_loss_mlp": 0.15063477, + "step": 4478, + "time_per_iteration": 4.269383907318115 + }, + { + "auxiliary_loss_clip": 0.01140781, + "auxiliary_loss_mlp": 0.0103684, + "balance_loss_clip": 1.04831052, + "balance_loss_mlp": 1.02222538, + "epoch": 0.26929204869983464, + "flos": 27000432570240.0, + "grad_norm": 1.9323269085944694, + "language_loss": 0.80991971, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.83169591, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 0.92480469, + "router_z_loss_mlp": 0.14624023, + "step": 4479, + "time_per_iteration": 2.6796340942382812 + }, + { + "auxiliary_loss_clip": 0.01143395, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_clip": 1.04961765, + "balance_loss_mlp": 1.02643645, + "epoch": 0.2693521719525026, + "flos": 24151096624320.0, + "grad_norm": 2.040983877503971, + "language_loss": 0.80923963, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.83109033, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.15246582, + "step": 4480, + "time_per_iteration": 4.1723175048828125 + }, + { + "auxiliary_loss_clip": 0.01135722, + "auxiliary_loss_mlp": 0.01033921, + "balance_loss_clip": 1.04730511, + "balance_loss_mlp": 1.02046776, + "epoch": 0.2694122952051706, + "flos": 25617572079840.0, + "grad_norm": 2.712361701300435, + "language_loss": 0.78015649, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.80185294, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.13446045, + "step": 4481, + "time_per_iteration": 2.696589231491089 + }, + { + "auxiliary_loss_clip": 0.01143742, + "auxiliary_loss_mlp": 0.01040554, + "balance_loss_clip": 1.04985845, + "balance_loss_mlp": 1.02579629, + "epoch": 0.2694724184578386, + "flos": 31541690882880.0, + "grad_norm": 1.8664511079716883, + "language_loss": 0.73918271, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76102567, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.14746094, + "step": 4482, + "time_per_iteration": 2.7418906688690186 + }, + { + "auxiliary_loss_clip": 0.0114339, + "auxiliary_loss_mlp": 0.01032644, + "balance_loss_clip": 1.05101442, + "balance_loss_mlp": 1.01765394, + "epoch": 0.26953254171050656, + "flos": 26511256268640.0, + "grad_norm": 1.8095412742629406, + "language_loss": 0.7296446, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.75140488, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.14984131, + "step": 4483, + "time_per_iteration": 2.698662281036377 + }, + { + "auxiliary_loss_clip": 0.01143244, + "auxiliary_loss_mlp": 0.01042693, + "balance_loss_clip": 1.04861212, + "balance_loss_mlp": 1.02690995, + "epoch": 0.2695926649631745, + "flos": 24017043996000.0, + "grad_norm": 2.1274070575239774, + "language_loss": 0.86914074, + "learning_rate": 3.427438559239605e-06, + "loss": 0.89100009, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.15753174, + "step": 4484, + "time_per_iteration": 2.702911853790283 + }, + { + "auxiliary_loss_clip": 0.01141994, + "auxiliary_loss_mlp": 0.01040513, + "balance_loss_clip": 1.04890692, + "balance_loss_mlp": 1.02631497, + "epoch": 0.2696527882158425, + "flos": 40129561373760.0, + "grad_norm": 1.8228690225036146, + "language_loss": 0.66433579, + "learning_rate": 3.427165740807239e-06, + "loss": 0.6861608, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.14196777, + "step": 4485, + "time_per_iteration": 2.7955081462860107 + }, + { + "auxiliary_loss_clip": 0.01141949, + "auxiliary_loss_mlp": 0.01036288, + "balance_loss_clip": 1.04930985, + "balance_loss_mlp": 1.02181041, + "epoch": 0.26971291146851045, + "flos": 14794599736800.0, + "grad_norm": 5.5139538560788415, + "language_loss": 0.72868311, + "learning_rate": 3.426892868256604e-06, + "loss": 0.75046551, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.14465332, + "step": 4486, + "time_per_iteration": 2.767420530319214 + }, + { + "auxiliary_loss_clip": 0.01146714, + "auxiliary_loss_mlp": 0.01036015, + "balance_loss_clip": 1.05284989, + "balance_loss_mlp": 1.02177536, + "epoch": 0.2697730347211784, + "flos": 27619083047520.0, + "grad_norm": 2.6196498425434656, + "language_loss": 0.83753133, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.85935867, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.14239502, + "step": 4487, + "time_per_iteration": 2.735614061355591 + }, + { + "auxiliary_loss_clip": 0.01144617, + "auxiliary_loss_mlp": 0.01039579, + "balance_loss_clip": 1.04987931, + "balance_loss_mlp": 1.02459407, + "epoch": 0.2698331579738464, + "flos": 28690936832160.0, + "grad_norm": 2.1027011709270402, + "language_loss": 0.71351075, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.73535275, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.15002441, + "step": 4488, + "time_per_iteration": 2.810537815093994 + }, + { + "auxiliary_loss_clip": 0.01145525, + "auxiliary_loss_mlp": 0.01038502, + "balance_loss_clip": 1.05234671, + "balance_loss_mlp": 1.02336287, + "epoch": 0.26989328122651435, + "flos": 30067679213280.0, + "grad_norm": 1.878926793810583, + "language_loss": 0.84081471, + "learning_rate": 3.426073925998578e-06, + "loss": 0.86265498, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.15136719, + "step": 4489, + "time_per_iteration": 2.6839942932128906 + }, + { + "auxiliary_loss_clip": 0.01148954, + "auxiliary_loss_mlp": 0.01049632, + "balance_loss_clip": 1.05387974, + "balance_loss_mlp": 1.034266, + "epoch": 0.2699534044791823, + "flos": 13144646921760.0, + "grad_norm": 2.229937700179176, + "language_loss": 0.90091944, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.92290527, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.15356445, + "step": 4490, + "time_per_iteration": 2.6563050746917725 + }, + { + "auxiliary_loss_clip": 0.01140774, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.05148244, + "balance_loss_mlp": 1.02001786, + "epoch": 0.2700135277318503, + "flos": 44138053141920.0, + "grad_norm": 2.061022095283407, + "language_loss": 0.73169404, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75344062, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.13861084, + "step": 4491, + "time_per_iteration": 2.806849956512451 + }, + { + "auxiliary_loss_clip": 0.01148248, + "auxiliary_loss_mlp": 0.01039455, + "balance_loss_clip": 1.05523872, + "balance_loss_mlp": 1.02442265, + "epoch": 0.27007365098451824, + "flos": 21254159224800.0, + "grad_norm": 2.5459633854570827, + "language_loss": 0.7454077, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.76728475, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.15032959, + "step": 4492, + "time_per_iteration": 2.688021421432495 + }, + { + "auxiliary_loss_clip": 0.01144435, + "auxiliary_loss_mlp": 0.01039268, + "balance_loss_clip": 1.05323792, + "balance_loss_mlp": 1.02433085, + "epoch": 0.2701337742371862, + "flos": 28291372053120.0, + "grad_norm": 1.9600184745660723, + "language_loss": 0.88675702, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.90859407, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.14929199, + "step": 4493, + "time_per_iteration": 2.6915700435638428 + }, + { + "auxiliary_loss_clip": 0.01146202, + "auxiliary_loss_mlp": 0.01039574, + "balance_loss_clip": 1.05433357, + "balance_loss_mlp": 1.02520978, + "epoch": 0.2701938974898542, + "flos": 29760319062720.0, + "grad_norm": 2.2301676070725462, + "language_loss": 0.7121321, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73398989, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.14349365, + "step": 4494, + "time_per_iteration": 2.7045443058013916 + }, + { + "auxiliary_loss_clip": 0.01140261, + "auxiliary_loss_mlp": 0.01034468, + "balance_loss_clip": 1.05118942, + "balance_loss_mlp": 1.02104545, + "epoch": 0.2702540207425222, + "flos": 31987114872480.0, + "grad_norm": 2.2901439803347596, + "language_loss": 0.86579001, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88753736, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.13421631, + "step": 4495, + "time_per_iteration": 2.7070772647857666 + }, + { + "auxiliary_loss_clip": 0.01141586, + "auxiliary_loss_mlp": 0.01040051, + "balance_loss_clip": 1.05111539, + "balance_loss_mlp": 1.02579319, + "epoch": 0.27031414399519016, + "flos": 27975300687360.0, + "grad_norm": 1.6551383754060465, + "language_loss": 0.7673378, + "learning_rate": 3.424161168522959e-06, + "loss": 0.78915417, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.14245605, + "step": 4496, + "time_per_iteration": 2.6832189559936523 + }, + { + "auxiliary_loss_clip": 0.0105725, + "auxiliary_loss_mlp": 0.01017352, + "balance_loss_clip": 1.02626884, + "balance_loss_mlp": 1.0156827, + "epoch": 0.2703742672478581, + "flos": 76897400448960.0, + "grad_norm": 0.7477240934851526, + "language_loss": 0.50265563, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52340162, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 0.30957031, + "router_z_loss_mlp": 0.01672363, + "step": 4497, + "time_per_iteration": 3.32820987701416 + }, + { + "auxiliary_loss_clip": 0.01143407, + "auxiliary_loss_mlp": 0.01033493, + "balance_loss_clip": 1.05361295, + "balance_loss_mlp": 1.01977813, + "epoch": 0.2704343905005261, + "flos": 22989712351680.0, + "grad_norm": 1.905664016623931, + "language_loss": 0.7230311, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74480009, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.13720703, + "step": 4498, + "time_per_iteration": 2.6776740550994873 + }, + { + "auxiliary_loss_clip": 0.01056436, + "auxiliary_loss_mlp": 0.01008343, + "balance_loss_clip": 1.02531242, + "balance_loss_mlp": 1.00656581, + "epoch": 0.27049451375319405, + "flos": 86919906163680.0, + "grad_norm": 0.7526473287350329, + "language_loss": 0.59165066, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61229849, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 0.31201172, + "router_z_loss_mlp": 0.01777649, + "step": 4499, + "time_per_iteration": 3.29809308052063 + }, + { + "auxiliary_loss_clip": 0.01141091, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.05181479, + "balance_loss_mlp": 1.016518, + "epoch": 0.270554637005862, + "flos": 29625172467840.0, + "grad_norm": 2.1233752608622445, + "language_loss": 0.7377001, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.75941962, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.14343262, + "step": 4500, + "time_per_iteration": 2.783252716064453 + }, + { + "auxiliary_loss_clip": 0.01140845, + "auxiliary_loss_mlp": 0.01039676, + "balance_loss_clip": 1.04997849, + "balance_loss_mlp": 1.02450037, + "epoch": 0.27061476025853, + "flos": 21515417061120.0, + "grad_norm": 2.98510992588504, + "language_loss": 0.80817455, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.82997978, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.15185547, + "step": 4501, + "time_per_iteration": 2.6443936824798584 + }, + { + "auxiliary_loss_clip": 0.01144265, + "auxiliary_loss_mlp": 0.010357, + "balance_loss_clip": 1.05200315, + "balance_loss_mlp": 1.02075088, + "epoch": 0.27067488351119795, + "flos": 27711166124160.0, + "grad_norm": 1.848852491798364, + "language_loss": 0.72757781, + "learning_rate": 3.422519555811735e-06, + "loss": 0.74937737, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.1494751, + "step": 4502, + "time_per_iteration": 2.728062868118286 + }, + { + "auxiliary_loss_clip": 0.01144109, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.0485549, + "balance_loss_mlp": 1.01817811, + "epoch": 0.2707350067638659, + "flos": 50907971059200.0, + "grad_norm": 1.7409071272587564, + "language_loss": 0.68610615, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.70788944, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.16052246, + "step": 4503, + "time_per_iteration": 2.8278002738952637 + }, + { + "auxiliary_loss_clip": 0.01142243, + "auxiliary_loss_mlp": 0.01037937, + "balance_loss_clip": 1.05083466, + "balance_loss_mlp": 1.02364397, + "epoch": 0.2707951300165339, + "flos": 24639584132160.0, + "grad_norm": 2.2070813861058114, + "language_loss": 0.67805791, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.69985974, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.14294434, + "step": 4504, + "time_per_iteration": 2.7014341354370117 + }, + { + "auxiliary_loss_clip": 0.01142195, + "auxiliary_loss_mlp": 0.01036627, + "balance_loss_clip": 1.05256104, + "balance_loss_mlp": 1.02304947, + "epoch": 0.27085525326920185, + "flos": 26153823110400.0, + "grad_norm": 1.4618207589987968, + "language_loss": 0.75753504, + "learning_rate": 3.421698021097902e-06, + "loss": 0.77932328, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.13580322, + "step": 4505, + "time_per_iteration": 2.7309231758117676 + }, + { + "auxiliary_loss_clip": 0.01146767, + "auxiliary_loss_mlp": 0.0104166, + "balance_loss_clip": 1.05182242, + "balance_loss_mlp": 1.02591801, + "epoch": 0.2709153765218698, + "flos": 21567191794560.0, + "grad_norm": 2.121160343576869, + "language_loss": 0.73538673, + "learning_rate": 3.42142406835758e-06, + "loss": 0.75727099, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.15740967, + "step": 4506, + "time_per_iteration": 2.6519782543182373 + }, + { + "auxiliary_loss_clip": 0.01145802, + "auxiliary_loss_mlp": 0.01036696, + "balance_loss_clip": 1.05280554, + "balance_loss_mlp": 1.02095485, + "epoch": 0.2709754997745378, + "flos": 29842394371200.0, + "grad_norm": 2.59570890169319, + "language_loss": 0.80565763, + "learning_rate": 3.421150061716715e-06, + "loss": 0.82748258, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.15740967, + "step": 4507, + "time_per_iteration": 2.7089879512786865 + }, + { + "auxiliary_loss_clip": 0.01054322, + "auxiliary_loss_mlp": 0.01001812, + "balance_loss_clip": 1.02346873, + "balance_loss_mlp": 1.00001776, + "epoch": 0.2710356230272058, + "flos": 79570876284000.0, + "grad_norm": 0.7288779786775569, + "language_loss": 0.50882185, + "learning_rate": 3.420876001185698e-06, + "loss": 0.52938318, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 0.01792908, + "step": 4508, + "time_per_iteration": 3.2268927097320557 + }, + { + "auxiliary_loss_clip": 0.01137536, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.04894221, + "balance_loss_mlp": 1.0183084, + "epoch": 0.27109574627987376, + "flos": 31096469479680.0, + "grad_norm": 2.1115492451925473, + "language_loss": 0.74644089, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.76813966, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.14025879, + "step": 4509, + "time_per_iteration": 2.7162747383117676 + }, + { + "auxiliary_loss_clip": 0.01135944, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.04819322, + "balance_loss_mlp": 1.02096331, + "epoch": 0.2711558695325417, + "flos": 24016881926880.0, + "grad_norm": 1.6768064268151261, + "language_loss": 0.7165513, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.73825634, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.13604736, + "step": 4510, + "time_per_iteration": 2.693592071533203 + }, + { + "auxiliary_loss_clip": 0.01143353, + "auxiliary_loss_mlp": 0.01031282, + "balance_loss_clip": 1.05214787, + "balance_loss_mlp": 1.01673222, + "epoch": 0.2712159927852097, + "flos": 22680690992640.0, + "grad_norm": 2.5643280216451187, + "language_loss": 0.70623928, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72798562, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.14556885, + "step": 4511, + "time_per_iteration": 2.6725802421569824 + }, + { + "auxiliary_loss_clip": 0.01141439, + "auxiliary_loss_mlp": 0.01037549, + "balance_loss_clip": 1.04938281, + "balance_loss_mlp": 1.02236176, + "epoch": 0.27127611603787766, + "flos": 31274395971840.0, + "grad_norm": 2.4945215088641848, + "language_loss": 0.80892467, + "learning_rate": 3.419779220367979e-06, + "loss": 0.83071458, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.1517334, + "step": 4512, + "time_per_iteration": 2.690380334854126 + }, + { + "auxiliary_loss_clip": 0.01138772, + "auxiliary_loss_mlp": 0.01035128, + "balance_loss_clip": 1.04996252, + "balance_loss_mlp": 1.02137733, + "epoch": 0.2713362392905456, + "flos": 28246566291840.0, + "grad_norm": 1.868911744713373, + "language_loss": 0.80385137, + "learning_rate": 3.419504890542124e-06, + "loss": 0.82559031, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.13745117, + "step": 4513, + "time_per_iteration": 2.751953601837158 + }, + { + "auxiliary_loss_clip": 0.01139255, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.04817319, + "balance_loss_mlp": 1.02386308, + "epoch": 0.2713963625432136, + "flos": 22410397802880.0, + "grad_norm": 1.9438200494088025, + "language_loss": 0.87701845, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.89878696, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.13751221, + "step": 4514, + "time_per_iteration": 4.171311855316162 + }, + { + "auxiliary_loss_clip": 0.01140731, + "auxiliary_loss_mlp": 0.01038929, + "balance_loss_clip": 1.05091667, + "balance_loss_mlp": 1.02419448, + "epoch": 0.27145648579588155, + "flos": 27444641041440.0, + "grad_norm": 1.6916270937334503, + "language_loss": 0.92251635, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94431293, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.1472168, + "step": 4515, + "time_per_iteration": 4.211684465408325 + }, + { + "auxiliary_loss_clip": 0.01147067, + "auxiliary_loss_mlp": 0.01043852, + "balance_loss_clip": 1.05131698, + "balance_loss_mlp": 1.02675724, + "epoch": 0.2715166090485495, + "flos": 23476092960960.0, + "grad_norm": 2.1556057288350736, + "language_loss": 0.74145615, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.76336539, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.17089844, + "step": 4516, + "time_per_iteration": 2.651071310043335 + }, + { + "auxiliary_loss_clip": 0.0114229, + "auxiliary_loss_mlp": 0.01036811, + "balance_loss_clip": 1.05141103, + "balance_loss_mlp": 1.021981, + "epoch": 0.2715767323012175, + "flos": 21609323415360.0, + "grad_norm": 2.1807495692881083, + "language_loss": 0.76135612, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.78314722, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.14825439, + "step": 4517, + "time_per_iteration": 2.6744771003723145 + }, + { + "auxiliary_loss_clip": 0.01138846, + "auxiliary_loss_mlp": 0.010397, + "balance_loss_clip": 1.04839277, + "balance_loss_mlp": 1.02437603, + "epoch": 0.27163685555388545, + "flos": 27311682379680.0, + "grad_norm": 2.568222323716036, + "language_loss": 0.77492404, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.79670948, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.15319824, + "step": 4518, + "time_per_iteration": 4.152942419052124 + }, + { + "auxiliary_loss_clip": 0.01141589, + "auxiliary_loss_mlp": 0.01038246, + "balance_loss_clip": 1.05085313, + "balance_loss_mlp": 1.02515674, + "epoch": 0.2716969788065534, + "flos": 27267565412160.0, + "grad_norm": 1.825633032107985, + "language_loss": 0.68016243, + "learning_rate": 3.41785778156811e-06, + "loss": 0.7019608, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 0.90673828, + "router_z_loss_mlp": 0.13092041, + "step": 4519, + "time_per_iteration": 2.6878559589385986 + }, + { + "auxiliary_loss_clip": 0.01138094, + "auxiliary_loss_mlp": 0.01034859, + "balance_loss_clip": 1.04807663, + "balance_loss_mlp": 1.02106619, + "epoch": 0.2717571020592214, + "flos": 30784166220960.0, + "grad_norm": 2.0589542204744755, + "language_loss": 0.76085955, + "learning_rate": 3.417583075166451e-06, + "loss": 0.78258908, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.13781738, + "step": 4520, + "time_per_iteration": 4.1950013637542725 + }, + { + "auxiliary_loss_clip": 0.01142899, + "auxiliary_loss_mlp": 0.01041918, + "balance_loss_clip": 1.05048501, + "balance_loss_mlp": 1.02628982, + "epoch": 0.2718172253118894, + "flos": 24635897059680.0, + "grad_norm": 2.4752184705968543, + "language_loss": 0.76503658, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78688473, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 0.92333984, + "router_z_loss_mlp": 0.15625, + "step": 4521, + "time_per_iteration": 2.6415295600891113 + }, + { + "auxiliary_loss_clip": 0.01142532, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_clip": 1.04903316, + "balance_loss_mlp": 1.02815557, + "epoch": 0.27187734856455736, + "flos": 17471762644320.0, + "grad_norm": 2.3985794335632993, + "language_loss": 0.7499578, + "learning_rate": 3.417033501108875e-06, + "loss": 0.7718159, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.15136719, + "step": 4522, + "time_per_iteration": 2.6439573764801025 + }, + { + "auxiliary_loss_clip": 0.01143924, + "auxiliary_loss_mlp": 0.01039047, + "balance_loss_clip": 1.0513165, + "balance_loss_mlp": 1.02358544, + "epoch": 0.27193747181722533, + "flos": 25755473849760.0, + "grad_norm": 1.7698498468625217, + "language_loss": 0.72609442, + "learning_rate": 3.416758633473798e-06, + "loss": 0.74792415, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.15454102, + "step": 4523, + "time_per_iteration": 2.7141568660736084 + }, + { + "auxiliary_loss_clip": 0.01137291, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.04797304, + "balance_loss_mlp": 1.02113593, + "epoch": 0.2719975950698933, + "flos": 24016881926880.0, + "grad_norm": 1.4383615072547389, + "language_loss": 0.74052382, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.76226056, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.15258789, + "step": 4524, + "time_per_iteration": 2.67980694770813 + }, + { + "auxiliary_loss_clip": 0.01143242, + "auxiliary_loss_mlp": 0.01035624, + "balance_loss_clip": 1.05145621, + "balance_loss_mlp": 1.02081275, + "epoch": 0.27205771832256126, + "flos": 30205743052320.0, + "grad_norm": 2.7881565706872244, + "language_loss": 0.76487815, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78666675, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.14813232, + "step": 4525, + "time_per_iteration": 2.6632018089294434 + }, + { + "auxiliary_loss_clip": 0.01137495, + "auxiliary_loss_mlp": 0.0104199, + "balance_loss_clip": 1.04823577, + "balance_loss_mlp": 1.02705288, + "epoch": 0.2721178415752292, + "flos": 26546905124640.0, + "grad_norm": 2.1439855521799904, + "language_loss": 0.81758404, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.83937883, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.14941406, + "step": 4526, + "time_per_iteration": 2.6634366512298584 + }, + { + "auxiliary_loss_clip": 0.01144633, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.04996359, + "balance_loss_mlp": 1.02203906, + "epoch": 0.2721779648278972, + "flos": 15467739605280.0, + "grad_norm": 3.614933791475529, + "language_loss": 0.77089787, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.79273087, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.16625977, + "step": 4527, + "time_per_iteration": 2.643415689468384 + }, + { + "auxiliary_loss_clip": 0.01140918, + "auxiliary_loss_mlp": 0.01040829, + "balance_loss_clip": 1.04987645, + "balance_loss_mlp": 1.02570724, + "epoch": 0.27223808808056515, + "flos": 19831841254080.0, + "grad_norm": 2.282990758100476, + "language_loss": 0.82258803, + "learning_rate": 3.415383489652503e-06, + "loss": 0.84440553, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.15112305, + "step": 4528, + "time_per_iteration": 2.654414653778076 + }, + { + "auxiliary_loss_clip": 0.01139071, + "auxiliary_loss_mlp": 0.01036552, + "balance_loss_clip": 1.05033803, + "balance_loss_mlp": 1.02306914, + "epoch": 0.2722982113332331, + "flos": 33856031833920.0, + "grad_norm": 2.090760774319052, + "language_loss": 0.77519429, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.79695058, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 0.88720703, + "router_z_loss_mlp": 0.13470459, + "step": 4529, + "time_per_iteration": 2.731166362762451 + }, + { + "auxiliary_loss_clip": 0.01142402, + "auxiliary_loss_mlp": 0.01043611, + "balance_loss_clip": 1.04937088, + "balance_loss_mlp": 1.02921081, + "epoch": 0.2723583345859011, + "flos": 26510445923040.0, + "grad_norm": 1.9527944694999844, + "language_loss": 0.82660741, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.84846753, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.14404297, + "step": 4530, + "time_per_iteration": 2.7696452140808105 + }, + { + "auxiliary_loss_clip": 0.01141895, + "auxiliary_loss_mlp": 0.01037064, + "balance_loss_clip": 1.05021644, + "balance_loss_mlp": 1.0220201, + "epoch": 0.27241845783856905, + "flos": 21166938221760.0, + "grad_norm": 11.218469415961719, + "language_loss": 0.91956592, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.94135547, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 0.91601562, + "router_z_loss_mlp": 0.1505127, + "step": 4531, + "time_per_iteration": 2.8209362030029297 + }, + { + "auxiliary_loss_clip": 0.01143603, + "auxiliary_loss_mlp": 0.01042735, + "balance_loss_clip": 1.05020022, + "balance_loss_mlp": 1.02750015, + "epoch": 0.272478581091237, + "flos": 30205540465920.0, + "grad_norm": 2.1717548343876563, + "language_loss": 0.76298702, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.78485048, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.15246582, + "step": 4532, + "time_per_iteration": 2.730644464492798 + }, + { + "auxiliary_loss_clip": 0.01137054, + "auxiliary_loss_mlp": 0.01032208, + "balance_loss_clip": 1.04870617, + "balance_loss_mlp": 1.01740181, + "epoch": 0.272538704343905, + "flos": 21830353943040.0, + "grad_norm": 2.481642231636989, + "language_loss": 0.8891871, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.91087973, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.14794922, + "step": 4533, + "time_per_iteration": 2.6698131561279297 + }, + { + "auxiliary_loss_clip": 0.01137696, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.04976821, + "balance_loss_mlp": 1.01579499, + "epoch": 0.272598827596573, + "flos": 27981661900320.0, + "grad_norm": 1.848955987716963, + "language_loss": 0.71497726, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73665917, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.14685059, + "step": 4534, + "time_per_iteration": 2.6923439502716064 + }, + { + "auxiliary_loss_clip": 0.01141295, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.04853237, + "balance_loss_mlp": 1.02214754, + "epoch": 0.27265895084924097, + "flos": 29576477047680.0, + "grad_norm": 2.4261868252174503, + "language_loss": 0.91431063, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.93610376, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.15881348, + "step": 4535, + "time_per_iteration": 2.7235865592956543 + }, + { + "auxiliary_loss_clip": 0.01145696, + "auxiliary_loss_mlp": 0.01037241, + "balance_loss_clip": 1.05424595, + "balance_loss_mlp": 1.02179182, + "epoch": 0.27271907410190893, + "flos": 32962266610560.0, + "grad_norm": 1.642954902477926, + "language_loss": 0.73129904, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.75312841, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.15454102, + "step": 4536, + "time_per_iteration": 2.7613637447357178 + }, + { + "auxiliary_loss_clip": 0.01140081, + "auxiliary_loss_mlp": 0.01037172, + "balance_loss_clip": 1.04868126, + "balance_loss_mlp": 1.02216399, + "epoch": 0.2727791973545769, + "flos": 42040772025120.0, + "grad_norm": 1.907562421116014, + "language_loss": 0.71417236, + "learning_rate": 3.41290485034781e-06, + "loss": 0.73594487, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.15014648, + "step": 4537, + "time_per_iteration": 2.7965502738952637 + }, + { + "auxiliary_loss_clip": 0.01140259, + "auxiliary_loss_mlp": 0.01037641, + "balance_loss_clip": 1.04860711, + "balance_loss_mlp": 1.0229125, + "epoch": 0.27283932060724486, + "flos": 18354183029280.0, + "grad_norm": 2.6054840518222973, + "language_loss": 0.78233016, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.8041091, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.14727783, + "step": 4538, + "time_per_iteration": 2.680402994155884 + }, + { + "auxiliary_loss_clip": 0.01141934, + "auxiliary_loss_mlp": 0.01039855, + "balance_loss_clip": 1.05143285, + "balance_loss_mlp": 1.02566934, + "epoch": 0.2728994438599128, + "flos": 26420429227680.0, + "grad_norm": 1.8458622267225826, + "language_loss": 0.89883077, + "learning_rate": 3.412353451992847e-06, + "loss": 0.92064869, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.14196777, + "step": 4539, + "time_per_iteration": 2.665207624435425 + }, + { + "auxiliary_loss_clip": 0.01142221, + "auxiliary_loss_mlp": 0.01037864, + "balance_loss_clip": 1.05195975, + "balance_loss_mlp": 1.02199709, + "epoch": 0.2729595671125808, + "flos": 21339637984800.0, + "grad_norm": 2.175756887333704, + "language_loss": 0.88301992, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.9048208, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.15856934, + "step": 4540, + "time_per_iteration": 2.6632533073425293 + }, + { + "auxiliary_loss_clip": 0.0114337, + "auxiliary_loss_mlp": 0.01037987, + "balance_loss_clip": 1.05119359, + "balance_loss_mlp": 1.02310371, + "epoch": 0.27301969036524876, + "flos": 23571903627360.0, + "grad_norm": 3.3674614449377422, + "language_loss": 0.81845975, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.84027338, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.14898682, + "step": 4541, + "time_per_iteration": 2.656313180923462 + }, + { + "auxiliary_loss_clip": 0.01140152, + "auxiliary_loss_mlp": 0.01037693, + "balance_loss_clip": 1.04943824, + "balance_loss_mlp": 1.02291083, + "epoch": 0.2730798136179167, + "flos": 25704712048320.0, + "grad_norm": 2.930169053213318, + "language_loss": 0.79433548, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.81611383, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 0.90673828, + "router_z_loss_mlp": 0.14788818, + "step": 4542, + "time_per_iteration": 2.7349929809570312 + }, + { + "auxiliary_loss_clip": 0.01142982, + "auxiliary_loss_mlp": 0.01034915, + "balance_loss_clip": 1.05276084, + "balance_loss_mlp": 1.02008545, + "epoch": 0.2731399368705847, + "flos": 23393288341440.0, + "grad_norm": 2.3907724086255824, + "language_loss": 0.89766455, + "learning_rate": 3.411250012687582e-06, + "loss": 0.91944349, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.14855957, + "step": 4543, + "time_per_iteration": 2.6552915573120117 + }, + { + "auxiliary_loss_clip": 0.01144322, + "auxiliary_loss_mlp": 0.01043159, + "balance_loss_clip": 1.05029452, + "balance_loss_mlp": 1.02743578, + "epoch": 0.27320006012325265, + "flos": 22318031105280.0, + "grad_norm": 2.0092707170897706, + "language_loss": 0.63415754, + "learning_rate": 3.410974019048255e-06, + "loss": 0.65603232, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.15722656, + "step": 4544, + "time_per_iteration": 2.7128689289093018 + }, + { + "auxiliary_loss_clip": 0.01142057, + "auxiliary_loss_mlp": 0.01038047, + "balance_loss_clip": 1.05107057, + "balance_loss_mlp": 1.02170336, + "epoch": 0.2732601833759206, + "flos": 42538416438240.0, + "grad_norm": 1.7129164742894984, + "language_loss": 0.69955486, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72135592, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.16320801, + "step": 4545, + "time_per_iteration": 2.8022093772888184 + }, + { + "auxiliary_loss_clip": 0.01054667, + "auxiliary_loss_mlp": 0.01003512, + "balance_loss_clip": 1.0234772, + "balance_loss_mlp": 1.00192547, + "epoch": 0.2733203066285886, + "flos": 65780803313280.0, + "grad_norm": 0.7203202434560138, + "language_loss": 0.61602569, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63660747, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 0.31176758, + "router_z_loss_mlp": 0.01587677, + "step": 4546, + "time_per_iteration": 3.3613126277923584 + }, + { + "auxiliary_loss_clip": 0.0114718, + "auxiliary_loss_mlp": 0.01043976, + "balance_loss_clip": 1.05568957, + "balance_loss_mlp": 1.028705, + "epoch": 0.2733804298812566, + "flos": 25210511604000.0, + "grad_norm": 2.023394803180945, + "language_loss": 0.64977133, + "learning_rate": 3.410145717146488e-06, + "loss": 0.67168295, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.15258789, + "step": 4547, + "time_per_iteration": 2.758897304534912 + }, + { + "auxiliary_loss_clip": 0.01138864, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.05072236, + "balance_loss_mlp": 1.02596283, + "epoch": 0.27344055313392457, + "flos": 31584592332000.0, + "grad_norm": 2.430111756441093, + "language_loss": 0.77747989, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.79926878, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.14056396, + "step": 4548, + "time_per_iteration": 2.732609748840332 + }, + { + "auxiliary_loss_clip": 0.01141313, + "auxiliary_loss_mlp": 0.01040471, + "balance_loss_clip": 1.05152583, + "balance_loss_mlp": 1.02701843, + "epoch": 0.27350067638659253, + "flos": 27979109311680.0, + "grad_norm": 2.24501702843274, + "language_loss": 0.8274318, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.8492496, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.13452148, + "step": 4549, + "time_per_iteration": 2.7528531551361084 + }, + { + "auxiliary_loss_clip": 0.01143113, + "auxiliary_loss_mlp": 0.01036769, + "balance_loss_clip": 1.04968035, + "balance_loss_mlp": 1.02044952, + "epoch": 0.2735607996392605, + "flos": 20224923268320.0, + "grad_norm": 2.7457399318378948, + "language_loss": 0.71051288, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.73231167, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.16333008, + "step": 4550, + "time_per_iteration": 2.7171056270599365 + }, + { + "auxiliary_loss_clip": 0.01139591, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.05025005, + "balance_loss_mlp": 1.02015257, + "epoch": 0.27362092289192846, + "flos": 23971225302720.0, + "grad_norm": 2.1898293741906025, + "language_loss": 0.78651285, + "learning_rate": 3.409040566039563e-06, + "loss": 0.80824608, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.13574219, + "step": 4551, + "time_per_iteration": 2.6782422065734863 + }, + { + "auxiliary_loss_clip": 0.01140751, + "auxiliary_loss_mlp": 0.0104052, + "balance_loss_clip": 1.04893374, + "balance_loss_mlp": 1.02535665, + "epoch": 0.27368104614459643, + "flos": 21523115344320.0, + "grad_norm": 2.2827293427960007, + "language_loss": 0.71096987, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.73278254, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 0.91894531, + "router_z_loss_mlp": 0.15161133, + "step": 4552, + "time_per_iteration": 2.6314568519592285 + }, + { + "auxiliary_loss_clip": 0.011428, + "auxiliary_loss_mlp": 0.01035172, + "balance_loss_clip": 1.0509721, + "balance_loss_mlp": 1.02010453, + "epoch": 0.2737411693972644, + "flos": 26332033223520.0, + "grad_norm": 2.2829308815088876, + "language_loss": 0.71555549, + "learning_rate": 3.408487669858431e-06, + "loss": 0.73733521, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 0.91748047, + "router_z_loss_mlp": 0.15075684, + "step": 4553, + "time_per_iteration": 2.735250234603882 + }, + { + "auxiliary_loss_clip": 0.01139802, + "auxiliary_loss_mlp": 0.0103844, + "balance_loss_clip": 1.04947579, + "balance_loss_mlp": 1.02263331, + "epoch": 0.27380129264993236, + "flos": 31096023789600.0, + "grad_norm": 1.7453282499007048, + "language_loss": 0.58902121, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.6108036, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.15808105, + "step": 4554, + "time_per_iteration": 4.158017635345459 + }, + { + "auxiliary_loss_clip": 0.01148271, + "auxiliary_loss_mlp": 0.01033662, + "balance_loss_clip": 1.05272985, + "balance_loss_mlp": 1.01840353, + "epoch": 0.2738614159026003, + "flos": 22770261997920.0, + "grad_norm": 1.7690755823873532, + "language_loss": 0.73594308, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.75776243, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.15246582, + "step": 4555, + "time_per_iteration": 4.066922903060913 + }, + { + "auxiliary_loss_clip": 0.01143391, + "auxiliary_loss_mlp": 0.01036686, + "balance_loss_clip": 1.05091155, + "balance_loss_mlp": 1.02195191, + "epoch": 0.2739215391552683, + "flos": 28647062968320.0, + "grad_norm": 2.623527034481916, + "language_loss": 0.7761907, + "learning_rate": 3.407657925038002e-06, + "loss": 0.79799151, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.14758301, + "step": 4556, + "time_per_iteration": 2.679594039916992 + }, + { + "auxiliary_loss_clip": 0.0115163, + "auxiliary_loss_mlp": 0.0104579, + "balance_loss_clip": 1.05078566, + "balance_loss_mlp": 1.02916002, + "epoch": 0.27398166240793626, + "flos": 20900129518080.0, + "grad_norm": 1.8712975526044335, + "language_loss": 0.82051313, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84248734, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 1.01074219, + "router_z_loss_mlp": 0.16625977, + "step": 4557, + "time_per_iteration": 4.034588098526001 + }, + { + "auxiliary_loss_clip": 0.01139963, + "auxiliary_loss_mlp": 0.01035946, + "balance_loss_clip": 1.04873896, + "balance_loss_mlp": 1.02143228, + "epoch": 0.2740417856606042, + "flos": 28558666964160.0, + "grad_norm": 1.87108170706634, + "language_loss": 0.72861671, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.75037575, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.14508057, + "step": 4558, + "time_per_iteration": 2.680990219116211 + }, + { + "auxiliary_loss_clip": 0.01139353, + "auxiliary_loss_mlp": 0.01045591, + "balance_loss_clip": 1.04820871, + "balance_loss_mlp": 1.03054643, + "epoch": 0.2741019089132722, + "flos": 15594701709600.0, + "grad_norm": 3.328144631182357, + "language_loss": 0.67810929, + "learning_rate": 3.406827699810819e-06, + "loss": 0.69995874, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.1505127, + "step": 4559, + "time_per_iteration": 4.057924270629883 + }, + { + "auxiliary_loss_clip": 0.01137896, + "auxiliary_loss_mlp": 0.01042278, + "balance_loss_clip": 1.04774427, + "balance_loss_mlp": 1.02733469, + "epoch": 0.27416203216594015, + "flos": 25174619644320.0, + "grad_norm": 1.9655106941520444, + "language_loss": 0.72337157, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.74517334, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 0.90136719, + "router_z_loss_mlp": 0.14941406, + "step": 4560, + "time_per_iteration": 2.714668035507202 + }, + { + "auxiliary_loss_clip": 0.01140592, + "auxiliary_loss_mlp": 0.01040564, + "balance_loss_clip": 1.04775786, + "balance_loss_mlp": 1.02594948, + "epoch": 0.27422215541860817, + "flos": 32386558099680.0, + "grad_norm": 2.2163865592432117, + "language_loss": 0.81430829, + "learning_rate": 3.406273949573303e-06, + "loss": 0.83611977, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.14624023, + "step": 4561, + "time_per_iteration": 2.6835761070251465 + }, + { + "auxiliary_loss_clip": 0.01142228, + "auxiliary_loss_mlp": 0.01042216, + "balance_loss_clip": 1.04930806, + "balance_loss_mlp": 1.02757072, + "epoch": 0.27428227867127614, + "flos": 28468933889760.0, + "grad_norm": 1.6399297173934912, + "language_loss": 0.75115371, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.77299809, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.14654541, + "step": 4562, + "time_per_iteration": 2.6988134384155273 + }, + { + "auxiliary_loss_clip": 0.01138171, + "auxiliary_loss_mlp": 0.01035673, + "balance_loss_clip": 1.04746199, + "balance_loss_mlp": 1.02138615, + "epoch": 0.2743424019239441, + "flos": 28108056762720.0, + "grad_norm": 1.6709437516278722, + "language_loss": 0.74721271, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76895112, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.14276123, + "step": 4563, + "time_per_iteration": 2.6743314266204834 + }, + { + "auxiliary_loss_clip": 0.01146192, + "auxiliary_loss_mlp": 0.01043305, + "balance_loss_clip": 1.04995775, + "balance_loss_mlp": 1.02663946, + "epoch": 0.27440252517661207, + "flos": 26821493146080.0, + "grad_norm": 1.902380743604232, + "language_loss": 0.622729, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.64462399, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.16662598, + "step": 4564, + "time_per_iteration": 2.6874938011169434 + }, + { + "auxiliary_loss_clip": 0.01141767, + "auxiliary_loss_mlp": 0.0104339, + "balance_loss_clip": 1.04907823, + "balance_loss_mlp": 1.02736223, + "epoch": 0.27446264842928003, + "flos": 49038284269440.0, + "grad_norm": 1.760019695799873, + "language_loss": 0.78549969, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.80735129, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.16009521, + "step": 4565, + "time_per_iteration": 2.8109095096588135 + }, + { + "auxiliary_loss_clip": 0.0114288, + "auxiliary_loss_mlp": 0.01044544, + "balance_loss_clip": 1.05135012, + "balance_loss_mlp": 1.0301553, + "epoch": 0.274522771681948, + "flos": 16447429278720.0, + "grad_norm": 2.0548490136092172, + "language_loss": 0.68829107, + "learning_rate": 3.404888640957477e-06, + "loss": 0.71016532, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.14404297, + "step": 4566, + "time_per_iteration": 2.7124226093292236 + }, + { + "auxiliary_loss_clip": 0.01141352, + "auxiliary_loss_mlp": 0.01043339, + "balance_loss_clip": 1.05220544, + "balance_loss_mlp": 1.0295701, + "epoch": 0.27458289493461596, + "flos": 34925333029920.0, + "grad_norm": 1.7822764105260742, + "language_loss": 0.61287177, + "learning_rate": 3.404611419371723e-06, + "loss": 0.63471872, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.13769531, + "step": 4567, + "time_per_iteration": 2.7092840671539307 + }, + { + "auxiliary_loss_clip": 0.01142165, + "auxiliary_loss_mlp": 0.01039059, + "balance_loss_clip": 1.05028629, + "balance_loss_mlp": 1.02380025, + "epoch": 0.2746430181872839, + "flos": 24550256230560.0, + "grad_norm": 1.839216235645627, + "language_loss": 0.82729602, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.84910822, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.15264893, + "step": 4568, + "time_per_iteration": 2.6771764755249023 + }, + { + "auxiliary_loss_clip": 0.01145631, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.0521822, + "balance_loss_mlp": 1.01978922, + "epoch": 0.2747031414399519, + "flos": 24640070339520.0, + "grad_norm": 2.057106795891912, + "language_loss": 0.68604106, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.70784938, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.1541748, + "step": 4569, + "time_per_iteration": 2.672255516052246 + }, + { + "auxiliary_loss_clip": 0.01141444, + "auxiliary_loss_mlp": 0.01034811, + "balance_loss_clip": 1.04856658, + "balance_loss_mlp": 1.01934981, + "epoch": 0.27476326469261986, + "flos": 16492154005440.0, + "grad_norm": 2.3868444478782624, + "language_loss": 0.71137798, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.73314059, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.15447998, + "step": 4570, + "time_per_iteration": 2.636786699295044 + }, + { + "auxiliary_loss_clip": 0.01054088, + "auxiliary_loss_mlp": 0.01020797, + "balance_loss_clip": 1.02356315, + "balance_loss_mlp": 1.01927209, + "epoch": 0.2748233879452878, + "flos": 80457510466080.0, + "grad_norm": 0.7301675810494552, + "language_loss": 0.5580759, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57882476, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 0.30541992, + "router_z_loss_mlp": 0.0152359, + "step": 4571, + "time_per_iteration": 3.3782782554626465 + }, + { + "auxiliary_loss_clip": 0.01146362, + "auxiliary_loss_mlp": 0.0104112, + "balance_loss_clip": 1.05161762, + "balance_loss_mlp": 1.02642202, + "epoch": 0.2748835111979558, + "flos": 21211987086720.0, + "grad_norm": 2.1611497367458403, + "language_loss": 0.7789529, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.80082774, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.14697266, + "step": 4572, + "time_per_iteration": 2.709352970123291 + }, + { + "auxiliary_loss_clip": 0.01138474, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.05035758, + "balance_loss_mlp": 1.01831961, + "epoch": 0.27494363445062375, + "flos": 28781723355840.0, + "grad_norm": 2.5415742447909246, + "language_loss": 0.81431848, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83601624, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.12982178, + "step": 4573, + "time_per_iteration": 2.747054100036621 + }, + { + "auxiliary_loss_clip": 0.01136198, + "auxiliary_loss_mlp": 0.0103337, + "balance_loss_clip": 1.04784095, + "balance_loss_mlp": 1.01926732, + "epoch": 0.2750037577032918, + "flos": 20944043899200.0, + "grad_norm": 1.629635261511025, + "language_loss": 0.79198396, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81367964, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.14099121, + "step": 4574, + "time_per_iteration": 2.829035520553589 + }, + { + "auxiliary_loss_clip": 0.01141319, + "auxiliary_loss_mlp": 0.01042991, + "balance_loss_clip": 1.04886723, + "balance_loss_mlp": 1.02886438, + "epoch": 0.27506388095595974, + "flos": 29885984614080.0, + "grad_norm": 2.235855767405729, + "language_loss": 0.74506712, + "learning_rate": 3.402391730100936e-06, + "loss": 0.7669102, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 0.92480469, + "router_z_loss_mlp": 0.14123535, + "step": 4575, + "time_per_iteration": 2.7019472122192383 + }, + { + "auxiliary_loss_clip": 0.01138, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.04855871, + "balance_loss_mlp": 1.02134144, + "epoch": 0.2751240042086277, + "flos": 47302123383360.0, + "grad_norm": 1.697829220700597, + "language_loss": 0.72270858, + "learning_rate": 3.402114029526814e-06, + "loss": 0.74443626, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.13433838, + "step": 4576, + "time_per_iteration": 2.7972612380981445 + }, + { + "auxiliary_loss_clip": 0.01139965, + "auxiliary_loss_mlp": 0.01036623, + "balance_loss_clip": 1.04901147, + "balance_loss_mlp": 1.02177012, + "epoch": 0.27518412746129567, + "flos": 32832468296640.0, + "grad_norm": 1.7351061013926894, + "language_loss": 0.73084486, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.75261074, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.14849854, + "step": 4577, + "time_per_iteration": 2.8642666339874268 + }, + { + "auxiliary_loss_clip": 0.01141371, + "auxiliary_loss_mlp": 0.01034166, + "balance_loss_clip": 1.04945862, + "balance_loss_mlp": 1.01891303, + "epoch": 0.27524425071396363, + "flos": 30383953165440.0, + "grad_norm": 1.8191619564257806, + "language_loss": 0.75737429, + "learning_rate": 3.401558468884188e-06, + "loss": 0.77912962, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 0.91894531, + "router_z_loss_mlp": 0.15252686, + "step": 4578, + "time_per_iteration": 2.75492787361145 + }, + { + "auxiliary_loss_clip": 0.01140271, + "auxiliary_loss_mlp": 0.01045521, + "balance_loss_clip": 1.04709387, + "balance_loss_mlp": 1.02814043, + "epoch": 0.2753043739666316, + "flos": 32077010016000.0, + "grad_norm": 1.5064117991494537, + "language_loss": 0.6658622, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68772018, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.17382812, + "step": 4579, + "time_per_iteration": 2.7098333835601807 + }, + { + "auxiliary_loss_clip": 0.01143661, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_clip": 1.04947662, + "balance_loss_mlp": 1.03645277, + "epoch": 0.27536449721929956, + "flos": 29537424740160.0, + "grad_norm": 1.8497964026155398, + "language_loss": 0.79952228, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82148099, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 0.94140625, + "router_z_loss_mlp": 0.15771484, + "step": 4580, + "time_per_iteration": 2.6836435794830322 + }, + { + "auxiliary_loss_clip": 0.01142184, + "auxiliary_loss_mlp": 0.01045228, + "balance_loss_clip": 1.05010033, + "balance_loss_mlp": 1.02833605, + "epoch": 0.27542462047196753, + "flos": 23839279572960.0, + "grad_norm": 1.538271119608397, + "language_loss": 0.67712831, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.6990025, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.16894531, + "step": 4581, + "time_per_iteration": 2.652618169784546 + }, + { + "auxiliary_loss_clip": 0.0114246, + "auxiliary_loss_mlp": 0.01043808, + "balance_loss_clip": 1.04813886, + "balance_loss_mlp": 1.0293479, + "epoch": 0.2754847437246355, + "flos": 17471803161600.0, + "grad_norm": 1.6407477553271017, + "language_loss": 0.78136921, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80323189, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.14459229, + "step": 4582, + "time_per_iteration": 2.666593551635742 + }, + { + "auxiliary_loss_clip": 0.01139136, + "auxiliary_loss_mlp": 0.01036133, + "balance_loss_clip": 1.04929781, + "balance_loss_mlp": 1.0226326, + "epoch": 0.27554486697730346, + "flos": 22987402866720.0, + "grad_norm": 1.6989254437621772, + "language_loss": 0.84138358, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.86313617, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.13513184, + "step": 4583, + "time_per_iteration": 2.626807451248169 + }, + { + "auxiliary_loss_clip": 0.01140367, + "auxiliary_loss_mlp": 0.01034583, + "balance_loss_clip": 1.04791737, + "balance_loss_mlp": 1.0207305, + "epoch": 0.2756049902299714, + "flos": 27311074620480.0, + "grad_norm": 1.7813025414669372, + "language_loss": 0.67239845, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.694148, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.13830566, + "step": 4584, + "time_per_iteration": 2.669088125228882 + }, + { + "auxiliary_loss_clip": 0.01136578, + "auxiliary_loss_mlp": 0.01040691, + "balance_loss_clip": 1.04723561, + "balance_loss_mlp": 1.02698791, + "epoch": 0.2756651134826394, + "flos": 23883963782400.0, + "grad_norm": 1.8002037674647307, + "language_loss": 0.77451754, + "learning_rate": 3.399612333050327e-06, + "loss": 0.79629028, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.13720703, + "step": 4585, + "time_per_iteration": 2.7463157176971436 + }, + { + "auxiliary_loss_clip": 0.01145614, + "auxiliary_loss_mlp": 0.0103849, + "balance_loss_clip": 1.05244923, + "balance_loss_mlp": 1.02294469, + "epoch": 0.27572523673530736, + "flos": 28780588872000.0, + "grad_norm": 2.0760840765582445, + "language_loss": 0.71915549, + "learning_rate": 3.399334101267362e-06, + "loss": 0.74099648, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.15545654, + "step": 4586, + "time_per_iteration": 2.7135353088378906 + }, + { + "auxiliary_loss_clip": 0.01142239, + "auxiliary_loss_mlp": 0.01033861, + "balance_loss_clip": 1.05132806, + "balance_loss_mlp": 1.01987207, + "epoch": 0.2757853599879754, + "flos": 27845340304320.0, + "grad_norm": 1.7511257598095304, + "language_loss": 0.8032577, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.82501864, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.13995361, + "step": 4587, + "time_per_iteration": 2.6551198959350586 + }, + { + "auxiliary_loss_clip": 0.01139138, + "auxiliary_loss_mlp": 0.01036503, + "balance_loss_clip": 1.04875565, + "balance_loss_mlp": 1.02236462, + "epoch": 0.27584548324064334, + "flos": 22636655059680.0, + "grad_norm": 1.847751536912949, + "language_loss": 0.82915735, + "learning_rate": 3.398777478523316e-06, + "loss": 0.85091376, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.14141846, + "step": 4588, + "time_per_iteration": 2.6810104846954346 + }, + { + "auxiliary_loss_clip": 0.01137764, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.04827809, + "balance_loss_mlp": 1.01826036, + "epoch": 0.2759056064933113, + "flos": 29004212505600.0, + "grad_norm": 1.4906522267194542, + "language_loss": 0.75242305, + "learning_rate": 3.398499087583342e-06, + "loss": 0.77412891, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.14538574, + "step": 4589, + "time_per_iteration": 2.6950910091400146 + }, + { + "auxiliary_loss_clip": 0.01135748, + "auxiliary_loss_mlp": 0.01043849, + "balance_loss_clip": 1.04696608, + "balance_loss_mlp": 1.02923429, + "epoch": 0.27596572974597927, + "flos": 29628900057600.0, + "grad_norm": 1.8878059515837218, + "language_loss": 0.88531226, + "learning_rate": 3.398220643612143e-06, + "loss": 0.90710825, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.1463623, + "step": 4590, + "time_per_iteration": 2.8067727088928223 + }, + { + "auxiliary_loss_clip": 0.01138747, + "auxiliary_loss_mlp": 0.01043303, + "balance_loss_clip": 1.04791129, + "balance_loss_mlp": 1.02840829, + "epoch": 0.27602585299864724, + "flos": 42758352999360.0, + "grad_norm": 1.5254606266698867, + "language_loss": 0.71247435, + "learning_rate": 3.397942146620277e-06, + "loss": 0.73429489, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.14886475, + "step": 4591, + "time_per_iteration": 2.9217734336853027 + }, + { + "auxiliary_loss_clip": 0.01139223, + "auxiliary_loss_mlp": 0.01041609, + "balance_loss_clip": 1.04671514, + "balance_loss_mlp": 1.02661872, + "epoch": 0.2760859762513152, + "flos": 29663292877920.0, + "grad_norm": 1.8322614570219578, + "language_loss": 0.8001157, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82192397, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 0.92480469, + "router_z_loss_mlp": 0.14984131, + "step": 4592, + "time_per_iteration": 2.767552375793457 + }, + { + "auxiliary_loss_clip": 0.0105122, + "auxiliary_loss_mlp": 0.01005674, + "balance_loss_clip": 1.02103889, + "balance_loss_mlp": 1.00394225, + "epoch": 0.27614609950398317, + "flos": 86954096397600.0, + "grad_norm": 0.7075805932304433, + "language_loss": 0.61595786, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63652682, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 0.30151367, + "router_z_loss_mlp": 0.0173645, + "step": 4593, + "time_per_iteration": 3.3539247512817383 + }, + { + "auxiliary_loss_clip": 0.01138265, + "auxiliary_loss_mlp": 0.01043956, + "balance_loss_clip": 1.04905438, + "balance_loss_mlp": 1.02924001, + "epoch": 0.27620622275665113, + "flos": 36210438024480.0, + "grad_norm": 3.3154502347761414, + "language_loss": 0.77170324, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.79352546, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.14709473, + "step": 4594, + "time_per_iteration": 5.761789083480835 + }, + { + "auxiliary_loss_clip": 0.0113952, + "auxiliary_loss_mlp": 0.0103548, + "balance_loss_clip": 1.05039716, + "balance_loss_mlp": 1.02092481, + "epoch": 0.2762663460093191, + "flos": 18763877128320.0, + "grad_norm": 1.5241060947644054, + "language_loss": 0.91549253, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.93724251, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.14556885, + "step": 4595, + "time_per_iteration": 2.741957187652588 + }, + { + "auxiliary_loss_clip": 0.01141028, + "auxiliary_loss_mlp": 0.01048249, + "balance_loss_clip": 1.04950297, + "balance_loss_mlp": 1.03332436, + "epoch": 0.27632646926198706, + "flos": 25263177717600.0, + "grad_norm": 1.9572878486129235, + "language_loss": 0.69369066, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.71558344, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.14929199, + "step": 4596, + "time_per_iteration": 4.273008584976196 + }, + { + "auxiliary_loss_clip": 0.01145311, + "auxiliary_loss_mlp": 0.01039651, + "balance_loss_clip": 1.04977167, + "balance_loss_mlp": 1.02497661, + "epoch": 0.276386592514655, + "flos": 40040233472160.0, + "grad_norm": 2.0061228433601728, + "language_loss": 0.63767278, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.65952235, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 0.95458984, + "router_z_loss_mlp": 0.14660645, + "step": 4597, + "time_per_iteration": 2.787538528442383 + }, + { + "auxiliary_loss_clip": 0.01138079, + "auxiliary_loss_mlp": 0.01039406, + "balance_loss_clip": 1.04967666, + "balance_loss_mlp": 1.0257926, + "epoch": 0.276446715767323, + "flos": 22636655059680.0, + "grad_norm": 2.6308403515512224, + "language_loss": 0.86473268, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88650751, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.13598633, + "step": 4598, + "time_per_iteration": 4.090108871459961 + }, + { + "auxiliary_loss_clip": 0.01141375, + "auxiliary_loss_mlp": 0.01042166, + "balance_loss_clip": 1.05025101, + "balance_loss_mlp": 1.02700305, + "epoch": 0.27650683901999096, + "flos": 27844773062400.0, + "grad_norm": 2.261334656664807, + "language_loss": 0.80153537, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82337081, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.15161133, + "step": 4599, + "time_per_iteration": 2.6455001831054688 + }, + { + "auxiliary_loss_clip": 0.01144667, + "auxiliary_loss_mlp": 0.01045004, + "balance_loss_clip": 1.05046606, + "balance_loss_mlp": 1.03081775, + "epoch": 0.276566962272659, + "flos": 26065386588960.0, + "grad_norm": 2.0392886668796213, + "language_loss": 0.78983247, + "learning_rate": 3.395433289506639e-06, + "loss": 0.81172919, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.14202881, + "step": 4600, + "time_per_iteration": 2.7642908096313477 + }, + { + "auxiliary_loss_clip": 0.01143703, + "auxiliary_loss_mlp": 0.01043774, + "balance_loss_clip": 1.04925513, + "balance_loss_mlp": 1.0290283, + "epoch": 0.27662708552532694, + "flos": 21610457899200.0, + "grad_norm": 2.29748506838711, + "language_loss": 0.72976154, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75163627, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.14733887, + "step": 4601, + "time_per_iteration": 2.6241884231567383 + }, + { + "auxiliary_loss_clip": 0.01137873, + "auxiliary_loss_mlp": 0.01044939, + "balance_loss_clip": 1.04745173, + "balance_loss_mlp": 1.02972758, + "epoch": 0.2766872087779949, + "flos": 25932549479040.0, + "grad_norm": 1.7014267253208533, + "language_loss": 0.8018713, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.82369941, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.15209961, + "step": 4602, + "time_per_iteration": 2.8529224395751953 + }, + { + "auxiliary_loss_clip": 0.0114536, + "auxiliary_loss_mlp": 0.01045143, + "balance_loss_clip": 1.04950082, + "balance_loss_mlp": 1.02844191, + "epoch": 0.2767473320306629, + "flos": 15779029932000.0, + "grad_norm": 2.4969880153681405, + "language_loss": 0.77274811, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.79465318, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 0.95947266, + "router_z_loss_mlp": 0.16699219, + "step": 4603, + "time_per_iteration": 2.6363842487335205 + }, + { + "auxiliary_loss_clip": 0.01135728, + "auxiliary_loss_mlp": 0.01043178, + "balance_loss_clip": 1.04823518, + "balance_loss_mlp": 1.02952886, + "epoch": 0.27680745528333084, + "flos": 18318250552320.0, + "grad_norm": 2.5287982215682487, + "language_loss": 0.81794697, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.83973604, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.13647461, + "step": 4604, + "time_per_iteration": 2.6483523845672607 + }, + { + "auxiliary_loss_clip": 0.01141746, + "auxiliary_loss_mlp": 0.01033328, + "balance_loss_clip": 1.04890513, + "balance_loss_mlp": 1.01918387, + "epoch": 0.2768675785359988, + "flos": 27622891671840.0, + "grad_norm": 2.0770681212952646, + "language_loss": 0.69920468, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.72095537, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.14147949, + "step": 4605, + "time_per_iteration": 2.6986255645751953 + }, + { + "auxiliary_loss_clip": 0.01048512, + "auxiliary_loss_mlp": 0.01006625, + "balance_loss_clip": 1.01854789, + "balance_loss_mlp": 1.0048666, + "epoch": 0.27692770178866677, + "flos": 80692519455360.0, + "grad_norm": 0.6971646495509528, + "language_loss": 0.57108378, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59163511, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 0.29980469, + "router_z_loss_mlp": 0.01757812, + "step": 4606, + "time_per_iteration": 3.380445718765259 + }, + { + "auxiliary_loss_clip": 0.01142705, + "auxiliary_loss_mlp": 0.01041118, + "balance_loss_clip": 1.05048847, + "balance_loss_mlp": 1.02583539, + "epoch": 0.27698782504133473, + "flos": 32293421573760.0, + "grad_norm": 2.0128974362422953, + "language_loss": 0.69225669, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.71409488, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 0.92333984, + "router_z_loss_mlp": 0.15270996, + "step": 4607, + "time_per_iteration": 2.759855031967163 + }, + { + "auxiliary_loss_clip": 0.01135154, + "auxiliary_loss_mlp": 0.01031833, + "balance_loss_clip": 1.0475148, + "balance_loss_mlp": 1.01789713, + "epoch": 0.2770479482940027, + "flos": 31585119056640.0, + "grad_norm": 1.6354322030982542, + "language_loss": 0.70103484, + "learning_rate": 3.393199595837555e-06, + "loss": 0.72270471, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.1394043, + "step": 4608, + "time_per_iteration": 2.769137382507324 + }, + { + "auxiliary_loss_clip": 0.01140776, + "auxiliary_loss_mlp": 0.01036481, + "balance_loss_clip": 1.04876566, + "balance_loss_mlp": 1.02204454, + "epoch": 0.27710807154667066, + "flos": 27890591755680.0, + "grad_norm": 2.2798593894516803, + "language_loss": 0.72989488, + "learning_rate": 3.392920146281499e-06, + "loss": 0.7516675, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 0.91992188, + "router_z_loss_mlp": 0.14440918, + "step": 4609, + "time_per_iteration": 2.699496269226074 + }, + { + "auxiliary_loss_clip": 0.0113913, + "auxiliary_loss_mlp": 0.01041639, + "balance_loss_clip": 1.04615581, + "balance_loss_mlp": 1.02670193, + "epoch": 0.27716819479933863, + "flos": 21610984623840.0, + "grad_norm": 2.36582699514068, + "language_loss": 0.84147799, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.86328566, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.14935303, + "step": 4610, + "time_per_iteration": 2.608570098876953 + }, + { + "auxiliary_loss_clip": 0.01142701, + "auxiliary_loss_mlp": 0.01039534, + "balance_loss_clip": 1.04796767, + "balance_loss_mlp": 1.02373934, + "epoch": 0.2772283180520066, + "flos": 23972076165600.0, + "grad_norm": 1.8305533106673817, + "language_loss": 0.69128937, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.7131117, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 0.94677734, + "router_z_loss_mlp": 0.15783691, + "step": 4611, + "time_per_iteration": 2.6769328117370605 + }, + { + "auxiliary_loss_clip": 0.0113474, + "auxiliary_loss_mlp": 0.01030598, + "balance_loss_clip": 1.04856157, + "balance_loss_mlp": 1.01632261, + "epoch": 0.27728844130467456, + "flos": 25664079566880.0, + "grad_norm": 2.907931459578057, + "language_loss": 0.73650312, + "learning_rate": 3.392081480737698e-06, + "loss": 0.75815642, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.14282227, + "step": 4612, + "time_per_iteration": 2.686365842819214 + }, + { + "auxiliary_loss_clip": 0.01139989, + "auxiliary_loss_mlp": 0.01041559, + "balance_loss_clip": 1.04768813, + "balance_loss_mlp": 1.02630019, + "epoch": 0.2773485645573425, + "flos": 23170556088000.0, + "grad_norm": 2.251361796133116, + "language_loss": 0.66826719, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.69008261, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.15252686, + "step": 4613, + "time_per_iteration": 2.6532461643218994 + }, + { + "auxiliary_loss_clip": 0.01138008, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_clip": 1.04822588, + "balance_loss_mlp": 1.02443206, + "epoch": 0.27740868781001055, + "flos": 26196076283040.0, + "grad_norm": 3.3174218540614677, + "language_loss": 0.79515159, + "learning_rate": 3.39152210641815e-06, + "loss": 0.81693149, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 0.89794922, + "router_z_loss_mlp": 0.15545654, + "step": 4614, + "time_per_iteration": 2.680203914642334 + }, + { + "auxiliary_loss_clip": 0.01140471, + "auxiliary_loss_mlp": 0.01035397, + "balance_loss_clip": 1.04819107, + "balance_loss_mlp": 1.02100801, + "epoch": 0.2774688110626785, + "flos": 24193957556160.0, + "grad_norm": 2.5104079336999385, + "language_loss": 0.80067545, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.82243413, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.14379883, + "step": 4615, + "time_per_iteration": 2.650247573852539 + }, + { + "auxiliary_loss_clip": 0.01143125, + "auxiliary_loss_mlp": 0.01043626, + "balance_loss_clip": 1.04876804, + "balance_loss_mlp": 1.02860582, + "epoch": 0.2775289343153465, + "flos": 22227933375360.0, + "grad_norm": 2.105680963080477, + "language_loss": 0.63587391, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.65774149, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.15039062, + "step": 4616, + "time_per_iteration": 2.7312545776367188 + }, + { + "auxiliary_loss_clip": 0.01138384, + "auxiliary_loss_mlp": 0.01034296, + "balance_loss_clip": 1.0478065, + "balance_loss_mlp": 1.02012181, + "epoch": 0.27758905756801444, + "flos": 20099379268800.0, + "grad_norm": 2.0091564050027597, + "language_loss": 0.82691753, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.84864426, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.1416626, + "step": 4617, + "time_per_iteration": 2.6445958614349365 + }, + { + "auxiliary_loss_clip": 0.01138903, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.04704559, + "balance_loss_mlp": 1.02583766, + "epoch": 0.2776491808206824, + "flos": 22852620927360.0, + "grad_norm": 2.607006436256811, + "language_loss": 0.76702148, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.78881258, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.14373779, + "step": 4618, + "time_per_iteration": 2.673048257827759 + }, + { + "auxiliary_loss_clip": 0.01141612, + "auxiliary_loss_mlp": 0.01035846, + "balance_loss_clip": 1.05141032, + "balance_loss_mlp": 1.02199388, + "epoch": 0.27770930407335037, + "flos": 34212776198400.0, + "grad_norm": 1.742329939717865, + "language_loss": 0.85074043, + "learning_rate": 3.390122747388459e-06, + "loss": 0.87251508, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.13848877, + "step": 4619, + "time_per_iteration": 2.7143046855926514 + }, + { + "auxiliary_loss_clip": 0.0113555, + "auxiliary_loss_mlp": 0.01033875, + "balance_loss_clip": 1.0484395, + "balance_loss_mlp": 1.02080965, + "epoch": 0.27776942732601834, + "flos": 28735783110720.0, + "grad_norm": 1.5324065498582105, + "language_loss": 0.76611233, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.78780657, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.13067627, + "step": 4620, + "time_per_iteration": 2.720885992050171 + }, + { + "auxiliary_loss_clip": 0.01136305, + "auxiliary_loss_mlp": 0.01037426, + "balance_loss_clip": 1.04902244, + "balance_loss_mlp": 1.02350223, + "epoch": 0.2778295505786863, + "flos": 29174278645440.0, + "grad_norm": 2.805553173539015, + "language_loss": 0.78433758, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80607486, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.13922119, + "step": 4621, + "time_per_iteration": 2.7226552963256836 + }, + { + "auxiliary_loss_clip": 0.01140501, + "auxiliary_loss_mlp": 0.01044772, + "balance_loss_clip": 1.04935491, + "balance_loss_mlp": 1.02965593, + "epoch": 0.27788967383135427, + "flos": 31182312895200.0, + "grad_norm": 2.1499682421261435, + "language_loss": 0.87322563, + "learning_rate": 3.389282499322611e-06, + "loss": 0.8950783, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.15124512, + "step": 4622, + "time_per_iteration": 2.751366376876831 + }, + { + "auxiliary_loss_clip": 0.01138436, + "auxiliary_loss_mlp": 0.01042669, + "balance_loss_clip": 1.04815185, + "balance_loss_mlp": 1.02838743, + "epoch": 0.27794979708402223, + "flos": 19831192977600.0, + "grad_norm": 2.217417460089425, + "language_loss": 0.81940854, + "learning_rate": 3.389002311256369e-06, + "loss": 0.84121954, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.14276123, + "step": 4623, + "time_per_iteration": 2.6836748123168945 + }, + { + "auxiliary_loss_clip": 0.01141366, + "auxiliary_loss_mlp": 0.01036599, + "balance_loss_clip": 1.05055642, + "balance_loss_mlp": 1.02228248, + "epoch": 0.2780099203366902, + "flos": 25218696094560.0, + "grad_norm": 2.1104178062722223, + "language_loss": 0.81726396, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.83904362, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.14331055, + "step": 4624, + "time_per_iteration": 2.6952450275421143 + }, + { + "auxiliary_loss_clip": 0.0113729, + "auxiliary_loss_mlp": 0.0103979, + "balance_loss_clip": 1.0496304, + "balance_loss_mlp": 1.02552652, + "epoch": 0.27807004358935816, + "flos": 21646511928000.0, + "grad_norm": 2.553165227258747, + "language_loss": 0.76695812, + "learning_rate": 3.388441777121191e-06, + "loss": 0.78872883, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.1427002, + "step": 4625, + "time_per_iteration": 2.617122173309326 + }, + { + "auxiliary_loss_clip": 0.01136008, + "auxiliary_loss_mlp": 0.01036861, + "balance_loss_clip": 1.04824221, + "balance_loss_mlp": 1.02258587, + "epoch": 0.2781301668420261, + "flos": 20410183388160.0, + "grad_norm": 2.066915279725081, + "language_loss": 0.70323652, + "learning_rate": 3.388161431073511e-06, + "loss": 0.72496521, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.14276123, + "step": 4626, + "time_per_iteration": 2.657569646835327 + }, + { + "auxiliary_loss_clip": 0.01143369, + "auxiliary_loss_mlp": 0.01034973, + "balance_loss_clip": 1.04993248, + "balance_loss_mlp": 1.02017355, + "epoch": 0.27819029009469415, + "flos": 16892893785600.0, + "grad_norm": 2.1885635022875753, + "language_loss": 0.92413926, + "learning_rate": 3.38788103238661e-06, + "loss": 0.94592267, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.14801025, + "step": 4627, + "time_per_iteration": 2.691812038421631 + }, + { + "auxiliary_loss_clip": 0.01138582, + "auxiliary_loss_mlp": 0.01035371, + "balance_loss_clip": 1.04784465, + "balance_loss_mlp": 1.02169776, + "epoch": 0.2782504133473621, + "flos": 33054876411840.0, + "grad_norm": 1.8241093067460399, + "language_loss": 0.85594773, + "learning_rate": 3.387600581071121e-06, + "loss": 0.87768728, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.13671875, + "step": 4628, + "time_per_iteration": 2.7034213542938232 + }, + { + "auxiliary_loss_clip": 0.01135379, + "auxiliary_loss_mlp": 0.0103756, + "balance_loss_clip": 1.04726934, + "balance_loss_mlp": 1.02408922, + "epoch": 0.2783105366000301, + "flos": 25708277568960.0, + "grad_norm": 1.5674449640607546, + "language_loss": 0.79405349, + "learning_rate": 3.387320077137679e-06, + "loss": 0.8157829, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.13458252, + "step": 4629, + "time_per_iteration": 2.6763103008270264 + }, + { + "auxiliary_loss_clip": 0.01134032, + "auxiliary_loss_mlp": 0.01033756, + "balance_loss_clip": 1.04948807, + "balance_loss_mlp": 1.02038097, + "epoch": 0.27837065985269804, + "flos": 32338348886880.0, + "grad_norm": 1.748582831851337, + "language_loss": 0.84193254, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.86361045, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.13366699, + "step": 4630, + "time_per_iteration": 2.70247483253479 + }, + { + "auxiliary_loss_clip": 0.01139085, + "auxiliary_loss_mlp": 0.01038657, + "balance_loss_clip": 1.04793763, + "balance_loss_mlp": 1.02361286, + "epoch": 0.278430783105366, + "flos": 24676975231200.0, + "grad_norm": 1.987577530248351, + "language_loss": 0.8097114, + "learning_rate": 3.386758911459485e-06, + "loss": 0.83148879, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.15039062, + "step": 4631, + "time_per_iteration": 2.680431365966797 + }, + { + "auxiliary_loss_clip": 0.01142852, + "auxiliary_loss_mlp": 0.0104811, + "balance_loss_clip": 1.05215621, + "balance_loss_mlp": 1.03400803, + "epoch": 0.278490906358034, + "flos": 31228901416800.0, + "grad_norm": 1.9158529040159096, + "language_loss": 0.71258748, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.73449707, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.14111328, + "step": 4632, + "time_per_iteration": 2.7010128498077393 + }, + { + "auxiliary_loss_clip": 0.01136137, + "auxiliary_loss_mlp": 0.01037667, + "balance_loss_clip": 1.05078185, + "balance_loss_mlp": 1.02433908, + "epoch": 0.27855102961070194, + "flos": 19732991791680.0, + "grad_norm": 1.7759781385925313, + "language_loss": 0.82332635, + "learning_rate": 3.386197535437145e-06, + "loss": 0.8450644, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.13323975, + "step": 4633, + "time_per_iteration": 2.6680686473846436 + }, + { + "auxiliary_loss_clip": 0.01138653, + "auxiliary_loss_mlp": 0.01034505, + "balance_loss_clip": 1.04761624, + "balance_loss_mlp": 1.01937699, + "epoch": 0.2786111528633699, + "flos": 27976678274880.0, + "grad_norm": 1.6701397360653814, + "language_loss": 0.87821871, + "learning_rate": 3.385916768573529e-06, + "loss": 0.89995027, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.15124512, + "step": 4634, + "time_per_iteration": 4.3340003490448 + }, + { + "auxiliary_loss_clip": 0.01141392, + "auxiliary_loss_mlp": 0.01035722, + "balance_loss_clip": 1.05126441, + "balance_loss_mlp": 1.02078545, + "epoch": 0.27867127611603787, + "flos": 28558059204960.0, + "grad_norm": 1.5176312827886855, + "language_loss": 0.76791966, + "learning_rate": 3.38563594915581e-06, + "loss": 0.78969079, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 0.90136719, + "router_z_loss_mlp": 0.14953613, + "step": 4635, + "time_per_iteration": 2.6736643314361572 + }, + { + "auxiliary_loss_clip": 0.01138619, + "auxiliary_loss_mlp": 0.01035392, + "balance_loss_clip": 1.04712617, + "balance_loss_mlp": 1.02088451, + "epoch": 0.27873139936870583, + "flos": 24195294626400.0, + "grad_norm": 1.582248746040799, + "language_loss": 0.65197003, + "learning_rate": 3.385355077194637e-06, + "loss": 0.67371011, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.14526367, + "step": 4636, + "time_per_iteration": 4.093331575393677 + }, + { + "auxiliary_loss_clip": 0.01141834, + "auxiliary_loss_mlp": 0.01038135, + "balance_loss_clip": 1.04937983, + "balance_loss_mlp": 1.02315056, + "epoch": 0.2787915226213738, + "flos": 21607135482240.0, + "grad_norm": 2.2338192656072327, + "language_loss": 0.83188546, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.85368514, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.14978027, + "step": 4637, + "time_per_iteration": 2.6932921409606934 + }, + { + "auxiliary_loss_clip": 0.01135168, + "auxiliary_loss_mlp": 0.01036646, + "balance_loss_clip": 1.04719114, + "balance_loss_mlp": 1.02299023, + "epoch": 0.27885164587404176, + "flos": 26955059567040.0, + "grad_norm": 1.5403444993675866, + "language_loss": 0.75918031, + "learning_rate": 3.384793175684533e-06, + "loss": 0.78089845, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.13659668, + "step": 4638, + "time_per_iteration": 4.13681697845459 + }, + { + "auxiliary_loss_clip": 0.01140074, + "auxiliary_loss_mlp": 0.01043892, + "balance_loss_clip": 1.04839706, + "balance_loss_mlp": 1.02933621, + "epoch": 0.27891176912670973, + "flos": 23437202722560.0, + "grad_norm": 1.544948731735261, + "language_loss": 0.71560955, + "learning_rate": 3.38451214615691e-06, + "loss": 0.73744929, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.14550781, + "step": 4639, + "time_per_iteration": 2.6936938762664795 + }, + { + "auxiliary_loss_clip": 0.0113921, + "auxiliary_loss_mlp": 0.01036032, + "balance_loss_clip": 1.04838169, + "balance_loss_mlp": 1.02105355, + "epoch": 0.27897189237937775, + "flos": 33678105341760.0, + "grad_norm": 2.077779653732464, + "language_loss": 0.65504599, + "learning_rate": 3.384231064128447e-06, + "loss": 0.67679846, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.14971924, + "step": 4640, + "time_per_iteration": 2.7604498863220215 + }, + { + "auxiliary_loss_clip": 0.01139746, + "auxiliary_loss_mlp": 0.01033316, + "balance_loss_clip": 1.04852295, + "balance_loss_mlp": 1.01937485, + "epoch": 0.2790320156320457, + "flos": 25841762955360.0, + "grad_norm": 1.913779664365524, + "language_loss": 0.7258119, + "learning_rate": 3.383949929609804e-06, + "loss": 0.7475425, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.13922119, + "step": 4641, + "time_per_iteration": 2.7625222206115723 + }, + { + "auxiliary_loss_clip": 0.01142631, + "auxiliary_loss_mlp": 0.01039383, + "balance_loss_clip": 1.04775238, + "balance_loss_mlp": 1.02364767, + "epoch": 0.2790921388847137, + "flos": 27801142302240.0, + "grad_norm": 1.7691159969375132, + "language_loss": 0.74698246, + "learning_rate": 3.383668742611641e-06, + "loss": 0.76880264, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 0.94873047, + "router_z_loss_mlp": 0.1574707, + "step": 4642, + "time_per_iteration": 2.680349111557007 + }, + { + "auxiliary_loss_clip": 0.01141254, + "auxiliary_loss_mlp": 0.01036495, + "balance_loss_clip": 1.04855037, + "balance_loss_mlp": 1.02080691, + "epoch": 0.27915226213738165, + "flos": 28554047994240.0, + "grad_norm": 2.0452658253710654, + "language_loss": 0.85735142, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.87912893, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.15686035, + "step": 4643, + "time_per_iteration": 2.674253463745117 + }, + { + "auxiliary_loss_clip": 0.01140342, + "auxiliary_loss_mlp": 0.01039754, + "balance_loss_clip": 1.04912293, + "balance_loss_mlp": 1.02497792, + "epoch": 0.2792123853900496, + "flos": 27757025334720.0, + "grad_norm": 1.7128374961174615, + "language_loss": 0.83330041, + "learning_rate": 3.383106211219407e-06, + "loss": 0.85510135, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.14776611, + "step": 4644, + "time_per_iteration": 2.754380702972412 + }, + { + "auxiliary_loss_clip": 0.01140404, + "auxiliary_loss_mlp": 0.01031282, + "balance_loss_clip": 1.04894125, + "balance_loss_mlp": 1.01639867, + "epoch": 0.2792725086427176, + "flos": 18362813209920.0, + "grad_norm": 2.0326726828077506, + "language_loss": 0.78963137, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.8113482, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.14862061, + "step": 4645, + "time_per_iteration": 2.6765336990356445 + }, + { + "auxiliary_loss_clip": 0.01054072, + "auxiliary_loss_mlp": 0.01006968, + "balance_loss_clip": 1.02373171, + "balance_loss_mlp": 1.00502062, + "epoch": 0.27933263189538554, + "flos": 76317923831040.0, + "grad_norm": 0.7840812237920904, + "language_loss": 0.62269235, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64330268, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 0.30419922, + "router_z_loss_mlp": 0.01945496, + "step": 4646, + "time_per_iteration": 3.303840160369873 + }, + { + "auxiliary_loss_clip": 0.01133053, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.04655778, + "balance_loss_mlp": 1.01888847, + "epoch": 0.2793927551480535, + "flos": 30649870488960.0, + "grad_norm": 13.841283805557536, + "language_loss": 0.89644659, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.9181, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.1340332, + "step": 4647, + "time_per_iteration": 2.7134580612182617 + }, + { + "auxiliary_loss_clip": 0.01140617, + "auxiliary_loss_mlp": 0.01037169, + "balance_loss_clip": 1.04798257, + "balance_loss_mlp": 1.02166629, + "epoch": 0.27945287840072147, + "flos": 26020337724000.0, + "grad_norm": 2.5808667683739714, + "language_loss": 0.87087899, + "learning_rate": 3.381980519149988e-06, + "loss": 0.8926568, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.15496826, + "step": 4648, + "time_per_iteration": 2.7118520736694336 + }, + { + "auxiliary_loss_clip": 0.01141846, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.04893231, + "balance_loss_mlp": 1.01674795, + "epoch": 0.27951300165338944, + "flos": 33496046087040.0, + "grad_norm": 2.670992587463582, + "language_loss": 0.73130453, + "learning_rate": 3.38169896509385e-06, + "loss": 0.75303411, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.14367676, + "step": 4649, + "time_per_iteration": 2.728706121444702 + }, + { + "auxiliary_loss_clip": 0.0113918, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.04837596, + "balance_loss_mlp": 1.01796341, + "epoch": 0.2795731249060574, + "flos": 18496582217280.0, + "grad_norm": 2.3978681295584603, + "language_loss": 0.80361164, + "learning_rate": 3.381417358643549e-06, + "loss": 0.82534552, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.16247559, + "step": 4650, + "time_per_iteration": 2.6687676906585693 + }, + { + "auxiliary_loss_clip": 0.01053227, + "auxiliary_loss_mlp": 0.01002173, + "balance_loss_clip": 1.02298415, + "balance_loss_mlp": 1.00032401, + "epoch": 0.27963324815872537, + "flos": 73357224933600.0, + "grad_norm": 0.8600689227210121, + "language_loss": 0.58847421, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.60902822, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 0.30249023, + "router_z_loss_mlp": 0.01844788, + "step": 4651, + "time_per_iteration": 3.286464214324951 + }, + { + "auxiliary_loss_clip": 0.01140644, + "auxiliary_loss_mlp": 0.01035714, + "balance_loss_clip": 1.04655385, + "balance_loss_mlp": 1.01927495, + "epoch": 0.27969337141139333, + "flos": 26558614618560.0, + "grad_norm": 3.018123045628607, + "language_loss": 0.74118578, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.76294935, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 0.93994141, + "router_z_loss_mlp": 0.16430664, + "step": 4652, + "time_per_iteration": 2.6863210201263428 + }, + { + "auxiliary_loss_clip": 0.01146826, + "auxiliary_loss_mlp": 0.01042369, + "balance_loss_clip": 1.0538069, + "balance_loss_mlp": 1.02727723, + "epoch": 0.27975349466406135, + "flos": 48628914308640.0, + "grad_norm": 2.665789303819262, + "language_loss": 0.80064809, + "learning_rate": 3.380572225034461e-06, + "loss": 0.82254004, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 0.9296875, + "router_z_loss_mlp": 0.15087891, + "step": 4653, + "time_per_iteration": 2.832071542739868 + }, + { + "auxiliary_loss_clip": 0.01140218, + "auxiliary_loss_mlp": 0.01043231, + "balance_loss_clip": 1.04850793, + "balance_loss_mlp": 1.0284307, + "epoch": 0.2798136179167293, + "flos": 26331790119840.0, + "grad_norm": 2.1348614502176497, + "language_loss": 0.78989148, + "learning_rate": 3.380290409114312e-06, + "loss": 0.81172591, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.14801025, + "step": 4654, + "time_per_iteration": 2.6553406715393066 + }, + { + "auxiliary_loss_clip": 0.01145747, + "auxiliary_loss_mlp": 0.01039744, + "balance_loss_clip": 1.0510869, + "balance_loss_mlp": 1.02379405, + "epoch": 0.2798737411693973, + "flos": 26280906766560.0, + "grad_norm": 1.8659580767393602, + "language_loss": 0.81003571, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.83189058, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 0.94726562, + "router_z_loss_mlp": 0.15930176, + "step": 4655, + "time_per_iteration": 2.6791203022003174 + }, + { + "auxiliary_loss_clip": 0.01138127, + "auxiliary_loss_mlp": 0.01036885, + "balance_loss_clip": 1.04615855, + "balance_loss_mlp": 1.02139926, + "epoch": 0.27993386442206525, + "flos": 32922687578400.0, + "grad_norm": 1.6111796664928724, + "language_loss": 0.81394279, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.83569288, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 0.92041016, + "router_z_loss_mlp": 0.15478516, + "step": 4656, + "time_per_iteration": 2.7081985473632812 + }, + { + "auxiliary_loss_clip": 0.0114048, + "auxiliary_loss_mlp": 0.01037584, + "balance_loss_clip": 1.04949641, + "balance_loss_mlp": 1.02174687, + "epoch": 0.2799939876747332, + "flos": 29712312436320.0, + "grad_norm": 2.3435816434117194, + "language_loss": 0.83347154, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85525215, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.1585083, + "step": 4657, + "time_per_iteration": 2.7275209426879883 + }, + { + "auxiliary_loss_clip": 0.01140699, + "auxiliary_loss_mlp": 0.01041862, + "balance_loss_clip": 1.04924917, + "balance_loss_mlp": 1.02730048, + "epoch": 0.2800541109274012, + "flos": 41069266842240.0, + "grad_norm": 2.0348621133638836, + "language_loss": 0.63330662, + "learning_rate": 3.379162622133105e-06, + "loss": 0.65513229, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.14550781, + "step": 4658, + "time_per_iteration": 2.7582905292510986 + }, + { + "auxiliary_loss_clip": 0.01139935, + "auxiliary_loss_mlp": 0.01039455, + "balance_loss_clip": 1.04755986, + "balance_loss_mlp": 1.02435136, + "epoch": 0.28011423418006914, + "flos": 26374205361600.0, + "grad_norm": 2.131732859307287, + "language_loss": 0.78300035, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80479431, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.15100098, + "step": 4659, + "time_per_iteration": 2.6818301677703857 + }, + { + "auxiliary_loss_clip": 0.01147356, + "auxiliary_loss_mlp": 0.01045397, + "balance_loss_clip": 1.05304444, + "balance_loss_mlp": 1.02967286, + "epoch": 0.2801743574327371, + "flos": 28202003634240.0, + "grad_norm": 1.8159845012250935, + "language_loss": 0.79499435, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.81692183, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 0.94335938, + "router_z_loss_mlp": 0.15710449, + "step": 4660, + "time_per_iteration": 2.8214945793151855 + }, + { + "auxiliary_loss_clip": 0.01139785, + "auxiliary_loss_mlp": 0.01034769, + "balance_loss_clip": 1.05012298, + "balance_loss_mlp": 1.02048755, + "epoch": 0.2802344806854051, + "flos": 15424432983360.0, + "grad_norm": 2.0017132056401827, + "language_loss": 0.80318666, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.82493222, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.1427002, + "step": 4661, + "time_per_iteration": 2.6804604530334473 + }, + { + "auxiliary_loss_clip": 0.0114765, + "auxiliary_loss_mlp": 0.01051141, + "balance_loss_clip": 1.05521464, + "balance_loss_mlp": 1.03541732, + "epoch": 0.28029460393807304, + "flos": 45475662180960.0, + "grad_norm": 2.661278420063895, + "language_loss": 0.78699678, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.80898476, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.15698242, + "step": 4662, + "time_per_iteration": 2.850618600845337 + }, + { + "auxiliary_loss_clip": 0.01147636, + "auxiliary_loss_mlp": 0.01042947, + "balance_loss_clip": 1.05150247, + "balance_loss_mlp": 1.02650821, + "epoch": 0.280354727190741, + "flos": 25308307617120.0, + "grad_norm": 1.9425052942715748, + "language_loss": 0.696715, + "learning_rate": 3.377751711782227e-06, + "loss": 0.71862084, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.16442871, + "step": 4663, + "time_per_iteration": 2.668999195098877 + }, + { + "auxiliary_loss_clip": 0.01145487, + "auxiliary_loss_mlp": 0.01045689, + "balance_loss_clip": 1.05231452, + "balance_loss_mlp": 1.02852845, + "epoch": 0.28041485044340897, + "flos": 26198993527200.0, + "grad_norm": 1.62741584498509, + "language_loss": 0.77243543, + "learning_rate": 3.377469372935791e-06, + "loss": 0.79434717, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 0.93164062, + "router_z_loss_mlp": 0.17163086, + "step": 4664, + "time_per_iteration": 2.803950309753418 + }, + { + "auxiliary_loss_clip": 0.01135809, + "auxiliary_loss_mlp": 0.01040743, + "balance_loss_clip": 1.04746771, + "balance_loss_mlp": 1.02619982, + "epoch": 0.28047497369607693, + "flos": 18051482365920.0, + "grad_norm": 1.8141270632041449, + "language_loss": 0.7922228, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81398833, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.14550781, + "step": 4665, + "time_per_iteration": 2.621819496154785 + }, + { + "auxiliary_loss_clip": 0.0113883, + "auxiliary_loss_mlp": 0.01041825, + "balance_loss_clip": 1.04866838, + "balance_loss_mlp": 1.02729964, + "epoch": 0.2805350969487449, + "flos": 28151606488320.0, + "grad_norm": 1.8438342064335054, + "language_loss": 0.80533779, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.82714438, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.14532471, + "step": 4666, + "time_per_iteration": 2.7614898681640625 + }, + { + "auxiliary_loss_clip": 0.01144765, + "auxiliary_loss_mlp": 0.01044238, + "balance_loss_clip": 1.05209923, + "balance_loss_mlp": 1.02865744, + "epoch": 0.2805952202014129, + "flos": 24990048318240.0, + "grad_norm": 2.5894298218108034, + "language_loss": 0.84501582, + "learning_rate": 3.376622043036658e-06, + "loss": 0.86690593, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.15576172, + "step": 4667, + "time_per_iteration": 2.7250871658325195 + }, + { + "auxiliary_loss_clip": 0.01144487, + "auxiliary_loss_mlp": 0.01040465, + "balance_loss_clip": 1.05059242, + "balance_loss_mlp": 1.02509904, + "epoch": 0.2806553434540809, + "flos": 33455211019200.0, + "grad_norm": 1.7614299489525524, + "language_loss": 0.78855085, + "learning_rate": 3.376339495319373e-06, + "loss": 0.81040037, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.15356445, + "step": 4668, + "time_per_iteration": 2.7633585929870605 + }, + { + "auxiliary_loss_clip": 0.01140948, + "auxiliary_loss_mlp": 0.01039912, + "balance_loss_clip": 1.04703617, + "balance_loss_mlp": 1.02451038, + "epoch": 0.28071546670674885, + "flos": 32341954924800.0, + "grad_norm": 1.475117914138777, + "language_loss": 0.76080358, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.7826122, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.15393066, + "step": 4669, + "time_per_iteration": 2.7183337211608887 + }, + { + "auxiliary_loss_clip": 0.01140259, + "auxiliary_loss_mlp": 0.0104035, + "balance_loss_clip": 1.04860353, + "balance_loss_mlp": 1.02523398, + "epoch": 0.2807755899594168, + "flos": 25085696915520.0, + "grad_norm": 2.106137638165581, + "language_loss": 0.78712845, + "learning_rate": 3.375774243322725e-06, + "loss": 0.80893451, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.15106201, + "step": 4670, + "time_per_iteration": 2.747117280960083 + }, + { + "auxiliary_loss_clip": 0.01145104, + "auxiliary_loss_mlp": 0.01043985, + "balance_loss_clip": 1.05061626, + "balance_loss_mlp": 1.02751589, + "epoch": 0.2808357132120848, + "flos": 29667709261440.0, + "grad_norm": 2.016207360519588, + "language_loss": 0.78966951, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.81156039, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.16448975, + "step": 4671, + "time_per_iteration": 2.7413740158081055 + }, + { + "auxiliary_loss_clip": 0.01140598, + "auxiliary_loss_mlp": 0.01035682, + "balance_loss_clip": 1.05201173, + "balance_loss_mlp": 1.02180588, + "epoch": 0.28089583646475275, + "flos": 32253315816960.0, + "grad_norm": 1.8261727769482032, + "language_loss": 0.75142944, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77319223, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.13867188, + "step": 4672, + "time_per_iteration": 2.7058866024017334 + }, + { + "auxiliary_loss_clip": 0.01142528, + "auxiliary_loss_mlp": 0.01041758, + "balance_loss_clip": 1.04995763, + "balance_loss_mlp": 1.02515209, + "epoch": 0.2809559597174207, + "flos": 28201314840480.0, + "grad_norm": 3.6124668011662138, + "language_loss": 0.75271285, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.77455568, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.1661377, + "step": 4673, + "time_per_iteration": 4.173219203948975 + }, + { + "auxiliary_loss_clip": 0.01140841, + "auxiliary_loss_mlp": 0.01034498, + "balance_loss_clip": 1.04878855, + "balance_loss_mlp": 1.01989448, + "epoch": 0.2810160829700887, + "flos": 25530877801440.0, + "grad_norm": 1.8680885462858194, + "language_loss": 0.72963578, + "learning_rate": 3.374643113381237e-06, + "loss": 0.75138915, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 0.92041016, + "router_z_loss_mlp": 0.14599609, + "step": 4674, + "time_per_iteration": 2.7180373668670654 + }, + { + "auxiliary_loss_clip": 0.01144639, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.05108106, + "balance_loss_mlp": 1.022223, + "epoch": 0.28107620622275664, + "flos": 17516608922880.0, + "grad_norm": 1.8177291126619448, + "language_loss": 0.76903778, + "learning_rate": 3.374360200552541e-06, + "loss": 0.79086542, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.15917969, + "step": 4675, + "time_per_iteration": 2.6477174758911133 + }, + { + "auxiliary_loss_clip": 0.01144377, + "auxiliary_loss_mlp": 0.01040063, + "balance_loss_clip": 1.0498414, + "balance_loss_mlp": 1.02385056, + "epoch": 0.2811363294754246, + "flos": 25525894176000.0, + "grad_norm": 1.83674149869959, + "language_loss": 0.70195955, + "learning_rate": 3.374077235607968e-06, + "loss": 0.72380394, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.16223145, + "step": 4676, + "time_per_iteration": 4.200531959533691 + }, + { + "auxiliary_loss_clip": 0.01136579, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.05007529, + "balance_loss_mlp": 1.02243948, + "epoch": 0.28119645272809257, + "flos": 25129854400320.0, + "grad_norm": 1.722334341411444, + "language_loss": 0.70470452, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.72643948, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.14459229, + "step": 4677, + "time_per_iteration": 4.181176662445068 + }, + { + "auxiliary_loss_clip": 0.0114482, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.05200493, + "balance_loss_mlp": 1.02610517, + "epoch": 0.28125657598076054, + "flos": 30917165400000.0, + "grad_norm": 5.461843999375832, + "language_loss": 0.6405766, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.6624493, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.16345215, + "step": 4678, + "time_per_iteration": 2.7156946659088135 + }, + { + "auxiliary_loss_clip": 0.01139409, + "auxiliary_loss_mlp": 0.01038826, + "balance_loss_clip": 1.04782939, + "balance_loss_mlp": 1.0240562, + "epoch": 0.2813166992334285, + "flos": 30294382160160.0, + "grad_norm": 1.4917000802501383, + "language_loss": 0.70036411, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.72214651, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.14770508, + "step": 4679, + "time_per_iteration": 2.751156806945801 + }, + { + "auxiliary_loss_clip": 0.01142082, + "auxiliary_loss_mlp": 0.01040532, + "balance_loss_clip": 1.04887366, + "balance_loss_mlp": 1.02535653, + "epoch": 0.2813768224860965, + "flos": 26552739612960.0, + "grad_norm": 3.0875708443271708, + "language_loss": 0.74595332, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.76777947, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.15185547, + "step": 4680, + "time_per_iteration": 2.6633129119873047 + }, + { + "auxiliary_loss_clip": 0.01143644, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.05034268, + "balance_loss_mlp": 1.02419972, + "epoch": 0.2814369457387645, + "flos": 29671639437600.0, + "grad_norm": 1.9887509426928287, + "language_loss": 0.77180851, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.79362684, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 0.93408203, + "router_z_loss_mlp": 0.13995361, + "step": 4681, + "time_per_iteration": 2.786931276321411 + }, + { + "auxiliary_loss_clip": 0.01144256, + "auxiliary_loss_mlp": 0.01035193, + "balance_loss_clip": 1.05056834, + "balance_loss_mlp": 1.01955247, + "epoch": 0.28149706899143245, + "flos": 22592254471200.0, + "grad_norm": 3.029596723165671, + "language_loss": 0.74106228, + "learning_rate": 3.372378352108146e-06, + "loss": 0.76285678, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.15649414, + "step": 4682, + "time_per_iteration": 2.661140203475952 + }, + { + "auxiliary_loss_clip": 0.01140391, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_clip": 1.05006003, + "balance_loss_mlp": 1.02567244, + "epoch": 0.2815571922441004, + "flos": 30338539644960.0, + "grad_norm": 2.808715177374023, + "language_loss": 0.8054347, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.82723951, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.14416504, + "step": 4683, + "time_per_iteration": 2.766840934753418 + }, + { + "auxiliary_loss_clip": 0.01145728, + "auxiliary_loss_mlp": 0.01039832, + "balance_loss_clip": 1.05172181, + "balance_loss_mlp": 1.0242281, + "epoch": 0.2816173154967684, + "flos": 24283933734240.0, + "grad_norm": 1.6216891105174647, + "language_loss": 0.76170534, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78356087, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.15612793, + "step": 4684, + "time_per_iteration": 2.705949306488037 + }, + { + "auxiliary_loss_clip": 0.01137452, + "auxiliary_loss_mlp": 0.01035069, + "balance_loss_clip": 1.04782546, + "balance_loss_mlp": 1.02094889, + "epoch": 0.28167743874943635, + "flos": 21342919884480.0, + "grad_norm": 2.7788895046653206, + "language_loss": 0.76507652, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.78680182, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.14123535, + "step": 4685, + "time_per_iteration": 2.6768712997436523 + }, + { + "auxiliary_loss_clip": 0.01137589, + "auxiliary_loss_mlp": 0.01040335, + "balance_loss_clip": 1.04871142, + "balance_loss_mlp": 1.02610159, + "epoch": 0.2817375620021043, + "flos": 30873980329920.0, + "grad_norm": 1.9545474067001611, + "language_loss": 0.75785053, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.77962977, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.14233398, + "step": 4686, + "time_per_iteration": 2.817441463470459 + }, + { + "auxiliary_loss_clip": 0.01145644, + "auxiliary_loss_mlp": 0.01042915, + "balance_loss_clip": 1.05114162, + "balance_loss_mlp": 1.02741766, + "epoch": 0.2817976852547723, + "flos": 22809759995520.0, + "grad_norm": 2.286025361245597, + "language_loss": 0.63240999, + "learning_rate": 3.370961184640025e-06, + "loss": 0.65429556, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 0.9453125, + "router_z_loss_mlp": 0.1550293, + "step": 4687, + "time_per_iteration": 2.667512893676758 + }, + { + "auxiliary_loss_clip": 0.01145281, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_clip": 1.05197382, + "balance_loss_mlp": 1.02966046, + "epoch": 0.28185780850744024, + "flos": 27750785673600.0, + "grad_norm": 2.3938423190213576, + "language_loss": 0.76277959, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.78468561, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 0.93310547, + "router_z_loss_mlp": 0.15661621, + "step": 4688, + "time_per_iteration": 2.7581264972686768 + }, + { + "auxiliary_loss_clip": 0.01138621, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.04938436, + "balance_loss_mlp": 1.0212009, + "epoch": 0.2819179317601082, + "flos": 18222480403200.0, + "grad_norm": 1.9170794644399918, + "language_loss": 0.78322375, + "learning_rate": 3.37039395366863e-06, + "loss": 0.80496734, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.14538574, + "step": 4689, + "time_per_iteration": 2.6583869457244873 + }, + { + "auxiliary_loss_clip": 0.01140532, + "auxiliary_loss_mlp": 0.01038005, + "balance_loss_clip": 1.04902029, + "balance_loss_mlp": 1.02249622, + "epoch": 0.2819780550127762, + "flos": 28242474046560.0, + "grad_norm": 1.666831067158457, + "language_loss": 0.77845955, + "learning_rate": 3.37011026022934e-06, + "loss": 0.80024493, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.15515137, + "step": 4690, + "time_per_iteration": 2.7366037368774414 + }, + { + "auxiliary_loss_clip": 0.0114153, + "auxiliary_loss_mlp": 0.01041408, + "balance_loss_clip": 1.04912305, + "balance_loss_mlp": 1.02680504, + "epoch": 0.28203817826544414, + "flos": 26376393294720.0, + "grad_norm": 2.0233475026429337, + "language_loss": 0.87977701, + "learning_rate": 3.369826514835332e-06, + "loss": 0.90160638, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 0.92333984, + "router_z_loss_mlp": 0.14611816, + "step": 4691, + "time_per_iteration": 2.682371139526367 + }, + { + "auxiliary_loss_clip": 0.01146519, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_clip": 1.05097663, + "balance_loss_mlp": 1.02288854, + "epoch": 0.2820983015181121, + "flos": 29317569213600.0, + "grad_norm": 2.5992557087673833, + "language_loss": 0.81897211, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.84081876, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.15264893, + "step": 4692, + "time_per_iteration": 2.7491061687469482 + }, + { + "auxiliary_loss_clip": 0.01140773, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.04856229, + "balance_loss_mlp": 1.01957035, + "epoch": 0.2821584247707801, + "flos": 36616445051040.0, + "grad_norm": 1.4278535889857449, + "language_loss": 0.74672776, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.76848567, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.15441895, + "step": 4693, + "time_per_iteration": 2.737276315689087 + }, + { + "auxiliary_loss_clip": 0.01142771, + "auxiliary_loss_mlp": 0.01031738, + "balance_loss_clip": 1.04891956, + "balance_loss_mlp": 1.0168314, + "epoch": 0.2822185480234481, + "flos": 26108490624480.0, + "grad_norm": 1.7633153422024914, + "language_loss": 0.77298778, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.79473287, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 0.93945312, + "router_z_loss_mlp": 0.14898682, + "step": 4694, + "time_per_iteration": 2.8009800910949707 + }, + { + "auxiliary_loss_clip": 0.01138707, + "auxiliary_loss_mlp": 0.01035375, + "balance_loss_clip": 1.04875135, + "balance_loss_mlp": 1.02064085, + "epoch": 0.28227867127611606, + "flos": 33498598675680.0, + "grad_norm": 1.9523713470662398, + "language_loss": 0.66729224, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.68903303, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.14758301, + "step": 4695, + "time_per_iteration": 2.7550551891326904 + }, + { + "auxiliary_loss_clip": 0.01146623, + "auxiliary_loss_mlp": 0.01046905, + "balance_loss_clip": 1.05077219, + "balance_loss_mlp": 1.02972698, + "epoch": 0.282338794528784, + "flos": 27569779868160.0, + "grad_norm": 2.384022538403818, + "language_loss": 0.75615895, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.77809417, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 0.95898438, + "router_z_loss_mlp": 0.17175293, + "step": 4696, + "time_per_iteration": 2.6711273193359375 + }, + { + "auxiliary_loss_clip": 0.01146616, + "auxiliary_loss_mlp": 0.01038972, + "balance_loss_clip": 1.05296624, + "balance_loss_mlp": 1.02424955, + "epoch": 0.282398917781452, + "flos": 51263499905280.0, + "grad_norm": 1.6835490094609178, + "language_loss": 0.62144333, + "learning_rate": 3.368122952024877e-06, + "loss": 0.64329922, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.14715576, + "step": 4697, + "time_per_iteration": 2.8508496284484863 + }, + { + "auxiliary_loss_clip": 0.01136327, + "auxiliary_loss_mlp": 0.01033314, + "balance_loss_clip": 1.04734015, + "balance_loss_mlp": 1.01913393, + "epoch": 0.28245904103411995, + "flos": 28338770920320.0, + "grad_norm": 1.5627384513907634, + "language_loss": 0.73117363, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.75287002, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 0.88964844, + "router_z_loss_mlp": 0.1416626, + "step": 4698, + "time_per_iteration": 2.738304615020752 + }, + { + "auxiliary_loss_clip": 0.01138434, + "auxiliary_loss_mlp": 0.01040731, + "balance_loss_clip": 1.04887915, + "balance_loss_mlp": 1.02636647, + "epoch": 0.2825191642867879, + "flos": 30962497885920.0, + "grad_norm": 1.926120145859799, + "language_loss": 0.74803984, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.76983154, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.14355469, + "step": 4699, + "time_per_iteration": 2.7246570587158203 + }, + { + "auxiliary_loss_clip": 0.01141003, + "auxiliary_loss_mlp": 0.01033074, + "balance_loss_clip": 1.0473752, + "balance_loss_mlp": 1.01704049, + "epoch": 0.2825792875394559, + "flos": 21032237316960.0, + "grad_norm": 3.0089104168537326, + "language_loss": 0.79780841, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.8195492, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.16027832, + "step": 4700, + "time_per_iteration": 2.6324148178100586 + }, + { + "auxiliary_loss_clip": 0.0113922, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.05193031, + "balance_loss_mlp": 1.02926707, + "epoch": 0.28263941079212385, + "flos": 32609128284000.0, + "grad_norm": 1.7121311522862586, + "language_loss": 0.81644422, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.8382619, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.13269043, + "step": 4701, + "time_per_iteration": 2.7244060039520264 + }, + { + "auxiliary_loss_clip": 0.01139365, + "auxiliary_loss_mlp": 0.01037388, + "balance_loss_clip": 1.04807925, + "balance_loss_mlp": 1.02231979, + "epoch": 0.2826995340447918, + "flos": 31629681714240.0, + "grad_norm": 2.3733225855048485, + "language_loss": 0.72807491, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.74984241, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.15075684, + "step": 4702, + "time_per_iteration": 2.7105343341827393 + }, + { + "auxiliary_loss_clip": 0.01141118, + "auxiliary_loss_mlp": 0.0103586, + "balance_loss_clip": 1.05084634, + "balance_loss_mlp": 1.02105403, + "epoch": 0.2827596572974598, + "flos": 27308157376320.0, + "grad_norm": 1.8440463677558927, + "language_loss": 0.78011829, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.80188811, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.14794922, + "step": 4703, + "time_per_iteration": 2.7160301208496094 + }, + { + "auxiliary_loss_clip": 0.01139625, + "auxiliary_loss_mlp": 0.01041929, + "balance_loss_clip": 1.04778492, + "balance_loss_mlp": 1.02706349, + "epoch": 0.28281978055012774, + "flos": 40935254731200.0, + "grad_norm": 1.7954651890228055, + "language_loss": 0.69119823, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71301377, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 0.91894531, + "router_z_loss_mlp": 0.14862061, + "step": 4704, + "time_per_iteration": 2.777667760848999 + }, + { + "auxiliary_loss_clip": 0.01142451, + "auxiliary_loss_mlp": 0.01039922, + "balance_loss_clip": 1.05183101, + "balance_loss_mlp": 1.02421021, + "epoch": 0.2828799038027957, + "flos": 28602783931680.0, + "grad_norm": 2.0703948237114824, + "language_loss": 0.70505679, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.72688055, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.15722656, + "step": 4705, + "time_per_iteration": 2.7338967323303223 + }, + { + "auxiliary_loss_clip": 0.01059131, + "auxiliary_loss_mlp": 0.01008369, + "balance_loss_clip": 1.0285809, + "balance_loss_mlp": 1.00637639, + "epoch": 0.2829400270554637, + "flos": 85261566271680.0, + "grad_norm": 0.7230017699393734, + "language_loss": 0.59303313, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61370814, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 0.30566406, + "router_z_loss_mlp": 0.01991272, + "step": 4706, + "time_per_iteration": 3.326369524002075 + }, + { + "auxiliary_loss_clip": 0.01138404, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_clip": 1.05000019, + "balance_loss_mlp": 1.02437878, + "epoch": 0.2830001503081317, + "flos": 30248077259520.0, + "grad_norm": 1.435954742415559, + "language_loss": 0.8179639, + "learning_rate": 3.365279531475407e-06, + "loss": 0.83972442, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.13269043, + "step": 4707, + "time_per_iteration": 2.746151924133301 + }, + { + "auxiliary_loss_clip": 0.01143051, + "auxiliary_loss_mlp": 0.01038575, + "balance_loss_clip": 1.04933739, + "balance_loss_mlp": 1.02295864, + "epoch": 0.28306027356079966, + "flos": 33763097894400.0, + "grad_norm": 1.575191513476712, + "language_loss": 0.80515832, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82697451, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 0.93847656, + "router_z_loss_mlp": 0.15612793, + "step": 4708, + "time_per_iteration": 2.8436877727508545 + }, + { + "auxiliary_loss_clip": 0.01057008, + "auxiliary_loss_mlp": 0.01003945, + "balance_loss_clip": 1.0266552, + "balance_loss_mlp": 1.00183845, + "epoch": 0.2831203968134676, + "flos": 73857658688640.0, + "grad_norm": 0.8811710882497946, + "language_loss": 0.62772804, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.6483376, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 0.3034668, + "router_z_loss_mlp": 0.02108765, + "step": 4709, + "time_per_iteration": 3.126065731048584 + }, + { + "auxiliary_loss_clip": 0.01139016, + "auxiliary_loss_mlp": 0.01036511, + "balance_loss_clip": 1.04917634, + "balance_loss_mlp": 1.02190793, + "epoch": 0.2831805200661356, + "flos": 26911347772320.0, + "grad_norm": 1.5178136429525146, + "language_loss": 0.74063027, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76238561, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.14599609, + "step": 4710, + "time_per_iteration": 2.7437744140625 + }, + { + "auxiliary_loss_clip": 0.01141538, + "auxiliary_loss_mlp": 0.01047547, + "balance_loss_clip": 1.04903924, + "balance_loss_mlp": 1.03120375, + "epoch": 0.28324064331880355, + "flos": 27577599703200.0, + "grad_norm": 1.7719794471835832, + "language_loss": 0.79403508, + "learning_rate": 3.364140713048579e-06, + "loss": 0.81592596, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.16333008, + "step": 4711, + "time_per_iteration": 2.7934422492980957 + }, + { + "auxiliary_loss_clip": 0.01144195, + "auxiliary_loss_mlp": 0.01038872, + "balance_loss_clip": 1.05059552, + "balance_loss_mlp": 1.02259994, + "epoch": 0.2833007665714715, + "flos": 37100394623520.0, + "grad_norm": 1.9396764986005293, + "language_loss": 0.71291661, + "learning_rate": 3.363855879093996e-06, + "loss": 0.73474729, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.16278076, + "step": 4712, + "time_per_iteration": 4.331730842590332 + }, + { + "auxiliary_loss_clip": 0.01142699, + "auxiliary_loss_mlp": 0.01048216, + "balance_loss_clip": 1.04914832, + "balance_loss_mlp": 1.03206301, + "epoch": 0.2833608898241395, + "flos": 28735418455200.0, + "grad_norm": 3.5740694305962966, + "language_loss": 0.81710005, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.83900917, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 0.93701172, + "router_z_loss_mlp": 0.16149902, + "step": 4713, + "time_per_iteration": 2.7417664527893066 + }, + { + "auxiliary_loss_clip": 0.01140076, + "auxiliary_loss_mlp": 0.01035555, + "balance_loss_clip": 1.04918718, + "balance_loss_mlp": 1.02046275, + "epoch": 0.28342101307680745, + "flos": 24729438758400.0, + "grad_norm": 1.8393817115479472, + "language_loss": 0.75130117, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77305752, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.1506958, + "step": 4714, + "time_per_iteration": 2.6972930431365967 + }, + { + "auxiliary_loss_clip": 0.01138361, + "auxiliary_loss_mlp": 0.01042834, + "balance_loss_clip": 1.0466795, + "balance_loss_mlp": 1.02749205, + "epoch": 0.2834811363294754, + "flos": 37640413761120.0, + "grad_norm": 1.4724384133567245, + "language_loss": 0.78104514, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.8028571, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.15344238, + "step": 4715, + "time_per_iteration": 4.172213554382324 + }, + { + "auxiliary_loss_clip": 0.01139196, + "auxiliary_loss_mlp": 0.01039313, + "balance_loss_clip": 1.04778588, + "balance_loss_mlp": 1.02345276, + "epoch": 0.2835412595821434, + "flos": 27712179056160.0, + "grad_norm": 1.811115941884924, + "language_loss": 0.73825014, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.76003522, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.15863037, + "step": 4716, + "time_per_iteration": 2.7523319721221924 + }, + { + "auxiliary_loss_clip": 0.01145006, + "auxiliary_loss_mlp": 0.01040456, + "balance_loss_clip": 1.04725683, + "balance_loss_mlp": 1.02402854, + "epoch": 0.28360138283481134, + "flos": 22058231891040.0, + "grad_norm": 2.7540788150343953, + "language_loss": 0.74114168, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.76299626, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 0.9765625, + "router_z_loss_mlp": 0.16430664, + "step": 4717, + "time_per_iteration": 4.302496910095215 + }, + { + "auxiliary_loss_clip": 0.01142403, + "auxiliary_loss_mlp": 0.01043295, + "balance_loss_clip": 1.04859161, + "balance_loss_mlp": 1.02740395, + "epoch": 0.2836615060874793, + "flos": 21786561113760.0, + "grad_norm": 1.531091117749167, + "language_loss": 0.66874689, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.69060385, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.15881348, + "step": 4718, + "time_per_iteration": 2.6698780059814453 + }, + { + "auxiliary_loss_clip": 0.01142813, + "auxiliary_loss_mlp": 0.01041174, + "balance_loss_clip": 1.04733038, + "balance_loss_mlp": 1.02522373, + "epoch": 0.2837216293401473, + "flos": 31408651186560.0, + "grad_norm": 2.67578947728048, + "language_loss": 0.72337615, + "learning_rate": 3.361860593925566e-06, + "loss": 0.74521601, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 0.95556641, + "router_z_loss_mlp": 0.15948486, + "step": 4719, + "time_per_iteration": 2.740199565887451 + }, + { + "auxiliary_loss_clip": 0.01140338, + "auxiliary_loss_mlp": 0.01037223, + "balance_loss_clip": 1.04996932, + "balance_loss_mlp": 1.02235758, + "epoch": 0.2837817525928153, + "flos": 25530918318720.0, + "grad_norm": 1.6801693793357775, + "language_loss": 0.80560601, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.82738161, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.14880371, + "step": 4720, + "time_per_iteration": 2.681410551071167 + }, + { + "auxiliary_loss_clip": 0.0114357, + "auxiliary_loss_mlp": 0.01038767, + "balance_loss_clip": 1.05037451, + "balance_loss_mlp": 1.02304327, + "epoch": 0.28384187584548326, + "flos": 23082038532000.0, + "grad_norm": 4.598255882220353, + "language_loss": 0.79053521, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.81235856, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.15722656, + "step": 4721, + "time_per_iteration": 2.7064313888549805 + }, + { + "auxiliary_loss_clip": 0.0113933, + "auxiliary_loss_mlp": 0.01032637, + "balance_loss_clip": 1.04876637, + "balance_loss_mlp": 1.01754475, + "epoch": 0.2839019990981512, + "flos": 33365721048480.0, + "grad_norm": 1.9896844764751085, + "language_loss": 0.8286013, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.85032099, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.15087891, + "step": 4722, + "time_per_iteration": 2.7135210037231445 + }, + { + "auxiliary_loss_clip": 0.01143544, + "auxiliary_loss_mlp": 0.01034204, + "balance_loss_clip": 1.0516386, + "balance_loss_mlp": 1.01977968, + "epoch": 0.2839621223508192, + "flos": 22102875583200.0, + "grad_norm": 1.9567168576472058, + "language_loss": 0.70257181, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72434926, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.14422607, + "step": 4723, + "time_per_iteration": 2.7049736976623535 + }, + { + "auxiliary_loss_clip": 0.01141165, + "auxiliary_loss_mlp": 0.01036208, + "balance_loss_clip": 1.04995227, + "balance_loss_mlp": 1.02017498, + "epoch": 0.28402224560348716, + "flos": 32163056017920.0, + "grad_norm": 1.4260579128448698, + "language_loss": 0.78600901, + "learning_rate": 3.360433840760998e-06, + "loss": 0.80778271, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.16040039, + "step": 4724, + "time_per_iteration": 2.7881991863250732 + }, + { + "auxiliary_loss_clip": 0.0114422, + "auxiliary_loss_mlp": 0.01043221, + "balance_loss_clip": 1.05112183, + "balance_loss_mlp": 1.02786708, + "epoch": 0.2840823688561551, + "flos": 29359295661600.0, + "grad_norm": 1.6538720127962816, + "language_loss": 0.92265511, + "learning_rate": 3.36014833532143e-06, + "loss": 0.94452953, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 0.93164062, + "router_z_loss_mlp": 0.15332031, + "step": 4725, + "time_per_iteration": 2.6895837783813477 + }, + { + "auxiliary_loss_clip": 0.01145063, + "auxiliary_loss_mlp": 0.01041214, + "balance_loss_clip": 1.05118036, + "balance_loss_mlp": 1.02525234, + "epoch": 0.2841424921088231, + "flos": 35947681048800.0, + "grad_norm": 1.6203313486457358, + "language_loss": 0.88537943, + "learning_rate": 3.3598627783049e-06, + "loss": 0.90724224, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.15966797, + "step": 4726, + "time_per_iteration": 2.791382074356079 + }, + { + "auxiliary_loss_clip": 0.01144361, + "auxiliary_loss_mlp": 0.0104031, + "balance_loss_clip": 1.0507865, + "balance_loss_mlp": 1.02492011, + "epoch": 0.28420261536149105, + "flos": 58694645610720.0, + "grad_norm": 2.1158072865333093, + "language_loss": 0.78822076, + "learning_rate": 3.359577169722238e-06, + "loss": 0.81006753, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.15380859, + "step": 4727, + "time_per_iteration": 2.9178271293640137 + }, + { + "auxiliary_loss_clip": 0.01138921, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.05011392, + "balance_loss_mlp": 1.01941776, + "epoch": 0.284262738614159, + "flos": 31318269835680.0, + "grad_norm": 2.4942021131484777, + "language_loss": 0.66910845, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.69082904, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.13708496, + "step": 4728, + "time_per_iteration": 2.7794065475463867 + }, + { + "auxiliary_loss_clip": 0.01140225, + "auxiliary_loss_mlp": 0.01045114, + "balance_loss_clip": 1.04865456, + "balance_loss_mlp": 1.02911592, + "epoch": 0.284322861866827, + "flos": 24061079928960.0, + "grad_norm": 1.8772801942862012, + "language_loss": 0.75692916, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.77878249, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.16003418, + "step": 4729, + "time_per_iteration": 2.6523680686950684 + }, + { + "auxiliary_loss_clip": 0.01146702, + "auxiliary_loss_mlp": 0.01045867, + "balance_loss_clip": 1.05248904, + "balance_loss_mlp": 1.03032184, + "epoch": 0.28438298511949495, + "flos": 29181652790400.0, + "grad_norm": 1.822059650919666, + "language_loss": 0.66366023, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.68558592, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 0.94287109, + "router_z_loss_mlp": 0.15551758, + "step": 4730, + "time_per_iteration": 2.746325969696045 + }, + { + "auxiliary_loss_clip": 0.01144826, + "auxiliary_loss_mlp": 0.01038748, + "balance_loss_clip": 1.05148649, + "balance_loss_mlp": 1.02279806, + "epoch": 0.2844431083721629, + "flos": 31807851310080.0, + "grad_norm": 1.770419114029783, + "language_loss": 0.74403369, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.76586944, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.1595459, + "step": 4731, + "time_per_iteration": 2.7252612113952637 + }, + { + "auxiliary_loss_clip": 0.01143588, + "auxiliary_loss_mlp": 0.01037928, + "balance_loss_clip": 1.05086243, + "balance_loss_mlp": 1.02266884, + "epoch": 0.2845032316248309, + "flos": 31494494602080.0, + "grad_norm": 1.732155054384875, + "language_loss": 0.83982265, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.86163783, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.15246582, + "step": 4732, + "time_per_iteration": 2.845064401626587 + }, + { + "auxiliary_loss_clip": 0.01144979, + "auxiliary_loss_mlp": 0.01044051, + "balance_loss_clip": 1.05181372, + "balance_loss_mlp": 1.02789831, + "epoch": 0.2845633548774989, + "flos": 24188730827040.0, + "grad_norm": 1.89897789428214, + "language_loss": 0.78658342, + "learning_rate": 3.357862435944109e-06, + "loss": 0.80847377, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 0.93066406, + "router_z_loss_mlp": 0.16131592, + "step": 4733, + "time_per_iteration": 2.724062919616699 + }, + { + "auxiliary_loss_clip": 0.01148494, + "auxiliary_loss_mlp": 0.01047788, + "balance_loss_clip": 1.05226028, + "balance_loss_mlp": 1.03193283, + "epoch": 0.28462347813016686, + "flos": 28288130670720.0, + "grad_norm": 4.553994993614913, + "language_loss": 0.70904875, + "learning_rate": 3.357576466701875e-06, + "loss": 0.73101151, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 0.96191406, + "router_z_loss_mlp": 0.1583252, + "step": 4734, + "time_per_iteration": 2.696117877960205 + }, + { + "auxiliary_loss_clip": 0.01138144, + "auxiliary_loss_mlp": 0.01032871, + "balance_loss_clip": 1.0470736, + "balance_loss_mlp": 1.0176599, + "epoch": 0.2846836013828348, + "flos": 22769451652320.0, + "grad_norm": 2.078338554940047, + "language_loss": 0.73886275, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.76057291, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.15209961, + "step": 4735, + "time_per_iteration": 2.806774854660034 + }, + { + "auxiliary_loss_clip": 0.01144187, + "auxiliary_loss_mlp": 0.01047271, + "balance_loss_clip": 1.05175412, + "balance_loss_mlp": 1.03248954, + "epoch": 0.2847437246355028, + "flos": 17294038738560.0, + "grad_norm": 1.7088838710095475, + "language_loss": 0.79805416, + "learning_rate": 3.357004373789946e-06, + "loss": 0.8199687, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.14770508, + "step": 4736, + "time_per_iteration": 2.652313232421875 + }, + { + "auxiliary_loss_clip": 0.01144541, + "auxiliary_loss_mlp": 0.01044604, + "balance_loss_clip": 1.05194175, + "balance_loss_mlp": 1.02895188, + "epoch": 0.28480384788817076, + "flos": 35726164313760.0, + "grad_norm": 2.612616493303382, + "language_loss": 0.60029435, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.62218583, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.15649414, + "step": 4737, + "time_per_iteration": 2.7413864135742188 + }, + { + "auxiliary_loss_clip": 0.01140142, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.05035019, + "balance_loss_mlp": 1.02091718, + "epoch": 0.2848639711408387, + "flos": 27578329014240.0, + "grad_norm": 1.7516841100336433, + "language_loss": 0.864205, + "learning_rate": 3.356432075047052e-06, + "loss": 0.885957, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.14117432, + "step": 4738, + "time_per_iteration": 2.6718456745147705 + }, + { + "auxiliary_loss_clip": 0.0114641, + "auxiliary_loss_mlp": 0.0104394, + "balance_loss_clip": 1.05162621, + "balance_loss_mlp": 1.02745342, + "epoch": 0.2849240943935067, + "flos": 21474987166080.0, + "grad_norm": 1.989718111589225, + "language_loss": 0.90050185, + "learning_rate": 3.356145848516118e-06, + "loss": 0.92240536, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 0.94677734, + "router_z_loss_mlp": 0.16491699, + "step": 4739, + "time_per_iteration": 2.664600133895874 + }, + { + "auxiliary_loss_clip": 0.01144602, + "auxiliary_loss_mlp": 0.01040963, + "balance_loss_clip": 1.05376983, + "balance_loss_mlp": 1.02544177, + "epoch": 0.28498421764617465, + "flos": 30337769816640.0, + "grad_norm": 1.451064046814246, + "language_loss": 0.72104585, + "learning_rate": 3.355859570559998e-06, + "loss": 0.7429015, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.15527344, + "step": 4740, + "time_per_iteration": 2.707597255706787 + }, + { + "auxiliary_loss_clip": 0.011415, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.05169296, + "balance_loss_mlp": 1.01989007, + "epoch": 0.2850443408988426, + "flos": 27799602645600.0, + "grad_norm": 1.8099085776413548, + "language_loss": 0.77859777, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.80035567, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 0.89794922, + "router_z_loss_mlp": 0.144104, + "step": 4741, + "time_per_iteration": 2.682072401046753 + }, + { + "auxiliary_loss_clip": 0.01145193, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_clip": 1.0496223, + "balance_loss_mlp": 1.02620578, + "epoch": 0.2851044641515106, + "flos": 22993358906880.0, + "grad_norm": 1.9710480025063963, + "language_loss": 0.76240182, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78427577, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.16003418, + "step": 4742, + "time_per_iteration": 2.6542258262634277 + }, + { + "auxiliary_loss_clip": 0.01147505, + "auxiliary_loss_mlp": 0.01047182, + "balance_loss_clip": 1.0519371, + "balance_loss_mlp": 1.03002763, + "epoch": 0.28516458740417855, + "flos": 23037840529920.0, + "grad_norm": 2.0584078009676414, + "language_loss": 0.57173055, + "learning_rate": 3.355000428249086e-06, + "loss": 0.5936774, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 0.95507812, + "router_z_loss_mlp": 0.17150879, + "step": 4743, + "time_per_iteration": 2.7124288082122803 + }, + { + "auxiliary_loss_clip": 0.01150883, + "auxiliary_loss_mlp": 0.01051124, + "balance_loss_clip": 1.05598092, + "balance_loss_mlp": 1.03481615, + "epoch": 0.2852247106568465, + "flos": 30871387224000.0, + "grad_norm": 2.280977711224192, + "language_loss": 0.742612, + "learning_rate": 3.354713944700797e-06, + "loss": 0.76463199, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 0.95068359, + "router_z_loss_mlp": 0.1628418, + "step": 4744, + "time_per_iteration": 2.702563524246216 + }, + { + "auxiliary_loss_clip": 0.01144625, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_clip": 1.0533607, + "balance_loss_mlp": 1.02773333, + "epoch": 0.2852848339095145, + "flos": 14221281745440.0, + "grad_norm": 2.6313749111689217, + "language_loss": 0.7729069, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.79477978, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.14941406, + "step": 4745, + "time_per_iteration": 2.6375575065612793 + }, + { + "auxiliary_loss_clip": 0.01136822, + "auxiliary_loss_mlp": 0.01037768, + "balance_loss_clip": 1.05100513, + "balance_loss_mlp": 1.02387381, + "epoch": 0.2853449571621825, + "flos": 15780529071360.0, + "grad_norm": 2.119907985571855, + "language_loss": 0.8257072, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.84745306, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.13909912, + "step": 4746, + "time_per_iteration": 2.6905910968780518 + }, + { + "auxiliary_loss_clip": 0.01148208, + "auxiliary_loss_mlp": 0.01036046, + "balance_loss_clip": 1.05185938, + "balance_loss_mlp": 1.02039361, + "epoch": 0.28540508041485046, + "flos": 24416932913280.0, + "grad_norm": 1.876794706526769, + "language_loss": 0.79492319, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.81676567, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.15649414, + "step": 4747, + "time_per_iteration": 2.662919521331787 + }, + { + "auxiliary_loss_clip": 0.01056519, + "auxiliary_loss_mlp": 0.01005546, + "balance_loss_clip": 1.0263927, + "balance_loss_mlp": 1.00366879, + "epoch": 0.28546520366751843, + "flos": 83143627692480.0, + "grad_norm": 0.7749497808058958, + "language_loss": 0.60530072, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62592137, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 0.01873779, + "step": 4748, + "time_per_iteration": 3.281161069869995 + }, + { + "auxiliary_loss_clip": 0.01142873, + "auxiliary_loss_mlp": 0.01039115, + "balance_loss_clip": 1.05041361, + "balance_loss_mlp": 1.02428508, + "epoch": 0.2855253269201864, + "flos": 16169883495840.0, + "grad_norm": 2.1223322623112186, + "language_loss": 0.80610967, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82792962, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.14831543, + "step": 4749, + "time_per_iteration": 2.7062366008758545 + }, + { + "auxiliary_loss_clip": 0.01144079, + "auxiliary_loss_mlp": 0.01040028, + "balance_loss_clip": 1.0521208, + "balance_loss_mlp": 1.02536559, + "epoch": 0.28558545017285436, + "flos": 34923914925120.0, + "grad_norm": 1.825105461315808, + "language_loss": 0.70511687, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72695792, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.14654541, + "step": 4750, + "time_per_iteration": 2.8301889896392822 + }, + { + "auxiliary_loss_clip": 0.01140152, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.05157757, + "balance_loss_mlp": 1.02254844, + "epoch": 0.2856455734255223, + "flos": 41647122768960.0, + "grad_norm": 1.4718398737751246, + "language_loss": 0.82131135, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.843086, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.14770508, + "step": 4751, + "time_per_iteration": 2.7670443058013916 + }, + { + "auxiliary_loss_clip": 0.01141569, + "auxiliary_loss_mlp": 0.01040233, + "balance_loss_clip": 1.05118799, + "balance_loss_mlp": 1.02580285, + "epoch": 0.2857056966781903, + "flos": 48548581243200.0, + "grad_norm": 2.754530466396804, + "language_loss": 0.80284935, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82466733, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 0.90380859, + "router_z_loss_mlp": 0.14428711, + "step": 4752, + "time_per_iteration": 4.274737358093262 + }, + { + "auxiliary_loss_clip": 0.01143529, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.05141664, + "balance_loss_mlp": 1.02030575, + "epoch": 0.28576581993085826, + "flos": 26688656036160.0, + "grad_norm": 2.0921556140093562, + "language_loss": 0.78708011, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.80887127, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.15283203, + "step": 4753, + "time_per_iteration": 2.6610162258148193 + }, + { + "auxiliary_loss_clip": 0.01146056, + "auxiliary_loss_mlp": 0.010396, + "balance_loss_clip": 1.05158281, + "balance_loss_mlp": 1.02308989, + "epoch": 0.2858259431835262, + "flos": 23296221639360.0, + "grad_norm": 2.2963588012377216, + "language_loss": 0.89647865, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.91833526, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 0.94433594, + "router_z_loss_mlp": 0.16516113, + "step": 4754, + "time_per_iteration": 2.6573734283447266 + }, + { + "auxiliary_loss_clip": 0.01140062, + "auxiliary_loss_mlp": 0.01042476, + "balance_loss_clip": 1.05099916, + "balance_loss_mlp": 1.02830839, + "epoch": 0.2858860664361942, + "flos": 24812405447040.0, + "grad_norm": 2.9174781407307333, + "language_loss": 0.8258549, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84768021, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 0.89111328, + "router_z_loss_mlp": 0.14160156, + "step": 4755, + "time_per_iteration": 4.206706285476685 + }, + { + "auxiliary_loss_clip": 0.01142707, + "auxiliary_loss_mlp": 0.01040875, + "balance_loss_clip": 1.04981065, + "balance_loss_mlp": 1.02609313, + "epoch": 0.28594618968886215, + "flos": 29848593515040.0, + "grad_norm": 1.5973162105512895, + "language_loss": 0.83975732, + "learning_rate": 3.351272138300922e-06, + "loss": 0.86159313, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.14782715, + "step": 4756, + "time_per_iteration": 4.2727625370025635 + }, + { + "auxiliary_loss_clip": 0.01057874, + "auxiliary_loss_mlp": 0.0100405, + "balance_loss_clip": 1.02759862, + "balance_loss_mlp": 1.00213075, + "epoch": 0.2860063129415301, + "flos": 87430070416320.0, + "grad_norm": 0.8653334815533843, + "language_loss": 0.61050379, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63112307, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.30224609, + "router_z_loss_mlp": 0.01916504, + "step": 4757, + "time_per_iteration": 3.4189107418060303 + }, + { + "auxiliary_loss_clip": 0.01145623, + "auxiliary_loss_mlp": 0.01033954, + "balance_loss_clip": 1.05426443, + "balance_loss_mlp": 1.01997721, + "epoch": 0.2860664361941981, + "flos": 25085696915520.0, + "grad_norm": 2.0670731779382288, + "language_loss": 0.66435349, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.6861493, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 0.91357422, + "router_z_loss_mlp": 0.13983154, + "step": 4758, + "time_per_iteration": 2.6698989868164062 + }, + { + "auxiliary_loss_clip": 0.01141278, + "auxiliary_loss_mlp": 0.01035883, + "balance_loss_clip": 1.04892516, + "balance_loss_mlp": 1.02112544, + "epoch": 0.2861265594468661, + "flos": 43923586413600.0, + "grad_norm": 6.263824474642399, + "language_loss": 0.62671971, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.64849132, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 0.92480469, + "router_z_loss_mlp": 0.1473999, + "step": 4759, + "time_per_iteration": 2.8263564109802246 + }, + { + "auxiliary_loss_clip": 0.01141652, + "auxiliary_loss_mlp": 0.01043656, + "balance_loss_clip": 1.05048895, + "balance_loss_mlp": 1.0279088, + "epoch": 0.28618668269953407, + "flos": 24461860226400.0, + "grad_norm": 2.2148718250263064, + "language_loss": 0.7447831, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.76663613, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.15734863, + "step": 4760, + "time_per_iteration": 2.673870801925659 + }, + { + "auxiliary_loss_clip": 0.01139043, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.05129278, + "balance_loss_mlp": 1.02089334, + "epoch": 0.28624680595220203, + "flos": 30471133651200.0, + "grad_norm": 1.8871632108122252, + "language_loss": 0.72569346, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.74743181, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.13909912, + "step": 4761, + "time_per_iteration": 2.8726532459259033 + }, + { + "auxiliary_loss_clip": 0.01141991, + "auxiliary_loss_mlp": 0.0104088, + "balance_loss_clip": 1.04921103, + "balance_loss_mlp": 1.02607977, + "epoch": 0.28630692920487, + "flos": 27445208283360.0, + "grad_norm": 3.219406685115693, + "language_loss": 0.74339509, + "learning_rate": 3.349548466945793e-06, + "loss": 0.76522386, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.14801025, + "step": 4762, + "time_per_iteration": 2.7342984676361084 + }, + { + "auxiliary_loss_clip": 0.01141935, + "auxiliary_loss_mlp": 0.0103892, + "balance_loss_clip": 1.05260956, + "balance_loss_mlp": 1.0242337, + "epoch": 0.28636705245753796, + "flos": 25929348613920.0, + "grad_norm": 1.4171789710398128, + "language_loss": 0.7609235, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78273207, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.14672852, + "step": 4763, + "time_per_iteration": 2.672441244125366 + }, + { + "auxiliary_loss_clip": 0.01140073, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.04832387, + "balance_loss_mlp": 1.02149057, + "epoch": 0.28642717571020593, + "flos": 29402116076160.0, + "grad_norm": 1.6514916580497092, + "language_loss": 0.76824063, + "learning_rate": 3.348973500311086e-06, + "loss": 0.79001099, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 0.91748047, + "router_z_loss_mlp": 0.15478516, + "step": 4764, + "time_per_iteration": 2.7217228412628174 + }, + { + "auxiliary_loss_clip": 0.01143246, + "auxiliary_loss_mlp": 0.01041235, + "balance_loss_clip": 1.05088031, + "balance_loss_mlp": 1.02477193, + "epoch": 0.2864872989628739, + "flos": 27578410048800.0, + "grad_norm": 1.9192151690112924, + "language_loss": 0.71135944, + "learning_rate": 3.348685940258466e-06, + "loss": 0.73320425, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 0.92333984, + "router_z_loss_mlp": 0.16467285, + "step": 4765, + "time_per_iteration": 2.665855884552002 + }, + { + "auxiliary_loss_clip": 0.0113724, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.04860377, + "balance_loss_mlp": 1.02020013, + "epoch": 0.28654742221554186, + "flos": 39956456437920.0, + "grad_norm": 1.5254575085020223, + "language_loss": 0.75800455, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.77972472, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.14562988, + "step": 4766, + "time_per_iteration": 2.925701379776001 + }, + { + "auxiliary_loss_clip": 0.01139541, + "auxiliary_loss_mlp": 0.01032368, + "balance_loss_clip": 1.04999685, + "balance_loss_mlp": 1.01811671, + "epoch": 0.2866075454682098, + "flos": 32923335854880.0, + "grad_norm": 1.5392840022477974, + "language_loss": 0.77644008, + "learning_rate": 3.348110666737214e-06, + "loss": 0.79815912, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.14251709, + "step": 4767, + "time_per_iteration": 2.6981422901153564 + }, + { + "auxiliary_loss_clip": 0.01140747, + "auxiliary_loss_mlp": 0.01044073, + "balance_loss_clip": 1.05025077, + "balance_loss_mlp": 1.02894497, + "epoch": 0.2866676687208778, + "flos": 28374987018240.0, + "grad_norm": 3.1121548160386627, + "language_loss": 0.64973938, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67158759, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.15112305, + "step": 4768, + "time_per_iteration": 2.7100448608398438 + }, + { + "auxiliary_loss_clip": 0.0114324, + "auxiliary_loss_mlp": 0.01036856, + "balance_loss_clip": 1.0499661, + "balance_loss_mlp": 1.02184725, + "epoch": 0.28672779197354575, + "flos": 26331709085280.0, + "grad_norm": 1.8259288877150892, + "language_loss": 0.70564044, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.72744143, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.15002441, + "step": 4769, + "time_per_iteration": 2.646933078765869 + }, + { + "auxiliary_loss_clip": 0.01140772, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.04945445, + "balance_loss_mlp": 1.01571345, + "epoch": 0.2867879152262137, + "flos": 24239897801280.0, + "grad_norm": 1.7000883097330293, + "language_loss": 0.74639183, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.76809651, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.13989258, + "step": 4770, + "time_per_iteration": 2.7100892066955566 + }, + { + "auxiliary_loss_clip": 0.01142936, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.05042624, + "balance_loss_mlp": 1.01952004, + "epoch": 0.2868480384788817, + "flos": 34427161892160.0, + "grad_norm": 3.8415092364499164, + "language_loss": 0.67226768, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.6940397, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.1472168, + "step": 4771, + "time_per_iteration": 2.7169699668884277 + }, + { + "auxiliary_loss_clip": 0.0105765, + "auxiliary_loss_mlp": 0.01002384, + "balance_loss_clip": 1.02710807, + "balance_loss_mlp": 1.00043964, + "epoch": 0.2869081617315497, + "flos": 79831891016640.0, + "grad_norm": 1.7148943946248851, + "language_loss": 0.56859756, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.58919787, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.30541992, + "router_z_loss_mlp": 0.01942444, + "step": 4772, + "time_per_iteration": 3.233433246612549 + }, + { + "auxiliary_loss_clip": 0.01144411, + "auxiliary_loss_mlp": 0.01034804, + "balance_loss_clip": 1.05130649, + "balance_loss_mlp": 1.02008772, + "epoch": 0.28696828498421767, + "flos": 22769735273280.0, + "grad_norm": 2.2465265989486176, + "language_loss": 0.83201361, + "learning_rate": 3.346383619630856e-06, + "loss": 0.85380578, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.14727783, + "step": 4773, + "time_per_iteration": 2.668696165084839 + }, + { + "auxiliary_loss_clip": 0.01140875, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.04734993, + "balance_loss_mlp": 1.01907015, + "epoch": 0.28702840823688563, + "flos": 28869876256320.0, + "grad_norm": 2.351037293527731, + "language_loss": 0.77553678, + "learning_rate": 3.34609559969027e-06, + "loss": 0.79729164, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 0.93652344, + "router_z_loss_mlp": 0.15539551, + "step": 4774, + "time_per_iteration": 2.7097930908203125 + }, + { + "auxiliary_loss_clip": 0.01140553, + "auxiliary_loss_mlp": 0.01033114, + "balance_loss_clip": 1.0492754, + "balance_loss_mlp": 1.01761103, + "epoch": 0.2870885314895536, + "flos": 16844400951840.0, + "grad_norm": 2.658431047685511, + "language_loss": 0.73282909, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75456572, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.15509033, + "step": 4775, + "time_per_iteration": 2.7292606830596924 + }, + { + "auxiliary_loss_clip": 0.01145541, + "auxiliary_loss_mlp": 0.01037124, + "balance_loss_clip": 1.05161822, + "balance_loss_mlp": 1.02181756, + "epoch": 0.28714865474222157, + "flos": 21703108217760.0, + "grad_norm": 1.6366050352863735, + "language_loss": 0.87951756, + "learning_rate": 3.34551940668778e-06, + "loss": 0.90134418, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 0.94042969, + "router_z_loss_mlp": 0.15307617, + "step": 4776, + "time_per_iteration": 2.704833984375 + }, + { + "auxiliary_loss_clip": 0.01142333, + "auxiliary_loss_mlp": 0.01034851, + "balance_loss_clip": 1.05166674, + "balance_loss_mlp": 1.02060533, + "epoch": 0.28720877799488953, + "flos": 19519821616320.0, + "grad_norm": 1.815709506783215, + "language_loss": 0.74149477, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76326656, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.14245605, + "step": 4777, + "time_per_iteration": 2.690213680267334 + }, + { + "auxiliary_loss_clip": 0.01148117, + "auxiliary_loss_mlp": 0.01039693, + "balance_loss_clip": 1.05291021, + "balance_loss_mlp": 1.02437437, + "epoch": 0.2872689012475575, + "flos": 25530472628640.0, + "grad_norm": 3.4773036646721867, + "language_loss": 0.800331, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82220906, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 0.95117188, + "router_z_loss_mlp": 0.15307617, + "step": 4778, + "time_per_iteration": 2.750681161880493 + }, + { + "auxiliary_loss_clip": 0.01141235, + "auxiliary_loss_mlp": 0.01040236, + "balance_loss_clip": 1.05151033, + "balance_loss_mlp": 1.02554905, + "epoch": 0.28732902450022546, + "flos": 26020945483200.0, + "grad_norm": 1.6807687329470686, + "language_loss": 0.73979712, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.76161182, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.14691162, + "step": 4779, + "time_per_iteration": 2.7326772212982178 + }, + { + "auxiliary_loss_clip": 0.01144173, + "auxiliary_loss_mlp": 0.01039364, + "balance_loss_clip": 1.05126619, + "balance_loss_mlp": 1.02353334, + "epoch": 0.2873891477528934, + "flos": 25441671451680.0, + "grad_norm": 2.115947147278578, + "language_loss": 0.76162529, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78346062, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.1583252, + "step": 4780, + "time_per_iteration": 2.65889048576355 + }, + { + "auxiliary_loss_clip": 0.01137633, + "auxiliary_loss_mlp": 0.01036182, + "balance_loss_clip": 1.04905939, + "balance_loss_mlp": 1.02206182, + "epoch": 0.2874492710055614, + "flos": 21255779916000.0, + "grad_norm": 3.074216174240593, + "language_loss": 0.81100523, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83274341, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.14123535, + "step": 4781, + "time_per_iteration": 2.6625471115112305 + }, + { + "auxiliary_loss_clip": 0.01145786, + "auxiliary_loss_mlp": 0.0104381, + "balance_loss_clip": 1.05106616, + "balance_loss_mlp": 1.02710295, + "epoch": 0.28750939425822936, + "flos": 16358506549920.0, + "grad_norm": 2.1274511016629583, + "language_loss": 0.8637737, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.88566965, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 0.94677734, + "router_z_loss_mlp": 0.16705322, + "step": 4782, + "time_per_iteration": 2.6417174339294434 + }, + { + "auxiliary_loss_clip": 0.01144073, + "auxiliary_loss_mlp": 0.01038416, + "balance_loss_clip": 1.05325162, + "balance_loss_mlp": 1.02343082, + "epoch": 0.2875695175108973, + "flos": 26686103447520.0, + "grad_norm": 1.4596905460574385, + "language_loss": 0.71635288, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73817778, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.14984131, + "step": 4783, + "time_per_iteration": 2.6582858562469482 + }, + { + "auxiliary_loss_clip": 0.01143674, + "auxiliary_loss_mlp": 0.01037097, + "balance_loss_clip": 1.05277205, + "balance_loss_mlp": 1.02210045, + "epoch": 0.2876296407635653, + "flos": 32025599938080.0, + "grad_norm": 1.9055260775734182, + "language_loss": 0.76893002, + "learning_rate": 3.343212594663047e-06, + "loss": 0.79073775, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.15014648, + "step": 4784, + "time_per_iteration": 2.6922504901885986 + }, + { + "auxiliary_loss_clip": 0.01140684, + "auxiliary_loss_mlp": 0.01036188, + "balance_loss_clip": 1.05201578, + "balance_loss_mlp": 1.02162075, + "epoch": 0.28768976401623325, + "flos": 30961403919360.0, + "grad_norm": 1.985498096754129, + "language_loss": 0.75571972, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.77748847, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 0.88720703, + "router_z_loss_mlp": 0.14569092, + "step": 4785, + "time_per_iteration": 2.7071337699890137 + }, + { + "auxiliary_loss_clip": 0.01142447, + "auxiliary_loss_mlp": 0.01043349, + "balance_loss_clip": 1.05095065, + "balance_loss_mlp": 1.02825689, + "epoch": 0.28774988726890127, + "flos": 37417519438560.0, + "grad_norm": 2.034396892902109, + "language_loss": 0.83227944, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85413742, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.15112305, + "step": 4786, + "time_per_iteration": 2.737926483154297 + }, + { + "auxiliary_loss_clip": 0.01141429, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.05110002, + "balance_loss_mlp": 1.02142119, + "epoch": 0.28781001052156924, + "flos": 25130543194080.0, + "grad_norm": 2.9187461584123557, + "language_loss": 0.79957652, + "learning_rate": 3.342346699429516e-06, + "loss": 0.82134247, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.13739014, + "step": 4787, + "time_per_iteration": 2.6406092643737793 + }, + { + "auxiliary_loss_clip": 0.01143963, + "auxiliary_loss_mlp": 0.0103731, + "balance_loss_clip": 1.05150115, + "balance_loss_mlp": 1.02219462, + "epoch": 0.2878701337742372, + "flos": 32387408962560.0, + "grad_norm": 1.9566967606601033, + "language_loss": 0.83320212, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.85501492, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 0.92480469, + "router_z_loss_mlp": 0.15124512, + "step": 4788, + "time_per_iteration": 2.748535394668579 + }, + { + "auxiliary_loss_clip": 0.01143926, + "auxiliary_loss_mlp": 0.01039005, + "balance_loss_clip": 1.05105758, + "balance_loss_mlp": 1.02362657, + "epoch": 0.28793025702690517, + "flos": 34345451239200.0, + "grad_norm": 1.9357286977860366, + "language_loss": 0.73344469, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.75527394, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.15386963, + "step": 4789, + "time_per_iteration": 2.7784016132354736 + }, + { + "auxiliary_loss_clip": 0.01134374, + "auxiliary_loss_mlp": 0.01038306, + "balance_loss_clip": 1.04630888, + "balance_loss_mlp": 1.02375078, + "epoch": 0.28799038027957313, + "flos": 29047721713920.0, + "grad_norm": 1.8044415500915139, + "language_loss": 0.83945203, + "learning_rate": 3.341480346078704e-06, + "loss": 0.86117887, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14550781, + "step": 4790, + "time_per_iteration": 2.7269954681396484 + }, + { + "auxiliary_loss_clip": 0.01144594, + "auxiliary_loss_mlp": 0.01040245, + "balance_loss_clip": 1.05238104, + "balance_loss_mlp": 1.02519464, + "epoch": 0.2880505035322411, + "flos": 27264648168000.0, + "grad_norm": 2.841819609730789, + "language_loss": 0.77479309, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.79664147, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 0.92089844, + "router_z_loss_mlp": 0.15039062, + "step": 4791, + "time_per_iteration": 5.468498945236206 + }, + { + "auxiliary_loss_clip": 0.01146026, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.05082369, + "balance_loss_mlp": 1.01920843, + "epoch": 0.28811062678490906, + "flos": 21968620368480.0, + "grad_norm": 1.8796100860769467, + "language_loss": 0.70405167, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.72585046, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.14660645, + "step": 4792, + "time_per_iteration": 2.697481870651245 + }, + { + "auxiliary_loss_clip": 0.01143075, + "auxiliary_loss_mlp": 0.0103517, + "balance_loss_clip": 1.05127203, + "balance_loss_mlp": 1.02036977, + "epoch": 0.28817075003757703, + "flos": 26955424222560.0, + "grad_norm": 1.7261779979512226, + "language_loss": 0.7916187, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81340116, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.14801025, + "step": 4793, + "time_per_iteration": 2.6764073371887207 + }, + { + "auxiliary_loss_clip": 0.01138531, + "auxiliary_loss_mlp": 0.01038157, + "balance_loss_clip": 1.05041075, + "balance_loss_mlp": 1.02364898, + "epoch": 0.288230873290245, + "flos": 50863448918880.0, + "grad_norm": 1.726394380665696, + "language_loss": 0.78197157, + "learning_rate": 3.340324496161797e-06, + "loss": 0.80373847, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.14501953, + "step": 4794, + "time_per_iteration": 4.272939920425415 + }, + { + "auxiliary_loss_clip": 0.01142799, + "auxiliary_loss_mlp": 0.01042365, + "balance_loss_clip": 1.05096817, + "balance_loss_mlp": 1.02659416, + "epoch": 0.28829099654291296, + "flos": 22725415719360.0, + "grad_norm": 2.135764722340042, + "language_loss": 0.82779777, + "learning_rate": 3.340035406592074e-06, + "loss": 0.84964943, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.15771484, + "step": 4795, + "time_per_iteration": 2.6204068660736084 + }, + { + "auxiliary_loss_clip": 0.01137786, + "auxiliary_loss_mlp": 0.01041016, + "balance_loss_clip": 1.05026984, + "balance_loss_mlp": 1.02659154, + "epoch": 0.2883511197955809, + "flos": 30109162557600.0, + "grad_norm": 2.18717813762478, + "language_loss": 0.74323452, + "learning_rate": 3.339746266208074e-06, + "loss": 0.76502252, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.14440918, + "step": 4796, + "time_per_iteration": 4.110283613204956 + }, + { + "auxiliary_loss_clip": 0.01148342, + "auxiliary_loss_mlp": 0.01040071, + "balance_loss_clip": 1.05273211, + "balance_loss_mlp": 1.02363205, + "epoch": 0.2884112430482489, + "flos": 28201679496000.0, + "grad_norm": 2.2592106346675935, + "language_loss": 0.72973663, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.75162077, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 0.95605469, + "router_z_loss_mlp": 0.16442871, + "step": 4797, + "time_per_iteration": 2.701122760772705 + }, + { + "auxiliary_loss_clip": 0.0114199, + "auxiliary_loss_mlp": 0.01034979, + "balance_loss_clip": 1.05142033, + "balance_loss_mlp": 1.01935029, + "epoch": 0.28847136630091685, + "flos": 20588596087680.0, + "grad_norm": 2.433034814525292, + "language_loss": 0.74518538, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.76695502, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.15612793, + "step": 4798, + "time_per_iteration": 2.646442413330078 + }, + { + "auxiliary_loss_clip": 0.01145574, + "auxiliary_loss_mlp": 0.01041896, + "balance_loss_clip": 1.05079591, + "balance_loss_mlp": 1.02455103, + "epoch": 0.2885314895535849, + "flos": 31308545688480.0, + "grad_norm": 26.020375492139497, + "language_loss": 0.65385616, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.67573082, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 0.94775391, + "router_z_loss_mlp": 0.17333984, + "step": 4799, + "time_per_iteration": 2.670475959777832 + }, + { + "auxiliary_loss_clip": 0.01145278, + "auxiliary_loss_mlp": 0.01044242, + "balance_loss_clip": 1.05199599, + "balance_loss_mlp": 1.02862573, + "epoch": 0.28859161280625284, + "flos": 25754987642400.0, + "grad_norm": 1.9953066411182332, + "language_loss": 0.82634133, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84823656, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 0.93261719, + "router_z_loss_mlp": 0.15588379, + "step": 4800, + "time_per_iteration": 2.663928270339966 + }, + { + "auxiliary_loss_clip": 0.01140283, + "auxiliary_loss_mlp": 0.01038978, + "balance_loss_clip": 1.05185056, + "balance_loss_mlp": 1.02398133, + "epoch": 0.2886517360589208, + "flos": 32298567268320.0, + "grad_norm": 6.518786158741437, + "language_loss": 0.90862346, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93041611, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.14990234, + "step": 4801, + "time_per_iteration": 2.7156853675842285 + }, + { + "auxiliary_loss_clip": 0.01144567, + "auxiliary_loss_mlp": 0.01041961, + "balance_loss_clip": 1.05332935, + "balance_loss_mlp": 1.02610672, + "epoch": 0.28871185931158877, + "flos": 30829053016800.0, + "grad_norm": 5.200672546350924, + "language_loss": 0.73671985, + "learning_rate": 3.33801035741839e-06, + "loss": 0.7585851, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.15856934, + "step": 4802, + "time_per_iteration": 2.6790201663970947 + }, + { + "auxiliary_loss_clip": 0.01059966, + "auxiliary_loss_mlp": 0.01005806, + "balance_loss_clip": 1.02977312, + "balance_loss_mlp": 1.00389767, + "epoch": 0.28877198256425674, + "flos": 81346373098560.0, + "grad_norm": 0.7777585890340353, + "language_loss": 0.62992799, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65058577, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.30175781, + "router_z_loss_mlp": 0.01905823, + "step": 4803, + "time_per_iteration": 3.2416093349456787 + }, + { + "auxiliary_loss_clip": 0.01140442, + "auxiliary_loss_mlp": 0.01045171, + "balance_loss_clip": 1.04956341, + "balance_loss_mlp": 1.03007936, + "epoch": 0.2888321058169247, + "flos": 24773758312320.0, + "grad_norm": 2.487319335972939, + "language_loss": 0.7050947, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.72695088, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.15081787, + "step": 4804, + "time_per_iteration": 2.6543233394622803 + }, + { + "auxiliary_loss_clip": 0.01145884, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.05110526, + "balance_loss_mlp": 1.02249837, + "epoch": 0.28889222906959267, + "flos": 31136251098240.0, + "grad_norm": 1.854947972798947, + "language_loss": 0.68382001, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70566523, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 0.94628906, + "router_z_loss_mlp": 0.16137695, + "step": 4805, + "time_per_iteration": 2.7000720500946045 + }, + { + "auxiliary_loss_clip": 0.01144658, + "auxiliary_loss_mlp": 0.01038744, + "balance_loss_clip": 1.0514245, + "balance_loss_mlp": 1.02313328, + "epoch": 0.28895235232226063, + "flos": 39866520777120.0, + "grad_norm": 1.5941516840249361, + "language_loss": 0.69447398, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.716308, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.15618896, + "step": 4806, + "time_per_iteration": 2.7702341079711914 + }, + { + "auxiliary_loss_clip": 0.0114169, + "auxiliary_loss_mlp": 0.01043972, + "balance_loss_clip": 1.0520997, + "balance_loss_mlp": 1.02897561, + "epoch": 0.2890124755749286, + "flos": 36258728271840.0, + "grad_norm": 1.5902619945975722, + "language_loss": 0.71480131, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.73665792, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.15002441, + "step": 4807, + "time_per_iteration": 2.7929575443267822 + }, + { + "auxiliary_loss_clip": 0.01144441, + "auxiliary_loss_mlp": 0.01042002, + "balance_loss_clip": 1.05291343, + "balance_loss_mlp": 1.02660024, + "epoch": 0.28907259882759656, + "flos": 27667575881280.0, + "grad_norm": 1.9761075477112966, + "language_loss": 0.81816357, + "learning_rate": 3.336272622079382e-06, + "loss": 0.84002793, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.15405273, + "step": 4808, + "time_per_iteration": 2.698974370956421 + }, + { + "auxiliary_loss_clip": 0.01141504, + "auxiliary_loss_mlp": 0.01042822, + "balance_loss_clip": 1.05361032, + "balance_loss_mlp": 1.02678835, + "epoch": 0.2891327220802645, + "flos": 27535184461440.0, + "grad_norm": 1.498113712168276, + "language_loss": 0.78271759, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.8045609, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.16040039, + "step": 4809, + "time_per_iteration": 2.6635146141052246 + }, + { + "auxiliary_loss_clip": 0.01149132, + "auxiliary_loss_mlp": 0.01042552, + "balance_loss_clip": 1.05210447, + "balance_loss_mlp": 1.02539754, + "epoch": 0.2891928453329325, + "flos": 26420874917760.0, + "grad_norm": 1.8175972883307845, + "language_loss": 0.79237241, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.81428921, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 0.97070312, + "router_z_loss_mlp": 0.17163086, + "step": 4810, + "time_per_iteration": 2.6875436305999756 + }, + { + "auxiliary_loss_clip": 0.0114533, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.05522978, + "balance_loss_mlp": 1.02466631, + "epoch": 0.28925296858560046, + "flos": 28335813158880.0, + "grad_norm": 1.728230073405471, + "language_loss": 0.76738489, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.78923702, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.15216064, + "step": 4811, + "time_per_iteration": 2.661109447479248 + }, + { + "auxiliary_loss_clip": 0.01144633, + "auxiliary_loss_mlp": 0.01043948, + "balance_loss_clip": 1.05381012, + "balance_loss_mlp": 1.02719903, + "epoch": 0.2893130918382685, + "flos": 34924806305280.0, + "grad_norm": 1.3665523463985776, + "language_loss": 0.77489519, + "learning_rate": 3.335113118275117e-06, + "loss": 0.79678094, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.16748047, + "step": 4812, + "time_per_iteration": 2.765068531036377 + }, + { + "auxiliary_loss_clip": 0.01059932, + "auxiliary_loss_mlp": 0.01001274, + "balance_loss_clip": 1.02952266, + "balance_loss_mlp": 0.99931121, + "epoch": 0.28937321509093644, + "flos": 88222109450400.0, + "grad_norm": 0.7983942432535254, + "language_loss": 0.60279006, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62340212, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.30419922, + "router_z_loss_mlp": 0.01960754, + "step": 4813, + "time_per_iteration": 3.4635558128356934 + }, + { + "auxiliary_loss_clip": 0.01144063, + "auxiliary_loss_mlp": 0.01037236, + "balance_loss_clip": 1.05381441, + "balance_loss_mlp": 1.02152419, + "epoch": 0.2894333383436044, + "flos": 19786954458240.0, + "grad_norm": 3.416354865160681, + "language_loss": 0.82343125, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.84524429, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.15722656, + "step": 4814, + "time_per_iteration": 2.6551260948181152 + }, + { + "auxiliary_loss_clip": 0.0114766, + "auxiliary_loss_mlp": 0.01047921, + "balance_loss_clip": 1.05312681, + "balance_loss_mlp": 1.03144646, + "epoch": 0.2894934615962724, + "flos": 30295719230400.0, + "grad_norm": 1.8491451462292077, + "language_loss": 0.72501755, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.74697334, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.16467285, + "step": 4815, + "time_per_iteration": 2.7599031925201416 + }, + { + "auxiliary_loss_clip": 0.01142887, + "auxiliary_loss_mlp": 0.01040278, + "balance_loss_clip": 1.05537724, + "balance_loss_mlp": 1.02610946, + "epoch": 0.28955358484894034, + "flos": 24952981357440.0, + "grad_norm": 1.6032517660739847, + "language_loss": 0.70240158, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.72423327, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.1416626, + "step": 4816, + "time_per_iteration": 2.646796941757202 + }, + { + "auxiliary_loss_clip": 0.01149831, + "auxiliary_loss_mlp": 0.01043884, + "balance_loss_clip": 1.05349255, + "balance_loss_mlp": 1.0267477, + "epoch": 0.2896137081016083, + "flos": 27534212046720.0, + "grad_norm": 3.6110972371828254, + "language_loss": 0.74749249, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.76942962, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 0.96289062, + "router_z_loss_mlp": 0.17120361, + "step": 4817, + "time_per_iteration": 2.681405544281006 + }, + { + "auxiliary_loss_clip": 0.01145007, + "auxiliary_loss_mlp": 0.01045723, + "balance_loss_clip": 1.05158186, + "balance_loss_mlp": 1.02905798, + "epoch": 0.28967383135427627, + "flos": 32565132868320.0, + "grad_norm": 1.9178680271291162, + "language_loss": 0.7632215, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78512883, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.16650391, + "step": 4818, + "time_per_iteration": 2.6933252811431885 + }, + { + "auxiliary_loss_clip": 0.01144545, + "auxiliary_loss_mlp": 0.01038603, + "balance_loss_clip": 1.05205631, + "balance_loss_mlp": 1.02204537, + "epoch": 0.28973395460694423, + "flos": 18983975758560.0, + "grad_norm": 1.7748046836847393, + "language_loss": 0.80060434, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.82243586, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.16552734, + "step": 4819, + "time_per_iteration": 2.7226920127868652 + }, + { + "auxiliary_loss_clip": 0.01149396, + "auxiliary_loss_mlp": 0.01042367, + "balance_loss_clip": 1.0519073, + "balance_loss_mlp": 1.02515364, + "epoch": 0.2897940778596122, + "flos": 22814541034560.0, + "grad_norm": 1.9738812104664494, + "language_loss": 0.78612638, + "learning_rate": 3.332791681244776e-06, + "loss": 0.80804396, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 0.97460938, + "router_z_loss_mlp": 0.17211914, + "step": 4820, + "time_per_iteration": 2.644050359725952 + }, + { + "auxiliary_loss_clip": 0.01147928, + "auxiliary_loss_mlp": 0.01035154, + "balance_loss_clip": 1.05371463, + "balance_loss_mlp": 1.01954973, + "epoch": 0.28985420111228016, + "flos": 22903423246080.0, + "grad_norm": 2.2126545600686467, + "language_loss": 0.72912467, + "learning_rate": 3.332501274072231e-06, + "loss": 0.75095546, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 0.94335938, + "router_z_loss_mlp": 0.15600586, + "step": 4821, + "time_per_iteration": 2.6662566661834717 + }, + { + "auxiliary_loss_clip": 0.011424, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.05101264, + "balance_loss_mlp": 1.02425981, + "epoch": 0.28991432436494813, + "flos": 28150593556320.0, + "grad_norm": 2.419986236345046, + "language_loss": 0.72352934, + "learning_rate": 3.332210816371104e-06, + "loss": 0.74535978, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.16394043, + "step": 4822, + "time_per_iteration": 2.6524744033813477 + }, + { + "auxiliary_loss_clip": 0.01145397, + "auxiliary_loss_mlp": 0.01043478, + "balance_loss_clip": 1.05425501, + "balance_loss_mlp": 1.02796888, + "epoch": 0.2899744476176161, + "flos": 21479200963200.0, + "grad_norm": 1.794577819002429, + "language_loss": 0.66168356, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68357229, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.15515137, + "step": 4823, + "time_per_iteration": 2.644252300262451 + }, + { + "auxiliary_loss_clip": 0.01139402, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.04910278, + "balance_loss_mlp": 1.01997685, + "epoch": 0.29003457087028406, + "flos": 27222070857120.0, + "grad_norm": 1.8954412349469074, + "language_loss": 0.80929995, + "learning_rate": 3.331629749427164e-06, + "loss": 0.8310461, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.15222168, + "step": 4824, + "time_per_iteration": 2.6830368041992188 + }, + { + "auxiliary_loss_clip": 0.01146032, + "auxiliary_loss_mlp": 0.01042667, + "balance_loss_clip": 1.05277598, + "balance_loss_mlp": 1.02600205, + "epoch": 0.2900946941229521, + "flos": 26777376178560.0, + "grad_norm": 1.935957758980496, + "language_loss": 0.72278833, + "learning_rate": 3.331339140206385e-06, + "loss": 0.74467534, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.16650391, + "step": 4825, + "time_per_iteration": 2.6815099716186523 + }, + { + "auxiliary_loss_clip": 0.01146119, + "auxiliary_loss_mlp": 0.0103895, + "balance_loss_clip": 1.05323315, + "balance_loss_mlp": 1.02299953, + "epoch": 0.29015481737562004, + "flos": 21880589019840.0, + "grad_norm": 2.874149729814318, + "language_loss": 0.72974068, + "learning_rate": 3.331048480501092e-06, + "loss": 0.75159138, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.1595459, + "step": 4826, + "time_per_iteration": 2.6440482139587402 + }, + { + "auxiliary_loss_clip": 0.01143382, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_clip": 1.05094481, + "balance_loss_mlp": 1.02675188, + "epoch": 0.290214940628288, + "flos": 27800696612160.0, + "grad_norm": 2.022966883367705, + "language_loss": 0.68683296, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.70868349, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 0.92382812, + "router_z_loss_mlp": 0.14923096, + "step": 4827, + "time_per_iteration": 2.647205114364624 + }, + { + "auxiliary_loss_clip": 0.01146237, + "auxiliary_loss_mlp": 0.01038124, + "balance_loss_clip": 1.05308878, + "balance_loss_mlp": 1.02176845, + "epoch": 0.290275063880956, + "flos": 24412313943360.0, + "grad_norm": 2.0808186092227783, + "language_loss": 0.80180037, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82364404, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 0.93212891, + "router_z_loss_mlp": 0.16357422, + "step": 4828, + "time_per_iteration": 2.710050344467163 + }, + { + "auxiliary_loss_clip": 0.01143484, + "auxiliary_loss_mlp": 0.01046025, + "balance_loss_clip": 1.05207109, + "balance_loss_mlp": 1.03039122, + "epoch": 0.29033518713362394, + "flos": 26910010702080.0, + "grad_norm": 1.7620066917836492, + "language_loss": 0.80663633, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82853138, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.15606689, + "step": 4829, + "time_per_iteration": 2.647749900817871 + }, + { + "auxiliary_loss_clip": 0.0114143, + "auxiliary_loss_mlp": 0.01039597, + "balance_loss_clip": 1.05282974, + "balance_loss_mlp": 1.02435029, + "epoch": 0.2903953103862919, + "flos": 31274193385440.0, + "grad_norm": 1.712437207175117, + "language_loss": 0.82391506, + "learning_rate": 3.329885337055249e-06, + "loss": 0.84572536, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.15234375, + "step": 4830, + "time_per_iteration": 2.696335792541504 + }, + { + "auxiliary_loss_clip": 0.01148194, + "auxiliary_loss_mlp": 0.01044733, + "balance_loss_clip": 1.05473483, + "balance_loss_mlp": 1.02847314, + "epoch": 0.29045543363895987, + "flos": 20677113643680.0, + "grad_norm": 8.269256379333402, + "language_loss": 0.78736663, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.80929589, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.16259766, + "step": 4831, + "time_per_iteration": 5.497368335723877 + }, + { + "auxiliary_loss_clip": 0.01142546, + "auxiliary_loss_mlp": 0.01037194, + "balance_loss_clip": 1.05321217, + "balance_loss_mlp": 1.02261448, + "epoch": 0.29051555689162784, + "flos": 32208307469280.0, + "grad_norm": 1.7644051583948834, + "language_loss": 0.74312168, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.76491904, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.14575195, + "step": 4832, + "time_per_iteration": 2.6883625984191895 + }, + { + "auxiliary_loss_clip": 0.01141258, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.05123866, + "balance_loss_mlp": 1.01624846, + "epoch": 0.2905756801442958, + "flos": 25751179018080.0, + "grad_norm": 1.672867656921858, + "language_loss": 0.75671554, + "learning_rate": 3.329012449923736e-06, + "loss": 0.77842903, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.13824463, + "step": 4833, + "time_per_iteration": 4.131944417953491 + }, + { + "auxiliary_loss_clip": 0.01138992, + "auxiliary_loss_mlp": 0.01036992, + "balance_loss_clip": 1.05012584, + "balance_loss_mlp": 1.0219717, + "epoch": 0.29063580339696377, + "flos": 19164171218400.0, + "grad_norm": 1.805276771023709, + "language_loss": 0.64644802, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.66820782, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.15014648, + "step": 4834, + "time_per_iteration": 2.672762870788574 + }, + { + "auxiliary_loss_clip": 0.01141213, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.05310011, + "balance_loss_mlp": 1.01676548, + "epoch": 0.29069592664963173, + "flos": 30072136114080.0, + "grad_norm": 1.4881747687203857, + "language_loss": 0.71855372, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.74027014, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 0.88085938, + "router_z_loss_mlp": 0.13659668, + "step": 4835, + "time_per_iteration": 4.103718280792236 + }, + { + "auxiliary_loss_clip": 0.01141011, + "auxiliary_loss_mlp": 0.01032953, + "balance_loss_clip": 1.05219173, + "balance_loss_mlp": 1.01944649, + "epoch": 0.2907560499022997, + "flos": 30473281067040.0, + "grad_norm": 1.8533471446220853, + "language_loss": 0.79674959, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.81848919, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.13519287, + "step": 4836, + "time_per_iteration": 2.707838535308838 + }, + { + "auxiliary_loss_clip": 0.01140898, + "auxiliary_loss_mlp": 0.01036331, + "balance_loss_clip": 1.05223644, + "balance_loss_mlp": 1.0210489, + "epoch": 0.29081617315496766, + "flos": 22766088718080.0, + "grad_norm": 1.7878631773259392, + "language_loss": 0.81011534, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.8318876, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.1529541, + "step": 4837, + "time_per_iteration": 2.631786823272705 + }, + { + "auxiliary_loss_clip": 0.01139951, + "auxiliary_loss_mlp": 0.01035898, + "balance_loss_clip": 1.04898298, + "balance_loss_mlp": 1.02119339, + "epoch": 0.2908762964076356, + "flos": 43112463740640.0, + "grad_norm": 1.9962495437325765, + "language_loss": 0.67546296, + "learning_rate": 3.327556630259381e-06, + "loss": 0.69722146, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.14697266, + "step": 4838, + "time_per_iteration": 2.8026037216186523 + }, + { + "auxiliary_loss_clip": 0.01149336, + "auxiliary_loss_mlp": 0.01040154, + "balance_loss_clip": 1.0554769, + "balance_loss_mlp": 1.0239234, + "epoch": 0.29093641966030365, + "flos": 28157886666720.0, + "grad_norm": 1.5603474159791648, + "language_loss": 0.71159869, + "learning_rate": 3.327265315259095e-06, + "loss": 0.73349357, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 0.93798828, + "router_z_loss_mlp": 0.16204834, + "step": 4839, + "time_per_iteration": 2.7899680137634277 + }, + { + "auxiliary_loss_clip": 0.01141781, + "auxiliary_loss_mlp": 0.01033869, + "balance_loss_clip": 1.05060768, + "balance_loss_mlp": 1.0194211, + "epoch": 0.2909965429129716, + "flos": 43873999613280.0, + "grad_norm": 1.9949085901985542, + "language_loss": 0.75367415, + "learning_rate": 3.326973949928776e-06, + "loss": 0.77543068, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.14453125, + "step": 4840, + "time_per_iteration": 2.7693514823913574 + }, + { + "auxiliary_loss_clip": 0.01143737, + "auxiliary_loss_mlp": 0.01043894, + "balance_loss_clip": 1.05287194, + "balance_loss_mlp": 1.02852821, + "epoch": 0.2910566661656396, + "flos": 37681208311680.0, + "grad_norm": 1.6785602179074826, + "language_loss": 0.60167068, + "learning_rate": 3.326682534279471e-06, + "loss": 0.62354696, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.15368652, + "step": 4841, + "time_per_iteration": 2.796851634979248 + }, + { + "auxiliary_loss_clip": 0.01144521, + "auxiliary_loss_mlp": 0.01037856, + "balance_loss_clip": 1.05386782, + "balance_loss_mlp": 1.0216676, + "epoch": 0.29111678941830754, + "flos": 36618997639680.0, + "grad_norm": 1.348829218117236, + "language_loss": 0.71145302, + "learning_rate": 3.326391068322232e-06, + "loss": 0.73327678, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 0.90673828, + "router_z_loss_mlp": 0.16186523, + "step": 4842, + "time_per_iteration": 2.709887742996216 + }, + { + "auxiliary_loss_clip": 0.01141033, + "auxiliary_loss_mlp": 0.01034598, + "balance_loss_clip": 1.05152738, + "balance_loss_mlp": 1.02045977, + "epoch": 0.2911769126709755, + "flos": 27890753824800.0, + "grad_norm": 2.5454023703427544, + "language_loss": 0.73354131, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.75529766, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.14147949, + "step": 4843, + "time_per_iteration": 2.698683977127075 + }, + { + "auxiliary_loss_clip": 0.01142793, + "auxiliary_loss_mlp": 0.01032897, + "balance_loss_clip": 1.05132198, + "balance_loss_mlp": 1.01849675, + "epoch": 0.2912370359236435, + "flos": 26418281811840.0, + "grad_norm": 2.0382432846297522, + "language_loss": 0.58207881, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.60383576, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.14416504, + "step": 4844, + "time_per_iteration": 2.6418073177337646 + }, + { + "auxiliary_loss_clip": 0.01149695, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.05664515, + "balance_loss_mlp": 1.02085924, + "epoch": 0.29129715917631144, + "flos": 27934668205920.0, + "grad_norm": 3.417130994355821, + "language_loss": 0.86938286, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.89124966, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.16131592, + "step": 4845, + "time_per_iteration": 2.6498894691467285 + }, + { + "auxiliary_loss_clip": 0.01146223, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_clip": 1.05363154, + "balance_loss_mlp": 1.03314209, + "epoch": 0.2913572824289794, + "flos": 27667575881280.0, + "grad_norm": 1.9226149039416849, + "language_loss": 0.6684382, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.69039333, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 0.92578125, + "router_z_loss_mlp": 0.16162109, + "step": 4846, + "time_per_iteration": 2.631876230239868 + }, + { + "auxiliary_loss_clip": 0.01145933, + "auxiliary_loss_mlp": 0.01040665, + "balance_loss_clip": 1.05663264, + "balance_loss_mlp": 1.02602673, + "epoch": 0.29141740568164737, + "flos": 28196331215040.0, + "grad_norm": 1.7993690173319696, + "language_loss": 0.70569026, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.72755623, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.14654541, + "step": 4847, + "time_per_iteration": 2.708449363708496 + }, + { + "auxiliary_loss_clip": 0.011408, + "auxiliary_loss_mlp": 0.01032657, + "balance_loss_clip": 1.05095422, + "balance_loss_mlp": 1.01761889, + "epoch": 0.29147752893431533, + "flos": 28781115596640.0, + "grad_norm": 3.5321433549071077, + "language_loss": 0.7381714, + "learning_rate": 3.324641216731237e-06, + "loss": 0.75990593, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.15045166, + "step": 4848, + "time_per_iteration": 2.6996469497680664 + }, + { + "auxiliary_loss_clip": 0.01142464, + "auxiliary_loss_mlp": 0.01038644, + "balance_loss_clip": 1.05141342, + "balance_loss_mlp": 1.02315855, + "epoch": 0.2915376521869833, + "flos": 25126613017920.0, + "grad_norm": 2.1655852391391455, + "language_loss": 0.76961148, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.79142261, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.1550293, + "step": 4849, + "time_per_iteration": 2.6208715438842773 + }, + { + "auxiliary_loss_clip": 0.01146026, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.05264115, + "balance_loss_mlp": 1.01997042, + "epoch": 0.29159777543965126, + "flos": 25394434653600.0, + "grad_norm": 1.689424699378651, + "language_loss": 0.7853722, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.80717748, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.14526367, + "step": 4850, + "time_per_iteration": 2.6946516036987305 + }, + { + "auxiliary_loss_clip": 0.01140119, + "auxiliary_loss_mlp": 0.01037404, + "balance_loss_clip": 1.05135751, + "balance_loss_mlp": 1.02190661, + "epoch": 0.29165789869231923, + "flos": 29582959812480.0, + "grad_norm": 1.8580483989191974, + "language_loss": 0.75906456, + "learning_rate": 3.323765612674296e-06, + "loss": 0.7808398, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.15496826, + "step": 4851, + "time_per_iteration": 2.6865878105163574 + }, + { + "auxiliary_loss_clip": 0.01140309, + "auxiliary_loss_mlp": 0.01038762, + "balance_loss_clip": 1.05317163, + "balance_loss_mlp": 1.02545786, + "epoch": 0.29171802194498725, + "flos": 35325384016320.0, + "grad_norm": 1.3812500252373476, + "language_loss": 0.7738328, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.79562354, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.13323975, + "step": 4852, + "time_per_iteration": 2.7609198093414307 + }, + { + "auxiliary_loss_clip": 0.01141975, + "auxiliary_loss_mlp": 0.01043569, + "balance_loss_clip": 1.0522604, + "balance_loss_mlp": 1.02889395, + "epoch": 0.2917781451976552, + "flos": 27574844528160.0, + "grad_norm": 1.6920113254079987, + "language_loss": 0.78341424, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80526966, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 0.89794922, + "router_z_loss_mlp": 0.14672852, + "step": 4853, + "time_per_iteration": 2.7557082176208496 + }, + { + "auxiliary_loss_clip": 0.01142001, + "auxiliary_loss_mlp": 0.01042492, + "balance_loss_clip": 1.05054379, + "balance_loss_mlp": 1.02694714, + "epoch": 0.2918382684503232, + "flos": 26325753045120.0, + "grad_norm": 4.238465571463475, + "language_loss": 0.88270074, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90454572, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.15551758, + "step": 4854, + "time_per_iteration": 2.725094795227051 + }, + { + "auxiliary_loss_clip": 0.01140997, + "auxiliary_loss_mlp": 0.01053297, + "balance_loss_clip": 1.05120945, + "balance_loss_mlp": 1.0360595, + "epoch": 0.29189839170299114, + "flos": 29716404681600.0, + "grad_norm": 1.8059220364697106, + "language_loss": 0.86541808, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88736099, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.17248535, + "step": 4855, + "time_per_iteration": 2.6946895122528076 + }, + { + "auxiliary_loss_clip": 0.01058942, + "auxiliary_loss_mlp": 0.01008401, + "balance_loss_clip": 1.02910519, + "balance_loss_mlp": 1.00656819, + "epoch": 0.2919585149556591, + "flos": 87020619420960.0, + "grad_norm": 0.7995330224523504, + "language_loss": 0.60233366, + "learning_rate": 3.322305268780566e-06, + "loss": 0.62300718, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.2980957, + "router_z_loss_mlp": 0.01831055, + "step": 4856, + "time_per_iteration": 3.4006824493408203 + }, + { + "auxiliary_loss_clip": 0.01139795, + "auxiliary_loss_mlp": 0.01038448, + "balance_loss_clip": 1.05038595, + "balance_loss_mlp": 1.02423191, + "epoch": 0.2920186382083271, + "flos": 19075248489600.0, + "grad_norm": 2.0002845649517655, + "language_loss": 0.68151867, + "learning_rate": 3.322013049531664e-06, + "loss": 0.70330107, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 0.89404297, + "router_z_loss_mlp": 0.14202881, + "step": 4857, + "time_per_iteration": 2.6686458587646484 + }, + { + "auxiliary_loss_clip": 0.01139748, + "auxiliary_loss_mlp": 0.01037308, + "balance_loss_clip": 1.05213761, + "balance_loss_mlp": 1.023206, + "epoch": 0.29207876146099504, + "flos": 34612867702080.0, + "grad_norm": 3.7642003038083485, + "language_loss": 0.83744794, + "learning_rate": 3.321720780151895e-06, + "loss": 0.85921848, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.14099121, + "step": 4858, + "time_per_iteration": 2.731802225112915 + }, + { + "auxiliary_loss_clip": 0.01140995, + "auxiliary_loss_mlp": 0.0103817, + "balance_loss_clip": 1.05279815, + "balance_loss_mlp": 1.02344799, + "epoch": 0.292138884713663, + "flos": 26687156896800.0, + "grad_norm": 2.367968337117561, + "language_loss": 0.77508283, + "learning_rate": 3.321428460652342e-06, + "loss": 0.79687452, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.14733887, + "step": 4859, + "time_per_iteration": 2.690598964691162 + }, + { + "auxiliary_loss_clip": 0.0114447, + "auxiliary_loss_mlp": 0.01038544, + "balance_loss_clip": 1.04995918, + "balance_loss_mlp": 1.02266538, + "epoch": 0.29219900796633097, + "flos": 25615505698560.0, + "grad_norm": 3.879412719473893, + "language_loss": 0.68335664, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.70518672, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 0.94580078, + "router_z_loss_mlp": 0.15869141, + "step": 4860, + "time_per_iteration": 2.635511875152588 + }, + { + "auxiliary_loss_clip": 0.01138312, + "auxiliary_loss_mlp": 0.01040616, + "balance_loss_clip": 1.05195618, + "balance_loss_mlp": 1.02728844, + "epoch": 0.29225913121899894, + "flos": 42713952410880.0, + "grad_norm": 2.769425841967321, + "language_loss": 0.74963605, + "learning_rate": 3.320843671338222e-06, + "loss": 0.77142537, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.13336182, + "step": 4861, + "time_per_iteration": 2.7914936542510986 + }, + { + "auxiliary_loss_clip": 0.01138873, + "auxiliary_loss_mlp": 0.01040661, + "balance_loss_clip": 1.0509696, + "balance_loss_mlp": 1.02704692, + "epoch": 0.2923192544716669, + "flos": 16491384177120.0, + "grad_norm": 2.2747082513537946, + "language_loss": 0.91700691, + "learning_rate": 3.320551201545832e-06, + "loss": 0.93880224, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.13604736, + "step": 4862, + "time_per_iteration": 2.6260459423065186 + }, + { + "auxiliary_loss_clip": 0.01138034, + "auxiliary_loss_mlp": 0.0103385, + "balance_loss_clip": 1.04931688, + "balance_loss_mlp": 1.01984298, + "epoch": 0.29237937772433487, + "flos": 23749343912160.0, + "grad_norm": 2.7769977292480474, + "language_loss": 0.74033844, + "learning_rate": 3.320258681678008e-06, + "loss": 0.7620573, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.13995361, + "step": 4863, + "time_per_iteration": 2.6796531677246094 + }, + { + "auxiliary_loss_clip": 0.01136351, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.05083048, + "balance_loss_mlp": 1.02139163, + "epoch": 0.29243950097700283, + "flos": 25441914555360.0, + "grad_norm": 2.4907282566160602, + "language_loss": 0.77698737, + "learning_rate": 3.319966111745842e-06, + "loss": 0.79870123, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.13641357, + "step": 4864, + "time_per_iteration": 2.729790449142456 + }, + { + "auxiliary_loss_clip": 0.01141617, + "auxiliary_loss_mlp": 0.01041136, + "balance_loss_clip": 1.05045962, + "balance_loss_mlp": 1.02581763, + "epoch": 0.29249962422967085, + "flos": 28777955248800.0, + "grad_norm": 1.6901317149459565, + "language_loss": 0.81601048, + "learning_rate": 3.319673491760429e-06, + "loss": 0.83783799, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.15307617, + "step": 4865, + "time_per_iteration": 2.709247589111328 + }, + { + "auxiliary_loss_clip": 0.01140114, + "auxiliary_loss_mlp": 0.0103841, + "balance_loss_clip": 1.04921031, + "balance_loss_mlp": 1.02323484, + "epoch": 0.2925597474823388, + "flos": 27178075441440.0, + "grad_norm": 1.971766573558038, + "language_loss": 0.84918177, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87096703, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.15185547, + "step": 4866, + "time_per_iteration": 2.6558308601379395 + }, + { + "auxiliary_loss_clip": 0.01135844, + "auxiliary_loss_mlp": 0.01034341, + "balance_loss_clip": 1.04918063, + "balance_loss_mlp": 1.02070916, + "epoch": 0.2926198707350068, + "flos": 42044742718560.0, + "grad_norm": 1.709162369320451, + "language_loss": 0.75009024, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.77179211, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.13641357, + "step": 4867, + "time_per_iteration": 2.745837450027466 + }, + { + "auxiliary_loss_clip": 0.01138631, + "auxiliary_loss_mlp": 0.0104505, + "balance_loss_clip": 1.0483495, + "balance_loss_mlp": 1.03012466, + "epoch": 0.29267999398767475, + "flos": 25263258752160.0, + "grad_norm": 2.1089761855152522, + "language_loss": 0.73061532, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75245214, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.14916992, + "step": 4868, + "time_per_iteration": 2.6452646255493164 + }, + { + "auxiliary_loss_clip": 0.011375, + "auxiliary_loss_mlp": 0.01027239, + "balance_loss_clip": 1.05015326, + "balance_loss_mlp": 1.01317859, + "epoch": 0.2927401172403427, + "flos": 22413355564320.0, + "grad_norm": 1.3700160436434394, + "language_loss": 0.74607122, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.76771861, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.14056396, + "step": 4869, + "time_per_iteration": 2.7029268741607666 + }, + { + "auxiliary_loss_clip": 0.01139291, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.0497452, + "balance_loss_mlp": 1.0219115, + "epoch": 0.2928002404930107, + "flos": 31852616554080.0, + "grad_norm": 1.4533790391433994, + "language_loss": 0.76399577, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78574979, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.1418457, + "step": 4870, + "time_per_iteration": 2.728336811065674 + }, + { + "auxiliary_loss_clip": 0.01143915, + "auxiliary_loss_mlp": 0.01042205, + "balance_loss_clip": 1.05113482, + "balance_loss_mlp": 1.02683854, + "epoch": 0.29286036374567864, + "flos": 26020499793120.0, + "grad_norm": 2.3241549779273925, + "language_loss": 0.67924988, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.70111108, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.15380859, + "step": 4871, + "time_per_iteration": 5.362074136734009 + }, + { + "auxiliary_loss_clip": 0.01133683, + "auxiliary_loss_mlp": 0.01038572, + "balance_loss_clip": 1.04630852, + "balance_loss_mlp": 1.02500021, + "epoch": 0.2929204869983466, + "flos": 36081490573440.0, + "grad_norm": 1.920800022820618, + "language_loss": 0.77374041, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79546297, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.13574219, + "step": 4872, + "time_per_iteration": 2.694681167602539 + }, + { + "auxiliary_loss_clip": 0.01140081, + "auxiliary_loss_mlp": 0.0103821, + "balance_loss_clip": 1.04893649, + "balance_loss_mlp": 1.02233171, + "epoch": 0.2929806102510146, + "flos": 23255183985120.0, + "grad_norm": 1.954208239094709, + "language_loss": 0.72815192, + "learning_rate": 3.317330731292164e-06, + "loss": 0.74993491, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.15881348, + "step": 4873, + "time_per_iteration": 4.162993907928467 + }, + { + "auxiliary_loss_clip": 0.011393, + "auxiliary_loss_mlp": 0.01044057, + "balance_loss_clip": 1.04782581, + "balance_loss_mlp": 1.02849972, + "epoch": 0.29304073350368254, + "flos": 26776525315680.0, + "grad_norm": 1.8661255963237826, + "language_loss": 0.77688515, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.79871869, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.15539551, + "step": 4874, + "time_per_iteration": 2.722222089767456 + }, + { + "auxiliary_loss_clip": 0.01144134, + "auxiliary_loss_mlp": 0.01044127, + "balance_loss_clip": 1.05058956, + "balance_loss_mlp": 1.02886808, + "epoch": 0.2931008567563505, + "flos": 18852678305280.0, + "grad_norm": 2.3467480904653786, + "language_loss": 0.76970553, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.79158813, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 0.93554688, + "router_z_loss_mlp": 0.15258789, + "step": 4875, + "time_per_iteration": 4.231383800506592 + }, + { + "auxiliary_loss_clip": 0.01144249, + "auxiliary_loss_mlp": 0.01034088, + "balance_loss_clip": 1.05339289, + "balance_loss_mlp": 1.01930606, + "epoch": 0.29316098000901847, + "flos": 20722567681440.0, + "grad_norm": 1.6468921987625145, + "language_loss": 0.69722331, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71900666, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.14776611, + "step": 4876, + "time_per_iteration": 2.850299596786499 + }, + { + "auxiliary_loss_clip": 0.01136864, + "auxiliary_loss_mlp": 0.01038059, + "balance_loss_clip": 1.048949, + "balance_loss_mlp": 1.02405775, + "epoch": 0.29322110326168643, + "flos": 19959816290400.0, + "grad_norm": 2.0526436380837394, + "language_loss": 0.82401609, + "learning_rate": 3.316158151823096e-06, + "loss": 0.84576529, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.13995361, + "step": 4877, + "time_per_iteration": 2.6292405128479004 + }, + { + "auxiliary_loss_clip": 0.0114214, + "auxiliary_loss_mlp": 0.0103848, + "balance_loss_clip": 1.04961634, + "balance_loss_mlp": 1.02372193, + "epoch": 0.29328122651435445, + "flos": 17071184933280.0, + "grad_norm": 2.3676251032599027, + "language_loss": 0.67756504, + "learning_rate": 3.315864882155911e-06, + "loss": 0.69937128, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 0.92480469, + "router_z_loss_mlp": 0.14758301, + "step": 4878, + "time_per_iteration": 2.691417694091797 + }, + { + "auxiliary_loss_clip": 0.01139868, + "auxiliary_loss_mlp": 0.01042535, + "balance_loss_clip": 1.05025983, + "balance_loss_mlp": 1.02799761, + "epoch": 0.2933413497670224, + "flos": 30828283188480.0, + "grad_norm": 1.837821205778902, + "language_loss": 0.73653507, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.75835913, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.14538574, + "step": 4879, + "time_per_iteration": 2.755502462387085 + }, + { + "auxiliary_loss_clip": 0.01145255, + "auxiliary_loss_mlp": 0.01047131, + "balance_loss_clip": 1.05182314, + "balance_loss_mlp": 1.03063214, + "epoch": 0.2934014730196904, + "flos": 39198364534080.0, + "grad_norm": 1.9558433452826411, + "language_loss": 0.66006476, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.6819886, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.16491699, + "step": 4880, + "time_per_iteration": 2.7337677478790283 + }, + { + "auxiliary_loss_clip": 0.01141936, + "auxiliary_loss_mlp": 0.01046974, + "balance_loss_clip": 1.05100763, + "balance_loss_mlp": 1.03225136, + "epoch": 0.29346159627235835, + "flos": 29715270197760.0, + "grad_norm": 3.688065902422539, + "language_loss": 0.70112801, + "learning_rate": 3.314984773812481e-06, + "loss": 0.7230171, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.14709473, + "step": 4881, + "time_per_iteration": 2.701772451400757 + }, + { + "auxiliary_loss_clip": 0.01145084, + "auxiliary_loss_mlp": 0.01038462, + "balance_loss_clip": 1.05326259, + "balance_loss_mlp": 1.02348375, + "epoch": 0.2935217195250263, + "flos": 27755850333600.0, + "grad_norm": 1.5321869479953059, + "language_loss": 0.83524168, + "learning_rate": 3.314691304621127e-06, + "loss": 0.85707712, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.14971924, + "step": 4882, + "time_per_iteration": 2.6551735401153564 + }, + { + "auxiliary_loss_clip": 0.01145741, + "auxiliary_loss_mlp": 0.01041081, + "balance_loss_clip": 1.05165124, + "balance_loss_mlp": 1.02554798, + "epoch": 0.2935818427776943, + "flos": 26509676094720.0, + "grad_norm": 2.2765633259262135, + "language_loss": 0.71095097, + "learning_rate": 3.314397785576548e-06, + "loss": 0.7328192, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 0.94091797, + "router_z_loss_mlp": 0.15539551, + "step": 4883, + "time_per_iteration": 2.6850225925445557 + }, + { + "auxiliary_loss_clip": 0.01142276, + "auxiliary_loss_mlp": 0.01036909, + "balance_loss_clip": 1.05090857, + "balance_loss_mlp": 1.02179289, + "epoch": 0.29364196603036224, + "flos": 29047154472000.0, + "grad_norm": 8.989505038696917, + "language_loss": 0.92458874, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.94638062, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.15124512, + "step": 4884, + "time_per_iteration": 2.68278431892395 + }, + { + "auxiliary_loss_clip": 0.01150074, + "auxiliary_loss_mlp": 0.0103984, + "balance_loss_clip": 1.05721807, + "balance_loss_mlp": 1.02495098, + "epoch": 0.2937020892830302, + "flos": 28639081064160.0, + "grad_norm": 2.054439015726826, + "language_loss": 0.73517668, + "learning_rate": 3.313810597972234e-06, + "loss": 0.75707591, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.14886475, + "step": 4885, + "time_per_iteration": 2.6835741996765137 + }, + { + "auxiliary_loss_clip": 0.01140581, + "auxiliary_loss_mlp": 0.01041459, + "balance_loss_clip": 1.0510509, + "balance_loss_mlp": 1.02714157, + "epoch": 0.2937622125356982, + "flos": 29618568151200.0, + "grad_norm": 2.2199479778392552, + "language_loss": 0.8473841, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.86920446, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.14324951, + "step": 4886, + "time_per_iteration": 2.6422910690307617 + }, + { + "auxiliary_loss_clip": 0.01142563, + "auxiliary_loss_mlp": 0.01038519, + "balance_loss_clip": 1.0499289, + "balance_loss_mlp": 1.02402902, + "epoch": 0.29382233578836614, + "flos": 25212861606240.0, + "grad_norm": 2.201211798783121, + "language_loss": 0.76837707, + "learning_rate": 3.313223211088603e-06, + "loss": 0.79018795, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.14508057, + "step": 4887, + "time_per_iteration": 2.695152997970581 + }, + { + "auxiliary_loss_clip": 0.01146233, + "auxiliary_loss_mlp": 0.01044697, + "balance_loss_clip": 1.05283833, + "balance_loss_mlp": 1.03018963, + "epoch": 0.2938824590410341, + "flos": 20187977859360.0, + "grad_norm": 3.5186618292578253, + "language_loss": 0.79743606, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.81934536, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.14501953, + "step": 4888, + "time_per_iteration": 2.658074378967285 + }, + { + "auxiliary_loss_clip": 0.01140752, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.05035305, + "balance_loss_mlp": 1.01961231, + "epoch": 0.29394258229370207, + "flos": 46278033121440.0, + "grad_norm": 1.44765752020612, + "language_loss": 0.5550065, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57675105, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 0.90380859, + "router_z_loss_mlp": 0.14105225, + "step": 4889, + "time_per_iteration": 2.8066647052764893 + }, + { + "auxiliary_loss_clip": 0.01144725, + "auxiliary_loss_mlp": 0.01038932, + "balance_loss_clip": 1.05120587, + "balance_loss_mlp": 1.02260017, + "epoch": 0.29400270554637004, + "flos": 24458780913120.0, + "grad_norm": 1.855041537984909, + "language_loss": 0.8437975, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86563408, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 0.93505859, + "router_z_loss_mlp": 0.16357422, + "step": 4890, + "time_per_iteration": 2.6345789432525635 + }, + { + "auxiliary_loss_clip": 0.01146299, + "auxiliary_loss_mlp": 0.01043359, + "balance_loss_clip": 1.05315757, + "balance_loss_mlp": 1.02829051, + "epoch": 0.294062828799038, + "flos": 18627798636000.0, + "grad_norm": 1.8185612455883615, + "language_loss": 0.72534078, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.74723738, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 0.93115234, + "router_z_loss_mlp": 0.15075684, + "step": 4891, + "time_per_iteration": 2.620882034301758 + }, + { + "auxiliary_loss_clip": 0.01145276, + "auxiliary_loss_mlp": 0.01044277, + "balance_loss_clip": 1.05232632, + "balance_loss_mlp": 1.02821982, + "epoch": 0.294122952051706, + "flos": 27756296023680.0, + "grad_norm": 2.0369288322033854, + "language_loss": 0.77307612, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79497164, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 0.92871094, + "router_z_loss_mlp": 0.1605835, + "step": 4892, + "time_per_iteration": 2.6613378524780273 + }, + { + "auxiliary_loss_clip": 0.01141096, + "auxiliary_loss_mlp": 0.01036802, + "balance_loss_clip": 1.04952812, + "balance_loss_mlp": 1.02157903, + "epoch": 0.294183075304374, + "flos": 30472308652320.0, + "grad_norm": 1.737459939877008, + "language_loss": 0.78370923, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80548823, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.15222168, + "step": 4893, + "time_per_iteration": 2.661445379257202 + }, + { + "auxiliary_loss_clip": 0.01143957, + "auxiliary_loss_mlp": 0.0104551, + "balance_loss_clip": 1.05173767, + "balance_loss_mlp": 1.03090119, + "epoch": 0.29424319855704195, + "flos": 37770009488640.0, + "grad_norm": 1.7976840276895487, + "language_loss": 0.84779394, + "learning_rate": 3.311165788957864e-06, + "loss": 0.86968863, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.1461792, + "step": 4894, + "time_per_iteration": 2.7158596515655518 + }, + { + "auxiliary_loss_clip": 0.01140885, + "auxiliary_loss_mlp": 0.01034936, + "balance_loss_clip": 1.0486722, + "balance_loss_mlp": 1.02027297, + "epoch": 0.2943033218097099, + "flos": 18585180807840.0, + "grad_norm": 3.17824431904129, + "language_loss": 0.90234029, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92409849, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 0.92333984, + "router_z_loss_mlp": 0.14672852, + "step": 4895, + "time_per_iteration": 2.606062650680542 + }, + { + "auxiliary_loss_clip": 0.011475, + "auxiliary_loss_mlp": 0.01036368, + "balance_loss_clip": 1.05304122, + "balance_loss_mlp": 1.02079952, + "epoch": 0.2943634450623779, + "flos": 26510040750240.0, + "grad_norm": 1.767709685741061, + "language_loss": 0.86698198, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88882065, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.15576172, + "step": 4896, + "time_per_iteration": 2.6850743293762207 + }, + { + "auxiliary_loss_clip": 0.01145254, + "auxiliary_loss_mlp": 0.01047586, + "balance_loss_clip": 1.05212069, + "balance_loss_mlp": 1.03199303, + "epoch": 0.29442356831504585, + "flos": 27579220394400.0, + "grad_norm": 1.8419698568350351, + "language_loss": 0.73406523, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.75599366, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 0.93164062, + "router_z_loss_mlp": 0.15600586, + "step": 4897, + "time_per_iteration": 2.6439976692199707 + }, + { + "auxiliary_loss_clip": 0.01145952, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_clip": 1.04863238, + "balance_loss_mlp": 1.02623487, + "epoch": 0.2944836915677138, + "flos": 24417864810720.0, + "grad_norm": 1.9910489370503446, + "language_loss": 0.74122846, + "learning_rate": 3.309989025093813e-06, + "loss": 0.7631188, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 0.97216797, + "router_z_loss_mlp": 0.16833496, + "step": 4898, + "time_per_iteration": 2.667825222015381 + }, + { + "auxiliary_loss_clip": 0.01152365, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.05443239, + "balance_loss_mlp": 1.02325201, + "epoch": 0.2945438148203818, + "flos": 24460117983360.0, + "grad_norm": 2.786755990864972, + "language_loss": 0.70360184, + "learning_rate": 3.309694709912618e-06, + "loss": 0.72553384, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 0.97753906, + "router_z_loss_mlp": 0.17565918, + "step": 4899, + "time_per_iteration": 2.697460174560547 + }, + { + "auxiliary_loss_clip": 0.01140865, + "auxiliary_loss_mlp": 0.01041945, + "balance_loss_clip": 1.0483489, + "balance_loss_mlp": 1.0260787, + "epoch": 0.29460393807304974, + "flos": 28959852434400.0, + "grad_norm": 2.2847782320616825, + "language_loss": 0.78788149, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.80970961, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.15856934, + "step": 4900, + "time_per_iteration": 2.822117805480957 + }, + { + "auxiliary_loss_clip": 0.01140657, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_clip": 1.04708016, + "balance_loss_mlp": 1.02774858, + "epoch": 0.2946640613257177, + "flos": 18273404273760.0, + "grad_norm": 2.056548363817113, + "language_loss": 0.80779713, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.829629, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 0.93603516, + "router_z_loss_mlp": 0.14794922, + "step": 4901, + "time_per_iteration": 2.666565179824829 + }, + { + "auxiliary_loss_clip": 0.01134824, + "auxiliary_loss_mlp": 0.0103372, + "balance_loss_clip": 1.04766774, + "balance_loss_mlp": 1.01986182, + "epoch": 0.2947241845783857, + "flos": 29582676191520.0, + "grad_norm": 2.004028549730126, + "language_loss": 0.57751298, + "learning_rate": 3.308811466431157e-06, + "loss": 0.5991984, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.13867188, + "step": 4902, + "time_per_iteration": 2.678671360015869 + }, + { + "auxiliary_loss_clip": 0.01140853, + "auxiliary_loss_mlp": 0.01041473, + "balance_loss_clip": 1.04886162, + "balance_loss_mlp": 1.02691114, + "epoch": 0.29478430783105364, + "flos": 24328780012800.0, + "grad_norm": 2.5359230235042576, + "language_loss": 0.75586563, + "learning_rate": 3.308516952661925e-06, + "loss": 0.77768886, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 0.92041016, + "router_z_loss_mlp": 0.14562988, + "step": 4903, + "time_per_iteration": 2.629760265350342 + }, + { + "auxiliary_loss_clip": 0.01140943, + "auxiliary_loss_mlp": 0.01042749, + "balance_loss_clip": 1.04830742, + "balance_loss_mlp": 1.02690601, + "epoch": 0.2948444310837216, + "flos": 33411256120800.0, + "grad_norm": 2.0376537408802795, + "language_loss": 0.62838924, + "learning_rate": 3.3082223892736e-06, + "loss": 0.65022624, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.1585083, + "step": 4904, + "time_per_iteration": 2.7245473861694336 + }, + { + "auxiliary_loss_clip": 0.01143415, + "auxiliary_loss_mlp": 0.01039406, + "balance_loss_clip": 1.04817104, + "balance_loss_mlp": 1.02448153, + "epoch": 0.2949045543363896, + "flos": 28559477309760.0, + "grad_norm": 1.566713573770207, + "language_loss": 0.73281717, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75464547, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 0.953125, + "router_z_loss_mlp": 0.14916992, + "step": 4905, + "time_per_iteration": 2.653942108154297 + }, + { + "auxiliary_loss_clip": 0.01138829, + "auxiliary_loss_mlp": 0.01041145, + "balance_loss_clip": 1.04782867, + "balance_loss_mlp": 1.02531362, + "epoch": 0.2949646775890576, + "flos": 29225891309760.0, + "grad_norm": 1.7931788299420393, + "language_loss": 0.81689948, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.83869922, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.15820312, + "step": 4906, + "time_per_iteration": 2.672684669494629 + }, + { + "auxiliary_loss_clip": 0.01136284, + "auxiliary_loss_mlp": 0.010375, + "balance_loss_clip": 1.0473361, + "balance_loss_mlp": 1.022825, + "epoch": 0.29502480084172555, + "flos": 27801587992320.0, + "grad_norm": 1.9258798614202752, + "language_loss": 0.87362856, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.89536637, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.14685059, + "step": 4907, + "time_per_iteration": 2.7092742919921875 + }, + { + "auxiliary_loss_clip": 0.01142855, + "auxiliary_loss_mlp": 0.01044352, + "balance_loss_clip": 1.04892683, + "balance_loss_mlp": 1.02897429, + "epoch": 0.2950849240943935, + "flos": 23972764959360.0, + "grad_norm": 2.972681393461523, + "language_loss": 0.81998217, + "learning_rate": 3.307043639752782e-06, + "loss": 0.84185427, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.15380859, + "step": 4908, + "time_per_iteration": 2.6070992946624756 + }, + { + "auxiliary_loss_clip": 0.01057729, + "auxiliary_loss_mlp": 0.01011511, + "balance_loss_clip": 1.0271225, + "balance_loss_mlp": 1.0093739, + "epoch": 0.2951450473470615, + "flos": 86639605205760.0, + "grad_norm": 1.490060213702019, + "language_loss": 0.57245433, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59314674, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.3059082, + "router_z_loss_mlp": 0.02139282, + "step": 4909, + "time_per_iteration": 3.1572985649108887 + }, + { + "auxiliary_loss_clip": 0.01140425, + "auxiliary_loss_mlp": 0.0104762, + "balance_loss_clip": 1.05088937, + "balance_loss_mlp": 1.03321385, + "epoch": 0.29520517059972945, + "flos": 28023712486560.0, + "grad_norm": 1.6771925026752454, + "language_loss": 0.86701214, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88889259, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.14385986, + "step": 4910, + "time_per_iteration": 5.6038124561309814 + }, + { + "auxiliary_loss_clip": 0.01135438, + "auxiliary_loss_mlp": 0.01038776, + "balance_loss_clip": 1.0477457, + "balance_loss_mlp": 1.02441096, + "epoch": 0.2952652938523974, + "flos": 24996612117600.0, + "grad_norm": 1.948488243596606, + "language_loss": 0.73252034, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.75426245, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.14361572, + "step": 4911, + "time_per_iteration": 2.6252570152282715 + }, + { + "auxiliary_loss_clip": 0.01140059, + "auxiliary_loss_mlp": 0.01035784, + "balance_loss_clip": 1.05064201, + "balance_loss_mlp": 1.02132428, + "epoch": 0.2953254171050654, + "flos": 23972683924800.0, + "grad_norm": 1.580417966411098, + "language_loss": 0.89626706, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.91802549, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.14465332, + "step": 4912, + "time_per_iteration": 4.094221353530884 + }, + { + "auxiliary_loss_clip": 0.01140131, + "auxiliary_loss_mlp": 0.01045769, + "balance_loss_clip": 1.04987657, + "balance_loss_mlp": 1.03099298, + "epoch": 0.29538554035773334, + "flos": 27757227921120.0, + "grad_norm": 1.6018192822170212, + "language_loss": 0.83084738, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.85270631, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.14788818, + "step": 4913, + "time_per_iteration": 2.668626308441162 + }, + { + "auxiliary_loss_clip": 0.01140789, + "auxiliary_loss_mlp": 0.01042865, + "balance_loss_clip": 1.04876709, + "balance_loss_mlp": 1.02819073, + "epoch": 0.2954456636104013, + "flos": 26688169828800.0, + "grad_norm": 1.7710639945832014, + "language_loss": 0.7683326, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79016912, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.14685059, + "step": 4914, + "time_per_iteration": 2.6441972255706787 + }, + { + "auxiliary_loss_clip": 0.01138166, + "auxiliary_loss_mlp": 0.01036143, + "balance_loss_clip": 1.04916394, + "balance_loss_mlp": 1.02139688, + "epoch": 0.2955057868630693, + "flos": 49349615113440.0, + "grad_norm": 2.119690646967475, + "language_loss": 0.81401598, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.83575904, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 0.88964844, + "router_z_loss_mlp": 0.14733887, + "step": 4915, + "time_per_iteration": 4.2579216957092285 + }, + { + "auxiliary_loss_clip": 0.01143105, + "auxiliary_loss_mlp": 0.01035931, + "balance_loss_clip": 1.0502547, + "balance_loss_mlp": 1.0211606, + "epoch": 0.29556591011573724, + "flos": 27533442218400.0, + "grad_norm": 2.1145672458073994, + "language_loss": 0.84749186, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.86928219, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.14770508, + "step": 4916, + "time_per_iteration": 2.6973202228546143 + }, + { + "auxiliary_loss_clip": 0.01136465, + "auxiliary_loss_mlp": 0.01035005, + "balance_loss_clip": 1.04801965, + "balance_loss_mlp": 1.02048528, + "epoch": 0.2956260333684052, + "flos": 26953641462240.0, + "grad_norm": 2.1877738362138057, + "language_loss": 0.70203292, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.72374761, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.1451416, + "step": 4917, + "time_per_iteration": 2.6313045024871826 + }, + { + "auxiliary_loss_clip": 0.01141612, + "auxiliary_loss_mlp": 0.01039527, + "balance_loss_clip": 1.05149043, + "balance_loss_mlp": 1.02461338, + "epoch": 0.2956861566210732, + "flos": 20054289886560.0, + "grad_norm": 1.9197320742484856, + "language_loss": 0.90877014, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93058157, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.14904785, + "step": 4918, + "time_per_iteration": 2.623297691345215 + }, + { + "auxiliary_loss_clip": 0.01142171, + "auxiliary_loss_mlp": 0.01033127, + "balance_loss_clip": 1.05095196, + "balance_loss_mlp": 1.01854157, + "epoch": 0.2957462798737412, + "flos": 31496358396960.0, + "grad_norm": 2.0997266867439968, + "language_loss": 0.7238282, + "learning_rate": 3.303797991757425e-06, + "loss": 0.74558115, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.14599609, + "step": 4919, + "time_per_iteration": 2.680025815963745 + }, + { + "auxiliary_loss_clip": 0.01137864, + "auxiliary_loss_mlp": 0.01041498, + "balance_loss_clip": 1.04907918, + "balance_loss_mlp": 1.02702618, + "epoch": 0.29580640312640916, + "flos": 20365904351520.0, + "grad_norm": 1.893727243774095, + "language_loss": 0.76370513, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.78549874, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.14483643, + "step": 4920, + "time_per_iteration": 2.6558361053466797 + }, + { + "auxiliary_loss_clip": 0.01142898, + "auxiliary_loss_mlp": 0.01047815, + "balance_loss_clip": 1.05160284, + "balance_loss_mlp": 1.03267527, + "epoch": 0.2958665263790771, + "flos": 29219246475840.0, + "grad_norm": 2.9966393007966166, + "language_loss": 0.68931115, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.71121824, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.15136719, + "step": 4921, + "time_per_iteration": 2.670093059539795 + }, + { + "auxiliary_loss_clip": 0.01143818, + "auxiliary_loss_mlp": 0.01041575, + "balance_loss_clip": 1.05120409, + "balance_loss_mlp": 1.02529109, + "epoch": 0.2959266496317451, + "flos": 22547246123520.0, + "grad_norm": 1.7786469655587793, + "language_loss": 0.75099277, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.77284676, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.16296387, + "step": 4922, + "time_per_iteration": 2.645406723022461 + }, + { + "auxiliary_loss_clip": 0.01146768, + "auxiliary_loss_mlp": 0.01035602, + "balance_loss_clip": 1.05119991, + "balance_loss_mlp": 1.02004528, + "epoch": 0.29598677288441305, + "flos": 31673839199040.0, + "grad_norm": 1.7371112567289546, + "language_loss": 0.7644788, + "learning_rate": 3.302616272134737e-06, + "loss": 0.78630251, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 0.95654297, + "router_z_loss_mlp": 0.15551758, + "step": 4923, + "time_per_iteration": 2.6849446296691895 + }, + { + "auxiliary_loss_clip": 0.01141434, + "auxiliary_loss_mlp": 0.01036019, + "balance_loss_clip": 1.05065489, + "balance_loss_mlp": 1.02110004, + "epoch": 0.296046896137081, + "flos": 30695689182240.0, + "grad_norm": 1.6895517982178805, + "language_loss": 0.86009014, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88186467, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.14923096, + "step": 4924, + "time_per_iteration": 2.6913976669311523 + }, + { + "auxiliary_loss_clip": 0.01140911, + "auxiliary_loss_mlp": 0.0103354, + "balance_loss_clip": 1.05204821, + "balance_loss_mlp": 1.01862681, + "epoch": 0.296107019389749, + "flos": 26553185303040.0, + "grad_norm": 1.4248568558471035, + "language_loss": 0.81939149, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.84113598, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.14904785, + "step": 4925, + "time_per_iteration": 2.6703174114227295 + }, + { + "auxiliary_loss_clip": 0.01138508, + "auxiliary_loss_mlp": 0.01034166, + "balance_loss_clip": 1.05063105, + "balance_loss_mlp": 1.01969397, + "epoch": 0.29616714264241695, + "flos": 21915387012960.0, + "grad_norm": 2.3470890941692475, + "language_loss": 0.8590945, + "learning_rate": 3.301729463727452e-06, + "loss": 0.88082123, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.14459229, + "step": 4926, + "time_per_iteration": 2.661048412322998 + }, + { + "auxiliary_loss_clip": 0.01140527, + "auxiliary_loss_mlp": 0.01035582, + "balance_loss_clip": 1.04937696, + "balance_loss_mlp": 1.02109206, + "epoch": 0.2962272658950849, + "flos": 18318169517760.0, + "grad_norm": 1.9347440606466786, + "language_loss": 0.86259317, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.88435423, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.14489746, + "step": 4927, + "time_per_iteration": 2.675638437271118 + }, + { + "auxiliary_loss_clip": 0.01139413, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.05156136, + "balance_loss_mlp": 1.01997113, + "epoch": 0.2962873891477529, + "flos": 17961951877920.0, + "grad_norm": 1.7513780668277579, + "language_loss": 0.80424678, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.82597947, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.13885498, + "step": 4928, + "time_per_iteration": 2.634005546569824 + }, + { + "auxiliary_loss_clip": 0.01147604, + "auxiliary_loss_mlp": 0.01036832, + "balance_loss_clip": 1.05194521, + "balance_loss_mlp": 1.01977336, + "epoch": 0.29634751240042084, + "flos": 32608925697600.0, + "grad_norm": 2.663143624307608, + "language_loss": 0.72266346, + "learning_rate": 3.300842211064773e-06, + "loss": 0.74450779, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 0.95751953, + "router_z_loss_mlp": 0.1706543, + "step": 4929, + "time_per_iteration": 2.727947473526001 + }, + { + "auxiliary_loss_clip": 0.01146069, + "auxiliary_loss_mlp": 0.01040373, + "balance_loss_clip": 1.05293751, + "balance_loss_mlp": 1.02475023, + "epoch": 0.2964076356530888, + "flos": 17779771071360.0, + "grad_norm": 2.380137125897927, + "language_loss": 0.72088087, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.74274528, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 0.93164062, + "router_z_loss_mlp": 0.15631104, + "step": 4930, + "time_per_iteration": 2.614629030227661 + }, + { + "auxiliary_loss_clip": 0.01060382, + "auxiliary_loss_mlp": 0.01003239, + "balance_loss_clip": 1.02991021, + "balance_loss_mlp": 1.00143027, + "epoch": 0.29646775890575683, + "flos": 76999491293760.0, + "grad_norm": 0.7847457464743687, + "language_loss": 0.60662603, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.62726223, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 0.01808167, + "step": 4931, + "time_per_iteration": 3.2247979640960693 + }, + { + "auxiliary_loss_clip": 0.01060291, + "auxiliary_loss_mlp": 0.01003842, + "balance_loss_clip": 1.02976274, + "balance_loss_mlp": 1.0018692, + "epoch": 0.2965278821584248, + "flos": 76954807084320.0, + "grad_norm": 0.7375675200418709, + "language_loss": 0.52399278, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54463416, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.30517578, + "router_z_loss_mlp": 0.01971436, + "step": 4932, + "time_per_iteration": 3.127565383911133 + }, + { + "auxiliary_loss_clip": 0.01138938, + "auxiliary_loss_mlp": 0.01036673, + "balance_loss_clip": 1.05108654, + "balance_loss_mlp": 1.02256489, + "epoch": 0.29658800541109276, + "flos": 29002915952640.0, + "grad_norm": 2.1041562584293314, + "language_loss": 0.81695044, + "learning_rate": 3.299658516973972e-06, + "loss": 0.83870655, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 0.87792969, + "router_z_loss_mlp": 0.14111328, + "step": 4933, + "time_per_iteration": 2.684285879135132 + }, + { + "auxiliary_loss_clip": 0.01140338, + "auxiliary_loss_mlp": 0.0102907, + "balance_loss_clip": 1.05476665, + "balance_loss_mlp": 1.01518846, + "epoch": 0.2966481286637607, + "flos": 29271061726560.0, + "grad_norm": 1.7641719998637828, + "language_loss": 0.75200003, + "learning_rate": 3.299362470215261e-06, + "loss": 0.7736941, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.13873291, + "step": 4934, + "time_per_iteration": 2.6936094760894775 + }, + { + "auxiliary_loss_clip": 0.0114573, + "auxiliary_loss_mlp": 0.0104184, + "balance_loss_clip": 1.05243635, + "balance_loss_mlp": 1.02667642, + "epoch": 0.2967082519164287, + "flos": 20944692175680.0, + "grad_norm": 2.6935227785038274, + "language_loss": 0.6251303, + "learning_rate": 3.299066374184594e-06, + "loss": 0.64700603, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 0.93359375, + "router_z_loss_mlp": 0.15179443, + "step": 4935, + "time_per_iteration": 2.6678037643432617 + }, + { + "auxiliary_loss_clip": 0.01141418, + "auxiliary_loss_mlp": 0.01038546, + "balance_loss_clip": 1.05337644, + "balance_loss_mlp": 1.02332258, + "epoch": 0.29676837516909665, + "flos": 35859609182880.0, + "grad_norm": 2.2143103941996407, + "language_loss": 0.79463392, + "learning_rate": 3.2987702288932e-06, + "loss": 0.81643355, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.15222168, + "step": 4936, + "time_per_iteration": 2.7832794189453125 + }, + { + "auxiliary_loss_clip": 0.01144645, + "auxiliary_loss_mlp": 0.01041687, + "balance_loss_clip": 1.05301619, + "balance_loss_mlp": 1.02603483, + "epoch": 0.2968284984217646, + "flos": 42404566396320.0, + "grad_norm": 1.6583808994103608, + "language_loss": 0.74067295, + "learning_rate": 3.298474034352309e-06, + "loss": 0.76253629, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.15649414, + "step": 4937, + "time_per_iteration": 2.8052327632904053 + }, + { + "auxiliary_loss_clip": 0.01143204, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.05422032, + "balance_loss_mlp": 1.01984441, + "epoch": 0.2968886216744326, + "flos": 26288159359680.0, + "grad_norm": 1.5598595735857603, + "language_loss": 0.77940345, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.80118346, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.14941406, + "step": 4938, + "time_per_iteration": 2.7032206058502197 + }, + { + "auxiliary_loss_clip": 0.01148046, + "auxiliary_loss_mlp": 0.01040808, + "balance_loss_clip": 1.05582666, + "balance_loss_mlp": 1.02557349, + "epoch": 0.29694874492710055, + "flos": 15602481027360.0, + "grad_norm": 2.0056292576031933, + "language_loss": 0.76709902, + "learning_rate": 3.297881497566964e-06, + "loss": 0.78898758, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.15234375, + "step": 4939, + "time_per_iteration": 2.6635143756866455 + }, + { + "auxiliary_loss_clip": 0.0114847, + "auxiliary_loss_mlp": 0.01033468, + "balance_loss_clip": 1.05592275, + "balance_loss_mlp": 1.01875734, + "epoch": 0.2970088681797685, + "flos": 29980782348480.0, + "grad_norm": 1.6828512474987862, + "language_loss": 0.78336865, + "learning_rate": 3.297585155344979e-06, + "loss": 0.80518806, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 0.92675781, + "router_z_loss_mlp": 0.14715576, + "step": 4940, + "time_per_iteration": 2.727558135986328 + }, + { + "auxiliary_loss_clip": 0.01144383, + "auxiliary_loss_mlp": 0.01034632, + "balance_loss_clip": 1.05200791, + "balance_loss_mlp": 1.01828814, + "epoch": 0.2970689914324365, + "flos": 28869227979840.0, + "grad_norm": 1.4571066744385233, + "language_loss": 0.75411952, + "learning_rate": 3.297288763918435e-06, + "loss": 0.77590972, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 0.92431641, + "router_z_loss_mlp": 0.16326904, + "step": 4941, + "time_per_iteration": 2.6742665767669678 + }, + { + "auxiliary_loss_clip": 0.01147384, + "auxiliary_loss_mlp": 0.01047946, + "balance_loss_clip": 1.05296659, + "balance_loss_mlp": 1.03249633, + "epoch": 0.29712911468510445, + "flos": 48407802746400.0, + "grad_norm": 26.480789753502563, + "language_loss": 0.7348671, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.75682044, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.15454102, + "step": 4942, + "time_per_iteration": 2.8319318294525146 + }, + { + "auxiliary_loss_clip": 0.01147156, + "auxiliary_loss_mlp": 0.01039533, + "balance_loss_clip": 1.05283809, + "balance_loss_mlp": 1.02376175, + "epoch": 0.2971892379377724, + "flos": 32208104882880.0, + "grad_norm": 1.918100469507776, + "language_loss": 0.70252705, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.7243939, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 0.94384766, + "router_z_loss_mlp": 0.15771484, + "step": 4943, + "time_per_iteration": 2.690695285797119 + }, + { + "auxiliary_loss_clip": 0.01145931, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.05306816, + "balance_loss_mlp": 1.02238238, + "epoch": 0.2972493611904404, + "flos": 21479200963200.0, + "grad_norm": 6.743311276546623, + "language_loss": 0.79272938, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.81456649, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.15393066, + "step": 4944, + "time_per_iteration": 2.666855573654175 + }, + { + "auxiliary_loss_clip": 0.0113904, + "auxiliary_loss_mlp": 0.01034374, + "balance_loss_clip": 1.05112231, + "balance_loss_mlp": 1.02065897, + "epoch": 0.2973094844431084, + "flos": 24907648871520.0, + "grad_norm": 2.317913201666277, + "language_loss": 0.82957369, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85130787, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.1373291, + "step": 4945, + "time_per_iteration": 2.6646645069122314 + }, + { + "auxiliary_loss_clip": 0.01140591, + "auxiliary_loss_mlp": 0.01032762, + "balance_loss_clip": 1.05212021, + "balance_loss_mlp": 1.01857626, + "epoch": 0.29736960769577636, + "flos": 21345350921280.0, + "grad_norm": 1.9210176452337369, + "language_loss": 0.66760439, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.68933785, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.14172363, + "step": 4946, + "time_per_iteration": 2.7705154418945312 + }, + { + "auxiliary_loss_clip": 0.01144401, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.05324411, + "balance_loss_mlp": 1.02173758, + "epoch": 0.2974297309484443, + "flos": 31852535519520.0, + "grad_norm": 2.0086767673669264, + "language_loss": 0.73665917, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.75846052, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.13995361, + "step": 4947, + "time_per_iteration": 2.707000494003296 + }, + { + "auxiliary_loss_clip": 0.01144744, + "auxiliary_loss_mlp": 0.01044304, + "balance_loss_clip": 1.05262518, + "balance_loss_mlp": 1.0284965, + "epoch": 0.2974898542011123, + "flos": 31318593973920.0, + "grad_norm": 1.9028855834707, + "language_loss": 0.7309202, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75281066, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 0.15795898, + "step": 4948, + "time_per_iteration": 2.6989877223968506 + }, + { + "auxiliary_loss_clip": 0.01138574, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.04924941, + "balance_loss_mlp": 1.01764369, + "epoch": 0.29754997745378026, + "flos": 22770424067040.0, + "grad_norm": 1.9882970226075893, + "language_loss": 0.84085512, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.86256015, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.14300537, + "step": 4949, + "time_per_iteration": 4.173006057739258 + }, + { + "auxiliary_loss_clip": 0.01138282, + "auxiliary_loss_mlp": 0.01035898, + "balance_loss_clip": 1.04950511, + "balance_loss_mlp": 1.02102637, + "epoch": 0.2976101007064482, + "flos": 27179493546240.0, + "grad_norm": 2.3581493056845533, + "language_loss": 0.71230805, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73404986, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.14880371, + "step": 4950, + "time_per_iteration": 2.742133617401123 + }, + { + "auxiliary_loss_clip": 0.01136271, + "auxiliary_loss_mlp": 0.01040122, + "balance_loss_clip": 1.05092788, + "balance_loss_mlp": 1.02586472, + "epoch": 0.2976702239591162, + "flos": 26778146006880.0, + "grad_norm": 2.043334584170769, + "language_loss": 0.82703644, + "learning_rate": 3.294322145875789e-06, + "loss": 0.8488003, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.14257812, + "step": 4951, + "time_per_iteration": 2.643815755844116 + }, + { + "auxiliary_loss_clip": 0.01138086, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.04783726, + "balance_loss_mlp": 1.01781309, + "epoch": 0.29773034721178415, + "flos": 30063343864320.0, + "grad_norm": 4.625727744376408, + "language_loss": 0.73929304, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.76100689, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.1550293, + "step": 4952, + "time_per_iteration": 4.207516193389893 + }, + { + "auxiliary_loss_clip": 0.01141798, + "auxiliary_loss_mlp": 0.01037203, + "balance_loss_clip": 1.05235696, + "balance_loss_mlp": 1.0218606, + "epoch": 0.2977904704644521, + "flos": 25084440879840.0, + "grad_norm": 1.6315842191893581, + "language_loss": 0.84003055, + "learning_rate": 3.293728232937228e-06, + "loss": 0.86182052, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.15332031, + "step": 4953, + "time_per_iteration": 2.632750988006592 + }, + { + "auxiliary_loss_clip": 0.01140824, + "auxiliary_loss_mlp": 0.0103894, + "balance_loss_clip": 1.05110943, + "balance_loss_mlp": 1.02477765, + "epoch": 0.2978505937171201, + "flos": 23081754911040.0, + "grad_norm": 2.2276640935866006, + "language_loss": 0.74375468, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.76555234, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.14160156, + "step": 4954, + "time_per_iteration": 4.053887844085693 + }, + { + "auxiliary_loss_clip": 0.0113702, + "auxiliary_loss_mlp": 0.01035299, + "balance_loss_clip": 1.0482527, + "balance_loss_mlp": 1.02100611, + "epoch": 0.29791071696978805, + "flos": 23571457937280.0, + "grad_norm": 2.023514914224409, + "language_loss": 0.75611532, + "learning_rate": 3.293134123765452e-06, + "loss": 0.77783859, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.14306641, + "step": 4955, + "time_per_iteration": 2.6353979110717773 + }, + { + "auxiliary_loss_clip": 0.01142249, + "auxiliary_loss_mlp": 0.01033488, + "balance_loss_clip": 1.05062163, + "balance_loss_mlp": 1.01878953, + "epoch": 0.297970840222456, + "flos": 22947985903680.0, + "grad_norm": 1.7705372349242006, + "language_loss": 0.72276807, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.74452543, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 0.91796875, + "router_z_loss_mlp": 0.14691162, + "step": 4956, + "time_per_iteration": 2.6069302558898926 + }, + { + "auxiliary_loss_clip": 0.01140547, + "auxiliary_loss_mlp": 0.01041449, + "balance_loss_clip": 1.04725814, + "balance_loss_mlp": 1.0257194, + "epoch": 0.298030963475124, + "flos": 27885527095680.0, + "grad_norm": 1.6834676888714644, + "language_loss": 0.78904796, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81086791, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 0.93310547, + "router_z_loss_mlp": 0.1572876, + "step": 4957, + "time_per_iteration": 2.6457178592681885 + }, + { + "auxiliary_loss_clip": 0.01140851, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.04934621, + "balance_loss_mlp": 1.02047575, + "epoch": 0.298091086727792, + "flos": 26684361204480.0, + "grad_norm": 1.548245981648508, + "language_loss": 0.70624769, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.72801578, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.15478516, + "step": 4958, + "time_per_iteration": 2.62703800201416 + }, + { + "auxiliary_loss_clip": 0.01136177, + "auxiliary_loss_mlp": 0.01040912, + "balance_loss_clip": 1.04916453, + "balance_loss_mlp": 1.02613032, + "epoch": 0.29815120998045996, + "flos": 25838156917440.0, + "grad_norm": 2.8809950523025862, + "language_loss": 0.78673816, + "learning_rate": 3.291945317082743e-06, + "loss": 0.80850911, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.14782715, + "step": 4959, + "time_per_iteration": 2.658487319946289 + }, + { + "auxiliary_loss_clip": 0.01136146, + "auxiliary_loss_mlp": 0.01041268, + "balance_loss_clip": 1.04798627, + "balance_loss_mlp": 1.02680755, + "epoch": 0.29821133323312793, + "flos": 24277896659520.0, + "grad_norm": 1.7691751718615787, + "language_loss": 0.79708856, + "learning_rate": 3.291647992907147e-06, + "loss": 0.81886268, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14459229, + "step": 4960, + "time_per_iteration": 2.6497867107391357 + }, + { + "auxiliary_loss_clip": 0.01139996, + "auxiliary_loss_mlp": 0.01042963, + "balance_loss_clip": 1.0480026, + "balance_loss_mlp": 1.02692342, + "epoch": 0.2982714564857959, + "flos": 15557877852480.0, + "grad_norm": 2.6512665881037623, + "language_loss": 0.74311799, + "learning_rate": 3.291350619752129e-06, + "loss": 0.76494753, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.16033936, + "step": 4961, + "time_per_iteration": 2.637267589569092 + }, + { + "auxiliary_loss_clip": 0.01137781, + "auxiliary_loss_mlp": 0.0103538, + "balance_loss_clip": 1.04843891, + "balance_loss_mlp": 1.0212661, + "epoch": 0.29833157973846386, + "flos": 27176454750240.0, + "grad_norm": 1.82329826988343, + "language_loss": 0.62361425, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64534587, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.14123535, + "step": 4962, + "time_per_iteration": 2.7213025093078613 + }, + { + "auxiliary_loss_clip": 0.0113717, + "auxiliary_loss_mlp": 0.0103961, + "balance_loss_clip": 1.04763424, + "balance_loss_mlp": 1.02377903, + "epoch": 0.2983917029911318, + "flos": 18758164191840.0, + "grad_norm": 1.8247042192089618, + "language_loss": 0.83391488, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85568267, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 0.89501953, + "router_z_loss_mlp": 0.1583252, + "step": 4963, + "time_per_iteration": 2.6515800952911377 + }, + { + "auxiliary_loss_clip": 0.01138482, + "auxiliary_loss_mlp": 0.01037489, + "balance_loss_clip": 1.05022788, + "balance_loss_mlp": 1.02275527, + "epoch": 0.2984518262437998, + "flos": 18763390920960.0, + "grad_norm": 2.2652005829206274, + "language_loss": 0.66332501, + "learning_rate": 3.290458206523322e-06, + "loss": 0.6850847, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.14733887, + "step": 4964, + "time_per_iteration": 2.6171932220458984 + }, + { + "auxiliary_loss_clip": 0.01135317, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.04744518, + "balance_loss_mlp": 1.01956797, + "epoch": 0.29851194949646775, + "flos": 22096271266560.0, + "grad_norm": 2.235452856229034, + "language_loss": 0.70982218, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.73150748, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.13653564, + "step": 4965, + "time_per_iteration": 2.677659034729004 + }, + { + "auxiliary_loss_clip": 0.01142596, + "auxiliary_loss_mlp": 0.01046322, + "balance_loss_clip": 1.05168581, + "balance_loss_mlp": 1.03074098, + "epoch": 0.2985720727491357, + "flos": 26866744597440.0, + "grad_norm": 2.8896910990607827, + "language_loss": 0.65896571, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68085492, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.15576172, + "step": 4966, + "time_per_iteration": 2.692556142807007 + }, + { + "auxiliary_loss_clip": 0.01141933, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.05094612, + "balance_loss_mlp": 1.02420187, + "epoch": 0.2986321960018037, + "flos": 15912596352960.0, + "grad_norm": 3.515761970081767, + "language_loss": 0.73754668, + "learning_rate": 3.289565352885785e-06, + "loss": 0.75935847, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 0.91064453, + "router_z_loss_mlp": 0.1505127, + "step": 4967, + "time_per_iteration": 2.6304476261138916 + }, + { + "auxiliary_loss_clip": 0.01135545, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.04629731, + "balance_loss_mlp": 1.01827765, + "epoch": 0.29869231925447165, + "flos": 17649810688320.0, + "grad_norm": 2.2309402655131136, + "language_loss": 0.71070266, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73237801, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.13720703, + "step": 4968, + "time_per_iteration": 2.6211743354797363 + }, + { + "auxiliary_loss_clip": 0.01137111, + "auxiliary_loss_mlp": 0.01033669, + "balance_loss_clip": 1.04625535, + "balance_loss_mlp": 1.01885724, + "epoch": 0.2987524425071396, + "flos": 38620468090080.0, + "grad_norm": 1.7570480370048214, + "language_loss": 0.76903135, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.79073918, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.14825439, + "step": 4969, + "time_per_iteration": 2.7085697650909424 + }, + { + "auxiliary_loss_clip": 0.0113678, + "auxiliary_loss_mlp": 0.01034167, + "balance_loss_clip": 1.04842496, + "balance_loss_mlp": 1.02092266, + "epoch": 0.2988125657598076, + "flos": 26153215351200.0, + "grad_norm": 1.9311549084951956, + "language_loss": 0.69922185, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.72093129, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.13250732, + "step": 4970, + "time_per_iteration": 2.67250657081604 + }, + { + "auxiliary_loss_clip": 0.01139857, + "auxiliary_loss_mlp": 0.01039369, + "balance_loss_clip": 1.04702115, + "balance_loss_mlp": 1.02376437, + "epoch": 0.2988726890124756, + "flos": 22057381028160.0, + "grad_norm": 5.859484918214417, + "language_loss": 0.84851658, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.87030888, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.15588379, + "step": 4971, + "time_per_iteration": 2.622671365737915 + }, + { + "auxiliary_loss_clip": 0.01134309, + "auxiliary_loss_mlp": 0.01042935, + "balance_loss_clip": 1.04855347, + "balance_loss_mlp": 1.02773595, + "epoch": 0.29893281226514357, + "flos": 26546986159200.0, + "grad_norm": 2.045962750252344, + "language_loss": 0.79282898, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.81460142, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.15222168, + "step": 4972, + "time_per_iteration": 2.672269344329834 + }, + { + "auxiliary_loss_clip": 0.01138476, + "auxiliary_loss_mlp": 0.01045984, + "balance_loss_clip": 1.04931736, + "balance_loss_mlp": 1.03155971, + "epoch": 0.29899293551781153, + "flos": 20543911878240.0, + "grad_norm": 1.875239084097116, + "language_loss": 0.8551538, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87699836, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.14428711, + "step": 4973, + "time_per_iteration": 2.5858590602874756 + }, + { + "auxiliary_loss_clip": 0.0113258, + "auxiliary_loss_mlp": 0.01035262, + "balance_loss_clip": 1.04956937, + "balance_loss_mlp": 1.02054536, + "epoch": 0.2990530587704795, + "flos": 14311298440800.0, + "grad_norm": 1.6670758683517792, + "language_loss": 0.77554023, + "learning_rate": 3.287480316742863e-06, + "loss": 0.79721868, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.14709473, + "step": 4974, + "time_per_iteration": 2.6466856002807617 + }, + { + "auxiliary_loss_clip": 0.01139357, + "auxiliary_loss_mlp": 0.0103799, + "balance_loss_clip": 1.04919982, + "balance_loss_mlp": 1.02367878, + "epoch": 0.29911318202314746, + "flos": 34214680510560.0, + "grad_norm": 1.692679623111028, + "language_loss": 0.72400033, + "learning_rate": 3.287182259060815e-06, + "loss": 0.74577379, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.14312744, + "step": 4975, + "time_per_iteration": 2.7140052318573 + }, + { + "auxiliary_loss_clip": 0.01137388, + "auxiliary_loss_mlp": 0.01039972, + "balance_loss_clip": 1.04975843, + "balance_loss_mlp": 1.02493358, + "epoch": 0.2991733052758154, + "flos": 22858617484800.0, + "grad_norm": 2.8579268095499524, + "language_loss": 0.75833631, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78010988, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.15045166, + "step": 4976, + "time_per_iteration": 2.6165008544921875 + }, + { + "auxiliary_loss_clip": 0.01134883, + "auxiliary_loss_mlp": 0.01038428, + "balance_loss_clip": 1.04869843, + "balance_loss_mlp": 1.02443886, + "epoch": 0.2992334285284834, + "flos": 18984988690560.0, + "grad_norm": 2.412244789570938, + "language_loss": 0.86572623, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.8874594, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.13983154, + "step": 4977, + "time_per_iteration": 2.6023108959198 + }, + { + "auxiliary_loss_clip": 0.01139483, + "auxiliary_loss_mlp": 0.01037533, + "balance_loss_clip": 1.05091143, + "balance_loss_mlp": 1.02327013, + "epoch": 0.29929355178115136, + "flos": 26597626408800.0, + "grad_norm": 1.6034090401003542, + "language_loss": 0.6838516, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.70562172, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.14257812, + "step": 4978, + "time_per_iteration": 2.6783533096313477 + }, + { + "auxiliary_loss_clip": 0.01142043, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.05271971, + "balance_loss_mlp": 1.0218513, + "epoch": 0.2993536750338193, + "flos": 25842127610880.0, + "grad_norm": 1.8085864655401747, + "language_loss": 0.76080632, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.78259718, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.15185547, + "step": 4979, + "time_per_iteration": 2.625708818435669 + }, + { + "auxiliary_loss_clip": 0.01134466, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.04491556, + "balance_loss_mlp": 1.02474976, + "epoch": 0.2994137982864873, + "flos": 39196541256480.0, + "grad_norm": 1.7312064915000307, + "language_loss": 0.68893027, + "learning_rate": 3.285691238725484e-06, + "loss": 0.71066809, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.14575195, + "step": 4980, + "time_per_iteration": 2.7386415004730225 + }, + { + "auxiliary_loss_clip": 0.01135905, + "auxiliary_loss_mlp": 0.01040307, + "balance_loss_clip": 1.0506494, + "balance_loss_mlp": 1.02600765, + "epoch": 0.29947392153915525, + "flos": 25752921261120.0, + "grad_norm": 2.359674340403075, + "language_loss": 0.73680329, + "learning_rate": 3.285392888352555e-06, + "loss": 0.75856543, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.14300537, + "step": 4981, + "time_per_iteration": 2.7299718856811523 + }, + { + "auxiliary_loss_clip": 0.01136944, + "auxiliary_loss_mlp": 0.01040349, + "balance_loss_clip": 1.04486299, + "balance_loss_mlp": 1.02616858, + "epoch": 0.2995340447918232, + "flos": 26286498151200.0, + "grad_norm": 1.6488965507930964, + "language_loss": 0.86543584, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.8872087, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 0.91992188, + "router_z_loss_mlp": 0.1418457, + "step": 4982, + "time_per_iteration": 2.6697239875793457 + }, + { + "auxiliary_loss_clip": 0.0113978, + "auxiliary_loss_mlp": 0.0103773, + "balance_loss_clip": 1.04753375, + "balance_loss_mlp": 1.02160132, + "epoch": 0.2995941680444912, + "flos": 20407954937760.0, + "grad_norm": 2.6242784636384693, + "language_loss": 0.86235434, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.88412946, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.16137695, + "step": 4983, + "time_per_iteration": 2.659627676010132 + }, + { + "auxiliary_loss_clip": 0.01134005, + "auxiliary_loss_mlp": 0.01040701, + "balance_loss_clip": 1.0463829, + "balance_loss_mlp": 1.02694452, + "epoch": 0.2996542912971592, + "flos": 25530877801440.0, + "grad_norm": 2.3316033720651594, + "language_loss": 0.78536654, + "learning_rate": 3.284497544825668e-06, + "loss": 0.80711359, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.13763428, + "step": 4984, + "time_per_iteration": 2.6611580848693848 + }, + { + "auxiliary_loss_clip": 0.0113878, + "auxiliary_loss_mlp": 0.01044128, + "balance_loss_clip": 1.04891908, + "balance_loss_mlp": 1.02851176, + "epoch": 0.29971441454982717, + "flos": 30603079380960.0, + "grad_norm": 1.5997995056764809, + "language_loss": 0.78316098, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.80499005, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.15600586, + "step": 4985, + "time_per_iteration": 2.676830530166626 + }, + { + "auxiliary_loss_clip": 0.01141761, + "auxiliary_loss_mlp": 0.01038142, + "balance_loss_clip": 1.04956388, + "balance_loss_mlp": 1.02181053, + "epoch": 0.29977453780249513, + "flos": 64131249320640.0, + "grad_norm": 3.0059327523976496, + "language_loss": 0.71413314, + "learning_rate": 3.283900405580837e-06, + "loss": 0.73593223, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.16333008, + "step": 4986, + "time_per_iteration": 2.9211678504943848 + }, + { + "auxiliary_loss_clip": 0.01139139, + "auxiliary_loss_mlp": 0.01042555, + "balance_loss_clip": 1.04715085, + "balance_loss_mlp": 1.02714145, + "epoch": 0.2998346610551631, + "flos": 27133877439360.0, + "grad_norm": 1.9088821295439384, + "language_loss": 0.73159969, + "learning_rate": 3.283601762924312e-06, + "loss": 0.75341666, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 0.91943359, + "router_z_loss_mlp": 0.1541748, + "step": 4987, + "time_per_iteration": 2.6601452827453613 + }, + { + "auxiliary_loss_clip": 0.01134403, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.04713166, + "balance_loss_mlp": 1.0217905, + "epoch": 0.29989478430783106, + "flos": 20588434018560.0, + "grad_norm": 1.8276231531790623, + "language_loss": 0.80605114, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.82775259, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.1394043, + "step": 4988, + "time_per_iteration": 2.664654493331909 + }, + { + "auxiliary_loss_clip": 0.01135616, + "auxiliary_loss_mlp": 0.0104541, + "balance_loss_clip": 1.04732382, + "balance_loss_mlp": 1.03033018, + "epoch": 0.29995490756049903, + "flos": 29003078021760.0, + "grad_norm": 1.525024920953211, + "language_loss": 0.70760119, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.72941148, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.15081787, + "step": 4989, + "time_per_iteration": 5.542206287384033 + }, + { + "auxiliary_loss_clip": 0.01139807, + "auxiliary_loss_mlp": 0.01039465, + "balance_loss_clip": 1.04930091, + "balance_loss_mlp": 1.02427745, + "epoch": 0.300015030813167, + "flos": 17650134826560.0, + "grad_norm": 2.3369471014392396, + "language_loss": 0.85596085, + "learning_rate": 3.282705542954199e-06, + "loss": 0.87775356, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.15185547, + "step": 4990, + "time_per_iteration": 2.6487741470336914 + }, + { + "auxiliary_loss_clip": 0.01138586, + "auxiliary_loss_mlp": 0.0103444, + "balance_loss_clip": 1.04674602, + "balance_loss_mlp": 1.01885939, + "epoch": 0.30007515406583496, + "flos": 30739644080640.0, + "grad_norm": 3.641195121801913, + "language_loss": 0.67158079, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69331104, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 0.91748047, + "router_z_loss_mlp": 0.15576172, + "step": 4991, + "time_per_iteration": 4.3462042808532715 + }, + { + "auxiliary_loss_clip": 0.01139392, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.04645157, + "balance_loss_mlp": 1.02368188, + "epoch": 0.3001352773185029, + "flos": 23661028942560.0, + "grad_norm": 1.903931336021997, + "language_loss": 0.78769517, + "learning_rate": 3.28210781975363e-06, + "loss": 0.80948699, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.16088867, + "step": 4992, + "time_per_iteration": 2.708926200866699 + }, + { + "auxiliary_loss_clip": 0.01136053, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.047261, + "balance_loss_mlp": 1.02034092, + "epoch": 0.3001954005711709, + "flos": 26287875738720.0, + "grad_norm": 1.9386195070739085, + "language_loss": 0.82906306, + "learning_rate": 3.281808885221193e-06, + "loss": 0.85077906, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.15197754, + "step": 4993, + "time_per_iteration": 2.6921422481536865 + }, + { + "auxiliary_loss_clip": 0.01141793, + "auxiliary_loss_mlp": 0.01045651, + "balance_loss_clip": 1.04734015, + "balance_loss_mlp": 1.02915227, + "epoch": 0.30025552382383885, + "flos": 21212108638560.0, + "grad_norm": 2.165184933238382, + "language_loss": 0.86184019, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.88371462, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 0.94482422, + "router_z_loss_mlp": 0.16479492, + "step": 4994, + "time_per_iteration": 4.200507640838623 + }, + { + "auxiliary_loss_clip": 0.01139585, + "auxiliary_loss_mlp": 0.01036539, + "balance_loss_clip": 1.04985392, + "balance_loss_mlp": 1.02188826, + "epoch": 0.3003156470765068, + "flos": 36037333088640.0, + "grad_norm": 1.5864322933587556, + "language_loss": 0.81445843, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.83621967, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.14642334, + "step": 4995, + "time_per_iteration": 2.7352848052978516 + }, + { + "auxiliary_loss_clip": 0.01135596, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.04871082, + "balance_loss_mlp": 1.0214293, + "epoch": 0.3003757703291748, + "flos": 53259824661120.0, + "grad_norm": 1.8002826297362506, + "language_loss": 0.67665839, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.69837409, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.14556885, + "step": 4996, + "time_per_iteration": 2.830453395843506 + }, + { + "auxiliary_loss_clip": 0.01136609, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.04695868, + "balance_loss_mlp": 1.01984048, + "epoch": 0.30043589358184275, + "flos": 27489325250880.0, + "grad_norm": 1.8764334682942734, + "language_loss": 0.75002611, + "learning_rate": 3.280612661141615e-06, + "loss": 0.77174306, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.15240479, + "step": 4997, + "time_per_iteration": 2.633808135986328 + }, + { + "auxiliary_loss_clip": 0.01133272, + "auxiliary_loss_mlp": 0.01043076, + "balance_loss_clip": 1.04626787, + "balance_loss_mlp": 1.02908039, + "epoch": 0.30049601683451077, + "flos": 25619273805600.0, + "grad_norm": 2.253531353392832, + "language_loss": 0.77414608, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.79590952, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.14001465, + "step": 4998, + "time_per_iteration": 2.6894710063934326 + }, + { + "auxiliary_loss_clip": 0.01133571, + "auxiliary_loss_mlp": 0.0104052, + "balance_loss_clip": 1.04843926, + "balance_loss_mlp": 1.02663219, + "epoch": 0.30055614008717874, + "flos": 29182503653280.0, + "grad_norm": 4.38037328019079, + "language_loss": 0.73334527, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.75508618, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.13891602, + "step": 4999, + "time_per_iteration": 2.670825719833374 + }, + { + "auxiliary_loss_clip": 0.0113791, + "auxiliary_loss_mlp": 0.01037923, + "balance_loss_clip": 1.04932976, + "balance_loss_mlp": 1.02371275, + "epoch": 0.3006162633398467, + "flos": 23391100408320.0, + "grad_norm": 1.6215015560430568, + "language_loss": 0.75661933, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.77837765, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.14221191, + "step": 5000, + "time_per_iteration": 2.6439080238342285 + }, + { + "auxiliary_loss_clip": 0.011355, + "auxiliary_loss_mlp": 0.01039675, + "balance_loss_clip": 1.04982364, + "balance_loss_mlp": 1.02651453, + "epoch": 0.30067638659251467, + "flos": 17911635766560.0, + "grad_norm": 1.8054741516075243, + "language_loss": 0.81569183, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.83744353, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.13165283, + "step": 5001, + "time_per_iteration": 2.628906011581421 + }, + { + "auxiliary_loss_clip": 0.01138104, + "auxiliary_loss_mlp": 0.01043501, + "balance_loss_clip": 1.04996729, + "balance_loss_mlp": 1.02834916, + "epoch": 0.30073650984518263, + "flos": 28513861202880.0, + "grad_norm": 1.7230139314363608, + "language_loss": 0.80388016, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.82569623, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.15155029, + "step": 5002, + "time_per_iteration": 2.634483814239502 + }, + { + "auxiliary_loss_clip": 0.01143311, + "auxiliary_loss_mlp": 0.0103154, + "balance_loss_clip": 1.05097556, + "balance_loss_mlp": 1.01671004, + "epoch": 0.3007966330978506, + "flos": 28024077142080.0, + "grad_norm": 2.4661964051230707, + "language_loss": 0.70889938, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73064792, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 0.92236328, + "router_z_loss_mlp": 0.14831543, + "step": 5003, + "time_per_iteration": 2.698556423187256 + }, + { + "auxiliary_loss_clip": 0.01140605, + "auxiliary_loss_mlp": 0.01043604, + "balance_loss_clip": 1.04913855, + "balance_loss_mlp": 1.02867925, + "epoch": 0.30085675635051856, + "flos": 33944711459040.0, + "grad_norm": 1.9540710325270474, + "language_loss": 0.7080152, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72985733, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 0.91552734, + "router_z_loss_mlp": 0.14929199, + "step": 5004, + "time_per_iteration": 2.727490186691284 + }, + { + "auxiliary_loss_clip": 0.01139656, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.04974627, + "balance_loss_mlp": 1.02861691, + "epoch": 0.3009168796031865, + "flos": 13279023688320.0, + "grad_norm": 2.7674733342410556, + "language_loss": 0.81932938, + "learning_rate": 3.278217882782715e-06, + "loss": 0.84115648, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 0.89794922, + "router_z_loss_mlp": 0.14434814, + "step": 5005, + "time_per_iteration": 2.6811299324035645 + }, + { + "auxiliary_loss_clip": 0.01137637, + "auxiliary_loss_mlp": 0.01035314, + "balance_loss_clip": 1.04991341, + "balance_loss_mlp": 1.02097332, + "epoch": 0.3009770028558545, + "flos": 29047640679360.0, + "grad_norm": 2.6774141401571914, + "language_loss": 0.74648255, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.76821208, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.14337158, + "step": 5006, + "time_per_iteration": 2.666485548019409 + }, + { + "auxiliary_loss_clip": 0.01135349, + "auxiliary_loss_mlp": 0.01038455, + "balance_loss_clip": 1.04911828, + "balance_loss_mlp": 1.02474046, + "epoch": 0.30103712610852246, + "flos": 32297068128960.0, + "grad_norm": 1.9215750488344692, + "language_loss": 0.71206629, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.73380429, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.137146, + "step": 5007, + "time_per_iteration": 2.717910051345825 + }, + { + "auxiliary_loss_clip": 0.01137226, + "auxiliary_loss_mlp": 0.01035211, + "balance_loss_clip": 1.04911923, + "balance_loss_mlp": 1.02004766, + "epoch": 0.3010972493611904, + "flos": 27889700375520.0, + "grad_norm": 2.1748983669175037, + "language_loss": 0.762972, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.7846964, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 0.88134766, + "router_z_loss_mlp": 0.1517334, + "step": 5008, + "time_per_iteration": 2.7159264087677 + }, + { + "auxiliary_loss_clip": 0.01138205, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.04944503, + "balance_loss_mlp": 1.02254927, + "epoch": 0.3011573726138584, + "flos": 29351273240160.0, + "grad_norm": 1.9698495017838655, + "language_loss": 0.84471828, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.86647183, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.14611816, + "step": 5009, + "time_per_iteration": 2.6945416927337646 + }, + { + "auxiliary_loss_clip": 0.01141343, + "auxiliary_loss_mlp": 0.01039628, + "balance_loss_clip": 1.04947376, + "balance_loss_mlp": 1.02338028, + "epoch": 0.30121749586652635, + "flos": 24721497371520.0, + "grad_norm": 2.1471008134719956, + "language_loss": 0.83345079, + "learning_rate": 3.276719570659604e-06, + "loss": 0.85526055, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.16247559, + "step": 5010, + "time_per_iteration": 2.6414308547973633 + }, + { + "auxiliary_loss_clip": 0.01135166, + "auxiliary_loss_mlp": 0.01034173, + "balance_loss_clip": 1.04756987, + "balance_loss_mlp": 1.02051163, + "epoch": 0.3012776191191944, + "flos": 32876463712320.0, + "grad_norm": 2.1199038182749415, + "language_loss": 0.85639381, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.87808716, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.13665771, + "step": 5011, + "time_per_iteration": 2.7855210304260254 + }, + { + "auxiliary_loss_clip": 0.01138195, + "auxiliary_loss_mlp": 0.01037828, + "balance_loss_clip": 1.04681182, + "balance_loss_mlp": 1.02230048, + "epoch": 0.30133774237186234, + "flos": 24907122146880.0, + "grad_norm": 3.105921429599973, + "language_loss": 0.71985638, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74161661, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 0.9140625, + "router_z_loss_mlp": 0.15533447, + "step": 5012, + "time_per_iteration": 2.641003131866455 + }, + { + "auxiliary_loss_clip": 0.01137245, + "auxiliary_loss_mlp": 0.01037962, + "balance_loss_clip": 1.04775715, + "balance_loss_mlp": 1.02362108, + "epoch": 0.3013978656245303, + "flos": 24150691451520.0, + "grad_norm": 2.179282882433934, + "language_loss": 0.87654358, + "learning_rate": 3.275820002334819e-06, + "loss": 0.89829564, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.14337158, + "step": 5013, + "time_per_iteration": 2.7000420093536377 + }, + { + "auxiliary_loss_clip": 0.01142108, + "auxiliary_loss_mlp": 0.01039195, + "balance_loss_clip": 1.05019879, + "balance_loss_mlp": 1.02339935, + "epoch": 0.30145798887719827, + "flos": 19827830043360.0, + "grad_norm": 4.352356542304662, + "language_loss": 0.82959962, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.85141253, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 0.91845703, + "router_z_loss_mlp": 0.15795898, + "step": 5014, + "time_per_iteration": 2.6904096603393555 + }, + { + "auxiliary_loss_clip": 0.01133031, + "auxiliary_loss_mlp": 0.01034963, + "balance_loss_clip": 1.04772449, + "balance_loss_mlp": 1.02089632, + "epoch": 0.30151811212986623, + "flos": 29982767695200.0, + "grad_norm": 1.6124354236434764, + "language_loss": 0.68027526, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.7019552, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.140625, + "step": 5015, + "time_per_iteration": 2.7962465286254883 + }, + { + "auxiliary_loss_clip": 0.01136355, + "auxiliary_loss_mlp": 0.0103923, + "balance_loss_clip": 1.0478344, + "balance_loss_mlp": 1.02413845, + "epoch": 0.3015782353825342, + "flos": 26688412932480.0, + "grad_norm": 3.5082206625743915, + "language_loss": 0.74974501, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.77150089, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.15081787, + "step": 5016, + "time_per_iteration": 2.6938798427581787 + }, + { + "auxiliary_loss_clip": 0.0114127, + "auxiliary_loss_mlp": 0.01035532, + "balance_loss_clip": 1.05130816, + "balance_loss_mlp": 1.02092934, + "epoch": 0.30163835863520216, + "flos": 35104839696000.0, + "grad_norm": 2.2441520393059746, + "language_loss": 0.65387547, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.67564356, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.14605713, + "step": 5017, + "time_per_iteration": 2.75134015083313 + }, + { + "auxiliary_loss_clip": 0.01139016, + "auxiliary_loss_mlp": 0.01042975, + "balance_loss_clip": 1.05037284, + "balance_loss_mlp": 1.0287652, + "epoch": 0.30169848188787013, + "flos": 28023590934720.0, + "grad_norm": 2.000357561930761, + "language_loss": 0.6878264, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.70964634, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.14215088, + "step": 5018, + "time_per_iteration": 2.719000816345215 + }, + { + "auxiliary_loss_clip": 0.0113209, + "auxiliary_loss_mlp": 0.01039976, + "balance_loss_clip": 1.04702973, + "balance_loss_mlp": 1.02698159, + "epoch": 0.3017586051405381, + "flos": 26643526136640.0, + "grad_norm": 2.742708559130966, + "language_loss": 0.79146552, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.81318617, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.13012695, + "step": 5019, + "time_per_iteration": 2.670093297958374 + }, + { + "auxiliary_loss_clip": 0.01138981, + "auxiliary_loss_mlp": 0.01036921, + "balance_loss_clip": 1.05092788, + "balance_loss_mlp": 1.02291369, + "epoch": 0.30181872839320606, + "flos": 27040578844320.0, + "grad_norm": 2.2923331063915366, + "language_loss": 0.69504881, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.71680778, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.14013672, + "step": 5020, + "time_per_iteration": 2.7559990882873535 + }, + { + "auxiliary_loss_clip": 0.01142297, + "auxiliary_loss_mlp": 0.01042884, + "balance_loss_clip": 1.05100489, + "balance_loss_mlp": 1.02835274, + "epoch": 0.301878851645874, + "flos": 22102713514080.0, + "grad_norm": 2.9879453703097645, + "language_loss": 0.78507441, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.80692625, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.14550781, + "step": 5021, + "time_per_iteration": 2.643681287765503 + }, + { + "auxiliary_loss_clip": 0.01139031, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.04989767, + "balance_loss_mlp": 1.01519847, + "epoch": 0.301938974898542, + "flos": 21478106996640.0, + "grad_norm": 2.5659075894692545, + "language_loss": 0.76225519, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.78393865, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.14111328, + "step": 5022, + "time_per_iteration": 2.667224884033203 + }, + { + "auxiliary_loss_clip": 0.01139038, + "auxiliary_loss_mlp": 0.01037382, + "balance_loss_clip": 1.04933095, + "balance_loss_mlp": 1.02270734, + "epoch": 0.30199909815120995, + "flos": 13642655990400.0, + "grad_norm": 1.803304907874266, + "language_loss": 0.6983552, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.72011936, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.14672852, + "step": 5023, + "time_per_iteration": 2.617591142654419 + }, + { + "auxiliary_loss_clip": 0.01141165, + "auxiliary_loss_mlp": 0.01035278, + "balance_loss_clip": 1.04896975, + "balance_loss_mlp": 1.02099085, + "epoch": 0.302059221403878, + "flos": 26732124727200.0, + "grad_norm": 1.910831622018665, + "language_loss": 0.71517777, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.73694217, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 0.92138672, + "router_z_loss_mlp": 0.1428833, + "step": 5024, + "time_per_iteration": 2.7450244426727295 + }, + { + "auxiliary_loss_clip": 0.01134962, + "auxiliary_loss_mlp": 0.01038761, + "balance_loss_clip": 1.04883409, + "balance_loss_mlp": 1.02402067, + "epoch": 0.30211934465654594, + "flos": 32209320401280.0, + "grad_norm": 1.76122608386297, + "language_loss": 0.74438488, + "learning_rate": 3.272217377978061e-06, + "loss": 0.7661221, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.14733887, + "step": 5025, + "time_per_iteration": 2.7336630821228027 + }, + { + "auxiliary_loss_clip": 0.0113583, + "auxiliary_loss_mlp": 0.01035655, + "balance_loss_clip": 1.050699, + "balance_loss_mlp": 1.02195787, + "epoch": 0.3021794679092139, + "flos": 28553845407840.0, + "grad_norm": 1.81664777100887, + "language_loss": 0.66934735, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.69106221, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.13708496, + "step": 5026, + "time_per_iteration": 2.725383996963501 + }, + { + "auxiliary_loss_clip": 0.01139586, + "auxiliary_loss_mlp": 0.01039359, + "balance_loss_clip": 1.05177438, + "balance_loss_mlp": 1.02482772, + "epoch": 0.30223959116188187, + "flos": 24722955993600.0, + "grad_norm": 1.7394345841511076, + "language_loss": 0.85179567, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.8735851, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.14550781, + "step": 5027, + "time_per_iteration": 2.6610004901885986 + }, + { + "auxiliary_loss_clip": 0.01133541, + "auxiliary_loss_mlp": 0.01039268, + "balance_loss_clip": 1.04774261, + "balance_loss_mlp": 1.02532601, + "epoch": 0.30229971441454984, + "flos": 32565011316480.0, + "grad_norm": 1.6268847631779453, + "language_loss": 0.78505021, + "learning_rate": 3.271315635661351e-06, + "loss": 0.80677831, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.13946533, + "step": 5028, + "time_per_iteration": 5.473953723907471 + }, + { + "auxiliary_loss_clip": 0.01138427, + "auxiliary_loss_mlp": 0.01042792, + "balance_loss_clip": 1.05056095, + "balance_loss_mlp": 1.0281297, + "epoch": 0.3023598376672178, + "flos": 41909677158240.0, + "grad_norm": 2.6946513364015674, + "language_loss": 0.77067399, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.79248619, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.14660645, + "step": 5029, + "time_per_iteration": 2.7799580097198486 + }, + { + "auxiliary_loss_clip": 0.01140479, + "auxiliary_loss_mlp": 0.01039634, + "balance_loss_clip": 1.05029607, + "balance_loss_mlp": 1.02321887, + "epoch": 0.30241996091988577, + "flos": 28201679496000.0, + "grad_norm": 2.367170615355741, + "language_loss": 0.82434022, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84614134, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.1640625, + "step": 5030, + "time_per_iteration": 2.9128923416137695 + }, + { + "auxiliary_loss_clip": 0.01140851, + "auxiliary_loss_mlp": 0.01041449, + "balance_loss_clip": 1.05083585, + "balance_loss_mlp": 1.0263927, + "epoch": 0.30248008417255373, + "flos": 23659691872320.0, + "grad_norm": 1.700345133284201, + "language_loss": 0.69619012, + "learning_rate": 3.270413459468905e-06, + "loss": 0.71801305, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.15063477, + "step": 5031, + "time_per_iteration": 4.209355354309082 + }, + { + "auxiliary_loss_clip": 0.01137147, + "auxiliary_loss_mlp": 0.01037897, + "balance_loss_clip": 1.04955101, + "balance_loss_mlp": 1.02303755, + "epoch": 0.3025402074252217, + "flos": 29004009919200.0, + "grad_norm": 1.7733466616870142, + "language_loss": 0.82348567, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.84523612, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.14880371, + "step": 5032, + "time_per_iteration": 2.6608951091766357 + }, + { + "auxiliary_loss_clip": 0.01147577, + "auxiliary_loss_mlp": 0.01044917, + "balance_loss_clip": 1.05530381, + "balance_loss_mlp": 1.02854919, + "epoch": 0.30260033067788966, + "flos": 31718523408480.0, + "grad_norm": 4.13004409952771, + "language_loss": 0.73682648, + "learning_rate": 3.269811767783906e-06, + "loss": 0.75875139, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 0.92285156, + "router_z_loss_mlp": 0.16369629, + "step": 5033, + "time_per_iteration": 4.160547494888306 + }, + { + "auxiliary_loss_clip": 0.01136088, + "auxiliary_loss_mlp": 0.01045169, + "balance_loss_clip": 1.04846728, + "balance_loss_mlp": 1.03017294, + "epoch": 0.3026604539305576, + "flos": 30962416851360.0, + "grad_norm": 1.5234013088148695, + "language_loss": 0.73958325, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76139581, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.15002441, + "step": 5034, + "time_per_iteration": 2.736046075820923 + }, + { + "auxiliary_loss_clip": 0.01138589, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.04994535, + "balance_loss_mlp": 1.01662207, + "epoch": 0.3027205771832256, + "flos": 31497087708000.0, + "grad_norm": 1.8991021309978398, + "language_loss": 0.72033131, + "learning_rate": 3.269209883493352e-06, + "loss": 0.74202573, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 0.88720703, + "router_z_loss_mlp": 0.14208984, + "step": 5035, + "time_per_iteration": 2.7476563453674316 + }, + { + "auxiliary_loss_clip": 0.01136562, + "auxiliary_loss_mlp": 0.0103446, + "balance_loss_clip": 1.05029559, + "balance_loss_mlp": 1.02143681, + "epoch": 0.30278070043589356, + "flos": 33366369324960.0, + "grad_norm": 2.097191735338468, + "language_loss": 0.87338352, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.89509374, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.13037109, + "step": 5036, + "time_per_iteration": 2.710080862045288 + }, + { + "auxiliary_loss_clip": 0.01136727, + "auxiliary_loss_mlp": 0.01048351, + "balance_loss_clip": 1.04970586, + "balance_loss_mlp": 1.03293097, + "epoch": 0.3028408236885616, + "flos": 30244916911680.0, + "grad_norm": 1.4885840228542493, + "language_loss": 0.77548444, + "learning_rate": 3.268607806688536e-06, + "loss": 0.79733515, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.15423584, + "step": 5037, + "time_per_iteration": 2.732408046722412 + }, + { + "auxiliary_loss_clip": 0.01141111, + "auxiliary_loss_mlp": 0.01039489, + "balance_loss_clip": 1.05098641, + "balance_loss_mlp": 1.02407503, + "epoch": 0.30290094694122954, + "flos": 15779678208480.0, + "grad_norm": 2.3360123314662333, + "language_loss": 0.77244574, + "learning_rate": 3.268306696121816e-06, + "loss": 0.7942518, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.15405273, + "step": 5038, + "time_per_iteration": 2.642617702484131 + }, + { + "auxiliary_loss_clip": 0.0113558, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.05114579, + "balance_loss_mlp": 1.02205014, + "epoch": 0.3029610701938975, + "flos": 31624171364160.0, + "grad_norm": 2.3264892972200952, + "language_loss": 0.73887318, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76058948, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.13983154, + "step": 5039, + "time_per_iteration": 2.6917314529418945 + }, + { + "auxiliary_loss_clip": 0.01136013, + "auxiliary_loss_mlp": 0.0103653, + "balance_loss_clip": 1.05080271, + "balance_loss_mlp": 1.02354801, + "epoch": 0.3030211934465655, + "flos": 26821371594240.0, + "grad_norm": 2.1702773473490145, + "language_loss": 0.79807985, + "learning_rate": 3.267704330716847e-06, + "loss": 0.81980526, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.13000488, + "step": 5040, + "time_per_iteration": 2.7974355220794678 + }, + { + "auxiliary_loss_clip": 0.01139016, + "auxiliary_loss_mlp": 0.01036935, + "balance_loss_clip": 1.05234933, + "balance_loss_mlp": 1.02339256, + "epoch": 0.30308131669923344, + "flos": 25614857422080.0, + "grad_norm": 1.657263812052922, + "language_loss": 0.81899142, + "learning_rate": 3.267403075901438e-06, + "loss": 0.84075093, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.13555908, + "step": 5041, + "time_per_iteration": 2.637291193008423 + }, + { + "auxiliary_loss_clip": 0.01064393, + "auxiliary_loss_mlp": 0.01007409, + "balance_loss_clip": 1.03474653, + "balance_loss_mlp": 1.00543964, + "epoch": 0.3031414399519014, + "flos": 73881280262880.0, + "grad_norm": 0.7590428956572197, + "language_loss": 0.59507334, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61579138, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 0.01968384, + "step": 5042, + "time_per_iteration": 3.3724608421325684 + }, + { + "auxiliary_loss_clip": 0.01142182, + "auxiliary_loss_mlp": 0.01030699, + "balance_loss_clip": 1.05230093, + "balance_loss_mlp": 1.01645315, + "epoch": 0.30320156320456937, + "flos": 26731922140800.0, + "grad_norm": 1.651783704649134, + "language_loss": 0.71372771, + "learning_rate": 3.266800422101892e-06, + "loss": 0.73545653, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.14245605, + "step": 5043, + "time_per_iteration": 2.7288095951080322 + }, + { + "auxiliary_loss_clip": 0.01138801, + "auxiliary_loss_mlp": 0.01035849, + "balance_loss_clip": 1.05092692, + "balance_loss_mlp": 1.02195489, + "epoch": 0.30326168645723733, + "flos": 26420834400480.0, + "grad_norm": 2.1966259237509846, + "language_loss": 0.6980899, + "learning_rate": 3.266499023140606e-06, + "loss": 0.71983635, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.13885498, + "step": 5044, + "time_per_iteration": 2.651181936264038 + }, + { + "auxiliary_loss_clip": 0.01136262, + "auxiliary_loss_mlp": 0.01032079, + "balance_loss_clip": 1.05013013, + "balance_loss_mlp": 1.01801813, + "epoch": 0.3033218097099053, + "flos": 26687521552320.0, + "grad_norm": 5.566451207356943, + "language_loss": 0.77188754, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.793571, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.14056396, + "step": 5045, + "time_per_iteration": 2.66483998298645 + }, + { + "auxiliary_loss_clip": 0.01140911, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.05189538, + "balance_loss_mlp": 1.01994491, + "epoch": 0.30338193296257326, + "flos": 33055362619200.0, + "grad_norm": 1.5740583855569132, + "language_loss": 0.72319567, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74495643, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 0.88964844, + "router_z_loss_mlp": 0.15222168, + "step": 5046, + "time_per_iteration": 2.691486358642578 + }, + { + "auxiliary_loss_clip": 0.01141428, + "auxiliary_loss_mlp": 0.0103799, + "balance_loss_clip": 1.0518297, + "balance_loss_mlp": 1.02215326, + "epoch": 0.30344205621524123, + "flos": 23835997673280.0, + "grad_norm": 2.1613130178512554, + "language_loss": 0.80863619, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.83043039, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 0.89501953, + "router_z_loss_mlp": 0.15826416, + "step": 5047, + "time_per_iteration": 2.7633349895477295 + }, + { + "auxiliary_loss_clip": 0.01137992, + "auxiliary_loss_mlp": 0.01040861, + "balance_loss_clip": 1.04935658, + "balance_loss_mlp": 1.0266453, + "epoch": 0.3035021794679092, + "flos": 29176831234080.0, + "grad_norm": 4.366317403305131, + "language_loss": 0.7209897, + "learning_rate": 3.265292947152084e-06, + "loss": 0.7427783, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.14215088, + "step": 5048, + "time_per_iteration": 2.7252063751220703 + }, + { + "auxiliary_loss_clip": 0.01138298, + "auxiliary_loss_mlp": 0.01034587, + "balance_loss_clip": 1.04997325, + "balance_loss_mlp": 1.02109838, + "epoch": 0.30356230272057716, + "flos": 19698112764000.0, + "grad_norm": 1.944705160282873, + "language_loss": 0.75835443, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.78008324, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.13476562, + "step": 5049, + "time_per_iteration": 2.6583287715911865 + }, + { + "auxiliary_loss_clip": 0.01140094, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.05077255, + "balance_loss_mlp": 1.01902473, + "epoch": 0.3036224259732452, + "flos": 35282279980800.0, + "grad_norm": 1.7572897759950836, + "language_loss": 0.81991905, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84165323, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.1428833, + "step": 5050, + "time_per_iteration": 2.6947054862976074 + }, + { + "auxiliary_loss_clip": 0.01139682, + "auxiliary_loss_mlp": 0.01038324, + "balance_loss_clip": 1.05156279, + "balance_loss_mlp": 1.02295148, + "epoch": 0.30368254922591315, + "flos": 25753123847520.0, + "grad_norm": 2.4055865759599033, + "language_loss": 0.73487103, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.75665104, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.15374756, + "step": 5051, + "time_per_iteration": 2.688239336013794 + }, + { + "auxiliary_loss_clip": 0.01137072, + "auxiliary_loss_mlp": 0.01036839, + "balance_loss_clip": 1.04969168, + "balance_loss_mlp": 1.02272415, + "epoch": 0.3037426724785811, + "flos": 28067667384960.0, + "grad_norm": 1.6960980409720539, + "language_loss": 0.76717979, + "learning_rate": 3.264086103483033e-06, + "loss": 0.78891885, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.14111328, + "step": 5052, + "time_per_iteration": 2.666635751724243 + }, + { + "auxiliary_loss_clip": 0.01140847, + "auxiliary_loss_mlp": 0.01041651, + "balance_loss_clip": 1.0506537, + "balance_loss_mlp": 1.02716672, + "epoch": 0.3038027957312491, + "flos": 19074681247680.0, + "grad_norm": 2.106970619220562, + "language_loss": 0.83207995, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.85390484, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.1449585, + "step": 5053, + "time_per_iteration": 2.7653748989105225 + }, + { + "auxiliary_loss_clip": 0.01135663, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.04831922, + "balance_loss_mlp": 1.02324009, + "epoch": 0.30386291898391704, + "flos": 15513031573920.0, + "grad_norm": 1.753087764978674, + "language_loss": 0.71155161, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.73328632, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.14575195, + "step": 5054, + "time_per_iteration": 2.7077584266662598 + }, + { + "auxiliary_loss_clip": 0.01140152, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.05104327, + "balance_loss_mlp": 1.02612329, + "epoch": 0.303923042236585, + "flos": 32164271536320.0, + "grad_norm": 2.008231905013292, + "language_loss": 0.69922137, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.72103167, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.14746094, + "step": 5055, + "time_per_iteration": 2.724311590194702 + }, + { + "auxiliary_loss_clip": 0.0113947, + "auxiliary_loss_mlp": 0.01035625, + "balance_loss_clip": 1.04984236, + "balance_loss_mlp": 1.02072406, + "epoch": 0.30398316548925297, + "flos": 24061363549920.0, + "grad_norm": 2.4627464331674354, + "language_loss": 0.67616117, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.69791216, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.14904785, + "step": 5056, + "time_per_iteration": 2.695485830307007 + }, + { + "auxiliary_loss_clip": 0.0113657, + "auxiliary_loss_mlp": 0.01038456, + "balance_loss_clip": 1.0507946, + "balance_loss_mlp": 1.02466369, + "epoch": 0.30404328874192094, + "flos": 29578097738880.0, + "grad_norm": 1.8370695378551047, + "language_loss": 0.82562077, + "learning_rate": 3.262576470461507e-06, + "loss": 0.84737104, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.13787842, + "step": 5057, + "time_per_iteration": 2.6887388229370117 + }, + { + "auxiliary_loss_clip": 0.01135087, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.04824543, + "balance_loss_mlp": 1.02296388, + "epoch": 0.3041034119945889, + "flos": 29893926000960.0, + "grad_norm": 1.7580875140015175, + "language_loss": 0.88894713, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91066706, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.13934326, + "step": 5058, + "time_per_iteration": 2.71209716796875 + }, + { + "auxiliary_loss_clip": 0.01139269, + "auxiliary_loss_mlp": 0.01042811, + "balance_loss_clip": 1.05040002, + "balance_loss_mlp": 1.02782047, + "epoch": 0.30416353524725687, + "flos": 34519123416960.0, + "grad_norm": 2.1775075077015904, + "language_loss": 0.71240109, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73422194, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.14990234, + "step": 5059, + "time_per_iteration": 2.6758291721343994 + }, + { + "auxiliary_loss_clip": 0.01136388, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.04867959, + "balance_loss_mlp": 1.0209024, + "epoch": 0.30422365849992483, + "flos": 28870605567360.0, + "grad_norm": 1.7372880870342082, + "language_loss": 0.72976547, + "learning_rate": 3.26167011603268e-06, + "loss": 0.75147331, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.13500977, + "step": 5060, + "time_per_iteration": 2.7592358589172363 + }, + { + "auxiliary_loss_clip": 0.01140171, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.05137038, + "balance_loss_mlp": 1.02116239, + "epoch": 0.3042837817525928, + "flos": 28063696691520.0, + "grad_norm": 2.1276672278875517, + "language_loss": 0.76775551, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.78950989, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.14111328, + "step": 5061, + "time_per_iteration": 2.6815009117126465 + }, + { + "auxiliary_loss_clip": 0.01140686, + "auxiliary_loss_mlp": 0.01036067, + "balance_loss_clip": 1.05164921, + "balance_loss_mlp": 1.02093947, + "epoch": 0.30434390500526076, + "flos": 26945781109920.0, + "grad_norm": 2.2899842703639335, + "language_loss": 0.8227821, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84454966, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.15118408, + "step": 5062, + "time_per_iteration": 2.7600276470184326 + }, + { + "auxiliary_loss_clip": 0.01133114, + "auxiliary_loss_mlp": 0.01030555, + "balance_loss_clip": 1.04688072, + "balance_loss_mlp": 1.01768613, + "epoch": 0.3044040282579287, + "flos": 31095416030400.0, + "grad_norm": 1.8396582300696296, + "language_loss": 0.7429626, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.76459932, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.12866211, + "step": 5063, + "time_per_iteration": 2.7323696613311768 + }, + { + "auxiliary_loss_clip": 0.01136663, + "auxiliary_loss_mlp": 0.0103503, + "balance_loss_clip": 1.05021441, + "balance_loss_mlp": 1.01994991, + "epoch": 0.30446415151059675, + "flos": 26777943420480.0, + "grad_norm": 1.5441870398981126, + "language_loss": 0.84282267, + "learning_rate": 3.26046097371721e-06, + "loss": 0.8645395, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.15081787, + "step": 5064, + "time_per_iteration": 2.7231040000915527 + }, + { + "auxiliary_loss_clip": 0.01135424, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.04698348, + "balance_loss_mlp": 1.01996553, + "epoch": 0.3045242747632647, + "flos": 20054208852000.0, + "grad_norm": 1.9210408324103598, + "language_loss": 0.76119071, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.7828933, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.14862061, + "step": 5065, + "time_per_iteration": 2.6247506141662598 + }, + { + "auxiliary_loss_clip": 0.01137572, + "auxiliary_loss_mlp": 0.01043714, + "balance_loss_clip": 1.0473814, + "balance_loss_mlp": 1.02874088, + "epoch": 0.3045843980159327, + "flos": 38486415461760.0, + "grad_norm": 2.439314313346213, + "language_loss": 0.62059677, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.64240968, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.1496582, + "step": 5066, + "time_per_iteration": 2.7652180194854736 + }, + { + "auxiliary_loss_clip": 0.01142853, + "auxiliary_loss_mlp": 0.0104017, + "balance_loss_clip": 1.05168986, + "balance_loss_mlp": 1.02554321, + "epoch": 0.30464452126860064, + "flos": 21785264560800.0, + "grad_norm": 1.891075045509771, + "language_loss": 0.82798922, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.84981948, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.14624023, + "step": 5067, + "time_per_iteration": 4.0663957595825195 + }, + { + "auxiliary_loss_clip": 0.01133245, + "auxiliary_loss_mlp": 0.010403, + "balance_loss_clip": 1.04783201, + "balance_loss_mlp": 1.02580404, + "epoch": 0.3047046445212686, + "flos": 25174457575200.0, + "grad_norm": 1.8208127359217818, + "language_loss": 0.63036031, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65209579, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.14501953, + "step": 5068, + "time_per_iteration": 4.158961534500122 + }, + { + "auxiliary_loss_clip": 0.01134186, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.04713988, + "balance_loss_mlp": 1.02021384, + "epoch": 0.3047647677739366, + "flos": 25975248341760.0, + "grad_norm": 6.800797706791783, + "language_loss": 0.74999362, + "learning_rate": 3.258948470480793e-06, + "loss": 0.77167845, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.14099121, + "step": 5069, + "time_per_iteration": 2.6955807209014893 + }, + { + "auxiliary_loss_clip": 0.01130374, + "auxiliary_loss_mlp": 0.01042381, + "balance_loss_clip": 1.04695666, + "balance_loss_mlp": 1.02895188, + "epoch": 0.30482489102660454, + "flos": 25619314322880.0, + "grad_norm": 2.1596088577147428, + "language_loss": 0.75791293, + "learning_rate": 3.258645826569261e-06, + "loss": 0.77964044, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13421631, + "step": 5070, + "time_per_iteration": 4.180227279663086 + }, + { + "auxiliary_loss_clip": 0.01138569, + "auxiliary_loss_mlp": 0.01038376, + "balance_loss_clip": 1.0482893, + "balance_loss_mlp": 1.02368963, + "epoch": 0.3048850142792725, + "flos": 32080129846560.0, + "grad_norm": 1.8223342232600357, + "language_loss": 0.81748796, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.83925736, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 0.90380859, + "router_z_loss_mlp": 0.14697266, + "step": 5071, + "time_per_iteration": 2.75941801071167 + }, + { + "auxiliary_loss_clip": 0.01137615, + "auxiliary_loss_mlp": 0.01040082, + "balance_loss_clip": 1.04703462, + "balance_loss_mlp": 1.02496648, + "epoch": 0.30494513753194047, + "flos": 27267038687520.0, + "grad_norm": 2.4390803090439945, + "language_loss": 0.75756872, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.77934563, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.15112305, + "step": 5072, + "time_per_iteration": 2.689509153366089 + }, + { + "auxiliary_loss_clip": 0.01134785, + "auxiliary_loss_mlp": 0.01040031, + "balance_loss_clip": 1.0475992, + "balance_loss_mlp": 1.0252254, + "epoch": 0.30500526078460843, + "flos": 23839725263040.0, + "grad_norm": 1.8611155402697697, + "language_loss": 0.71209931, + "learning_rate": 3.257737608512723e-06, + "loss": 0.73384744, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.14794922, + "step": 5073, + "time_per_iteration": 4.114821910858154 + }, + { + "auxiliary_loss_clip": 0.01141543, + "auxiliary_loss_mlp": 0.01044094, + "balance_loss_clip": 1.05022526, + "balance_loss_mlp": 1.02943182, + "epoch": 0.3050653840372764, + "flos": 17650134826560.0, + "grad_norm": 2.129368704075491, + "language_loss": 0.75965726, + "learning_rate": 3.257434773758163e-06, + "loss": 0.78151363, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.14672852, + "step": 5074, + "time_per_iteration": 2.6273679733276367 + }, + { + "auxiliary_loss_clip": 0.0113574, + "auxiliary_loss_mlp": 0.01036947, + "balance_loss_clip": 1.04960799, + "balance_loss_mlp": 1.02301717, + "epoch": 0.30512550728994436, + "flos": 29582554639680.0, + "grad_norm": 1.9298831304112447, + "language_loss": 0.74154758, + "learning_rate": 3.25713189132155e-06, + "loss": 0.76327443, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.13928223, + "step": 5075, + "time_per_iteration": 2.6927154064178467 + }, + { + "auxiliary_loss_clip": 0.01138902, + "auxiliary_loss_mlp": 0.01045189, + "balance_loss_clip": 1.04882646, + "balance_loss_mlp": 1.02940607, + "epoch": 0.30518563054261233, + "flos": 19963017155520.0, + "grad_norm": 1.9676280311089038, + "language_loss": 0.75448656, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.77632749, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.15759277, + "step": 5076, + "time_per_iteration": 2.7030906677246094 + }, + { + "auxiliary_loss_clip": 0.01136354, + "auxiliary_loss_mlp": 0.01036765, + "balance_loss_clip": 1.04933536, + "balance_loss_mlp": 1.02253103, + "epoch": 0.30524575379528035, + "flos": 26331547016160.0, + "grad_norm": 1.9831313666058115, + "language_loss": 0.79297459, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81470573, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 0.86914062, + "router_z_loss_mlp": 0.14245605, + "step": 5077, + "time_per_iteration": 2.6783714294433594 + }, + { + "auxiliary_loss_clip": 0.01130294, + "auxiliary_loss_mlp": 0.0103159, + "balance_loss_clip": 1.04793, + "balance_loss_mlp": 1.01879835, + "epoch": 0.3053058770479483, + "flos": 20188180445760.0, + "grad_norm": 1.5745185950056437, + "language_loss": 0.74786091, + "learning_rate": 3.256222958034259e-06, + "loss": 0.76947975, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.12799072, + "step": 5078, + "time_per_iteration": 2.6398324966430664 + }, + { + "auxiliary_loss_clip": 0.01132167, + "auxiliary_loss_mlp": 0.01051484, + "balance_loss_clip": 1.04684186, + "balance_loss_mlp": 1.0375067, + "epoch": 0.3053660003006163, + "flos": 15023571651360.0, + "grad_norm": 1.9615654102986748, + "language_loss": 0.676332, + "learning_rate": 3.255919884984307e-06, + "loss": 0.69816852, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.13983154, + "step": 5079, + "time_per_iteration": 2.781662940979004 + }, + { + "auxiliary_loss_clip": 0.01134701, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.04823375, + "balance_loss_mlp": 1.02774143, + "epoch": 0.30542612355328425, + "flos": 28201841565120.0, + "grad_norm": 2.0076034494708312, + "language_loss": 0.80160308, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.82336396, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.13647461, + "step": 5080, + "time_per_iteration": 2.6998775005340576 + }, + { + "auxiliary_loss_clip": 0.01132467, + "auxiliary_loss_mlp": 0.01038974, + "balance_loss_clip": 1.04654431, + "balance_loss_mlp": 1.02557516, + "epoch": 0.3054862468059522, + "flos": 29760319062720.0, + "grad_norm": 2.2000859858638018, + "language_loss": 0.81620181, + "learning_rate": 3.255313596022074e-06, + "loss": 0.83791625, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.1340332, + "step": 5081, + "time_per_iteration": 2.681217908859253 + }, + { + "auxiliary_loss_clip": 0.01130604, + "auxiliary_loss_mlp": 0.01034117, + "balance_loss_clip": 1.04455161, + "balance_loss_mlp": 1.02057505, + "epoch": 0.3055463700586202, + "flos": 35857785905280.0, + "grad_norm": 2.163178571085778, + "language_loss": 0.71952242, + "learning_rate": 3.255010380132783e-06, + "loss": 0.74116957, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.13543701, + "step": 5082, + "time_per_iteration": 2.759946823120117 + }, + { + "auxiliary_loss_clip": 0.01136227, + "auxiliary_loss_mlp": 0.0104089, + "balance_loss_clip": 1.04635644, + "balance_loss_mlp": 1.02524948, + "epoch": 0.30560649331128814, + "flos": 31227645381120.0, + "grad_norm": 2.9189725745657977, + "language_loss": 0.72989583, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75166702, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.15649414, + "step": 5083, + "time_per_iteration": 2.754249095916748 + }, + { + "auxiliary_loss_clip": 0.01132175, + "auxiliary_loss_mlp": 0.01036546, + "balance_loss_clip": 1.04384363, + "balance_loss_mlp": 1.02095914, + "epoch": 0.3056666165639561, + "flos": 23340946366080.0, + "grad_norm": 1.8919677579552574, + "language_loss": 0.71221757, + "learning_rate": 3.254403805595344e-06, + "loss": 0.73390484, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.15570068, + "step": 5084, + "time_per_iteration": 2.715934991836548 + }, + { + "auxiliary_loss_clip": 0.01138115, + "auxiliary_loss_mlp": 0.01031013, + "balance_loss_clip": 1.04818845, + "balance_loss_mlp": 1.01673198, + "epoch": 0.30572673981662407, + "flos": 18941803620480.0, + "grad_norm": 1.957843517643951, + "language_loss": 0.7853229, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.80701423, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.14276123, + "step": 5085, + "time_per_iteration": 2.6247332096099854 + }, + { + "auxiliary_loss_clip": 0.01129369, + "auxiliary_loss_mlp": 0.01034373, + "balance_loss_clip": 1.04483581, + "balance_loss_mlp": 1.02063966, + "epoch": 0.30578686306929204, + "flos": 26242421700960.0, + "grad_norm": 1.609823126416249, + "language_loss": 0.78144687, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80308425, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.13751221, + "step": 5086, + "time_per_iteration": 2.725207805633545 + }, + { + "auxiliary_loss_clip": 0.01134613, + "auxiliary_loss_mlp": 0.01041021, + "balance_loss_clip": 1.04865563, + "balance_loss_mlp": 1.0260129, + "epoch": 0.30584698632196, + "flos": 25568957694240.0, + "grad_norm": 1.8256420559882165, + "language_loss": 0.76482606, + "learning_rate": 3.253493587064563e-06, + "loss": 0.78658241, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.15008545, + "step": 5087, + "time_per_iteration": 2.6768362522125244 + }, + { + "auxiliary_loss_clip": 0.01136552, + "auxiliary_loss_mlp": 0.01042023, + "balance_loss_clip": 1.04713821, + "balance_loss_mlp": 1.0268302, + "epoch": 0.30590710957462797, + "flos": 30116253081600.0, + "grad_norm": 3.2314120949229803, + "language_loss": 0.7241444, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.74593019, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.1519165, + "step": 5088, + "time_per_iteration": 2.696074962615967 + }, + { + "auxiliary_loss_clip": 0.01143973, + "auxiliary_loss_mlp": 0.01036585, + "balance_loss_clip": 1.05122948, + "balance_loss_mlp": 1.02211344, + "epoch": 0.30596723282729593, + "flos": 20849691854880.0, + "grad_norm": 4.271596098417395, + "language_loss": 0.7940774, + "learning_rate": 3.252886537028521e-06, + "loss": 0.81588292, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 0.92724609, + "router_z_loss_mlp": 0.14465332, + "step": 5089, + "time_per_iteration": 2.6332831382751465 + }, + { + "auxiliary_loss_clip": 0.01134797, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.04770255, + "balance_loss_mlp": 1.02365971, + "epoch": 0.30602735607996395, + "flos": 27890429686560.0, + "grad_norm": 1.748521699920722, + "language_loss": 0.77183992, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79356736, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.14294434, + "step": 5090, + "time_per_iteration": 2.6932950019836426 + }, + { + "auxiliary_loss_clip": 0.0113872, + "auxiliary_loss_mlp": 0.01044487, + "balance_loss_clip": 1.04800653, + "balance_loss_mlp": 1.03001547, + "epoch": 0.3060874793326319, + "flos": 36437789247840.0, + "grad_norm": 2.2491519048123902, + "language_loss": 0.7630465, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78487855, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 0.90673828, + "router_z_loss_mlp": 0.14459229, + "step": 5091, + "time_per_iteration": 2.7198996543884277 + }, + { + "auxiliary_loss_clip": 0.01137735, + "auxiliary_loss_mlp": 0.01038916, + "balance_loss_clip": 1.04728127, + "balance_loss_mlp": 1.02421772, + "epoch": 0.3061476025852999, + "flos": 24951401183520.0, + "grad_norm": 1.584852069431685, + "language_loss": 0.71913511, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.74090165, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.14703369, + "step": 5092, + "time_per_iteration": 2.655162811279297 + }, + { + "auxiliary_loss_clip": 0.01136131, + "auxiliary_loss_mlp": 0.01038213, + "balance_loss_clip": 1.04881763, + "balance_loss_mlp": 1.02434945, + "epoch": 0.30620772583796785, + "flos": 23661353080800.0, + "grad_norm": 2.594654770040329, + "language_loss": 0.82697779, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84872127, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.13848877, + "step": 5093, + "time_per_iteration": 2.6706345081329346 + }, + { + "auxiliary_loss_clip": 0.01134994, + "auxiliary_loss_mlp": 0.01039251, + "balance_loss_clip": 1.04864025, + "balance_loss_mlp": 1.02593565, + "epoch": 0.3062678490906358, + "flos": 29315057142240.0, + "grad_norm": 2.0820156218838823, + "language_loss": 0.74579394, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.7675364, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.13330078, + "step": 5094, + "time_per_iteration": 2.70265793800354 + }, + { + "auxiliary_loss_clip": 0.0113524, + "auxiliary_loss_mlp": 0.01038143, + "balance_loss_clip": 1.04899657, + "balance_loss_mlp": 1.02442813, + "epoch": 0.3063279723433038, + "flos": 24104670171840.0, + "grad_norm": 7.291198507224746, + "language_loss": 0.76036423, + "learning_rate": 3.251064247058868e-06, + "loss": 0.78209805, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.13720703, + "step": 5095, + "time_per_iteration": 2.6575450897216797 + }, + { + "auxiliary_loss_clip": 0.01132356, + "auxiliary_loss_mlp": 0.01039471, + "balance_loss_clip": 1.0478971, + "balance_loss_mlp": 1.02576804, + "epoch": 0.30638809559597174, + "flos": 27394203378240.0, + "grad_norm": 1.766586268174164, + "language_loss": 0.80757046, + "learning_rate": 3.250760365955042e-06, + "loss": 0.82928872, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.13708496, + "step": 5096, + "time_per_iteration": 2.6842052936553955 + }, + { + "auxiliary_loss_clip": 0.01137049, + "auxiliary_loss_mlp": 0.0103407, + "balance_loss_clip": 1.04863179, + "balance_loss_mlp": 1.02040315, + "epoch": 0.3064482188486397, + "flos": 20944692175680.0, + "grad_norm": 2.2736439237617843, + "language_loss": 0.81560189, + "learning_rate": 3.250456437422258e-06, + "loss": 0.83731306, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.13671875, + "step": 5097, + "time_per_iteration": 2.73779034614563 + }, + { + "auxiliary_loss_clip": 0.01135255, + "auxiliary_loss_mlp": 0.01040719, + "balance_loss_clip": 1.04770982, + "balance_loss_mlp": 1.02575791, + "epoch": 0.3065083421013077, + "flos": 29002632331680.0, + "grad_norm": 2.3594154943440477, + "language_loss": 0.78012347, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80188316, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.14953613, + "step": 5098, + "time_per_iteration": 2.645332098007202 + }, + { + "auxiliary_loss_clip": 0.01135341, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.05022287, + "balance_loss_mlp": 1.01992941, + "epoch": 0.30656846535397564, + "flos": 32252667540480.0, + "grad_norm": 1.767868200697439, + "language_loss": 0.83806968, + "learning_rate": 3.249848438115917e-06, + "loss": 0.85976255, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.14019775, + "step": 5099, + "time_per_iteration": 2.834231376647949 + }, + { + "auxiliary_loss_clip": 0.01136331, + "auxiliary_loss_mlp": 0.01040836, + "balance_loss_clip": 1.04632282, + "balance_loss_mlp": 1.02646577, + "epoch": 0.3066285886066436, + "flos": 32520975383520.0, + "grad_norm": 1.829932043376579, + "language_loss": 0.85531914, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.87709081, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.14379883, + "step": 5100, + "time_per_iteration": 2.7719223499298096 + }, + { + "auxiliary_loss_clip": 0.0113659, + "auxiliary_loss_mlp": 0.01032332, + "balance_loss_clip": 1.04898477, + "balance_loss_mlp": 1.01758575, + "epoch": 0.30668871185931157, + "flos": 18363461486400.0, + "grad_norm": 2.4633658854321188, + "language_loss": 0.79256153, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81425071, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.14727783, + "step": 5101, + "time_per_iteration": 2.70151948928833 + }, + { + "auxiliary_loss_clip": 0.01140471, + "auxiliary_loss_mlp": 0.01042687, + "balance_loss_clip": 1.05141687, + "balance_loss_mlp": 1.02738094, + "epoch": 0.30674883511197953, + "flos": 24504356502720.0, + "grad_norm": 1.9842621066059274, + "language_loss": 0.80148244, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.82331407, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.15313721, + "step": 5102, + "time_per_iteration": 2.6834769248962402 + }, + { + "auxiliary_loss_clip": 0.01140975, + "auxiliary_loss_mlp": 0.01039415, + "balance_loss_clip": 1.05300784, + "balance_loss_mlp": 1.02389443, + "epoch": 0.30680895836464755, + "flos": 27935194930560.0, + "grad_norm": 1.9163547079312906, + "language_loss": 0.88489836, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.90670228, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.15515137, + "step": 5103, + "time_per_iteration": 2.671933650970459 + }, + { + "auxiliary_loss_clip": 0.01137717, + "auxiliary_loss_mlp": 0.01041254, + "balance_loss_clip": 1.04971075, + "balance_loss_mlp": 1.02650762, + "epoch": 0.3068690816173155, + "flos": 28914317362080.0, + "grad_norm": 2.2251029576259294, + "language_loss": 0.74398923, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.76577902, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14746094, + "step": 5104, + "time_per_iteration": 2.6838886737823486 + }, + { + "auxiliary_loss_clip": 0.01140182, + "auxiliary_loss_mlp": 0.01040695, + "balance_loss_clip": 1.0497514, + "balance_loss_mlp": 1.02541268, + "epoch": 0.3069292048699835, + "flos": 28737160698240.0, + "grad_norm": 1.7475219212932849, + "language_loss": 0.73573399, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.75754273, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 0.90429688, + "router_z_loss_mlp": 0.15283203, + "step": 5105, + "time_per_iteration": 2.654038190841675 + }, + { + "auxiliary_loss_clip": 0.01140921, + "auxiliary_loss_mlp": 0.01037391, + "balance_loss_clip": 1.05076933, + "balance_loss_mlp": 1.02206624, + "epoch": 0.30698932812265145, + "flos": 29934193826880.0, + "grad_norm": 1.9334186321273574, + "language_loss": 0.87454808, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.89633119, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.15325928, + "step": 5106, + "time_per_iteration": 4.169897794723511 + }, + { + "auxiliary_loss_clip": 0.01145706, + "auxiliary_loss_mlp": 0.01042928, + "balance_loss_clip": 1.05285752, + "balance_loss_mlp": 1.0278244, + "epoch": 0.3070494513753194, + "flos": 25619881564800.0, + "grad_norm": 2.116011853393495, + "language_loss": 0.71321464, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.73510098, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 0.92773438, + "router_z_loss_mlp": 0.15100098, + "step": 5107, + "time_per_iteration": 2.799014091491699 + }, + { + "auxiliary_loss_clip": 0.01135925, + "auxiliary_loss_mlp": 0.01044997, + "balance_loss_clip": 1.04906893, + "balance_loss_mlp": 1.03008986, + "epoch": 0.3071095746279874, + "flos": 23215361849280.0, + "grad_norm": 2.3800087101644225, + "language_loss": 0.72264892, + "learning_rate": 3.247110096547814e-06, + "loss": 0.74445814, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.14898682, + "step": 5108, + "time_per_iteration": 4.106838226318359 + }, + { + "auxiliary_loss_clip": 0.01137604, + "auxiliary_loss_mlp": 0.01040685, + "balance_loss_clip": 1.04992902, + "balance_loss_mlp": 1.02606416, + "epoch": 0.30716969788065535, + "flos": 25887216993120.0, + "grad_norm": 1.578502301013346, + "language_loss": 0.85646111, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.87824404, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.14624023, + "step": 5109, + "time_per_iteration": 4.2928924560546875 + }, + { + "auxiliary_loss_clip": 0.01135664, + "auxiliary_loss_mlp": 0.01035315, + "balance_loss_clip": 1.04812086, + "balance_loss_mlp": 1.02137327, + "epoch": 0.3072298211333233, + "flos": 31448635391520.0, + "grad_norm": 1.7905731985195916, + "language_loss": 0.67332196, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69503176, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.13952637, + "step": 5110, + "time_per_iteration": 2.7693324089050293 + }, + { + "auxiliary_loss_clip": 0.01133905, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.04881167, + "balance_loss_mlp": 1.018677, + "epoch": 0.3072899443859913, + "flos": 31541285710080.0, + "grad_norm": 1.5587900023480383, + "language_loss": 0.76746982, + "learning_rate": 3.246196464379919e-06, + "loss": 0.78913075, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.1350708, + "step": 5111, + "time_per_iteration": 2.7069451808929443 + }, + { + "auxiliary_loss_clip": 0.01136807, + "auxiliary_loss_mlp": 0.01033986, + "balance_loss_clip": 1.04911876, + "balance_loss_mlp": 1.01977646, + "epoch": 0.30735006763865924, + "flos": 31631342922720.0, + "grad_norm": 1.9732077627129878, + "language_loss": 0.67305899, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69476688, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 0.87744141, + "router_z_loss_mlp": 0.14190674, + "step": 5112, + "time_per_iteration": 2.7401161193847656 + }, + { + "auxiliary_loss_clip": 0.01144974, + "auxiliary_loss_mlp": 0.0104001, + "balance_loss_clip": 1.05198383, + "balance_loss_mlp": 1.02391624, + "epoch": 0.3074101908913272, + "flos": 37726054590240.0, + "grad_norm": 2.122162774820418, + "language_loss": 0.79362905, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.8154788, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 0.92919922, + "router_z_loss_mlp": 0.16088867, + "step": 5113, + "time_per_iteration": 4.315961837768555 + }, + { + "auxiliary_loss_clip": 0.01136945, + "auxiliary_loss_mlp": 0.01039013, + "balance_loss_clip": 1.04846025, + "balance_loss_mlp": 1.024863, + "epoch": 0.30747031414399517, + "flos": 22453258734720.0, + "grad_norm": 1.8831538133351489, + "language_loss": 0.76649141, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.78825104, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.14172363, + "step": 5114, + "time_per_iteration": 2.6446940898895264 + }, + { + "auxiliary_loss_clip": 0.01139591, + "auxiliary_loss_mlp": 0.01035124, + "balance_loss_clip": 1.05096054, + "balance_loss_mlp": 1.01940632, + "epoch": 0.30753043739666314, + "flos": 27617624425440.0, + "grad_norm": 1.9026155267103573, + "language_loss": 0.62238044, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64412761, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.15716553, + "step": 5115, + "time_per_iteration": 2.68648624420166 + }, + { + "auxiliary_loss_clip": 0.01139838, + "auxiliary_loss_mlp": 0.01037116, + "balance_loss_clip": 1.05054939, + "balance_loss_mlp": 1.02236974, + "epoch": 0.3075905606493311, + "flos": 33365842600320.0, + "grad_norm": 2.3674636090219225, + "language_loss": 0.82370913, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.84547865, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.14733887, + "step": 5116, + "time_per_iteration": 2.696089506149292 + }, + { + "auxiliary_loss_clip": 0.0113798, + "auxiliary_loss_mlp": 0.01037524, + "balance_loss_clip": 1.05058515, + "balance_loss_mlp": 1.02296829, + "epoch": 0.3076506839019991, + "flos": 26955343188000.0, + "grad_norm": 2.608387578011882, + "language_loss": 0.7614249, + "learning_rate": 3.244367924446952e-06, + "loss": 0.78317988, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.14550781, + "step": 5117, + "time_per_iteration": 2.6878716945648193 + }, + { + "auxiliary_loss_clip": 0.01140078, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.05146503, + "balance_loss_mlp": 1.01854253, + "epoch": 0.3077108071546671, + "flos": 25977436274880.0, + "grad_norm": 2.6921142626392824, + "language_loss": 0.71393716, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.73567593, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.15258789, + "step": 5118, + "time_per_iteration": 2.6972339153289795 + }, + { + "auxiliary_loss_clip": 0.01138641, + "auxiliary_loss_mlp": 0.01035643, + "balance_loss_clip": 1.05107749, + "balance_loss_mlp": 1.02128994, + "epoch": 0.30777093040733505, + "flos": 26150460176160.0, + "grad_norm": 2.0710027730391825, + "language_loss": 0.74357629, + "learning_rate": 3.243758033520219e-06, + "loss": 0.76531911, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.14343262, + "step": 5119, + "time_per_iteration": 2.667144775390625 + }, + { + "auxiliary_loss_clip": 0.01141818, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_clip": 1.05162668, + "balance_loss_mlp": 1.02980685, + "epoch": 0.307831053660003, + "flos": 28246930947360.0, + "grad_norm": 2.112490053953016, + "language_loss": 0.80276132, + "learning_rate": 3.243453017305926e-06, + "loss": 0.8246417, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.1640625, + "step": 5120, + "time_per_iteration": 2.8236300945281982 + }, + { + "auxiliary_loss_clip": 0.01134476, + "auxiliary_loss_mlp": 0.01043941, + "balance_loss_clip": 1.04759645, + "balance_loss_mlp": 1.02921271, + "epoch": 0.307891176912671, + "flos": 20767211373600.0, + "grad_norm": 1.6804180487499618, + "language_loss": 0.79933453, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82111871, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.14727783, + "step": 5121, + "time_per_iteration": 2.634751081466675 + }, + { + "auxiliary_loss_clip": 0.01137158, + "auxiliary_loss_mlp": 0.010332, + "balance_loss_clip": 1.05038774, + "balance_loss_mlp": 1.01893067, + "epoch": 0.30795130016533895, + "flos": 33806688137280.0, + "grad_norm": 1.5022323313343362, + "language_loss": 0.82349962, + "learning_rate": 3.242842843433319e-06, + "loss": 0.84520322, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.1428833, + "step": 5122, + "time_per_iteration": 2.783393621444702 + }, + { + "auxiliary_loss_clip": 0.01053612, + "auxiliary_loss_mlp": 0.01007535, + "balance_loss_clip": 1.02429032, + "balance_loss_mlp": 1.00544047, + "epoch": 0.3080114234180069, + "flos": 84264825173760.0, + "grad_norm": 0.7399190945644264, + "language_loss": 0.58623761, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60684907, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.29321289, + "router_z_loss_mlp": 0.02096558, + "step": 5123, + "time_per_iteration": 3.4225401878356934 + }, + { + "auxiliary_loss_clip": 0.01143226, + "auxiliary_loss_mlp": 0.01038309, + "balance_loss_clip": 1.05172873, + "balance_loss_mlp": 1.0222162, + "epoch": 0.3080715466706749, + "flos": 29358647385120.0, + "grad_norm": 1.5493947182992844, + "language_loss": 0.83112216, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85293758, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 0.91503906, + "router_z_loss_mlp": 0.16101074, + "step": 5124, + "time_per_iteration": 2.701934814453125 + }, + { + "auxiliary_loss_clip": 0.01142525, + "auxiliary_loss_mlp": 0.01037972, + "balance_loss_clip": 1.05210245, + "balance_loss_mlp": 1.02360129, + "epoch": 0.30813166992334284, + "flos": 31541002089120.0, + "grad_norm": 1.778124663471198, + "language_loss": 0.78988075, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.81168568, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 0.90527344, + "router_z_loss_mlp": 0.14373779, + "step": 5125, + "time_per_iteration": 2.686702251434326 + }, + { + "auxiliary_loss_clip": 0.01142528, + "auxiliary_loss_mlp": 0.01036953, + "balance_loss_clip": 1.05017877, + "balance_loss_mlp": 1.02095461, + "epoch": 0.3081917931760108, + "flos": 24952576184640.0, + "grad_norm": 2.38961836210945, + "language_loss": 0.64162582, + "learning_rate": 3.241621930235989e-06, + "loss": 0.66342068, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 0.92333984, + "router_z_loss_mlp": 0.15991211, + "step": 5126, + "time_per_iteration": 2.6910929679870605 + }, + { + "auxiliary_loss_clip": 0.01135418, + "auxiliary_loss_mlp": 0.01035409, + "balance_loss_clip": 1.05002904, + "balance_loss_mlp": 1.02090716, + "epoch": 0.3082519164286788, + "flos": 27047183160960.0, + "grad_norm": 1.6738953713836058, + "language_loss": 0.86748022, + "learning_rate": 3.241316584201646e-06, + "loss": 0.88918847, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.14508057, + "step": 5127, + "time_per_iteration": 2.747328996658325 + }, + { + "auxiliary_loss_clip": 0.01137485, + "auxiliary_loss_mlp": 0.01038689, + "balance_loss_clip": 1.0500164, + "balance_loss_mlp": 1.02380574, + "epoch": 0.30831203968134674, + "flos": 35280821358720.0, + "grad_norm": 1.578663580750157, + "language_loss": 0.68798953, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.70975125, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.14880371, + "step": 5128, + "time_per_iteration": 2.7090418338775635 + }, + { + "auxiliary_loss_clip": 0.0114266, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.05195355, + "balance_loss_mlp": 1.02473843, + "epoch": 0.3083721629340147, + "flos": 31320052596000.0, + "grad_norm": 2.140659646307787, + "language_loss": 0.71358615, + "learning_rate": 3.240705750931993e-06, + "loss": 0.73541868, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.15856934, + "step": 5129, + "time_per_iteration": 2.7633886337280273 + }, + { + "auxiliary_loss_clip": 0.01052075, + "auxiliary_loss_mlp": 0.01004361, + "balance_loss_clip": 1.02332413, + "balance_loss_mlp": 1.00237596, + "epoch": 0.3084322861866827, + "flos": 83233077145920.0, + "grad_norm": 0.824553859434546, + "language_loss": 0.59181046, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61237478, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.28857422, + "router_z_loss_mlp": 0.01983643, + "step": 5130, + "time_per_iteration": 3.239940643310547 + }, + { + "auxiliary_loss_clip": 0.01144751, + "auxiliary_loss_mlp": 0.01044084, + "balance_loss_clip": 1.05354905, + "balance_loss_mlp": 1.02850366, + "epoch": 0.3084924094393507, + "flos": 24766384167360.0, + "grad_norm": 2.4927700816589664, + "language_loss": 0.73017609, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.75206441, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.15588379, + "step": 5131, + "time_per_iteration": 2.66281795501709 + }, + { + "auxiliary_loss_clip": 0.01138597, + "auxiliary_loss_mlp": 0.01033306, + "balance_loss_clip": 1.0513823, + "balance_loss_mlp": 1.01993728, + "epoch": 0.30855253269201866, + "flos": 29224027514880.0, + "grad_norm": 1.5851813852418999, + "language_loss": 0.71356225, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.73528123, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.13372803, + "step": 5132, + "time_per_iteration": 2.6523351669311523 + }, + { + "auxiliary_loss_clip": 0.01134273, + "auxiliary_loss_mlp": 0.01041026, + "balance_loss_clip": 1.05068135, + "balance_loss_mlp": 1.02654839, + "epoch": 0.3086126559446866, + "flos": 23527624590720.0, + "grad_norm": 1.8104940009077752, + "language_loss": 0.89519686, + "learning_rate": 3.239483519913136e-06, + "loss": 0.91694981, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.14483643, + "step": 5133, + "time_per_iteration": 2.779649496078491 + }, + { + "auxiliary_loss_clip": 0.01143044, + "auxiliary_loss_mlp": 0.01041667, + "balance_loss_clip": 1.05183816, + "balance_loss_mlp": 1.02674246, + "epoch": 0.3086727791973546, + "flos": 41196431532960.0, + "grad_norm": 2.1402590309308525, + "language_loss": 0.6727618, + "learning_rate": 3.239177844626102e-06, + "loss": 0.69460893, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.14923096, + "step": 5134, + "time_per_iteration": 2.7723822593688965 + }, + { + "auxiliary_loss_clip": 0.01142817, + "auxiliary_loss_mlp": 0.01043145, + "balance_loss_clip": 1.05163717, + "balance_loss_mlp": 1.02780306, + "epoch": 0.30873290245002255, + "flos": 19564546343040.0, + "grad_norm": 2.719744819442789, + "language_loss": 0.82505322, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.8469128, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.15344238, + "step": 5135, + "time_per_iteration": 2.662600040435791 + }, + { + "auxiliary_loss_clip": 0.01050059, + "auxiliary_loss_mlp": 0.01007582, + "balance_loss_clip": 1.02161288, + "balance_loss_mlp": 1.00566375, + "epoch": 0.3087930257026905, + "flos": 79370752672800.0, + "grad_norm": 0.7026869204654921, + "language_loss": 0.5525806, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57315695, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 0.01914978, + "step": 5136, + "time_per_iteration": 3.3222367763519287 + }, + { + "auxiliary_loss_clip": 0.01139775, + "auxiliary_loss_mlp": 0.01040803, + "balance_loss_clip": 1.05102587, + "balance_loss_mlp": 1.02618766, + "epoch": 0.3088531489553585, + "flos": 91200589083360.0, + "grad_norm": 2.1726366215038064, + "language_loss": 0.76273572, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.78454149, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.14611816, + "step": 5137, + "time_per_iteration": 3.0627963542938232 + }, + { + "auxiliary_loss_clip": 0.01137696, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.0493536, + "balance_loss_mlp": 1.02073407, + "epoch": 0.30891327220802645, + "flos": 25797889091520.0, + "grad_norm": 1.8580678972624458, + "language_loss": 0.79785901, + "learning_rate": 3.237954673696424e-06, + "loss": 0.81957549, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.13201904, + "step": 5138, + "time_per_iteration": 2.6896257400512695 + }, + { + "auxiliary_loss_clip": 0.01139611, + "auxiliary_loss_mlp": 0.01041416, + "balance_loss_clip": 1.04928231, + "balance_loss_mlp": 1.02605629, + "epoch": 0.3089733954606944, + "flos": 31316406040800.0, + "grad_norm": 1.5831190564859277, + "language_loss": 0.81564331, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.83745354, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.15362549, + "step": 5139, + "time_per_iteration": 2.683155059814453 + }, + { + "auxiliary_loss_clip": 0.01145929, + "auxiliary_loss_mlp": 0.01041915, + "balance_loss_clip": 1.05209088, + "balance_loss_mlp": 1.02558875, + "epoch": 0.3090335187133624, + "flos": 23705186427360.0, + "grad_norm": 2.0064298202941107, + "language_loss": 0.7703191, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.79219753, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 0.1630249, + "step": 5140, + "time_per_iteration": 2.6837034225463867 + }, + { + "auxiliary_loss_clip": 0.01135413, + "auxiliary_loss_mlp": 0.01047712, + "balance_loss_clip": 1.05070245, + "balance_loss_mlp": 1.03375852, + "epoch": 0.30909364196603034, + "flos": 24417783776160.0, + "grad_norm": 1.92576141946868, + "language_loss": 0.78586245, + "learning_rate": 3.237036802553252e-06, + "loss": 0.80769372, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.13970947, + "step": 5141, + "time_per_iteration": 2.6609694957733154 + }, + { + "auxiliary_loss_clip": 0.01142846, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.05167687, + "balance_loss_mlp": 1.03151453, + "epoch": 0.3091537652186983, + "flos": 24011006921280.0, + "grad_norm": 2.1843703695847867, + "language_loss": 0.87056708, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.89246035, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.14959717, + "step": 5142, + "time_per_iteration": 2.773594856262207 + }, + { + "auxiliary_loss_clip": 0.01138539, + "auxiliary_loss_mlp": 0.01042663, + "balance_loss_clip": 1.04903913, + "balance_loss_mlp": 1.02847767, + "epoch": 0.3092138884713663, + "flos": 20767535511840.0, + "grad_norm": 1.6872052712403116, + "language_loss": 0.78881043, + "learning_rate": 3.23642465389567e-06, + "loss": 0.81062245, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 0.89501953, + "router_z_loss_mlp": 0.14196777, + "step": 5143, + "time_per_iteration": 2.7170052528381348 + }, + { + "auxiliary_loss_clip": 0.01138483, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.04939175, + "balance_loss_mlp": 1.02294672, + "epoch": 0.3092740117240343, + "flos": 30517519586400.0, + "grad_norm": 1.9489618507234805, + "language_loss": 0.72030866, + "learning_rate": 3.236118509233055e-06, + "loss": 0.74207211, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 0.89111328, + "router_z_loss_mlp": 0.14916992, + "step": 5144, + "time_per_iteration": 2.7335703372955322 + }, + { + "auxiliary_loss_clip": 0.01140734, + "auxiliary_loss_mlp": 0.01039211, + "balance_loss_clip": 1.04987025, + "balance_loss_mlp": 1.0246917, + "epoch": 0.30933413497670226, + "flos": 31226105724480.0, + "grad_norm": 1.9704619726684056, + "language_loss": 0.74046052, + "learning_rate": 3.235812317696702e-06, + "loss": 0.76225996, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 0.90820312, + "router_z_loss_mlp": 0.14532471, + "step": 5145, + "time_per_iteration": 2.770507574081421 + }, + { + "auxiliary_loss_clip": 0.01135297, + "auxiliary_loss_mlp": 0.01043512, + "balance_loss_clip": 1.04718757, + "balance_loss_mlp": 1.02880764, + "epoch": 0.3093942582293702, + "flos": 29760562166400.0, + "grad_norm": 1.804140930412302, + "language_loss": 0.76448846, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.78627658, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 0.88085938, + "router_z_loss_mlp": 0.14703369, + "step": 5146, + "time_per_iteration": 4.405009984970093 + }, + { + "auxiliary_loss_clip": 0.01135132, + "auxiliary_loss_mlp": 0.01037239, + "balance_loss_clip": 1.04798079, + "balance_loss_mlp": 1.02297568, + "epoch": 0.3094543814820382, + "flos": 23972521855680.0, + "grad_norm": 1.7715591880804444, + "language_loss": 0.66325438, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.68497813, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.14257812, + "step": 5147, + "time_per_iteration": 2.6551353931427 + }, + { + "auxiliary_loss_clip": 0.01142834, + "auxiliary_loss_mlp": 0.01039595, + "balance_loss_clip": 1.05269361, + "balance_loss_mlp": 1.02556384, + "epoch": 0.30951450473470615, + "flos": 31315636212480.0, + "grad_norm": 1.9229507305818232, + "language_loss": 0.74956357, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.77138782, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 0.90136719, + "router_z_loss_mlp": 0.14013672, + "step": 5148, + "time_per_iteration": 4.0525195598602295 + }, + { + "auxiliary_loss_clip": 0.01146211, + "auxiliary_loss_mlp": 0.01048634, + "balance_loss_clip": 1.05160809, + "balance_loss_mlp": 1.03274381, + "epoch": 0.3095746279873741, + "flos": 14666705735040.0, + "grad_norm": 4.016172912753136, + "language_loss": 0.72570586, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.74765432, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 0.94677734, + "router_z_loss_mlp": 0.15893555, + "step": 5149, + "time_per_iteration": 4.115828275680542 + }, + { + "auxiliary_loss_clip": 0.01138836, + "auxiliary_loss_mlp": 0.01037738, + "balance_loss_clip": 1.04811907, + "balance_loss_mlp": 1.02188969, + "epoch": 0.3096347512400421, + "flos": 28825111012320.0, + "grad_norm": 1.8235884213217992, + "language_loss": 0.84521019, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.8669759, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.1585083, + "step": 5150, + "time_per_iteration": 2.646920919418335 + }, + { + "auxiliary_loss_clip": 0.01136128, + "auxiliary_loss_mlp": 0.01035261, + "balance_loss_clip": 1.04781401, + "balance_loss_mlp": 1.02016926, + "epoch": 0.30969487449271005, + "flos": 27490378700160.0, + "grad_norm": 1.8958127436254386, + "language_loss": 0.79044634, + "learning_rate": 3.233974184780424e-06, + "loss": 0.81216025, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.15100098, + "step": 5151, + "time_per_iteration": 2.6637790203094482 + }, + { + "auxiliary_loss_clip": 0.01139646, + "auxiliary_loss_mlp": 0.01037054, + "balance_loss_clip": 1.04890823, + "balance_loss_mlp": 1.02158034, + "epoch": 0.309754997745378, + "flos": 18629540879040.0, + "grad_norm": 2.0843660575190657, + "language_loss": 0.67369229, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69545925, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.15466309, + "step": 5152, + "time_per_iteration": 4.081866979598999 + }, + { + "auxiliary_loss_clip": 0.01136361, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.048195, + "balance_loss_mlp": 1.02859271, + "epoch": 0.309815120998046, + "flos": 32920702231680.0, + "grad_norm": 2.570721788304023, + "language_loss": 0.82760406, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.84939682, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 0.88134766, + "router_z_loss_mlp": 0.14337158, + "step": 5153, + "time_per_iteration": 2.725372791290283 + }, + { + "auxiliary_loss_clip": 0.01137646, + "auxiliary_loss_mlp": 0.01039744, + "balance_loss_clip": 1.0495944, + "balance_loss_mlp": 1.02541518, + "epoch": 0.30987524425071394, + "flos": 25798861506240.0, + "grad_norm": 1.708293078675105, + "language_loss": 0.73916662, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.76094043, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14318848, + "step": 5154, + "time_per_iteration": 2.671569347381592 + }, + { + "auxiliary_loss_clip": 0.0113549, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.04900765, + "balance_loss_mlp": 1.02052808, + "epoch": 0.3099353675033819, + "flos": 18629946051840.0, + "grad_norm": 1.9901997610503246, + "language_loss": 0.76026148, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78196341, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.14178467, + "step": 5155, + "time_per_iteration": 2.6396970748901367 + }, + { + "auxiliary_loss_clip": 0.01139401, + "auxiliary_loss_mlp": 0.01042324, + "balance_loss_clip": 1.0492295, + "balance_loss_mlp": 1.02706575, + "epoch": 0.30999549075604993, + "flos": 18807548405760.0, + "grad_norm": 1.8498361324399677, + "language_loss": 0.79417276, + "learning_rate": 3.232441120452094e-06, + "loss": 0.81599003, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 0.90283203, + "router_z_loss_mlp": 0.15246582, + "step": 5156, + "time_per_iteration": 2.7194159030914307 + }, + { + "auxiliary_loss_clip": 0.01138867, + "auxiliary_loss_mlp": 0.01041542, + "balance_loss_clip": 1.0478009, + "balance_loss_mlp": 1.02506733, + "epoch": 0.3100556140087179, + "flos": 28290075500160.0, + "grad_norm": 2.4430540567924575, + "language_loss": 0.75338233, + "learning_rate": 3.23213436733704e-06, + "loss": 0.7751863, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 0.91210938, + "router_z_loss_mlp": 0.16491699, + "step": 5157, + "time_per_iteration": 2.6406748294830322 + }, + { + "auxiliary_loss_clip": 0.01133106, + "auxiliary_loss_mlp": 0.01036585, + "balance_loss_clip": 1.04665351, + "balance_loss_mlp": 1.02312624, + "epoch": 0.31011573726138586, + "flos": 31410758085120.0, + "grad_norm": 1.7537467245842566, + "language_loss": 0.69380808, + "learning_rate": 3.231827567499327e-06, + "loss": 0.715505, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.13470459, + "step": 5158, + "time_per_iteration": 2.744320869445801 + }, + { + "auxiliary_loss_clip": 0.01131004, + "auxiliary_loss_mlp": 0.01039891, + "balance_loss_clip": 1.04553199, + "balance_loss_mlp": 1.02683783, + "epoch": 0.3101758605140538, + "flos": 24417783776160.0, + "grad_norm": 1.8918022431962054, + "language_loss": 0.84280741, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.86451638, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.1305542, + "step": 5159, + "time_per_iteration": 2.6695852279663086 + }, + { + "auxiliary_loss_clip": 0.0113358, + "auxiliary_loss_mlp": 0.01036654, + "balance_loss_clip": 1.04642177, + "balance_loss_mlp": 1.02190769, + "epoch": 0.3102359837667218, + "flos": 23349252408480.0, + "grad_norm": 2.48677914222485, + "language_loss": 0.84974825, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87145054, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.14752197, + "step": 5160, + "time_per_iteration": 2.6598334312438965 + }, + { + "auxiliary_loss_clip": 0.01133928, + "auxiliary_loss_mlp": 0.0103997, + "balance_loss_clip": 1.04848742, + "balance_loss_mlp": 1.02554607, + "epoch": 0.31029610701938976, + "flos": 27169121122560.0, + "grad_norm": 1.8914209017907604, + "language_loss": 0.76304883, + "learning_rate": 3.230906887766584e-06, + "loss": 0.78478783, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.14422607, + "step": 5161, + "time_per_iteration": 2.6246302127838135 + }, + { + "auxiliary_loss_clip": 0.01137415, + "auxiliary_loss_mlp": 0.01037102, + "balance_loss_clip": 1.04730844, + "balance_loss_mlp": 1.02263045, + "epoch": 0.3103562302720577, + "flos": 25387506198720.0, + "grad_norm": 2.537846503183195, + "language_loss": 0.81516516, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.83691037, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.14477539, + "step": 5162, + "time_per_iteration": 2.6458613872528076 + }, + { + "auxiliary_loss_clip": 0.01132654, + "auxiliary_loss_mlp": 0.01031251, + "balance_loss_clip": 1.04722202, + "balance_loss_mlp": 1.01847148, + "epoch": 0.3104163535247257, + "flos": 27264688685280.0, + "grad_norm": 1.5346002975305983, + "language_loss": 0.82769358, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.84933269, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.12780762, + "step": 5163, + "time_per_iteration": 2.631891965866089 + }, + { + "auxiliary_loss_clip": 0.01139735, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.05013347, + "balance_loss_mlp": 1.02379346, + "epoch": 0.31047647677739365, + "flos": 26465478092640.0, + "grad_norm": 2.507595500571496, + "language_loss": 0.76107681, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78285849, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.1463623, + "step": 5164, + "time_per_iteration": 2.692128896713257 + }, + { + "auxiliary_loss_clip": 0.01140622, + "auxiliary_loss_mlp": 0.01037653, + "balance_loss_clip": 1.0529933, + "balance_loss_mlp": 1.02325869, + "epoch": 0.3105366000300616, + "flos": 23082686808480.0, + "grad_norm": 2.051668359633917, + "language_loss": 0.7470423, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.76882505, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.1439209, + "step": 5165, + "time_per_iteration": 2.749105930328369 + }, + { + "auxiliary_loss_clip": 0.01134129, + "auxiliary_loss_mlp": 0.01035746, + "balance_loss_clip": 1.04862535, + "balance_loss_mlp": 1.02166724, + "epoch": 0.3105967232827296, + "flos": 22281126213600.0, + "grad_norm": 3.021373441270258, + "language_loss": 0.75809574, + "learning_rate": 3.229371488178348e-06, + "loss": 0.77979451, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.14086914, + "step": 5166, + "time_per_iteration": 2.6496663093566895 + }, + { + "auxiliary_loss_clip": 0.01138675, + "auxiliary_loss_mlp": 0.01039818, + "balance_loss_clip": 1.05063319, + "balance_loss_mlp": 1.02533448, + "epoch": 0.31065684653539755, + "flos": 21563707308480.0, + "grad_norm": 3.384115374668336, + "language_loss": 0.73164773, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75343269, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14477539, + "step": 5167, + "time_per_iteration": 2.722968578338623 + }, + { + "auxiliary_loss_clip": 0.01058821, + "auxiliary_loss_mlp": 0.0100332, + "balance_loss_clip": 1.03028679, + "balance_loss_mlp": 1.0014956, + "epoch": 0.3107169697880655, + "flos": 83623566054240.0, + "grad_norm": 0.7180872964045952, + "language_loss": 0.52997321, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55059457, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.28588867, + "router_z_loss_mlp": 0.01821899, + "step": 5168, + "time_per_iteration": 3.318208694458008 + }, + { + "auxiliary_loss_clip": 0.01139112, + "auxiliary_loss_mlp": 0.01039025, + "balance_loss_clip": 1.04932427, + "balance_loss_mlp": 1.02415347, + "epoch": 0.3107770930407335, + "flos": 16092305605440.0, + "grad_norm": 1.715781400526348, + "language_loss": 0.78980517, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.81158656, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 0.89794922, + "router_z_loss_mlp": 0.14874268, + "step": 5169, + "time_per_iteration": 2.7086286544799805 + }, + { + "auxiliary_loss_clip": 0.01139313, + "auxiliary_loss_mlp": 0.01040915, + "balance_loss_clip": 1.05033946, + "balance_loss_mlp": 1.02625191, + "epoch": 0.3108372162934015, + "flos": 38535475537440.0, + "grad_norm": 1.5723325291972818, + "language_loss": 0.64167851, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66348076, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.14648438, + "step": 5170, + "time_per_iteration": 2.7520525455474854 + }, + { + "auxiliary_loss_clip": 0.01139445, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_clip": 1.05046153, + "balance_loss_mlp": 1.02828431, + "epoch": 0.31089733954606946, + "flos": 35056914104160.0, + "grad_norm": 2.3239809572998924, + "language_loss": 0.77454746, + "learning_rate": 3.22783492314295e-06, + "loss": 0.79636908, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 0.88964844, + "router_z_loss_mlp": 0.14453125, + "step": 5171, + "time_per_iteration": 2.7332682609558105 + }, + { + "auxiliary_loss_clip": 0.01138953, + "auxiliary_loss_mlp": 0.01042753, + "balance_loss_clip": 1.05103683, + "balance_loss_mlp": 1.02814388, + "epoch": 0.3109574627987374, + "flos": 24017408651520.0, + "grad_norm": 1.7369384308224078, + "language_loss": 0.83882904, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.86064613, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 0.87744141, + "router_z_loss_mlp": 0.14624023, + "step": 5172, + "time_per_iteration": 2.6690104007720947 + }, + { + "auxiliary_loss_clip": 0.01137323, + "auxiliary_loss_mlp": 0.0104405, + "balance_loss_clip": 1.04852283, + "balance_loss_mlp": 1.02937531, + "epoch": 0.3110175860514054, + "flos": 17916984047520.0, + "grad_norm": 2.0415976067251953, + "language_loss": 0.84251165, + "learning_rate": 3.227219971129842e-06, + "loss": 0.8643254, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.14672852, + "step": 5173, + "time_per_iteration": 2.692500591278076 + }, + { + "auxiliary_loss_clip": 0.01131628, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.04817581, + "balance_loss_mlp": 1.02045441, + "epoch": 0.31107770930407336, + "flos": 31407921875520.0, + "grad_norm": 1.6117834865781364, + "language_loss": 0.8329795, + "learning_rate": 3.226912425313001e-06, + "loss": 0.85463405, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13366699, + "step": 5174, + "time_per_iteration": 2.7448716163635254 + }, + { + "auxiliary_loss_clip": 0.01138012, + "auxiliary_loss_mlp": 0.01039326, + "balance_loss_clip": 1.05056405, + "balance_loss_mlp": 1.02529478, + "epoch": 0.3111378325567413, + "flos": 23437850999040.0, + "grad_norm": 1.9744575041640406, + "language_loss": 0.84561741, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.86739075, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.14038086, + "step": 5175, + "time_per_iteration": 2.725719451904297 + }, + { + "auxiliary_loss_clip": 0.01137094, + "auxiliary_loss_mlp": 0.01041576, + "balance_loss_clip": 1.05206442, + "balance_loss_mlp": 1.0262692, + "epoch": 0.3111979558094093, + "flos": 28914560465760.0, + "grad_norm": 1.7757914761954445, + "language_loss": 0.83445203, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.85623872, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.15319824, + "step": 5176, + "time_per_iteration": 2.702561140060425 + }, + { + "auxiliary_loss_clip": 0.01134119, + "auxiliary_loss_mlp": 0.01039863, + "balance_loss_clip": 1.04731095, + "balance_loss_mlp": 1.02456808, + "epoch": 0.31125807906207725, + "flos": 25664525256960.0, + "grad_norm": 1.8763067124863022, + "language_loss": 0.80128467, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.82302445, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.15283203, + "step": 5177, + "time_per_iteration": 2.6394333839416504 + }, + { + "auxiliary_loss_clip": 0.01139742, + "auxiliary_loss_mlp": 0.01041511, + "balance_loss_clip": 1.05215812, + "balance_loss_mlp": 1.02655029, + "epoch": 0.3113182023147452, + "flos": 28158251322240.0, + "grad_norm": 1.6710112784084874, + "language_loss": 0.80533957, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.82715213, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.1496582, + "step": 5178, + "time_per_iteration": 2.681262493133545 + }, + { + "auxiliary_loss_clip": 0.01136958, + "auxiliary_loss_mlp": 0.01040287, + "balance_loss_clip": 1.04799247, + "balance_loss_mlp": 1.02579141, + "epoch": 0.3113783255674132, + "flos": 14445229517280.0, + "grad_norm": 1.8367624019891216, + "language_loss": 0.81392729, + "learning_rate": 3.225373998592471e-06, + "loss": 0.83569968, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.14489746, + "step": 5179, + "time_per_iteration": 2.6166679859161377 + }, + { + "auxiliary_loss_clip": 0.01138442, + "auxiliary_loss_mlp": 0.01040497, + "balance_loss_clip": 1.05238628, + "balance_loss_mlp": 1.02602458, + "epoch": 0.31143844882008115, + "flos": 19876201325280.0, + "grad_norm": 3.2638524614971653, + "language_loss": 0.78485298, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.80664241, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.14471436, + "step": 5180, + "time_per_iteration": 2.6585488319396973 + }, + { + "auxiliary_loss_clip": 0.01136385, + "auxiliary_loss_mlp": 0.0103007, + "balance_loss_clip": 1.04907382, + "balance_loss_mlp": 1.01524603, + "epoch": 0.3114985720727491, + "flos": 28330951085280.0, + "grad_norm": 1.5922007806383534, + "language_loss": 0.83120716, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85287166, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.14825439, + "step": 5181, + "time_per_iteration": 2.6912763118743896 + }, + { + "auxiliary_loss_clip": 0.01135895, + "auxiliary_loss_mlp": 0.01035793, + "balance_loss_clip": 1.05188346, + "balance_loss_mlp": 1.02229238, + "epoch": 0.3115586953254171, + "flos": 36661210295040.0, + "grad_norm": 1.9428836737892792, + "language_loss": 0.73900318, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76072001, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13500977, + "step": 5182, + "time_per_iteration": 2.780879259109497 + }, + { + "auxiliary_loss_clip": 0.01142431, + "auxiliary_loss_mlp": 0.01040117, + "balance_loss_clip": 1.05139697, + "balance_loss_mlp": 1.0249896, + "epoch": 0.3116188185780851, + "flos": 31318472422080.0, + "grad_norm": 2.0626935405696494, + "language_loss": 0.70316637, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72499186, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.15148926, + "step": 5183, + "time_per_iteration": 2.6946051120758057 + }, + { + "auxiliary_loss_clip": 0.01051152, + "auxiliary_loss_mlp": 0.01003021, + "balance_loss_clip": 1.0221951, + "balance_loss_mlp": 1.00108051, + "epoch": 0.31167894183075306, + "flos": 84817722456000.0, + "grad_norm": 0.9407025978865423, + "language_loss": 0.59622562, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61676741, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.28930664, + "router_z_loss_mlp": 0.01937866, + "step": 5184, + "time_per_iteration": 3.314711332321167 + }, + { + "auxiliary_loss_clip": 0.01137047, + "auxiliary_loss_mlp": 0.01046795, + "balance_loss_clip": 1.04982007, + "balance_loss_mlp": 1.0328232, + "epoch": 0.31173906508342103, + "flos": 18229165754400.0, + "grad_norm": 2.4857622304658595, + "language_loss": 0.70370114, + "learning_rate": 3.223526353268311e-06, + "loss": 0.72553951, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.13952637, + "step": 5185, + "time_per_iteration": 2.729804754257202 + }, + { + "auxiliary_loss_clip": 0.0114036, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_clip": 1.05123222, + "balance_loss_mlp": 1.03179336, + "epoch": 0.311799188336089, + "flos": 19738056451680.0, + "grad_norm": 2.5544748283233454, + "language_loss": 0.64264667, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.66450846, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 0.89111328, + "router_z_loss_mlp": 0.14019775, + "step": 5186, + "time_per_iteration": 4.1205971240997314 + }, + { + "auxiliary_loss_clip": 0.01143473, + "auxiliary_loss_mlp": 0.01047705, + "balance_loss_clip": 1.05182278, + "balance_loss_mlp": 1.0316478, + "epoch": 0.31185931158875696, + "flos": 30517114413600.0, + "grad_norm": 2.0625189000181154, + "language_loss": 0.8648172, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.886729, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 0.91650391, + "router_z_loss_mlp": 0.16052246, + "step": 5187, + "time_per_iteration": 4.1852195262908936 + }, + { + "auxiliary_loss_clip": 0.01133926, + "auxiliary_loss_mlp": 0.01046088, + "balance_loss_clip": 1.04687059, + "balance_loss_mlp": 1.03142536, + "epoch": 0.3119194348414249, + "flos": 45432841766400.0, + "grad_norm": 2.2368274756739384, + "language_loss": 0.62883484, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.650635, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.14648438, + "step": 5188, + "time_per_iteration": 4.251172780990601 + }, + { + "auxiliary_loss_clip": 0.01139638, + "auxiliary_loss_mlp": 0.01039994, + "balance_loss_clip": 1.05191255, + "balance_loss_mlp": 1.02580786, + "epoch": 0.3119795580940929, + "flos": 18318169517760.0, + "grad_norm": 2.4223836146553754, + "language_loss": 0.83274996, + "learning_rate": 3.222293661638346e-06, + "loss": 0.85454625, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.1418457, + "step": 5189, + "time_per_iteration": 2.7142627239227295 + }, + { + "auxiliary_loss_clip": 0.01133727, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.04793561, + "balance_loss_mlp": 1.02098286, + "epoch": 0.31203968134676086, + "flos": 19520591444640.0, + "grad_norm": 2.259561986167714, + "language_loss": 0.79245269, + "learning_rate": 3.22198537282789e-06, + "loss": 0.81414038, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 0.85742188, + "router_z_loss_mlp": 0.140625, + "step": 5190, + "time_per_iteration": 2.6595354080200195 + }, + { + "auxiliary_loss_clip": 0.01136858, + "auxiliary_loss_mlp": 0.01038334, + "balance_loss_clip": 1.04864383, + "balance_loss_mlp": 1.02423751, + "epoch": 0.3120998045994288, + "flos": 29087057642400.0, + "grad_norm": 1.5070286458454063, + "language_loss": 0.7533797, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77513164, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.14080811, + "step": 5191, + "time_per_iteration": 2.7316877841949463 + }, + { + "auxiliary_loss_clip": 0.01050375, + "auxiliary_loss_mlp": 0.01003428, + "balance_loss_clip": 1.02144623, + "balance_loss_mlp": 1.00164557, + "epoch": 0.3121599278520968, + "flos": 84419859402720.0, + "grad_norm": 0.8467871204835459, + "language_loss": 0.63984472, + "learning_rate": 3.221368656205247e-06, + "loss": 0.66038275, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.28881836, + "router_z_loss_mlp": 0.01780701, + "step": 5192, + "time_per_iteration": 4.748478412628174 + }, + { + "auxiliary_loss_clip": 0.0113694, + "auxiliary_loss_mlp": 0.01046362, + "balance_loss_clip": 1.04779613, + "balance_loss_mlp": 1.03111529, + "epoch": 0.31222005110476475, + "flos": 29048734645920.0, + "grad_norm": 1.6988327657335116, + "language_loss": 0.79710817, + "learning_rate": 3.221060228416446e-06, + "loss": 0.81894124, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.15234375, + "step": 5193, + "time_per_iteration": 2.665560007095337 + }, + { + "auxiliary_loss_clip": 0.01135968, + "auxiliary_loss_mlp": 0.01038356, + "balance_loss_clip": 1.0469054, + "balance_loss_mlp": 1.02320492, + "epoch": 0.3122801743574327, + "flos": 30781532597760.0, + "grad_norm": 2.0163415322586413, + "language_loss": 0.72361827, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.74536145, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 0.88964844, + "router_z_loss_mlp": 0.15130615, + "step": 5194, + "time_per_iteration": 2.7360198497772217 + }, + { + "auxiliary_loss_clip": 0.01135739, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.04928493, + "balance_loss_mlp": 1.02456713, + "epoch": 0.3123402976101007, + "flos": 28023915072960.0, + "grad_norm": 1.508857647867586, + "language_loss": 0.76334596, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78508896, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.13989258, + "step": 5195, + "time_per_iteration": 2.7489941120147705 + }, + { + "auxiliary_loss_clip": 0.01135021, + "auxiliary_loss_mlp": 0.01039223, + "balance_loss_clip": 1.04602671, + "balance_loss_mlp": 1.02487004, + "epoch": 0.3124004208627687, + "flos": 30739765632480.0, + "grad_norm": 4.421214921895053, + "language_loss": 0.77832341, + "learning_rate": 3.220134667280476e-06, + "loss": 0.80006588, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 0.88964844, + "router_z_loss_mlp": 0.14355469, + "step": 5196, + "time_per_iteration": 2.8013317584991455 + }, + { + "auxiliary_loss_clip": 0.01050818, + "auxiliary_loss_mlp": 0.01003096, + "balance_loss_clip": 1.02144122, + "balance_loss_mlp": 1.00133491, + "epoch": 0.31246054411543667, + "flos": 82345105893600.0, + "grad_norm": 0.7765343107772756, + "language_loss": 0.54825813, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56879729, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.29394531, + "router_z_loss_mlp": 0.01759338, + "step": 5197, + "time_per_iteration": 3.298241138458252 + }, + { + "auxiliary_loss_clip": 0.01133724, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.04831767, + "balance_loss_mlp": 1.02198172, + "epoch": 0.31252066736810463, + "flos": 21790774910880.0, + "grad_norm": 1.6356270315187251, + "language_loss": 0.66376108, + "learning_rate": 3.21951739516552e-06, + "loss": 0.68545729, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.13916016, + "step": 5198, + "time_per_iteration": 2.664396286010742 + }, + { + "auxiliary_loss_clip": 0.01138643, + "auxiliary_loss_mlp": 0.0103715, + "balance_loss_clip": 1.04857409, + "balance_loss_mlp": 1.02240348, + "epoch": 0.3125807906207726, + "flos": 22543437499200.0, + "grad_norm": 2.453336824067714, + "language_loss": 0.69819504, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71995294, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.14746094, + "step": 5199, + "time_per_iteration": 2.722135543823242 + }, + { + "auxiliary_loss_clip": 0.01135415, + "auxiliary_loss_mlp": 0.01046103, + "balance_loss_clip": 1.04840565, + "balance_loss_mlp": 1.03141093, + "epoch": 0.31264091387344056, + "flos": 23119064975520.0, + "grad_norm": 2.043086677740596, + "language_loss": 0.78935456, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.81116974, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.14709473, + "step": 5200, + "time_per_iteration": 2.6683473587036133 + }, + { + "auxiliary_loss_clip": 0.01132443, + "auxiliary_loss_mlp": 0.01030026, + "balance_loss_clip": 1.04825759, + "balance_loss_mlp": 1.01648986, + "epoch": 0.3127010371261085, + "flos": 26196157317600.0, + "grad_norm": 2.3643072109333416, + "language_loss": 0.83438784, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.85601246, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.13525391, + "step": 5201, + "time_per_iteration": 2.715536594390869 + }, + { + "auxiliary_loss_clip": 0.01137735, + "auxiliary_loss_mlp": 0.01042018, + "balance_loss_clip": 1.05030751, + "balance_loss_mlp": 1.02687204, + "epoch": 0.3127611603787765, + "flos": 18713115326880.0, + "grad_norm": 2.0857744850032724, + "language_loss": 0.69046003, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71225756, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.15161133, + "step": 5202, + "time_per_iteration": 2.647263765335083 + }, + { + "auxiliary_loss_clip": 0.01136742, + "auxiliary_loss_mlp": 0.01035917, + "balance_loss_clip": 1.04862905, + "balance_loss_mlp": 1.02276254, + "epoch": 0.31282128363144446, + "flos": 21478957859520.0, + "grad_norm": 2.4460894424623207, + "language_loss": 0.83759499, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.85932159, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 0.88134766, + "router_z_loss_mlp": 0.13153076, + "step": 5203, + "time_per_iteration": 2.655054807662964 + }, + { + "auxiliary_loss_clip": 0.01136929, + "auxiliary_loss_mlp": 0.01040359, + "balance_loss_clip": 1.04926777, + "balance_loss_mlp": 1.0261972, + "epoch": 0.3128814068841124, + "flos": 32649477144480.0, + "grad_norm": 2.2974341959465407, + "language_loss": 0.60398751, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.62576044, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 0.87792969, + "router_z_loss_mlp": 0.14160156, + "step": 5204, + "time_per_iteration": 2.7527103424072266 + }, + { + "auxiliary_loss_clip": 0.01132543, + "auxiliary_loss_mlp": 0.01036067, + "balance_loss_clip": 1.04792035, + "balance_loss_mlp": 1.02318645, + "epoch": 0.3129415301367804, + "flos": 27176981474880.0, + "grad_norm": 1.8861073083997857, + "language_loss": 0.66047883, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68216491, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.12866211, + "step": 5205, + "time_per_iteration": 2.7036125659942627 + }, + { + "auxiliary_loss_clip": 0.01138005, + "auxiliary_loss_mlp": 0.01044396, + "balance_loss_clip": 1.05012655, + "balance_loss_mlp": 1.02882791, + "epoch": 0.31300165338944835, + "flos": 32293340539200.0, + "grad_norm": 1.5872807909923967, + "language_loss": 0.77140129, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.79322535, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.15570068, + "step": 5206, + "time_per_iteration": 2.730468273162842 + }, + { + "auxiliary_loss_clip": 0.01133977, + "auxiliary_loss_mlp": 0.01034748, + "balance_loss_clip": 1.04703736, + "balance_loss_mlp": 1.02100348, + "epoch": 0.3130617766421163, + "flos": 26777011523040.0, + "grad_norm": 2.16699599910381, + "language_loss": 0.84032595, + "learning_rate": 3.216737382911672e-06, + "loss": 0.86201316, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.13739014, + "step": 5207, + "time_per_iteration": 2.696223735809326 + }, + { + "auxiliary_loss_clip": 0.01131593, + "auxiliary_loss_mlp": 0.01041382, + "balance_loss_clip": 1.04762769, + "balance_loss_mlp": 1.02786994, + "epoch": 0.3131218998947843, + "flos": 28422264333600.0, + "grad_norm": 1.530830177820576, + "language_loss": 0.71427572, + "learning_rate": 3.216428261810999e-06, + "loss": 0.73600543, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.13525391, + "step": 5208, + "time_per_iteration": 2.7088775634765625 + }, + { + "auxiliary_loss_clip": 0.0113832, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.05053973, + "balance_loss_mlp": 1.02549505, + "epoch": 0.3131820231474523, + "flos": 25794890812800.0, + "grad_norm": 2.0175215672651916, + "language_loss": 0.74961829, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.77140415, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.14782715, + "step": 5209, + "time_per_iteration": 2.714641571044922 + }, + { + "auxiliary_loss_clip": 0.01134009, + "auxiliary_loss_mlp": 0.0103198, + "balance_loss_clip": 1.0464009, + "balance_loss_mlp": 1.01838398, + "epoch": 0.31324214640012027, + "flos": 29175575198400.0, + "grad_norm": 1.8429299440573017, + "language_loss": 0.7761429, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79780275, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.13586426, + "step": 5210, + "time_per_iteration": 2.7151432037353516 + }, + { + "auxiliary_loss_clip": 0.01129776, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.04684281, + "balance_loss_mlp": 1.02352715, + "epoch": 0.31330226965278823, + "flos": 27133634335680.0, + "grad_norm": 1.796183881030241, + "language_loss": 0.79345107, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.81511778, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13366699, + "step": 5211, + "time_per_iteration": 2.6782314777374268 + }, + { + "auxiliary_loss_clip": 0.01132196, + "auxiliary_loss_mlp": 0.01030279, + "balance_loss_clip": 1.0470773, + "balance_loss_mlp": 1.01691008, + "epoch": 0.3133623929054562, + "flos": 24103940860800.0, + "grad_norm": 1.7976419429462724, + "language_loss": 0.79907084, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.82069564, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.13360596, + "step": 5212, + "time_per_iteration": 2.7222163677215576 + }, + { + "auxiliary_loss_clip": 0.01139566, + "auxiliary_loss_mlp": 0.01045171, + "balance_loss_clip": 1.05050135, + "balance_loss_mlp": 1.03079474, + "epoch": 0.31342251615812416, + "flos": 33143920692480.0, + "grad_norm": 2.1227436166082927, + "language_loss": 0.70824039, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73008776, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.14367676, + "step": 5213, + "time_per_iteration": 2.699655055999756 + }, + { + "auxiliary_loss_clip": 0.01138241, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.05087304, + "balance_loss_mlp": 1.02130377, + "epoch": 0.31348263941079213, + "flos": 24684430410720.0, + "grad_norm": 2.378190262679126, + "language_loss": 0.77471107, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.79644608, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.13964844, + "step": 5214, + "time_per_iteration": 2.7090697288513184 + }, + { + "auxiliary_loss_clip": 0.01130749, + "auxiliary_loss_mlp": 0.01029039, + "balance_loss_clip": 1.04774952, + "balance_loss_mlp": 1.01607466, + "epoch": 0.3135427626634601, + "flos": 30026641559040.0, + "grad_norm": 1.5716625044594252, + "language_loss": 0.82460773, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.84620571, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.12963867, + "step": 5215, + "time_per_iteration": 2.7018635272979736 + }, + { + "auxiliary_loss_clip": 0.01134297, + "auxiliary_loss_mlp": 0.01035284, + "balance_loss_clip": 1.04775906, + "balance_loss_mlp": 1.02045465, + "epoch": 0.31360288591612806, + "flos": 25575278389920.0, + "grad_norm": 2.383371051150375, + "language_loss": 0.79516006, + "learning_rate": 3.213953633415686e-06, + "loss": 0.81685591, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.14819336, + "step": 5216, + "time_per_iteration": 2.723658323287964 + }, + { + "auxiliary_loss_clip": 0.0113647, + "auxiliary_loss_mlp": 0.01042841, + "balance_loss_clip": 1.04762089, + "balance_loss_mlp": 1.0272367, + "epoch": 0.313663009168796, + "flos": 32922079819200.0, + "grad_norm": 1.68582330947513, + "language_loss": 0.68457282, + "learning_rate": 3.213644097593477e-06, + "loss": 0.706366, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.15600586, + "step": 5217, + "time_per_iteration": 2.772420883178711 + }, + { + "auxiliary_loss_clip": 0.01134657, + "auxiliary_loss_mlp": 0.01030741, + "balance_loss_clip": 1.04780579, + "balance_loss_mlp": 1.01734746, + "epoch": 0.313723132421464, + "flos": 22013345095200.0, + "grad_norm": 2.016735961664886, + "language_loss": 0.80966419, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.83131814, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.13391113, + "step": 5218, + "time_per_iteration": 2.682730197906494 + }, + { + "auxiliary_loss_clip": 0.01133239, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.0460391, + "balance_loss_mlp": 1.0230552, + "epoch": 0.31378325567413196, + "flos": 27441440176320.0, + "grad_norm": 2.8874406884669725, + "language_loss": 0.69555449, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.71725619, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.13885498, + "step": 5219, + "time_per_iteration": 2.6609079837799072 + }, + { + "auxiliary_loss_clip": 0.01134994, + "auxiliary_loss_mlp": 0.01040232, + "balance_loss_clip": 1.04962635, + "balance_loss_mlp": 1.02716088, + "epoch": 0.3138433789267999, + "flos": 27356123485440.0, + "grad_norm": 3.1277848658569765, + "language_loss": 0.80037069, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.82212293, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.13061523, + "step": 5220, + "time_per_iteration": 2.7713475227355957 + }, + { + "auxiliary_loss_clip": 0.01133643, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.0482446, + "balance_loss_mlp": 1.02151346, + "epoch": 0.3139035021794679, + "flos": 15869249213760.0, + "grad_norm": 1.6837207660810587, + "language_loss": 0.73382175, + "learning_rate": 3.212405494206986e-06, + "loss": 0.7555086, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.13531494, + "step": 5221, + "time_per_iteration": 2.6920106410980225 + }, + { + "auxiliary_loss_clip": 0.01131709, + "auxiliary_loss_mlp": 0.01034479, + "balance_loss_clip": 1.04785144, + "balance_loss_mlp": 1.02167594, + "epoch": 0.31396362543213585, + "flos": 20677275712800.0, + "grad_norm": 2.1524741711458337, + "language_loss": 0.81841737, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.84007925, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.12823486, + "step": 5222, + "time_per_iteration": 2.686332941055298 + }, + { + "auxiliary_loss_clip": 0.01136615, + "auxiliary_loss_mlp": 0.01039514, + "balance_loss_clip": 1.04848385, + "balance_loss_mlp": 1.02407622, + "epoch": 0.31402374868480387, + "flos": 24595264578240.0, + "grad_norm": 2.038939895745122, + "language_loss": 0.70538735, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.72714859, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 0.88134766, + "router_z_loss_mlp": 0.15429688, + "step": 5223, + "time_per_iteration": 2.6919639110565186 + }, + { + "auxiliary_loss_clip": 0.01132781, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.0471648, + "balance_loss_mlp": 1.01884127, + "epoch": 0.31408387193747184, + "flos": 25930685684160.0, + "grad_norm": 1.590566855809503, + "language_loss": 0.80425751, + "learning_rate": 3.211476058893379e-06, + "loss": 0.8259089, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.13525391, + "step": 5224, + "time_per_iteration": 2.6469240188598633 + }, + { + "auxiliary_loss_clip": 0.01141642, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.05027866, + "balance_loss_mlp": 1.0266968, + "epoch": 0.3141439951901398, + "flos": 33537448396800.0, + "grad_norm": 1.9605374229380392, + "language_loss": 0.58277875, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.60460913, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 0.91455078, + "router_z_loss_mlp": 0.14685059, + "step": 5225, + "time_per_iteration": 2.7220640182495117 + }, + { + "auxiliary_loss_clip": 0.01130139, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.04741275, + "balance_loss_mlp": 1.0195483, + "epoch": 0.31420411844280777, + "flos": 21783684386880.0, + "grad_norm": 1.707549037866498, + "language_loss": 0.81751019, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.83913618, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.12915039, + "step": 5226, + "time_per_iteration": 5.4955058097839355 + }, + { + "auxiliary_loss_clip": 0.01137138, + "auxiliary_loss_mlp": 0.01041049, + "balance_loss_clip": 1.04972064, + "balance_loss_mlp": 1.02654719, + "epoch": 0.31426424169547573, + "flos": 26377082088480.0, + "grad_norm": 1.8206360886365016, + "language_loss": 0.74035394, + "learning_rate": 3.210546210126141e-06, + "loss": 0.76213574, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.14501953, + "step": 5227, + "time_per_iteration": 2.691164970397949 + }, + { + "auxiliary_loss_clip": 0.01138254, + "auxiliary_loss_mlp": 0.01037105, + "balance_loss_clip": 1.05087161, + "balance_loss_mlp": 1.02305055, + "epoch": 0.3143243649481437, + "flos": 37730146835520.0, + "grad_norm": 3.230765722300039, + "language_loss": 0.67739218, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.69914579, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.140625, + "step": 5228, + "time_per_iteration": 4.20283579826355 + }, + { + "auxiliary_loss_clip": 0.01135696, + "auxiliary_loss_mlp": 0.0103917, + "balance_loss_clip": 1.04902506, + "balance_loss_mlp": 1.02577078, + "epoch": 0.31438448820081166, + "flos": 27846798926400.0, + "grad_norm": 1.8019801511994242, + "language_loss": 0.79429632, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.81604499, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.13391113, + "step": 5229, + "time_per_iteration": 2.7079989910125732 + }, + { + "auxiliary_loss_clip": 0.01134154, + "auxiliary_loss_mlp": 0.01037471, + "balance_loss_clip": 1.04891336, + "balance_loss_mlp": 1.02339244, + "epoch": 0.3144446114534796, + "flos": 28421535022560.0, + "grad_norm": 1.8986680209322364, + "language_loss": 0.69789821, + "learning_rate": 3.209615948222611e-06, + "loss": 0.71961445, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.14099121, + "step": 5230, + "time_per_iteration": 2.6741106510162354 + }, + { + "auxiliary_loss_clip": 0.01135208, + "auxiliary_loss_mlp": 0.0104103, + "balance_loss_clip": 1.04670954, + "balance_loss_mlp": 1.02584267, + "epoch": 0.3145047347061476, + "flos": 38261616827040.0, + "grad_norm": 1.7844836969232414, + "language_loss": 0.79583943, + "learning_rate": 3.209305769168239e-06, + "loss": 0.8176018, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.15197754, + "step": 5231, + "time_per_iteration": 2.7396609783172607 + }, + { + "auxiliary_loss_clip": 0.01133751, + "auxiliary_loss_mlp": 0.01039547, + "balance_loss_clip": 1.04946983, + "balance_loss_mlp": 1.02518201, + "epoch": 0.31456485795881556, + "flos": 13286721971520.0, + "grad_norm": 2.730268036039064, + "language_loss": 0.85335422, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.87508714, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.14367676, + "step": 5232, + "time_per_iteration": 4.062649250030518 + }, + { + "auxiliary_loss_clip": 0.01132268, + "auxiliary_loss_mlp": 0.01043603, + "balance_loss_clip": 1.04786348, + "balance_loss_mlp": 1.02901173, + "epoch": 0.3146249812114835, + "flos": 20855931516000.0, + "grad_norm": 4.855599551437342, + "language_loss": 0.79889393, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82065266, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.14593506, + "step": 5233, + "time_per_iteration": 2.6381115913391113 + }, + { + "auxiliary_loss_clip": 0.01140993, + "auxiliary_loss_mlp": 0.01038805, + "balance_loss_clip": 1.05329418, + "balance_loss_mlp": 1.02422595, + "epoch": 0.3146851044641515, + "flos": 67469437429920.0, + "grad_norm": 1.75756868788397, + "language_loss": 0.70685691, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.72865486, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 0.87744141, + "router_z_loss_mlp": 0.14587402, + "step": 5234, + "time_per_iteration": 2.9806582927703857 + }, + { + "auxiliary_loss_clip": 0.01136831, + "auxiliary_loss_mlp": 0.01033039, + "balance_loss_clip": 1.04968226, + "balance_loss_mlp": 1.01896012, + "epoch": 0.31474522771681945, + "flos": 32965345923840.0, + "grad_norm": 1.9258878135995698, + "language_loss": 0.72186613, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74356478, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.14068604, + "step": 5235, + "time_per_iteration": 2.693831205368042 + }, + { + "auxiliary_loss_clip": 0.01134904, + "auxiliary_loss_mlp": 0.01037142, + "balance_loss_clip": 1.04845715, + "balance_loss_mlp": 1.02310514, + "epoch": 0.3148053509694875, + "flos": 25931658098880.0, + "grad_norm": 3.162320693625303, + "language_loss": 0.78349823, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.8052187, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.14038086, + "step": 5236, + "time_per_iteration": 2.7219793796539307 + }, + { + "auxiliary_loss_clip": 0.01140407, + "auxiliary_loss_mlp": 0.01037659, + "balance_loss_clip": 1.05081177, + "balance_loss_mlp": 1.02321076, + "epoch": 0.31486547422215544, + "flos": 38130481442880.0, + "grad_norm": 1.6855852326903051, + "language_loss": 0.75868994, + "learning_rate": 3.207443732256881e-06, + "loss": 0.78047061, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.14459229, + "step": 5237, + "time_per_iteration": 2.7122836112976074 + }, + { + "auxiliary_loss_clip": 0.01132933, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.04999936, + "balance_loss_mlp": 1.02064431, + "epoch": 0.3149255974748234, + "flos": 24194848936320.0, + "grad_norm": 4.602657745501173, + "language_loss": 0.80261636, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82427907, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.12695312, + "step": 5238, + "time_per_iteration": 2.6641793251037598 + }, + { + "auxiliary_loss_clip": 0.01057664, + "auxiliary_loss_mlp": 0.01020706, + "balance_loss_clip": 1.02863336, + "balance_loss_mlp": 1.01884151, + "epoch": 0.31498572072749137, + "flos": 82589549058720.0, + "grad_norm": 0.8383298389584906, + "language_loss": 0.67968237, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.7004661, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.29052734, + "router_z_loss_mlp": 0.01860046, + "step": 5239, + "time_per_iteration": 3.3174169063568115 + }, + { + "auxiliary_loss_clip": 0.01141075, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.05153263, + "balance_loss_mlp": 1.02778196, + "epoch": 0.31504584398015933, + "flos": 24151461279840.0, + "grad_norm": 2.400417433768751, + "language_loss": 0.82549977, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.84734488, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.15649414, + "step": 5240, + "time_per_iteration": 2.6508195400238037 + }, + { + "auxiliary_loss_clip": 0.01138077, + "auxiliary_loss_mlp": 0.01034957, + "balance_loss_clip": 1.05313504, + "balance_loss_mlp": 1.02107549, + "epoch": 0.3151059672328273, + "flos": 32476574795040.0, + "grad_norm": 2.175908380445412, + "language_loss": 0.8123647, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.834095, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.13873291, + "step": 5241, + "time_per_iteration": 2.803281545639038 + }, + { + "auxiliary_loss_clip": 0.01136388, + "auxiliary_loss_mlp": 0.01037737, + "balance_loss_clip": 1.05220222, + "balance_loss_mlp": 1.0238905, + "epoch": 0.31516609048549526, + "flos": 29534791116960.0, + "grad_norm": 2.318273553982837, + "language_loss": 0.74152809, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76326931, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13842773, + "step": 5242, + "time_per_iteration": 2.675902843475342 + }, + { + "auxiliary_loss_clip": 0.01137277, + "auxiliary_loss_mlp": 0.01033849, + "balance_loss_clip": 1.05124283, + "balance_loss_mlp": 1.01879275, + "epoch": 0.31522621373816323, + "flos": 31675216786560.0, + "grad_norm": 2.007416874836603, + "language_loss": 0.73526871, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.75697994, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.15063477, + "step": 5243, + "time_per_iteration": 2.732191324234009 + }, + { + "auxiliary_loss_clip": 0.0113776, + "auxiliary_loss_mlp": 0.01034042, + "balance_loss_clip": 1.05121994, + "balance_loss_mlp": 1.01951647, + "epoch": 0.3152863369908312, + "flos": 26733218693760.0, + "grad_norm": 2.0553070691222985, + "language_loss": 0.64292037, + "learning_rate": 3.205269272758513e-06, + "loss": 0.6646384, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.14520264, + "step": 5244, + "time_per_iteration": 2.6760013103485107 + }, + { + "auxiliary_loss_clip": 0.01141477, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.05271983, + "balance_loss_mlp": 1.021312, + "epoch": 0.31534646024349916, + "flos": 19866639247200.0, + "grad_norm": 2.3486967740184803, + "language_loss": 0.91051745, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93228614, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.14068604, + "step": 5245, + "time_per_iteration": 2.658705234527588 + }, + { + "auxiliary_loss_clip": 0.01140119, + "auxiliary_loss_mlp": 0.01041606, + "balance_loss_clip": 1.05156982, + "balance_loss_mlp": 1.02675307, + "epoch": 0.3154065834961671, + "flos": 30160370049120.0, + "grad_norm": 2.2609253856045326, + "language_loss": 0.75283134, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77464867, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.14868164, + "step": 5246, + "time_per_iteration": 2.696012496948242 + }, + { + "auxiliary_loss_clip": 0.01136953, + "auxiliary_loss_mlp": 0.01040154, + "balance_loss_clip": 1.04934597, + "balance_loss_mlp": 1.02563453, + "epoch": 0.3154667067488351, + "flos": 43161078126240.0, + "grad_norm": 1.836444356475665, + "language_loss": 0.61506742, + "learning_rate": 3.204336675750321e-06, + "loss": 0.6368385, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.14526367, + "step": 5247, + "time_per_iteration": 2.7746434211730957 + }, + { + "auxiliary_loss_clip": 0.01136964, + "auxiliary_loss_mlp": 0.01043955, + "balance_loss_clip": 1.04895115, + "balance_loss_mlp": 1.02974534, + "epoch": 0.31552683000150306, + "flos": 21300221021760.0, + "grad_norm": 2.1914622684693046, + "language_loss": 0.8223893, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.84419852, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.14208984, + "step": 5248, + "time_per_iteration": 2.781219244003296 + }, + { + "auxiliary_loss_clip": 0.0113851, + "auxiliary_loss_mlp": 0.01047183, + "balance_loss_clip": 1.05114508, + "balance_loss_mlp": 1.03142393, + "epoch": 0.3155869532541711, + "flos": 22458566498400.0, + "grad_norm": 1.8438146385834902, + "language_loss": 0.84741735, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.86927426, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.15759277, + "step": 5249, + "time_per_iteration": 2.6454665660858154 + }, + { + "auxiliary_loss_clip": 0.01138161, + "auxiliary_loss_mlp": 0.01036231, + "balance_loss_clip": 1.05012012, + "balance_loss_mlp": 1.02065063, + "epoch": 0.31564707650683904, + "flos": 26331749602560.0, + "grad_norm": 1.8430699307365581, + "language_loss": 0.85710388, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87884784, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.15576172, + "step": 5250, + "time_per_iteration": 2.671025514602661 + }, + { + "auxiliary_loss_clip": 0.01137978, + "auxiliary_loss_mlp": 0.01039444, + "balance_loss_clip": 1.05062509, + "balance_loss_mlp": 1.02441216, + "epoch": 0.315707199759507, + "flos": 25662215772000.0, + "grad_norm": 3.7731054741836796, + "language_loss": 0.67887706, + "learning_rate": 3.203092573767835e-06, + "loss": 0.70065129, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.15026855, + "step": 5251, + "time_per_iteration": 2.635965347290039 + }, + { + "auxiliary_loss_clip": 0.0113687, + "auxiliary_loss_mlp": 0.01036823, + "balance_loss_clip": 1.05088961, + "balance_loss_mlp": 1.02212429, + "epoch": 0.31576732301217497, + "flos": 32737549010400.0, + "grad_norm": 1.7752886624400697, + "language_loss": 0.78965545, + "learning_rate": 3.202781434189246e-06, + "loss": 0.81139243, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.14709473, + "step": 5252, + "time_per_iteration": 2.7072203159332275 + }, + { + "auxiliary_loss_clip": 0.01137373, + "auxiliary_loss_mlp": 0.01041593, + "balance_loss_clip": 1.05289721, + "balance_loss_mlp": 1.02747834, + "epoch": 0.31582744626484294, + "flos": 27712341125280.0, + "grad_norm": 1.7192432272205675, + "language_loss": 0.74137634, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76316595, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14141846, + "step": 5253, + "time_per_iteration": 2.720010280609131 + }, + { + "auxiliary_loss_clip": 0.01136758, + "auxiliary_loss_mlp": 0.01036508, + "balance_loss_clip": 1.04867148, + "balance_loss_mlp": 1.02187467, + "epoch": 0.3158875695175109, + "flos": 29226377517120.0, + "grad_norm": 1.9902220959310433, + "language_loss": 0.73845506, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.76018775, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14630127, + "step": 5254, + "time_per_iteration": 2.683863878250122 + }, + { + "auxiliary_loss_clip": 0.01138527, + "auxiliary_loss_mlp": 0.01034752, + "balance_loss_clip": 1.05027819, + "balance_loss_mlp": 1.02010751, + "epoch": 0.31594769277017887, + "flos": 16181552472480.0, + "grad_norm": 3.00184988624586, + "language_loss": 0.77616256, + "learning_rate": 3.201847741843128e-06, + "loss": 0.79789537, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.14648438, + "step": 5255, + "time_per_iteration": 2.6178009510040283 + }, + { + "auxiliary_loss_clip": 0.01138117, + "auxiliary_loss_mlp": 0.01038092, + "balance_loss_clip": 1.05192542, + "balance_loss_mlp": 1.0218792, + "epoch": 0.31600781602284683, + "flos": 28514144823840.0, + "grad_norm": 2.340159875493994, + "language_loss": 0.78480351, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80656564, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.16210938, + "step": 5256, + "time_per_iteration": 2.671555757522583 + }, + { + "auxiliary_loss_clip": 0.01131784, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.05046237, + "balance_loss_mlp": 1.02456093, + "epoch": 0.3160679392755148, + "flos": 24194808419040.0, + "grad_norm": 1.545049714646074, + "language_loss": 0.71279728, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.73448998, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.12915039, + "step": 5257, + "time_per_iteration": 2.6209521293640137 + }, + { + "auxiliary_loss_clip": 0.01138078, + "auxiliary_loss_mlp": 0.01040057, + "balance_loss_clip": 1.05097389, + "balance_loss_mlp": 1.02495885, + "epoch": 0.31612806252818276, + "flos": 24639786718560.0, + "grad_norm": 2.221403843567788, + "language_loss": 0.7630825, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.78486383, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.15093994, + "step": 5258, + "time_per_iteration": 2.6633715629577637 + }, + { + "auxiliary_loss_clip": 0.01138698, + "auxiliary_loss_mlp": 0.01037354, + "balance_loss_clip": 1.05173063, + "balance_loss_mlp": 1.02245319, + "epoch": 0.31618818578085073, + "flos": 29573721872640.0, + "grad_norm": 6.266188286246565, + "language_loss": 0.72722352, + "learning_rate": 3.200602180731467e-06, + "loss": 0.74898398, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 0.86914062, + "router_z_loss_mlp": 0.14904785, + "step": 5259, + "time_per_iteration": 2.6466853618621826 + }, + { + "auxiliary_loss_clip": 0.01141714, + "auxiliary_loss_mlp": 0.01039068, + "balance_loss_clip": 1.05287325, + "balance_loss_mlp": 1.02534103, + "epoch": 0.3162483090335187, + "flos": 30605307831360.0, + "grad_norm": 3.6393662170684475, + "language_loss": 0.66688806, + "learning_rate": 3.20029067660664e-06, + "loss": 0.68869585, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.1373291, + "step": 5260, + "time_per_iteration": 2.703263759613037 + }, + { + "auxiliary_loss_clip": 0.01132558, + "auxiliary_loss_mlp": 0.01029532, + "balance_loss_clip": 1.04662323, + "balance_loss_mlp": 1.01627004, + "epoch": 0.31630843228618666, + "flos": 32120235603360.0, + "grad_norm": 2.0708703584701857, + "language_loss": 0.72687316, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.74849403, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.13256836, + "step": 5261, + "time_per_iteration": 2.673037528991699 + }, + { + "auxiliary_loss_clip": 0.01057785, + "auxiliary_loss_mlp": 0.01006842, + "balance_loss_clip": 1.0296675, + "balance_loss_mlp": 1.00491095, + "epoch": 0.3163685555388547, + "flos": 81458026021440.0, + "grad_norm": 0.7614488071469427, + "language_loss": 0.50626004, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52690637, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.28100586, + "router_z_loss_mlp": 0.01928711, + "step": 5262, + "time_per_iteration": 3.3094229698181152 + }, + { + "auxiliary_loss_clip": 0.01139992, + "auxiliary_loss_mlp": 0.01035918, + "balance_loss_clip": 1.05248213, + "balance_loss_mlp": 1.02207148, + "epoch": 0.31642867879152264, + "flos": 31720265651520.0, + "grad_norm": 1.5258768026085636, + "language_loss": 0.85321212, + "learning_rate": 3.19935589118856e-06, + "loss": 0.87497121, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.13848877, + "step": 5263, + "time_per_iteration": 2.731492519378662 + }, + { + "auxiliary_loss_clip": 0.01133928, + "auxiliary_loss_mlp": 0.01035482, + "balance_loss_clip": 1.05048394, + "balance_loss_mlp": 1.02267861, + "epoch": 0.3164888020441906, + "flos": 31451106945600.0, + "grad_norm": 4.263934199604657, + "language_loss": 0.81669319, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.83838725, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.12805176, + "step": 5264, + "time_per_iteration": 2.7100226879119873 + }, + { + "auxiliary_loss_clip": 0.01139899, + "auxiliary_loss_mlp": 0.01031566, + "balance_loss_clip": 1.05198133, + "balance_loss_mlp": 1.01709425, + "epoch": 0.3165489252968586, + "flos": 24106452932160.0, + "grad_norm": 2.6640230718881286, + "language_loss": 0.79183614, + "learning_rate": 3.19873247349167e-06, + "loss": 0.81355071, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.14477539, + "step": 5265, + "time_per_iteration": 4.091144561767578 + }, + { + "auxiliary_loss_clip": 0.01141577, + "auxiliary_loss_mlp": 0.01037285, + "balance_loss_clip": 1.05373716, + "balance_loss_mlp": 1.0229857, + "epoch": 0.31660904854952654, + "flos": 28289386706400.0, + "grad_norm": 40.8363225175171, + "language_loss": 0.74937057, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.77115917, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.1428833, + "step": 5266, + "time_per_iteration": 4.062729120254517 + }, + { + "auxiliary_loss_clip": 0.01139577, + "auxiliary_loss_mlp": 0.0103783, + "balance_loss_clip": 1.05208492, + "balance_loss_mlp": 1.02403164, + "epoch": 0.3166691718021945, + "flos": 24902827315200.0, + "grad_norm": 2.4537776696646363, + "language_loss": 0.78803939, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.80981344, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.13793945, + "step": 5267, + "time_per_iteration": 4.123198747634888 + }, + { + "auxiliary_loss_clip": 0.01059323, + "auxiliary_loss_mlp": 0.01004072, + "balance_loss_clip": 1.03104198, + "balance_loss_mlp": 1.00210965, + "epoch": 0.31672929505486247, + "flos": 85590927305280.0, + "grad_norm": 0.7334727431324847, + "language_loss": 0.57770878, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59834272, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.2824707, + "router_z_loss_mlp": 0.0196228, + "step": 5268, + "time_per_iteration": 3.2987117767333984 + }, + { + "auxiliary_loss_clip": 0.01139772, + "auxiliary_loss_mlp": 0.01037873, + "balance_loss_clip": 1.05204868, + "balance_loss_mlp": 1.02403259, + "epoch": 0.31678941830753043, + "flos": 17516446853760.0, + "grad_norm": 2.0239567653918518, + "language_loss": 0.72915459, + "learning_rate": 3.197485092719815e-06, + "loss": 0.75093108, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.1383667, + "step": 5269, + "time_per_iteration": 2.6891074180603027 + }, + { + "auxiliary_loss_clip": 0.0113593, + "auxiliary_loss_mlp": 0.01043897, + "balance_loss_clip": 1.05082226, + "balance_loss_mlp": 1.03032517, + "epoch": 0.3168495415601984, + "flos": 27756944300160.0, + "grad_norm": 3.13307087837338, + "language_loss": 0.79708314, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.81888139, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.13568115, + "step": 5270, + "time_per_iteration": 2.6795995235443115 + }, + { + "auxiliary_loss_clip": 0.01143012, + "auxiliary_loss_mlp": 0.01047247, + "balance_loss_clip": 1.05326259, + "balance_loss_mlp": 1.03217924, + "epoch": 0.31690966481286637, + "flos": 24545110536000.0, + "grad_norm": 2.157147369711405, + "language_loss": 0.79531455, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.81721711, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 0.89794922, + "router_z_loss_mlp": 0.15057373, + "step": 5271, + "time_per_iteration": 4.12530779838562 + }, + { + "auxiliary_loss_clip": 0.01140264, + "auxiliary_loss_mlp": 0.01038359, + "balance_loss_clip": 1.05268085, + "balance_loss_mlp": 1.02369606, + "epoch": 0.31696978806553433, + "flos": 25842573300960.0, + "grad_norm": 2.063061768246361, + "language_loss": 0.73003864, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75182486, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.14672852, + "step": 5272, + "time_per_iteration": 2.742466449737549 + }, + { + "auxiliary_loss_clip": 0.01141978, + "auxiliary_loss_mlp": 0.01044725, + "balance_loss_clip": 1.05153382, + "balance_loss_mlp": 1.02844059, + "epoch": 0.3170299113182023, + "flos": 52466732177760.0, + "grad_norm": 2.1079635471793217, + "language_loss": 0.69266927, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.71453631, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 0.90478516, + "router_z_loss_mlp": 0.16296387, + "step": 5273, + "time_per_iteration": 2.8426218032836914 + }, + { + "auxiliary_loss_clip": 0.01138596, + "auxiliary_loss_mlp": 0.01041356, + "balance_loss_clip": 1.05228388, + "balance_loss_mlp": 1.0268544, + "epoch": 0.31709003457087026, + "flos": 29846810754720.0, + "grad_norm": 1.734013820082596, + "language_loss": 0.67973202, + "learning_rate": 3.195924845146795e-06, + "loss": 0.70153153, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.14508057, + "step": 5274, + "time_per_iteration": 2.7118661403656006 + }, + { + "auxiliary_loss_clip": 0.01133377, + "auxiliary_loss_mlp": 0.01042995, + "balance_loss_clip": 1.05052221, + "balance_loss_mlp": 1.02964401, + "epoch": 0.3171501578235382, + "flos": 29450082185280.0, + "grad_norm": 1.5405652621225288, + "language_loss": 0.80595315, + "learning_rate": 3.195612659536081e-06, + "loss": 0.82771683, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13354492, + "step": 5275, + "time_per_iteration": 2.677330493927002 + }, + { + "auxiliary_loss_clip": 0.01139826, + "auxiliary_loss_mlp": 0.0104613, + "balance_loss_clip": 1.05173874, + "balance_loss_mlp": 1.03147912, + "epoch": 0.31721028107620625, + "flos": 23036949149760.0, + "grad_norm": 2.661526580287571, + "language_loss": 0.72259343, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.74445295, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.14648438, + "step": 5276, + "time_per_iteration": 2.642157793045044 + }, + { + "auxiliary_loss_clip": 0.01132869, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.05081582, + "balance_loss_mlp": 1.02454531, + "epoch": 0.3172704043288742, + "flos": 28245350773440.0, + "grad_norm": 1.4352178804516864, + "language_loss": 0.77934957, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80105036, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.12677002, + "step": 5277, + "time_per_iteration": 2.6998450756073 + }, + { + "auxiliary_loss_clip": 0.01137368, + "auxiliary_loss_mlp": 0.01038698, + "balance_loss_clip": 1.04955649, + "balance_loss_mlp": 1.02350497, + "epoch": 0.3173305275815422, + "flos": 21790774910880.0, + "grad_norm": 1.7583964281961337, + "language_loss": 0.7898857, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.81164634, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.1519165, + "step": 5278, + "time_per_iteration": 2.7784712314605713 + }, + { + "auxiliary_loss_clip": 0.01058025, + "auxiliary_loss_mlp": 0.01014637, + "balance_loss_clip": 1.03000784, + "balance_loss_mlp": 1.01278019, + "epoch": 0.31739065083421014, + "flos": 73181723127840.0, + "grad_norm": 0.9063339722473616, + "language_loss": 0.62866491, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64939153, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.28027344, + "router_z_loss_mlp": 0.01852417, + "step": 5279, + "time_per_iteration": 3.0585432052612305 + }, + { + "auxiliary_loss_clip": 0.01141105, + "auxiliary_loss_mlp": 0.01045241, + "balance_loss_clip": 1.05109572, + "balance_loss_mlp": 1.02962422, + "epoch": 0.3174507740868781, + "flos": 29042657053920.0, + "grad_norm": 1.4309642682256634, + "language_loss": 0.80900419, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83086765, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.15612793, + "step": 5280, + "time_per_iteration": 2.6899232864379883 + }, + { + "auxiliary_loss_clip": 0.01140437, + "auxiliary_loss_mlp": 0.01044829, + "balance_loss_clip": 1.05524182, + "balance_loss_mlp": 1.03134656, + "epoch": 0.31751089733954607, + "flos": 33725342139840.0, + "grad_norm": 1.5600017359657008, + "language_loss": 0.78250241, + "learning_rate": 3.19373859419346e-06, + "loss": 0.80435503, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.13476562, + "step": 5281, + "time_per_iteration": 2.807399034500122 + }, + { + "auxiliary_loss_clip": 0.01140071, + "auxiliary_loss_mlp": 0.0103622, + "balance_loss_clip": 1.05381918, + "balance_loss_mlp": 1.02101469, + "epoch": 0.31757102059221404, + "flos": 29003523711840.0, + "grad_norm": 1.6856554513522286, + "language_loss": 0.78624654, + "learning_rate": 3.193426091467179e-06, + "loss": 0.80800939, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.15209961, + "step": 5282, + "time_per_iteration": 2.6991939544677734 + }, + { + "auxiliary_loss_clip": 0.01141361, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.05269742, + "balance_loss_mlp": 1.02020764, + "epoch": 0.317631143844882, + "flos": 30828202153920.0, + "grad_norm": 2.285762722408484, + "language_loss": 0.6806286, + "learning_rate": 3.193113543486061e-06, + "loss": 0.70238376, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 0.88720703, + "router_z_loss_mlp": 0.13964844, + "step": 5283, + "time_per_iteration": 2.6978187561035156 + }, + { + "auxiliary_loss_clip": 0.01056405, + "auxiliary_loss_mlp": 0.01000708, + "balance_loss_clip": 1.02832091, + "balance_loss_mlp": 0.99885309, + "epoch": 0.31769126709754997, + "flos": 68119488799200.0, + "grad_norm": 0.7501462573964488, + "language_loss": 0.52758586, + "learning_rate": 3.192800950261958e-06, + "loss": 0.54815698, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.28100586, + "router_z_loss_mlp": 0.01853943, + "step": 5284, + "time_per_iteration": 3.216339588165283 + }, + { + "auxiliary_loss_clip": 0.01143491, + "auxiliary_loss_mlp": 0.01034341, + "balance_loss_clip": 1.05362701, + "balance_loss_mlp": 1.02021492, + "epoch": 0.31775139035021793, + "flos": 20365580213280.0, + "grad_norm": 1.7499410931024524, + "language_loss": 0.70312154, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.72489983, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.14129639, + "step": 5285, + "time_per_iteration": 2.6637868881225586 + }, + { + "auxiliary_loss_clip": 0.01053367, + "auxiliary_loss_mlp": 0.01001366, + "balance_loss_clip": 1.02523875, + "balance_loss_mlp": 0.99958825, + "epoch": 0.3178115136028859, + "flos": 78370520738400.0, + "grad_norm": 0.8222983761519637, + "language_loss": 0.60567862, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.62622595, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.28173828, + "router_z_loss_mlp": 0.01777649, + "step": 5286, + "time_per_iteration": 3.3025424480438232 + }, + { + "auxiliary_loss_clip": 0.01140113, + "auxiliary_loss_mlp": 0.01041658, + "balance_loss_clip": 1.05140996, + "balance_loss_mlp": 1.02677441, + "epoch": 0.31787163685555386, + "flos": 22814541034560.0, + "grad_norm": 1.9969633335190744, + "language_loss": 0.72201854, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74383628, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 0.88720703, + "router_z_loss_mlp": 0.14874268, + "step": 5287, + "time_per_iteration": 2.654623508453369 + }, + { + "auxiliary_loss_clip": 0.01139219, + "auxiliary_loss_mlp": 0.01040063, + "balance_loss_clip": 1.04991734, + "balance_loss_mlp": 1.02477992, + "epoch": 0.31793176010822183, + "flos": 26020540310400.0, + "grad_norm": 2.7741569782577318, + "language_loss": 0.75730205, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77909482, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 0.89404297, + "router_z_loss_mlp": 0.15283203, + "step": 5288, + "time_per_iteration": 2.6688003540039062 + }, + { + "auxiliary_loss_clip": 0.0112867, + "auxiliary_loss_mlp": 0.01028321, + "balance_loss_clip": 1.04496527, + "balance_loss_mlp": 1.01518965, + "epoch": 0.31799188336088985, + "flos": 25573941319680.0, + "grad_norm": 1.9195034043669508, + "language_loss": 0.87382084, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.89539075, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13128662, + "step": 5289, + "time_per_iteration": 2.7028937339782715 + }, + { + "auxiliary_loss_clip": 0.01136771, + "auxiliary_loss_mlp": 0.01035764, + "balance_loss_clip": 1.0525279, + "balance_loss_mlp": 1.02257991, + "epoch": 0.3180520066135578, + "flos": 27444843627840.0, + "grad_norm": 15.66409442086179, + "language_loss": 0.68138099, + "learning_rate": 3.190924441478572e-06, + "loss": 0.70310634, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.13201904, + "step": 5290, + "time_per_iteration": 2.703922986984253 + }, + { + "auxiliary_loss_clip": 0.01141912, + "auxiliary_loss_mlp": 0.01038121, + "balance_loss_clip": 1.0521543, + "balance_loss_mlp": 1.02354717, + "epoch": 0.3181121298662258, + "flos": 33233086524960.0, + "grad_norm": 2.335990803820329, + "language_loss": 0.80147266, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.823273, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.14581299, + "step": 5291, + "time_per_iteration": 2.684072256088257 + }, + { + "auxiliary_loss_clip": 0.01140401, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.0520792, + "balance_loss_mlp": 1.0189873, + "epoch": 0.31817225311889374, + "flos": 28285132392000.0, + "grad_norm": 2.2404672281841536, + "language_loss": 0.79918349, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.82092822, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.15087891, + "step": 5292, + "time_per_iteration": 2.709796905517578 + }, + { + "auxiliary_loss_clip": 0.011331, + "auxiliary_loss_mlp": 0.01034457, + "balance_loss_clip": 1.05060768, + "balance_loss_mlp": 1.02163565, + "epoch": 0.3182323763715617, + "flos": 28380578402880.0, + "grad_norm": 1.59418693659839, + "language_loss": 0.75326395, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.77493954, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.12811279, + "step": 5293, + "time_per_iteration": 2.6627964973449707 + }, + { + "auxiliary_loss_clip": 0.01136477, + "auxiliary_loss_mlp": 0.01037554, + "balance_loss_clip": 1.05333114, + "balance_loss_mlp": 1.02429152, + "epoch": 0.3182924996242297, + "flos": 35407418807520.0, + "grad_norm": 1.929533605125247, + "language_loss": 0.73917949, + "learning_rate": 3.189672532265379e-06, + "loss": 0.76091975, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.13269043, + "step": 5294, + "time_per_iteration": 2.7714669704437256 + }, + { + "auxiliary_loss_clip": 0.01138096, + "auxiliary_loss_mlp": 0.01033125, + "balance_loss_clip": 1.04977846, + "balance_loss_mlp": 1.01778889, + "epoch": 0.31835262287689764, + "flos": 24952089977280.0, + "grad_norm": 1.9433256889191923, + "language_loss": 0.76101911, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78273129, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.15338135, + "step": 5295, + "time_per_iteration": 2.6686816215515137 + }, + { + "auxiliary_loss_clip": 0.01142278, + "auxiliary_loss_mlp": 0.0103857, + "balance_loss_clip": 1.05310869, + "balance_loss_mlp": 1.0246048, + "epoch": 0.3184127461295656, + "flos": 30651126524640.0, + "grad_norm": 1.6135732912680862, + "language_loss": 0.69720483, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71901333, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.13970947, + "step": 5296, + "time_per_iteration": 2.7554657459259033 + }, + { + "auxiliary_loss_clip": 0.01139205, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.05109906, + "balance_loss_mlp": 1.02621865, + "epoch": 0.31847286938223357, + "flos": 31178544788160.0, + "grad_norm": 1.7335024972423863, + "language_loss": 0.77617288, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79796839, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 0.88085938, + "router_z_loss_mlp": 0.14123535, + "step": 5297, + "time_per_iteration": 2.717066764831543 + }, + { + "auxiliary_loss_clip": 0.01134264, + "auxiliary_loss_mlp": 0.01032305, + "balance_loss_clip": 1.04970694, + "balance_loss_mlp": 1.01770222, + "epoch": 0.31853299263490154, + "flos": 33900756560640.0, + "grad_norm": 1.9243255133587513, + "language_loss": 0.79194409, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.81360978, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.14587402, + "step": 5298, + "time_per_iteration": 2.7397162914276123 + }, + { + "auxiliary_loss_clip": 0.01140418, + "auxiliary_loss_mlp": 0.01039269, + "balance_loss_clip": 1.05144262, + "balance_loss_mlp": 1.02526188, + "epoch": 0.3185931158875695, + "flos": 27707357499840.0, + "grad_norm": 1.8586451567495088, + "language_loss": 0.74387497, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.76567185, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 0.88964844, + "router_z_loss_mlp": 0.14007568, + "step": 5299, + "time_per_iteration": 2.7071053981781006 + }, + { + "auxiliary_loss_clip": 0.0113761, + "auxiliary_loss_mlp": 0.01046555, + "balance_loss_clip": 1.04793954, + "balance_loss_mlp": 1.03226137, + "epoch": 0.31865323914023747, + "flos": 29982524591520.0, + "grad_norm": 1.9611900487447413, + "language_loss": 0.77807009, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.79991174, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.14300537, + "step": 5300, + "time_per_iteration": 2.6768312454223633 + }, + { + "auxiliary_loss_clip": 0.01134214, + "auxiliary_loss_mlp": 0.01038545, + "balance_loss_clip": 1.04766238, + "balance_loss_mlp": 1.02361405, + "epoch": 0.31871336239290543, + "flos": 22191109518240.0, + "grad_norm": 2.120592446621264, + "language_loss": 0.83991396, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.86164153, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.14935303, + "step": 5301, + "time_per_iteration": 2.670421838760376 + }, + { + "auxiliary_loss_clip": 0.01137783, + "auxiliary_loss_mlp": 0.01044507, + "balance_loss_clip": 1.05315948, + "balance_loss_mlp": 1.029755, + "epoch": 0.31877348564557345, + "flos": 26633923541280.0, + "grad_norm": 2.410217523468363, + "language_loss": 0.77393234, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79575527, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.1473999, + "step": 5302, + "time_per_iteration": 2.660773277282715 + }, + { + "auxiliary_loss_clip": 0.01131591, + "auxiliary_loss_mlp": 0.01035745, + "balance_loss_clip": 1.04857588, + "balance_loss_mlp": 1.02166009, + "epoch": 0.3188336088982414, + "flos": 26863381663200.0, + "grad_norm": 1.6374940533963387, + "language_loss": 0.7926324, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.81430578, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.14080811, + "step": 5303, + "time_per_iteration": 2.677278995513916 + }, + { + "auxiliary_loss_clip": 0.01146622, + "auxiliary_loss_mlp": 0.01040901, + "balance_loss_clip": 1.05317497, + "balance_loss_mlp": 1.02480817, + "epoch": 0.3188937321509094, + "flos": 24462386951040.0, + "grad_norm": 2.4748811236668287, + "language_loss": 0.73358524, + "learning_rate": 3.186539603020047e-06, + "loss": 0.7554605, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 0.93457031, + "router_z_loss_mlp": 0.16101074, + "step": 5304, + "time_per_iteration": 2.651547908782959 + }, + { + "auxiliary_loss_clip": 0.01132915, + "auxiliary_loss_mlp": 0.01035176, + "balance_loss_clip": 1.0492487, + "balance_loss_mlp": 1.02195549, + "epoch": 0.31895385540357735, + "flos": 31540353812640.0, + "grad_norm": 2.3950577581756245, + "language_loss": 0.72131211, + "learning_rate": 3.186226062434068e-06, + "loss": 0.742993, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13208008, + "step": 5305, + "time_per_iteration": 5.689935207366943 + }, + { + "auxiliary_loss_clip": 0.01137687, + "auxiliary_loss_mlp": 0.01037189, + "balance_loss_clip": 1.05211711, + "balance_loss_mlp": 1.02383733, + "epoch": 0.3190139786562453, + "flos": 28647711244800.0, + "grad_norm": 6.396493168163951, + "language_loss": 0.6402986, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.66204739, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.13354492, + "step": 5306, + "time_per_iteration": 2.680377721786499 + }, + { + "auxiliary_loss_clip": 0.01139955, + "auxiliary_loss_mlp": 0.01042502, + "balance_loss_clip": 1.05228853, + "balance_loss_mlp": 1.02806568, + "epoch": 0.3190741019089133, + "flos": 35503107922080.0, + "grad_norm": 2.1186323948524897, + "language_loss": 0.78997922, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81180382, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 0.87744141, + "router_z_loss_mlp": 0.14440918, + "step": 5307, + "time_per_iteration": 4.141872406005859 + }, + { + "auxiliary_loss_clip": 0.01133728, + "auxiliary_loss_mlp": 0.01040637, + "balance_loss_clip": 1.04976344, + "balance_loss_mlp": 1.02568173, + "epoch": 0.31913422516158124, + "flos": 20901142450080.0, + "grad_norm": 1.7383705377167373, + "language_loss": 0.77555972, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.79730338, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.14959717, + "step": 5308, + "time_per_iteration": 2.616926431655884 + }, + { + "auxiliary_loss_clip": 0.01148199, + "auxiliary_loss_mlp": 0.0104555, + "balance_loss_clip": 1.05424666, + "balance_loss_mlp": 1.02847958, + "epoch": 0.3191943484142492, + "flos": 19608177103200.0, + "grad_norm": 2.2364006051222147, + "language_loss": 0.74912232, + "learning_rate": 3.184971450390961e-06, + "loss": 0.77105981, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 0.93896484, + "router_z_loss_mlp": 0.1706543, + "step": 5309, + "time_per_iteration": 2.65901517868042 + }, + { + "auxiliary_loss_clip": 0.01138102, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.05063927, + "balance_loss_mlp": 1.02058816, + "epoch": 0.3192544716669172, + "flos": 28023550417440.0, + "grad_norm": 3.115012239463666, + "language_loss": 0.83176422, + "learning_rate": 3.184657685014856e-06, + "loss": 0.85348916, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.13812256, + "step": 5310, + "time_per_iteration": 4.171445608139038 + }, + { + "auxiliary_loss_clip": 0.01133917, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.04769385, + "balance_loss_mlp": 1.02461171, + "epoch": 0.31931459491958514, + "flos": 32787703052640.0, + "grad_norm": 1.5807752012140162, + "language_loss": 0.78213412, + "learning_rate": 3.184343874716412e-06, + "loss": 0.80385208, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.13262939, + "step": 5311, + "time_per_iteration": 2.6906254291534424 + }, + { + "auxiliary_loss_clip": 0.01136867, + "auxiliary_loss_mlp": 0.01037204, + "balance_loss_clip": 1.05065739, + "balance_loss_mlp": 1.02276146, + "epoch": 0.3193747181722531, + "flos": 26644944241440.0, + "grad_norm": 1.7848127762423405, + "language_loss": 0.84181011, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86355078, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.14447021, + "step": 5312, + "time_per_iteration": 2.757690191268921 + }, + { + "auxiliary_loss_clip": 0.01142479, + "auxiliary_loss_mlp": 0.01041665, + "balance_loss_clip": 1.0508095, + "balance_loss_mlp": 1.02614427, + "epoch": 0.31943484142492107, + "flos": 22360729968000.0, + "grad_norm": 2.440868410510654, + "language_loss": 0.79206485, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.81390631, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 0.91748047, + "router_z_loss_mlp": 0.15527344, + "step": 5313, + "time_per_iteration": 2.643608808517456 + }, + { + "auxiliary_loss_clip": 0.01134927, + "auxiliary_loss_mlp": 0.01034209, + "balance_loss_clip": 1.04838634, + "balance_loss_mlp": 1.02026117, + "epoch": 0.31949496467758903, + "flos": 26375785535520.0, + "grad_norm": 2.805160793555451, + "language_loss": 0.86005151, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88174284, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.13952637, + "step": 5314, + "time_per_iteration": 2.6828293800354004 + }, + { + "auxiliary_loss_clip": 0.01136598, + "auxiliary_loss_mlp": 0.0103779, + "balance_loss_clip": 1.04953063, + "balance_loss_mlp": 1.0228889, + "epoch": 0.31955508793025705, + "flos": 26552131853760.0, + "grad_norm": 2.1644285997778994, + "language_loss": 0.79613566, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.81787956, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.14916992, + "step": 5315, + "time_per_iteration": 2.6359994411468506 + }, + { + "auxiliary_loss_clip": 0.01140472, + "auxiliary_loss_mlp": 0.01047091, + "balance_loss_clip": 1.05133426, + "balance_loss_mlp": 1.03130186, + "epoch": 0.319615211182925, + "flos": 20943760278240.0, + "grad_norm": 1.8027410670713166, + "language_loss": 0.67433035, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69620597, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.15795898, + "step": 5316, + "time_per_iteration": 2.6742115020751953 + }, + { + "auxiliary_loss_clip": 0.01136671, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.04988503, + "balance_loss_mlp": 1.01921034, + "epoch": 0.319675334435593, + "flos": 34303684273920.0, + "grad_norm": 1.4550189522327384, + "language_loss": 0.69064963, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71234173, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.13323975, + "step": 5317, + "time_per_iteration": 2.7459053993225098 + }, + { + "auxiliary_loss_clip": 0.01059621, + "auxiliary_loss_mlp": 0.01010279, + "balance_loss_clip": 1.03172147, + "balance_loss_mlp": 1.00814521, + "epoch": 0.31973545768826095, + "flos": 84804277069440.0, + "grad_norm": 0.7297735206860455, + "language_loss": 0.53129864, + "learning_rate": 3.182145945801628e-06, + "loss": 0.55199766, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.27905273, + "router_z_loss_mlp": 0.02134705, + "step": 5318, + "time_per_iteration": 3.4277074337005615 + }, + { + "auxiliary_loss_clip": 0.01134915, + "auxiliary_loss_mlp": 0.01037621, + "balance_loss_clip": 1.05014515, + "balance_loss_mlp": 1.02410245, + "epoch": 0.3197955809409289, + "flos": 16715250914400.0, + "grad_norm": 2.1081727740743896, + "language_loss": 0.83929014, + "learning_rate": 3.181831776553012e-06, + "loss": 0.86101556, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.13525391, + "step": 5319, + "time_per_iteration": 2.6401631832122803 + }, + { + "auxiliary_loss_clip": 0.01136075, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.05022073, + "balance_loss_mlp": 1.02102482, + "epoch": 0.3198557041935969, + "flos": 40533826157280.0, + "grad_norm": 1.824623293433075, + "language_loss": 0.63332617, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65503466, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.1373291, + "step": 5320, + "time_per_iteration": 2.748892068862915 + }, + { + "auxiliary_loss_clip": 0.01143396, + "auxiliary_loss_mlp": 0.01036965, + "balance_loss_clip": 1.05400658, + "balance_loss_mlp": 1.02270806, + "epoch": 0.31991582744626484, + "flos": 28958150708640.0, + "grad_norm": 2.3317234043075725, + "language_loss": 0.7037096, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.72551322, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.1427002, + "step": 5321, + "time_per_iteration": 2.71091890335083 + }, + { + "auxiliary_loss_clip": 0.0114655, + "auxiliary_loss_mlp": 0.01052139, + "balance_loss_clip": 1.05411577, + "balance_loss_mlp": 1.03696346, + "epoch": 0.3199759506989328, + "flos": 22635682644960.0, + "grad_norm": 2.3412322532694434, + "language_loss": 0.86362749, + "learning_rate": 3.180888999963749e-06, + "loss": 0.8856144, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.15167236, + "step": 5322, + "time_per_iteration": 2.641491413116455 + }, + { + "auxiliary_loss_clip": 0.01138514, + "auxiliary_loss_mlp": 0.01035037, + "balance_loss_clip": 1.05093932, + "balance_loss_mlp": 1.02088714, + "epoch": 0.3200360739516008, + "flos": 27355718312640.0, + "grad_norm": 1.9108992513385352, + "language_loss": 0.8327232, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.85445869, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.14160156, + "step": 5323, + "time_per_iteration": 2.7616569995880127 + }, + { + "auxiliary_loss_clip": 0.01133441, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.04854381, + "balance_loss_mlp": 1.02121258, + "epoch": 0.32009619720426874, + "flos": 25130300090400.0, + "grad_norm": 1.7918806485780838, + "language_loss": 0.78270608, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.80440307, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.15039062, + "step": 5324, + "time_per_iteration": 2.6833646297454834 + }, + { + "auxiliary_loss_clip": 0.01135128, + "auxiliary_loss_mlp": 0.01032262, + "balance_loss_clip": 1.04848981, + "balance_loss_mlp": 1.01780224, + "epoch": 0.3201563204569367, + "flos": 22143832202880.0, + "grad_norm": 2.1019243181197838, + "language_loss": 0.80203032, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82370424, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.14477539, + "step": 5325, + "time_per_iteration": 2.6956145763397217 + }, + { + "auxiliary_loss_clip": 0.01139802, + "auxiliary_loss_mlp": 0.0103936, + "balance_loss_clip": 1.05264711, + "balance_loss_mlp": 1.0252161, + "epoch": 0.32021644370960467, + "flos": 38662194538080.0, + "grad_norm": 1.5995907327700538, + "language_loss": 0.74920821, + "learning_rate": 3.179631337655037e-06, + "loss": 0.77099979, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.14147949, + "step": 5326, + "time_per_iteration": 2.7314233779907227 + }, + { + "auxiliary_loss_clip": 0.01135052, + "auxiliary_loss_mlp": 0.01036838, + "balance_loss_clip": 1.05114937, + "balance_loss_mlp": 1.02273536, + "epoch": 0.32027656696227264, + "flos": 32782719427200.0, + "grad_norm": 1.5790580764875068, + "language_loss": 0.81095523, + "learning_rate": 3.179316810218701e-06, + "loss": 0.83267415, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.14111328, + "step": 5327, + "time_per_iteration": 2.7448503971099854 + }, + { + "auxiliary_loss_clip": 0.01140659, + "auxiliary_loss_mlp": 0.01036729, + "balance_loss_clip": 1.05168128, + "balance_loss_mlp": 1.02187538, + "epoch": 0.32033669021494066, + "flos": 29492497427040.0, + "grad_norm": 1.7999750710441282, + "language_loss": 0.77908677, + "learning_rate": 3.179002238062554e-06, + "loss": 0.80086064, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.14849854, + "step": 5328, + "time_per_iteration": 2.719007968902588 + }, + { + "auxiliary_loss_clip": 0.01139905, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_clip": 1.05083799, + "balance_loss_mlp": 1.02678192, + "epoch": 0.3203968134676086, + "flos": 29847499548480.0, + "grad_norm": 1.688817351364408, + "language_loss": 0.74198461, + "learning_rate": 3.178687621198524e-06, + "loss": 0.76381421, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.1628418, + "step": 5329, + "time_per_iteration": 2.711573600769043 + }, + { + "auxiliary_loss_clip": 0.01133009, + "auxiliary_loss_mlp": 0.01033931, + "balance_loss_clip": 1.04994822, + "balance_loss_mlp": 1.02053189, + "epoch": 0.3204569367202766, + "flos": 21969147093120.0, + "grad_norm": 1.8027346665140598, + "language_loss": 0.70741826, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.72908765, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.1340332, + "step": 5330, + "time_per_iteration": 2.6760754585266113 + }, + { + "auxiliary_loss_clip": 0.01141944, + "auxiliary_loss_mlp": 0.01043318, + "balance_loss_clip": 1.05082178, + "balance_loss_mlp": 1.02757001, + "epoch": 0.32051705997294455, + "flos": 37326530328480.0, + "grad_norm": 1.8938570416023865, + "language_loss": 0.79622269, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.8180753, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.15759277, + "step": 5331, + "time_per_iteration": 2.7083489894866943 + }, + { + "auxiliary_loss_clip": 0.01055992, + "auxiliary_loss_mlp": 0.01002386, + "balance_loss_clip": 1.02860034, + "balance_loss_mlp": 1.0004127, + "epoch": 0.3205771832256125, + "flos": 83484853938720.0, + "grad_norm": 0.8328245868622882, + "language_loss": 0.57850564, + "learning_rate": 3.177743502478447e-06, + "loss": 0.59908938, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.27392578, + "router_z_loss_mlp": 0.01971436, + "step": 5332, + "time_per_iteration": 3.2319555282592773 + }, + { + "auxiliary_loss_clip": 0.01142057, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.05213952, + "balance_loss_mlp": 1.02274179, + "epoch": 0.3206373064782805, + "flos": 37148684870880.0, + "grad_norm": 1.731693701753596, + "language_loss": 0.73233175, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75412416, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 0.89892578, + "router_z_loss_mlp": 0.14453125, + "step": 5333, + "time_per_iteration": 2.7325637340545654 + }, + { + "auxiliary_loss_clip": 0.01139185, + "auxiliary_loss_mlp": 0.01037888, + "balance_loss_clip": 1.0508616, + "balance_loss_mlp": 1.02325511, + "epoch": 0.32069742973094845, + "flos": 26911023634080.0, + "grad_norm": 1.6812856347734588, + "language_loss": 0.7010479, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.72281861, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.14642334, + "step": 5334, + "time_per_iteration": 2.7166311740875244 + }, + { + "auxiliary_loss_clip": 0.01137139, + "auxiliary_loss_mlp": 0.01042403, + "balance_loss_clip": 1.04863644, + "balance_loss_mlp": 1.02768028, + "epoch": 0.3207575529836164, + "flos": 26910456392160.0, + "grad_norm": 1.8579834816898353, + "language_loss": 0.77424932, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.79604471, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.14733887, + "step": 5335, + "time_per_iteration": 2.6792171001434326 + }, + { + "auxiliary_loss_clip": 0.01139707, + "auxiliary_loss_mlp": 0.01041691, + "balance_loss_clip": 1.05190504, + "balance_loss_mlp": 1.02718353, + "epoch": 0.3208176762362844, + "flos": 41558119005600.0, + "grad_norm": 1.7489334322626975, + "language_loss": 0.68361282, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.70542681, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 0.87792969, + "router_z_loss_mlp": 0.14489746, + "step": 5336, + "time_per_iteration": 2.8236353397369385 + }, + { + "auxiliary_loss_clip": 0.01139449, + "auxiliary_loss_mlp": 0.01042644, + "balance_loss_clip": 1.05180717, + "balance_loss_mlp": 1.02842879, + "epoch": 0.32087779948895234, + "flos": 26598922961760.0, + "grad_norm": 1.759486013081469, + "language_loss": 0.79000247, + "learning_rate": 3.176169078234487e-06, + "loss": 0.81182337, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.14215088, + "step": 5337, + "time_per_iteration": 2.7112576961517334 + }, + { + "auxiliary_loss_clip": 0.01129392, + "auxiliary_loss_mlp": 0.01038039, + "balance_loss_clip": 1.04666841, + "balance_loss_mlp": 1.02494419, + "epoch": 0.3209379227416203, + "flos": 26153985179520.0, + "grad_norm": 2.844932162636468, + "language_loss": 0.74140471, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76307905, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13098145, + "step": 5338, + "time_per_iteration": 2.6986007690429688 + }, + { + "auxiliary_loss_clip": 0.01138293, + "auxiliary_loss_mlp": 0.01038699, + "balance_loss_clip": 1.04812908, + "balance_loss_mlp": 1.02401865, + "epoch": 0.3209980459942883, + "flos": 31541447779200.0, + "grad_norm": 2.026151961323338, + "language_loss": 0.63023424, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.65200412, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.14678955, + "step": 5339, + "time_per_iteration": 2.6963601112365723 + }, + { + "auxiliary_loss_clip": 0.0113691, + "auxiliary_loss_mlp": 0.01038653, + "balance_loss_clip": 1.04937816, + "balance_loss_mlp": 1.02397799, + "epoch": 0.32105816924695624, + "flos": 23304689750880.0, + "grad_norm": 2.435698573424828, + "language_loss": 0.81656742, + "learning_rate": 3.175223888387192e-06, + "loss": 0.83832312, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.14666748, + "step": 5340, + "time_per_iteration": 2.6384799480438232 + }, + { + "auxiliary_loss_clip": 0.01138456, + "auxiliary_loss_mlp": 0.01043062, + "balance_loss_clip": 1.05044067, + "balance_loss_mlp": 1.02900112, + "epoch": 0.3211182924996242, + "flos": 20232864655200.0, + "grad_norm": 2.148957610539467, + "language_loss": 0.76366097, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78547615, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.140625, + "step": 5341, + "time_per_iteration": 2.6494085788726807 + }, + { + "auxiliary_loss_clip": 0.01135583, + "auxiliary_loss_mlp": 0.01035818, + "balance_loss_clip": 1.05044174, + "balance_loss_mlp": 1.02156067, + "epoch": 0.3211784157522922, + "flos": 27665023292640.0, + "grad_norm": 1.6487595452387485, + "language_loss": 0.79057688, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.81229091, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.14263916, + "step": 5342, + "time_per_iteration": 2.703227996826172 + }, + { + "auxiliary_loss_clip": 0.01138602, + "auxiliary_loss_mlp": 0.01034655, + "balance_loss_clip": 1.05027187, + "balance_loss_mlp": 1.01942015, + "epoch": 0.3212385390049602, + "flos": 25085818467360.0, + "grad_norm": 2.870899620186238, + "language_loss": 0.74255544, + "learning_rate": 3.174278297458438e-06, + "loss": 0.76428807, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.15222168, + "step": 5343, + "time_per_iteration": 2.6788809299468994 + }, + { + "auxiliary_loss_clip": 0.01138116, + "auxiliary_loss_mlp": 0.01037969, + "balance_loss_clip": 1.05064237, + "balance_loss_mlp": 1.02331817, + "epoch": 0.32129866225762815, + "flos": 30250467779040.0, + "grad_norm": 1.9781268856893433, + "language_loss": 0.82567585, + "learning_rate": 3.173963011408748e-06, + "loss": 0.84743667, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.14648438, + "step": 5344, + "time_per_iteration": 4.117301940917969 + }, + { + "auxiliary_loss_clip": 0.01136649, + "auxiliary_loss_mlp": 0.01036886, + "balance_loss_clip": 1.0470686, + "balance_loss_mlp": 1.02190757, + "epoch": 0.3213587855102961, + "flos": 22411329700320.0, + "grad_norm": 2.0079664906807553, + "language_loss": 0.79563338, + "learning_rate": 3.173647680842262e-06, + "loss": 0.81736875, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 0.89599609, + "router_z_loss_mlp": 0.14971924, + "step": 5345, + "time_per_iteration": 3.917680501937866 + }, + { + "auxiliary_loss_clip": 0.01137253, + "auxiliary_loss_mlp": 0.0103686, + "balance_loss_clip": 1.04866457, + "balance_loss_mlp": 1.02252519, + "epoch": 0.3214189087629641, + "flos": 32965467475680.0, + "grad_norm": 1.7917373737491356, + "language_loss": 0.83299935, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85474044, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.14337158, + "step": 5346, + "time_per_iteration": 4.149758338928223 + }, + { + "auxiliary_loss_clip": 0.01139067, + "auxiliary_loss_mlp": 0.01039665, + "balance_loss_clip": 1.04883003, + "balance_loss_mlp": 1.02450716, + "epoch": 0.32147903201563205, + "flos": 28245796463520.0, + "grad_norm": 1.5354379658995243, + "language_loss": 0.8180297, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.83981699, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.1517334, + "step": 5347, + "time_per_iteration": 2.7065443992614746 + }, + { + "auxiliary_loss_clip": 0.01135825, + "auxiliary_loss_mlp": 0.01039192, + "balance_loss_clip": 1.04908538, + "balance_loss_mlp": 1.02416015, + "epoch": 0.3215391552683, + "flos": 20231608619520.0, + "grad_norm": 2.304336028532477, + "language_loss": 0.79927385, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.821024, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.15026855, + "step": 5348, + "time_per_iteration": 2.671210289001465 + }, + { + "auxiliary_loss_clip": 0.0114084, + "auxiliary_loss_mlp": 0.01044201, + "balance_loss_clip": 1.05159712, + "balance_loss_mlp": 1.03002048, + "epoch": 0.321599278520968, + "flos": 21746009666880.0, + "grad_norm": 2.060084200810559, + "language_loss": 0.85094887, + "learning_rate": 3.172385913647542e-06, + "loss": 0.87279928, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 0.89111328, + "router_z_loss_mlp": 0.1416626, + "step": 5349, + "time_per_iteration": 2.70273494720459 + }, + { + "auxiliary_loss_clip": 0.01138649, + "auxiliary_loss_mlp": 0.01040572, + "balance_loss_clip": 1.05144119, + "balance_loss_mlp": 1.02607656, + "epoch": 0.32165940177363594, + "flos": 19830220562880.0, + "grad_norm": 2.354344011019765, + "language_loss": 0.80211347, + "learning_rate": 3.172070360676475e-06, + "loss": 0.82390565, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.14477539, + "step": 5350, + "time_per_iteration": 4.052797079086304 + }, + { + "auxiliary_loss_clip": 0.01135715, + "auxiliary_loss_mlp": 0.01038738, + "balance_loss_clip": 1.04933786, + "balance_loss_mlp": 1.02514839, + "epoch": 0.3217195250263039, + "flos": 33677983789920.0, + "grad_norm": 1.6217866424016882, + "language_loss": 0.79902053, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.82076502, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.13586426, + "step": 5351, + "time_per_iteration": 2.6827378273010254 + }, + { + "auxiliary_loss_clip": 0.01134823, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.04838693, + "balance_loss_mlp": 1.02717042, + "epoch": 0.3217796482789719, + "flos": 26198507319840.0, + "grad_norm": 1.6991611854525737, + "language_loss": 0.75922894, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.78099823, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.14929199, + "step": 5352, + "time_per_iteration": 2.67604398727417 + }, + { + "auxiliary_loss_clip": 0.01136221, + "auxiliary_loss_mlp": 0.01039596, + "balance_loss_clip": 1.0489713, + "balance_loss_mlp": 1.02465916, + "epoch": 0.32183977153163984, + "flos": 25887298027680.0, + "grad_norm": 2.1208859197512933, + "language_loss": 0.81693435, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.83869255, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.14929199, + "step": 5353, + "time_per_iteration": 2.6413793563842773 + }, + { + "auxiliary_loss_clip": 0.01132945, + "auxiliary_loss_mlp": 0.01035667, + "balance_loss_clip": 1.0484724, + "balance_loss_mlp": 1.02132034, + "epoch": 0.3218998947843078, + "flos": 30027249318240.0, + "grad_norm": 1.646869831578103, + "language_loss": 0.73216093, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.753847, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.14355469, + "step": 5354, + "time_per_iteration": 2.7156543731689453 + }, + { + "auxiliary_loss_clip": 0.0113446, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.04660296, + "balance_loss_mlp": 1.02005315, + "epoch": 0.3219600180369758, + "flos": 27174955610880.0, + "grad_norm": 7.640305807842207, + "language_loss": 0.83826923, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.85994625, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.13189697, + "step": 5355, + "time_per_iteration": 2.652920961380005 + }, + { + "auxiliary_loss_clip": 0.01141965, + "auxiliary_loss_mlp": 0.010434, + "balance_loss_clip": 1.05290043, + "balance_loss_mlp": 1.02907073, + "epoch": 0.3220201412896438, + "flos": 18228355408800.0, + "grad_norm": 2.2083418054735806, + "language_loss": 0.7144866, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73634028, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.14331055, + "step": 5356, + "time_per_iteration": 2.7168657779693604 + }, + { + "auxiliary_loss_clip": 0.01145575, + "auxiliary_loss_mlp": 0.01038414, + "balance_loss_clip": 1.05210376, + "balance_loss_mlp": 1.02348292, + "epoch": 0.32208026454231176, + "flos": 27659593977120.0, + "grad_norm": 11.155781765710955, + "language_loss": 0.68105853, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.70289838, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 0.93408203, + "router_z_loss_mlp": 0.14935303, + "step": 5357, + "time_per_iteration": 2.7249996662139893 + }, + { + "auxiliary_loss_clip": 0.0105252, + "auxiliary_loss_mlp": 0.01004875, + "balance_loss_clip": 1.02507973, + "balance_loss_mlp": 1.0029825, + "epoch": 0.3221403877949797, + "flos": 78830449914240.0, + "grad_norm": 0.7055701624688017, + "language_loss": 0.58279836, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60337234, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.27392578, + "router_z_loss_mlp": 0.01889038, + "step": 5358, + "time_per_iteration": 3.383741855621338 + }, + { + "auxiliary_loss_clip": 0.01138912, + "auxiliary_loss_mlp": 0.01035757, + "balance_loss_clip": 1.04965043, + "balance_loss_mlp": 1.02057004, + "epoch": 0.3222005110476477, + "flos": 24595588716480.0, + "grad_norm": 1.7870017305054042, + "language_loss": 0.83859682, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.86034352, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.1517334, + "step": 5359, + "time_per_iteration": 2.6555395126342773 + }, + { + "auxiliary_loss_clip": 0.01138182, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.04948413, + "balance_loss_mlp": 1.02130473, + "epoch": 0.32226063430031565, + "flos": 27667251743040.0, + "grad_norm": 1.9823753815438485, + "language_loss": 0.79581189, + "learning_rate": 3.168912388464595e-06, + "loss": 0.81754816, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.14141846, + "step": 5360, + "time_per_iteration": 2.6986491680145264 + }, + { + "auxiliary_loss_clip": 0.0104947, + "auxiliary_loss_mlp": 0.01003589, + "balance_loss_clip": 1.02191556, + "balance_loss_mlp": 1.00164258, + "epoch": 0.3223207575529836, + "flos": 77883734956320.0, + "grad_norm": 0.6542614676875264, + "language_loss": 0.56931674, + "learning_rate": 3.168596347256737e-06, + "loss": 0.58984733, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.0194397, + "step": 5361, + "time_per_iteration": 3.1665804386138916 + }, + { + "auxiliary_loss_clip": 0.01136799, + "auxiliary_loss_mlp": 0.01043406, + "balance_loss_clip": 1.05020928, + "balance_loss_mlp": 1.02913678, + "epoch": 0.3223808808056516, + "flos": 32787459948960.0, + "grad_norm": 2.1660645253224615, + "language_loss": 0.71688539, + "learning_rate": 3.168280261735588e-06, + "loss": 0.7386874, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.14263916, + "step": 5362, + "time_per_iteration": 2.7251529693603516 + }, + { + "auxiliary_loss_clip": 0.01137642, + "auxiliary_loss_mlp": 0.0104035, + "balance_loss_clip": 1.0510447, + "balance_loss_mlp": 1.02758253, + "epoch": 0.32244100405831955, + "flos": 32654663356320.0, + "grad_norm": 1.996584962441404, + "language_loss": 0.74227607, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76405597, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.12768555, + "step": 5363, + "time_per_iteration": 2.708164691925049 + }, + { + "auxiliary_loss_clip": 0.01142719, + "auxiliary_loss_mlp": 0.0104135, + "balance_loss_clip": 1.05095792, + "balance_loss_mlp": 1.0261147, + "epoch": 0.3225011273109875, + "flos": 29044966538880.0, + "grad_norm": 2.8194202924195384, + "language_loss": 0.76831675, + "learning_rate": 3.167647957801365e-06, + "loss": 0.79015744, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 0.91699219, + "router_z_loss_mlp": 0.15234375, + "step": 5364, + "time_per_iteration": 2.656331777572632 + }, + { + "auxiliary_loss_clip": 0.01138152, + "auxiliary_loss_mlp": 0.01039898, + "balance_loss_clip": 1.05041707, + "balance_loss_mlp": 1.02514005, + "epoch": 0.3225612505636555, + "flos": 21078137044800.0, + "grad_norm": 2.2624257453245273, + "language_loss": 0.77175212, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.79353261, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.14758301, + "step": 5365, + "time_per_iteration": 2.634678840637207 + }, + { + "auxiliary_loss_clip": 0.01140626, + "auxiliary_loss_mlp": 0.01041489, + "balance_loss_clip": 1.0527606, + "balance_loss_mlp": 1.02722573, + "epoch": 0.32262137381632344, + "flos": 28511875856160.0, + "grad_norm": 1.9588033530996218, + "language_loss": 0.76543617, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.78725731, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.14263916, + "step": 5366, + "time_per_iteration": 2.6634647846221924 + }, + { + "auxiliary_loss_clip": 0.01136662, + "auxiliary_loss_mlp": 0.01036024, + "balance_loss_clip": 1.04877484, + "balance_loss_mlp": 1.0215342, + "epoch": 0.3226814970689914, + "flos": 28380578402880.0, + "grad_norm": 1.9468674018070615, + "language_loss": 0.71546519, + "learning_rate": 3.166699169850055e-06, + "loss": 0.73719209, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.14501953, + "step": 5367, + "time_per_iteration": 2.7330639362335205 + }, + { + "auxiliary_loss_clip": 0.01135494, + "auxiliary_loss_mlp": 0.01038187, + "balance_loss_clip": 1.04990983, + "balance_loss_mlp": 1.02492523, + "epoch": 0.32274162032165943, + "flos": 20005472914560.0, + "grad_norm": 1.9560756249826454, + "language_loss": 0.74623263, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.76796949, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.13275146, + "step": 5368, + "time_per_iteration": 2.640099287033081 + }, + { + "auxiliary_loss_clip": 0.01135497, + "auxiliary_loss_mlp": 0.01038653, + "balance_loss_clip": 1.05116701, + "balance_loss_mlp": 1.02435994, + "epoch": 0.3228017435743274, + "flos": 33989395668480.0, + "grad_norm": 4.737504688874236, + "language_loss": 0.78480715, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.8065486, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.1428833, + "step": 5369, + "time_per_iteration": 2.681406259536743 + }, + { + "auxiliary_loss_clip": 0.01132728, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.0486753, + "balance_loss_mlp": 1.01842356, + "epoch": 0.32286186682699536, + "flos": 23926662645120.0, + "grad_norm": 1.863613183400238, + "language_loss": 0.83318722, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85483748, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.13867188, + "step": 5370, + "time_per_iteration": 2.6424872875213623 + }, + { + "auxiliary_loss_clip": 0.01138645, + "auxiliary_loss_mlp": 0.01038173, + "balance_loss_clip": 1.05135584, + "balance_loss_mlp": 1.0240407, + "epoch": 0.3229219900796633, + "flos": 29448623563200.0, + "grad_norm": 2.366293824552467, + "language_loss": 0.83083606, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.85260427, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.14129639, + "step": 5371, + "time_per_iteration": 2.663949489593506 + }, + { + "auxiliary_loss_clip": 0.0114202, + "auxiliary_loss_mlp": 0.01044239, + "balance_loss_clip": 1.05146861, + "balance_loss_mlp": 1.02841949, + "epoch": 0.3229821133323313, + "flos": 21656803317120.0, + "grad_norm": 2.4195679132006322, + "language_loss": 0.88905859, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.91092122, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 0.90625, + "router_z_loss_mlp": 0.15808105, + "step": 5372, + "time_per_iteration": 2.6544501781463623 + }, + { + "auxiliary_loss_clip": 0.01139337, + "auxiliary_loss_mlp": 0.01046024, + "balance_loss_clip": 1.05260575, + "balance_loss_mlp": 1.03174233, + "epoch": 0.32304223658499925, + "flos": 27266795583840.0, + "grad_norm": 2.078966613867566, + "language_loss": 0.72915316, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75100678, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.14276123, + "step": 5373, + "time_per_iteration": 2.759127616882324 + }, + { + "auxiliary_loss_clip": 0.01136732, + "auxiliary_loss_mlp": 0.01035923, + "balance_loss_clip": 1.05146408, + "balance_loss_mlp": 1.02223206, + "epoch": 0.3231023598376672, + "flos": 22547124571680.0, + "grad_norm": 2.263254683836142, + "language_loss": 0.80880612, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83053267, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.13702393, + "step": 5374, + "time_per_iteration": 2.7035934925079346 + }, + { + "auxiliary_loss_clip": 0.01134103, + "auxiliary_loss_mlp": 0.01038494, + "balance_loss_clip": 1.04896617, + "balance_loss_mlp": 1.02412295, + "epoch": 0.3231624830903352, + "flos": 33722424895680.0, + "grad_norm": 2.259021101780934, + "language_loss": 0.87613362, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.89785963, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.14361572, + "step": 5375, + "time_per_iteration": 2.7038257122039795 + }, + { + "auxiliary_loss_clip": 0.01140646, + "auxiliary_loss_mlp": 0.01037737, + "balance_loss_clip": 1.04994881, + "balance_loss_mlp": 1.02258503, + "epoch": 0.32322260634300315, + "flos": 26510648509440.0, + "grad_norm": 2.522466840436076, + "language_loss": 0.75902379, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.78080761, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 0.90722656, + "router_z_loss_mlp": 0.15136719, + "step": 5376, + "time_per_iteration": 2.682037115097046 + }, + { + "auxiliary_loss_clip": 0.011349, + "auxiliary_loss_mlp": 0.01038461, + "balance_loss_clip": 1.04894137, + "balance_loss_mlp": 1.02519917, + "epoch": 0.3232827295956711, + "flos": 27622689085440.0, + "grad_norm": 1.6684862010482657, + "language_loss": 0.66618001, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.68791366, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.13275146, + "step": 5377, + "time_per_iteration": 2.6961510181427 + }, + { + "auxiliary_loss_clip": 0.01138155, + "auxiliary_loss_mlp": 0.01048391, + "balance_loss_clip": 1.05079782, + "balance_loss_mlp": 1.03203511, + "epoch": 0.3233428528483391, + "flos": 32119344223200.0, + "grad_norm": 1.5171517971870234, + "language_loss": 0.72238767, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.7442531, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.16363525, + "step": 5378, + "time_per_iteration": 2.7109837532043457 + }, + { + "auxiliary_loss_clip": 0.01136003, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.0486753, + "balance_loss_mlp": 1.01710117, + "epoch": 0.32340297610100704, + "flos": 34880324682240.0, + "grad_norm": 2.002694504073069, + "language_loss": 0.82000965, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84168017, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.1394043, + "step": 5379, + "time_per_iteration": 2.7369422912597656 + }, + { + "auxiliary_loss_clip": 0.01141038, + "auxiliary_loss_mlp": 0.01035003, + "balance_loss_clip": 1.0508039, + "balance_loss_mlp": 1.02116919, + "epoch": 0.323463099353675, + "flos": 37551045342240.0, + "grad_norm": 1.6714459804562762, + "language_loss": 0.78895032, + "learning_rate": 3.162583158454388e-06, + "loss": 0.81071073, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.13824463, + "step": 5380, + "time_per_iteration": 2.7215356826782227 + }, + { + "auxiliary_loss_clip": 0.01140091, + "auxiliary_loss_mlp": 0.01039163, + "balance_loss_clip": 1.05279398, + "balance_loss_mlp": 1.02552557, + "epoch": 0.32352322260634303, + "flos": 30784328290080.0, + "grad_norm": 1.7533691374023797, + "language_loss": 0.77146566, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.79325819, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.13653564, + "step": 5381, + "time_per_iteration": 2.67939829826355 + }, + { + "auxiliary_loss_clip": 0.01133432, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.04989314, + "balance_loss_mlp": 1.02229929, + "epoch": 0.323583345859011, + "flos": 28468164061440.0, + "grad_norm": 2.216577839432679, + "language_loss": 0.71370125, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.73538828, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.12969971, + "step": 5382, + "time_per_iteration": 2.6566736698150635 + }, + { + "auxiliary_loss_clip": 0.01141438, + "auxiliary_loss_mlp": 0.01042809, + "balance_loss_clip": 1.0515132, + "balance_loss_mlp": 1.02852809, + "epoch": 0.32364346911167896, + "flos": 31979416589280.0, + "grad_norm": 3.1900085588145783, + "language_loss": 0.7051608, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.72700322, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.1427002, + "step": 5383, + "time_per_iteration": 2.6996521949768066 + }, + { + "auxiliary_loss_clip": 0.01132634, + "auxiliary_loss_mlp": 0.01037234, + "balance_loss_clip": 1.04865134, + "balance_loss_mlp": 1.02418089, + "epoch": 0.3237035923643469, + "flos": 28913871672000.0, + "grad_norm": 2.0217215357528544, + "language_loss": 0.78451437, + "learning_rate": 3.161315193285283e-06, + "loss": 0.80621302, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.1305542, + "step": 5384, + "time_per_iteration": 5.470987558364868 + }, + { + "auxiliary_loss_clip": 0.01138164, + "auxiliary_loss_mlp": 0.01041778, + "balance_loss_clip": 1.04958093, + "balance_loss_mlp": 1.02593541, + "epoch": 0.3237637156170149, + "flos": 17605491134400.0, + "grad_norm": 2.520097072360728, + "language_loss": 0.74907804, + "learning_rate": 3.16099809186998e-06, + "loss": 0.77087748, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 0.88574219, + "router_z_loss_mlp": 0.15844727, + "step": 5385, + "time_per_iteration": 2.6785542964935303 + }, + { + "auxiliary_loss_clip": 0.01138481, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.052037, + "balance_loss_mlp": 1.0241816, + "epoch": 0.32382383886968286, + "flos": 37904183668800.0, + "grad_norm": 1.9144100743147414, + "language_loss": 0.71766639, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.73943448, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.14160156, + "step": 5386, + "time_per_iteration": 4.160263538360596 + }, + { + "auxiliary_loss_clip": 0.0113978, + "auxiliary_loss_mlp": 0.01035763, + "balance_loss_clip": 1.04961967, + "balance_loss_mlp": 1.02067125, + "epoch": 0.3238839621223508, + "flos": 28378471504320.0, + "grad_norm": 1.905688997642518, + "language_loss": 0.94714534, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.9689008, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.15087891, + "step": 5387, + "time_per_iteration": 2.681183099746704 + }, + { + "auxiliary_loss_clip": 0.0113935, + "auxiliary_loss_mlp": 0.01041421, + "balance_loss_clip": 1.05056763, + "balance_loss_mlp": 1.02605522, + "epoch": 0.3239440853750188, + "flos": 28021848691680.0, + "grad_norm": 16.13543602229099, + "language_loss": 0.77546638, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79727411, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.15356445, + "step": 5388, + "time_per_iteration": 2.6590425968170166 + }, + { + "auxiliary_loss_clip": 0.01137387, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.04931998, + "balance_loss_mlp": 1.01818609, + "epoch": 0.32400420862768675, + "flos": 44229406907520.0, + "grad_norm": 2.130058605643187, + "language_loss": 0.71502209, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.73672235, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14459229, + "step": 5389, + "time_per_iteration": 4.238020181655884 + }, + { + "auxiliary_loss_clip": 0.01135799, + "auxiliary_loss_mlp": 0.01033214, + "balance_loss_clip": 1.05067539, + "balance_loss_mlp": 1.01857507, + "epoch": 0.3240643318803547, + "flos": 26376555363840.0, + "grad_norm": 1.902051897928305, + "language_loss": 0.81178743, + "learning_rate": 3.159411924656557e-06, + "loss": 0.83347756, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.1463623, + "step": 5390, + "time_per_iteration": 2.6542046070098877 + }, + { + "auxiliary_loss_clip": 0.01139987, + "auxiliary_loss_mlp": 0.01043224, + "balance_loss_clip": 1.05269742, + "balance_loss_mlp": 1.02844214, + "epoch": 0.3241244551330227, + "flos": 28425465198720.0, + "grad_norm": 2.107904737958526, + "language_loss": 0.72761256, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.74944466, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.14770508, + "step": 5391, + "time_per_iteration": 2.638170003890991 + }, + { + "auxiliary_loss_clip": 0.01131832, + "auxiliary_loss_mlp": 0.01038518, + "balance_loss_clip": 1.04788256, + "balance_loss_mlp": 1.02510095, + "epoch": 0.32418457838569065, + "flos": 17201266868160.0, + "grad_norm": 2.3631616674019247, + "language_loss": 0.77147841, + "learning_rate": 3.158777149931855e-06, + "loss": 0.79318196, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13415527, + "step": 5392, + "time_per_iteration": 2.628356695175171 + }, + { + "auxiliary_loss_clip": 0.01139554, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.04954338, + "balance_loss_mlp": 1.02155232, + "epoch": 0.3242447016383586, + "flos": 36304952137920.0, + "grad_norm": 2.059415078498091, + "language_loss": 0.62387639, + "learning_rate": 3.158459696652067e-06, + "loss": 0.6456418, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 0.90039062, + "router_z_loss_mlp": 0.15441895, + "step": 5393, + "time_per_iteration": 2.687781572341919 + }, + { + "auxiliary_loss_clip": 0.01133675, + "auxiliary_loss_mlp": 0.01037585, + "balance_loss_clip": 1.04763222, + "balance_loss_mlp": 1.02323246, + "epoch": 0.3243048248910266, + "flos": 29713811575680.0, + "grad_norm": 1.5501560901566889, + "language_loss": 0.82469392, + "learning_rate": 3.158142199443371e-06, + "loss": 0.84640646, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.14343262, + "step": 5394, + "time_per_iteration": 2.6915407180786133 + }, + { + "auxiliary_loss_clip": 0.01131677, + "auxiliary_loss_mlp": 0.01038674, + "balance_loss_clip": 1.04989111, + "balance_loss_mlp": 1.02566195, + "epoch": 0.3243649481436946, + "flos": 29715877956960.0, + "grad_norm": 1.8322079622116008, + "language_loss": 0.81669295, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.83839643, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13031006, + "step": 5395, + "time_per_iteration": 2.6553750038146973 + }, + { + "auxiliary_loss_clip": 0.01134221, + "auxiliary_loss_mlp": 0.01042888, + "balance_loss_clip": 1.05241704, + "balance_loss_mlp": 1.02899981, + "epoch": 0.32442507139636256, + "flos": 27974004134400.0, + "grad_norm": 1.800211993533915, + "language_loss": 0.83520764, + "learning_rate": 3.157507073287417e-06, + "loss": 0.85697865, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13891602, + "step": 5396, + "time_per_iteration": 2.668379306793213 + }, + { + "auxiliary_loss_clip": 0.01139524, + "auxiliary_loss_mlp": 0.01042242, + "balance_loss_clip": 1.04947305, + "balance_loss_mlp": 1.02812791, + "epoch": 0.32448519464903053, + "flos": 27088828574400.0, + "grad_norm": 2.417262805506303, + "language_loss": 0.75812525, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.77994287, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.14111328, + "step": 5397, + "time_per_iteration": 2.6584291458129883 + }, + { + "auxiliary_loss_clip": 0.01135442, + "auxiliary_loss_mlp": 0.01030603, + "balance_loss_clip": 1.05095232, + "balance_loss_mlp": 1.01635766, + "epoch": 0.3245453179016985, + "flos": 22987038211200.0, + "grad_norm": 2.394960045528709, + "language_loss": 0.67885929, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.70051974, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14233398, + "step": 5398, + "time_per_iteration": 2.6463029384613037 + }, + { + "auxiliary_loss_clip": 0.01133678, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.04850829, + "balance_loss_mlp": 1.01809359, + "epoch": 0.32460544115436646, + "flos": 25841965541760.0, + "grad_norm": 1.5122916954385286, + "language_loss": 0.72667819, + "learning_rate": 3.156554054887718e-06, + "loss": 0.74833632, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.14044189, + "step": 5399, + "time_per_iteration": 2.677633762359619 + }, + { + "auxiliary_loss_clip": 0.01135338, + "auxiliary_loss_mlp": 0.01034468, + "balance_loss_clip": 1.04978168, + "balance_loss_mlp": 1.02037787, + "epoch": 0.3246655644070344, + "flos": 26821533663360.0, + "grad_norm": 2.4072480530097375, + "language_loss": 0.71526098, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.7369591, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.14086914, + "step": 5400, + "time_per_iteration": 2.661062240600586 + }, + { + "auxiliary_loss_clip": 0.01137688, + "auxiliary_loss_mlp": 0.01037691, + "balance_loss_clip": 1.04993558, + "balance_loss_mlp": 1.02385056, + "epoch": 0.3247256876597024, + "flos": 39242319432480.0, + "grad_norm": 2.092518526362012, + "language_loss": 0.79364181, + "learning_rate": 3.155918489984614e-06, + "loss": 0.81539559, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 0.87792969, + "router_z_loss_mlp": 0.13830566, + "step": 5401, + "time_per_iteration": 2.7129576206207275 + }, + { + "auxiliary_loss_clip": 0.01139665, + "auxiliary_loss_mlp": 0.01037204, + "balance_loss_clip": 1.05207384, + "balance_loss_mlp": 1.02224362, + "epoch": 0.32478581091237035, + "flos": 25621137600480.0, + "grad_norm": 2.1758674262594297, + "language_loss": 0.87701732, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.89878601, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.14953613, + "step": 5402, + "time_per_iteration": 2.672250986099243 + }, + { + "auxiliary_loss_clip": 0.01130624, + "auxiliary_loss_mlp": 0.01036618, + "balance_loss_clip": 1.0479629, + "balance_loss_mlp": 1.02345097, + "epoch": 0.3248459341650383, + "flos": 21872485563840.0, + "grad_norm": 2.293899999456762, + "language_loss": 0.84240633, + "learning_rate": 3.155282749751332e-06, + "loss": 0.86407876, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13171387, + "step": 5403, + "time_per_iteration": 2.7286014556884766 + }, + { + "auxiliary_loss_clip": 0.01132635, + "auxiliary_loss_mlp": 0.01040556, + "balance_loss_clip": 1.05119765, + "balance_loss_mlp": 1.02820551, + "epoch": 0.3249060574177063, + "flos": 29938650727680.0, + "grad_norm": 1.9894936764830726, + "language_loss": 0.87405509, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89578694, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.12341309, + "step": 5404, + "time_per_iteration": 2.697913646697998 + }, + { + "auxiliary_loss_clip": 0.01135897, + "auxiliary_loss_mlp": 0.01036882, + "balance_loss_clip": 1.0519706, + "balance_loss_mlp": 1.02283967, + "epoch": 0.32496618067037425, + "flos": 31719252719520.0, + "grad_norm": 1.7450199707954808, + "language_loss": 0.72811478, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.74984252, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.14056396, + "step": 5405, + "time_per_iteration": 2.7112693786621094 + }, + { + "auxiliary_loss_clip": 0.01134575, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.05038607, + "balance_loss_mlp": 1.02067971, + "epoch": 0.3250263039230422, + "flos": 23883801713280.0, + "grad_norm": 2.1090892454321453, + "language_loss": 0.82902759, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.85071862, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13842773, + "step": 5406, + "time_per_iteration": 2.659240484237671 + }, + { + "auxiliary_loss_clip": 0.0113522, + "auxiliary_loss_mlp": 0.01032698, + "balance_loss_clip": 1.05134571, + "balance_loss_mlp": 1.01996684, + "epoch": 0.3250864271757102, + "flos": 20454746045760.0, + "grad_norm": 1.7659184701425898, + "language_loss": 0.87498248, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.8966617, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.12731934, + "step": 5407, + "time_per_iteration": 2.6095657348632812 + }, + { + "auxiliary_loss_clip": 0.0113598, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.0510087, + "balance_loss_mlp": 1.02293706, + "epoch": 0.3251465504283782, + "flos": 33945845942880.0, + "grad_norm": 1.686852656465957, + "language_loss": 0.69615728, + "learning_rate": 3.153692632731479e-06, + "loss": 0.71788239, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.13592529, + "step": 5408, + "time_per_iteration": 2.6874101161956787 + }, + { + "auxiliary_loss_clip": 0.01141356, + "auxiliary_loss_mlp": 0.0103347, + "balance_loss_clip": 1.05107343, + "balance_loss_mlp": 1.01980233, + "epoch": 0.32520667368104617, + "flos": 23260694335200.0, + "grad_norm": 3.3405274424604143, + "language_loss": 0.77659112, + "learning_rate": 3.153374478034841e-06, + "loss": 0.79833937, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.13653564, + "step": 5409, + "time_per_iteration": 2.661595582962036 + }, + { + "auxiliary_loss_clip": 0.01135724, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_clip": 1.04762459, + "balance_loss_mlp": 1.02686143, + "epoch": 0.32526679693371413, + "flos": 35853329004480.0, + "grad_norm": 1.7709621782498548, + "language_loss": 0.8321436, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85390204, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.13262939, + "step": 5410, + "time_per_iteration": 2.723240613937378 + }, + { + "auxiliary_loss_clip": 0.01131064, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.0476408, + "balance_loss_mlp": 1.01602888, + "epoch": 0.3253269201863821, + "flos": 25263663924960.0, + "grad_norm": 3.9658772095624015, + "language_loss": 0.71073568, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73233551, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.12890625, + "step": 5411, + "time_per_iteration": 2.6550185680389404 + }, + { + "auxiliary_loss_clip": 0.01135697, + "auxiliary_loss_mlp": 0.01037279, + "balance_loss_clip": 1.05061483, + "balance_loss_mlp": 1.02415979, + "epoch": 0.32538704343905006, + "flos": 35501527748160.0, + "grad_norm": 1.740907328206109, + "language_loss": 0.82981336, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85154313, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.13128662, + "step": 5412, + "time_per_iteration": 2.670785665512085 + }, + { + "auxiliary_loss_clip": 0.01138781, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.0500766, + "balance_loss_mlp": 1.02348876, + "epoch": 0.325447166691718, + "flos": 30110621179680.0, + "grad_norm": 1.7185977203729244, + "language_loss": 0.81156433, + "learning_rate": 3.152101422008203e-06, + "loss": 0.83333182, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.14471436, + "step": 5413, + "time_per_iteration": 2.6704769134521484 + }, + { + "auxiliary_loss_clip": 0.01138005, + "auxiliary_loss_mlp": 0.01034697, + "balance_loss_clip": 1.0528543, + "balance_loss_mlp": 1.01997471, + "epoch": 0.325507289944386, + "flos": 26287065393120.0, + "grad_norm": 1.76430669638224, + "language_loss": 0.76989657, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79162359, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.1472168, + "step": 5414, + "time_per_iteration": 2.6404869556427 + }, + { + "auxiliary_loss_clip": 0.01047859, + "auxiliary_loss_mlp": 0.01029592, + "balance_loss_clip": 1.01989126, + "balance_loss_mlp": 1.02764428, + "epoch": 0.32556741319705396, + "flos": 87268668624000.0, + "grad_norm": 0.9225594766186254, + "language_loss": 0.63982606, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.6606006, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.27978516, + "router_z_loss_mlp": 0.01945496, + "step": 5415, + "time_per_iteration": 3.189889907836914 + }, + { + "auxiliary_loss_clip": 0.0113514, + "auxiliary_loss_mlp": 0.01035716, + "balance_loss_clip": 1.05007803, + "balance_loss_mlp": 1.02236462, + "epoch": 0.3256275364497219, + "flos": 28958596398720.0, + "grad_norm": 1.534564754540977, + "language_loss": 0.74031973, + "learning_rate": 3.151146171224075e-06, + "loss": 0.76202822, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.13366699, + "step": 5416, + "time_per_iteration": 2.7161924839019775 + }, + { + "auxiliary_loss_clip": 0.01047648, + "auxiliary_loss_mlp": 0.01015333, + "balance_loss_clip": 1.01962042, + "balance_loss_mlp": 1.01337481, + "epoch": 0.3256876597023899, + "flos": 82107220177440.0, + "grad_norm": 0.778866014182375, + "language_loss": 0.5790298, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.59965962, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.28051758, + "router_z_loss_mlp": 0.01956177, + "step": 5417, + "time_per_iteration": 3.339186668395996 + }, + { + "auxiliary_loss_clip": 0.01047791, + "auxiliary_loss_mlp": 0.01007062, + "balance_loss_clip": 1.01976418, + "balance_loss_mlp": 1.00513232, + "epoch": 0.32574778295505785, + "flos": 86978447282880.0, + "grad_norm": 0.8144942704897541, + "language_loss": 0.63430524, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65485376, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.28051758, + "router_z_loss_mlp": 0.01927185, + "step": 5418, + "time_per_iteration": 3.357624053955078 + }, + { + "auxiliary_loss_clip": 0.01135784, + "auxiliary_loss_mlp": 0.01039924, + "balance_loss_clip": 1.05080843, + "balance_loss_mlp": 1.02679932, + "epoch": 0.3258079062077258, + "flos": 25351452169920.0, + "grad_norm": 2.6081847948583503, + "language_loss": 0.69528699, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71704406, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.13128662, + "step": 5419, + "time_per_iteration": 2.6911396980285645 + }, + { + "auxiliary_loss_clip": 0.01137547, + "auxiliary_loss_mlp": 0.01037805, + "balance_loss_clip": 1.05124974, + "balance_loss_mlp": 1.02330863, + "epoch": 0.3258680294603938, + "flos": 27131973127200.0, + "grad_norm": 1.6803759855459757, + "language_loss": 0.77569371, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.7974472, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.14489746, + "step": 5420, + "time_per_iteration": 2.695857048034668 + }, + { + "auxiliary_loss_clip": 0.01138895, + "auxiliary_loss_mlp": 0.01036893, + "balance_loss_clip": 1.0514617, + "balance_loss_mlp": 1.02246869, + "epoch": 0.3259281527130618, + "flos": 32921472060000.0, + "grad_norm": 1.8385734207651687, + "language_loss": 0.80223298, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.82399082, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.14404297, + "step": 5421, + "time_per_iteration": 2.720716714859009 + }, + { + "auxiliary_loss_clip": 0.0113292, + "auxiliary_loss_mlp": 0.01039309, + "balance_loss_clip": 1.05012798, + "balance_loss_mlp": 1.02661324, + "epoch": 0.32598827596572977, + "flos": 31987033837920.0, + "grad_norm": 1.512663015104041, + "language_loss": 0.75249565, + "learning_rate": 3.149234491389381e-06, + "loss": 0.77421796, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.12695312, + "step": 5422, + "time_per_iteration": 2.74928617477417 + }, + { + "auxiliary_loss_clip": 0.01140486, + "auxiliary_loss_mlp": 0.0104111, + "balance_loss_clip": 1.05434966, + "balance_loss_mlp": 1.02719879, + "epoch": 0.32604839921839773, + "flos": 21523804138080.0, + "grad_norm": 7.790611545106692, + "language_loss": 0.62660003, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.64841604, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.13916016, + "step": 5423, + "time_per_iteration": 4.064270734786987 + }, + { + "auxiliary_loss_clip": 0.01127153, + "auxiliary_loss_mlp": 0.01034592, + "balance_loss_clip": 1.04715788, + "balance_loss_mlp": 1.02274227, + "epoch": 0.3261085224710657, + "flos": 28824746356800.0, + "grad_norm": 2.4800532487356217, + "language_loss": 0.74828231, + "learning_rate": 3.148596916016224e-06, + "loss": 0.76989979, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.11859131, + "step": 5424, + "time_per_iteration": 4.141298770904541 + }, + { + "auxiliary_loss_clip": 0.01132277, + "auxiliary_loss_mlp": 0.01043701, + "balance_loss_clip": 1.05028868, + "balance_loss_mlp": 1.03121388, + "epoch": 0.32616864572373366, + "flos": 28382442197760.0, + "grad_norm": 2.5163809573949436, + "language_loss": 0.77439862, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79615843, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.12475586, + "step": 5425, + "time_per_iteration": 4.144116163253784 + }, + { + "auxiliary_loss_clip": 0.01137957, + "auxiliary_loss_mlp": 0.01044025, + "balance_loss_clip": 1.04928505, + "balance_loss_mlp": 1.02890968, + "epoch": 0.32622876897640163, + "flos": 31229509176000.0, + "grad_norm": 3.588354820047803, + "language_loss": 0.78053284, + "learning_rate": 3.147959166423428e-06, + "loss": 0.80235267, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 0.88720703, + "router_z_loss_mlp": 0.15124512, + "step": 5426, + "time_per_iteration": 2.706721544265747 + }, + { + "auxiliary_loss_clip": 0.01134592, + "auxiliary_loss_mlp": 0.01036258, + "balance_loss_clip": 1.04917979, + "balance_loss_mlp": 1.02244139, + "epoch": 0.3262888922290696, + "flos": 27355839864480.0, + "grad_norm": 1.9132246835203128, + "language_loss": 0.7409367, + "learning_rate": 3.147640226324893e-06, + "loss": 0.76264524, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.13812256, + "step": 5427, + "time_per_iteration": 2.667628526687622 + }, + { + "auxiliary_loss_clip": 0.01136041, + "auxiliary_loss_mlp": 0.01040373, + "balance_loss_clip": 1.04792547, + "balance_loss_mlp": 1.02572775, + "epoch": 0.32634901548173756, + "flos": 24060998894400.0, + "grad_norm": 1.976012177261611, + "language_loss": 0.79326057, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.81502467, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 0.88134766, + "router_z_loss_mlp": 0.14654541, + "step": 5428, + "time_per_iteration": 4.144024848937988 + }, + { + "auxiliary_loss_clip": 0.01132253, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.04737759, + "balance_loss_mlp": 1.02549601, + "epoch": 0.3264091387344055, + "flos": 19697342935680.0, + "grad_norm": 1.814156967982985, + "language_loss": 0.71170676, + "learning_rate": 3.147002215584023e-06, + "loss": 0.73342121, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.13684082, + "step": 5429, + "time_per_iteration": 2.6559181213378906 + }, + { + "auxiliary_loss_clip": 0.01132717, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.04910254, + "balance_loss_mlp": 1.0223757, + "epoch": 0.3264692619870735, + "flos": 20009848780800.0, + "grad_norm": 2.2079153157019533, + "language_loss": 0.78494966, + "learning_rate": 3.146683144965881e-06, + "loss": 0.80662811, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.12756348, + "step": 5430, + "time_per_iteration": 2.6242423057556152 + }, + { + "auxiliary_loss_clip": 0.01137516, + "auxiliary_loss_mlp": 0.01040937, + "balance_loss_clip": 1.05066705, + "balance_loss_mlp": 1.02638197, + "epoch": 0.32652938523974145, + "flos": 27311115137760.0, + "grad_norm": 1.8089973192800353, + "language_loss": 0.8403616, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86214614, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.14550781, + "step": 5431, + "time_per_iteration": 2.648986339569092 + }, + { + "auxiliary_loss_clip": 0.01130049, + "auxiliary_loss_mlp": 0.01036004, + "balance_loss_clip": 1.04736447, + "balance_loss_mlp": 1.02308786, + "epoch": 0.3265895084924094, + "flos": 26732367830880.0, + "grad_norm": 2.253993327102952, + "language_loss": 0.70477706, + "learning_rate": 3.146044873294678e-06, + "loss": 0.72643757, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.12908936, + "step": 5432, + "time_per_iteration": 2.6490933895111084 + }, + { + "auxiliary_loss_clip": 0.01132515, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.04744601, + "balance_loss_mlp": 1.01825309, + "epoch": 0.3266496317450774, + "flos": 19605138307200.0, + "grad_norm": 1.556686515481988, + "language_loss": 0.84143525, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86306894, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.12609863, + "step": 5433, + "time_per_iteration": 2.6748030185699463 + }, + { + "auxiliary_loss_clip": 0.01132381, + "auxiliary_loss_mlp": 0.01030256, + "balance_loss_clip": 1.05000687, + "balance_loss_mlp": 1.01769686, + "epoch": 0.3267097549977454, + "flos": 27489163181760.0, + "grad_norm": 1.4130522661097136, + "language_loss": 0.85782492, + "learning_rate": 3.145406427790931e-06, + "loss": 0.87945127, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.12567139, + "step": 5434, + "time_per_iteration": 2.6451332569122314 + }, + { + "auxiliary_loss_clip": 0.01137881, + "auxiliary_loss_mlp": 0.01037126, + "balance_loss_clip": 1.05165517, + "balance_loss_mlp": 1.02327406, + "epoch": 0.32676987825041337, + "flos": 33276028491360.0, + "grad_norm": 3.488667678653578, + "language_loss": 0.87611514, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.89786518, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.13848877, + "step": 5435, + "time_per_iteration": 2.6707229614257812 + }, + { + "auxiliary_loss_clip": 0.01130752, + "auxiliary_loss_mlp": 0.010339, + "balance_loss_clip": 1.04676056, + "balance_loss_mlp": 1.02052498, + "epoch": 0.32683000150308134, + "flos": 14041856113920.0, + "grad_norm": 2.421219841050958, + "language_loss": 0.76765174, + "learning_rate": 3.144767808551479e-06, + "loss": 0.7892983, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13391113, + "step": 5436, + "time_per_iteration": 2.6448256969451904 + }, + { + "auxiliary_loss_clip": 0.01130352, + "auxiliary_loss_mlp": 0.01033587, + "balance_loss_clip": 1.04798698, + "balance_loss_mlp": 1.02087331, + "epoch": 0.3268901247557493, + "flos": 31274355454560.0, + "grad_norm": 1.7951310196100347, + "language_loss": 0.71699941, + "learning_rate": 3.144448433811134e-06, + "loss": 0.73863876, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.12713623, + "step": 5437, + "time_per_iteration": 2.675175905227661 + }, + { + "auxiliary_loss_clip": 0.01134696, + "auxiliary_loss_mlp": 0.01035366, + "balance_loss_clip": 1.04697526, + "balance_loss_mlp": 1.02027416, + "epoch": 0.32695024800841727, + "flos": 30335824987200.0, + "grad_norm": 1.534859506810044, + "language_loss": 0.6349026, + "learning_rate": 3.144129015673189e-06, + "loss": 0.65660322, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.15081787, + "step": 5438, + "time_per_iteration": 2.704148769378662 + }, + { + "auxiliary_loss_clip": 0.0113231, + "auxiliary_loss_mlp": 0.0103732, + "balance_loss_clip": 1.04878759, + "balance_loss_mlp": 1.02401674, + "epoch": 0.32701037126108523, + "flos": 35190440007840.0, + "grad_norm": 1.7555763694252091, + "language_loss": 0.74869937, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.7703957, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13293457, + "step": 5439, + "time_per_iteration": 2.6994049549102783 + }, + { + "auxiliary_loss_clip": 0.01133194, + "auxiliary_loss_mlp": 0.01038995, + "balance_loss_clip": 1.04891813, + "balance_loss_mlp": 1.02533913, + "epoch": 0.3270704945137532, + "flos": 34123569848640.0, + "grad_norm": 1.8592865630507054, + "language_loss": 0.74825025, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.76997215, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.13647461, + "step": 5440, + "time_per_iteration": 2.7117114067077637 + }, + { + "auxiliary_loss_clip": 0.01130238, + "auxiliary_loss_mlp": 0.01044757, + "balance_loss_clip": 1.04686308, + "balance_loss_mlp": 1.03197169, + "epoch": 0.32713061776642116, + "flos": 28908158735520.0, + "grad_norm": 2.2970757826671786, + "language_loss": 0.84634829, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.86809826, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.12792969, + "step": 5441, + "time_per_iteration": 2.6323788166046143 + }, + { + "auxiliary_loss_clip": 0.01131837, + "auxiliary_loss_mlp": 0.01038326, + "balance_loss_clip": 1.04644227, + "balance_loss_mlp": 1.02466416, + "epoch": 0.3271907410190891, + "flos": 27400524073920.0, + "grad_norm": 2.4137583420769833, + "language_loss": 0.86839092, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.89009249, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.13659668, + "step": 5442, + "time_per_iteration": 2.7273056507110596 + }, + { + "auxiliary_loss_clip": 0.01135295, + "auxiliary_loss_mlp": 0.01035931, + "balance_loss_clip": 1.04809213, + "balance_loss_mlp": 1.02126265, + "epoch": 0.3272508642717571, + "flos": 27845340304320.0, + "grad_norm": 2.407973364641513, + "language_loss": 0.7727499, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.7944622, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.14654541, + "step": 5443, + "time_per_iteration": 2.6546144485473633 + }, + { + "auxiliary_loss_clip": 0.01133143, + "auxiliary_loss_mlp": 0.01039142, + "balance_loss_clip": 1.04632044, + "balance_loss_mlp": 1.02580261, + "epoch": 0.32731098752442506, + "flos": 14399410824000.0, + "grad_norm": 2.30103831117299, + "language_loss": 0.81273198, + "learning_rate": 3.142211596174343e-06, + "loss": 0.83445477, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.13342285, + "step": 5444, + "time_per_iteration": 2.6325976848602295 + }, + { + "auxiliary_loss_clip": 0.01132338, + "auxiliary_loss_mlp": 0.01040342, + "balance_loss_clip": 1.04808903, + "balance_loss_mlp": 1.0270977, + "epoch": 0.327371110777093, + "flos": 25658609734080.0, + "grad_norm": 2.8914922912430807, + "language_loss": 0.58942044, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61114728, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.13250732, + "step": 5445, + "time_per_iteration": 2.7152814865112305 + }, + { + "auxiliary_loss_clip": 0.01133024, + "auxiliary_loss_mlp": 0.01039761, + "balance_loss_clip": 1.04784513, + "balance_loss_mlp": 1.0260098, + "epoch": 0.327431234029761, + "flos": 23259519334080.0, + "grad_norm": 2.033744417646334, + "language_loss": 0.8844623, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.9061901, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.1373291, + "step": 5446, + "time_per_iteration": 2.669785737991333 + }, + { + "auxiliary_loss_clip": 0.01138286, + "auxiliary_loss_mlp": 0.01046917, + "balance_loss_clip": 1.04973197, + "balance_loss_mlp": 1.03139567, + "epoch": 0.32749135728242895, + "flos": 31541609848320.0, + "grad_norm": 1.6658395305275715, + "language_loss": 0.79039121, + "learning_rate": 3.141252301538802e-06, + "loss": 0.81224322, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.15527344, + "step": 5447, + "time_per_iteration": 2.6953353881835938 + }, + { + "auxiliary_loss_clip": 0.01129547, + "auxiliary_loss_mlp": 0.01042765, + "balance_loss_clip": 1.04605639, + "balance_loss_mlp": 1.03034925, + "epoch": 0.327551480535097, + "flos": 24551066576160.0, + "grad_norm": 1.9224830450469879, + "language_loss": 0.73191261, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.75363576, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.12420654, + "step": 5448, + "time_per_iteration": 2.638767719268799 + }, + { + "auxiliary_loss_clip": 0.01129003, + "auxiliary_loss_mlp": 0.01042131, + "balance_loss_clip": 1.04565668, + "balance_loss_mlp": 1.02857733, + "epoch": 0.32761160378776494, + "flos": 35146809247680.0, + "grad_norm": 1.6247910731105677, + "language_loss": 0.66430128, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.68601263, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13562012, + "step": 5449, + "time_per_iteration": 2.856379270553589 + }, + { + "auxiliary_loss_clip": 0.01129033, + "auxiliary_loss_mlp": 0.01037418, + "balance_loss_clip": 1.04562294, + "balance_loss_mlp": 1.02443027, + "epoch": 0.3276717270404329, + "flos": 32870507672160.0, + "grad_norm": 1.6770828037065961, + "language_loss": 0.65325832, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.67492282, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.12988281, + "step": 5450, + "time_per_iteration": 2.6583549976348877 + }, + { + "auxiliary_loss_clip": 0.01131377, + "auxiliary_loss_mlp": 0.01039618, + "balance_loss_clip": 1.04716086, + "balance_loss_mlp": 1.0263201, + "epoch": 0.32773185029310087, + "flos": 30918299883840.0, + "grad_norm": 2.3404015034346446, + "language_loss": 0.77423179, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.79594171, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.13287354, + "step": 5451, + "time_per_iteration": 2.690497875213623 + }, + { + "auxiliary_loss_clip": 0.01133446, + "auxiliary_loss_mlp": 0.01041688, + "balance_loss_clip": 1.0475738, + "balance_loss_mlp": 1.02706659, + "epoch": 0.32779197354576883, + "flos": 32204053154880.0, + "grad_norm": 4.856005515523404, + "language_loss": 0.70749986, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.72925127, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.14630127, + "step": 5452, + "time_per_iteration": 2.6518514156341553 + }, + { + "auxiliary_loss_clip": 0.01126944, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.04567778, + "balance_loss_mlp": 1.02188337, + "epoch": 0.3278520967984368, + "flos": 30382656612480.0, + "grad_norm": 1.8621256579544807, + "language_loss": 0.78698301, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.8085987, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.12744141, + "step": 5453, + "time_per_iteration": 2.6864583492279053 + }, + { + "auxiliary_loss_clip": 0.01133771, + "auxiliary_loss_mlp": 0.01031585, + "balance_loss_clip": 1.04895139, + "balance_loss_mlp": 1.01859736, + "epoch": 0.32791222005110476, + "flos": 36305316793440.0, + "grad_norm": 2.399258882765624, + "language_loss": 0.75233829, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.77399188, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.12976074, + "step": 5454, + "time_per_iteration": 2.719019889831543 + }, + { + "auxiliary_loss_clip": 0.01123676, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.04318416, + "balance_loss_mlp": 1.02322936, + "epoch": 0.32797234330377273, + "flos": 20142888477120.0, + "grad_norm": 1.8064227844300356, + "language_loss": 0.76818323, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.78976792, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.11560059, + "step": 5455, + "time_per_iteration": 2.6330292224884033 + }, + { + "auxiliary_loss_clip": 0.0113234, + "auxiliary_loss_mlp": 0.0103923, + "balance_loss_clip": 1.04537916, + "balance_loss_mlp": 1.02513385, + "epoch": 0.3280324665564407, + "flos": 32430918170880.0, + "grad_norm": 1.7477144955129846, + "language_loss": 0.7352699, + "learning_rate": 3.138372082016768e-06, + "loss": 0.75698555, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.14086914, + "step": 5456, + "time_per_iteration": 2.6664271354675293 + }, + { + "auxiliary_loss_clip": 0.01129935, + "auxiliary_loss_mlp": 0.01038981, + "balance_loss_clip": 1.04540336, + "balance_loss_mlp": 1.02572489, + "epoch": 0.32809258980910866, + "flos": 27979352415360.0, + "grad_norm": 1.7666126010750438, + "language_loss": 0.78216273, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80385184, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.13244629, + "step": 5457, + "time_per_iteration": 2.6552464962005615 + }, + { + "auxiliary_loss_clip": 0.01133646, + "auxiliary_loss_mlp": 0.01033069, + "balance_loss_clip": 1.04547048, + "balance_loss_mlp": 1.02040291, + "epoch": 0.3281527130617766, + "flos": 27801061267680.0, + "grad_norm": 1.980703722080827, + "language_loss": 0.79071987, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.81238699, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.12677002, + "step": 5458, + "time_per_iteration": 2.6739494800567627 + }, + { + "auxiliary_loss_clip": 0.01127532, + "auxiliary_loss_mlp": 0.01034015, + "balance_loss_clip": 1.04475045, + "balance_loss_mlp": 1.02076507, + "epoch": 0.3282128363144446, + "flos": 25930239994080.0, + "grad_norm": 2.008236217664884, + "language_loss": 0.73083311, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75244862, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.13238525, + "step": 5459, + "time_per_iteration": 2.6818490028381348 + }, + { + "auxiliary_loss_clip": 0.01132824, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.04747939, + "balance_loss_mlp": 1.02310276, + "epoch": 0.32827295956711255, + "flos": 37635916343040.0, + "grad_norm": 2.1494212080598625, + "language_loss": 0.84207737, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86376739, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.13085938, + "step": 5460, + "time_per_iteration": 2.7370529174804688 + }, + { + "auxiliary_loss_clip": 0.01126426, + "auxiliary_loss_mlp": 0.0103065, + "balance_loss_clip": 1.04220021, + "balance_loss_mlp": 1.01832986, + "epoch": 0.3283330828197806, + "flos": 31622550672960.0, + "grad_norm": 2.0205250432816806, + "language_loss": 0.76996791, + "learning_rate": 3.136770448642288e-06, + "loss": 0.79153872, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.12329102, + "step": 5461, + "time_per_iteration": 2.657804489135742 + }, + { + "auxiliary_loss_clip": 0.01129954, + "auxiliary_loss_mlp": 0.01032795, + "balance_loss_clip": 1.04492378, + "balance_loss_mlp": 1.01856756, + "epoch": 0.32839320607244854, + "flos": 47079269578080.0, + "grad_norm": 2.112034357661097, + "language_loss": 0.62978244, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65140986, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.14221191, + "step": 5462, + "time_per_iteration": 2.8146424293518066 + }, + { + "auxiliary_loss_clip": 0.01126477, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.04390872, + "balance_loss_mlp": 1.02005458, + "epoch": 0.3284533293251165, + "flos": 32519962451520.0, + "grad_norm": 1.420223692370731, + "language_loss": 0.78089094, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80247366, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.11743164, + "step": 5463, + "time_per_iteration": 5.369857311248779 + }, + { + "auxiliary_loss_clip": 0.01129592, + "auxiliary_loss_mlp": 0.01032692, + "balance_loss_clip": 1.0449717, + "balance_loss_mlp": 1.0193646, + "epoch": 0.32851345257778447, + "flos": 18673779398400.0, + "grad_norm": 2.1149569082117323, + "language_loss": 0.69440186, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.7160247, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.13342285, + "step": 5464, + "time_per_iteration": 4.169458389282227 + }, + { + "auxiliary_loss_clip": 0.01127953, + "auxiliary_loss_mlp": 0.01034534, + "balance_loss_clip": 1.04637456, + "balance_loss_mlp": 1.02143335, + "epoch": 0.32857357583045244, + "flos": 28691706660480.0, + "grad_norm": 2.777824492647447, + "language_loss": 0.72513247, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74675733, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13098145, + "step": 5465, + "time_per_iteration": 2.633335590362549 + }, + { + "auxiliary_loss_clip": 0.01128336, + "auxiliary_loss_mlp": 0.01033164, + "balance_loss_clip": 1.04534948, + "balance_loss_mlp": 1.02025998, + "epoch": 0.3286336990831204, + "flos": 25618503977280.0, + "grad_norm": 1.6648528495130352, + "language_loss": 0.83108449, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.85269952, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.12896729, + "step": 5466, + "time_per_iteration": 2.6632895469665527 + }, + { + "auxiliary_loss_clip": 0.01129073, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.04478467, + "balance_loss_mlp": 1.02080894, + "epoch": 0.32869382233578837, + "flos": 28868822807040.0, + "grad_norm": 1.8095416979214043, + "language_loss": 0.79730368, + "learning_rate": 3.134847066213879e-06, + "loss": 0.81892568, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.12322998, + "step": 5467, + "time_per_iteration": 2.640556573867798 + }, + { + "auxiliary_loss_clip": 0.01130464, + "auxiliary_loss_mlp": 0.01029563, + "balance_loss_clip": 1.04617822, + "balance_loss_mlp": 1.01684356, + "epoch": 0.32875394558845633, + "flos": 30916152468000.0, + "grad_norm": 2.787958250001543, + "language_loss": 0.74646848, + "learning_rate": 3.134526351787587e-06, + "loss": 0.76806873, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.1272583, + "step": 5468, + "time_per_iteration": 4.125908851623535 + }, + { + "auxiliary_loss_clip": 0.01137025, + "auxiliary_loss_mlp": 0.01038291, + "balance_loss_clip": 1.04945314, + "balance_loss_mlp": 1.02408087, + "epoch": 0.3288140688411243, + "flos": 18184359993120.0, + "grad_norm": 2.729944718788714, + "language_loss": 0.7866323, + "learning_rate": 3.134205594339942e-06, + "loss": 0.80838543, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.14221191, + "step": 5469, + "time_per_iteration": 2.6796586513519287 + }, + { + "auxiliary_loss_clip": 0.01129785, + "auxiliary_loss_mlp": 0.01031617, + "balance_loss_clip": 1.04497206, + "balance_loss_mlp": 1.0189569, + "epoch": 0.32887419209379226, + "flos": 22636249886880.0, + "grad_norm": 3.0766318934010632, + "language_loss": 0.81883752, + "learning_rate": 3.133884793883107e-06, + "loss": 0.84045154, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.12664795, + "step": 5470, + "time_per_iteration": 2.6565208435058594 + }, + { + "auxiliary_loss_clip": 0.01130393, + "auxiliary_loss_mlp": 0.01037852, + "balance_loss_clip": 1.04449666, + "balance_loss_mlp": 1.02428603, + "epoch": 0.3289343153464602, + "flos": 58700561133600.0, + "grad_norm": 2.09920522128866, + "language_loss": 0.6756559, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.69733834, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.13568115, + "step": 5471, + "time_per_iteration": 2.878570556640625 + }, + { + "auxiliary_loss_clip": 0.01136537, + "auxiliary_loss_mlp": 0.01041833, + "balance_loss_clip": 1.04794741, + "balance_loss_mlp": 1.02640724, + "epoch": 0.3289944385991282, + "flos": 33678429480000.0, + "grad_norm": 1.6295740540737025, + "language_loss": 0.64659429, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.66837806, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.15429688, + "step": 5472, + "time_per_iteration": 2.672109603881836 + }, + { + "auxiliary_loss_clip": 0.01134898, + "auxiliary_loss_mlp": 0.01044372, + "balance_loss_clip": 1.04858184, + "balance_loss_mlp": 1.02976871, + "epoch": 0.32905456185179616, + "flos": 24550823472480.0, + "grad_norm": 1.8121676908197373, + "language_loss": 0.8808974, + "learning_rate": 3.13292213457912e-06, + "loss": 0.90269011, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.14581299, + "step": 5473, + "time_per_iteration": 2.6521003246307373 + }, + { + "auxiliary_loss_clip": 0.01135403, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.04870653, + "balance_loss_mlp": 1.02503693, + "epoch": 0.3291146851044642, + "flos": 28285213426560.0, + "grad_norm": 1.9193299331518836, + "language_loss": 0.7797038, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.80145192, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 0.86669922, + "router_z_loss_mlp": 0.14385986, + "step": 5474, + "time_per_iteration": 2.6252567768096924 + }, + { + "auxiliary_loss_clip": 0.01062893, + "auxiliary_loss_mlp": 0.01030955, + "balance_loss_clip": 1.0345608, + "balance_loss_mlp": 1.02902114, + "epoch": 0.32917480835713214, + "flos": 82512822031200.0, + "grad_norm": 0.8216768469109014, + "language_loss": 0.60247862, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62341708, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.28393555, + "router_z_loss_mlp": 0.01931763, + "step": 5475, + "time_per_iteration": 3.2963356971740723 + }, + { + "auxiliary_loss_clip": 0.01135779, + "auxiliary_loss_mlp": 0.01043941, + "balance_loss_clip": 1.04644144, + "balance_loss_mlp": 1.02894449, + "epoch": 0.3292349316098001, + "flos": 34122962089440.0, + "grad_norm": 2.402711356926168, + "language_loss": 0.7668435, + "learning_rate": 3.131959088630455e-06, + "loss": 0.78864074, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.14990234, + "step": 5476, + "time_per_iteration": 2.7019710540771484 + }, + { + "auxiliary_loss_clip": 0.01133383, + "auxiliary_loss_mlp": 0.01039167, + "balance_loss_clip": 1.04858088, + "balance_loss_mlp": 1.02612519, + "epoch": 0.3292950548624681, + "flos": 24725994789600.0, + "grad_norm": 6.0857638852647655, + "language_loss": 0.74553144, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76725686, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.13037109, + "step": 5477, + "time_per_iteration": 2.6622111797332764 + }, + { + "auxiliary_loss_clip": 0.01129102, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.04722452, + "balance_loss_mlp": 1.02311945, + "epoch": 0.32935517811513604, + "flos": 25396136379360.0, + "grad_norm": 2.296768978671361, + "language_loss": 0.75414795, + "learning_rate": 3.131316843357713e-06, + "loss": 0.7757926, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.12237549, + "step": 5478, + "time_per_iteration": 2.6533596515655518 + }, + { + "auxiliary_loss_clip": 0.01131695, + "auxiliary_loss_mlp": 0.01035485, + "balance_loss_clip": 1.04865038, + "balance_loss_mlp": 1.02268744, + "epoch": 0.329415301367804, + "flos": 22502642948640.0, + "grad_norm": 2.4179274824443557, + "language_loss": 0.80289984, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.82457167, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.12817383, + "step": 5479, + "time_per_iteration": 2.600721597671509 + }, + { + "auxiliary_loss_clip": 0.0105605, + "auxiliary_loss_mlp": 0.01008021, + "balance_loss_clip": 1.02781606, + "balance_loss_mlp": 1.0061661, + "epoch": 0.32947542462047197, + "flos": 80926474995360.0, + "grad_norm": 0.7455269840434386, + "language_loss": 0.56533587, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58597648, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.28198242, + "router_z_loss_mlp": 0.01852417, + "step": 5480, + "time_per_iteration": 3.320373296737671 + }, + { + "auxiliary_loss_clip": 0.01132539, + "auxiliary_loss_mlp": 0.01047677, + "balance_loss_clip": 1.04761124, + "balance_loss_mlp": 1.03430772, + "epoch": 0.32953554787313993, + "flos": 28335245916960.0, + "grad_norm": 1.8627732076397092, + "language_loss": 0.77208388, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.79388595, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.13366699, + "step": 5481, + "time_per_iteration": 2.6765248775482178 + }, + { + "auxiliary_loss_clip": 0.01134409, + "auxiliary_loss_mlp": 0.01039617, + "balance_loss_clip": 1.04854321, + "balance_loss_mlp": 1.02647471, + "epoch": 0.3295956711258079, + "flos": 32958133848000.0, + "grad_norm": 1.6716784836762042, + "language_loss": 0.78530037, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80704069, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.13128662, + "step": 5482, + "time_per_iteration": 2.8036129474639893 + }, + { + "auxiliary_loss_clip": 0.01132177, + "auxiliary_loss_mlp": 0.01042178, + "balance_loss_clip": 1.04577541, + "balance_loss_mlp": 1.02828479, + "epoch": 0.32965579437847586, + "flos": 23393247824160.0, + "grad_norm": 2.0225113556878638, + "language_loss": 0.73791015, + "learning_rate": 3.129710479645185e-06, + "loss": 0.75965375, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.13897705, + "step": 5483, + "time_per_iteration": 2.621797800064087 + }, + { + "auxiliary_loss_clip": 0.01133321, + "auxiliary_loss_mlp": 0.01037438, + "balance_loss_clip": 1.04907644, + "balance_loss_mlp": 1.02509379, + "epoch": 0.32971591763114383, + "flos": 37195557013440.0, + "grad_norm": 1.5760301907683096, + "language_loss": 0.7569167, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77862424, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.12341309, + "step": 5484, + "time_per_iteration": 2.729642391204834 + }, + { + "auxiliary_loss_clip": 0.01131882, + "auxiliary_loss_mlp": 0.01043211, + "balance_loss_clip": 1.04837298, + "balance_loss_mlp": 1.02993703, + "epoch": 0.3297760408838118, + "flos": 19876241842560.0, + "grad_norm": 2.198791858812074, + "language_loss": 0.71644735, + "learning_rate": 3.129067634203742e-06, + "loss": 0.73819828, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.1328125, + "step": 5485, + "time_per_iteration": 2.6239664554595947 + }, + { + "auxiliary_loss_clip": 0.01129204, + "auxiliary_loss_mlp": 0.01037752, + "balance_loss_clip": 1.04646897, + "balance_loss_mlp": 1.02499664, + "epoch": 0.32983616413647976, + "flos": 36034618430880.0, + "grad_norm": 2.7630948080881326, + "language_loss": 0.80487025, + "learning_rate": 3.128746147255388e-06, + "loss": 0.82653975, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.12762451, + "step": 5486, + "time_per_iteration": 2.718012809753418 + }, + { + "auxiliary_loss_clip": 0.011287, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.04601336, + "balance_loss_mlp": 1.0207895, + "epoch": 0.3298962873891478, + "flos": 25174457575200.0, + "grad_norm": 2.1094042845615553, + "language_loss": 0.84511518, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.86674196, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.13201904, + "step": 5487, + "time_per_iteration": 2.649331569671631 + }, + { + "auxiliary_loss_clip": 0.01134869, + "auxiliary_loss_mlp": 0.01040258, + "balance_loss_clip": 1.04739714, + "balance_loss_mlp": 1.02597642, + "epoch": 0.32995641064181574, + "flos": 18273323239200.0, + "grad_norm": 2.1774001830543934, + "language_loss": 0.74204314, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76379442, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.14294434, + "step": 5488, + "time_per_iteration": 2.6955411434173584 + }, + { + "auxiliary_loss_clip": 0.01131369, + "auxiliary_loss_mlp": 0.01034577, + "balance_loss_clip": 1.04705155, + "balance_loss_mlp": 1.02088022, + "epoch": 0.3300165338944837, + "flos": 22770545618880.0, + "grad_norm": 2.7173360960444577, + "language_loss": 0.72740126, + "learning_rate": 3.127781429646098e-06, + "loss": 0.74906063, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.13690186, + "step": 5489, + "time_per_iteration": 2.701669692993164 + }, + { + "auxiliary_loss_clip": 0.01126653, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.04309368, + "balance_loss_mlp": 1.01977253, + "epoch": 0.3300766571471517, + "flos": 31218569510400.0, + "grad_norm": 2.752265654304707, + "language_loss": 0.88968951, + "learning_rate": 3.127459771562238e-06, + "loss": 0.91128463, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.13085938, + "step": 5490, + "time_per_iteration": 2.655972719192505 + }, + { + "auxiliary_loss_clip": 0.01126353, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.04311585, + "balance_loss_mlp": 1.01675534, + "epoch": 0.33013678039981964, + "flos": 13865104622880.0, + "grad_norm": 2.2866534919546164, + "language_loss": 0.83021927, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85178173, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.13146973, + "step": 5491, + "time_per_iteration": 2.642005681991577 + }, + { + "auxiliary_loss_clip": 0.01131019, + "auxiliary_loss_mlp": 0.01035154, + "balance_loss_clip": 1.04674244, + "balance_loss_mlp": 1.02160025, + "epoch": 0.3301969036524876, + "flos": 30287737326240.0, + "grad_norm": 8.154491567163541, + "language_loss": 0.77469444, + "learning_rate": 3.126816327146554e-06, + "loss": 0.7963562, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.13555908, + "step": 5492, + "time_per_iteration": 2.6756389141082764 + }, + { + "auxiliary_loss_clip": 0.01134895, + "auxiliary_loss_mlp": 0.01038969, + "balance_loss_clip": 1.0484817, + "balance_loss_mlp": 1.02447343, + "epoch": 0.33025702690515557, + "flos": 19475461545120.0, + "grad_norm": 2.155962556268438, + "language_loss": 0.7387346, + "learning_rate": 3.12649454083913e-06, + "loss": 0.76047325, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.1449585, + "step": 5493, + "time_per_iteration": 2.6605846881866455 + }, + { + "auxiliary_loss_clip": 0.01045169, + "auxiliary_loss_mlp": 0.01006061, + "balance_loss_clip": 1.01719093, + "balance_loss_mlp": 1.00445592, + "epoch": 0.33031715015782354, + "flos": 72499878394560.0, + "grad_norm": 0.7840701653610184, + "language_loss": 0.53935945, + "learning_rate": 3.12617271181492e-06, + "loss": 0.55987179, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.28076172, + "router_z_loss_mlp": 0.0160675, + "step": 5494, + "time_per_iteration": 3.227585554122925 + }, + { + "auxiliary_loss_clip": 0.01130439, + "auxiliary_loss_mlp": 0.01030425, + "balance_loss_clip": 1.0473156, + "balance_loss_mlp": 1.01646566, + "epoch": 0.3303772734104915, + "flos": 28289872913760.0, + "grad_norm": 1.7907923996577146, + "language_loss": 0.86959255, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.8912012, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.13952637, + "step": 5495, + "time_per_iteration": 2.7727370262145996 + }, + { + "auxiliary_loss_clip": 0.01132173, + "auxiliary_loss_mlp": 0.0103871, + "balance_loss_clip": 1.04633045, + "balance_loss_mlp": 1.02402973, + "epoch": 0.33043739666315947, + "flos": 40355413457760.0, + "grad_norm": 2.152377314306296, + "language_loss": 0.73876047, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.76046932, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.14678955, + "step": 5496, + "time_per_iteration": 2.7107203006744385 + }, + { + "auxiliary_loss_clip": 0.01130356, + "auxiliary_loss_mlp": 0.01030128, + "balance_loss_clip": 1.04712009, + "balance_loss_mlp": 1.01664579, + "epoch": 0.33049751991582743, + "flos": 30378199711680.0, + "grad_norm": 2.161542964606078, + "language_loss": 0.72619259, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74779743, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.13494873, + "step": 5497, + "time_per_iteration": 2.6628165245056152 + }, + { + "auxiliary_loss_clip": 0.01127019, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.04504907, + "balance_loss_mlp": 1.01727819, + "epoch": 0.3305576431684954, + "flos": 35948531911680.0, + "grad_norm": 1.805324876697433, + "language_loss": 0.80325323, + "learning_rate": 3.124884968794321e-06, + "loss": 0.82483006, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.13397217, + "step": 5498, + "time_per_iteration": 2.7093896865844727 + }, + { + "auxiliary_loss_clip": 0.01129557, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.04362559, + "balance_loss_mlp": 1.02505207, + "epoch": 0.33061776642116336, + "flos": 27616652010720.0, + "grad_norm": 2.5080635009899583, + "language_loss": 0.76641762, + "learning_rate": 3.12456292636927e-06, + "loss": 0.78810513, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.14147949, + "step": 5499, + "time_per_iteration": 2.633873462677002 + }, + { + "auxiliary_loss_clip": 0.01128572, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.04509985, + "balance_loss_mlp": 1.02013946, + "epoch": 0.3306778896738313, + "flos": 31452646602240.0, + "grad_norm": 1.8891457304817911, + "language_loss": 0.7910102, + "learning_rate": 3.124240841300681e-06, + "loss": 0.8126322, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13494873, + "step": 5500, + "time_per_iteration": 2.691150665283203 + }, + { + "auxiliary_loss_clip": 0.01132987, + "auxiliary_loss_mlp": 0.01030692, + "balance_loss_clip": 1.04750371, + "balance_loss_mlp": 1.01647639, + "epoch": 0.33073801292649935, + "flos": 45076137919200.0, + "grad_norm": 1.953539560711292, + "language_loss": 0.66245425, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.68409103, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.14227295, + "step": 5501, + "time_per_iteration": 2.7756874561309814 + }, + { + "auxiliary_loss_clip": 0.01132519, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.04626071, + "balance_loss_mlp": 1.02301455, + "epoch": 0.3307981361791673, + "flos": 15822822761280.0, + "grad_norm": 2.412197118993493, + "language_loss": 0.77413559, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.7958377, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.14648438, + "step": 5502, + "time_per_iteration": 4.1570799350738525 + }, + { + "auxiliary_loss_clip": 0.01135391, + "auxiliary_loss_mlp": 0.01038927, + "balance_loss_clip": 1.04932117, + "balance_loss_mlp": 1.02495599, + "epoch": 0.3308582594318353, + "flos": 30962214264960.0, + "grad_norm": 1.6426357427374487, + "language_loss": 0.72281528, + "learning_rate": 3.123274330355824e-06, + "loss": 0.74455845, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.13983154, + "step": 5503, + "time_per_iteration": 4.067140340805054 + }, + { + "auxiliary_loss_clip": 0.01129585, + "auxiliary_loss_mlp": 0.01032459, + "balance_loss_clip": 1.04517794, + "balance_loss_mlp": 1.0183804, + "epoch": 0.33091838268450324, + "flos": 32297473301760.0, + "grad_norm": 1.9494452528034312, + "language_loss": 0.75248802, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77410841, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14105225, + "step": 5504, + "time_per_iteration": 4.183223247528076 + }, + { + "auxiliary_loss_clip": 0.01131886, + "auxiliary_loss_mlp": 0.01038428, + "balance_loss_clip": 1.04688299, + "balance_loss_mlp": 1.02544641, + "epoch": 0.3309785059371712, + "flos": 30470363822880.0, + "grad_norm": 1.5991376177770378, + "language_loss": 0.69594437, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.71764755, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.12969971, + "step": 5505, + "time_per_iteration": 2.750051498413086 + }, + { + "auxiliary_loss_clip": 0.01131425, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.04898596, + "balance_loss_mlp": 1.02816844, + "epoch": 0.3310386291898392, + "flos": 24949294284960.0, + "grad_norm": 1.801624212886825, + "language_loss": 0.82079554, + "learning_rate": 3.122307436058899e-06, + "loss": 0.84252048, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.12921143, + "step": 5506, + "time_per_iteration": 2.6775448322296143 + }, + { + "auxiliary_loss_clip": 0.01132817, + "auxiliary_loss_mlp": 0.01034018, + "balance_loss_clip": 1.04918265, + "balance_loss_mlp": 1.02039242, + "epoch": 0.33109875244250714, + "flos": 28287968601600.0, + "grad_norm": 1.8862252028822062, + "language_loss": 0.79183531, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81350362, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13616943, + "step": 5507, + "time_per_iteration": 2.6589009761810303 + }, + { + "auxiliary_loss_clip": 0.01129253, + "auxiliary_loss_mlp": 0.01040772, + "balance_loss_clip": 1.04497433, + "balance_loss_mlp": 1.0272181, + "epoch": 0.3311588756951751, + "flos": 29490755184000.0, + "grad_norm": 1.582477971043636, + "language_loss": 0.71657175, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.73827195, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.13568115, + "step": 5508, + "time_per_iteration": 4.045779705047607 + }, + { + "auxiliary_loss_clip": 0.01129195, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.04886603, + "balance_loss_mlp": 1.01742315, + "epoch": 0.33121899894784307, + "flos": 34344195203520.0, + "grad_norm": 1.922447071294572, + "language_loss": 0.71852225, + "learning_rate": 3.12134015873989e-06, + "loss": 0.74011511, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12652588, + "step": 5509, + "time_per_iteration": 2.8325204849243164 + }, + { + "auxiliary_loss_clip": 0.01129638, + "auxiliary_loss_mlp": 0.01030944, + "balance_loss_clip": 1.04740167, + "balance_loss_mlp": 1.01741958, + "epoch": 0.33127912220051103, + "flos": 36079424192160.0, + "grad_norm": 1.6232620723957578, + "language_loss": 0.73131585, + "learning_rate": 3.121017647907921e-06, + "loss": 0.7529217, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.13525391, + "step": 5510, + "time_per_iteration": 2.7199318408966064 + }, + { + "auxiliary_loss_clip": 0.01129118, + "auxiliary_loss_mlp": 0.01037139, + "balance_loss_clip": 1.04629552, + "balance_loss_mlp": 1.02458048, + "epoch": 0.331339245453179, + "flos": 17605612686240.0, + "grad_norm": 3.833736264271565, + "language_loss": 0.88554072, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.90720332, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.12554932, + "step": 5511, + "time_per_iteration": 2.6452033519744873 + }, + { + "auxiliary_loss_clip": 0.01124982, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.0470047, + "balance_loss_mlp": 1.01773906, + "epoch": 0.33139936870584696, + "flos": 25486477212960.0, + "grad_norm": 1.7169692867618414, + "language_loss": 0.73179317, + "learning_rate": 3.12037249872891e-06, + "loss": 0.75333756, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.11712646, + "step": 5512, + "time_per_iteration": 2.649165153503418 + }, + { + "auxiliary_loss_clip": 0.01130557, + "auxiliary_loss_mlp": 0.01038749, + "balance_loss_clip": 1.04875004, + "balance_loss_mlp": 1.02588594, + "epoch": 0.33145949195851493, + "flos": 44275347152640.0, + "grad_norm": 2.1551722256544594, + "language_loss": 0.72388238, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.74557543, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.12866211, + "step": 5513, + "time_per_iteration": 2.8126399517059326 + }, + { + "auxiliary_loss_clip": 0.01134423, + "auxiliary_loss_mlp": 0.01032888, + "balance_loss_clip": 1.04873848, + "balance_loss_mlp": 1.01928616, + "epoch": 0.33151961521118295, + "flos": 17423958604320.0, + "grad_norm": 1.7647980418848344, + "language_loss": 0.68058419, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70225728, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.13592529, + "step": 5514, + "time_per_iteration": 2.6420345306396484 + }, + { + "auxiliary_loss_clip": 0.01133357, + "auxiliary_loss_mlp": 0.01038295, + "balance_loss_clip": 1.04873633, + "balance_loss_mlp": 1.02341211, + "epoch": 0.3315797384638509, + "flos": 25349142684960.0, + "grad_norm": 2.1375640913062166, + "language_loss": 0.66035879, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.68207532, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.14880371, + "step": 5515, + "time_per_iteration": 2.6437807083129883 + }, + { + "auxiliary_loss_clip": 0.0113134, + "auxiliary_loss_mlp": 0.01032796, + "balance_loss_clip": 1.04748607, + "balance_loss_mlp": 1.01894403, + "epoch": 0.3316398617165189, + "flos": 30114389286720.0, + "grad_norm": 1.728731789782756, + "language_loss": 0.69212949, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71377081, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.13867188, + "step": 5516, + "time_per_iteration": 2.6524713039398193 + }, + { + "auxiliary_loss_clip": 0.01135147, + "auxiliary_loss_mlp": 0.01036119, + "balance_loss_clip": 1.04819608, + "balance_loss_mlp": 1.02226663, + "epoch": 0.33169998496918685, + "flos": 22680934096320.0, + "grad_norm": 2.0299797438131972, + "language_loss": 0.80294812, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82466078, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.13848877, + "step": 5517, + "time_per_iteration": 2.686933994293213 + }, + { + "auxiliary_loss_clip": 0.01126715, + "auxiliary_loss_mlp": 0.0103014, + "balance_loss_clip": 1.04659235, + "balance_loss_mlp": 1.01746821, + "epoch": 0.3317601082218548, + "flos": 24640272925920.0, + "grad_norm": 1.735507016081151, + "language_loss": 0.74678785, + "learning_rate": 3.118436031952143e-06, + "loss": 0.76835638, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12677002, + "step": 5518, + "time_per_iteration": 2.6184093952178955 + }, + { + "auxiliary_loss_clip": 0.01050025, + "auxiliary_loss_mlp": 0.01007119, + "balance_loss_clip": 1.02229464, + "balance_loss_mlp": 1.00569677, + "epoch": 0.3318202314745228, + "flos": 84163342088160.0, + "grad_norm": 0.6176978331135978, + "language_loss": 0.5435735, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56414491, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.27709961, + "router_z_loss_mlp": 0.01422119, + "step": 5519, + "time_per_iteration": 3.370744466781616 + }, + { + "auxiliary_loss_clip": 0.01132303, + "auxiliary_loss_mlp": 0.01036409, + "balance_loss_clip": 1.04888153, + "balance_loss_mlp": 1.02206802, + "epoch": 0.33188035472719074, + "flos": 26235169107840.0, + "grad_norm": 2.185889296185024, + "language_loss": 0.78707576, + "learning_rate": 3.117790203606336e-06, + "loss": 0.80876291, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.14343262, + "step": 5520, + "time_per_iteration": 2.654134750366211 + }, + { + "auxiliary_loss_clip": 0.01128068, + "auxiliary_loss_mlp": 0.01031154, + "balance_loss_clip": 1.04768682, + "balance_loss_mlp": 1.01863718, + "epoch": 0.3319404779798587, + "flos": 35229046625280.0, + "grad_norm": 1.7939103416978306, + "language_loss": 0.76360476, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.78519702, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12506104, + "step": 5521, + "time_per_iteration": 2.7229843139648438 + }, + { + "auxiliary_loss_clip": 0.01134007, + "auxiliary_loss_mlp": 0.01039952, + "balance_loss_clip": 1.04841018, + "balance_loss_mlp": 1.02615964, + "epoch": 0.33200060123252667, + "flos": 28158575460480.0, + "grad_norm": 2.014971398968742, + "language_loss": 0.70187229, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72361189, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.13806152, + "step": 5522, + "time_per_iteration": 2.6614153385162354 + }, + { + "auxiliary_loss_clip": 0.01128991, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.0468235, + "balance_loss_mlp": 1.01602745, + "epoch": 0.33206072448519464, + "flos": 25798091677920.0, + "grad_norm": 2.0163522667675635, + "language_loss": 0.73420143, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.75577843, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.12677002, + "step": 5523, + "time_per_iteration": 2.626169443130493 + }, + { + "auxiliary_loss_clip": 0.01128615, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.04717803, + "balance_loss_mlp": 1.01911902, + "epoch": 0.3321208477378626, + "flos": 15958536598080.0, + "grad_norm": 1.9801581185706372, + "language_loss": 0.81712067, + "learning_rate": 3.116498038372114e-06, + "loss": 0.83873069, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.13275146, + "step": 5524, + "time_per_iteration": 2.6323156356811523 + }, + { + "auxiliary_loss_clip": 0.01127149, + "auxiliary_loss_mlp": 0.01038484, + "balance_loss_clip": 1.04585493, + "balance_loss_mlp": 1.02643204, + "epoch": 0.33218097099053057, + "flos": 25887095441280.0, + "grad_norm": 1.725190193898595, + "language_loss": 0.82543421, + "learning_rate": 3.116174891188636e-06, + "loss": 0.84709054, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12054443, + "step": 5525, + "time_per_iteration": 2.610023021697998 + }, + { + "auxiliary_loss_clip": 0.01045574, + "auxiliary_loss_mlp": 0.010068, + "balance_loss_clip": 1.0182656, + "balance_loss_mlp": 1.0053854, + "epoch": 0.33224109424319853, + "flos": 78517944069120.0, + "grad_norm": 0.7639295606671452, + "language_loss": 0.52546477, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54598856, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.27319336, + "router_z_loss_mlp": 0.0141449, + "step": 5526, + "time_per_iteration": 3.253788709640503 + }, + { + "auxiliary_loss_clip": 0.01131872, + "auxiliary_loss_mlp": 0.01044441, + "balance_loss_clip": 1.0479846, + "balance_loss_mlp": 1.03070176, + "epoch": 0.33230121749586655, + "flos": 21165520116960.0, + "grad_norm": 2.174363223453236, + "language_loss": 0.77316535, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79492843, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.13739014, + "step": 5527, + "time_per_iteration": 2.6201469898223877 + }, + { + "auxiliary_loss_clip": 0.01129261, + "auxiliary_loss_mlp": 0.0104306, + "balance_loss_clip": 1.04806161, + "balance_loss_mlp": 1.03047192, + "epoch": 0.3323613407485345, + "flos": 25620570358560.0, + "grad_norm": 1.7507469549323675, + "language_loss": 0.71672535, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.73844856, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12597656, + "step": 5528, + "time_per_iteration": 2.6676156520843506 + }, + { + "auxiliary_loss_clip": 0.01129365, + "auxiliary_loss_mlp": 0.0103381, + "balance_loss_clip": 1.04659009, + "balance_loss_mlp": 1.02190709, + "epoch": 0.3324214640012025, + "flos": 16048107603360.0, + "grad_norm": 1.8035805382910544, + "language_loss": 0.82827675, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.84990853, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.11901855, + "step": 5529, + "time_per_iteration": 2.7350754737854004 + }, + { + "auxiliary_loss_clip": 0.01133241, + "auxiliary_loss_mlp": 0.01038031, + "balance_loss_clip": 1.04824162, + "balance_loss_mlp": 1.02531147, + "epoch": 0.33248158725387045, + "flos": 27178278027840.0, + "grad_norm": 1.9139015502611088, + "language_loss": 0.69725209, + "learning_rate": 3.114558520634423e-06, + "loss": 0.71896482, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.12713623, + "step": 5530, + "time_per_iteration": 2.750767707824707 + }, + { + "auxiliary_loss_clip": 0.01132946, + "auxiliary_loss_mlp": 0.01046249, + "balance_loss_clip": 1.04947758, + "balance_loss_mlp": 1.03267121, + "epoch": 0.3325417105065384, + "flos": 25308186065280.0, + "grad_norm": 2.617794421827946, + "language_loss": 0.76016343, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.78195542, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13574219, + "step": 5531, + "time_per_iteration": 2.687039613723755 + }, + { + "auxiliary_loss_clip": 0.0113184, + "auxiliary_loss_mlp": 0.01036951, + "balance_loss_clip": 1.0476222, + "balance_loss_mlp": 1.02338481, + "epoch": 0.3326018337592064, + "flos": 30249778985280.0, + "grad_norm": 1.934266831268593, + "language_loss": 0.73128867, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75297654, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.13549805, + "step": 5532, + "time_per_iteration": 2.697749137878418 + }, + { + "auxiliary_loss_clip": 0.01132036, + "auxiliary_loss_mlp": 0.01028506, + "balance_loss_clip": 1.04941058, + "balance_loss_mlp": 1.0164243, + "epoch": 0.33266195701187434, + "flos": 17694535415040.0, + "grad_norm": 2.7369528220615864, + "language_loss": 0.66290736, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.68451273, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.12078857, + "step": 5533, + "time_per_iteration": 2.6512908935546875 + }, + { + "auxiliary_loss_clip": 0.01129195, + "auxiliary_loss_mlp": 0.01036294, + "balance_loss_clip": 1.04648328, + "balance_loss_mlp": 1.02365172, + "epoch": 0.3327220802645423, + "flos": 18674063019360.0, + "grad_norm": 3.2757659938892116, + "language_loss": 0.71067756, + "learning_rate": 3.113264663362451e-06, + "loss": 0.73233247, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.12652588, + "step": 5534, + "time_per_iteration": 2.666658401489258 + }, + { + "auxiliary_loss_clip": 0.01130317, + "auxiliary_loss_mlp": 0.01032822, + "balance_loss_clip": 1.0489471, + "balance_loss_mlp": 1.0198822, + "epoch": 0.3327822035172103, + "flos": 28647508658400.0, + "grad_norm": 1.8823869081126132, + "language_loss": 0.67183876, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69347018, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.12939453, + "step": 5535, + "time_per_iteration": 2.668013095855713 + }, + { + "auxiliary_loss_clip": 0.01129725, + "auxiliary_loss_mlp": 0.0103664, + "balance_loss_clip": 1.04632854, + "balance_loss_mlp": 1.02386045, + "epoch": 0.33284232676987824, + "flos": 30962376334080.0, + "grad_norm": 2.8196771837889085, + "language_loss": 0.72842526, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.75008887, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.12786865, + "step": 5536, + "time_per_iteration": 2.6629526615142822 + }, + { + "auxiliary_loss_clip": 0.0113052, + "auxiliary_loss_mlp": 0.01037191, + "balance_loss_clip": 1.04827857, + "balance_loss_mlp": 1.02463841, + "epoch": 0.3329024500225462, + "flos": 28912291498080.0, + "grad_norm": 1.653515927907381, + "language_loss": 0.8188622, + "learning_rate": 3.112293827106917e-06, + "loss": 0.84053928, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.12554932, + "step": 5537, + "time_per_iteration": 2.676459789276123 + }, + { + "auxiliary_loss_clip": 0.01136427, + "auxiliary_loss_mlp": 0.01039478, + "balance_loss_clip": 1.05162978, + "balance_loss_mlp": 1.0260849, + "epoch": 0.33296257327521417, + "flos": 38709471853440.0, + "grad_norm": 1.8851423800924265, + "language_loss": 0.71636558, + "learning_rate": 3.111970130648789e-06, + "loss": 0.73812455, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.13391113, + "step": 5538, + "time_per_iteration": 2.7143714427948 + }, + { + "auxiliary_loss_clip": 0.01128612, + "auxiliary_loss_mlp": 0.01032986, + "balance_loss_clip": 1.04741991, + "balance_loss_mlp": 1.02013528, + "epoch": 0.33302269652788213, + "flos": 27753540848640.0, + "grad_norm": 2.8903729217081517, + "language_loss": 0.74449778, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76611376, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.12860107, + "step": 5539, + "time_per_iteration": 2.6654226779937744 + }, + { + "auxiliary_loss_clip": 0.01138011, + "auxiliary_loss_mlp": 0.01041738, + "balance_loss_clip": 1.0497123, + "balance_loss_mlp": 1.0284338, + "epoch": 0.33308281978055015, + "flos": 13999562424000.0, + "grad_norm": 2.2044541080671527, + "language_loss": 0.70835465, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.73015213, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.13299561, + "step": 5540, + "time_per_iteration": 2.659471273422241 + }, + { + "auxiliary_loss_clip": 0.01128237, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.045403, + "balance_loss_mlp": 1.01631713, + "epoch": 0.3331429430332181, + "flos": 46630077481440.0, + "grad_norm": 1.7204667072557593, + "language_loss": 0.60802829, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.62960303, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.12921143, + "step": 5541, + "time_per_iteration": 4.361118316650391 + }, + { + "auxiliary_loss_clip": 0.01134011, + "auxiliary_loss_mlp": 0.0104033, + "balance_loss_clip": 1.04956007, + "balance_loss_mlp": 1.0263586, + "epoch": 0.3332030662858861, + "flos": 27489649389120.0, + "grad_norm": 1.841269313409223, + "language_loss": 0.69073915, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.71248257, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.13952637, + "step": 5542, + "time_per_iteration": 3.966822624206543 + }, + { + "auxiliary_loss_clip": 0.01131145, + "auxiliary_loss_mlp": 0.01038958, + "balance_loss_clip": 1.0487839, + "balance_loss_mlp": 1.0261668, + "epoch": 0.33326318953855405, + "flos": 19520145754560.0, + "grad_norm": 3.490827472571614, + "language_loss": 0.75367308, + "learning_rate": 3.110351016113414e-06, + "loss": 0.77537411, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.12799072, + "step": 5543, + "time_per_iteration": 4.060790061950684 + }, + { + "auxiliary_loss_clip": 0.01138328, + "auxiliary_loss_mlp": 0.0103476, + "balance_loss_clip": 1.05255485, + "balance_loss_mlp": 1.02170646, + "epoch": 0.333323312791222, + "flos": 31229630727840.0, + "grad_norm": 3.641780294186012, + "language_loss": 0.74314493, + "learning_rate": 3.110027066843348e-06, + "loss": 0.76487583, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.1305542, + "step": 5544, + "time_per_iteration": 2.662905693054199 + }, + { + "auxiliary_loss_clip": 0.01131556, + "auxiliary_loss_mlp": 0.01035275, + "balance_loss_clip": 1.04938006, + "balance_loss_mlp": 1.02266908, + "epoch": 0.33338343604389, + "flos": 30651207559200.0, + "grad_norm": 1.7413642905387008, + "language_loss": 0.70241755, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.72408587, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.12615967, + "step": 5545, + "time_per_iteration": 2.6930925846099854 + }, + { + "auxiliary_loss_clip": 0.01133167, + "auxiliary_loss_mlp": 0.01039911, + "balance_loss_clip": 1.05188322, + "balance_loss_mlp": 1.02709651, + "epoch": 0.33344355929655795, + "flos": 20678734334880.0, + "grad_norm": 2.437653935229917, + "language_loss": 0.69166338, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.71339417, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.12799072, + "step": 5546, + "time_per_iteration": 2.622265100479126 + }, + { + "auxiliary_loss_clip": 0.01135471, + "auxiliary_loss_mlp": 0.01035598, + "balance_loss_clip": 1.05011845, + "balance_loss_mlp": 1.02267015, + "epoch": 0.3335036825492259, + "flos": 34031689358400.0, + "grad_norm": 1.5901021322787559, + "language_loss": 0.64600533, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.66771603, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.12927246, + "step": 5547, + "time_per_iteration": 4.110743999481201 + }, + { + "auxiliary_loss_clip": 0.01133715, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.05222249, + "balance_loss_mlp": 1.01876271, + "epoch": 0.3335638058018939, + "flos": 19742027145120.0, + "grad_norm": 2.091318839860516, + "language_loss": 0.84890771, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.8705492, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.11682129, + "step": 5548, + "time_per_iteration": 2.6355159282684326 + }, + { + "auxiliary_loss_clip": 0.01137168, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.05208373, + "balance_loss_mlp": 1.01876068, + "epoch": 0.33362392905456184, + "flos": 48682714906080.0, + "grad_norm": 2.809431441512881, + "language_loss": 0.73720849, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.75890458, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.13684082, + "step": 5549, + "time_per_iteration": 2.8329522609710693 + }, + { + "auxiliary_loss_clip": 0.01138491, + "auxiliary_loss_mlp": 0.01035592, + "balance_loss_clip": 1.05362701, + "balance_loss_mlp": 1.02150106, + "epoch": 0.3336840523072298, + "flos": 54021846741120.0, + "grad_norm": 2.1304851938753986, + "language_loss": 0.68309987, + "learning_rate": 3.108082487713921e-06, + "loss": 0.70484072, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.14099121, + "step": 5550, + "time_per_iteration": 2.8765764236450195 + }, + { + "auxiliary_loss_clip": 0.01136397, + "auxiliary_loss_mlp": 0.01036772, + "balance_loss_clip": 1.05291843, + "balance_loss_mlp": 1.02464259, + "epoch": 0.33374417555989777, + "flos": 18407132763840.0, + "grad_norm": 1.949450465405887, + "language_loss": 0.60612935, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.62786102, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.12127686, + "step": 5551, + "time_per_iteration": 2.688027858734131 + }, + { + "auxiliary_loss_clip": 0.01133464, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.05161762, + "balance_loss_mlp": 1.02470326, + "epoch": 0.33380429881256574, + "flos": 19339058914560.0, + "grad_norm": 1.7334466100551318, + "language_loss": 0.70780599, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.7295202, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13256836, + "step": 5552, + "time_per_iteration": 2.6605727672576904 + }, + { + "auxiliary_loss_clip": 0.01136066, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.05166698, + "balance_loss_mlp": 1.02143168, + "epoch": 0.33386442206523376, + "flos": 16448158589760.0, + "grad_norm": 2.078699805827692, + "language_loss": 0.82851511, + "learning_rate": 3.107109630732192e-06, + "loss": 0.85021937, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.12927246, + "step": 5553, + "time_per_iteration": 2.7190282344818115 + }, + { + "auxiliary_loss_clip": 0.01137857, + "auxiliary_loss_mlp": 0.01037327, + "balance_loss_clip": 1.05273175, + "balance_loss_mlp": 1.02317095, + "epoch": 0.3339245453179017, + "flos": 20365701765120.0, + "grad_norm": 2.121607171731427, + "language_loss": 0.81541842, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.83717024, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.14160156, + "step": 5554, + "time_per_iteration": 2.6363918781280518 + }, + { + "auxiliary_loss_clip": 0.01136452, + "auxiliary_loss_mlp": 0.01037926, + "balance_loss_clip": 1.05255222, + "balance_loss_mlp": 1.02496219, + "epoch": 0.3339846685705697, + "flos": 30030004493280.0, + "grad_norm": 1.6282646927321165, + "language_loss": 0.81206465, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83380842, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.12969971, + "step": 5555, + "time_per_iteration": 2.7795064449310303 + }, + { + "auxiliary_loss_clip": 0.01134466, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.05174577, + "balance_loss_mlp": 1.02200711, + "epoch": 0.33404479182323765, + "flos": 37770941386080.0, + "grad_norm": 1.717196114460539, + "language_loss": 0.74186713, + "learning_rate": 3.106136395915099e-06, + "loss": 0.76355529, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.12341309, + "step": 5556, + "time_per_iteration": 2.7318310737609863 + }, + { + "auxiliary_loss_clip": 0.01133901, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.0521245, + "balance_loss_mlp": 1.01795077, + "epoch": 0.3341049150759056, + "flos": 28691220453120.0, + "grad_norm": 1.448566928552938, + "language_loss": 0.82281065, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84445417, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.12493896, + "step": 5557, + "time_per_iteration": 2.7117326259613037 + }, + { + "auxiliary_loss_clip": 0.0113893, + "auxiliary_loss_mlp": 0.01034906, + "balance_loss_clip": 1.05375385, + "balance_loss_mlp": 1.021662, + "epoch": 0.3341650383285736, + "flos": 29317609730880.0, + "grad_norm": 1.586931137170191, + "language_loss": 0.80018824, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82192659, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.13250732, + "step": 5558, + "time_per_iteration": 2.7242157459259033 + }, + { + "auxiliary_loss_clip": 0.01138076, + "auxiliary_loss_mlp": 0.01027799, + "balance_loss_clip": 1.05303788, + "balance_loss_mlp": 1.01536536, + "epoch": 0.33422516158124155, + "flos": 30385573856640.0, + "grad_norm": 1.6999935648476403, + "language_loss": 0.81451076, + "learning_rate": 3.105162783594788e-06, + "loss": 0.83616948, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.12432861, + "step": 5559, + "time_per_iteration": 2.7155799865722656 + }, + { + "auxiliary_loss_clip": 0.01134041, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.05196488, + "balance_loss_mlp": 1.02128935, + "epoch": 0.3342852848339095, + "flos": 22370211011520.0, + "grad_norm": 1.8345585786622167, + "language_loss": 0.7189945, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.74067795, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.13006592, + "step": 5560, + "time_per_iteration": 2.674514055252075 + }, + { + "auxiliary_loss_clip": 0.01140591, + "auxiliary_loss_mlp": 0.01037321, + "balance_loss_clip": 1.05289233, + "balance_loss_mlp": 1.02355266, + "epoch": 0.3343454080865775, + "flos": 36663357710880.0, + "grad_norm": 1.533481051851045, + "language_loss": 0.75100935, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77278852, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.13763428, + "step": 5561, + "time_per_iteration": 2.7669835090637207 + }, + { + "auxiliary_loss_clip": 0.01137456, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.05396831, + "balance_loss_mlp": 1.01829171, + "epoch": 0.33440553133924544, + "flos": 20009565159840.0, + "grad_norm": 1.8362368352410923, + "language_loss": 0.69570601, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.71739155, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.12823486, + "step": 5562, + "time_per_iteration": 2.6924686431884766 + }, + { + "auxiliary_loss_clip": 0.01133457, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.05146277, + "balance_loss_mlp": 1.02341914, + "epoch": 0.3344656545919134, + "flos": 29580609810240.0, + "grad_norm": 2.244028311561445, + "language_loss": 0.64882243, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.67051113, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.11981201, + "step": 5563, + "time_per_iteration": 2.6697258949279785 + }, + { + "auxiliary_loss_clip": 0.01139624, + "auxiliary_loss_mlp": 0.01035959, + "balance_loss_clip": 1.05408239, + "balance_loss_mlp": 1.0222857, + "epoch": 0.3345257778445814, + "flos": 63597834499680.0, + "grad_norm": 1.5544558331048488, + "language_loss": 0.74144804, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76320386, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.13677979, + "step": 5564, + "time_per_iteration": 2.9365859031677246 + }, + { + "auxiliary_loss_clip": 0.01053141, + "auxiliary_loss_mlp": 0.01007375, + "balance_loss_clip": 1.02528572, + "balance_loss_mlp": 1.0056392, + "epoch": 0.33458590109724934, + "flos": 83034527358240.0, + "grad_norm": 0.7810588503588632, + "language_loss": 0.55487823, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57548338, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.27832031, + "router_z_loss_mlp": 0.01737976, + "step": 5565, + "time_per_iteration": 3.2277626991271973 + }, + { + "auxiliary_loss_clip": 0.01133429, + "auxiliary_loss_mlp": 0.01030537, + "balance_loss_clip": 1.05132604, + "balance_loss_mlp": 1.01788902, + "epoch": 0.3346460243499173, + "flos": 45654844708800.0, + "grad_norm": 1.8494623132061014, + "language_loss": 0.64535546, + "learning_rate": 3.102889555312721e-06, + "loss": 0.66699517, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.12652588, + "step": 5566, + "time_per_iteration": 2.8624584674835205 + }, + { + "auxiliary_loss_clip": 0.0113427, + "auxiliary_loss_mlp": 0.01035296, + "balance_loss_clip": 1.05205822, + "balance_loss_mlp": 1.02227199, + "epoch": 0.3347061476025853, + "flos": 22814419482720.0, + "grad_norm": 1.7879483893837798, + "language_loss": 0.76868713, + "learning_rate": 3.102564641030016e-06, + "loss": 0.79038286, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13037109, + "step": 5567, + "time_per_iteration": 2.6432082653045654 + }, + { + "auxiliary_loss_clip": 0.01135146, + "auxiliary_loss_mlp": 0.01033519, + "balance_loss_clip": 1.05163455, + "balance_loss_mlp": 1.01939273, + "epoch": 0.3347662708552533, + "flos": 16982059618080.0, + "grad_norm": 1.6743200566689154, + "language_loss": 0.7622146, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78390121, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.14105225, + "step": 5568, + "time_per_iteration": 2.721344232559204 + }, + { + "auxiliary_loss_clip": 0.01133477, + "auxiliary_loss_mlp": 0.01036804, + "balance_loss_clip": 1.04900813, + "balance_loss_mlp": 1.02320266, + "epoch": 0.33482639410792125, + "flos": 24099889132800.0, + "grad_norm": 1.8830623810693194, + "language_loss": 0.70975995, + "learning_rate": 3.101914687048842e-06, + "loss": 0.73146284, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.13592529, + "step": 5569, + "time_per_iteration": 2.6312997341156006 + }, + { + "auxiliary_loss_clip": 0.01133754, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.04822958, + "balance_loss_mlp": 1.01578712, + "epoch": 0.3348865173605892, + "flos": 19648931136480.0, + "grad_norm": 2.0183216724109956, + "language_loss": 0.89900315, + "learning_rate": 3.10158964737502e-06, + "loss": 0.92063856, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.14025879, + "step": 5570, + "time_per_iteration": 2.623978853225708 + }, + { + "auxiliary_loss_clip": 0.01132461, + "auxiliary_loss_mlp": 0.01031667, + "balance_loss_clip": 1.04959559, + "balance_loss_mlp": 1.01828003, + "epoch": 0.3349466406132572, + "flos": 30516344585280.0, + "grad_norm": 1.6676709290929475, + "language_loss": 0.80060971, + "learning_rate": 3.101264565928808e-06, + "loss": 0.82225096, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.13378906, + "step": 5571, + "time_per_iteration": 2.6786623001098633 + }, + { + "auxiliary_loss_clip": 0.01048078, + "auxiliary_loss_mlp": 0.01002694, + "balance_loss_clip": 1.02030945, + "balance_loss_mlp": 1.00090587, + "epoch": 0.33500676386592515, + "flos": 66279379623840.0, + "grad_norm": 0.8993787659222319, + "language_loss": 0.56034553, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.58085322, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.27783203, + "router_z_loss_mlp": 0.0178833, + "step": 5572, + "time_per_iteration": 3.2475500106811523 + }, + { + "auxiliary_loss_clip": 0.01133247, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.05140245, + "balance_loss_mlp": 1.02325153, + "epoch": 0.3350668871185931, + "flos": 32698618254720.0, + "grad_norm": 2.68570898831839, + "language_loss": 0.77988821, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.80158478, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.1315918, + "step": 5573, + "time_per_iteration": 2.7283875942230225 + }, + { + "auxiliary_loss_clip": 0.01132879, + "auxiliary_loss_mlp": 0.01039346, + "balance_loss_clip": 1.04899585, + "balance_loss_mlp": 1.02551126, + "epoch": 0.3351270103712611, + "flos": 40890570521760.0, + "grad_norm": 2.500702716711473, + "language_loss": 0.7229284, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.7446506, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.1385498, + "step": 5574, + "time_per_iteration": 2.7144556045532227 + }, + { + "auxiliary_loss_clip": 0.01129264, + "auxiliary_loss_mlp": 0.01032475, + "balance_loss_clip": 1.04693854, + "balance_loss_mlp": 1.01970148, + "epoch": 0.33518713362392905, + "flos": 32343332512320.0, + "grad_norm": 1.7035126896359827, + "language_loss": 0.8778069, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.89942431, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.12756348, + "step": 5575, + "time_per_iteration": 2.696470260620117 + }, + { + "auxiliary_loss_clip": 0.01137461, + "auxiliary_loss_mlp": 0.01039598, + "balance_loss_clip": 1.04901397, + "balance_loss_mlp": 1.02464938, + "epoch": 0.335247256876597, + "flos": 21027213174240.0, + "grad_norm": 2.4398321076702247, + "language_loss": 0.82579851, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.84756905, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.14959717, + "step": 5576, + "time_per_iteration": 2.611994981765747 + }, + { + "auxiliary_loss_clip": 0.01133026, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.04704916, + "balance_loss_mlp": 1.0195868, + "epoch": 0.335307380129265, + "flos": 31273423557120.0, + "grad_norm": 4.545824339776414, + "language_loss": 0.72489095, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.7465564, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.13928223, + "step": 5577, + "time_per_iteration": 2.70981502532959 + }, + { + "auxiliary_loss_clip": 0.0113592, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.05177641, + "balance_loss_mlp": 1.02021348, + "epoch": 0.33536750338193294, + "flos": 24013843130880.0, + "grad_norm": 1.6869525931103657, + "language_loss": 0.8147583, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.83645856, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13891602, + "step": 5578, + "time_per_iteration": 2.658010959625244 + }, + { + "auxiliary_loss_clip": 0.0112955, + "auxiliary_loss_mlp": 0.01032072, + "balance_loss_clip": 1.04964876, + "balance_loss_mlp": 1.01899445, + "epoch": 0.3354276266346009, + "flos": 22369319631360.0, + "grad_norm": 2.031075985118911, + "language_loss": 0.72068304, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.7422992, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.13085938, + "step": 5579, + "time_per_iteration": 2.7132370471954346 + }, + { + "auxiliary_loss_clip": 0.01134088, + "auxiliary_loss_mlp": 0.01034667, + "balance_loss_clip": 1.04840481, + "balance_loss_mlp": 1.02092195, + "epoch": 0.3354877498872689, + "flos": 21790815428160.0, + "grad_norm": 2.7507816340564695, + "language_loss": 0.81079304, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83248055, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.13751221, + "step": 5580, + "time_per_iteration": 4.1065449714660645 + }, + { + "auxiliary_loss_clip": 0.01134391, + "auxiliary_loss_mlp": 0.01028653, + "balance_loss_clip": 1.04885292, + "balance_loss_mlp": 1.01540875, + "epoch": 0.3355478731399369, + "flos": 30160937291040.0, + "grad_norm": 1.7011890737493658, + "language_loss": 0.78354222, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80517268, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.13238525, + "step": 5581, + "time_per_iteration": 3.9816832542419434 + }, + { + "auxiliary_loss_clip": 0.01137244, + "auxiliary_loss_mlp": 0.01043426, + "balance_loss_clip": 1.04817986, + "balance_loss_mlp": 1.0277555, + "epoch": 0.33560799639260486, + "flos": 20589203846880.0, + "grad_norm": 2.386748410242573, + "language_loss": 0.74831784, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.77012455, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.15667725, + "step": 5582, + "time_per_iteration": 2.6377406120300293 + }, + { + "auxiliary_loss_clip": 0.01132854, + "auxiliary_loss_mlp": 0.01038334, + "balance_loss_clip": 1.04575276, + "balance_loss_mlp": 1.02438641, + "epoch": 0.3356681196452728, + "flos": 22370332563360.0, + "grad_norm": 1.6590726458304008, + "language_loss": 0.81444943, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.83616126, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.13964844, + "step": 5583, + "time_per_iteration": 4.104580402374268 + }, + { + "auxiliary_loss_clip": 0.01132523, + "auxiliary_loss_mlp": 0.0104235, + "balance_loss_clip": 1.04918277, + "balance_loss_mlp": 1.02899897, + "epoch": 0.3357282428979408, + "flos": 42403877602560.0, + "grad_norm": 1.7679701091554576, + "language_loss": 0.77139771, + "learning_rate": 3.097034711451581e-06, + "loss": 0.79314649, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.13342285, + "step": 5584, + "time_per_iteration": 2.7772698402404785 + }, + { + "auxiliary_loss_clip": 0.01133879, + "auxiliary_loss_mlp": 0.01040063, + "balance_loss_clip": 1.0464561, + "balance_loss_mlp": 1.02655101, + "epoch": 0.33578836615060875, + "flos": 26332114258080.0, + "grad_norm": 2.5361221193676253, + "language_loss": 0.76226962, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78400904, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.13513184, + "step": 5585, + "time_per_iteration": 2.6969776153564453 + }, + { + "auxiliary_loss_clip": 0.01126295, + "auxiliary_loss_mlp": 0.01036534, + "balance_loss_clip": 1.04403305, + "balance_loss_mlp": 1.02271807, + "epoch": 0.3358484894032767, + "flos": 29932654170240.0, + "grad_norm": 1.5247001023984894, + "language_loss": 0.77690375, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79853201, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13824463, + "step": 5586, + "time_per_iteration": 4.148859977722168 + }, + { + "auxiliary_loss_clip": 0.01140729, + "auxiliary_loss_mlp": 0.01042263, + "balance_loss_clip": 1.05155647, + "balance_loss_mlp": 1.026492, + "epoch": 0.3359086126559447, + "flos": 27400483556640.0, + "grad_norm": 8.262447966006329, + "language_loss": 0.80812007, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.82994998, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.15783691, + "step": 5587, + "time_per_iteration": 2.6410470008850098 + }, + { + "auxiliary_loss_clip": 0.01127602, + "auxiliary_loss_mlp": 0.01033072, + "balance_loss_clip": 1.0465095, + "balance_loss_mlp": 1.02026272, + "epoch": 0.33596873590861265, + "flos": 20186438202720.0, + "grad_norm": 2.7338742516931047, + "language_loss": 0.6701715, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69177818, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.1282959, + "step": 5588, + "time_per_iteration": 2.6130526065826416 + }, + { + "auxiliary_loss_clip": 0.01131436, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.04648376, + "balance_loss_mlp": 1.02854586, + "epoch": 0.3360288591612806, + "flos": 38041558714080.0, + "grad_norm": 1.8404692930717679, + "language_loss": 0.70020139, + "learning_rate": 3.095405970878919e-06, + "loss": 0.7219485, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.14727783, + "step": 5589, + "time_per_iteration": 2.7624220848083496 + }, + { + "auxiliary_loss_clip": 0.01133531, + "auxiliary_loss_mlp": 0.01038247, + "balance_loss_clip": 1.04803824, + "balance_loss_mlp": 1.02323818, + "epoch": 0.3360889824139486, + "flos": 28915168224960.0, + "grad_norm": 3.6128117030095415, + "language_loss": 0.66967291, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.69139063, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.15014648, + "step": 5590, + "time_per_iteration": 2.6901752948760986 + }, + { + "auxiliary_loss_clip": 0.01130544, + "auxiliary_loss_mlp": 0.01041501, + "balance_loss_clip": 1.04828405, + "balance_loss_mlp": 1.02730298, + "epoch": 0.33614910566661654, + "flos": 23571863110080.0, + "grad_norm": 2.0445873937593695, + "language_loss": 0.73477638, + "learning_rate": 3.094754183798047e-06, + "loss": 0.75649685, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.14196777, + "step": 5591, + "time_per_iteration": 2.6832756996154785 + }, + { + "auxiliary_loss_clip": 0.01130798, + "auxiliary_loss_mlp": 0.0103737, + "balance_loss_clip": 1.04792893, + "balance_loss_mlp": 1.02408981, + "epoch": 0.3362092289192845, + "flos": 20099136165120.0, + "grad_norm": 2.1368819310117555, + "language_loss": 0.7000851, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.72176671, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13287354, + "step": 5592, + "time_per_iteration": 2.67881178855896 + }, + { + "auxiliary_loss_clip": 0.01129373, + "auxiliary_loss_mlp": 0.01034758, + "balance_loss_clip": 1.04700351, + "balance_loss_mlp": 1.02200294, + "epoch": 0.33626935217195253, + "flos": 29582311536000.0, + "grad_norm": 3.6965414429253927, + "language_loss": 0.76493174, + "learning_rate": 3.094102230664423e-06, + "loss": 0.78657305, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.12762451, + "step": 5593, + "time_per_iteration": 2.7268574237823486 + }, + { + "auxiliary_loss_clip": 0.01133229, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.04638493, + "balance_loss_mlp": 1.02039623, + "epoch": 0.3363294754246205, + "flos": 24060836825280.0, + "grad_norm": 2.2267402420183817, + "language_loss": 0.71972811, + "learning_rate": 3.093776191858731e-06, + "loss": 0.74142599, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.16149902, + "step": 5594, + "time_per_iteration": 2.6053810119628906 + }, + { + "auxiliary_loss_clip": 0.01136034, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_clip": 1.04933834, + "balance_loss_mlp": 1.02685499, + "epoch": 0.33638959867728846, + "flos": 27572656595040.0, + "grad_norm": 1.9555918309009017, + "language_loss": 0.79929686, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.82107556, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.14959717, + "step": 5595, + "time_per_iteration": 2.6585514545440674 + }, + { + "auxiliary_loss_clip": 0.01130247, + "auxiliary_loss_mlp": 0.01033924, + "balance_loss_clip": 1.04618788, + "balance_loss_mlp": 1.02171707, + "epoch": 0.3364497219299564, + "flos": 25617977252640.0, + "grad_norm": 1.7217025117859768, + "language_loss": 0.81311381, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.83475554, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.12219238, + "step": 5596, + "time_per_iteration": 2.6328444480895996 + }, + { + "auxiliary_loss_clip": 0.01128722, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.04485548, + "balance_loss_mlp": 1.01864719, + "epoch": 0.3365098451826244, + "flos": 30784328290080.0, + "grad_norm": 1.8933697573849895, + "language_loss": 0.75911224, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.78071064, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.12487793, + "step": 5597, + "time_per_iteration": 2.7228384017944336 + }, + { + "auxiliary_loss_clip": 0.01128934, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.04722047, + "balance_loss_mlp": 1.02113724, + "epoch": 0.33656996843529235, + "flos": 29982727177920.0, + "grad_norm": 2.178387834344794, + "language_loss": 0.78652811, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.80815744, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.12866211, + "step": 5598, + "time_per_iteration": 2.6498770713806152 + }, + { + "auxiliary_loss_clip": 0.01136189, + "auxiliary_loss_mlp": 0.01033483, + "balance_loss_clip": 1.04721665, + "balance_loss_mlp": 1.01895189, + "epoch": 0.3366300916879603, + "flos": 53801869662720.0, + "grad_norm": 3.62937785597857, + "language_loss": 0.64085323, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.66254997, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.14544678, + "step": 5599, + "time_per_iteration": 2.8436243534088135 + }, + { + "auxiliary_loss_clip": 0.01139684, + "auxiliary_loss_mlp": 0.01040504, + "balance_loss_clip": 1.04950833, + "balance_loss_mlp": 1.02476192, + "epoch": 0.3366902149406283, + "flos": 16935876269280.0, + "grad_norm": 3.3611187053717795, + "language_loss": 0.82193476, + "learning_rate": 3.091819088459249e-06, + "loss": 0.84373665, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.1572876, + "step": 5600, + "time_per_iteration": 2.585686683654785 + }, + { + "auxiliary_loss_clip": 0.01132471, + "auxiliary_loss_mlp": 0.01043676, + "balance_loss_clip": 1.04537463, + "balance_loss_mlp": 1.02894747, + "epoch": 0.33675033819329625, + "flos": 19831719702240.0, + "grad_norm": 7.419490633777304, + "language_loss": 0.82465684, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.84641832, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.14733887, + "step": 5601, + "time_per_iteration": 2.639117479324341 + }, + { + "auxiliary_loss_clip": 0.01130913, + "auxiliary_loss_mlp": 0.01035278, + "balance_loss_clip": 1.05005825, + "balance_loss_mlp": 1.02279067, + "epoch": 0.3368104614459642, + "flos": 20811247306560.0, + "grad_norm": 1.7403402537650876, + "language_loss": 0.83226466, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.85392654, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.12493896, + "step": 5602, + "time_per_iteration": 2.722249746322632 + }, + { + "auxiliary_loss_clip": 0.01134757, + "auxiliary_loss_mlp": 0.01049252, + "balance_loss_clip": 1.04836249, + "balance_loss_mlp": 1.0352571, + "epoch": 0.3368705846986322, + "flos": 21790896462720.0, + "grad_norm": 1.9348926596870204, + "language_loss": 0.69881701, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.72065711, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.14007568, + "step": 5603, + "time_per_iteration": 2.6344027519226074 + }, + { + "auxiliary_loss_clip": 0.01137534, + "auxiliary_loss_mlp": 0.01038223, + "balance_loss_clip": 1.04921496, + "balance_loss_mlp": 1.02460957, + "epoch": 0.33693070795130015, + "flos": 27978825690720.0, + "grad_norm": 1.5975853092421863, + "language_loss": 0.83081591, + "learning_rate": 3.090513524656898e-06, + "loss": 0.85257351, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.1361084, + "step": 5604, + "time_per_iteration": 2.7360761165618896 + }, + { + "auxiliary_loss_clip": 0.0113517, + "auxiliary_loss_mlp": 0.01039124, + "balance_loss_clip": 1.04833722, + "balance_loss_mlp": 1.02541471, + "epoch": 0.3369908312039681, + "flos": 26865853217280.0, + "grad_norm": 1.6462340839845457, + "language_loss": 0.73511744, + "learning_rate": 3.090187030294409e-06, + "loss": 0.75686032, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.13720703, + "step": 5605, + "time_per_iteration": 2.6690773963928223 + }, + { + "auxiliary_loss_clip": 0.01137432, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.04843974, + "balance_loss_mlp": 1.02074504, + "epoch": 0.33705095445663613, + "flos": 14400099617760.0, + "grad_norm": 3.4786330450537126, + "language_loss": 0.83338869, + "learning_rate": 3.089860494591919e-06, + "loss": 0.85511035, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 0.88964844, + "router_z_loss_mlp": 0.13989258, + "step": 5606, + "time_per_iteration": 2.635580539703369 + }, + { + "auxiliary_loss_clip": 0.01130088, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_clip": 1.0453825, + "balance_loss_mlp": 1.02473569, + "epoch": 0.3371110777093041, + "flos": 30561960692160.0, + "grad_norm": 1.6281491613934944, + "language_loss": 0.67554176, + "learning_rate": 3.089533917561809e-06, + "loss": 0.69722414, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.1340332, + "step": 5607, + "time_per_iteration": 2.6479482650756836 + }, + { + "auxiliary_loss_clip": 0.01136778, + "auxiliary_loss_mlp": 0.01036913, + "balance_loss_clip": 1.0487144, + "balance_loss_mlp": 1.02266121, + "epoch": 0.33717120096197206, + "flos": 32432174206560.0, + "grad_norm": 1.875871023748059, + "language_loss": 0.70539856, + "learning_rate": 3.089207299216464e-06, + "loss": 0.72713554, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 0.88085938, + "router_z_loss_mlp": 0.14239502, + "step": 5608, + "time_per_iteration": 2.704883575439453 + }, + { + "auxiliary_loss_clip": 0.01135094, + "auxiliary_loss_mlp": 0.01036158, + "balance_loss_clip": 1.04837322, + "balance_loss_mlp": 1.02248478, + "epoch": 0.33723132421464, + "flos": 18496055492640.0, + "grad_norm": 1.8513682959750193, + "language_loss": 0.79312563, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81483817, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.13665771, + "step": 5609, + "time_per_iteration": 2.6035733222961426 + }, + { + "auxiliary_loss_clip": 0.01134956, + "auxiliary_loss_mlp": 0.01039695, + "balance_loss_clip": 1.04833233, + "balance_loss_mlp": 1.02487731, + "epoch": 0.337291447467308, + "flos": 28597395133440.0, + "grad_norm": 1.687634514858619, + "language_loss": 0.82567501, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.84742153, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 0.86669922, + "router_z_loss_mlp": 0.14825439, + "step": 5610, + "time_per_iteration": 2.647815704345703 + }, + { + "auxiliary_loss_clip": 0.0113178, + "auxiliary_loss_mlp": 0.0103909, + "balance_loss_clip": 1.04785502, + "balance_loss_mlp": 1.02430773, + "epoch": 0.33735157071997596, + "flos": 21033776973600.0, + "grad_norm": 5.9358925363396216, + "language_loss": 0.82007778, + "learning_rate": 3.088227196412879e-06, + "loss": 0.8417865, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.14782715, + "step": 5611, + "time_per_iteration": 2.6039786338806152 + }, + { + "auxiliary_loss_clip": 0.01137447, + "auxiliary_loss_mlp": 0.01037147, + "balance_loss_clip": 1.05009413, + "balance_loss_mlp": 1.02181649, + "epoch": 0.3374116939726439, + "flos": 34480233178560.0, + "grad_norm": 1.89604055847459, + "language_loss": 0.79280198, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.8145479, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.15319824, + "step": 5612, + "time_per_iteration": 2.7172155380249023 + }, + { + "auxiliary_loss_clip": 0.01133966, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.0456636, + "balance_loss_mlp": 1.01956868, + "epoch": 0.3374718172253119, + "flos": 43830004197600.0, + "grad_norm": 2.5751814095862433, + "language_loss": 0.69788206, + "learning_rate": 3.087573588194753e-06, + "loss": 0.71956003, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.14245605, + "step": 5613, + "time_per_iteration": 2.7725255489349365 + }, + { + "auxiliary_loss_clip": 0.01137151, + "auxiliary_loss_mlp": 0.01037289, + "balance_loss_clip": 1.04928064, + "balance_loss_mlp": 1.02265048, + "epoch": 0.33753194047797985, + "flos": 22191231070080.0, + "grad_norm": 2.014020762467578, + "language_loss": 0.79595339, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81769776, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.14642334, + "step": 5614, + "time_per_iteration": 2.6786868572235107 + }, + { + "auxiliary_loss_clip": 0.01135624, + "auxiliary_loss_mlp": 0.01037259, + "balance_loss_clip": 1.04921949, + "balance_loss_mlp": 1.02153552, + "epoch": 0.3375920637306478, + "flos": 28246525774560.0, + "grad_norm": 2.194294432036442, + "language_loss": 0.91228271, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93401158, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.15722656, + "step": 5615, + "time_per_iteration": 2.7198069095611572 + }, + { + "auxiliary_loss_clip": 0.01132187, + "auxiliary_loss_mlp": 0.01031076, + "balance_loss_clip": 1.04730642, + "balance_loss_mlp": 1.01785004, + "epoch": 0.3376521869833158, + "flos": 28201760530560.0, + "grad_norm": 2.4342825931298737, + "language_loss": 0.80953074, + "learning_rate": 3.086592866591809e-06, + "loss": 0.83116335, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.13220215, + "step": 5616, + "time_per_iteration": 2.6777470111846924 + }, + { + "auxiliary_loss_clip": 0.01138815, + "auxiliary_loss_mlp": 0.01042946, + "balance_loss_clip": 1.04869378, + "balance_loss_mlp": 1.0272162, + "epoch": 0.33771231023598375, + "flos": 23519399582880.0, + "grad_norm": 2.35109901012835, + "language_loss": 0.84251964, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.86433721, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 0.90136719, + "router_z_loss_mlp": 0.15740967, + "step": 5617, + "time_per_iteration": 2.626744270324707 + }, + { + "auxiliary_loss_clip": 0.0113414, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.04742622, + "balance_loss_mlp": 1.02636349, + "epoch": 0.3377724334886517, + "flos": 22146708929760.0, + "grad_norm": 1.5768217441924939, + "language_loss": 0.80055547, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.82229769, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 0.86669922, + "router_z_loss_mlp": 0.13708496, + "step": 5618, + "time_per_iteration": 2.696944236755371 + }, + { + "auxiliary_loss_clip": 0.01137182, + "auxiliary_loss_mlp": 0.01034273, + "balance_loss_clip": 1.04848492, + "balance_loss_mlp": 1.0203315, + "epoch": 0.3378325567413197, + "flos": 31452281946720.0, + "grad_norm": 1.846466856220711, + "language_loss": 0.70351797, + "learning_rate": 3.085611774155481e-06, + "loss": 0.72523254, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.1395874, + "step": 5619, + "time_per_iteration": 2.772172689437866 + }, + { + "auxiliary_loss_clip": 0.01134083, + "auxiliary_loss_mlp": 0.01047375, + "balance_loss_clip": 1.04766798, + "balance_loss_mlp": 1.0329926, + "epoch": 0.3378926799939877, + "flos": 26011910129760.0, + "grad_norm": 2.574454180133978, + "language_loss": 0.69826746, + "learning_rate": 3.085284660993821e-06, + "loss": 0.72008204, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.1439209, + "step": 5620, + "time_per_iteration": 4.104165315628052 + }, + { + "auxiliary_loss_clip": 0.01132433, + "auxiliary_loss_mlp": 0.01035407, + "balance_loss_clip": 1.04778099, + "balance_loss_mlp": 1.02218699, + "epoch": 0.33795280324665566, + "flos": 30383385923520.0, + "grad_norm": 1.95389238170996, + "language_loss": 0.67647612, + "learning_rate": 3.084957506678058e-06, + "loss": 0.69815457, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.13214111, + "step": 5621, + "time_per_iteration": 4.077130556106567 + }, + { + "auxiliary_loss_clip": 0.01130891, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.04715228, + "balance_loss_mlp": 1.0247581, + "epoch": 0.33801292649932363, + "flos": 30205135293120.0, + "grad_norm": 1.8541746676233184, + "language_loss": 0.82656884, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.84825796, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13262939, + "step": 5622, + "time_per_iteration": 2.702260732650757 + }, + { + "auxiliary_loss_clip": 0.01132449, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.04837823, + "balance_loss_mlp": 1.02226055, + "epoch": 0.3380730497519916, + "flos": 32609128284000.0, + "grad_norm": 1.4536702260176004, + "language_loss": 0.73495507, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.75663388, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.13165283, + "step": 5623, + "time_per_iteration": 4.288825988769531 + }, + { + "auxiliary_loss_clip": 0.010515, + "auxiliary_loss_mlp": 0.01020938, + "balance_loss_clip": 1.02371132, + "balance_loss_mlp": 1.01918888, + "epoch": 0.33813317300465956, + "flos": 85457847091680.0, + "grad_norm": 2.928756284207659, + "language_loss": 0.54887271, + "learning_rate": 3.083975796930215e-06, + "loss": 0.56959713, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.27758789, + "router_z_loss_mlp": 0.01751709, + "step": 5624, + "time_per_iteration": 3.4077625274658203 + }, + { + "auxiliary_loss_clip": 0.01134443, + "auxiliary_loss_mlp": 0.01041648, + "balance_loss_clip": 1.04812443, + "balance_loss_mlp": 1.0269438, + "epoch": 0.3381932962573275, + "flos": 29403858319200.0, + "grad_norm": 2.271522361759559, + "language_loss": 0.73294932, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75471026, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.14691162, + "step": 5625, + "time_per_iteration": 2.7970666885375977 + }, + { + "auxiliary_loss_clip": 0.01135926, + "auxiliary_loss_mlp": 0.01041907, + "balance_loss_clip": 1.04737771, + "balance_loss_mlp": 1.02775657, + "epoch": 0.3382534195099955, + "flos": 23523532345440.0, + "grad_norm": 2.753105750011906, + "language_loss": 0.7073096, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72908795, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.14147949, + "step": 5626, + "time_per_iteration": 4.21563196182251 + }, + { + "auxiliary_loss_clip": 0.01131556, + "auxiliary_loss_mlp": 0.01034296, + "balance_loss_clip": 1.04771137, + "balance_loss_mlp": 1.01998472, + "epoch": 0.33831354276266346, + "flos": 30781654149600.0, + "grad_norm": 1.6187444041406054, + "language_loss": 0.81040907, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83206761, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.14312744, + "step": 5627, + "time_per_iteration": 2.6775808334350586 + }, + { + "auxiliary_loss_clip": 0.01137666, + "auxiliary_loss_mlp": 0.01041094, + "balance_loss_clip": 1.05000114, + "balance_loss_mlp": 1.02697945, + "epoch": 0.3383736660153314, + "flos": 28201517426880.0, + "grad_norm": 2.0618194294711496, + "language_loss": 0.80397558, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.82576323, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.14111328, + "step": 5628, + "time_per_iteration": 2.733548164367676 + }, + { + "auxiliary_loss_clip": 0.0113263, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.04650474, + "balance_loss_mlp": 1.01834536, + "epoch": 0.3384337892679994, + "flos": 28647062968320.0, + "grad_norm": 2.403503181443642, + "language_loss": 0.77391285, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79556119, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.13867188, + "step": 5629, + "time_per_iteration": 2.660158157348633 + }, + { + "auxiliary_loss_clip": 0.01134738, + "auxiliary_loss_mlp": 0.01039082, + "balance_loss_clip": 1.04531765, + "balance_loss_mlp": 1.02359724, + "epoch": 0.33849391252066735, + "flos": 23703444184320.0, + "grad_norm": 1.7400800567725967, + "language_loss": 0.84379882, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.86553705, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.1550293, + "step": 5630, + "time_per_iteration": 2.6763761043548584 + }, + { + "auxiliary_loss_clip": 0.01135297, + "auxiliary_loss_mlp": 0.01043419, + "balance_loss_clip": 1.0486033, + "balance_loss_mlp": 1.03006744, + "epoch": 0.3385540357733353, + "flos": 25703780150880.0, + "grad_norm": 1.9307379190575846, + "language_loss": 0.71892262, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.74070978, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 0.86669922, + "router_z_loss_mlp": 0.13366699, + "step": 5631, + "time_per_iteration": 2.6941895484924316 + }, + { + "auxiliary_loss_clip": 0.01048931, + "auxiliary_loss_mlp": 0.01009685, + "balance_loss_clip": 1.02138925, + "balance_loss_mlp": 1.00801265, + "epoch": 0.3386141590260033, + "flos": 84450078943200.0, + "grad_norm": 0.8621712008590774, + "language_loss": 0.56270897, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58329511, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.27563477, + "router_z_loss_mlp": 0.01675415, + "step": 5632, + "time_per_iteration": 3.3521335124969482 + }, + { + "auxiliary_loss_clip": 0.01132765, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.04677963, + "balance_loss_mlp": 1.01859307, + "epoch": 0.3386742822786713, + "flos": 31139370928800.0, + "grad_norm": 2.048208333160579, + "language_loss": 0.80533248, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.82698643, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.14038086, + "step": 5633, + "time_per_iteration": 2.729619026184082 + }, + { + "auxiliary_loss_clip": 0.01132543, + "auxiliary_loss_mlp": 0.01031859, + "balance_loss_clip": 1.04624414, + "balance_loss_mlp": 1.01816201, + "epoch": 0.33873440553133927, + "flos": 28825192046880.0, + "grad_norm": 2.0203013498075904, + "language_loss": 0.59167576, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61331975, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.13690186, + "step": 5634, + "time_per_iteration": 2.6756980419158936 + }, + { + "auxiliary_loss_clip": 0.01131835, + "auxiliary_loss_mlp": 0.01032967, + "balance_loss_clip": 1.04618645, + "balance_loss_mlp": 1.01931763, + "epoch": 0.33879452878400723, + "flos": 20851920305280.0, + "grad_norm": 2.1920377656298813, + "language_loss": 0.92648911, + "learning_rate": 3.080373032026589e-06, + "loss": 0.94813716, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.13641357, + "step": 5635, + "time_per_iteration": 2.7000465393066406 + }, + { + "auxiliary_loss_clip": 0.01130468, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.04787254, + "balance_loss_mlp": 1.01612747, + "epoch": 0.3388546520366752, + "flos": 19208126116800.0, + "grad_norm": 1.8736638085530368, + "language_loss": 0.74950433, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.77110326, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.13287354, + "step": 5636, + "time_per_iteration": 2.6280431747436523 + }, + { + "auxiliary_loss_clip": 0.01131682, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.04762101, + "balance_loss_mlp": 1.02100945, + "epoch": 0.33891477528934316, + "flos": 27355556243520.0, + "grad_norm": 6.033353812953736, + "language_loss": 0.83374733, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85541159, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13739014, + "step": 5637, + "time_per_iteration": 2.719661235809326 + }, + { + "auxiliary_loss_clip": 0.01137161, + "auxiliary_loss_mlp": 0.01042507, + "balance_loss_clip": 1.04930115, + "balance_loss_mlp": 1.02729607, + "epoch": 0.3389748985420111, + "flos": 21078501700320.0, + "grad_norm": 2.00857113110018, + "language_loss": 0.69517404, + "learning_rate": 3.079389598759495e-06, + "loss": 0.71697068, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 0.15222168, + "step": 5638, + "time_per_iteration": 2.668375015258789 + }, + { + "auxiliary_loss_clip": 0.01132493, + "auxiliary_loss_mlp": 0.0104289, + "balance_loss_clip": 1.04731286, + "balance_loss_mlp": 1.02894223, + "epoch": 0.3390350217946791, + "flos": 33855099936480.0, + "grad_norm": 1.8111556804781601, + "language_loss": 0.8109867, + "learning_rate": 3.079061705792765e-06, + "loss": 0.83274055, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.13952637, + "step": 5639, + "time_per_iteration": 2.7637417316436768 + }, + { + "auxiliary_loss_clip": 0.01134229, + "auxiliary_loss_mlp": 0.01039651, + "balance_loss_clip": 1.04613078, + "balance_loss_mlp": 1.02508938, + "epoch": 0.33909514504734706, + "flos": 24818604590880.0, + "grad_norm": 2.1733438905529323, + "language_loss": 0.67893851, + "learning_rate": 3.078733771907907e-06, + "loss": 0.70067739, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 0.88134766, + "router_z_loss_mlp": 0.14556885, + "step": 5640, + "time_per_iteration": 2.666078805923462 + }, + { + "auxiliary_loss_clip": 0.01134239, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.04865181, + "balance_loss_mlp": 1.02058005, + "epoch": 0.339155268300015, + "flos": 18095194160640.0, + "grad_norm": 2.2425135007996655, + "language_loss": 0.69488549, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.71658039, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.14678955, + "step": 5641, + "time_per_iteration": 2.670346975326538 + }, + { + "auxiliary_loss_clip": 0.01135009, + "auxiliary_loss_mlp": 0.0103761, + "balance_loss_clip": 1.04849899, + "balance_loss_mlp": 1.02365685, + "epoch": 0.339215391552683, + "flos": 31808540103840.0, + "grad_norm": 2.393177331320325, + "language_loss": 0.86958218, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89130843, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.1394043, + "step": 5642, + "time_per_iteration": 2.680485725402832 + }, + { + "auxiliary_loss_clip": 0.01126672, + "auxiliary_loss_mlp": 0.01030779, + "balance_loss_clip": 1.04724181, + "balance_loss_mlp": 1.01807714, + "epoch": 0.33927551480535095, + "flos": 17783620212960.0, + "grad_norm": 1.7869513545829667, + "language_loss": 0.83991867, + "learning_rate": 3.077749724868924e-06, + "loss": 0.86149317, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12701416, + "step": 5643, + "time_per_iteration": 2.661996364593506 + }, + { + "auxiliary_loss_clip": 0.01129286, + "auxiliary_loss_mlp": 0.01040551, + "balance_loss_clip": 1.04570115, + "balance_loss_mlp": 1.02739573, + "epoch": 0.3393356380580189, + "flos": 29047843265760.0, + "grad_norm": 2.0628315114940894, + "language_loss": 0.77200806, + "learning_rate": 3.077421627435922e-06, + "loss": 0.79370648, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13146973, + "step": 5644, + "time_per_iteration": 2.7265968322753906 + }, + { + "auxiliary_loss_clip": 0.01130848, + "auxiliary_loss_mlp": 0.0103956, + "balance_loss_clip": 1.04566681, + "balance_loss_mlp": 1.02561247, + "epoch": 0.3393957613106869, + "flos": 21167627015520.0, + "grad_norm": 3.3577198281253504, + "language_loss": 0.63537532, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.65707934, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.13952637, + "step": 5645, + "time_per_iteration": 2.6445488929748535 + }, + { + "auxiliary_loss_clip": 0.01126976, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.04401934, + "balance_loss_mlp": 1.0200361, + "epoch": 0.3394558845633549, + "flos": 34696239563520.0, + "grad_norm": 1.9175683242727515, + "language_loss": 0.76567709, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78727126, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.12402344, + "step": 5646, + "time_per_iteration": 2.720761299133301 + }, + { + "auxiliary_loss_clip": 0.01135065, + "auxiliary_loss_mlp": 0.01036551, + "balance_loss_clip": 1.04703331, + "balance_loss_mlp": 1.02209711, + "epoch": 0.33951600781602287, + "flos": 26950967321760.0, + "grad_norm": 2.090223224851265, + "language_loss": 0.78822529, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.80994141, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14453125, + "step": 5647, + "time_per_iteration": 2.7191436290740967 + }, + { + "auxiliary_loss_clip": 0.01133829, + "auxiliary_loss_mlp": 0.01038134, + "balance_loss_clip": 1.04892421, + "balance_loss_mlp": 1.02451479, + "epoch": 0.33957613106869083, + "flos": 29136563408160.0, + "grad_norm": 1.8706096747481389, + "language_loss": 0.7762062, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.79792583, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.13616943, + "step": 5648, + "time_per_iteration": 2.7333638668060303 + }, + { + "auxiliary_loss_clip": 0.01046584, + "auxiliary_loss_mlp": 0.01006744, + "balance_loss_clip": 1.01958442, + "balance_loss_mlp": 1.00513184, + "epoch": 0.3396362543213588, + "flos": 86930319104640.0, + "grad_norm": 0.7706570380163149, + "language_loss": 0.56360573, + "learning_rate": 3.075780527680754e-06, + "loss": 0.58413899, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.27001953, + "router_z_loss_mlp": 0.0161438, + "step": 5649, + "time_per_iteration": 3.344686985015869 + }, + { + "auxiliary_loss_clip": 0.01130424, + "auxiliary_loss_mlp": 0.0103852, + "balance_loss_clip": 1.04565299, + "balance_loss_mlp": 1.02456045, + "epoch": 0.33969637757402676, + "flos": 31630005852480.0, + "grad_norm": 1.6798652352480161, + "language_loss": 0.85878217, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.88047159, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.13970947, + "step": 5650, + "time_per_iteration": 2.735599994659424 + }, + { + "auxiliary_loss_clip": 0.01130341, + "auxiliary_loss_mlp": 0.01026686, + "balance_loss_clip": 1.04697692, + "balance_loss_mlp": 1.01358509, + "epoch": 0.33975650082669473, + "flos": 43027552222560.0, + "grad_norm": 1.6363393678381624, + "language_loss": 0.71038163, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.73195183, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.13116455, + "step": 5651, + "time_per_iteration": 2.8449060916900635 + }, + { + "auxiliary_loss_clip": 0.01132265, + "auxiliary_loss_mlp": 0.0103342, + "balance_loss_clip": 1.04838049, + "balance_loss_mlp": 1.01993775, + "epoch": 0.3398166240793627, + "flos": 20313076168800.0, + "grad_norm": 1.9281000139334001, + "language_loss": 0.81131911, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83297598, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.13494873, + "step": 5652, + "time_per_iteration": 2.786231279373169 + }, + { + "auxiliary_loss_clip": 0.01135251, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.04864502, + "balance_loss_mlp": 1.02483678, + "epoch": 0.33987674733203066, + "flos": 29360875835520.0, + "grad_norm": 1.7864734402464713, + "language_loss": 0.77217591, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.79391932, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.14276123, + "step": 5653, + "time_per_iteration": 2.7308967113494873 + }, + { + "auxiliary_loss_clip": 0.01129691, + "auxiliary_loss_mlp": 0.01029297, + "balance_loss_clip": 1.04575372, + "balance_loss_mlp": 1.01660156, + "epoch": 0.3399368705846986, + "flos": 16168911081120.0, + "grad_norm": 3.228565141438369, + "language_loss": 0.8557688, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.87735868, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.12713623, + "step": 5654, + "time_per_iteration": 2.6197280883789062 + }, + { + "auxiliary_loss_clip": 0.01129703, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.04499435, + "balance_loss_mlp": 1.0194962, + "epoch": 0.3399969938373666, + "flos": 32961901955040.0, + "grad_norm": 1.8731519934541234, + "language_loss": 0.65025377, + "learning_rate": 3.073809861919351e-06, + "loss": 0.67188299, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.13739014, + "step": 5655, + "time_per_iteration": 2.7501091957092285 + }, + { + "auxiliary_loss_clip": 0.01132142, + "auxiliary_loss_mlp": 0.01036977, + "balance_loss_clip": 1.04805374, + "balance_loss_mlp": 1.02379251, + "epoch": 0.34005711709003456, + "flos": 34837342198560.0, + "grad_norm": 1.4279802018800083, + "language_loss": 0.76855332, + "learning_rate": 3.073481275036697e-06, + "loss": 0.79024446, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13189697, + "step": 5656, + "time_per_iteration": 2.756791591644287 + }, + { + "auxiliary_loss_clip": 0.01137545, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.04743183, + "balance_loss_mlp": 1.01844978, + "epoch": 0.3401172403427025, + "flos": 26377122605760.0, + "grad_norm": 1.9635171743888515, + "language_loss": 0.83036304, + "learning_rate": 3.073152647447525e-06, + "loss": 0.85206842, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 0.90136719, + "router_z_loss_mlp": 0.14544678, + "step": 5657, + "time_per_iteration": 2.725888252258301 + }, + { + "auxiliary_loss_clip": 0.01129521, + "auxiliary_loss_mlp": 0.01036096, + "balance_loss_clip": 1.04599857, + "balance_loss_mlp": 1.02289391, + "epoch": 0.3401773635953705, + "flos": 31586213023200.0, + "grad_norm": 2.000188348532588, + "language_loss": 0.85338187, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87503803, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13189697, + "step": 5658, + "time_per_iteration": 2.7146663665771484 + }, + { + "auxiliary_loss_clip": 0.01046599, + "auxiliary_loss_mlp": 0.01003981, + "balance_loss_clip": 1.01918483, + "balance_loss_mlp": 1.00226247, + "epoch": 0.3402374868480385, + "flos": 79933049964000.0, + "grad_norm": 0.810449486379363, + "language_loss": 0.60002267, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62052852, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.27416992, + "router_z_loss_mlp": 0.01719666, + "step": 5659, + "time_per_iteration": 4.781926393508911 + }, + { + "auxiliary_loss_clip": 0.0112922, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.04777384, + "balance_loss_mlp": 1.0217315, + "epoch": 0.34029761010070647, + "flos": 29359052557920.0, + "grad_norm": 1.8756105304995156, + "language_loss": 0.68279183, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.7044313, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.12994385, + "step": 5660, + "time_per_iteration": 4.063872575759888 + }, + { + "auxiliary_loss_clip": 0.01134653, + "auxiliary_loss_mlp": 0.01040218, + "balance_loss_clip": 1.05072117, + "balance_loss_mlp": 1.02666354, + "epoch": 0.34035773335337444, + "flos": 33677781203520.0, + "grad_norm": 1.9724249605050286, + "language_loss": 0.67545831, + "learning_rate": 3.071837730274918e-06, + "loss": 0.69720697, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13555908, + "step": 5661, + "time_per_iteration": 2.785799264907837 + }, + { + "auxiliary_loss_clip": 0.011298, + "auxiliary_loss_mlp": 0.01034838, + "balance_loss_clip": 1.04735279, + "balance_loss_mlp": 1.02202296, + "epoch": 0.3404178566060424, + "flos": 25395893275680.0, + "grad_norm": 3.089990688130012, + "language_loss": 0.78991711, + "learning_rate": 3.071508899340113e-06, + "loss": 0.81156349, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.12805176, + "step": 5662, + "time_per_iteration": 4.116427183151245 + }, + { + "auxiliary_loss_clip": 0.01131658, + "auxiliary_loss_mlp": 0.01038607, + "balance_loss_clip": 1.04704499, + "balance_loss_mlp": 1.02437901, + "epoch": 0.34047797985871037, + "flos": 32742856774080.0, + "grad_norm": 2.1358460116777302, + "language_loss": 0.73320627, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.75490886, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.14215088, + "step": 5663, + "time_per_iteration": 2.757453680038452 + }, + { + "auxiliary_loss_clip": 0.01126536, + "auxiliary_loss_mlp": 0.0103665, + "balance_loss_clip": 1.04619074, + "balance_loss_mlp": 1.02427614, + "epoch": 0.34053810311137833, + "flos": 24016719857760.0, + "grad_norm": 1.9308315408673822, + "language_loss": 0.8625195, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.8841514, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.1237793, + "step": 5664, + "time_per_iteration": 2.6992671489715576 + }, + { + "auxiliary_loss_clip": 0.01133106, + "auxiliary_loss_mlp": 0.01035853, + "balance_loss_clip": 1.04841733, + "balance_loss_mlp": 1.02324641, + "epoch": 0.3405982263640463, + "flos": 26509878681120.0, + "grad_norm": 2.3742234068665145, + "language_loss": 0.69224161, + "learning_rate": 3.070522162795235e-06, + "loss": 0.7139312, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.12615967, + "step": 5665, + "time_per_iteration": 4.220242261886597 + }, + { + "auxiliary_loss_clip": 0.01131548, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.04651594, + "balance_loss_mlp": 1.0204159, + "epoch": 0.34065834961671426, + "flos": 22014155440800.0, + "grad_norm": 2.802840235707175, + "language_loss": 0.73112899, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.75278747, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.13885498, + "step": 5666, + "time_per_iteration": 2.7162978649139404 + }, + { + "auxiliary_loss_clip": 0.01132812, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.04753137, + "balance_loss_mlp": 1.02342701, + "epoch": 0.3407184728693822, + "flos": 26109138900960.0, + "grad_norm": 1.564905839860423, + "language_loss": 0.72935057, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75104105, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.12817383, + "step": 5667, + "time_per_iteration": 2.6375396251678467 + }, + { + "auxiliary_loss_clip": 0.01045012, + "auxiliary_loss_mlp": 0.01001732, + "balance_loss_clip": 1.01745296, + "balance_loss_mlp": 1.00000954, + "epoch": 0.3407785961220502, + "flos": 83813121005760.0, + "grad_norm": 0.8417515839525123, + "language_loss": 0.63212341, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65259087, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.27661133, + "router_z_loss_mlp": 0.01727295, + "step": 5668, + "time_per_iteration": 3.512132167816162 + }, + { + "auxiliary_loss_clip": 0.01129662, + "auxiliary_loss_mlp": 0.01039601, + "balance_loss_clip": 1.04594326, + "balance_loss_mlp": 1.02595127, + "epoch": 0.34083871937471816, + "flos": 17160836973120.0, + "grad_norm": 4.216205721976509, + "language_loss": 0.71385401, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.73554665, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.13635254, + "step": 5669, + "time_per_iteration": 2.6984143257141113 + }, + { + "auxiliary_loss_clip": 0.0113233, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.04649544, + "balance_loss_mlp": 1.01890373, + "epoch": 0.3408988426273861, + "flos": 20848030646400.0, + "grad_norm": 3.4215488561485814, + "language_loss": 0.80783689, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.8294816, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.13244629, + "step": 5670, + "time_per_iteration": 2.625494956970215 + }, + { + "auxiliary_loss_clip": 0.01133991, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.04639173, + "balance_loss_mlp": 1.02320886, + "epoch": 0.3409589658800541, + "flos": 29315057142240.0, + "grad_norm": 5.216579517611916, + "language_loss": 0.77029812, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79200071, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.13067627, + "step": 5671, + "time_per_iteration": 2.7107298374176025 + }, + { + "auxiliary_loss_clip": 0.01133084, + "auxiliary_loss_mlp": 0.01040986, + "balance_loss_clip": 1.0481168, + "balance_loss_mlp": 1.02669287, + "epoch": 0.34101908913272205, + "flos": 25797564953280.0, + "grad_norm": 2.227316311134376, + "language_loss": 0.74096328, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.76270401, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.1428833, + "step": 5672, + "time_per_iteration": 2.6858112812042236 + }, + { + "auxiliary_loss_clip": 0.01132751, + "auxiliary_loss_mlp": 0.01033398, + "balance_loss_clip": 1.04675674, + "balance_loss_mlp": 1.02075577, + "epoch": 0.3410792123853901, + "flos": 19159592765760.0, + "grad_norm": 1.8835473222198689, + "language_loss": 0.73879343, + "learning_rate": 3.06788908010777e-06, + "loss": 0.76045489, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.12634277, + "step": 5673, + "time_per_iteration": 2.67863392829895 + }, + { + "auxiliary_loss_clip": 0.01130363, + "auxiliary_loss_mlp": 0.01033823, + "balance_loss_clip": 1.04721642, + "balance_loss_mlp": 1.02023935, + "epoch": 0.34113933563805804, + "flos": 28109069694720.0, + "grad_norm": 3.417400833762673, + "language_loss": 0.79605508, + "learning_rate": 3.067559762415682e-06, + "loss": 0.81769693, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13574219, + "step": 5674, + "time_per_iteration": 2.7286794185638428 + }, + { + "auxiliary_loss_clip": 0.01043597, + "auxiliary_loss_mlp": 0.01002442, + "balance_loss_clip": 1.016011, + "balance_loss_mlp": 1.0007863, + "epoch": 0.341199458890726, + "flos": 84945332836800.0, + "grad_norm": 0.8081868312157738, + "language_loss": 0.56054974, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.5810101, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.27587891, + "router_z_loss_mlp": 0.0165863, + "step": 5675, + "time_per_iteration": 3.422863483428955 + }, + { + "auxiliary_loss_clip": 0.01127621, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.04613471, + "balance_loss_mlp": 1.02381563, + "epoch": 0.34125958214339397, + "flos": 27267160239360.0, + "grad_norm": 13.102596923153978, + "language_loss": 0.78482246, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.80647516, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13830566, + "step": 5676, + "time_per_iteration": 2.8188648223876953 + }, + { + "auxiliary_loss_clip": 0.01131394, + "auxiliary_loss_mlp": 0.01029687, + "balance_loss_clip": 1.04568863, + "balance_loss_mlp": 1.01640725, + "epoch": 0.34131970539606193, + "flos": 26688250863360.0, + "grad_norm": 1.8087773226370492, + "language_loss": 0.85367334, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.87528419, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 0.85742188, + "router_z_loss_mlp": 0.13275146, + "step": 5677, + "time_per_iteration": 2.7293856143951416 + }, + { + "auxiliary_loss_clip": 0.01130422, + "auxiliary_loss_mlp": 0.01036928, + "balance_loss_clip": 1.04657722, + "balance_loss_mlp": 1.02309382, + "epoch": 0.3413798286487299, + "flos": 30427016683680.0, + "grad_norm": 1.993299445933796, + "language_loss": 0.7931639, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.81483746, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.1383667, + "step": 5678, + "time_per_iteration": 2.7598657608032227 + }, + { + "auxiliary_loss_clip": 0.01129489, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.04459929, + "balance_loss_mlp": 1.01853693, + "epoch": 0.34143995190139786, + "flos": 30962214264960.0, + "grad_norm": 1.9882682942460068, + "language_loss": 0.74975652, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.7713654, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.12872314, + "step": 5679, + "time_per_iteration": 2.674149751663208 + }, + { + "auxiliary_loss_clip": 0.01042765, + "auxiliary_loss_mlp": 0.01001035, + "balance_loss_clip": 1.01503181, + "balance_loss_mlp": 0.99941981, + "epoch": 0.34150007515406583, + "flos": 82707522677280.0, + "grad_norm": 0.7146111997577994, + "language_loss": 0.59420085, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61463886, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 0.01616669, + "step": 5680, + "time_per_iteration": 3.3408730030059814 + }, + { + "auxiliary_loss_clip": 0.01127261, + "auxiliary_loss_mlp": 0.01028742, + "balance_loss_clip": 1.04548264, + "balance_loss_mlp": 1.01597524, + "epoch": 0.3415601984067338, + "flos": 24773069518560.0, + "grad_norm": 2.2118340632935705, + "language_loss": 0.72545314, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.74701315, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.12774658, + "step": 5681, + "time_per_iteration": 2.643348455429077 + }, + { + "auxiliary_loss_clip": 0.01124682, + "auxiliary_loss_mlp": 0.01041646, + "balance_loss_clip": 1.04271245, + "balance_loss_mlp": 1.029356, + "epoch": 0.34162032165940176, + "flos": 31763693825280.0, + "grad_norm": 6.974400348584861, + "language_loss": 0.71609932, + "learning_rate": 3.064923764577233e-06, + "loss": 0.73776257, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.12280273, + "step": 5682, + "time_per_iteration": 2.719223976135254 + }, + { + "auxiliary_loss_clip": 0.01128058, + "auxiliary_loss_mlp": 0.0103508, + "balance_loss_clip": 1.0433495, + "balance_loss_mlp": 1.02144289, + "epoch": 0.3416804449120697, + "flos": 35146566144000.0, + "grad_norm": 1.7944845279926358, + "language_loss": 0.8441394, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.86577076, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.1362915, + "step": 5683, + "time_per_iteration": 2.7356677055358887 + }, + { + "auxiliary_loss_clip": 0.01132169, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.04658997, + "balance_loss_mlp": 1.02903104, + "epoch": 0.3417405681647377, + "flos": 27578653152480.0, + "grad_norm": 1.9230457601740196, + "language_loss": 0.70679581, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.72855014, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.14227295, + "step": 5684, + "time_per_iteration": 2.679107427597046 + }, + { + "auxiliary_loss_clip": 0.01127149, + "auxiliary_loss_mlp": 0.01033374, + "balance_loss_clip": 1.04482746, + "balance_loss_mlp": 1.02122688, + "epoch": 0.34180069141740566, + "flos": 30159235565280.0, + "grad_norm": 1.4259695069830107, + "language_loss": 0.74946904, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77107418, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.12164307, + "step": 5685, + "time_per_iteration": 2.7021028995513916 + }, + { + "auxiliary_loss_clip": 0.01127381, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.04456925, + "balance_loss_mlp": 1.02116609, + "epoch": 0.3418608146700737, + "flos": 37239795532800.0, + "grad_norm": 1.8237463115614363, + "language_loss": 0.70427632, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.72588265, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.12084961, + "step": 5686, + "time_per_iteration": 2.755739688873291 + }, + { + "auxiliary_loss_clip": 0.01130107, + "auxiliary_loss_mlp": 0.01038572, + "balance_loss_clip": 1.04508901, + "balance_loss_mlp": 1.02474356, + "epoch": 0.34192093792274164, + "flos": 18451857490560.0, + "grad_norm": 2.303530351772069, + "language_loss": 0.7716974, + "learning_rate": 3.06327495310661e-06, + "loss": 0.79338419, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.1383667, + "step": 5687, + "time_per_iteration": 2.6354172229766846 + }, + { + "auxiliary_loss_clip": 0.01128126, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.04633498, + "balance_loss_mlp": 1.01780534, + "epoch": 0.3419810611754096, + "flos": 16091252156160.0, + "grad_norm": 2.7948304839733704, + "language_loss": 0.86567038, + "learning_rate": 3.062945069803981e-06, + "loss": 0.88726306, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13342285, + "step": 5688, + "time_per_iteration": 2.6396608352661133 + }, + { + "auxiliary_loss_clip": 0.01136228, + "auxiliary_loss_mlp": 0.0103898, + "balance_loss_clip": 1.04673111, + "balance_loss_mlp": 1.02473998, + "epoch": 0.34204118442807757, + "flos": 23838145089120.0, + "grad_norm": 2.253848630215164, + "language_loss": 0.80062234, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.82237446, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 0.89501953, + "router_z_loss_mlp": 0.14233398, + "step": 5689, + "time_per_iteration": 2.668468713760376 + }, + { + "auxiliary_loss_clip": 0.01131367, + "auxiliary_loss_mlp": 0.01035707, + "balance_loss_clip": 1.04536355, + "balance_loss_mlp": 1.02136064, + "epoch": 0.34210130768074554, + "flos": 18540375046560.0, + "grad_norm": 1.987317761521298, + "language_loss": 0.74040765, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.7620784, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.14361572, + "step": 5690, + "time_per_iteration": 2.6370270252227783 + }, + { + "auxiliary_loss_clip": 0.01129811, + "auxiliary_loss_mlp": 0.01032426, + "balance_loss_clip": 1.04547954, + "balance_loss_mlp": 1.02024257, + "epoch": 0.3421614309334135, + "flos": 30427583925600.0, + "grad_norm": 2.2480434963176337, + "language_loss": 0.75619519, + "learning_rate": 3.061955178104237e-06, + "loss": 0.77781761, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.12176514, + "step": 5691, + "time_per_iteration": 2.7343719005584717 + }, + { + "auxiliary_loss_clip": 0.0112763, + "auxiliary_loss_mlp": 0.01033546, + "balance_loss_clip": 1.04487967, + "balance_loss_mlp": 1.02071929, + "epoch": 0.34222155418608147, + "flos": 26733016107360.0, + "grad_norm": 1.6874508503855108, + "language_loss": 0.67960846, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.70122027, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.12841797, + "step": 5692, + "time_per_iteration": 2.6886520385742188 + }, + { + "auxiliary_loss_clip": 0.01131951, + "auxiliary_loss_mlp": 0.01042061, + "balance_loss_clip": 1.04609585, + "balance_loss_mlp": 1.0274694, + "epoch": 0.34228167743874943, + "flos": 22102591962240.0, + "grad_norm": 5.176182766589687, + "language_loss": 0.73244905, + "learning_rate": 3.06129504893632e-06, + "loss": 0.75418919, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.14593506, + "step": 5693, + "time_per_iteration": 2.7269492149353027 + }, + { + "auxiliary_loss_clip": 0.01125878, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.04310358, + "balance_loss_mlp": 1.02364528, + "epoch": 0.3423418006914174, + "flos": 25976382825600.0, + "grad_norm": 1.8121433889143836, + "language_loss": 0.75763905, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.77925742, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.12304688, + "step": 5694, + "time_per_iteration": 2.654487371444702 + }, + { + "auxiliary_loss_clip": 0.01128066, + "auxiliary_loss_mlp": 0.01036125, + "balance_loss_clip": 1.0467782, + "balance_loss_mlp": 1.02313745, + "epoch": 0.34240192394408536, + "flos": 24189054965280.0, + "grad_norm": 4.024915436878116, + "language_loss": 0.79530621, + "learning_rate": 3.060634758790747e-06, + "loss": 0.81694812, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12976074, + "step": 5695, + "time_per_iteration": 2.671236276626587 + }, + { + "auxiliary_loss_clip": 0.01130402, + "auxiliary_loss_mlp": 0.01035606, + "balance_loss_clip": 1.04608107, + "balance_loss_mlp": 1.02264202, + "epoch": 0.3424620471967533, + "flos": 29938569693120.0, + "grad_norm": 2.0089082285907462, + "language_loss": 0.73598278, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75764287, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.12982178, + "step": 5696, + "time_per_iteration": 2.7044596672058105 + }, + { + "auxiliary_loss_clip": 0.0113065, + "auxiliary_loss_mlp": 0.01045651, + "balance_loss_clip": 1.0469178, + "balance_loss_mlp": 1.03280044, + "epoch": 0.3425221704494213, + "flos": 31541245192800.0, + "grad_norm": 1.6531722051713402, + "language_loss": 0.71039081, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.73215377, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.12854004, + "step": 5697, + "time_per_iteration": 2.6938064098358154 + }, + { + "auxiliary_loss_clip": 0.01126794, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.04538774, + "balance_loss_mlp": 1.01755941, + "epoch": 0.34258229370208926, + "flos": 26284107631680.0, + "grad_norm": 2.1570928986388234, + "language_loss": 0.82448733, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84606349, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13256836, + "step": 5698, + "time_per_iteration": 2.671933174133301 + }, + { + "auxiliary_loss_clip": 0.01130227, + "auxiliary_loss_mlp": 0.01046808, + "balance_loss_clip": 1.04433894, + "balance_loss_mlp": 1.03152561, + "epoch": 0.3426424169547573, + "flos": 28862015904000.0, + "grad_norm": 1.9889594383573737, + "language_loss": 0.68578857, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.70755893, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.15270996, + "step": 5699, + "time_per_iteration": 4.1261231899261475 + }, + { + "auxiliary_loss_clip": 0.01129635, + "auxiliary_loss_mlp": 0.01030816, + "balance_loss_clip": 1.04545319, + "balance_loss_mlp": 1.01790559, + "epoch": 0.34270254020742524, + "flos": 30072014562240.0, + "grad_norm": 5.795503263917892, + "language_loss": 0.72461289, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74621737, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.12896729, + "step": 5700, + "time_per_iteration": 3.9339473247528076 + }, + { + "auxiliary_loss_clip": 0.01131556, + "auxiliary_loss_mlp": 0.01029126, + "balance_loss_clip": 1.04804015, + "balance_loss_mlp": 1.01664495, + "epoch": 0.3427626634600932, + "flos": 25620246220320.0, + "grad_norm": 1.9928847171272623, + "language_loss": 0.81951499, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.84112179, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.12469482, + "step": 5701, + "time_per_iteration": 2.6759657859802246 + }, + { + "auxiliary_loss_clip": 0.01130372, + "auxiliary_loss_mlp": 0.01030283, + "balance_loss_clip": 1.04517031, + "balance_loss_mlp": 1.01773643, + "epoch": 0.3428227867127612, + "flos": 26153620524000.0, + "grad_norm": 2.367454342011945, + "language_loss": 0.71523905, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.73684555, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.12554932, + "step": 5702, + "time_per_iteration": 4.1072282791137695 + }, + { + "auxiliary_loss_clip": 0.01046542, + "auxiliary_loss_mlp": 0.01005414, + "balance_loss_clip": 1.01882398, + "balance_loss_mlp": 1.00346375, + "epoch": 0.34288290996542914, + "flos": 68003182739520.0, + "grad_norm": 0.776482861074626, + "language_loss": 0.57463247, + "learning_rate": 3.057991990435309e-06, + "loss": 0.59515202, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.27709961, + "router_z_loss_mlp": 0.01948547, + "step": 5703, + "time_per_iteration": 3.163020372390747 + }, + { + "auxiliary_loss_clip": 0.01131653, + "auxiliary_loss_mlp": 0.01037658, + "balance_loss_clip": 1.04683769, + "balance_loss_mlp": 1.02287579, + "epoch": 0.3429430332180971, + "flos": 24595345612800.0, + "grad_norm": 2.008198308005951, + "language_loss": 0.74536192, + "learning_rate": 3.057661463723086e-06, + "loss": 0.76705515, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.14801025, + "step": 5704, + "time_per_iteration": 2.659668207168579 + }, + { + "auxiliary_loss_clip": 0.01127737, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.04554915, + "balance_loss_mlp": 1.02080011, + "epoch": 0.34300315647076507, + "flos": 21921950812320.0, + "grad_norm": 1.816839798406725, + "language_loss": 0.72780597, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.7494148, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.12341309, + "step": 5705, + "time_per_iteration": 4.08971381187439 + }, + { + "auxiliary_loss_clip": 0.01128365, + "auxiliary_loss_mlp": 0.01030279, + "balance_loss_clip": 1.04471087, + "balance_loss_mlp": 1.01687431, + "epoch": 0.34306327972343303, + "flos": 26950845769920.0, + "grad_norm": 3.7850905439392877, + "language_loss": 0.79182804, + "learning_rate": 3.057000289991289e-06, + "loss": 0.81341451, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.13409424, + "step": 5706, + "time_per_iteration": 2.6397619247436523 + }, + { + "auxiliary_loss_clip": 0.01134402, + "auxiliary_loss_mlp": 0.01033238, + "balance_loss_clip": 1.0474422, + "balance_loss_mlp": 1.01957059, + "epoch": 0.343123402976101, + "flos": 22503574846080.0, + "grad_norm": 2.0168352984588416, + "language_loss": 0.82599688, + "learning_rate": 3.056669642996787e-06, + "loss": 0.8476733, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 0.86914062, + "router_z_loss_mlp": 0.13671875, + "step": 5707, + "time_per_iteration": 2.6237423419952393 + }, + { + "auxiliary_loss_clip": 0.01132119, + "auxiliary_loss_mlp": 0.01033688, + "balance_loss_clip": 1.0480988, + "balance_loss_mlp": 1.02071846, + "epoch": 0.34318352622876896, + "flos": 20943030967200.0, + "grad_norm": 1.548157013501449, + "language_loss": 0.75370359, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77536178, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.12969971, + "step": 5708, + "time_per_iteration": 2.6212868690490723 + }, + { + "auxiliary_loss_clip": 0.01126629, + "auxiliary_loss_mlp": 0.01032876, + "balance_loss_clip": 1.04420757, + "balance_loss_mlp": 1.02009094, + "epoch": 0.34324364948143693, + "flos": 32565375972000.0, + "grad_norm": 1.9609466374072155, + "language_loss": 0.81524092, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83683598, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.12786865, + "step": 5709, + "time_per_iteration": 2.741349935531616 + }, + { + "auxiliary_loss_clip": 0.01133281, + "auxiliary_loss_mlp": 0.01038614, + "balance_loss_clip": 1.04791451, + "balance_loss_mlp": 1.02365923, + "epoch": 0.3433037727341049, + "flos": 25931252926080.0, + "grad_norm": 2.390219893980192, + "language_loss": 0.79070598, + "learning_rate": 3.055677461649329e-06, + "loss": 0.8124249, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.14959717, + "step": 5710, + "time_per_iteration": 2.6768152713775635 + }, + { + "auxiliary_loss_clip": 0.01132755, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.0460633, + "balance_loss_mlp": 1.01703739, + "epoch": 0.34336389598677286, + "flos": 25172229124800.0, + "grad_norm": 2.0008776720521717, + "language_loss": 0.70624411, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72788405, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 0.86669922, + "router_z_loss_mlp": 0.14190674, + "step": 5711, + "time_per_iteration": 2.675384044647217 + }, + { + "auxiliary_loss_clip": 0.01127398, + "auxiliary_loss_mlp": 0.01031307, + "balance_loss_clip": 1.04409707, + "balance_loss_mlp": 1.01809263, + "epoch": 0.3434240192394409, + "flos": 17739503245440.0, + "grad_norm": 1.9110020911607564, + "language_loss": 0.67468929, + "learning_rate": 3.055015807239812e-06, + "loss": 0.69627631, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13232422, + "step": 5712, + "time_per_iteration": 2.5981285572052 + }, + { + "auxiliary_loss_clip": 0.01045908, + "auxiliary_loss_mlp": 0.01002939, + "balance_loss_clip": 1.01829493, + "balance_loss_mlp": 1.00097525, + "epoch": 0.34348414249210885, + "flos": 70835339358720.0, + "grad_norm": 1.5345451323234374, + "language_loss": 0.58080411, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60129261, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.27685547, + "router_z_loss_mlp": 0.0196228, + "step": 5713, + "time_per_iteration": 3.2234749794006348 + }, + { + "auxiliary_loss_clip": 0.01131779, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.0469768, + "balance_loss_mlp": 1.0224359, + "epoch": 0.3435442657447768, + "flos": 25263582890400.0, + "grad_norm": 1.76636653413877, + "language_loss": 0.80788505, + "learning_rate": 3.054353992805076e-06, + "loss": 0.82955849, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.13128662, + "step": 5714, + "time_per_iteration": 2.636244297027588 + }, + { + "auxiliary_loss_clip": 0.01130861, + "auxiliary_loss_mlp": 0.01039944, + "balance_loss_clip": 1.04661214, + "balance_loss_mlp": 1.02614582, + "epoch": 0.3436043889974448, + "flos": 27979676553600.0, + "grad_norm": 3.695429199244153, + "language_loss": 0.71502882, + "learning_rate": 3.05402302560962e-06, + "loss": 0.73673689, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.13800049, + "step": 5715, + "time_per_iteration": 2.8240175247192383 + }, + { + "auxiliary_loss_clip": 0.01044081, + "auxiliary_loss_mlp": 0.01002343, + "balance_loss_clip": 1.01653862, + "balance_loss_mlp": 1.00047159, + "epoch": 0.34366451225011274, + "flos": 71262415370880.0, + "grad_norm": 0.8974949877879681, + "language_loss": 0.65936136, + "learning_rate": 3.053692018445505e-06, + "loss": 0.6798256, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.27587891, + "router_z_loss_mlp": 0.01869202, + "step": 5716, + "time_per_iteration": 3.299969434738159 + }, + { + "auxiliary_loss_clip": 0.01128662, + "auxiliary_loss_mlp": 0.01037714, + "balance_loss_clip": 1.04588723, + "balance_loss_mlp": 1.02464867, + "epoch": 0.3437246355027807, + "flos": 19030240141920.0, + "grad_norm": 2.0094241770647754, + "language_loss": 0.74091291, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.7625767, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13079834, + "step": 5717, + "time_per_iteration": 2.6111607551574707 + }, + { + "auxiliary_loss_clip": 0.01127767, + "auxiliary_loss_mlp": 0.01034841, + "balance_loss_clip": 1.04372954, + "balance_loss_mlp": 1.02197814, + "epoch": 0.34378475875544867, + "flos": 33766339276800.0, + "grad_norm": 2.824987150455938, + "language_loss": 0.75147915, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.77310526, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.12878418, + "step": 5718, + "time_per_iteration": 2.748255729675293 + }, + { + "auxiliary_loss_clip": 0.01129195, + "auxiliary_loss_mlp": 0.01038134, + "balance_loss_clip": 1.04303241, + "balance_loss_mlp": 1.02455604, + "epoch": 0.34384488200811664, + "flos": 38353537834560.0, + "grad_norm": 1.9045194600981892, + "language_loss": 0.63488078, + "learning_rate": 3.052698757266734e-06, + "loss": 0.65655404, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.13580322, + "step": 5719, + "time_per_iteration": 2.713779926300049 + }, + { + "auxiliary_loss_clip": 0.01132526, + "auxiliary_loss_mlp": 0.01035135, + "balance_loss_clip": 1.04533792, + "balance_loss_mlp": 1.02078247, + "epoch": 0.3439050052607846, + "flos": 30383264371680.0, + "grad_norm": 1.7419768320265105, + "language_loss": 0.73570031, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.75737691, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.14355469, + "step": 5720, + "time_per_iteration": 2.7636096477508545 + }, + { + "auxiliary_loss_clip": 0.01131626, + "auxiliary_loss_mlp": 0.0103291, + "balance_loss_clip": 1.04623246, + "balance_loss_mlp": 1.01889062, + "epoch": 0.34396512851345257, + "flos": 22146587377920.0, + "grad_norm": 1.7444423009406804, + "language_loss": 0.73917609, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.7608214, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.14031982, + "step": 5721, + "time_per_iteration": 2.6337192058563232 + }, + { + "auxiliary_loss_clip": 0.01133889, + "auxiliary_loss_mlp": 0.01043208, + "balance_loss_clip": 1.04758155, + "balance_loss_mlp": 1.0298388, + "epoch": 0.34402525176612053, + "flos": 19564870481280.0, + "grad_norm": 3.9833850275120426, + "language_loss": 0.80216128, + "learning_rate": 3.051705136821992e-06, + "loss": 0.82393217, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.1338501, + "step": 5722, + "time_per_iteration": 2.75864577293396 + }, + { + "auxiliary_loss_clip": 0.01127746, + "auxiliary_loss_mlp": 0.01031091, + "balance_loss_clip": 1.04467046, + "balance_loss_mlp": 1.01836586, + "epoch": 0.3440853750187885, + "flos": 25842249162720.0, + "grad_norm": 2.088202033447225, + "language_loss": 0.81698298, + "learning_rate": 3.051373850228801e-06, + "loss": 0.83857137, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.12719727, + "step": 5723, + "time_per_iteration": 2.6965839862823486 + }, + { + "auxiliary_loss_clip": 0.01129881, + "auxiliary_loss_mlp": 0.01034535, + "balance_loss_clip": 1.04465199, + "balance_loss_mlp": 1.0209806, + "epoch": 0.34414549827145646, + "flos": 15468914606400.0, + "grad_norm": 1.8106036129626166, + "language_loss": 0.80906713, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83071131, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.13568115, + "step": 5724, + "time_per_iteration": 2.6381423473358154 + }, + { + "auxiliary_loss_clip": 0.01130809, + "auxiliary_loss_mlp": 0.01030441, + "balance_loss_clip": 1.04538202, + "balance_loss_mlp": 1.01683962, + "epoch": 0.3442056215241244, + "flos": 38174274272160.0, + "grad_norm": 1.8986936311143572, + "language_loss": 0.69231939, + "learning_rate": 3.05071115745038e-06, + "loss": 0.71393192, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.13592529, + "step": 5725, + "time_per_iteration": 2.741281270980835 + }, + { + "auxiliary_loss_clip": 0.01134949, + "auxiliary_loss_mlp": 0.01045444, + "balance_loss_clip": 1.04524875, + "balance_loss_mlp": 1.02995896, + "epoch": 0.34426574477679245, + "flos": 28513050857280.0, + "grad_norm": 1.4843139512686845, + "language_loss": 0.69557196, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71737587, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 0.89550781, + "router_z_loss_mlp": 0.15478516, + "step": 5726, + "time_per_iteration": 2.6754097938537598 + }, + { + "auxiliary_loss_clip": 0.01129793, + "auxiliary_loss_mlp": 0.01037453, + "balance_loss_clip": 1.04436827, + "balance_loss_mlp": 1.02463222, + "epoch": 0.3443258680294604, + "flos": 29938448141280.0, + "grad_norm": 2.214221723493348, + "language_loss": 0.73238629, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.75405878, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.12817383, + "step": 5727, + "time_per_iteration": 2.69069766998291 + }, + { + "auxiliary_loss_clip": 0.01130347, + "auxiliary_loss_mlp": 0.01036742, + "balance_loss_clip": 1.04522824, + "balance_loss_mlp": 1.02265716, + "epoch": 0.3443859912821284, + "flos": 24684795066240.0, + "grad_norm": 2.627838845923006, + "language_loss": 0.88038886, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90205967, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.14080811, + "step": 5728, + "time_per_iteration": 2.6489713191986084 + }, + { + "auxiliary_loss_clip": 0.01130011, + "auxiliary_loss_mlp": 0.01042043, + "balance_loss_clip": 1.04502416, + "balance_loss_mlp": 1.02900159, + "epoch": 0.34444611453479634, + "flos": 29671315299360.0, + "grad_norm": 1.9357759739161409, + "language_loss": 0.70444173, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.7261622, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.13049316, + "step": 5729, + "time_per_iteration": 2.680828094482422 + }, + { + "auxiliary_loss_clip": 0.01129293, + "auxiliary_loss_mlp": 0.01032129, + "balance_loss_clip": 1.04522896, + "balance_loss_mlp": 1.01863468, + "epoch": 0.3445062377874643, + "flos": 20722851302400.0, + "grad_norm": 1.915153565010603, + "language_loss": 0.73942411, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.7610383, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.13494873, + "step": 5730, + "time_per_iteration": 2.6039583683013916 + }, + { + "auxiliary_loss_clip": 0.01129575, + "auxiliary_loss_mlp": 0.01039754, + "balance_loss_clip": 1.04421258, + "balance_loss_mlp": 1.02592587, + "epoch": 0.3445663610401323, + "flos": 24773393656800.0, + "grad_norm": 2.692076100723057, + "language_loss": 0.79416406, + "learning_rate": 3.048722123283578e-06, + "loss": 0.81585729, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.13830566, + "step": 5731, + "time_per_iteration": 2.6480019092559814 + }, + { + "auxiliary_loss_clip": 0.01131892, + "auxiliary_loss_mlp": 0.0103933, + "balance_loss_clip": 1.04582787, + "balance_loss_mlp": 1.02540004, + "epoch": 0.34462648429280024, + "flos": 19386336229920.0, + "grad_norm": 3.606721622664931, + "language_loss": 0.78497195, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.80668414, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.1394043, + "step": 5732, + "time_per_iteration": 2.639559268951416 + }, + { + "auxiliary_loss_clip": 0.01041744, + "auxiliary_loss_mlp": 0.01006532, + "balance_loss_clip": 1.01430178, + "balance_loss_mlp": 1.00485408, + "epoch": 0.3446866075454682, + "flos": 72370525770720.0, + "grad_norm": 0.7405810301398794, + "language_loss": 0.53535515, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55583799, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.27441406, + "router_z_loss_mlp": 0.01681519, + "step": 5733, + "time_per_iteration": 3.2932770252227783 + }, + { + "auxiliary_loss_clip": 0.01131632, + "auxiliary_loss_mlp": 0.01038748, + "balance_loss_clip": 1.04681182, + "balance_loss_mlp": 1.02513456, + "epoch": 0.34474673079813617, + "flos": 27264283512480.0, + "grad_norm": 1.7078134801124887, + "language_loss": 0.83405137, + "learning_rate": 3.047727069167207e-06, + "loss": 0.85575521, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.13616943, + "step": 5734, + "time_per_iteration": 2.623455762863159 + }, + { + "auxiliary_loss_clip": 0.0112988, + "auxiliary_loss_mlp": 0.01030939, + "balance_loss_clip": 1.04459226, + "balance_loss_mlp": 1.01709867, + "epoch": 0.34480685405080413, + "flos": 33763948757280.0, + "grad_norm": 1.9486292092012285, + "language_loss": 0.92837238, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.94998062, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.13842773, + "step": 5735, + "time_per_iteration": 2.735480308532715 + }, + { + "auxiliary_loss_clip": 0.01135244, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.04874444, + "balance_loss_mlp": 1.02749825, + "epoch": 0.3448669773034721, + "flos": 27400969764000.0, + "grad_norm": 1.7601921504536764, + "language_loss": 0.77144945, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.79322296, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.1461792, + "step": 5736, + "time_per_iteration": 2.6691601276397705 + }, + { + "auxiliary_loss_clip": 0.01135541, + "auxiliary_loss_mlp": 0.01041641, + "balance_loss_clip": 1.04873824, + "balance_loss_mlp": 1.02749658, + "epoch": 0.34492710055614006, + "flos": 30427543408320.0, + "grad_norm": 1.6631230835233435, + "language_loss": 0.78876448, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.81053632, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.14135742, + "step": 5737, + "time_per_iteration": 2.784231185913086 + }, + { + "auxiliary_loss_clip": 0.01138309, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.04705548, + "balance_loss_mlp": 1.02066827, + "epoch": 0.34498722380880803, + "flos": 24551390714400.0, + "grad_norm": 2.397587925389903, + "language_loss": 0.71072829, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73246479, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.14660645, + "step": 5738, + "time_per_iteration": 4.162651538848877 + }, + { + "auxiliary_loss_clip": 0.0113405, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.045789, + "balance_loss_mlp": 1.02302635, + "epoch": 0.34504734706147605, + "flos": 34700939568000.0, + "grad_norm": 2.6809972431544264, + "language_loss": 0.82068342, + "learning_rate": 3.046067851209389e-06, + "loss": 0.84239423, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.14007568, + "step": 5739, + "time_per_iteration": 4.039295434951782 + }, + { + "auxiliary_loss_clip": 0.01136058, + "auxiliary_loss_mlp": 0.010364, + "balance_loss_clip": 1.04895866, + "balance_loss_mlp": 1.02246428, + "epoch": 0.345107470314144, + "flos": 27667575881280.0, + "grad_norm": 2.1512632838259385, + "language_loss": 0.83015859, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.85188317, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.1394043, + "step": 5740, + "time_per_iteration": 2.724879026412964 + }, + { + "auxiliary_loss_clip": 0.01132637, + "auxiliary_loss_mlp": 0.01034826, + "balance_loss_clip": 1.04722846, + "balance_loss_mlp": 1.02024639, + "epoch": 0.345167593566812, + "flos": 25173687746880.0, + "grad_norm": 2.1292977592759836, + "language_loss": 0.76925665, + "learning_rate": 3.045403886269181e-06, + "loss": 0.79093128, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.14569092, + "step": 5741, + "time_per_iteration": 4.187136173248291 + }, + { + "auxiliary_loss_clip": 0.01133487, + "auxiliary_loss_mlp": 0.01034332, + "balance_loss_clip": 1.0447458, + "balance_loss_mlp": 1.0207299, + "epoch": 0.34522771681947995, + "flos": 31987155389760.0, + "grad_norm": 1.5485002233281318, + "language_loss": 0.77299494, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79467309, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.13604736, + "step": 5742, + "time_per_iteration": 2.7588253021240234 + }, + { + "auxiliary_loss_clip": 0.01131702, + "auxiliary_loss_mlp": 0.01038033, + "balance_loss_clip": 1.04634523, + "balance_loss_mlp": 1.02409148, + "epoch": 0.3452878400721479, + "flos": 23259802955040.0, + "grad_norm": 2.2913792847068195, + "language_loss": 0.76147544, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.78317285, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.1394043, + "step": 5743, + "time_per_iteration": 2.6335983276367188 + }, + { + "auxiliary_loss_clip": 0.01128906, + "auxiliary_loss_mlp": 0.01037497, + "balance_loss_clip": 1.04481304, + "balance_loss_mlp": 1.02395511, + "epoch": 0.3453479633248159, + "flos": 34080344261280.0, + "grad_norm": 1.8821588483745024, + "language_loss": 0.70520604, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72687006, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.13537598, + "step": 5744, + "time_per_iteration": 4.276110887527466 + }, + { + "auxiliary_loss_clip": 0.01129321, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.04550457, + "balance_loss_mlp": 1.01972294, + "epoch": 0.34540808657748384, + "flos": 23923664366400.0, + "grad_norm": 1.621778952035525, + "language_loss": 0.7955128, + "learning_rate": 3.044075480787665e-06, + "loss": 0.81714338, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.14007568, + "step": 5745, + "time_per_iteration": 2.7648701667785645 + }, + { + "auxiliary_loss_clip": 0.01134487, + "auxiliary_loss_mlp": 0.01035849, + "balance_loss_clip": 1.04602635, + "balance_loss_mlp": 1.02065015, + "epoch": 0.3454682098301518, + "flos": 24906392835840.0, + "grad_norm": 1.829730860618878, + "language_loss": 0.8916328, + "learning_rate": 3.043743280407182e-06, + "loss": 0.91333616, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.15197754, + "step": 5746, + "time_per_iteration": 2.720305919647217 + }, + { + "auxiliary_loss_clip": 0.011353, + "auxiliary_loss_mlp": 0.01034785, + "balance_loss_clip": 1.04608774, + "balance_loss_mlp": 1.01999164, + "epoch": 0.34552833308281977, + "flos": 26020945483200.0, + "grad_norm": 2.562536573337185, + "language_loss": 0.64389694, + "learning_rate": 3.043411040447849e-06, + "loss": 0.6655978, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.14794922, + "step": 5747, + "time_per_iteration": 2.6913366317749023 + }, + { + "auxiliary_loss_clip": 0.01131911, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.04686189, + "balance_loss_mlp": 1.01627409, + "epoch": 0.34558845633548774, + "flos": 44226043973280.0, + "grad_norm": 1.6316633616486116, + "language_loss": 0.73453021, + "learning_rate": 3.043078760922264e-06, + "loss": 0.75614774, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.13562012, + "step": 5748, + "time_per_iteration": 2.73396897315979 + }, + { + "auxiliary_loss_clip": 0.01129277, + "auxiliary_loss_mlp": 0.01032475, + "balance_loss_clip": 1.04708779, + "balance_loss_mlp": 1.01968408, + "epoch": 0.3456485795881557, + "flos": 27394811137440.0, + "grad_norm": 1.640627510619686, + "language_loss": 0.75154352, + "learning_rate": 3.042746441843029e-06, + "loss": 0.77316105, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.12805176, + "step": 5749, + "time_per_iteration": 2.6970505714416504 + }, + { + "auxiliary_loss_clip": 0.01040164, + "auxiliary_loss_mlp": 0.0100587, + "balance_loss_clip": 1.01277113, + "balance_loss_mlp": 1.00415671, + "epoch": 0.34570870284082367, + "flos": 75657668457600.0, + "grad_norm": 0.8827537731778565, + "language_loss": 0.62688804, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.6473484, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.27368164, + "router_z_loss_mlp": 0.01715088, + "step": 5750, + "time_per_iteration": 3.130634069442749 + }, + { + "auxiliary_loss_clip": 0.01126923, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.04589617, + "balance_loss_mlp": 1.01930428, + "epoch": 0.34576882609349163, + "flos": 27799197472800.0, + "grad_norm": 1.8526522645307604, + "language_loss": 0.80671275, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82830667, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.13153076, + "step": 5751, + "time_per_iteration": 2.713709831237793 + }, + { + "auxiliary_loss_clip": 0.01126963, + "auxiliary_loss_mlp": 0.01044077, + "balance_loss_clip": 1.04420853, + "balance_loss_mlp": 1.03039145, + "epoch": 0.34582894934615965, + "flos": 15067445515200.0, + "grad_norm": 2.534604860059577, + "language_loss": 0.84219784, + "learning_rate": 3.041749247409439e-06, + "loss": 0.86390823, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.13684082, + "step": 5752, + "time_per_iteration": 2.754694700241089 + }, + { + "auxiliary_loss_clip": 0.01039983, + "auxiliary_loss_mlp": 0.01001189, + "balance_loss_clip": 1.01257849, + "balance_loss_mlp": 0.99953818, + "epoch": 0.3458890725988276, + "flos": 85620133913760.0, + "grad_norm": 0.7332757790224048, + "language_loss": 0.63102776, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.65143949, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.27392578, + "router_z_loss_mlp": 0.01654053, + "step": 5753, + "time_per_iteration": 3.224560022354126 + }, + { + "auxiliary_loss_clip": 0.01130127, + "auxiliary_loss_mlp": 0.01036813, + "balance_loss_clip": 1.0457418, + "balance_loss_mlp": 1.02243686, + "epoch": 0.3459491958514956, + "flos": 20856093585120.0, + "grad_norm": 1.9291280960529757, + "language_loss": 0.71082741, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.73249686, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.14379883, + "step": 5754, + "time_per_iteration": 2.691828727722168 + }, + { + "auxiliary_loss_clip": 0.01133181, + "auxiliary_loss_mlp": 0.01036554, + "balance_loss_clip": 1.04546404, + "balance_loss_mlp": 1.02255905, + "epoch": 0.34600931910416355, + "flos": 20317533069600.0, + "grad_norm": 2.0825921750088887, + "language_loss": 0.72795379, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.74965113, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.13995361, + "step": 5755, + "time_per_iteration": 2.6788625717163086 + }, + { + "auxiliary_loss_clip": 0.01127686, + "auxiliary_loss_mlp": 0.01033083, + "balance_loss_clip": 1.04494059, + "balance_loss_mlp": 1.01967776, + "epoch": 0.3460694423568315, + "flos": 47035436231520.0, + "grad_norm": 2.0207271733271837, + "language_loss": 0.72449607, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74610376, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13391113, + "step": 5756, + "time_per_iteration": 2.903113603591919 + }, + { + "auxiliary_loss_clip": 0.0103909, + "auxiliary_loss_mlp": 0.01005619, + "balance_loss_clip": 1.01184583, + "balance_loss_mlp": 1.00394273, + "epoch": 0.3461295656094995, + "flos": 87953963676480.0, + "grad_norm": 0.7849849815657154, + "language_loss": 0.6262697, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64671671, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.27270508, + "router_z_loss_mlp": 0.01679993, + "step": 5757, + "time_per_iteration": 3.300462007522583 + }, + { + "auxiliary_loss_clip": 0.01038761, + "auxiliary_loss_mlp": 0.01007578, + "balance_loss_clip": 1.0116322, + "balance_loss_mlp": 1.00595331, + "epoch": 0.34618968886216744, + "flos": 79875643328640.0, + "grad_norm": 0.8214693743025796, + "language_loss": 0.5921725, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61263591, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.27172852, + "router_z_loss_mlp": 0.0162735, + "step": 5758, + "time_per_iteration": 3.2828986644744873 + }, + { + "auxiliary_loss_clip": 0.01133573, + "auxiliary_loss_mlp": 0.01040601, + "balance_loss_clip": 1.0496068, + "balance_loss_mlp": 1.02772617, + "epoch": 0.3462498121148354, + "flos": 28641876756480.0, + "grad_norm": 1.8613076414170422, + "language_loss": 0.71759403, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.73933578, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.12860107, + "step": 5759, + "time_per_iteration": 2.782972812652588 + }, + { + "auxiliary_loss_clip": 0.01127777, + "auxiliary_loss_mlp": 0.01046446, + "balance_loss_clip": 1.04435217, + "balance_loss_mlp": 1.03243923, + "epoch": 0.3463099353675034, + "flos": 29493469841760.0, + "grad_norm": 1.6526277990470262, + "language_loss": 0.83394396, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.85568619, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.13995361, + "step": 5760, + "time_per_iteration": 2.665524482727051 + }, + { + "auxiliary_loss_clip": 0.01038153, + "auxiliary_loss_mlp": 0.01004248, + "balance_loss_clip": 1.01108062, + "balance_loss_mlp": 1.00262356, + "epoch": 0.34637005862017134, + "flos": 77729342653440.0, + "grad_norm": 0.8268759215592881, + "language_loss": 0.56625426, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58667827, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.27050781, + "router_z_loss_mlp": 0.01625824, + "step": 5761, + "time_per_iteration": 3.368582248687744 + }, + { + "auxiliary_loss_clip": 0.01127999, + "auxiliary_loss_mlp": 0.01040796, + "balance_loss_clip": 1.04440331, + "balance_loss_mlp": 1.02746809, + "epoch": 0.3464301818728393, + "flos": 16040206733760.0, + "grad_norm": 2.0974387563887875, + "language_loss": 0.95096827, + "learning_rate": 3.038422700166474e-06, + "loss": 0.97265619, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13342285, + "step": 5762, + "time_per_iteration": 2.629807949066162 + }, + { + "auxiliary_loss_clip": 0.01131381, + "auxiliary_loss_mlp": 0.0103655, + "balance_loss_clip": 1.04352713, + "balance_loss_mlp": 1.02235198, + "epoch": 0.34649030512550727, + "flos": 35771051109600.0, + "grad_norm": 1.7849787877977266, + "language_loss": 0.69479144, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.71647072, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 0.87744141, + "router_z_loss_mlp": 0.1418457, + "step": 5763, + "time_per_iteration": 2.7717928886413574 + }, + { + "auxiliary_loss_clip": 0.01137086, + "auxiliary_loss_mlp": 0.01041908, + "balance_loss_clip": 1.04867959, + "balance_loss_mlp": 1.02638721, + "epoch": 0.34655042837817523, + "flos": 28956854155680.0, + "grad_norm": 1.9478775162726294, + "language_loss": 0.8356477, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.85743761, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.15527344, + "step": 5764, + "time_per_iteration": 2.7004148960113525 + }, + { + "auxiliary_loss_clip": 0.011323, + "auxiliary_loss_mlp": 0.01035854, + "balance_loss_clip": 1.04690182, + "balance_loss_mlp": 1.02258039, + "epoch": 0.34661055163084326, + "flos": 26910699495840.0, + "grad_norm": 2.360502962192501, + "language_loss": 0.67321992, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.69490147, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.13262939, + "step": 5765, + "time_per_iteration": 2.7252049446105957 + }, + { + "auxiliary_loss_clip": 0.01136802, + "auxiliary_loss_mlp": 0.01039498, + "balance_loss_clip": 1.05261278, + "balance_loss_mlp": 1.02527034, + "epoch": 0.3466706748835112, + "flos": 26599490203680.0, + "grad_norm": 2.0880073588711237, + "language_loss": 0.77176321, + "learning_rate": 3.03709097800413e-06, + "loss": 0.79352623, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.14245605, + "step": 5766, + "time_per_iteration": 2.825493097305298 + }, + { + "auxiliary_loss_clip": 0.0113005, + "auxiliary_loss_mlp": 0.0103824, + "balance_loss_clip": 1.04677868, + "balance_loss_mlp": 1.02519202, + "epoch": 0.3467307981361792, + "flos": 23747277530880.0, + "grad_norm": 1.933812461577206, + "language_loss": 0.73383325, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75551617, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.13043213, + "step": 5767, + "time_per_iteration": 2.7668182849884033 + }, + { + "auxiliary_loss_clip": 0.01134251, + "auxiliary_loss_mlp": 0.01043438, + "balance_loss_clip": 1.04979014, + "balance_loss_mlp": 1.02889395, + "epoch": 0.34679092138884715, + "flos": 30294787332960.0, + "grad_norm": 1.809887307566913, + "language_loss": 0.77650893, + "learning_rate": 3.036424880912893e-06, + "loss": 0.79828584, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.14550781, + "step": 5768, + "time_per_iteration": 2.7735962867736816 + }, + { + "auxiliary_loss_clip": 0.01038806, + "auxiliary_loss_mlp": 0.01003101, + "balance_loss_clip": 1.01183212, + "balance_loss_mlp": 1.00148535, + "epoch": 0.3468510446415151, + "flos": 77161575529440.0, + "grad_norm": 0.7661086294973615, + "language_loss": 0.57529819, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59571731, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.01617432, + "step": 5769, + "time_per_iteration": 3.315190315246582 + }, + { + "auxiliary_loss_clip": 0.01141346, + "auxiliary_loss_mlp": 0.01038614, + "balance_loss_clip": 1.0486939, + "balance_loss_mlp": 1.02281857, + "epoch": 0.3469111678941831, + "flos": 14789940249600.0, + "grad_norm": 22.774608479607195, + "language_loss": 0.85675323, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.87855285, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 0.92529297, + "router_z_loss_mlp": 0.15783691, + "step": 5770, + "time_per_iteration": 2.665475368499756 + }, + { + "auxiliary_loss_clip": 0.01038215, + "auxiliary_loss_mlp": 0.01000877, + "balance_loss_clip": 1.01139367, + "balance_loss_mlp": 0.99932539, + "epoch": 0.34697129114685105, + "flos": 80454107014560.0, + "grad_norm": 0.7680002946496245, + "language_loss": 0.59765035, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.61804128, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.26879883, + "router_z_loss_mlp": 0.01552582, + "step": 5771, + "time_per_iteration": 3.0982608795166016 + }, + { + "auxiliary_loss_clip": 0.01132917, + "auxiliary_loss_mlp": 0.01043168, + "balance_loss_clip": 1.04869115, + "balance_loss_mlp": 1.02934587, + "epoch": 0.347031414399519, + "flos": 42042595302720.0, + "grad_norm": 1.7707667328745258, + "language_loss": 0.71529889, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.73705971, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.13824463, + "step": 5772, + "time_per_iteration": 2.7707810401916504 + }, + { + "auxiliary_loss_clip": 0.01132745, + "auxiliary_loss_mlp": 0.01036994, + "balance_loss_clip": 1.0471766, + "balance_loss_mlp": 1.02247465, + "epoch": 0.347091537652187, + "flos": 32877679230720.0, + "grad_norm": 1.4859756352646634, + "language_loss": 0.76238191, + "learning_rate": 3.034758950632507e-06, + "loss": 0.78407931, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.14520264, + "step": 5773, + "time_per_iteration": 2.756610631942749 + }, + { + "auxiliary_loss_clip": 0.01134197, + "auxiliary_loss_mlp": 0.01040805, + "balance_loss_clip": 1.04783809, + "balance_loss_mlp": 1.02605283, + "epoch": 0.34715166090485494, + "flos": 25797808056960.0, + "grad_norm": 2.3134248439871423, + "language_loss": 0.70039332, + "learning_rate": 3.034425646811396e-06, + "loss": 0.72214329, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.14764404, + "step": 5774, + "time_per_iteration": 2.6990110874176025 + }, + { + "auxiliary_loss_clip": 0.01130086, + "auxiliary_loss_mlp": 0.01038591, + "balance_loss_clip": 1.04796767, + "balance_loss_mlp": 1.0250783, + "epoch": 0.3472117841575229, + "flos": 28648035383040.0, + "grad_norm": 1.681993858644423, + "language_loss": 0.76255918, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.78424597, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.13513184, + "step": 5775, + "time_per_iteration": 2.7353529930114746 + }, + { + "auxiliary_loss_clip": 0.01135259, + "auxiliary_loss_mlp": 0.01039079, + "balance_loss_clip": 1.04665589, + "balance_loss_mlp": 1.02458882, + "epoch": 0.34727190741019087, + "flos": 21344581092960.0, + "grad_norm": 1.9651052613400148, + "language_loss": 0.77624798, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.79799128, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 0.88525391, + "router_z_loss_mlp": 0.1449585, + "step": 5776, + "time_per_iteration": 2.6391022205352783 + }, + { + "auxiliary_loss_clip": 0.01040755, + "auxiliary_loss_mlp": 0.01000589, + "balance_loss_clip": 1.01391602, + "balance_loss_mlp": 0.99896169, + "epoch": 0.34733203066285884, + "flos": 79635940685280.0, + "grad_norm": 0.8301228345833476, + "language_loss": 0.6336695, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65408295, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.26879883, + "router_z_loss_mlp": 0.01629639, + "step": 5777, + "time_per_iteration": 3.3707029819488525 + }, + { + "auxiliary_loss_clip": 0.01134395, + "auxiliary_loss_mlp": 0.01038585, + "balance_loss_clip": 1.04876077, + "balance_loss_mlp": 1.02448869, + "epoch": 0.3473921539155268, + "flos": 34969976722080.0, + "grad_norm": 2.0175696059832027, + "language_loss": 0.64448917, + "learning_rate": 3.033092039398119e-06, + "loss": 0.666219, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.14093018, + "step": 5778, + "time_per_iteration": 4.188487529754639 + }, + { + "auxiliary_loss_clip": 0.01135369, + "auxiliary_loss_mlp": 0.01045644, + "balance_loss_clip": 1.04802835, + "balance_loss_mlp": 1.0316906, + "epoch": 0.3474522771681948, + "flos": 49831457787360.0, + "grad_norm": 3.1442571111342503, + "language_loss": 0.71541524, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73722529, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.13946533, + "step": 5779, + "time_per_iteration": 4.229124307632446 + }, + { + "auxiliary_loss_clip": 0.01139495, + "auxiliary_loss_mlp": 0.01040137, + "balance_loss_clip": 1.05040932, + "balance_loss_mlp": 1.02601695, + "epoch": 0.3475124004208628, + "flos": 30027695008320.0, + "grad_norm": 1.989028274712147, + "language_loss": 0.6226939, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.64449024, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 0.89013672, + "router_z_loss_mlp": 0.14129639, + "step": 5780, + "time_per_iteration": 2.689039707183838 + }, + { + "auxiliary_loss_clip": 0.01135647, + "auxiliary_loss_mlp": 0.0103683, + "balance_loss_clip": 1.05013263, + "balance_loss_mlp": 1.02294207, + "epoch": 0.34757252367353075, + "flos": 27712503194400.0, + "grad_norm": 1.858161902289922, + "language_loss": 0.72203827, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74376303, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.13867188, + "step": 5781, + "time_per_iteration": 4.171799421310425 + }, + { + "auxiliary_loss_clip": 0.01138031, + "auxiliary_loss_mlp": 0.01040127, + "balance_loss_clip": 1.05025196, + "balance_loss_mlp": 1.02521944, + "epoch": 0.3476326469261987, + "flos": 24194362728960.0, + "grad_norm": 2.395942632253251, + "language_loss": 0.77166057, + "learning_rate": 3.031757805185612e-06, + "loss": 0.79344213, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 0.87744141, + "router_z_loss_mlp": 0.14898682, + "step": 5782, + "time_per_iteration": 2.8162424564361572 + }, + { + "auxiliary_loss_clip": 0.01134111, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.04925966, + "balance_loss_mlp": 1.01710045, + "epoch": 0.3476927701788667, + "flos": 24328455874560.0, + "grad_norm": 2.8231708258834245, + "language_loss": 0.62450981, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64615917, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.13726807, + "step": 5783, + "time_per_iteration": 2.6994378566741943 + }, + { + "auxiliary_loss_clip": 0.0112958, + "auxiliary_loss_mlp": 0.0102909, + "balance_loss_clip": 1.04721177, + "balance_loss_mlp": 1.01630473, + "epoch": 0.34775289343153465, + "flos": 25301541231360.0, + "grad_norm": 1.693133790780575, + "language_loss": 0.8833189, + "learning_rate": 3.031090453282605e-06, + "loss": 0.90490568, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.12805176, + "step": 5784, + "time_per_iteration": 4.109288454055786 + }, + { + "auxiliary_loss_clip": 0.01132996, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.04963326, + "balance_loss_mlp": 1.02016413, + "epoch": 0.3478130166842026, + "flos": 23616668871360.0, + "grad_norm": 2.060057049627511, + "language_loss": 0.81211877, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.8337853, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13494873, + "step": 5785, + "time_per_iteration": 2.6382322311401367 + }, + { + "auxiliary_loss_clip": 0.0113676, + "auxiliary_loss_mlp": 0.01036745, + "balance_loss_clip": 1.05280781, + "balance_loss_mlp": 1.023507, + "epoch": 0.3478731399368706, + "flos": 26907296044320.0, + "grad_norm": 1.8441684422835483, + "language_loss": 0.80444264, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82617772, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.13238525, + "step": 5786, + "time_per_iteration": 2.7832865715026855 + }, + { + "auxiliary_loss_clip": 0.0113228, + "auxiliary_loss_mlp": 0.0103287, + "balance_loss_clip": 1.04959428, + "balance_loss_mlp": 1.01878548, + "epoch": 0.34793326318953854, + "flos": 22013952854400.0, + "grad_norm": 2.004557614529276, + "language_loss": 0.7480194, + "learning_rate": 3.030089132216836e-06, + "loss": 0.7696709, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.14086914, + "step": 5787, + "time_per_iteration": 2.7154834270477295 + }, + { + "auxiliary_loss_clip": 0.01133121, + "auxiliary_loss_mlp": 0.01034662, + "balance_loss_clip": 1.0473696, + "balance_loss_mlp": 1.02072072, + "epoch": 0.3479933864422065, + "flos": 35770483867680.0, + "grad_norm": 2.1680331539717783, + "language_loss": 0.81303799, + "learning_rate": 3.029755280389203e-06, + "loss": 0.83471584, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.13952637, + "step": 5788, + "time_per_iteration": 2.7937684059143066 + }, + { + "auxiliary_loss_clip": 0.01139192, + "auxiliary_loss_mlp": 0.01035182, + "balance_loss_clip": 1.05136538, + "balance_loss_mlp": 1.0210557, + "epoch": 0.3480535096948745, + "flos": 24551269162560.0, + "grad_norm": 2.0763570710829304, + "language_loss": 0.85567504, + "learning_rate": 3.029421389513147e-06, + "loss": 0.87741876, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.14123535, + "step": 5789, + "time_per_iteration": 2.7184884548187256 + }, + { + "auxiliary_loss_clip": 0.01139751, + "auxiliary_loss_mlp": 0.0104975, + "balance_loss_clip": 1.05229235, + "balance_loss_mlp": 1.03585064, + "epoch": 0.34811363294754244, + "flos": 22633575746400.0, + "grad_norm": 1.9561843224046112, + "language_loss": 0.84966177, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87155676, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.13903809, + "step": 5790, + "time_per_iteration": 2.669304847717285 + }, + { + "auxiliary_loss_clip": 0.01137161, + "auxiliary_loss_mlp": 0.01040888, + "balance_loss_clip": 1.05245543, + "balance_loss_mlp": 1.02685142, + "epoch": 0.3481737562002104, + "flos": 32787662535360.0, + "grad_norm": 2.5415398457978435, + "language_loss": 0.81171435, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.83349484, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.14050293, + "step": 5791, + "time_per_iteration": 2.7550976276397705 + }, + { + "auxiliary_loss_clip": 0.01135687, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.04823864, + "balance_loss_mlp": 1.01855707, + "epoch": 0.3482338794528784, + "flos": 35275473077760.0, + "grad_norm": 3.384073380690846, + "language_loss": 0.77879053, + "learning_rate": 3.028419482721056e-06, + "loss": 0.80047011, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 0.875, + "router_z_loss_mlp": 0.1373291, + "step": 5792, + "time_per_iteration": 2.745999813079834 + }, + { + "auxiliary_loss_clip": 0.01130827, + "auxiliary_loss_mlp": 0.01029419, + "balance_loss_clip": 1.04684532, + "balance_loss_mlp": 1.01607966, + "epoch": 0.3482940027055464, + "flos": 27089476850880.0, + "grad_norm": 1.5963089887056625, + "language_loss": 0.81637597, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.83797842, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13336182, + "step": 5793, + "time_per_iteration": 2.7377259731292725 + }, + { + "auxiliary_loss_clip": 0.01134711, + "auxiliary_loss_mlp": 0.01046196, + "balance_loss_clip": 1.04928434, + "balance_loss_mlp": 1.03155744, + "epoch": 0.34835412595821436, + "flos": 24773393656800.0, + "grad_norm": 1.7963687311958954, + "language_loss": 0.76493776, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78674686, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.14648438, + "step": 5794, + "time_per_iteration": 2.626542806625366 + }, + { + "auxiliary_loss_clip": 0.01134538, + "auxiliary_loss_mlp": 0.01038885, + "balance_loss_clip": 1.05016994, + "balance_loss_mlp": 1.02491415, + "epoch": 0.3484142492108823, + "flos": 24952657219200.0, + "grad_norm": 8.988032316133365, + "language_loss": 0.57421708, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59595132, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.13977051, + "step": 5795, + "time_per_iteration": 2.6782827377319336 + }, + { + "auxiliary_loss_clip": 0.01130693, + "auxiliary_loss_mlp": 0.01034121, + "balance_loss_clip": 1.04805171, + "balance_loss_mlp": 1.02045405, + "epoch": 0.3484743724635503, + "flos": 29715756405120.0, + "grad_norm": 2.043777523469556, + "language_loss": 0.82591683, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.847565, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13665771, + "step": 5796, + "time_per_iteration": 2.658932685852051 + }, + { + "auxiliary_loss_clip": 0.01129467, + "auxiliary_loss_mlp": 0.01031041, + "balance_loss_clip": 1.04828608, + "balance_loss_mlp": 1.01782072, + "epoch": 0.34853449571621825, + "flos": 29715837439680.0, + "grad_norm": 1.6551389308127902, + "language_loss": 0.83100379, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.8526088, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.13226318, + "step": 5797, + "time_per_iteration": 2.7163994312286377 + }, + { + "auxiliary_loss_clip": 0.01129781, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.04790318, + "balance_loss_mlp": 1.01985466, + "epoch": 0.3485946189688862, + "flos": 33272949178080.0, + "grad_norm": 1.5896901141674606, + "language_loss": 0.73496997, + "learning_rate": 3.026414616539167e-06, + "loss": 0.75660592, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13952637, + "step": 5798, + "time_per_iteration": 2.6765501499176025 + }, + { + "auxiliary_loss_clip": 0.0113305, + "auxiliary_loss_mlp": 0.01039267, + "balance_loss_clip": 1.04738307, + "balance_loss_mlp": 1.02520061, + "epoch": 0.3486547422215542, + "flos": 24595224060960.0, + "grad_norm": 2.176671006493139, + "language_loss": 0.76543665, + "learning_rate": 3.026080335875485e-06, + "loss": 0.78715986, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.14044189, + "step": 5799, + "time_per_iteration": 2.6879184246063232 + }, + { + "auxiliary_loss_clip": 0.01130923, + "auxiliary_loss_mlp": 0.01030902, + "balance_loss_clip": 1.04769874, + "balance_loss_mlp": 1.01797378, + "epoch": 0.34871486547422215, + "flos": 24685038169920.0, + "grad_norm": 1.8405764959848347, + "language_loss": 0.7570129, + "learning_rate": 3.025746016302734e-06, + "loss": 0.77863115, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.12939453, + "step": 5800, + "time_per_iteration": 2.709359645843506 + }, + { + "auxiliary_loss_clip": 0.01138152, + "auxiliary_loss_mlp": 0.01037511, + "balance_loss_clip": 1.04983521, + "balance_loss_mlp": 1.02265787, + "epoch": 0.3487749887268901, + "flos": 53755767348480.0, + "grad_norm": 1.7330317708710479, + "language_loss": 0.66749781, + "learning_rate": 3.025411657833591e-06, + "loss": 0.68925446, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.1484375, + "step": 5801, + "time_per_iteration": 2.8375585079193115 + }, + { + "auxiliary_loss_clip": 0.01130898, + "auxiliary_loss_mlp": 0.01039808, + "balance_loss_clip": 1.04833281, + "balance_loss_mlp": 1.02609253, + "epoch": 0.3488351119795581, + "flos": 28424938474080.0, + "grad_norm": 1.8171010249252884, + "language_loss": 0.76546925, + "learning_rate": 3.025077260480735e-06, + "loss": 0.78717625, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.13720703, + "step": 5802, + "time_per_iteration": 2.725022554397583 + }, + { + "auxiliary_loss_clip": 0.01125556, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.04535341, + "balance_loss_mlp": 1.02346277, + "epoch": 0.34889523523222604, + "flos": 24324768802080.0, + "grad_norm": 3.6938216269013684, + "language_loss": 0.78627455, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.80789834, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13360596, + "step": 5803, + "time_per_iteration": 2.642366647720337 + }, + { + "auxiliary_loss_clip": 0.01130918, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.04449964, + "balance_loss_mlp": 1.01902294, + "epoch": 0.348955358484894, + "flos": 37151196942240.0, + "grad_norm": 2.257963169071295, + "language_loss": 0.67881191, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.70044369, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.13232422, + "step": 5804, + "time_per_iteration": 2.7758235931396484 + }, + { + "auxiliary_loss_clip": 0.01126888, + "auxiliary_loss_mlp": 0.01039271, + "balance_loss_clip": 1.04721475, + "balance_loss_mlp": 1.02554965, + "epoch": 0.349015481737562, + "flos": 21963798812160.0, + "grad_norm": 1.9618363493742121, + "language_loss": 0.75998807, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78164971, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13720703, + "step": 5805, + "time_per_iteration": 2.675077438354492 + }, + { + "auxiliary_loss_clip": 0.01130441, + "auxiliary_loss_mlp": 0.0103297, + "balance_loss_clip": 1.04726148, + "balance_loss_mlp": 1.01907587, + "epoch": 0.34907560499023, + "flos": 33188159211840.0, + "grad_norm": 2.4538628914421126, + "language_loss": 0.67508996, + "learning_rate": 3.023739282485814e-06, + "loss": 0.696724, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13897705, + "step": 5806, + "time_per_iteration": 2.726032257080078 + }, + { + "auxiliary_loss_clip": 0.01131854, + "auxiliary_loss_mlp": 0.0103894, + "balance_loss_clip": 1.04816818, + "balance_loss_mlp": 1.025249, + "epoch": 0.34913572824289796, + "flos": 36883618410240.0, + "grad_norm": 1.4721275713721067, + "language_loss": 0.72206497, + "learning_rate": 3.023404690904629e-06, + "loss": 0.74377298, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.13690186, + "step": 5807, + "time_per_iteration": 2.7062361240386963 + }, + { + "auxiliary_loss_clip": 0.01130441, + "auxiliary_loss_mlp": 0.01035316, + "balance_loss_clip": 1.04470301, + "balance_loss_mlp": 1.02133298, + "epoch": 0.3491958514955659, + "flos": 36572449635360.0, + "grad_norm": 1.9130543249387824, + "language_loss": 0.74288213, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.76453966, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.1395874, + "step": 5808, + "time_per_iteration": 2.769681692123413 + }, + { + "auxiliary_loss_clip": 0.01128406, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.04845798, + "balance_loss_mlp": 1.02637148, + "epoch": 0.3492559747482339, + "flos": 27800412991200.0, + "grad_norm": 1.9901838233809728, + "language_loss": 0.84299552, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86467236, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12902832, + "step": 5809, + "time_per_iteration": 2.7007107734680176 + }, + { + "auxiliary_loss_clip": 0.01125293, + "auxiliary_loss_mlp": 0.01031905, + "balance_loss_clip": 1.04622698, + "balance_loss_mlp": 1.019382, + "epoch": 0.34931609800090185, + "flos": 31808621138400.0, + "grad_norm": 2.6606543326274856, + "language_loss": 0.80529249, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.82686448, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12524414, + "step": 5810, + "time_per_iteration": 2.757446765899658 + }, + { + "auxiliary_loss_clip": 0.01128867, + "auxiliary_loss_mlp": 0.01039579, + "balance_loss_clip": 1.04579735, + "balance_loss_mlp": 1.02716887, + "epoch": 0.3493762212535698, + "flos": 35681804242560.0, + "grad_norm": 1.7824648348981886, + "language_loss": 0.75275564, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.77444011, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.12408447, + "step": 5811, + "time_per_iteration": 2.743803024291992 + }, + { + "auxiliary_loss_clip": 0.01130422, + "auxiliary_loss_mlp": 0.01034683, + "balance_loss_clip": 1.04527974, + "balance_loss_mlp": 1.02162981, + "epoch": 0.3494363445062378, + "flos": 33099722690400.0, + "grad_norm": 1.5871344303935104, + "language_loss": 0.79862911, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82028019, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.1307373, + "step": 5812, + "time_per_iteration": 2.8025963306427 + }, + { + "auxiliary_loss_clip": 0.0112554, + "auxiliary_loss_mlp": 0.01037227, + "balance_loss_clip": 1.04198897, + "balance_loss_mlp": 1.02364886, + "epoch": 0.34949646775890575, + "flos": 14978806407360.0, + "grad_norm": 2.048173784053399, + "language_loss": 0.69530284, + "learning_rate": 3.021396326901918e-06, + "loss": 0.71693051, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.13580322, + "step": 5813, + "time_per_iteration": 2.656705856323242 + }, + { + "auxiliary_loss_clip": 0.01125576, + "auxiliary_loss_mlp": 0.01036074, + "balance_loss_clip": 1.04455423, + "balance_loss_mlp": 1.023211, + "epoch": 0.3495565910115737, + "flos": 20945097348480.0, + "grad_norm": 2.0702214800796255, + "language_loss": 0.76283646, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.78445292, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.12860107, + "step": 5814, + "time_per_iteration": 2.685314655303955 + }, + { + "auxiliary_loss_clip": 0.01128355, + "auxiliary_loss_mlp": 0.01037994, + "balance_loss_clip": 1.04417813, + "balance_loss_mlp": 1.02376068, + "epoch": 0.3496167142642417, + "flos": 32294191402080.0, + "grad_norm": 2.0281315431372016, + "language_loss": 0.84261405, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86427754, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.14245605, + "step": 5815, + "time_per_iteration": 2.727665424346924 + }, + { + "auxiliary_loss_clip": 0.01126329, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.04326177, + "balance_loss_mlp": 1.01656818, + "epoch": 0.34967683751690964, + "flos": 21249904910400.0, + "grad_norm": 1.990772447318994, + "language_loss": 0.77098072, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.79253078, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.12097168, + "step": 5816, + "time_per_iteration": 2.7591755390167236 + }, + { + "auxiliary_loss_clip": 0.01131165, + "auxiliary_loss_mlp": 0.01039175, + "balance_loss_clip": 1.04674196, + "balance_loss_mlp": 1.02615178, + "epoch": 0.3497369607695776, + "flos": 27576870392160.0, + "grad_norm": 2.07080069881252, + "language_loss": 0.58925593, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.61095935, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.13024902, + "step": 5817, + "time_per_iteration": 4.065850257873535 + }, + { + "auxiliary_loss_clip": 0.01046622, + "auxiliary_loss_mlp": 0.01008822, + "balance_loss_clip": 1.01954651, + "balance_loss_mlp": 1.00736344, + "epoch": 0.34979708402224563, + "flos": 83620608292800.0, + "grad_norm": 0.8827648055623202, + "language_loss": 0.59904015, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.61959457, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.27099609, + "router_z_loss_mlp": 0.01457977, + "step": 5818, + "time_per_iteration": 4.583780527114868 + }, + { + "auxiliary_loss_clip": 0.0112555, + "auxiliary_loss_mlp": 0.01035017, + "balance_loss_clip": 1.04486859, + "balance_loss_mlp": 1.02158809, + "epoch": 0.3498572072749136, + "flos": 23171042295360.0, + "grad_norm": 1.7945553518232917, + "language_loss": 0.83266211, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85426772, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13439941, + "step": 5819, + "time_per_iteration": 2.638549566268921 + }, + { + "auxiliary_loss_clip": 0.01127085, + "auxiliary_loss_mlp": 0.0103093, + "balance_loss_clip": 1.04437327, + "balance_loss_mlp": 1.01800215, + "epoch": 0.34991733052758156, + "flos": 33945197666400.0, + "grad_norm": 1.8834869225475892, + "language_loss": 0.70802277, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.72960293, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.12915039, + "step": 5820, + "time_per_iteration": 2.6875553131103516 + }, + { + "auxiliary_loss_clip": 0.0112958, + "auxiliary_loss_mlp": 0.01033015, + "balance_loss_clip": 1.04497528, + "balance_loss_mlp": 1.02048004, + "epoch": 0.3499774537802495, + "flos": 40980425148000.0, + "grad_norm": 1.6455336667877298, + "language_loss": 0.70565426, + "learning_rate": 3.018716339744759e-06, + "loss": 0.7272802, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.12536621, + "step": 5821, + "time_per_iteration": 4.115598440170288 + }, + { + "auxiliary_loss_clip": 0.01136305, + "auxiliary_loss_mlp": 0.01042175, + "balance_loss_clip": 1.04879022, + "balance_loss_mlp": 1.02738738, + "epoch": 0.3500375770329175, + "flos": 28646617278240.0, + "grad_norm": 2.070475786242931, + "language_loss": 0.74102795, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.76281273, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.14801025, + "step": 5822, + "time_per_iteration": 2.6597530841827393 + }, + { + "auxiliary_loss_clip": 0.01130608, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.04550636, + "balance_loss_mlp": 1.01834583, + "epoch": 0.35009770028558546, + "flos": 23215483401120.0, + "grad_norm": 1.610351995737909, + "language_loss": 0.78241116, + "learning_rate": 3.018045956403094e-06, + "loss": 0.8040455, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.14483643, + "step": 5823, + "time_per_iteration": 4.061099052429199 + }, + { + "auxiliary_loss_clip": 0.01045594, + "auxiliary_loss_mlp": 0.01001905, + "balance_loss_clip": 1.01840997, + "balance_loss_mlp": 1.00042152, + "epoch": 0.3501578235382534, + "flos": 83403548458560.0, + "grad_norm": 0.7155066429532203, + "language_loss": 0.5924803, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61295533, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.27197266, + "router_z_loss_mlp": 0.01482391, + "step": 5824, + "time_per_iteration": 3.2930099964141846 + }, + { + "auxiliary_loss_clip": 0.01130471, + "auxiliary_loss_mlp": 0.01035429, + "balance_loss_clip": 1.04600608, + "balance_loss_mlp": 1.02172589, + "epoch": 0.3502179467909214, + "flos": 25930483097760.0, + "grad_norm": 2.07410267878854, + "language_loss": 0.84845614, + "learning_rate": 3.017375418643811e-06, + "loss": 0.87011516, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.13708496, + "step": 5825, + "time_per_iteration": 2.645210027694702 + }, + { + "auxiliary_loss_clip": 0.01130882, + "auxiliary_loss_mlp": 0.01036961, + "balance_loss_clip": 1.04684925, + "balance_loss_mlp": 1.02359128, + "epoch": 0.35027807004358935, + "flos": 14572596794400.0, + "grad_norm": 2.233112832316811, + "language_loss": 0.83530289, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.85698128, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.13354492, + "step": 5826, + "time_per_iteration": 2.66137433052063 + }, + { + "auxiliary_loss_clip": 0.01131382, + "auxiliary_loss_mlp": 0.01039652, + "balance_loss_clip": 1.04610717, + "balance_loss_mlp": 1.02579367, + "epoch": 0.3503381932962573, + "flos": 26198871975360.0, + "grad_norm": 2.1829985433826296, + "language_loss": 0.80793071, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.82964098, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.13848877, + "step": 5827, + "time_per_iteration": 2.7198076248168945 + }, + { + "auxiliary_loss_clip": 0.01131252, + "auxiliary_loss_mlp": 0.01040771, + "balance_loss_clip": 1.04760647, + "balance_loss_mlp": 1.02759802, + "epoch": 0.3503983165489253, + "flos": 25930726201440.0, + "grad_norm": 2.070059407440375, + "language_loss": 0.71067894, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.73239911, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13165283, + "step": 5828, + "time_per_iteration": 2.663832187652588 + }, + { + "auxiliary_loss_clip": 0.01136544, + "auxiliary_loss_mlp": 0.01044347, + "balance_loss_clip": 1.04944777, + "balance_loss_mlp": 1.02814686, + "epoch": 0.35045843980159325, + "flos": 33941632145760.0, + "grad_norm": 1.9751315180716011, + "language_loss": 0.79490137, + "learning_rate": 3.016033880279248e-06, + "loss": 0.81671023, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.16210938, + "step": 5829, + "time_per_iteration": 2.754985809326172 + }, + { + "auxiliary_loss_clip": 0.01138168, + "auxiliary_loss_mlp": 0.0104523, + "balance_loss_clip": 1.04959488, + "balance_loss_mlp": 1.03026342, + "epoch": 0.3505185630542612, + "flos": 31629479127840.0, + "grad_norm": 1.9258760683389133, + "language_loss": 0.72101808, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.74285203, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 0.88671875, + "router_z_loss_mlp": 0.14971924, + "step": 5830, + "time_per_iteration": 2.6832282543182373 + }, + { + "auxiliary_loss_clip": 0.01128426, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.04562712, + "balance_loss_mlp": 1.01782572, + "epoch": 0.35057868630692923, + "flos": 25040688567840.0, + "grad_norm": 2.784875488479076, + "language_loss": 0.8821665, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.9037686, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.1395874, + "step": 5831, + "time_per_iteration": 2.690098762512207 + }, + { + "auxiliary_loss_clip": 0.01131201, + "auxiliary_loss_mlp": 0.01038972, + "balance_loss_clip": 1.04613137, + "balance_loss_mlp": 1.02510762, + "epoch": 0.3506388095595972, + "flos": 24951482218080.0, + "grad_norm": 2.0126560950372854, + "language_loss": 0.78653681, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.80823851, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.13867188, + "step": 5832, + "time_per_iteration": 2.618821620941162 + }, + { + "auxiliary_loss_clip": 0.01132385, + "auxiliary_loss_mlp": 0.01037477, + "balance_loss_clip": 1.04555821, + "balance_loss_mlp": 1.02236152, + "epoch": 0.35069893281226516, + "flos": 28198883803680.0, + "grad_norm": 1.8982007606530074, + "language_loss": 0.70683539, + "learning_rate": 3.014691725465008e-06, + "loss": 0.72853398, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.15124512, + "step": 5833, + "time_per_iteration": 2.699990749359131 + }, + { + "auxiliary_loss_clip": 0.01127287, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.04622388, + "balance_loss_mlp": 1.01801753, + "epoch": 0.35075905606493313, + "flos": 33277811251680.0, + "grad_norm": 1.3767286102587077, + "language_loss": 0.80949056, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83107591, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13250732, + "step": 5834, + "time_per_iteration": 2.6801917552948 + }, + { + "auxiliary_loss_clip": 0.01131916, + "auxiliary_loss_mlp": 0.01037621, + "balance_loss_clip": 1.04732132, + "balance_loss_mlp": 1.02359056, + "epoch": 0.3508191793176011, + "flos": 23341270504320.0, + "grad_norm": 2.5794822963643926, + "language_loss": 0.83946264, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.86115801, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.14025879, + "step": 5835, + "time_per_iteration": 2.7641353607177734 + }, + { + "auxiliary_loss_clip": 0.01129268, + "auxiliary_loss_mlp": 0.01042269, + "balance_loss_clip": 1.04513431, + "balance_loss_mlp": 1.02840495, + "epoch": 0.35087930257026906, + "flos": 31185716346720.0, + "grad_norm": 1.7139940349256184, + "language_loss": 0.76719272, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.78890806, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.13873291, + "step": 5836, + "time_per_iteration": 2.7707624435424805 + }, + { + "auxiliary_loss_clip": 0.01132834, + "auxiliary_loss_mlp": 0.01039594, + "balance_loss_clip": 1.0488174, + "balance_loss_mlp": 1.02534854, + "epoch": 0.350939425822937, + "flos": 21968620368480.0, + "grad_norm": 2.2160837199694345, + "language_loss": 0.77736139, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.79908574, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.14245605, + "step": 5837, + "time_per_iteration": 2.72941255569458 + }, + { + "auxiliary_loss_clip": 0.01131482, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.04622662, + "balance_loss_mlp": 1.02367687, + "epoch": 0.350999549075605, + "flos": 27177913372320.0, + "grad_norm": 1.8285490493486405, + "language_loss": 0.67959607, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.70128894, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.14111328, + "step": 5838, + "time_per_iteration": 2.683363914489746 + }, + { + "auxiliary_loss_clip": 0.01128755, + "auxiliary_loss_mlp": 0.01036009, + "balance_loss_clip": 1.04429507, + "balance_loss_mlp": 1.02084517, + "epoch": 0.35105967232827295, + "flos": 17561009511360.0, + "grad_norm": 6.129897951729606, + "language_loss": 0.83599037, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.857638, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.1517334, + "step": 5839, + "time_per_iteration": 2.5994656085968018 + }, + { + "auxiliary_loss_clip": 0.0113327, + "auxiliary_loss_mlp": 0.01041004, + "balance_loss_clip": 1.04556203, + "balance_loss_mlp": 1.0258522, + "epoch": 0.3511197955809409, + "flos": 30605875073280.0, + "grad_norm": 1.6534068910371493, + "language_loss": 0.58631498, + "learning_rate": 3.012341473657572e-06, + "loss": 0.60805768, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.15155029, + "step": 5840, + "time_per_iteration": 2.7043516635894775 + }, + { + "auxiliary_loss_clip": 0.01132471, + "auxiliary_loss_mlp": 0.01038112, + "balance_loss_clip": 1.04639435, + "balance_loss_mlp": 1.02374721, + "epoch": 0.3511799188336089, + "flos": 31584389745600.0, + "grad_norm": 2.562940121493891, + "language_loss": 0.87207031, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.89377618, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.14367676, + "step": 5841, + "time_per_iteration": 2.658853054046631 + }, + { + "auxiliary_loss_clip": 0.01135231, + "auxiliary_loss_mlp": 0.01038628, + "balance_loss_clip": 1.04696965, + "balance_loss_mlp": 1.02266622, + "epoch": 0.35124004208627685, + "flos": 24506179780320.0, + "grad_norm": 1.7823368830077901, + "language_loss": 0.75388962, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.77562821, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 0.8828125, + "router_z_loss_mlp": 0.15942383, + "step": 5842, + "time_per_iteration": 2.8160271644592285 + }, + { + "auxiliary_loss_clip": 0.0113217, + "auxiliary_loss_mlp": 0.01038272, + "balance_loss_clip": 1.045344, + "balance_loss_mlp": 1.02428293, + "epoch": 0.3513001653389448, + "flos": 21699340110720.0, + "grad_norm": 2.1303799581728233, + "language_loss": 0.68197322, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.70367765, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.13989258, + "step": 5843, + "time_per_iteration": 2.593160390853882 + }, + { + "auxiliary_loss_clip": 0.01133087, + "auxiliary_loss_mlp": 0.01038819, + "balance_loss_clip": 1.04785383, + "balance_loss_mlp": 1.02412653, + "epoch": 0.3513602885916128, + "flos": 35858960906400.0, + "grad_norm": 2.915485646125683, + "language_loss": 0.65851301, + "learning_rate": 3.010997627806655e-06, + "loss": 0.68023205, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.14697266, + "step": 5844, + "time_per_iteration": 2.7483625411987305 + }, + { + "auxiliary_loss_clip": 0.01133681, + "auxiliary_loss_mlp": 0.01038211, + "balance_loss_clip": 1.04786706, + "balance_loss_mlp": 1.02342939, + "epoch": 0.3514204118442808, + "flos": 19742391800640.0, + "grad_norm": 2.6333655284133246, + "language_loss": 0.75210971, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77382863, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.14770508, + "step": 5845, + "time_per_iteration": 2.614962100982666 + }, + { + "auxiliary_loss_clip": 0.01132352, + "auxiliary_loss_mlp": 0.01037986, + "balance_loss_clip": 1.04884875, + "balance_loss_mlp": 1.02356148, + "epoch": 0.35148053509694877, + "flos": 29084018846400.0, + "grad_norm": 2.5831187157044946, + "language_loss": 0.7325716, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.7542749, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.14422607, + "step": 5846, + "time_per_iteration": 2.6628637313842773 + }, + { + "auxiliary_loss_clip": 0.01134379, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.04912353, + "balance_loss_mlp": 1.0195092, + "epoch": 0.35154065834961673, + "flos": 25614087593760.0, + "grad_norm": 1.804504227584242, + "language_loss": 0.75667101, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.77835119, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.14135742, + "step": 5847, + "time_per_iteration": 2.6707072257995605 + }, + { + "auxiliary_loss_clip": 0.01133059, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.0472275, + "balance_loss_mlp": 1.01788044, + "epoch": 0.3516007816022847, + "flos": 40579320712320.0, + "grad_norm": 2.066012216736022, + "language_loss": 0.72614586, + "learning_rate": 3.009653168561666e-06, + "loss": 0.74779314, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.13793945, + "step": 5848, + "time_per_iteration": 2.77441668510437 + }, + { + "auxiliary_loss_clip": 0.01136671, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_clip": 1.04969263, + "balance_loss_mlp": 1.03068578, + "epoch": 0.35166090485495266, + "flos": 14308745852160.0, + "grad_norm": 2.423740562245535, + "language_loss": 0.894894, + "learning_rate": 3.009316958003178e-06, + "loss": 0.91671401, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.1463623, + "step": 5849, + "time_per_iteration": 2.6100103855133057 + }, + { + "auxiliary_loss_clip": 0.01133923, + "auxiliary_loss_mlp": 0.01032393, + "balance_loss_clip": 1.04839754, + "balance_loss_mlp": 1.01817179, + "epoch": 0.3517210281076206, + "flos": 27623580465600.0, + "grad_norm": 2.5721411916819186, + "language_loss": 0.74672347, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.7683866, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.14239502, + "step": 5850, + "time_per_iteration": 2.677827835083008 + }, + { + "auxiliary_loss_clip": 0.0113385, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.0503968, + "balance_loss_mlp": 1.01837313, + "epoch": 0.3517811513602886, + "flos": 26018271342720.0, + "grad_norm": 2.031578961132383, + "language_loss": 0.75449312, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.77616167, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.14642334, + "step": 5851, + "time_per_iteration": 2.636650562286377 + }, + { + "auxiliary_loss_clip": 0.01136444, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.05028677, + "balance_loss_mlp": 1.02075577, + "epoch": 0.35184127461295656, + "flos": 25664363187840.0, + "grad_norm": 2.344147782885275, + "language_loss": 0.87366217, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.89537692, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.14263916, + "step": 5852, + "time_per_iteration": 2.714592933654785 + }, + { + "auxiliary_loss_clip": 0.01130164, + "auxiliary_loss_mlp": 0.01036589, + "balance_loss_clip": 1.04662406, + "balance_loss_mlp": 1.02284455, + "epoch": 0.3519013978656245, + "flos": 27400118901120.0, + "grad_norm": 5.227491117047164, + "language_loss": 0.67997301, + "learning_rate": 3.007971733162737e-06, + "loss": 0.70164055, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13745117, + "step": 5853, + "time_per_iteration": 2.6941263675689697 + }, + { + "auxiliary_loss_clip": 0.01134914, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.04910016, + "balance_loss_mlp": 1.02403879, + "epoch": 0.3519615211182925, + "flos": 16002734600160.0, + "grad_norm": 2.6909189186500524, + "language_loss": 0.81075239, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.83248752, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.14562988, + "step": 5854, + "time_per_iteration": 2.631953001022339 + }, + { + "auxiliary_loss_clip": 0.01130154, + "auxiliary_loss_mlp": 0.01034787, + "balance_loss_clip": 1.0473876, + "balance_loss_mlp": 1.02173364, + "epoch": 0.35202164437096045, + "flos": 23348644649280.0, + "grad_norm": 1.502004015264127, + "language_loss": 0.73174918, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.7533986, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13067627, + "step": 5855, + "time_per_iteration": 2.7024316787719727 + }, + { + "auxiliary_loss_clip": 0.01130521, + "auxiliary_loss_mlp": 0.01037382, + "balance_loss_clip": 1.04741681, + "balance_loss_mlp": 1.02395272, + "epoch": 0.3520817676236284, + "flos": 32387246893440.0, + "grad_norm": 2.454962652460334, + "language_loss": 0.71333742, + "learning_rate": 3.006962413152691e-06, + "loss": 0.73501647, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13427734, + "step": 5856, + "time_per_iteration": 2.6869685649871826 + }, + { + "auxiliary_loss_clip": 0.0113708, + "auxiliary_loss_mlp": 0.01042167, + "balance_loss_clip": 1.04930913, + "balance_loss_mlp": 1.02690852, + "epoch": 0.3521418908762964, + "flos": 54780992094240.0, + "grad_norm": 1.876157232412763, + "language_loss": 0.61289418, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63468665, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.15246582, + "step": 5857, + "time_per_iteration": 4.278966426849365 + }, + { + "auxiliary_loss_clip": 0.01133601, + "auxiliary_loss_mlp": 0.01041841, + "balance_loss_clip": 1.04881799, + "balance_loss_mlp": 1.02746487, + "epoch": 0.3522020141289644, + "flos": 24633222919200.0, + "grad_norm": 1.8442144758666694, + "language_loss": 0.73346382, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75521827, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.14367676, + "step": 5858, + "time_per_iteration": 4.15053915977478 + }, + { + "auxiliary_loss_clip": 0.01138139, + "auxiliary_loss_mlp": 0.01039351, + "balance_loss_clip": 1.05010676, + "balance_loss_mlp": 1.02533162, + "epoch": 0.35226213738163237, + "flos": 33634028891520.0, + "grad_norm": 1.7042401091599808, + "language_loss": 0.76192862, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.78370357, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.14007568, + "step": 5859, + "time_per_iteration": 2.760420322418213 + }, + { + "auxiliary_loss_clip": 0.01141166, + "auxiliary_loss_mlp": 0.01047503, + "balance_loss_clip": 1.05122387, + "balance_loss_mlp": 1.03185105, + "epoch": 0.35232226063430033, + "flos": 28023712486560.0, + "grad_norm": 1.9903697981696742, + "language_loss": 0.71647072, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.73835742, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.15637207, + "step": 5860, + "time_per_iteration": 2.725571393966675 + }, + { + "auxiliary_loss_clip": 0.01139552, + "auxiliary_loss_mlp": 0.01042036, + "balance_loss_clip": 1.04873872, + "balance_loss_mlp": 1.02709937, + "epoch": 0.3523823838869683, + "flos": 23389398682560.0, + "grad_norm": 2.3968579372448264, + "language_loss": 0.66198838, + "learning_rate": 3.005279449623811e-06, + "loss": 0.68380427, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 0.90869141, + "router_z_loss_mlp": 0.14941406, + "step": 5861, + "time_per_iteration": 4.282347679138184 + }, + { + "auxiliary_loss_clip": 0.01132269, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.0484426, + "balance_loss_mlp": 1.02112865, + "epoch": 0.35244250713963626, + "flos": 21879251949600.0, + "grad_norm": 2.0644580206047967, + "language_loss": 0.6616255, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.68329388, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13439941, + "step": 5862, + "time_per_iteration": 2.6209716796875 + }, + { + "auxiliary_loss_clip": 0.01137348, + "auxiliary_loss_mlp": 0.01041417, + "balance_loss_clip": 1.05102909, + "balance_loss_mlp": 1.02550268, + "epoch": 0.35250263039230423, + "flos": 26152202419200.0, + "grad_norm": 2.4261359954382926, + "language_loss": 0.76998568, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79177338, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.15930176, + "step": 5863, + "time_per_iteration": 4.136845111846924 + }, + { + "auxiliary_loss_clip": 0.01135287, + "auxiliary_loss_mlp": 0.01036954, + "balance_loss_clip": 1.04922676, + "balance_loss_mlp": 1.02295291, + "epoch": 0.3525627536449722, + "flos": 33454684294560.0, + "grad_norm": 1.8539463237014921, + "language_loss": 0.75664139, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77836382, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.13995361, + "step": 5864, + "time_per_iteration": 2.7713372707366943 + }, + { + "auxiliary_loss_clip": 0.01133703, + "auxiliary_loss_mlp": 0.01049089, + "balance_loss_clip": 1.04868567, + "balance_loss_mlp": 1.03489113, + "epoch": 0.35262287689764016, + "flos": 30248968639680.0, + "grad_norm": 3.109062344495184, + "language_loss": 0.79702669, + "learning_rate": 3.003932392558793e-06, + "loss": 0.81885463, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.14208984, + "step": 5865, + "time_per_iteration": 2.71787428855896 + }, + { + "auxiliary_loss_clip": 0.011428, + "auxiliary_loss_mlp": 0.01041644, + "balance_loss_clip": 1.05422783, + "balance_loss_mlp": 1.026618, + "epoch": 0.3526830001503081, + "flos": 21745766563200.0, + "grad_norm": 3.2174225180483997, + "language_loss": 0.81269032, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.83453476, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 0.88623047, + "router_z_loss_mlp": 0.15032959, + "step": 5866, + "time_per_iteration": 2.685370922088623 + }, + { + "auxiliary_loss_clip": 0.01142173, + "auxiliary_loss_mlp": 0.01039252, + "balance_loss_clip": 1.04930508, + "balance_loss_mlp": 1.02339745, + "epoch": 0.3527431234029761, + "flos": 22058677581120.0, + "grad_norm": 2.4426122169630817, + "language_loss": 0.83871287, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.8605271, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 0.93017578, + "router_z_loss_mlp": 0.15869141, + "step": 5867, + "time_per_iteration": 2.666877031326294 + }, + { + "auxiliary_loss_clip": 0.01138377, + "auxiliary_loss_mlp": 0.01045671, + "balance_loss_clip": 1.05031371, + "balance_loss_mlp": 1.030388, + "epoch": 0.35280324665564405, + "flos": 23705186427360.0, + "grad_norm": 2.246936255376344, + "language_loss": 0.74390769, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.7657482, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.15283203, + "step": 5868, + "time_per_iteration": 2.72506046295166 + }, + { + "auxiliary_loss_clip": 0.01138503, + "auxiliary_loss_mlp": 0.01037652, + "balance_loss_clip": 1.05017364, + "balance_loss_mlp": 1.02245855, + "epoch": 0.352863369908312, + "flos": 26239585491360.0, + "grad_norm": 2.7075677471349744, + "language_loss": 0.61497307, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.6367346, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.15197754, + "step": 5869, + "time_per_iteration": 2.653940439224243 + }, + { + "auxiliary_loss_clip": 0.01136173, + "auxiliary_loss_mlp": 0.01040705, + "balance_loss_clip": 1.04964614, + "balance_loss_mlp": 1.02613819, + "epoch": 0.35292349316098, + "flos": 27222232926240.0, + "grad_norm": 2.7652381389536766, + "language_loss": 0.74682486, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.76859367, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.14575195, + "step": 5870, + "time_per_iteration": 2.6956355571746826 + }, + { + "auxiliary_loss_clip": 0.01134712, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.04879487, + "balance_loss_mlp": 1.01744962, + "epoch": 0.352983616413648, + "flos": 40401475254720.0, + "grad_norm": 2.061881092061072, + "language_loss": 0.71629226, + "learning_rate": 3.001910665140316e-06, + "loss": 0.7379635, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.14953613, + "step": 5871, + "time_per_iteration": 2.751708745956421 + }, + { + "auxiliary_loss_clip": 0.01132309, + "auxiliary_loss_mlp": 0.01036836, + "balance_loss_clip": 1.04957914, + "balance_loss_mlp": 1.02358031, + "epoch": 0.35304373966631597, + "flos": 22813568619840.0, + "grad_norm": 2.4520583116575514, + "language_loss": 0.73706466, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.7587561, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13250732, + "step": 5872, + "time_per_iteration": 2.716722011566162 + }, + { + "auxiliary_loss_clip": 0.01133928, + "auxiliary_loss_mlp": 0.01040032, + "balance_loss_clip": 1.04970837, + "balance_loss_mlp": 1.02632272, + "epoch": 0.35310386291898394, + "flos": 28510417234080.0, + "grad_norm": 2.0064948064947457, + "language_loss": 0.82609081, + "learning_rate": 3.001236451924089e-06, + "loss": 0.84783047, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.13708496, + "step": 5873, + "time_per_iteration": 2.7482402324676514 + }, + { + "auxiliary_loss_clip": 0.01139414, + "auxiliary_loss_mlp": 0.01039678, + "balance_loss_clip": 1.05004561, + "balance_loss_mlp": 1.02407348, + "epoch": 0.3531639861716519, + "flos": 29848836618720.0, + "grad_norm": 1.83416317805475, + "language_loss": 0.66242683, + "learning_rate": 3.000899288359104e-06, + "loss": 0.68421781, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 0.89404297, + "router_z_loss_mlp": 0.15612793, + "step": 5874, + "time_per_iteration": 2.7153818607330322 + }, + { + "auxiliary_loss_clip": 0.01057369, + "auxiliary_loss_mlp": 0.01003059, + "balance_loss_clip": 1.03011227, + "balance_loss_mlp": 1.00122786, + "epoch": 0.35322410942431987, + "flos": 85796885404800.0, + "grad_norm": 0.7656734210019654, + "language_loss": 0.61551559, + "learning_rate": 3.000562086839917e-06, + "loss": 0.6361199, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 0.01829529, + "step": 5875, + "time_per_iteration": 3.2039356231689453 + }, + { + "auxiliary_loss_clip": 0.01138536, + "auxiliary_loss_mlp": 0.01041764, + "balance_loss_clip": 1.0527072, + "balance_loss_mlp": 1.02814436, + "epoch": 0.35328423267698783, + "flos": 24186137721120.0, + "grad_norm": 1.7600923548917886, + "language_loss": 0.80060482, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82240778, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.13616943, + "step": 5876, + "time_per_iteration": 2.6903305053710938 + }, + { + "auxiliary_loss_clip": 0.01055887, + "auxiliary_loss_mlp": 0.01001807, + "balance_loss_clip": 1.02868366, + "balance_loss_mlp": 1.0000298, + "epoch": 0.3533443559296558, + "flos": 74221209956160.0, + "grad_norm": 0.6758596736623199, + "language_loss": 0.56737864, + "learning_rate": 2.999887569990088e-06, + "loss": 0.58795559, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.27246094, + "router_z_loss_mlp": 0.01776123, + "step": 5877, + "time_per_iteration": 3.3149311542510986 + }, + { + "auxiliary_loss_clip": 0.0113946, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.05212307, + "balance_loss_mlp": 1.01981521, + "epoch": 0.35340447918232376, + "flos": 30207201674400.0, + "grad_norm": 1.7741112058646589, + "language_loss": 0.71879995, + "learning_rate": 2.999550254685024e-06, + "loss": 0.74053335, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.140625, + "step": 5878, + "time_per_iteration": 2.7068569660186768 + }, + { + "auxiliary_loss_clip": 0.0113391, + "auxiliary_loss_mlp": 0.01037737, + "balance_loss_clip": 1.04763579, + "balance_loss_mlp": 1.02406335, + "epoch": 0.3534646024349917, + "flos": 26596208304000.0, + "grad_norm": 2.433217816623266, + "language_loss": 0.78528106, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.80699754, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.13665771, + "step": 5879, + "time_per_iteration": 2.777477741241455 + }, + { + "auxiliary_loss_clip": 0.01141062, + "auxiliary_loss_mlp": 0.01045298, + "balance_loss_clip": 1.05162764, + "balance_loss_mlp": 1.0298245, + "epoch": 0.3535247256876597, + "flos": 24418675156320.0, + "grad_norm": 2.0667225547480417, + "language_loss": 0.63437194, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65623546, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.15490723, + "step": 5880, + "time_per_iteration": 2.792980432510376 + }, + { + "auxiliary_loss_clip": 0.01137182, + "auxiliary_loss_mlp": 0.01034206, + "balance_loss_clip": 1.04954648, + "balance_loss_mlp": 1.01957297, + "epoch": 0.35358484894032766, + "flos": 22192649174880.0, + "grad_norm": 1.9846883719881712, + "language_loss": 0.65655273, + "learning_rate": 2.998538081402727e-06, + "loss": 0.67826658, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.14642334, + "step": 5881, + "time_per_iteration": 2.707138776779175 + }, + { + "auxiliary_loss_clip": 0.01131793, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.04890013, + "balance_loss_mlp": 1.01539207, + "epoch": 0.3536449721929956, + "flos": 27845623925280.0, + "grad_norm": 1.5663273825799653, + "language_loss": 0.75838077, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77998269, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.12994385, + "step": 5882, + "time_per_iteration": 2.6820826530456543 + }, + { + "auxiliary_loss_clip": 0.0113921, + "auxiliary_loss_mlp": 0.01040845, + "balance_loss_clip": 1.05090225, + "balance_loss_mlp": 1.02627802, + "epoch": 0.3537050954456636, + "flos": 32253234782400.0, + "grad_norm": 2.6837474426887096, + "language_loss": 0.70862114, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.73042172, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 0.88378906, + "router_z_loss_mlp": 0.14575195, + "step": 5883, + "time_per_iteration": 2.6749300956726074 + }, + { + "auxiliary_loss_clip": 0.01142373, + "auxiliary_loss_mlp": 0.0104012, + "balance_loss_clip": 1.05224597, + "balance_loss_mlp": 1.02514696, + "epoch": 0.3537652186983316, + "flos": 20984149656000.0, + "grad_norm": 2.0092729105510734, + "language_loss": 0.78272891, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.80455387, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 0.90087891, + "router_z_loss_mlp": 0.1496582, + "step": 5884, + "time_per_iteration": 2.6670408248901367 + }, + { + "auxiliary_loss_clip": 0.01135998, + "auxiliary_loss_mlp": 0.01037841, + "balance_loss_clip": 1.05068088, + "balance_loss_mlp": 1.02491295, + "epoch": 0.3538253419509996, + "flos": 23838712331040.0, + "grad_norm": 2.2223200109067056, + "language_loss": 0.75182354, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.77356195, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.12908936, + "step": 5885, + "time_per_iteration": 2.6369974613189697 + }, + { + "auxiliary_loss_clip": 0.01138604, + "auxiliary_loss_mlp": 0.01046041, + "balance_loss_clip": 1.04953861, + "balance_loss_mlp": 1.03169465, + "epoch": 0.35388546520366754, + "flos": 14798813533920.0, + "grad_norm": 3.0580770778246578, + "language_loss": 0.83268845, + "learning_rate": 2.996850368809606e-06, + "loss": 0.85453492, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.14337158, + "step": 5886, + "time_per_iteration": 2.652646780014038 + }, + { + "auxiliary_loss_clip": 0.01138111, + "auxiliary_loss_mlp": 0.01037774, + "balance_loss_clip": 1.05193305, + "balance_loss_mlp": 1.02227712, + "epoch": 0.3539455884563355, + "flos": 24011655197760.0, + "grad_norm": 2.363695944283475, + "language_loss": 0.77946126, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.80122006, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.15490723, + "step": 5887, + "time_per_iteration": 2.6267166137695312 + }, + { + "auxiliary_loss_clip": 0.01134124, + "auxiliary_loss_mlp": 0.01046458, + "balance_loss_clip": 1.04774857, + "balance_loss_mlp": 1.03229618, + "epoch": 0.35400571170900347, + "flos": 22051830160800.0, + "grad_norm": 2.1375454210808305, + "language_loss": 0.65404165, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67584747, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.1416626, + "step": 5888, + "time_per_iteration": 2.6396820545196533 + }, + { + "auxiliary_loss_clip": 0.01137515, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.05140209, + "balance_loss_mlp": 1.027704, + "epoch": 0.35406583496167143, + "flos": 31808661655680.0, + "grad_norm": 1.8691840600062206, + "language_loss": 0.77404845, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.7958374, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.13653564, + "step": 5889, + "time_per_iteration": 2.7285733222961426 + }, + { + "auxiliary_loss_clip": 0.01137563, + "auxiliary_loss_mlp": 0.01040683, + "balance_loss_clip": 1.05266237, + "balance_loss_mlp": 1.02705765, + "epoch": 0.3541259582143394, + "flos": 24150691451520.0, + "grad_norm": 2.879938784058711, + "language_loss": 0.80992901, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.83171153, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.13623047, + "step": 5890, + "time_per_iteration": 2.653038501739502 + }, + { + "auxiliary_loss_clip": 0.01133062, + "auxiliary_loss_mlp": 0.0103915, + "balance_loss_clip": 1.04781401, + "balance_loss_mlp": 1.02698493, + "epoch": 0.35418608146700736, + "flos": 29312261449920.0, + "grad_norm": 1.6325285236609484, + "language_loss": 0.79727852, + "learning_rate": 2.99516171119991e-06, + "loss": 0.8190006, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.12164307, + "step": 5891, + "time_per_iteration": 2.693389654159546 + }, + { + "auxiliary_loss_clip": 0.01136905, + "auxiliary_loss_mlp": 0.01048565, + "balance_loss_clip": 1.05008113, + "balance_loss_mlp": 1.03467691, + "epoch": 0.35424620471967533, + "flos": 15112494380160.0, + "grad_norm": 1.8655926130580491, + "language_loss": 0.7356143, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.75746894, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.13891602, + "step": 5892, + "time_per_iteration": 2.635981559753418 + }, + { + "auxiliary_loss_clip": 0.01136366, + "auxiliary_loss_mlp": 0.01044039, + "balance_loss_clip": 1.05055046, + "balance_loss_mlp": 1.02973378, + "epoch": 0.3543063279723433, + "flos": 24006550020480.0, + "grad_norm": 3.2771744364711224, + "language_loss": 0.67248017, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.6942842, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.14306641, + "step": 5893, + "time_per_iteration": 2.709765672683716 + }, + { + "auxiliary_loss_clip": 0.01134964, + "auxiliary_loss_mlp": 0.01038487, + "balance_loss_clip": 1.04980409, + "balance_loss_mlp": 1.02402711, + "epoch": 0.35436645122501126, + "flos": 26733664383840.0, + "grad_norm": 2.03509473607788, + "language_loss": 0.69809657, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.71983111, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.14459229, + "step": 5894, + "time_per_iteration": 2.6709022521972656 + }, + { + "auxiliary_loss_clip": 0.01136621, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.05195212, + "balance_loss_mlp": 1.02039182, + "epoch": 0.3544265744776792, + "flos": 26502869191680.0, + "grad_norm": 1.6213842558396336, + "language_loss": 0.74345124, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.76515669, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.13543701, + "step": 5895, + "time_per_iteration": 2.692333936691284 + }, + { + "auxiliary_loss_clip": 0.01133747, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.04824638, + "balance_loss_mlp": 1.02272093, + "epoch": 0.3544866977303472, + "flos": 25884623887200.0, + "grad_norm": 1.991696638808693, + "language_loss": 0.83644783, + "learning_rate": 2.993472110174491e-06, + "loss": 0.85814786, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.13519287, + "step": 5896, + "time_per_iteration": 4.145471572875977 + }, + { + "auxiliary_loss_clip": 0.01133992, + "auxiliary_loss_mlp": 0.01044951, + "balance_loss_clip": 1.049685, + "balance_loss_mlp": 1.03120041, + "epoch": 0.35454682098301515, + "flos": 35766796795200.0, + "grad_norm": 2.0252375023430336, + "language_loss": 0.6959852, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.71777457, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.13726807, + "step": 5897, + "time_per_iteration": 4.09067964553833 + }, + { + "auxiliary_loss_clip": 0.01134732, + "auxiliary_loss_mlp": 0.01036006, + "balance_loss_clip": 1.04940748, + "balance_loss_mlp": 1.02203488, + "epoch": 0.3546069442356832, + "flos": 29671396333920.0, + "grad_norm": 1.8273395239912031, + "language_loss": 0.81628382, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.83799124, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.13964844, + "step": 5898, + "time_per_iteration": 2.749607801437378 + }, + { + "auxiliary_loss_clip": 0.01131724, + "auxiliary_loss_mlp": 0.01041048, + "balance_loss_clip": 1.04836321, + "balance_loss_mlp": 1.02756, + "epoch": 0.35466706748835114, + "flos": 27890632272960.0, + "grad_norm": 1.765440563798076, + "language_loss": 0.74104369, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.76277137, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.1350708, + "step": 5899, + "time_per_iteration": 2.656308889389038 + }, + { + "auxiliary_loss_clip": 0.01132483, + "auxiliary_loss_mlp": 0.01036277, + "balance_loss_clip": 1.04607654, + "balance_loss_mlp": 1.02228141, + "epoch": 0.3547271907410191, + "flos": 34568993838240.0, + "grad_norm": 2.2223854038173454, + "language_loss": 0.79786277, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.8195504, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.13989258, + "step": 5900, + "time_per_iteration": 4.168360471725464 + }, + { + "auxiliary_loss_clip": 0.01133785, + "auxiliary_loss_mlp": 0.01039952, + "balance_loss_clip": 1.04802394, + "balance_loss_mlp": 1.02545655, + "epoch": 0.35478731399368707, + "flos": 28692111833280.0, + "grad_norm": 2.0038630673592794, + "language_loss": 0.81573731, + "learning_rate": 2.991781567335093e-06, + "loss": 0.83747464, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.14489746, + "step": 5901, + "time_per_iteration": 2.7043886184692383 + }, + { + "auxiliary_loss_clip": 0.01139057, + "auxiliary_loss_mlp": 0.010371, + "balance_loss_clip": 1.04875028, + "balance_loss_mlp": 1.0227946, + "epoch": 0.35484743724635504, + "flos": 22725294167520.0, + "grad_norm": 2.195352494409704, + "language_loss": 0.75559121, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.77735275, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 0.90380859, + "router_z_loss_mlp": 0.14318848, + "step": 5902, + "time_per_iteration": 2.6509127616882324 + }, + { + "auxiliary_loss_clip": 0.01133327, + "auxiliary_loss_mlp": 0.01039053, + "balance_loss_clip": 1.0476048, + "balance_loss_mlp": 1.02606475, + "epoch": 0.354907560499023, + "flos": 21212311224960.0, + "grad_norm": 2.5518874142338066, + "language_loss": 0.70542347, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72714728, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 0.85742188, + "router_z_loss_mlp": 0.12988281, + "step": 5903, + "time_per_iteration": 3.992408037185669 + }, + { + "auxiliary_loss_clip": 0.01135509, + "auxiliary_loss_mlp": 0.01036599, + "balance_loss_clip": 1.04639864, + "balance_loss_mlp": 1.02243662, + "epoch": 0.35496768375169097, + "flos": 23438175137280.0, + "grad_norm": 3.659721048185805, + "language_loss": 0.74480081, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.76652187, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 0.89160156, + "router_z_loss_mlp": 0.14178467, + "step": 5904, + "time_per_iteration": 2.680427074432373 + }, + { + "auxiliary_loss_clip": 0.01135218, + "auxiliary_loss_mlp": 0.01040598, + "balance_loss_clip": 1.04792762, + "balance_loss_mlp": 1.02671659, + "epoch": 0.35502780700435893, + "flos": 22369522217760.0, + "grad_norm": 2.867112207416475, + "language_loss": 0.79046977, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.8122279, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.13873291, + "step": 5905, + "time_per_iteration": 2.606886148452759 + }, + { + "auxiliary_loss_clip": 0.01126085, + "auxiliary_loss_mlp": 0.01031085, + "balance_loss_clip": 1.04634321, + "balance_loss_mlp": 1.01902699, + "epoch": 0.3550879302570269, + "flos": 18850125716640.0, + "grad_norm": 2.8210635197129528, + "language_loss": 0.72662807, + "learning_rate": 2.990090084284356e-06, + "loss": 0.74819976, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12060547, + "step": 5906, + "time_per_iteration": 2.6747560501098633 + }, + { + "auxiliary_loss_clip": 0.01138211, + "auxiliary_loss_mlp": 0.01035179, + "balance_loss_clip": 1.04844403, + "balance_loss_mlp": 1.0200274, + "epoch": 0.35514805350969486, + "flos": 26819345730240.0, + "grad_norm": 1.9337813865422881, + "language_loss": 0.7470113, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.76874524, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.15148926, + "step": 5907, + "time_per_iteration": 2.6763346195220947 + }, + { + "auxiliary_loss_clip": 0.01134042, + "auxiliary_loss_mlp": 0.01033669, + "balance_loss_clip": 1.04822636, + "balance_loss_mlp": 1.01954842, + "epoch": 0.3552081767623628, + "flos": 36438315972480.0, + "grad_norm": 2.81449481426451, + "language_loss": 0.75377321, + "learning_rate": 2.989413228164047e-06, + "loss": 0.77545035, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.14117432, + "step": 5908, + "time_per_iteration": 2.7792105674743652 + }, + { + "auxiliary_loss_clip": 0.01135152, + "auxiliary_loss_mlp": 0.01035415, + "balance_loss_clip": 1.04941654, + "balance_loss_mlp": 1.02162838, + "epoch": 0.3552683000150308, + "flos": 32253153747840.0, + "grad_norm": 1.808315338465675, + "language_loss": 0.68368399, + "learning_rate": 2.989074743819502e-06, + "loss": 0.70538962, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.13763428, + "step": 5909, + "time_per_iteration": 2.6691412925720215 + }, + { + "auxiliary_loss_clip": 0.01128553, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.04719925, + "balance_loss_mlp": 1.02224994, + "epoch": 0.35532842326769876, + "flos": 24143398341120.0, + "grad_norm": 1.886681165781989, + "language_loss": 0.78586823, + "learning_rate": 2.988736221969144e-06, + "loss": 0.80750799, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.1317749, + "step": 5910, + "time_per_iteration": 2.6785051822662354 + }, + { + "auxiliary_loss_clip": 0.01136324, + "auxiliary_loss_mlp": 0.01036784, + "balance_loss_clip": 1.04750371, + "balance_loss_mlp": 1.02237713, + "epoch": 0.3553885465203668, + "flos": 21035033009280.0, + "grad_norm": 2.1714367188205066, + "language_loss": 0.70779979, + "learning_rate": 2.98839766262581e-06, + "loss": 0.72953081, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.14416504, + "step": 5911, + "time_per_iteration": 2.6049299240112305 + }, + { + "auxiliary_loss_clip": 0.01129456, + "auxiliary_loss_mlp": 0.01033643, + "balance_loss_clip": 1.04628301, + "balance_loss_mlp": 1.01997006, + "epoch": 0.35544866977303474, + "flos": 18222966610560.0, + "grad_norm": 2.4939959919329877, + "language_loss": 0.86792296, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.88955402, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.13665771, + "step": 5912, + "time_per_iteration": 2.635960102081299 + }, + { + "auxiliary_loss_clip": 0.01134315, + "auxiliary_loss_mlp": 0.01038419, + "balance_loss_clip": 1.04885411, + "balance_loss_mlp": 1.02474523, + "epoch": 0.3555087930257027, + "flos": 24106250345760.0, + "grad_norm": 2.8439211581344375, + "language_loss": 0.76785314, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.78958046, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.13665771, + "step": 5913, + "time_per_iteration": 2.621206521987915 + }, + { + "auxiliary_loss_clip": 0.01132304, + "auxiliary_loss_mlp": 0.01032795, + "balance_loss_clip": 1.04995286, + "balance_loss_mlp": 1.01945531, + "epoch": 0.3555689162783707, + "flos": 25707629292480.0, + "grad_norm": 1.4739692554771624, + "language_loss": 0.82683492, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.84848595, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13323975, + "step": 5914, + "time_per_iteration": 2.65649151802063 + }, + { + "auxiliary_loss_clip": 0.0113451, + "auxiliary_loss_mlp": 0.0103425, + "balance_loss_clip": 1.0485065, + "balance_loss_mlp": 1.02031994, + "epoch": 0.35562903953103864, + "flos": 40352860869120.0, + "grad_norm": 6.203751132646663, + "language_loss": 0.70647931, + "learning_rate": 2.98704305057949e-06, + "loss": 0.72816694, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.13928223, + "step": 5915, + "time_per_iteration": 2.7842960357666016 + }, + { + "auxiliary_loss_clip": 0.01131909, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.04566264, + "balance_loss_mlp": 1.02362037, + "epoch": 0.3556891627837066, + "flos": 25084765018080.0, + "grad_norm": 1.7929662912707243, + "language_loss": 0.76073873, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.78243077, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.13677979, + "step": 5916, + "time_per_iteration": 2.69118332862854 + }, + { + "auxiliary_loss_clip": 0.01136207, + "auxiliary_loss_mlp": 0.01034322, + "balance_loss_clip": 1.05039859, + "balance_loss_mlp": 1.02130997, + "epoch": 0.35574928603637457, + "flos": 25262326854720.0, + "grad_norm": 1.7617088994548267, + "language_loss": 0.88617539, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90788066, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.12994385, + "step": 5917, + "time_per_iteration": 2.704195261001587 + }, + { + "auxiliary_loss_clip": 0.01132352, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.04756045, + "balance_loss_mlp": 1.01571727, + "epoch": 0.35580940928904253, + "flos": 18540091425600.0, + "grad_norm": 2.141190215108402, + "language_loss": 0.74743044, + "learning_rate": 2.98602669849771e-06, + "loss": 0.76904577, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.13476562, + "step": 5918, + "time_per_iteration": 2.6680831909179688 + }, + { + "auxiliary_loss_clip": 0.01050257, + "auxiliary_loss_mlp": 0.01022568, + "balance_loss_clip": 1.02249956, + "balance_loss_mlp": 1.0210855, + "epoch": 0.3558695325417105, + "flos": 71552028952800.0, + "grad_norm": 0.9284860516508531, + "language_loss": 0.63845003, + "learning_rate": 2.985687839672857e-06, + "loss": 0.65917826, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.27783203, + "router_z_loss_mlp": 0.01482391, + "step": 5919, + "time_per_iteration": 3.1279349327087402 + }, + { + "auxiliary_loss_clip": 0.01137275, + "auxiliary_loss_mlp": 0.01032535, + "balance_loss_clip": 1.04847264, + "balance_loss_mlp": 1.01818204, + "epoch": 0.35592965579437846, + "flos": 26866744597440.0, + "grad_norm": 2.067499607251835, + "language_loss": 0.73625052, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.75794864, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.14367676, + "step": 5920, + "time_per_iteration": 2.7288551330566406 + }, + { + "auxiliary_loss_clip": 0.01130971, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.0463202, + "balance_loss_mlp": 1.01712537, + "epoch": 0.35598977904704643, + "flos": 28513496547360.0, + "grad_norm": 1.8757783438962474, + "language_loss": 0.77604741, + "learning_rate": 2.985010009903857e-06, + "loss": 0.7976656, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.13739014, + "step": 5921, + "time_per_iteration": 2.7078001499176025 + }, + { + "auxiliary_loss_clip": 0.01131947, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.04669952, + "balance_loss_mlp": 1.01948667, + "epoch": 0.3560499022997144, + "flos": 21701487526560.0, + "grad_norm": 1.7416120516165563, + "language_loss": 0.67600501, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.69765162, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.13226318, + "step": 5922, + "time_per_iteration": 2.6111679077148438 + }, + { + "auxiliary_loss_clip": 0.01134882, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.04898953, + "balance_loss_mlp": 1.02164829, + "epoch": 0.35611002555238236, + "flos": 25307578306080.0, + "grad_norm": 2.0492946165127273, + "language_loss": 0.79265833, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.81436133, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.13769531, + "step": 5923, + "time_per_iteration": 2.7859954833984375 + }, + { + "auxiliary_loss_clip": 0.011341, + "auxiliary_loss_mlp": 0.01032782, + "balance_loss_clip": 1.04805398, + "balance_loss_mlp": 1.0197289, + "epoch": 0.3561701488050504, + "flos": 23747885290080.0, + "grad_norm": 2.317514770207041, + "language_loss": 0.85321736, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87488616, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.13037109, + "step": 5924, + "time_per_iteration": 2.6234548091888428 + }, + { + "auxiliary_loss_clip": 0.0113266, + "auxiliary_loss_mlp": 0.01038401, + "balance_loss_clip": 1.04779768, + "balance_loss_mlp": 1.02411973, + "epoch": 0.35623027205771834, + "flos": 37552341895200.0, + "grad_norm": 2.017709878523361, + "language_loss": 0.77344507, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.79515564, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.14263916, + "step": 5925, + "time_per_iteration": 2.725795030593872 + }, + { + "auxiliary_loss_clip": 0.01130652, + "auxiliary_loss_mlp": 0.01037939, + "balance_loss_clip": 1.04422987, + "balance_loss_mlp": 1.02411628, + "epoch": 0.3562903953103863, + "flos": 20720217679200.0, + "grad_norm": 1.8160684620439673, + "language_loss": 0.75784552, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.77953148, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.13806152, + "step": 5926, + "time_per_iteration": 2.6261959075927734 + }, + { + "auxiliary_loss_clip": 0.01137169, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.0482378, + "balance_loss_mlp": 1.01932681, + "epoch": 0.3563505185630543, + "flos": 29089853334720.0, + "grad_norm": 2.63493841478263, + "language_loss": 0.69391477, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71562588, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 0.88964844, + "router_z_loss_mlp": 0.1461792, + "step": 5927, + "time_per_iteration": 2.726961612701416 + }, + { + "auxiliary_loss_clip": 0.01129607, + "auxiliary_loss_mlp": 0.01031698, + "balance_loss_clip": 1.04587197, + "balance_loss_mlp": 1.01853752, + "epoch": 0.35641064181572224, + "flos": 27178196993280.0, + "grad_norm": 2.672635147837152, + "language_loss": 0.7973839, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.81899691, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.13153076, + "step": 5928, + "time_per_iteration": 2.6957015991210938 + }, + { + "auxiliary_loss_clip": 0.01133666, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.04810786, + "balance_loss_mlp": 1.02042413, + "epoch": 0.3564707650683902, + "flos": 28066492383840.0, + "grad_norm": 1.3984885345107179, + "language_loss": 0.81950545, + "learning_rate": 2.982297197789215e-06, + "loss": 0.84118313, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.13671875, + "step": 5929, + "time_per_iteration": 2.6369833946228027 + }, + { + "auxiliary_loss_clip": 0.01127063, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.043926, + "balance_loss_mlp": 1.01697564, + "epoch": 0.35653088832105817, + "flos": 17917308185760.0, + "grad_norm": 1.7161312492161884, + "language_loss": 0.69957137, + "learning_rate": 2.981957928520201e-06, + "loss": 0.72113991, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.12811279, + "step": 5930, + "time_per_iteration": 2.6767563819885254 + }, + { + "auxiliary_loss_clip": 0.01136532, + "auxiliary_loss_mlp": 0.01041643, + "balance_loss_clip": 1.04828763, + "balance_loss_mlp": 1.0272063, + "epoch": 0.35659101157372614, + "flos": 28646981933760.0, + "grad_norm": 2.632958145744441, + "language_loss": 0.67918569, + "learning_rate": 2.981618622015244e-06, + "loss": 0.70096743, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.14447021, + "step": 5931, + "time_per_iteration": 2.639117956161499 + }, + { + "auxiliary_loss_clip": 0.01132031, + "auxiliary_loss_mlp": 0.01034179, + "balance_loss_clip": 1.04752195, + "balance_loss_mlp": 1.02095306, + "epoch": 0.3566511348263941, + "flos": 32431769033760.0, + "grad_norm": 2.1496650680099605, + "language_loss": 0.68220514, + "learning_rate": 2.981279278287211e-06, + "loss": 0.70386726, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.13220215, + "step": 5932, + "time_per_iteration": 2.727482318878174 + }, + { + "auxiliary_loss_clip": 0.0112983, + "auxiliary_loss_mlp": 0.01029373, + "balance_loss_clip": 1.04718256, + "balance_loss_mlp": 1.01597428, + "epoch": 0.35671125807906207, + "flos": 16002410461920.0, + "grad_norm": 2.511063125462433, + "language_loss": 0.78544819, + "learning_rate": 2.980939897348969e-06, + "loss": 0.80704027, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13397217, + "step": 5933, + "time_per_iteration": 2.6092283725738525 + }, + { + "auxiliary_loss_clip": 0.01133148, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.0461812, + "balance_loss_mlp": 1.02707362, + "epoch": 0.35677138133173003, + "flos": 40268232972000.0, + "grad_norm": 1.4413352622026026, + "language_loss": 0.69691837, + "learning_rate": 2.980600479213388e-06, + "loss": 0.71865511, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.13458252, + "step": 5934, + "time_per_iteration": 2.8251054286956787 + }, + { + "auxiliary_loss_clip": 0.01140238, + "auxiliary_loss_mlp": 0.01041484, + "balance_loss_clip": 1.04908371, + "balance_loss_mlp": 1.02605855, + "epoch": 0.356831504584398, + "flos": 25353113378400.0, + "grad_norm": 1.9854015507088112, + "language_loss": 0.70948076, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.73129791, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 0.91259766, + "router_z_loss_mlp": 0.15435791, + "step": 5935, + "time_per_iteration": 4.119955062866211 + }, + { + "auxiliary_loss_clip": 0.01133718, + "auxiliary_loss_mlp": 0.01039196, + "balance_loss_clip": 1.04690874, + "balance_loss_mlp": 1.02453303, + "epoch": 0.35689162783706596, + "flos": 14843902916160.0, + "grad_norm": 2.134594732156077, + "language_loss": 0.78488666, + "learning_rate": 2.979921531401692e-06, + "loss": 0.80661577, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.14685059, + "step": 5936, + "time_per_iteration": 3.923459768295288 + }, + { + "auxiliary_loss_clip": 0.01130362, + "auxiliary_loss_mlp": 0.01038097, + "balance_loss_clip": 1.04559517, + "balance_loss_mlp": 1.02399445, + "epoch": 0.356951751089734, + "flos": 29091717129600.0, + "grad_norm": 1.6062991158047797, + "language_loss": 0.64355773, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.66524231, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.14111328, + "step": 5937, + "time_per_iteration": 2.6877660751342773 + }, + { + "auxiliary_loss_clip": 0.01135223, + "auxiliary_loss_mlp": 0.01036778, + "balance_loss_clip": 1.04838562, + "balance_loss_mlp": 1.02272892, + "epoch": 0.35701187434240195, + "flos": 14303357053920.0, + "grad_norm": 2.65264703605008, + "language_loss": 0.773857, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.79557699, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.14056396, + "step": 5938, + "time_per_iteration": 2.654839038848877 + }, + { + "auxiliary_loss_clip": 0.01135952, + "auxiliary_loss_mlp": 0.01042712, + "balance_loss_clip": 1.04952574, + "balance_loss_mlp": 1.02964091, + "epoch": 0.3570719975950699, + "flos": 30382089370560.0, + "grad_norm": 1.6610613178871159, + "language_loss": 0.8031944, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82498109, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.13067627, + "step": 5939, + "time_per_iteration": 4.171753168106079 + }, + { + "auxiliary_loss_clip": 0.01137716, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.04631925, + "balance_loss_mlp": 1.0196476, + "epoch": 0.3571321208477379, + "flos": 31719333754080.0, + "grad_norm": 1.8482069071741314, + "language_loss": 0.79313135, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81484616, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 0.91308594, + "router_z_loss_mlp": 0.14123535, + "step": 5940, + "time_per_iteration": 2.728245496749878 + }, + { + "auxiliary_loss_clip": 0.01134219, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.0464654, + "balance_loss_mlp": 1.01989532, + "epoch": 0.35719224410040584, + "flos": 17694494897760.0, + "grad_norm": 2.2759663851258463, + "language_loss": 0.72022367, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74191004, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 0.87744141, + "router_z_loss_mlp": 0.14526367, + "step": 5941, + "time_per_iteration": 2.617098808288574 + }, + { + "auxiliary_loss_clip": 0.01137133, + "auxiliary_loss_mlp": 0.01035363, + "balance_loss_clip": 1.05009651, + "balance_loss_mlp": 1.02039647, + "epoch": 0.3572523673530738, + "flos": 38041153541280.0, + "grad_norm": 2.4731889257008084, + "language_loss": 0.64705062, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.66877562, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.14959717, + "step": 5942, + "time_per_iteration": 4.133028030395508 + }, + { + "auxiliary_loss_clip": 0.01132627, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.04705167, + "balance_loss_mlp": 1.01971817, + "epoch": 0.3573124906057418, + "flos": 19342219262400.0, + "grad_norm": 2.126271833171298, + "language_loss": 0.74033535, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.76200002, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.14117432, + "step": 5943, + "time_per_iteration": 2.704608917236328 + }, + { + "auxiliary_loss_clip": 0.01050553, + "auxiliary_loss_mlp": 0.01001301, + "balance_loss_clip": 1.02291143, + "balance_loss_mlp": 0.99979484, + "epoch": 0.35737261385840974, + "flos": 74217117710880.0, + "grad_norm": 0.8180681563060156, + "language_loss": 0.60772282, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.62824136, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.27661133, + "router_z_loss_mlp": 0.0150528, + "step": 5944, + "time_per_iteration": 3.4044103622436523 + }, + { + "auxiliary_loss_clip": 0.01131555, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.04656839, + "balance_loss_mlp": 1.01924038, + "epoch": 0.3574327371110777, + "flos": 22993034768640.0, + "grad_norm": 2.501100587037077, + "language_loss": 0.72383922, + "learning_rate": 2.976864428379655e-06, + "loss": 0.74548268, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.13549805, + "step": 5945, + "time_per_iteration": 2.686082124710083 + }, + { + "auxiliary_loss_clip": 0.01130468, + "auxiliary_loss_mlp": 0.0103541, + "balance_loss_clip": 1.04465663, + "balance_loss_mlp": 1.02075958, + "epoch": 0.35749286036374567, + "flos": 28735621041600.0, + "grad_norm": 1.8461207718281716, + "language_loss": 0.80983126, + "learning_rate": 2.976524564880326e-06, + "loss": 0.83149004, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 0.85742188, + "router_z_loss_mlp": 0.14648438, + "step": 5946, + "time_per_iteration": 2.67120623588562 + }, + { + "auxiliary_loss_clip": 0.01137524, + "auxiliary_loss_mlp": 0.01044328, + "balance_loss_clip": 1.05127299, + "balance_loss_mlp": 1.02974224, + "epoch": 0.35755298361641363, + "flos": 25753002295680.0, + "grad_norm": 1.5139567597653696, + "language_loss": 0.6885196, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.71033812, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.14587402, + "step": 5947, + "time_per_iteration": 2.674962282180786 + }, + { + "auxiliary_loss_clip": 0.01130914, + "auxiliary_loss_mlp": 0.01036874, + "balance_loss_clip": 1.04766297, + "balance_loss_mlp": 1.02303421, + "epoch": 0.3576131068690816, + "flos": 23482697277600.0, + "grad_norm": 1.9508134203304581, + "language_loss": 0.75414717, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.77582502, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.13867188, + "step": 5948, + "time_per_iteration": 2.7766499519348145 + }, + { + "auxiliary_loss_clip": 0.01131252, + "auxiliary_loss_mlp": 0.01041066, + "balance_loss_clip": 1.04523957, + "balance_loss_mlp": 1.02720165, + "epoch": 0.35767323012174956, + "flos": 34965438786720.0, + "grad_norm": 4.081270739682265, + "language_loss": 0.7034061, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.72512937, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.13861084, + "step": 5949, + "time_per_iteration": 2.7479636669158936 + }, + { + "auxiliary_loss_clip": 0.01132323, + "auxiliary_loss_mlp": 0.01042856, + "balance_loss_clip": 1.04683673, + "balance_loss_mlp": 1.02940893, + "epoch": 0.35773335337441753, + "flos": 20847949611840.0, + "grad_norm": 4.039309652342937, + "language_loss": 0.77155209, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.79330385, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.13446045, + "step": 5950, + "time_per_iteration": 2.808389902114868 + }, + { + "auxiliary_loss_clip": 0.01134792, + "auxiliary_loss_mlp": 0.0103465, + "balance_loss_clip": 1.04754674, + "balance_loss_mlp": 1.02064323, + "epoch": 0.35779347662708555, + "flos": 19386903471840.0, + "grad_norm": 1.9137974713109276, + "language_loss": 0.72626579, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.74796021, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.14007568, + "step": 5951, + "time_per_iteration": 2.6173129081726074 + }, + { + "auxiliary_loss_clip": 0.01137836, + "auxiliary_loss_mlp": 0.01040591, + "balance_loss_clip": 1.04839969, + "balance_loss_mlp": 1.02634537, + "epoch": 0.3578535998797535, + "flos": 34970098273920.0, + "grad_norm": 2.512466453033459, + "language_loss": 0.70119995, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.72298425, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 0.89453125, + "router_z_loss_mlp": 0.14257812, + "step": 5952, + "time_per_iteration": 2.7589054107666016 + }, + { + "auxiliary_loss_clip": 0.01128792, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_clip": 1.04465747, + "balance_loss_mlp": 1.02695608, + "epoch": 0.3579137231324215, + "flos": 46188907806240.0, + "grad_norm": 2.363571671819476, + "language_loss": 0.69524449, + "learning_rate": 2.974144484269449e-06, + "loss": 0.7169311, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.12908936, + "step": 5953, + "time_per_iteration": 2.7643680572509766 + }, + { + "auxiliary_loss_clip": 0.01130229, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.04584825, + "balance_loss_mlp": 1.01760721, + "epoch": 0.35797384638508944, + "flos": 27267727481280.0, + "grad_norm": 1.894872607616284, + "language_loss": 0.66806835, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68967611, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.1295166, + "step": 5954, + "time_per_iteration": 2.7043395042419434 + }, + { + "auxiliary_loss_clip": 0.01132935, + "auxiliary_loss_mlp": 0.01036026, + "balance_loss_clip": 1.04824471, + "balance_loss_mlp": 1.02300262, + "epoch": 0.3580339696377574, + "flos": 16581643976160.0, + "grad_norm": 2.3403885220834844, + "language_loss": 0.75342488, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.77511448, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.13037109, + "step": 5955, + "time_per_iteration": 2.662790298461914 + }, + { + "auxiliary_loss_clip": 0.01127634, + "auxiliary_loss_mlp": 0.01030787, + "balance_loss_clip": 1.0455817, + "balance_loss_mlp": 1.01793623, + "epoch": 0.3580940928904254, + "flos": 29002348710720.0, + "grad_norm": 2.3423147795352186, + "language_loss": 0.7601167, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78170091, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.12854004, + "step": 5956, + "time_per_iteration": 2.728651762008667 + }, + { + "auxiliary_loss_clip": 0.01129343, + "auxiliary_loss_mlp": 0.01034731, + "balance_loss_clip": 1.04685521, + "balance_loss_mlp": 1.02172494, + "epoch": 0.35815421614309334, + "flos": 23749749084960.0, + "grad_norm": 1.858640381478379, + "language_loss": 0.72678018, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.74842095, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.13012695, + "step": 5957, + "time_per_iteration": 2.678194046020508 + }, + { + "auxiliary_loss_clip": 0.01134636, + "auxiliary_loss_mlp": 0.01036285, + "balance_loss_clip": 1.05080831, + "balance_loss_mlp": 1.02336287, + "epoch": 0.3582143393957613, + "flos": 28514063789280.0, + "grad_norm": 3.489621202565483, + "language_loss": 0.71429312, + "learning_rate": 2.972443318242726e-06, + "loss": 0.73600233, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.12927246, + "step": 5958, + "time_per_iteration": 2.761493682861328 + }, + { + "auxiliary_loss_clip": 0.01128493, + "auxiliary_loss_mlp": 0.01028583, + "balance_loss_clip": 1.04664838, + "balance_loss_mlp": 1.01571989, + "epoch": 0.35827446264842927, + "flos": 32120114051520.0, + "grad_norm": 1.8204075436388134, + "language_loss": 0.88506448, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90663528, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.12872314, + "step": 5959, + "time_per_iteration": 2.7396280765533447 + }, + { + "auxiliary_loss_clip": 0.01131796, + "auxiliary_loss_mlp": 0.01029874, + "balance_loss_clip": 1.04834569, + "balance_loss_mlp": 1.01654088, + "epoch": 0.35833458590109724, + "flos": 37151683149600.0, + "grad_norm": 1.5975679244877103, + "language_loss": 0.5803256, + "learning_rate": 2.971762593615679e-06, + "loss": 0.6019423, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13336182, + "step": 5960, + "time_per_iteration": 2.715275526046753 + }, + { + "auxiliary_loss_clip": 0.01132903, + "auxiliary_loss_mlp": 0.01037535, + "balance_loss_clip": 1.04752922, + "balance_loss_mlp": 1.02290249, + "epoch": 0.3583947091537652, + "flos": 18095032091520.0, + "grad_norm": 2.263418263454043, + "language_loss": 0.76457798, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78628236, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.14642334, + "step": 5961, + "time_per_iteration": 2.7811994552612305 + }, + { + "auxiliary_loss_clip": 0.01133147, + "auxiliary_loss_mlp": 0.01031016, + "balance_loss_clip": 1.04963887, + "balance_loss_mlp": 1.01732481, + "epoch": 0.35845483240643317, + "flos": 41781499535520.0, + "grad_norm": 1.83654437144721, + "language_loss": 0.70202851, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72367013, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.13684082, + "step": 5962, + "time_per_iteration": 2.7554709911346436 + }, + { + "auxiliary_loss_clip": 0.01131551, + "auxiliary_loss_mlp": 0.01036707, + "balance_loss_clip": 1.04852843, + "balance_loss_mlp": 1.02522755, + "epoch": 0.35851495565910113, + "flos": 25575521493600.0, + "grad_norm": 1.7959526585905625, + "language_loss": 0.7441023, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.76578486, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.11486816, + "step": 5963, + "time_per_iteration": 2.7119979858398438 + }, + { + "auxiliary_loss_clip": 0.01134433, + "auxiliary_loss_mlp": 0.01035505, + "balance_loss_clip": 1.05148673, + "balance_loss_mlp": 1.0221895, + "epoch": 0.35857507891176915, + "flos": 27222638099040.0, + "grad_norm": 2.047043348924158, + "language_loss": 0.79020828, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.81190765, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13317871, + "step": 5964, + "time_per_iteration": 2.6550309658050537 + }, + { + "auxiliary_loss_clip": 0.01135845, + "auxiliary_loss_mlp": 0.01034661, + "balance_loss_clip": 1.0488987, + "balance_loss_mlp": 1.02048731, + "epoch": 0.3586352021644371, + "flos": 28513699133760.0, + "grad_norm": 3.777740875485922, + "language_loss": 0.66750705, + "learning_rate": 2.970060137410626e-06, + "loss": 0.68921214, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.14160156, + "step": 5965, + "time_per_iteration": 2.830472707748413 + }, + { + "auxiliary_loss_clip": 0.01131256, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.0473125, + "balance_loss_mlp": 1.02393663, + "epoch": 0.3586953254171051, + "flos": 33982791351840.0, + "grad_norm": 1.7379826318595861, + "language_loss": 0.79233003, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.81401968, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.13775635, + "step": 5966, + "time_per_iteration": 2.7910685539245605 + }, + { + "auxiliary_loss_clip": 0.01132132, + "auxiliary_loss_mlp": 0.01036421, + "balance_loss_clip": 1.04783404, + "balance_loss_mlp": 1.02210414, + "epoch": 0.35875544866977305, + "flos": 23794433294400.0, + "grad_norm": 2.5352232299483988, + "language_loss": 0.9122147, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93390024, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.14324951, + "step": 5967, + "time_per_iteration": 2.677088737487793 + }, + { + "auxiliary_loss_clip": 0.01139162, + "auxiliary_loss_mlp": 0.01039535, + "balance_loss_clip": 1.05248725, + "balance_loss_mlp": 1.02502131, + "epoch": 0.358815571922441, + "flos": 26199115079040.0, + "grad_norm": 1.7889389279106107, + "language_loss": 0.79692996, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.81871688, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.14520264, + "step": 5968, + "time_per_iteration": 2.7043025493621826 + }, + { + "auxiliary_loss_clip": 0.01139294, + "auxiliary_loss_mlp": 0.01048202, + "balance_loss_clip": 1.05042613, + "balance_loss_mlp": 1.03367078, + "epoch": 0.358875695175109, + "flos": 26643688205760.0, + "grad_norm": 2.2441465324130356, + "language_loss": 0.84461081, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.86648583, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.14544678, + "step": 5969, + "time_per_iteration": 2.6446127891540527 + }, + { + "auxiliary_loss_clip": 0.01131205, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.04745436, + "balance_loss_mlp": 1.02035677, + "epoch": 0.35893581842777694, + "flos": 39061799834400.0, + "grad_norm": 1.8733340036663084, + "language_loss": 0.71893686, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74057913, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.12664795, + "step": 5970, + "time_per_iteration": 2.773510694503784 + }, + { + "auxiliary_loss_clip": 0.01132889, + "auxiliary_loss_mlp": 0.01036404, + "balance_loss_clip": 1.04965067, + "balance_loss_mlp": 1.0231359, + "epoch": 0.3589959416804449, + "flos": 24996774186720.0, + "grad_norm": 1.729383389257829, + "language_loss": 0.79278374, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.81447667, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.13269043, + "step": 5971, + "time_per_iteration": 2.673654079437256 + }, + { + "auxiliary_loss_clip": 0.01132176, + "auxiliary_loss_mlp": 0.01038244, + "balance_loss_clip": 1.04532158, + "balance_loss_mlp": 1.02403414, + "epoch": 0.3590560649331129, + "flos": 19743283180800.0, + "grad_norm": 1.962193539673522, + "language_loss": 0.78151655, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80322075, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.14221191, + "step": 5972, + "time_per_iteration": 2.657028913497925 + }, + { + "auxiliary_loss_clip": 0.01131104, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.04574203, + "balance_loss_mlp": 1.01957965, + "epoch": 0.35911618818578084, + "flos": 24862113799200.0, + "grad_norm": 2.1277561757862347, + "language_loss": 0.81221068, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83384657, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.12896729, + "step": 5973, + "time_per_iteration": 2.6444246768951416 + }, + { + "auxiliary_loss_clip": 0.0105104, + "auxiliary_loss_mlp": 0.01002416, + "balance_loss_clip": 1.02354145, + "balance_loss_mlp": 1.00085008, + "epoch": 0.3591763114384488, + "flos": 50316102504000.0, + "grad_norm": 0.910667019014868, + "language_loss": 0.56696111, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.58749568, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.27563477, + "router_z_loss_mlp": 0.01566315, + "step": 5974, + "time_per_iteration": 3.171416997909546 + }, + { + "auxiliary_loss_clip": 0.01134606, + "auxiliary_loss_mlp": 0.01039116, + "balance_loss_clip": 1.04867685, + "balance_loss_mlp": 1.02605641, + "epoch": 0.35923643469111677, + "flos": 22812272066880.0, + "grad_norm": 2.119422006453879, + "language_loss": 0.68729281, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.70903003, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.1305542, + "step": 5975, + "time_per_iteration": 4.1209375858306885 + }, + { + "auxiliary_loss_clip": 0.01130003, + "auxiliary_loss_mlp": 0.01032773, + "balance_loss_clip": 1.04622436, + "balance_loss_mlp": 1.02000618, + "epoch": 0.35929655794378473, + "flos": 30517479069120.0, + "grad_norm": 1.7072361886551943, + "language_loss": 0.79865754, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82028532, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.12774658, + "step": 5976, + "time_per_iteration": 4.165175199508667 + }, + { + "auxiliary_loss_clip": 0.01131046, + "auxiliary_loss_mlp": 0.01036066, + "balance_loss_clip": 1.04655826, + "balance_loss_mlp": 1.02196336, + "epoch": 0.35935668119645275, + "flos": 18272837031840.0, + "grad_norm": 2.292547577745004, + "language_loss": 0.79111993, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.81279099, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.14111328, + "step": 5977, + "time_per_iteration": 2.6741135120391846 + }, + { + "auxiliary_loss_clip": 0.01127815, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.04473138, + "balance_loss_mlp": 1.0209291, + "epoch": 0.3594168044491207, + "flos": 25841681920800.0, + "grad_norm": 1.877470824567584, + "language_loss": 0.80369771, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.82531434, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.12927246, + "step": 5978, + "time_per_iteration": 2.726213216781616 + }, + { + "auxiliary_loss_clip": 0.01132328, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.0482744, + "balance_loss_mlp": 1.02267838, + "epoch": 0.3594769277017887, + "flos": 33766217724960.0, + "grad_norm": 2.151531151600599, + "language_loss": 0.67659301, + "learning_rate": 2.965288372816436e-06, + "loss": 0.69828308, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.13989258, + "step": 5979, + "time_per_iteration": 4.142794132232666 + }, + { + "auxiliary_loss_clip": 0.01130749, + "auxiliary_loss_mlp": 0.01033021, + "balance_loss_clip": 1.04700589, + "balance_loss_mlp": 1.01997924, + "epoch": 0.35953705095445665, + "flos": 28067707902240.0, + "grad_norm": 1.9850062664708144, + "language_loss": 0.66960251, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.69124025, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13031006, + "step": 5980, + "time_per_iteration": 2.7278876304626465 + }, + { + "auxiliary_loss_clip": 0.01134687, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_clip": 1.04588473, + "balance_loss_mlp": 1.02204001, + "epoch": 0.3595971742071246, + "flos": 31133171784960.0, + "grad_norm": 2.5350433974146385, + "language_loss": 0.71226621, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73397869, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.1451416, + "step": 5981, + "time_per_iteration": 2.71419358253479 + }, + { + "auxiliary_loss_clip": 0.01132601, + "auxiliary_loss_mlp": 0.01038383, + "balance_loss_clip": 1.04729033, + "balance_loss_mlp": 1.02357709, + "epoch": 0.3596572974597926, + "flos": 36439004766240.0, + "grad_norm": 2.0420960737617952, + "language_loss": 0.70995116, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73166102, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.14807129, + "step": 5982, + "time_per_iteration": 4.038985013961792 + }, + { + "auxiliary_loss_clip": 0.0112838, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.04703009, + "balance_loss_mlp": 1.02505887, + "epoch": 0.35971742071246054, + "flos": 28202125186080.0, + "grad_norm": 2.0788435457111554, + "language_loss": 0.75891709, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.78057683, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.12530518, + "step": 5983, + "time_per_iteration": 2.7965893745422363 + }, + { + "auxiliary_loss_clip": 0.0113723, + "auxiliary_loss_mlp": 0.0104167, + "balance_loss_clip": 1.04891086, + "balance_loss_mlp": 1.02661955, + "epoch": 0.3597775439651285, + "flos": 20408198041440.0, + "grad_norm": 2.7130830014381493, + "language_loss": 0.76266176, + "learning_rate": 2.96358243065131e-06, + "loss": 0.78445071, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.15045166, + "step": 5984, + "time_per_iteration": 2.605635166168213 + }, + { + "auxiliary_loss_clip": 0.01128278, + "auxiliary_loss_mlp": 0.01033958, + "balance_loss_clip": 1.04513383, + "balance_loss_mlp": 1.02068448, + "epoch": 0.3598376672177965, + "flos": 24061647170880.0, + "grad_norm": 2.1329980480289956, + "language_loss": 0.86234462, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.88396698, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13293457, + "step": 5985, + "time_per_iteration": 2.6869726181030273 + }, + { + "auxiliary_loss_clip": 0.01129012, + "auxiliary_loss_mlp": 0.01036668, + "balance_loss_clip": 1.04507697, + "balance_loss_mlp": 1.02192199, + "epoch": 0.35989779047046444, + "flos": 21123266944320.0, + "grad_norm": 1.3695582802553514, + "language_loss": 0.72340554, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.74506235, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.14733887, + "step": 5986, + "time_per_iteration": 2.6665356159210205 + }, + { + "auxiliary_loss_clip": 0.01134265, + "auxiliary_loss_mlp": 0.01036594, + "balance_loss_clip": 1.04560018, + "balance_loss_mlp": 1.02293873, + "epoch": 0.3599579137231324, + "flos": 27712422159840.0, + "grad_norm": 2.1621431161135316, + "language_loss": 0.73454618, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.75625473, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 0.88818359, + "router_z_loss_mlp": 0.13641357, + "step": 5987, + "time_per_iteration": 2.6958272457122803 + }, + { + "auxiliary_loss_clip": 0.01134396, + "auxiliary_loss_mlp": 0.01038362, + "balance_loss_clip": 1.04744005, + "balance_loss_mlp": 1.02423, + "epoch": 0.36001803697580037, + "flos": 25575278389920.0, + "grad_norm": 1.7363923402712083, + "language_loss": 0.69504309, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.71677077, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.14117432, + "step": 5988, + "time_per_iteration": 2.7037830352783203 + }, + { + "auxiliary_loss_clip": 0.01135565, + "auxiliary_loss_mlp": 0.01038576, + "balance_loss_clip": 1.04700565, + "balance_loss_mlp": 1.02446723, + "epoch": 0.36007816022846834, + "flos": 24996490565760.0, + "grad_norm": 1.938052269195222, + "language_loss": 0.73319876, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.75494021, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.14105225, + "step": 5989, + "time_per_iteration": 2.6390459537506104 + }, + { + "auxiliary_loss_clip": 0.01132336, + "auxiliary_loss_mlp": 0.01033277, + "balance_loss_clip": 1.0473628, + "balance_loss_mlp": 1.02008069, + "epoch": 0.36013828348113636, + "flos": 34167727333440.0, + "grad_norm": 2.557601472805333, + "language_loss": 0.79987359, + "learning_rate": 2.961534094403931e-06, + "loss": 0.82152969, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.13183594, + "step": 5990, + "time_per_iteration": 2.735706090927124 + }, + { + "auxiliary_loss_clip": 0.01131582, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.04635787, + "balance_loss_mlp": 1.02024698, + "epoch": 0.3601984067338043, + "flos": 24504437537280.0, + "grad_norm": 1.9874954797063196, + "language_loss": 0.84009272, + "learning_rate": 2.961192577338698e-06, + "loss": 0.86174929, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.13818359, + "step": 5991, + "time_per_iteration": 2.635383129119873 + }, + { + "auxiliary_loss_clip": 0.01133006, + "auxiliary_loss_mlp": 0.01039229, + "balance_loss_clip": 1.04433322, + "balance_loss_mlp": 1.02516198, + "epoch": 0.3602585299864723, + "flos": 22717717436160.0, + "grad_norm": 1.8807398373167703, + "language_loss": 0.75920856, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.78093088, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.14074707, + "step": 5992, + "time_per_iteration": 2.650268316268921 + }, + { + "auxiliary_loss_clip": 0.01132683, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.04786277, + "balance_loss_mlp": 1.02352178, + "epoch": 0.36031865323914025, + "flos": 23882950850400.0, + "grad_norm": 2.047836321254221, + "language_loss": 0.77324945, + "learning_rate": 2.960509433875627e-06, + "loss": 0.79495549, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.1439209, + "step": 5993, + "time_per_iteration": 2.62349796295166 + }, + { + "auxiliary_loss_clip": 0.01136268, + "auxiliary_loss_mlp": 0.01036758, + "balance_loss_clip": 1.04863966, + "balance_loss_mlp": 1.02242899, + "epoch": 0.3603787764918082, + "flos": 21342109538880.0, + "grad_norm": 1.7553623173062955, + "language_loss": 0.74577886, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.7675091, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.14337158, + "step": 5994, + "time_per_iteration": 2.678921699523926 + }, + { + "auxiliary_loss_clip": 0.01138784, + "auxiliary_loss_mlp": 0.01036843, + "balance_loss_clip": 1.04958975, + "balance_loss_mlp": 1.02316356, + "epoch": 0.3604388997444762, + "flos": 18941276895840.0, + "grad_norm": 1.8195919187305405, + "language_loss": 0.68856883, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.71032512, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 0.89111328, + "router_z_loss_mlp": 0.13684082, + "step": 5995, + "time_per_iteration": 2.7366247177124023 + }, + { + "auxiliary_loss_clip": 0.01136243, + "auxiliary_loss_mlp": 0.01039989, + "balance_loss_clip": 1.04686165, + "balance_loss_mlp": 1.0252552, + "epoch": 0.36049902299714415, + "flos": 21122537633280.0, + "grad_norm": 2.1684268791774297, + "language_loss": 0.82402074, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.84578311, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 0.89355469, + "router_z_loss_mlp": 0.14746094, + "step": 5996, + "time_per_iteration": 2.675612211227417 + }, + { + "auxiliary_loss_clip": 0.01131979, + "auxiliary_loss_mlp": 0.0103755, + "balance_loss_clip": 1.04650331, + "balance_loss_mlp": 1.02325106, + "epoch": 0.3605591462498121, + "flos": 20811490410240.0, + "grad_norm": 2.003013376874554, + "language_loss": 0.73854029, + "learning_rate": 2.959142709981763e-06, + "loss": 0.76023561, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.14300537, + "step": 5997, + "time_per_iteration": 2.6460649967193604 + }, + { + "auxiliary_loss_clip": 0.01130999, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.04642344, + "balance_loss_mlp": 1.02050734, + "epoch": 0.3606192695024801, + "flos": 20544073947360.0, + "grad_norm": 2.277415062673168, + "language_loss": 0.69257247, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.7142179, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.13037109, + "step": 5998, + "time_per_iteration": 2.7161943912506104 + }, + { + "auxiliary_loss_clip": 0.01133239, + "auxiliary_loss_mlp": 0.01034666, + "balance_loss_clip": 1.04762042, + "balance_loss_mlp": 1.02108216, + "epoch": 0.36067939275514804, + "flos": 14800029052320.0, + "grad_norm": 3.988823531765819, + "language_loss": 0.77029693, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.79197598, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.13574219, + "step": 5999, + "time_per_iteration": 2.61930775642395 + }, + { + "auxiliary_loss_clip": 0.01133861, + "auxiliary_loss_mlp": 0.01042174, + "balance_loss_clip": 1.04709601, + "balance_loss_mlp": 1.02906692, + "epoch": 0.360739516007816, + "flos": 22013952854400.0, + "grad_norm": 2.1670357337183836, + "language_loss": 0.78269607, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80445641, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.13116455, + "step": 6000, + "time_per_iteration": 2.6904547214508057 + }, + { + "auxiliary_loss_clip": 0.01133887, + "auxiliary_loss_mlp": 0.01032962, + "balance_loss_clip": 1.0474788, + "balance_loss_mlp": 1.01978981, + "epoch": 0.360799639260484, + "flos": 22634507643840.0, + "grad_norm": 2.118572815349864, + "language_loss": 0.78601438, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80768287, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.1317749, + "step": 6001, + "time_per_iteration": 2.65244197845459 + }, + { + "auxiliary_loss_clip": 0.01129985, + "auxiliary_loss_mlp": 0.01034496, + "balance_loss_clip": 1.04663134, + "balance_loss_mlp": 1.02120447, + "epoch": 0.36085976251315194, + "flos": 24016031064000.0, + "grad_norm": 2.105244891473176, + "language_loss": 0.83148539, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.85313016, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13287354, + "step": 6002, + "time_per_iteration": 2.810784339904785 + }, + { + "auxiliary_loss_clip": 0.01126301, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.04581475, + "balance_loss_mlp": 1.01993597, + "epoch": 0.3609198857658199, + "flos": 29528591973120.0, + "grad_norm": 2.142602636227609, + "language_loss": 0.90827513, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.92986095, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.12347412, + "step": 6003, + "time_per_iteration": 2.7027671337127686 + }, + { + "auxiliary_loss_clip": 0.01053625, + "auxiliary_loss_mlp": 0.01001462, + "balance_loss_clip": 1.02629542, + "balance_loss_mlp": 0.99984062, + "epoch": 0.3609800090184879, + "flos": 69692512000320.0, + "grad_norm": 0.8616372114219509, + "language_loss": 0.53379023, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55434108, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.27319336, + "router_z_loss_mlp": 0.01623535, + "step": 6004, + "time_per_iteration": 3.2070670127868652 + }, + { + "auxiliary_loss_clip": 0.01133695, + "auxiliary_loss_mlp": 0.01036967, + "balance_loss_clip": 1.04574859, + "balance_loss_mlp": 1.02194715, + "epoch": 0.3610401322711559, + "flos": 25394151032640.0, + "grad_norm": 1.8653126037545447, + "language_loss": 0.77702081, + "learning_rate": 2.956407517225883e-06, + "loss": 0.79872745, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 0.87841797, + "router_z_loss_mlp": 0.15014648, + "step": 6005, + "time_per_iteration": 2.718062162399292 + }, + { + "auxiliary_loss_clip": 0.01133323, + "auxiliary_loss_mlp": 0.01040258, + "balance_loss_clip": 1.0484302, + "balance_loss_mlp": 1.02710295, + "epoch": 0.36110025552382385, + "flos": 16715210397120.0, + "grad_norm": 2.2821089529745566, + "language_loss": 0.78785968, + "learning_rate": 2.956065454793429e-06, + "loss": 0.80959547, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.13146973, + "step": 6006, + "time_per_iteration": 2.628080129623413 + }, + { + "auxiliary_loss_clip": 0.01134791, + "auxiliary_loss_mlp": 0.01041554, + "balance_loss_clip": 1.047616, + "balance_loss_mlp": 1.02598476, + "epoch": 0.3611603787764918, + "flos": 27399835280160.0, + "grad_norm": 1.987919629310985, + "language_loss": 0.84575182, + "learning_rate": 2.955723356106876e-06, + "loss": 0.86751527, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.15576172, + "step": 6007, + "time_per_iteration": 2.703171968460083 + }, + { + "auxiliary_loss_clip": 0.01140711, + "auxiliary_loss_mlp": 0.01035121, + "balance_loss_clip": 1.04962313, + "balance_loss_mlp": 1.02058315, + "epoch": 0.3612205020291598, + "flos": 25485950488320.0, + "grad_norm": 2.1264170026218525, + "language_loss": 0.72564554, + "learning_rate": 2.955381221179198e-06, + "loss": 0.7474038, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 0.91113281, + "router_z_loss_mlp": 0.14532471, + "step": 6008, + "time_per_iteration": 2.637606620788574 + }, + { + "auxiliary_loss_clip": 0.01132775, + "auxiliary_loss_mlp": 0.01036624, + "balance_loss_clip": 1.04670203, + "balance_loss_mlp": 1.02304637, + "epoch": 0.36128062528182775, + "flos": 19208045082240.0, + "grad_norm": 2.135136411858613, + "language_loss": 0.82506973, + "learning_rate": 2.955039050023368e-06, + "loss": 0.84676373, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.13586426, + "step": 6009, + "time_per_iteration": 2.7159242630004883 + }, + { + "auxiliary_loss_clip": 0.01134265, + "auxiliary_loss_mlp": 0.01042685, + "balance_loss_clip": 1.04794812, + "balance_loss_mlp": 1.02884436, + "epoch": 0.3613407485344957, + "flos": 20455718460480.0, + "grad_norm": 1.923493208963201, + "language_loss": 0.76506186, + "learning_rate": 2.954696842652362e-06, + "loss": 0.78683138, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.13842773, + "step": 6010, + "time_per_iteration": 2.6284079551696777 + }, + { + "auxiliary_loss_clip": 0.0113528, + "auxiliary_loss_mlp": 0.01039713, + "balance_loss_clip": 1.04943585, + "balance_loss_mlp": 1.02624273, + "epoch": 0.3614008717871637, + "flos": 24857373277440.0, + "grad_norm": 1.7013815164018824, + "language_loss": 0.82824552, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.84999537, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.13476562, + "step": 6011, + "time_per_iteration": 2.7398831844329834 + }, + { + "auxiliary_loss_clip": 0.01142771, + "auxiliary_loss_mlp": 0.0103833, + "balance_loss_clip": 1.05217791, + "balance_loss_mlp": 1.02428722, + "epoch": 0.36146099503983165, + "flos": 27792957811680.0, + "grad_norm": 2.6425286396037104, + "language_loss": 0.63067615, + "learning_rate": 2.954012319316727e-06, + "loss": 0.65248716, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.14050293, + "step": 6012, + "time_per_iteration": 2.6937460899353027 + }, + { + "auxiliary_loss_clip": 0.0113032, + "auxiliary_loss_mlp": 0.01035753, + "balance_loss_clip": 1.04659057, + "balance_loss_mlp": 1.02163887, + "epoch": 0.3615211182924996, + "flos": 28061306172000.0, + "grad_norm": 1.8700073709582092, + "language_loss": 0.83774978, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.85941046, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.14111328, + "step": 6013, + "time_per_iteration": 2.6481399536132812 + }, + { + "auxiliary_loss_clip": 0.01135582, + "auxiliary_loss_mlp": 0.01036029, + "balance_loss_clip": 1.04870689, + "balance_loss_mlp": 1.02142644, + "epoch": 0.3615812415451676, + "flos": 20314291687200.0, + "grad_norm": 2.1629044221988116, + "language_loss": 0.91884834, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.94056439, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.14605713, + "step": 6014, + "time_per_iteration": 4.087894916534424 + }, + { + "auxiliary_loss_clip": 0.01132129, + "auxiliary_loss_mlp": 0.01037236, + "balance_loss_clip": 1.04706645, + "balance_loss_mlp": 1.02335453, + "epoch": 0.36164136479783554, + "flos": 26015273064000.0, + "grad_norm": 1.8416539459833035, + "language_loss": 0.73695087, + "learning_rate": 2.95298526302391e-06, + "loss": 0.75864446, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.13867188, + "step": 6015, + "time_per_iteration": 2.6238744258880615 + }, + { + "auxiliary_loss_clip": 0.01137612, + "auxiliary_loss_mlp": 0.01034319, + "balance_loss_clip": 1.05010724, + "balance_loss_mlp": 1.0196445, + "epoch": 0.3617014880505035, + "flos": 29492092254240.0, + "grad_norm": 1.7702833643794316, + "language_loss": 0.64933544, + "learning_rate": 2.9526428386344e-06, + "loss": 0.67105472, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.14685059, + "step": 6016, + "time_per_iteration": 3.9344699382781982 + }, + { + "auxiliary_loss_clip": 0.01138844, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.05169845, + "balance_loss_mlp": 1.02452266, + "epoch": 0.3617616113031715, + "flos": 47607822325440.0, + "grad_norm": 1.8204629263086403, + "language_loss": 0.71828872, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.74008185, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.15942383, + "step": 6017, + "time_per_iteration": 2.8194491863250732 + }, + { + "auxiliary_loss_clip": 0.01136884, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.04767489, + "balance_loss_mlp": 1.02111924, + "epoch": 0.3618217345558395, + "flos": 14801366122560.0, + "grad_norm": 1.8974445086108012, + "language_loss": 0.73825669, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.75998127, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 0.89257812, + "router_z_loss_mlp": 0.14447021, + "step": 6018, + "time_per_iteration": 4.132829666137695 + }, + { + "auxiliary_loss_clip": 0.01133517, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.04965425, + "balance_loss_mlp": 1.02107167, + "epoch": 0.36188185780850746, + "flos": 30426206338080.0, + "grad_norm": 1.8144246330554863, + "language_loss": 0.69107223, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.71276402, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.14599609, + "step": 6019, + "time_per_iteration": 2.7102890014648438 + }, + { + "auxiliary_loss_clip": 0.01138171, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.04939556, + "balance_loss_mlp": 1.01863337, + "epoch": 0.3619419810611754, + "flos": 25574103388800.0, + "grad_norm": 1.574261089317872, + "language_loss": 0.76751477, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78923339, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 0.88769531, + "router_z_loss_mlp": 0.1505127, + "step": 6020, + "time_per_iteration": 2.6770267486572266 + }, + { + "auxiliary_loss_clip": 0.01138275, + "auxiliary_loss_mlp": 0.01040519, + "balance_loss_clip": 1.05115271, + "balance_loss_mlp": 1.02499795, + "epoch": 0.3620021043138434, + "flos": 27489892492800.0, + "grad_norm": 1.8315359110562834, + "language_loss": 0.73813546, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.7599234, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.15515137, + "step": 6021, + "time_per_iteration": 4.092499017715454 + }, + { + "auxiliary_loss_clip": 0.01135784, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.04967809, + "balance_loss_mlp": 1.02233744, + "epoch": 0.36206222756651135, + "flos": 19030685832000.0, + "grad_norm": 2.402360657168422, + "language_loss": 0.81123489, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.83294833, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.13214111, + "step": 6022, + "time_per_iteration": 2.6800429821014404 + }, + { + "auxiliary_loss_clip": 0.01137023, + "auxiliary_loss_mlp": 0.01037945, + "balance_loss_clip": 1.05363441, + "balance_loss_mlp": 1.02507615, + "epoch": 0.3621223508191793, + "flos": 28781277665760.0, + "grad_norm": 1.725261338833365, + "language_loss": 0.81675774, + "learning_rate": 2.950244857154417e-06, + "loss": 0.83850741, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.12860107, + "step": 6023, + "time_per_iteration": 2.7462522983551025 + }, + { + "auxiliary_loss_clip": 0.01138153, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.04989934, + "balance_loss_mlp": 1.02128804, + "epoch": 0.3621824740718473, + "flos": 27222678616320.0, + "grad_norm": 3.6792910382507946, + "language_loss": 0.79591095, + "learning_rate": 2.9499021441341e-06, + "loss": 0.81765062, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 0.88134766, + "router_z_loss_mlp": 0.14532471, + "step": 6024, + "time_per_iteration": 2.6919472217559814 + }, + { + "auxiliary_loss_clip": 0.01132143, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.0495857, + "balance_loss_mlp": 1.01642752, + "epoch": 0.36224259732451525, + "flos": 20454746045760.0, + "grad_norm": 2.0038963676081445, + "language_loss": 0.74790549, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.76952243, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13134766, + "step": 6025, + "time_per_iteration": 2.690681219100952 + }, + { + "auxiliary_loss_clip": 0.01134737, + "auxiliary_loss_mlp": 0.01032541, + "balance_loss_clip": 1.05091619, + "balance_loss_mlp": 1.0195291, + "epoch": 0.3623027205771832, + "flos": 28247457672000.0, + "grad_norm": 1.7879134081569867, + "language_loss": 0.72379017, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.74546289, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.13000488, + "step": 6026, + "time_per_iteration": 2.8311591148376465 + }, + { + "auxiliary_loss_clip": 0.01144268, + "auxiliary_loss_mlp": 0.0104309, + "balance_loss_clip": 1.05306625, + "balance_loss_mlp": 1.0281055, + "epoch": 0.3623628438298512, + "flos": 34836288749280.0, + "grad_norm": 2.236849674063044, + "language_loss": 0.789814, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81168759, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 0.91162109, + "router_z_loss_mlp": 0.15002441, + "step": 6027, + "time_per_iteration": 2.764915943145752 + }, + { + "auxiliary_loss_clip": 0.01137686, + "auxiliary_loss_mlp": 0.01036468, + "balance_loss_clip": 1.05060506, + "balance_loss_mlp": 1.02165008, + "epoch": 0.36242296708251914, + "flos": 31097036721600.0, + "grad_norm": 1.8310892380656416, + "language_loss": 0.67637336, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.69811493, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.14819336, + "step": 6028, + "time_per_iteration": 2.7033865451812744 + }, + { + "auxiliary_loss_clip": 0.01134535, + "auxiliary_loss_mlp": 0.01029243, + "balance_loss_clip": 1.05044651, + "balance_loss_mlp": 1.01635671, + "epoch": 0.3624830903351871, + "flos": 19877133222720.0, + "grad_norm": 2.046150453680142, + "language_loss": 0.85671318, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.87835097, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.12896729, + "step": 6029, + "time_per_iteration": 2.6748130321502686 + }, + { + "auxiliary_loss_clip": 0.01133413, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_clip": 1.04987657, + "balance_loss_mlp": 1.02061963, + "epoch": 0.36254321358785513, + "flos": 22325000077440.0, + "grad_norm": 1.7802214568716144, + "language_loss": 0.72779661, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.74946934, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13226318, + "step": 6030, + "time_per_iteration": 2.725796937942505 + }, + { + "auxiliary_loss_clip": 0.01141441, + "auxiliary_loss_mlp": 0.01037719, + "balance_loss_clip": 1.05154777, + "balance_loss_mlp": 1.02217412, + "epoch": 0.3626033368405231, + "flos": 18138338713440.0, + "grad_norm": 5.1568757879818, + "language_loss": 0.74299884, + "learning_rate": 2.94750214514905e-06, + "loss": 0.76479042, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.15551758, + "step": 6031, + "time_per_iteration": 2.6995110511779785 + }, + { + "auxiliary_loss_clip": 0.01134792, + "auxiliary_loss_mlp": 0.01036788, + "balance_loss_clip": 1.0501492, + "balance_loss_mlp": 1.0231266, + "epoch": 0.36266346009319106, + "flos": 27218951026560.0, + "grad_norm": 1.7884897422971764, + "language_loss": 0.72985423, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75156999, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.13653564, + "step": 6032, + "time_per_iteration": 2.684252977371216 + }, + { + "auxiliary_loss_clip": 0.01137192, + "auxiliary_loss_mlp": 0.01041504, + "balance_loss_clip": 1.05012369, + "balance_loss_mlp": 1.02829528, + "epoch": 0.362723583345859, + "flos": 22234821312960.0, + "grad_norm": 5.227279868785717, + "language_loss": 0.77467775, + "learning_rate": 2.946816107593884e-06, + "loss": 0.79646474, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.13220215, + "step": 6033, + "time_per_iteration": 2.673658609390259 + }, + { + "auxiliary_loss_clip": 0.01056166, + "auxiliary_loss_mlp": 0.01007746, + "balance_loss_clip": 1.02863348, + "balance_loss_mlp": 1.00593972, + "epoch": 0.362783706598527, + "flos": 83583176676480.0, + "grad_norm": 0.7840016902131025, + "language_loss": 0.64770937, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.66834849, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.27587891, + "router_z_loss_mlp": 0.01805115, + "step": 6034, + "time_per_iteration": 3.3543801307678223 + }, + { + "auxiliary_loss_clip": 0.01134209, + "auxiliary_loss_mlp": 0.01034668, + "balance_loss_clip": 1.04987323, + "balance_loss_mlp": 1.02102995, + "epoch": 0.36284382985119495, + "flos": 32429419031520.0, + "grad_norm": 2.4715012783514236, + "language_loss": 0.89753878, + "learning_rate": 2.946129926425273e-06, + "loss": 0.9192276, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.13635254, + "step": 6035, + "time_per_iteration": 2.68776535987854 + }, + { + "auxiliary_loss_clip": 0.01141296, + "auxiliary_loss_mlp": 0.01039436, + "balance_loss_clip": 1.0518465, + "balance_loss_mlp": 1.02436841, + "epoch": 0.3629039531038629, + "flos": 24595305095520.0, + "grad_norm": 1.810203484901991, + "language_loss": 0.73852789, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.76033521, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 0.89501953, + "router_z_loss_mlp": 0.15087891, + "step": 6036, + "time_per_iteration": 2.705754280090332 + }, + { + "auxiliary_loss_clip": 0.01138509, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.04859483, + "balance_loss_mlp": 1.01789069, + "epoch": 0.3629640763565309, + "flos": 22726874341440.0, + "grad_norm": 2.9024168396521266, + "language_loss": 0.76041633, + "learning_rate": 2.945443601747297e-06, + "loss": 0.78212512, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 0.14459229, + "step": 6037, + "time_per_iteration": 2.6174447536468506 + }, + { + "auxiliary_loss_clip": 0.01131483, + "auxiliary_loss_mlp": 0.01046379, + "balance_loss_clip": 1.04939365, + "balance_loss_mlp": 1.03200793, + "epoch": 0.36302419960919885, + "flos": 23881208607360.0, + "grad_norm": 1.5793744206936309, + "language_loss": 0.78102314, + "learning_rate": 2.945100385624828e-06, + "loss": 0.80280173, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.14373779, + "step": 6038, + "time_per_iteration": 2.712462902069092 + }, + { + "auxiliary_loss_clip": 0.01053141, + "auxiliary_loss_mlp": 0.01002886, + "balance_loss_clip": 1.02576971, + "balance_loss_mlp": 1.00109053, + "epoch": 0.3630843228618668, + "flos": 77845492994400.0, + "grad_norm": 0.835153796391573, + "language_loss": 0.63385493, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65441519, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.27416992, + "router_z_loss_mlp": 0.01794434, + "step": 6039, + "time_per_iteration": 3.3428404331207275 + }, + { + "auxiliary_loss_clip": 0.01136182, + "auxiliary_loss_mlp": 0.01038898, + "balance_loss_clip": 1.05192709, + "balance_loss_mlp": 1.02574313, + "epoch": 0.3631444461145348, + "flos": 26643404584800.0, + "grad_norm": 5.4511346303247805, + "language_loss": 0.71107465, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73282546, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.13153076, + "step": 6040, + "time_per_iteration": 2.760397434234619 + }, + { + "auxiliary_loss_clip": 0.01140594, + "auxiliary_loss_mlp": 0.01035016, + "balance_loss_clip": 1.05164003, + "balance_loss_mlp": 1.0211643, + "epoch": 0.36320456936720275, + "flos": 26505705401280.0, + "grad_norm": 1.9322590508412156, + "language_loss": 0.81306589, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83482194, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.1385498, + "step": 6041, + "time_per_iteration": 2.7900867462158203 + }, + { + "auxiliary_loss_clip": 0.01135643, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.04856801, + "balance_loss_mlp": 1.01743686, + "epoch": 0.3632646926198707, + "flos": 20766441545280.0, + "grad_norm": 2.4062750203563734, + "language_loss": 0.84180868, + "learning_rate": 2.943727162882107e-06, + "loss": 0.86349249, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.15307617, + "step": 6042, + "time_per_iteration": 2.6115610599517822 + }, + { + "auxiliary_loss_clip": 0.01138126, + "auxiliary_loss_mlp": 0.01042615, + "balance_loss_clip": 1.05344963, + "balance_loss_mlp": 1.02849436, + "epoch": 0.36332481587253873, + "flos": 28468933889760.0, + "grad_norm": 1.8276705261502273, + "language_loss": 0.78044009, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80224746, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.14135742, + "step": 6043, + "time_per_iteration": 2.7071661949157715 + }, + { + "auxiliary_loss_clip": 0.01133898, + "auxiliary_loss_mlp": 0.01037924, + "balance_loss_clip": 1.05077374, + "balance_loss_mlp": 1.02317834, + "epoch": 0.3633849391252067, + "flos": 13108552375680.0, + "grad_norm": 2.8259052879100985, + "language_loss": 0.66016275, + "learning_rate": 2.943040336741298e-06, + "loss": 0.68188095, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.14764404, + "step": 6044, + "time_per_iteration": 2.633388042449951 + }, + { + "auxiliary_loss_clip": 0.01135339, + "auxiliary_loss_mlp": 0.01030318, + "balance_loss_clip": 1.05123949, + "balance_loss_mlp": 1.01669252, + "epoch": 0.36344506237787466, + "flos": 31541528813760.0, + "grad_norm": 1.7296545658908502, + "language_loss": 0.8098439, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83150047, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.13641357, + "step": 6045, + "time_per_iteration": 2.770085096359253 + }, + { + "auxiliary_loss_clip": 0.0113601, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.04974508, + "balance_loss_mlp": 1.02215064, + "epoch": 0.3635051856305426, + "flos": 36794736198720.0, + "grad_norm": 4.189650279647054, + "language_loss": 0.64985043, + "learning_rate": 2.942353367559755e-06, + "loss": 0.67157495, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.14294434, + "step": 6046, + "time_per_iteration": 2.735241651535034 + }, + { + "auxiliary_loss_clip": 0.01135289, + "auxiliary_loss_mlp": 0.01036949, + "balance_loss_clip": 1.04971588, + "balance_loss_mlp": 1.0230372, + "epoch": 0.3635653088832106, + "flos": 27086802710400.0, + "grad_norm": 1.7436933940533288, + "language_loss": 0.77199793, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.7937203, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.13916016, + "step": 6047, + "time_per_iteration": 2.719684600830078 + }, + { + "auxiliary_loss_clip": 0.01142868, + "auxiliary_loss_mlp": 0.01043703, + "balance_loss_clip": 1.05024219, + "balance_loss_mlp": 1.02733588, + "epoch": 0.36362543213587856, + "flos": 30245848809120.0, + "grad_norm": 1.6310600410758218, + "language_loss": 0.79158247, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.81344819, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 0.92626953, + "router_z_loss_mlp": 0.16369629, + "step": 6048, + "time_per_iteration": 2.669424057006836 + }, + { + "auxiliary_loss_clip": 0.01048794, + "auxiliary_loss_mlp": 0.01003333, + "balance_loss_clip": 1.0216856, + "balance_loss_mlp": 1.00158858, + "epoch": 0.3636855553885465, + "flos": 76293700848000.0, + "grad_norm": 0.7519750614197487, + "language_loss": 0.52565885, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54618013, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.27124023, + "router_z_loss_mlp": 0.01748657, + "step": 6049, + "time_per_iteration": 3.355207681655884 + }, + { + "auxiliary_loss_clip": 0.01138014, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.05183887, + "balance_loss_mlp": 1.02052164, + "epoch": 0.3637456786412145, + "flos": 29359133592480.0, + "grad_norm": 2.01779605123267, + "language_loss": 0.86563075, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88736171, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.14544678, + "step": 6050, + "time_per_iteration": 2.74940824508667 + }, + { + "auxiliary_loss_clip": 0.01132777, + "auxiliary_loss_mlp": 0.01033104, + "balance_loss_clip": 1.04894805, + "balance_loss_mlp": 1.01978207, + "epoch": 0.36380580189388245, + "flos": 20366633662560.0, + "grad_norm": 1.7267232035272073, + "language_loss": 0.77909541, + "learning_rate": 2.940635319486546e-06, + "loss": 0.80075419, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13336182, + "step": 6051, + "time_per_iteration": 2.63122820854187 + }, + { + "auxiliary_loss_clip": 0.01133689, + "auxiliary_loss_mlp": 0.01032726, + "balance_loss_clip": 1.04775977, + "balance_loss_mlp": 1.01918936, + "epoch": 0.3638659251465504, + "flos": 30645048932640.0, + "grad_norm": 1.9352804440037474, + "language_loss": 0.82462668, + "learning_rate": 2.940291602812822e-06, + "loss": 0.84629083, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.13531494, + "step": 6052, + "time_per_iteration": 2.738292694091797 + }, + { + "auxiliary_loss_clip": 0.01129084, + "auxiliary_loss_mlp": 0.01031518, + "balance_loss_clip": 1.04626226, + "balance_loss_mlp": 1.0189178, + "epoch": 0.3639260483992184, + "flos": 28068477730560.0, + "grad_norm": 1.7254510979119382, + "language_loss": 0.72243017, + "learning_rate": 2.939947850483145e-06, + "loss": 0.7440362, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.12615967, + "step": 6053, + "time_per_iteration": 2.699897527694702 + }, + { + "auxiliary_loss_clip": 0.01049627, + "auxiliary_loss_mlp": 0.0100201, + "balance_loss_clip": 1.02264953, + "balance_loss_mlp": 1.00026917, + "epoch": 0.36398617165188635, + "flos": 86289141019680.0, + "grad_norm": 0.7651512968518326, + "language_loss": 0.61212349, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63263988, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.27001953, + "router_z_loss_mlp": 0.01741028, + "step": 6054, + "time_per_iteration": 4.705052375793457 + }, + { + "auxiliary_loss_clip": 0.01135308, + "auxiliary_loss_mlp": 0.01036191, + "balance_loss_clip": 1.04844594, + "balance_loss_mlp": 1.02126563, + "epoch": 0.3640462949045543, + "flos": 27131973127200.0, + "grad_norm": 2.115789015660221, + "language_loss": 0.75779331, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.77950829, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.14923096, + "step": 6055, + "time_per_iteration": 4.232286691665649 + }, + { + "auxiliary_loss_clip": 0.01133974, + "auxiliary_loss_mlp": 0.01035146, + "balance_loss_clip": 1.04811597, + "balance_loss_mlp": 1.02142549, + "epoch": 0.3641064181572223, + "flos": 26287511083200.0, + "grad_norm": 1.9997615311505372, + "language_loss": 0.75087357, + "learning_rate": 2.938916379688765e-06, + "loss": 0.77256477, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 0.85742188, + "router_z_loss_mlp": 0.13720703, + "step": 6056, + "time_per_iteration": 2.82844877243042 + }, + { + "auxiliary_loss_clip": 0.01133465, + "auxiliary_loss_mlp": 0.0103915, + "balance_loss_clip": 1.04938197, + "balance_loss_mlp": 1.02494621, + "epoch": 0.3641665414098903, + "flos": 27178034924160.0, + "grad_norm": 1.9789508766856172, + "language_loss": 0.80402565, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82575178, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.14190674, + "step": 6057, + "time_per_iteration": 2.638887882232666 + }, + { + "auxiliary_loss_clip": 0.01132807, + "auxiliary_loss_mlp": 0.01037817, + "balance_loss_clip": 1.04940248, + "balance_loss_mlp": 1.02393472, + "epoch": 0.36422666466255826, + "flos": 34568507630880.0, + "grad_norm": 2.273639563573652, + "language_loss": 0.80194807, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.82365429, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13891602, + "step": 6058, + "time_per_iteration": 4.1108479499816895 + }, + { + "auxiliary_loss_clip": 0.01131157, + "auxiliary_loss_mlp": 0.010341, + "balance_loss_clip": 1.04641783, + "balance_loss_mlp": 1.02043891, + "epoch": 0.36428678791522623, + "flos": 29493145703520.0, + "grad_norm": 1.9384304903492255, + "language_loss": 0.84869307, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.87034559, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.13659668, + "step": 6059, + "time_per_iteration": 2.6571431159973145 + }, + { + "auxiliary_loss_clip": 0.01137753, + "auxiliary_loss_mlp": 0.01039516, + "balance_loss_clip": 1.0503459, + "balance_loss_mlp": 1.0250324, + "epoch": 0.3643469111678942, + "flos": 27489406285440.0, + "grad_norm": 1.8875152391930148, + "language_loss": 0.87771273, + "learning_rate": 2.937540586903884e-06, + "loss": 0.89948535, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.14489746, + "step": 6060, + "time_per_iteration": 2.678173303604126 + }, + { + "auxiliary_loss_clip": 0.01139105, + "auxiliary_loss_mlp": 0.01038747, + "balance_loss_clip": 1.0506928, + "balance_loss_mlp": 1.02383351, + "epoch": 0.36440703442056216, + "flos": 23704335564480.0, + "grad_norm": 3.619584018229907, + "language_loss": 0.66710776, + "learning_rate": 2.937196549795971e-06, + "loss": 0.68888628, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 0.88476562, + "router_z_loss_mlp": 0.14916992, + "step": 6061, + "time_per_iteration": 3.913968324661255 + }, + { + "auxiliary_loss_clip": 0.01139224, + "auxiliary_loss_mlp": 0.01034758, + "balance_loss_clip": 1.05131948, + "balance_loss_mlp": 1.02028573, + "epoch": 0.3644671576732301, + "flos": 22012899405120.0, + "grad_norm": 2.1157308092992504, + "language_loss": 0.74933511, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.77107489, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14459229, + "step": 6062, + "time_per_iteration": 2.5952072143554688 + }, + { + "auxiliary_loss_clip": 0.01135118, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.04975975, + "balance_loss_mlp": 1.0168736, + "epoch": 0.3645272809258981, + "flos": 26286700737600.0, + "grad_norm": 1.8048945422663782, + "language_loss": 0.72255665, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74423337, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.15686035, + "step": 6063, + "time_per_iteration": 2.7526776790618896 + }, + { + "auxiliary_loss_clip": 0.01134121, + "auxiliary_loss_mlp": 0.01039389, + "balance_loss_clip": 1.04889011, + "balance_loss_mlp": 1.02495265, + "epoch": 0.36458740417856605, + "flos": 27755971885440.0, + "grad_norm": 1.9578236596321854, + "language_loss": 0.67955095, + "learning_rate": 2.936164225292901e-06, + "loss": 0.70128608, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.14453125, + "step": 6064, + "time_per_iteration": 2.6399810314178467 + }, + { + "auxiliary_loss_clip": 0.01136343, + "auxiliary_loss_mlp": 0.01043965, + "balance_loss_clip": 1.04998755, + "balance_loss_mlp": 1.0295167, + "epoch": 0.364647527431234, + "flos": 31897300763520.0, + "grad_norm": 3.745024394360881, + "language_loss": 0.74457288, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.7663759, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.14453125, + "step": 6065, + "time_per_iteration": 2.7371277809143066 + }, + { + "auxiliary_loss_clip": 0.01137087, + "auxiliary_loss_mlp": 0.0103837, + "balance_loss_clip": 1.04899144, + "balance_loss_mlp": 1.02318287, + "epoch": 0.364707650683902, + "flos": 37863510670080.0, + "grad_norm": 5.175863952302649, + "language_loss": 0.75362062, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77537519, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 0.87939453, + "router_z_loss_mlp": 0.15185547, + "step": 6066, + "time_per_iteration": 2.7299466133117676 + }, + { + "auxiliary_loss_clip": 0.01130974, + "auxiliary_loss_mlp": 0.01029208, + "balance_loss_clip": 1.04824805, + "balance_loss_mlp": 1.01657724, + "epoch": 0.36476777393656995, + "flos": 23882829298560.0, + "grad_norm": 1.9419531428300314, + "language_loss": 0.76828504, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.78988683, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.12646484, + "step": 6067, + "time_per_iteration": 2.6664650440216064 + }, + { + "auxiliary_loss_clip": 0.01132182, + "auxiliary_loss_mlp": 0.01036937, + "balance_loss_clip": 1.04979551, + "balance_loss_mlp": 1.02465248, + "epoch": 0.3648278971892379, + "flos": 21656965386240.0, + "grad_norm": 2.3431077306801256, + "language_loss": 0.70916939, + "learning_rate": 2.934787295690886e-06, + "loss": 0.73086059, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.12286377, + "step": 6068, + "time_per_iteration": 2.673128366470337 + }, + { + "auxiliary_loss_clip": 0.01133716, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.04736149, + "balance_loss_mlp": 1.02309406, + "epoch": 0.3648880204419059, + "flos": 21879454536000.0, + "grad_norm": 1.823744573710587, + "language_loss": 0.74146867, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.76317453, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.13769531, + "step": 6069, + "time_per_iteration": 2.6446304321289062 + }, + { + "auxiliary_loss_clip": 0.01136709, + "auxiliary_loss_mlp": 0.01036013, + "balance_loss_clip": 1.04939556, + "balance_loss_mlp": 1.0215826, + "epoch": 0.3649481436945739, + "flos": 27623377879200.0, + "grad_norm": 1.8533625948032517, + "language_loss": 0.66102493, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68275219, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.14428711, + "step": 6070, + "time_per_iteration": 2.7454636096954346 + }, + { + "auxiliary_loss_clip": 0.01131139, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.04876387, + "balance_loss_mlp": 1.01783776, + "epoch": 0.36500826694724187, + "flos": 26331628050720.0, + "grad_norm": 1.765306488559707, + "language_loss": 0.74296153, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76458383, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.13262939, + "step": 6071, + "time_per_iteration": 2.6797783374786377 + }, + { + "auxiliary_loss_clip": 0.01132219, + "auxiliary_loss_mlp": 0.01036016, + "balance_loss_clip": 1.04775071, + "balance_loss_mlp": 1.02141285, + "epoch": 0.36506839019990983, + "flos": 16804700367840.0, + "grad_norm": 1.8920183852989212, + "language_loss": 0.88546741, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.90714973, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14599609, + "step": 6072, + "time_per_iteration": 2.6954915523529053 + }, + { + "auxiliary_loss_clip": 0.01133992, + "auxiliary_loss_mlp": 0.0103027, + "balance_loss_clip": 1.04960048, + "balance_loss_mlp": 1.01695395, + "epoch": 0.3651285134525778, + "flos": 21078663769440.0, + "grad_norm": 2.1399173861750214, + "language_loss": 0.72088909, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.74253178, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.13330078, + "step": 6073, + "time_per_iteration": 2.64668345451355 + }, + { + "auxiliary_loss_clip": 0.01137355, + "auxiliary_loss_mlp": 0.01032479, + "balance_loss_clip": 1.05090594, + "balance_loss_mlp": 1.0182749, + "epoch": 0.36518863670524576, + "flos": 26732286796320.0, + "grad_norm": 2.0796652977921273, + "language_loss": 0.6710881, + "learning_rate": 2.932720838132236e-06, + "loss": 0.69278646, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 0.86425781, + "router_z_loss_mlp": 0.14202881, + "step": 6074, + "time_per_iteration": 2.710772752761841 + }, + { + "auxiliary_loss_clip": 0.01130239, + "auxiliary_loss_mlp": 0.01033346, + "balance_loss_clip": 1.04624033, + "balance_loss_mlp": 1.02023292, + "epoch": 0.3652487599579137, + "flos": 33095346824160.0, + "grad_norm": 1.53669170561342, + "language_loss": 0.72922409, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75085998, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13110352, + "step": 6075, + "time_per_iteration": 2.708271026611328 + }, + { + "auxiliary_loss_clip": 0.01137216, + "auxiliary_loss_mlp": 0.0104065, + "balance_loss_clip": 1.04908788, + "balance_loss_mlp": 1.02574849, + "epoch": 0.3653088832105817, + "flos": 24105723621120.0, + "grad_norm": 2.538048414366196, + "language_loss": 0.89634252, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.91812116, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 0.88085938, + "router_z_loss_mlp": 0.14904785, + "step": 6076, + "time_per_iteration": 2.653811454772949 + }, + { + "auxiliary_loss_clip": 0.01133878, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.05013049, + "balance_loss_mlp": 1.02090883, + "epoch": 0.36536900646324966, + "flos": 16002410461920.0, + "grad_norm": 2.149040337287881, + "language_loss": 0.69570953, + "learning_rate": 2.931687131696872e-06, + "loss": 0.71739656, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.13928223, + "step": 6077, + "time_per_iteration": 2.605405569076538 + }, + { + "auxiliary_loss_clip": 0.01047068, + "auxiliary_loss_mlp": 0.01002926, + "balance_loss_clip": 1.02015519, + "balance_loss_mlp": 1.00132287, + "epoch": 0.3654291297159176, + "flos": 86756484857760.0, + "grad_norm": 0.7511185154930318, + "language_loss": 0.61808133, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.63858134, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.26928711, + "router_z_loss_mlp": 0.01605225, + "step": 6078, + "time_per_iteration": 3.3578338623046875 + }, + { + "auxiliary_loss_clip": 0.01131652, + "auxiliary_loss_mlp": 0.01035557, + "balance_loss_clip": 1.04716992, + "balance_loss_mlp": 1.02255726, + "epoch": 0.3654892529685856, + "flos": 28818830833920.0, + "grad_norm": 2.6541891090145535, + "language_loss": 0.78270924, + "learning_rate": 2.930997817403173e-06, + "loss": 0.80438131, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.12994385, + "step": 6079, + "time_per_iteration": 2.6819963455200195 + }, + { + "auxiliary_loss_clip": 0.01136049, + "auxiliary_loss_mlp": 0.01034943, + "balance_loss_clip": 1.04982233, + "balance_loss_mlp": 1.02028024, + "epoch": 0.36554937622125355, + "flos": 53044912242720.0, + "grad_norm": 1.8085744887006354, + "language_loss": 0.62797391, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.64968377, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.14660645, + "step": 6080, + "time_per_iteration": 2.8394923210144043 + }, + { + "auxiliary_loss_clip": 0.01135783, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.04856205, + "balance_loss_mlp": 1.01862836, + "epoch": 0.3656094994739215, + "flos": 28424371232160.0, + "grad_norm": 2.6688689932962184, + "language_loss": 0.67696595, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69865763, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.14752197, + "step": 6081, + "time_per_iteration": 2.705002784729004 + }, + { + "auxiliary_loss_clip": 0.01139177, + "auxiliary_loss_mlp": 0.01041059, + "balance_loss_clip": 1.04895949, + "balance_loss_mlp": 1.02664709, + "epoch": 0.3656696227265895, + "flos": 29983051316160.0, + "grad_norm": 6.487959241711502, + "language_loss": 0.74997336, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.77177572, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 0.90185547, + "router_z_loss_mlp": 0.14404297, + "step": 6082, + "time_per_iteration": 2.7319107055664062 + }, + { + "auxiliary_loss_clip": 0.01135977, + "auxiliary_loss_mlp": 0.01029416, + "balance_loss_clip": 1.04957616, + "balance_loss_mlp": 1.01626754, + "epoch": 0.3657297459792575, + "flos": 34078804604640.0, + "grad_norm": 1.879131413481047, + "language_loss": 0.82768071, + "learning_rate": 2.929618765277987e-06, + "loss": 0.84933472, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.13171387, + "step": 6083, + "time_per_iteration": 2.8474769592285156 + }, + { + "auxiliary_loss_clip": 0.01047006, + "auxiliary_loss_mlp": 0.01003947, + "balance_loss_clip": 1.01998937, + "balance_loss_mlp": 1.00224054, + "epoch": 0.36578986923192547, + "flos": 82233979695360.0, + "grad_norm": 0.8081208391522304, + "language_loss": 0.5919351, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61244464, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.27026367, + "router_z_loss_mlp": 0.0171051, + "step": 6084, + "time_per_iteration": 3.388716220855713 + }, + { + "auxiliary_loss_clip": 0.01134641, + "auxiliary_loss_mlp": 0.01034709, + "balance_loss_clip": 1.04977226, + "balance_loss_mlp": 1.02085638, + "epoch": 0.36584999248459343, + "flos": 24681837304800.0, + "grad_norm": 1.888222721860112, + "language_loss": 0.72800469, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.74969816, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.13848877, + "step": 6085, + "time_per_iteration": 2.680979013442993 + }, + { + "auxiliary_loss_clip": 0.01138421, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.05250406, + "balance_loss_mlp": 1.01951182, + "epoch": 0.3659101157372614, + "flos": 23259924506880.0, + "grad_norm": 1.6919738590544027, + "language_loss": 0.77678025, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.79848921, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.12969971, + "step": 6086, + "time_per_iteration": 2.789168357849121 + }, + { + "auxiliary_loss_clip": 0.0113098, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.04905367, + "balance_loss_mlp": 1.01690042, + "epoch": 0.36597023898992936, + "flos": 37596053689920.0, + "grad_norm": 4.288184509378264, + "language_loss": 0.76971602, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.79132873, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.1338501, + "step": 6087, + "time_per_iteration": 2.7997305393218994 + }, + { + "auxiliary_loss_clip": 0.01139184, + "auxiliary_loss_mlp": 0.01035034, + "balance_loss_clip": 1.05166042, + "balance_loss_mlp": 1.02053869, + "epoch": 0.36603036224259733, + "flos": 25040729085120.0, + "grad_norm": 2.555301794551832, + "language_loss": 0.70202506, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.72376728, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.14489746, + "step": 6088, + "time_per_iteration": 2.7251174449920654 + }, + { + "auxiliary_loss_clip": 0.01145585, + "auxiliary_loss_mlp": 0.01038326, + "balance_loss_clip": 1.05269408, + "balance_loss_mlp": 1.02315116, + "epoch": 0.3660904854952653, + "flos": 46770693909120.0, + "grad_norm": 2.130446119294587, + "language_loss": 0.7984519, + "learning_rate": 2.92754912981472e-06, + "loss": 0.8202911, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 0.92822266, + "router_z_loss_mlp": 0.15179443, + "step": 6089, + "time_per_iteration": 2.8094234466552734 + }, + { + "auxiliary_loss_clip": 0.01135344, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.05091465, + "balance_loss_mlp": 1.01782489, + "epoch": 0.36615060874793326, + "flos": 26643850274880.0, + "grad_norm": 2.1931543116151433, + "language_loss": 0.71347916, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73514307, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.13232422, + "step": 6090, + "time_per_iteration": 2.682677745819092 + }, + { + "auxiliary_loss_clip": 0.01134244, + "auxiliary_loss_mlp": 0.01043904, + "balance_loss_clip": 1.05249941, + "balance_loss_mlp": 1.03062963, + "epoch": 0.3662107320006012, + "flos": 20232499999680.0, + "grad_norm": 2.3209543219321795, + "language_loss": 0.74175334, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.76353484, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.1328125, + "step": 6091, + "time_per_iteration": 2.6221988201141357 + }, + { + "auxiliary_loss_clip": 0.01137939, + "auxiliary_loss_mlp": 0.01040598, + "balance_loss_clip": 1.0525434, + "balance_loss_mlp": 1.02650189, + "epoch": 0.3662708552532692, + "flos": 25573576664160.0, + "grad_norm": 1.8771172637494744, + "language_loss": 0.72657633, + "learning_rate": 2.926513837074284e-06, + "loss": 0.74836171, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.14093018, + "step": 6092, + "time_per_iteration": 2.7959442138671875 + }, + { + "auxiliary_loss_clip": 0.01138373, + "auxiliary_loss_mlp": 0.01042733, + "balance_loss_clip": 1.05172169, + "balance_loss_mlp": 1.02746856, + "epoch": 0.36633097850593715, + "flos": 26726330756160.0, + "grad_norm": 2.2622472440574852, + "language_loss": 0.78275478, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.80456585, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 0.86669922, + "router_z_loss_mlp": 0.152771, + "step": 6093, + "time_per_iteration": 4.114275693893433 + }, + { + "auxiliary_loss_clip": 0.01135981, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.04890013, + "balance_loss_mlp": 1.02441299, + "epoch": 0.3663911017586051, + "flos": 40089577168800.0, + "grad_norm": 1.8335266562406665, + "language_loss": 0.73877692, + "learning_rate": 2.925823466224696e-06, + "loss": 0.76051855, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 0.87060547, + "router_z_loss_mlp": 0.13769531, + "step": 6094, + "time_per_iteration": 4.187153339385986 + }, + { + "auxiliary_loss_clip": 0.01139933, + "auxiliary_loss_mlp": 0.01049433, + "balance_loss_clip": 1.05227804, + "balance_loss_mlp": 1.03480625, + "epoch": 0.3664512250112731, + "flos": 33277527630720.0, + "grad_norm": 1.635782084593199, + "language_loss": 0.79451632, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.81640995, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.14630127, + "step": 6095, + "time_per_iteration": 2.733736991882324 + }, + { + "auxiliary_loss_clip": 0.01141051, + "auxiliary_loss_mlp": 0.01036206, + "balance_loss_clip": 1.05196428, + "balance_loss_mlp": 1.02069128, + "epoch": 0.3665113482639411, + "flos": 21700474594560.0, + "grad_norm": 2.2046222411293077, + "language_loss": 0.73071587, + "learning_rate": 2.925132954945834e-06, + "loss": 0.75248849, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 0.890625, + "router_z_loss_mlp": 0.15509033, + "step": 6096, + "time_per_iteration": 2.736384391784668 + }, + { + "auxiliary_loss_clip": 0.01138553, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.05014157, + "balance_loss_mlp": 1.02124596, + "epoch": 0.36657147151660907, + "flos": 33988868943840.0, + "grad_norm": 2.6266495658668907, + "language_loss": 0.66822177, + "learning_rate": 2.924787646678155e-06, + "loss": 0.68996263, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.1428833, + "step": 6097, + "time_per_iteration": 4.15762996673584 + }, + { + "auxiliary_loss_clip": 0.01141795, + "auxiliary_loss_mlp": 0.01040939, + "balance_loss_clip": 1.05444789, + "balance_loss_mlp": 1.02677679, + "epoch": 0.36663159476927704, + "flos": 30961809092160.0, + "grad_norm": 1.5826106873446826, + "language_loss": 0.77498943, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.79681671, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.14172363, + "step": 6098, + "time_per_iteration": 2.743715763092041 + }, + { + "auxiliary_loss_clip": 0.01135791, + "auxiliary_loss_mlp": 0.01040126, + "balance_loss_clip": 1.05235744, + "balance_loss_mlp": 1.02588081, + "epoch": 0.366691718021945, + "flos": 26060524515360.0, + "grad_norm": 2.2827158824912837, + "language_loss": 0.73281842, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.75457752, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.14251709, + "step": 6099, + "time_per_iteration": 2.6629228591918945 + }, + { + "auxiliary_loss_clip": 0.01132205, + "auxiliary_loss_mlp": 0.01045438, + "balance_loss_clip": 1.04934311, + "balance_loss_mlp": 1.0316689, + "epoch": 0.36675184127461297, + "flos": 20499551807040.0, + "grad_norm": 2.115469899177919, + "language_loss": 0.84668434, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86846077, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13769531, + "step": 6100, + "time_per_iteration": 4.058032035827637 + }, + { + "auxiliary_loss_clip": 0.01141277, + "auxiliary_loss_mlp": 0.01030962, + "balance_loss_clip": 1.05194104, + "balance_loss_mlp": 1.01688349, + "epoch": 0.36681196452728093, + "flos": 26730625587840.0, + "grad_norm": 1.9585107298834599, + "language_loss": 0.70450664, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.72622907, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 0.89208984, + "router_z_loss_mlp": 0.14074707, + "step": 6101, + "time_per_iteration": 2.7064602375030518 + }, + { + "auxiliary_loss_clip": 0.01141227, + "auxiliary_loss_mlp": 0.01046225, + "balance_loss_clip": 1.0524025, + "balance_loss_mlp": 1.03104413, + "epoch": 0.3668720877799489, + "flos": 21612524280480.0, + "grad_norm": 2.4845434809494655, + "language_loss": 0.75566971, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.77754426, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 0.88867188, + "router_z_loss_mlp": 0.15179443, + "step": 6102, + "time_per_iteration": 2.6069388389587402 + }, + { + "auxiliary_loss_clip": 0.01142016, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.05118024, + "balance_loss_mlp": 1.02015388, + "epoch": 0.36693221103261686, + "flos": 57407069062080.0, + "grad_norm": 3.242319689249053, + "language_loss": 0.69945902, + "learning_rate": 2.922715061101625e-06, + "loss": 0.72123069, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.14990234, + "step": 6103, + "time_per_iteration": 2.8808741569519043 + }, + { + "auxiliary_loss_clip": 0.01137374, + "auxiliary_loss_mlp": 0.01037816, + "balance_loss_clip": 1.05008721, + "balance_loss_mlp": 1.02333164, + "epoch": 0.3669923342852848, + "flos": 19475218441440.0, + "grad_norm": 1.7910676935401058, + "language_loss": 0.71651936, + "learning_rate": 2.922369507632716e-06, + "loss": 0.73827124, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.14489746, + "step": 6104, + "time_per_iteration": 2.804903030395508 + }, + { + "auxiliary_loss_clip": 0.01137448, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.05118132, + "balance_loss_mlp": 1.01812124, + "epoch": 0.3670524575379528, + "flos": 24372856463040.0, + "grad_norm": 2.042653655705656, + "language_loss": 0.81662011, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83832359, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.14788818, + "step": 6105, + "time_per_iteration": 2.6304445266723633 + }, + { + "auxiliary_loss_clip": 0.01145321, + "auxiliary_loss_mlp": 0.01039368, + "balance_loss_clip": 1.05441284, + "balance_loss_mlp": 1.02431178, + "epoch": 0.36711258079062076, + "flos": 31363156631520.0, + "grad_norm": 1.7349173027467173, + "language_loss": 0.81076264, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.83260953, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 0.90917969, + "router_z_loss_mlp": 0.15045166, + "step": 6106, + "time_per_iteration": 2.75113582611084 + }, + { + "auxiliary_loss_clip": 0.01051458, + "auxiliary_loss_mlp": 0.01001186, + "balance_loss_clip": 1.02391577, + "balance_loss_mlp": 0.99953103, + "epoch": 0.3671727040432887, + "flos": 74154004489440.0, + "grad_norm": 0.6940498111477221, + "language_loss": 0.59200644, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.61253285, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.27514648, + "router_z_loss_mlp": 0.0165863, + "step": 6107, + "time_per_iteration": 3.3157644271850586 + }, + { + "auxiliary_loss_clip": 0.01134875, + "auxiliary_loss_mlp": 0.01033214, + "balance_loss_clip": 1.04988623, + "balance_loss_mlp": 1.01912963, + "epoch": 0.3672328272959567, + "flos": 22770383549760.0, + "grad_norm": 1.6529770205128145, + "language_loss": 0.74240232, + "learning_rate": 2.92098694412469e-06, + "loss": 0.76408321, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.14074707, + "step": 6108, + "time_per_iteration": 2.7918694019317627 + }, + { + "auxiliary_loss_clip": 0.01139701, + "auxiliary_loss_mlp": 0.01035774, + "balance_loss_clip": 1.0513624, + "balance_loss_mlp": 1.02155805, + "epoch": 0.3672929505486247, + "flos": 18362448554400.0, + "grad_norm": 2.8038686690922927, + "language_loss": 0.7329331, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.75468785, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.14221191, + "step": 6109, + "time_per_iteration": 2.677640676498413 + }, + { + "auxiliary_loss_clip": 0.01135681, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.05030489, + "balance_loss_mlp": 1.01987624, + "epoch": 0.3673530738012927, + "flos": 25123655256480.0, + "grad_norm": 2.4217836351193527, + "language_loss": 0.52808988, + "learning_rate": 2.920295452774744e-06, + "loss": 0.54978561, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.14025879, + "step": 6110, + "time_per_iteration": 2.702807664871216 + }, + { + "auxiliary_loss_clip": 0.01139792, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.05508018, + "balance_loss_mlp": 1.02064443, + "epoch": 0.36741319705396064, + "flos": 26466004817280.0, + "grad_norm": 1.6081250818377124, + "language_loss": 0.80571634, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82747006, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.14923096, + "step": 6111, + "time_per_iteration": 2.6901447772979736 + }, + { + "auxiliary_loss_clip": 0.01136534, + "auxiliary_loss_mlp": 0.01037635, + "balance_loss_clip": 1.0521853, + "balance_loss_mlp": 1.02374685, + "epoch": 0.3674733203066286, + "flos": 36438599593440.0, + "grad_norm": 1.6356612384426032, + "language_loss": 0.72321117, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.7449528, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.13873291, + "step": 6112, + "time_per_iteration": 2.789107084274292 + }, + { + "auxiliary_loss_clip": 0.01137493, + "auxiliary_loss_mlp": 0.01043576, + "balance_loss_clip": 1.05185413, + "balance_loss_mlp": 1.02959275, + "epoch": 0.36753344355929657, + "flos": 22277317589280.0, + "grad_norm": 2.3671867597679896, + "language_loss": 0.85009462, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87190533, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.13989258, + "step": 6113, + "time_per_iteration": 2.690662384033203 + }, + { + "auxiliary_loss_clip": 0.01140923, + "auxiliary_loss_mlp": 0.0103564, + "balance_loss_clip": 1.05155158, + "balance_loss_mlp": 1.02041733, + "epoch": 0.36759356681196453, + "flos": 30872886363360.0, + "grad_norm": 1.959572311416268, + "language_loss": 0.78662741, + "learning_rate": 2.918912051407413e-06, + "loss": 0.80839306, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 0.89306641, + "router_z_loss_mlp": 0.15222168, + "step": 6114, + "time_per_iteration": 2.6773366928100586 + }, + { + "auxiliary_loss_clip": 0.01143603, + "auxiliary_loss_mlp": 0.01047254, + "balance_loss_clip": 1.0525732, + "balance_loss_mlp": 1.03045726, + "epoch": 0.3676536900646325, + "flos": 25664444222400.0, + "grad_norm": 1.7795425813442065, + "language_loss": 0.67210823, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69401681, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 0.90966797, + "router_z_loss_mlp": 0.16796875, + "step": 6115, + "time_per_iteration": 2.7704741954803467 + }, + { + "auxiliary_loss_clip": 0.01134064, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.05107415, + "balance_loss_mlp": 1.01972961, + "epoch": 0.36771381331730046, + "flos": 19875026324160.0, + "grad_norm": 3.448918177669379, + "language_loss": 0.76249838, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78417253, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.1362915, + "step": 6116, + "time_per_iteration": 2.6135306358337402 + }, + { + "auxiliary_loss_clip": 0.01136701, + "auxiliary_loss_mlp": 0.01036985, + "balance_loss_clip": 1.04983044, + "balance_loss_mlp": 1.02322817, + "epoch": 0.36777393656996843, + "flos": 27222759650880.0, + "grad_norm": 2.0308099459271016, + "language_loss": 0.6322546, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.65399146, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.13757324, + "step": 6117, + "time_per_iteration": 2.7733218669891357 + }, + { + "auxiliary_loss_clip": 0.01135826, + "auxiliary_loss_mlp": 0.01035911, + "balance_loss_clip": 1.05043793, + "balance_loss_mlp": 1.02126002, + "epoch": 0.3678340598226364, + "flos": 32743424016000.0, + "grad_norm": 1.8927432365953094, + "language_loss": 0.73211575, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75383306, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.14642334, + "step": 6118, + "time_per_iteration": 2.6859469413757324 + }, + { + "auxiliary_loss_clip": 0.01144286, + "auxiliary_loss_mlp": 0.01044145, + "balance_loss_clip": 1.05424523, + "balance_loss_mlp": 1.02886808, + "epoch": 0.36789418307530436, + "flos": 26553833579520.0, + "grad_norm": 2.15562434099768, + "language_loss": 0.72307014, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.74495447, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 0.89941406, + "router_z_loss_mlp": 0.15264893, + "step": 6119, + "time_per_iteration": 2.7614104747772217 + }, + { + "auxiliary_loss_clip": 0.0113824, + "auxiliary_loss_mlp": 0.01036655, + "balance_loss_clip": 1.05236006, + "balance_loss_mlp": 1.02178359, + "epoch": 0.3679543063279723, + "flos": 19430412680160.0, + "grad_norm": 3.209980115841713, + "language_loss": 0.80217969, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.82392865, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.14862061, + "step": 6120, + "time_per_iteration": 2.6574740409851074 + }, + { + "auxiliary_loss_clip": 0.01136931, + "auxiliary_loss_mlp": 0.01041549, + "balance_loss_clip": 1.05034685, + "balance_loss_mlp": 1.02729118, + "epoch": 0.3680144295806403, + "flos": 29621728499040.0, + "grad_norm": 2.063412226658891, + "language_loss": 0.63983428, + "learning_rate": 2.916489757978126e-06, + "loss": 0.66161913, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.1427002, + "step": 6121, + "time_per_iteration": 2.6886019706726074 + }, + { + "auxiliary_loss_clip": 0.01142073, + "auxiliary_loss_mlp": 0.01042796, + "balance_loss_clip": 1.05360973, + "balance_loss_mlp": 1.02810323, + "epoch": 0.36807455283330826, + "flos": 31852738105920.0, + "grad_norm": 2.1766285488503363, + "language_loss": 0.71132803, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.73317683, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.14709473, + "step": 6122, + "time_per_iteration": 2.6941380500793457 + }, + { + "auxiliary_loss_clip": 0.01136937, + "auxiliary_loss_mlp": 0.0103576, + "balance_loss_clip": 1.05465555, + "balance_loss_mlp": 1.02163911, + "epoch": 0.3681346760859763, + "flos": 30072136114080.0, + "grad_norm": 1.9542449317576138, + "language_loss": 0.69235015, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71407712, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.14111328, + "step": 6123, + "time_per_iteration": 2.7407617568969727 + }, + { + "auxiliary_loss_clip": 0.01144847, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.05386519, + "balance_loss_mlp": 1.02280879, + "epoch": 0.36819479933864424, + "flos": 29136684960000.0, + "grad_norm": 3.087218633546481, + "language_loss": 0.73867226, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.76051235, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 0.91015625, + "router_z_loss_mlp": 0.16357422, + "step": 6124, + "time_per_iteration": 2.6696484088897705 + }, + { + "auxiliary_loss_clip": 0.0113953, + "auxiliary_loss_mlp": 0.01043402, + "balance_loss_clip": 1.05177844, + "balance_loss_mlp": 1.02785134, + "epoch": 0.3682549225913122, + "flos": 31181664618720.0, + "grad_norm": 5.075996017825029, + "language_loss": 0.74593782, + "learning_rate": 2.915104825441114e-06, + "loss": 0.76776713, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 0.87744141, + "router_z_loss_mlp": 0.15582275, + "step": 6125, + "time_per_iteration": 2.663057327270508 + }, + { + "auxiliary_loss_clip": 0.0114409, + "auxiliary_loss_mlp": 0.01052573, + "balance_loss_clip": 1.05446517, + "balance_loss_mlp": 1.03571665, + "epoch": 0.36831504584398017, + "flos": 20677964506560.0, + "grad_norm": 1.9032809816847456, + "language_loss": 0.77885956, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.80082619, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 0.89648438, + "router_z_loss_mlp": 0.16845703, + "step": 6126, + "time_per_iteration": 2.7048728466033936 + }, + { + "auxiliary_loss_clip": 0.01141367, + "auxiliary_loss_mlp": 0.01044305, + "balance_loss_clip": 1.0510546, + "balance_loss_mlp": 1.02763987, + "epoch": 0.36837516909664814, + "flos": 24239330559360.0, + "grad_norm": 2.3350501130738266, + "language_loss": 0.64998263, + "learning_rate": 2.914412150914888e-06, + "loss": 0.67183936, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 0.90332031, + "router_z_loss_mlp": 0.16687012, + "step": 6127, + "time_per_iteration": 2.6326494216918945 + }, + { + "auxiliary_loss_clip": 0.01141872, + "auxiliary_loss_mlp": 0.0104265, + "balance_loss_clip": 1.05347204, + "balance_loss_mlp": 1.02686703, + "epoch": 0.3684352923493161, + "flos": 45914400819360.0, + "grad_norm": 1.8087442026655456, + "language_loss": 0.70312774, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72497296, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.15783691, + "step": 6128, + "time_per_iteration": 2.8515682220458984 + }, + { + "auxiliary_loss_clip": 0.01139559, + "auxiliary_loss_mlp": 0.0104272, + "balance_loss_clip": 1.05244958, + "balance_loss_mlp": 1.02777171, + "epoch": 0.36849541560198407, + "flos": 17650337412960.0, + "grad_norm": 1.92884965423423, + "language_loss": 0.75320351, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77502626, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.14953613, + "step": 6129, + "time_per_iteration": 2.6143016815185547 + }, + { + "auxiliary_loss_clip": 0.01134194, + "auxiliary_loss_mlp": 0.01039998, + "balance_loss_clip": 1.04848683, + "balance_loss_mlp": 1.02525187, + "epoch": 0.36855553885465203, + "flos": 31446366423840.0, + "grad_norm": 1.9466912041428994, + "language_loss": 0.84348357, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.86522549, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.1473999, + "step": 6130, + "time_per_iteration": 2.734884738922119 + }, + { + "auxiliary_loss_clip": 0.01049698, + "auxiliary_loss_mlp": 0.01006648, + "balance_loss_clip": 1.02248788, + "balance_loss_mlp": 1.00486255, + "epoch": 0.36861566210732, + "flos": 79372008708480.0, + "grad_norm": 0.8114897317003895, + "language_loss": 0.60232085, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62288433, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.27246094, + "router_z_loss_mlp": 0.01783752, + "step": 6131, + "time_per_iteration": 3.3887104988098145 + }, + { + "auxiliary_loss_clip": 0.01135267, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.04988706, + "balance_loss_mlp": 1.02041423, + "epoch": 0.36867578535998796, + "flos": 37771711214400.0, + "grad_norm": 1.6676063247106996, + "language_loss": 0.72777748, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.74947625, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.1418457, + "step": 6132, + "time_per_iteration": 2.8022103309631348 + }, + { + "auxiliary_loss_clip": 0.01142599, + "auxiliary_loss_mlp": 0.01039563, + "balance_loss_clip": 1.05192685, + "balance_loss_mlp": 1.02428091, + "epoch": 0.3687359086126559, + "flos": 35188981385760.0, + "grad_norm": 1.599661857953591, + "language_loss": 0.74215353, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76397514, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 0.90576172, + "router_z_loss_mlp": 0.15283203, + "step": 6133, + "time_per_iteration": 4.184885740280151 + }, + { + "auxiliary_loss_clip": 0.01134025, + "auxiliary_loss_mlp": 0.01044853, + "balance_loss_clip": 1.05095649, + "balance_loss_mlp": 1.02971315, + "epoch": 0.3687960318653239, + "flos": 26108531141760.0, + "grad_norm": 1.967739707083901, + "language_loss": 0.71802676, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73981547, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.15136719, + "step": 6134, + "time_per_iteration": 4.022676229476929 + }, + { + "auxiliary_loss_clip": 0.0113557, + "auxiliary_loss_mlp": 0.01037734, + "balance_loss_clip": 1.04914379, + "balance_loss_mlp": 1.02235579, + "epoch": 0.36885615511799186, + "flos": 24728668930080.0, + "grad_norm": 1.6987740018915438, + "language_loss": 0.75059414, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.77232718, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.15380859, + "step": 6135, + "time_per_iteration": 2.6470248699188232 + }, + { + "auxiliary_loss_clip": 0.01048462, + "auxiliary_loss_mlp": 0.0100291, + "balance_loss_clip": 1.02143168, + "balance_loss_mlp": 1.00122952, + "epoch": 0.3689162783706599, + "flos": 76982028346080.0, + "grad_norm": 0.8165280280984218, + "language_loss": 0.58769345, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60820723, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.27001953, + "router_z_loss_mlp": 0.0168457, + "step": 6136, + "time_per_iteration": 3.257528305053711 + }, + { + "auxiliary_loss_clip": 0.01133437, + "auxiliary_loss_mlp": 0.01038224, + "balance_loss_clip": 1.04912901, + "balance_loss_mlp": 1.02325702, + "epoch": 0.36897640162332784, + "flos": 13375158492960.0, + "grad_norm": 2.1016020198330825, + "language_loss": 0.79125667, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.81297326, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.1496582, + "step": 6137, + "time_per_iteration": 4.184508323669434 + }, + { + "auxiliary_loss_clip": 0.01135918, + "auxiliary_loss_mlp": 0.01042387, + "balance_loss_clip": 1.04957557, + "balance_loss_mlp": 1.02728307, + "epoch": 0.3690365248759958, + "flos": 25263420821280.0, + "grad_norm": 2.093848166453792, + "language_loss": 0.74380457, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.76558763, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.15100098, + "step": 6138, + "time_per_iteration": 2.706169605255127 + }, + { + "auxiliary_loss_clip": 0.01141588, + "auxiliary_loss_mlp": 0.01040894, + "balance_loss_clip": 1.0518856, + "balance_loss_mlp": 1.02649951, + "epoch": 0.3690966481286638, + "flos": 38835988267680.0, + "grad_norm": 2.4359931729289515, + "language_loss": 0.64711797, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.66894281, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 0.89746094, + "router_z_loss_mlp": 0.14398193, + "step": 6139, + "time_per_iteration": 2.7665421962738037 + }, + { + "auxiliary_loss_clip": 0.01132821, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.04882693, + "balance_loss_mlp": 1.03124809, + "epoch": 0.36915677138133174, + "flos": 16002329427360.0, + "grad_norm": 2.1933877994660977, + "language_loss": 0.7160356, + "learning_rate": 2.909906390418006e-06, + "loss": 0.73782843, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.15222168, + "step": 6140, + "time_per_iteration": 4.097067356109619 + }, + { + "auxiliary_loss_clip": 0.01047063, + "auxiliary_loss_mlp": 0.01001823, + "balance_loss_clip": 1.01986933, + "balance_loss_mlp": 1.00022221, + "epoch": 0.3692168946339997, + "flos": 83811743418240.0, + "grad_norm": 0.7488883635797807, + "language_loss": 0.5927822, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61327112, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.2722168, + "router_z_loss_mlp": 0.0160141, + "step": 6141, + "time_per_iteration": 3.3396060466766357 + }, + { + "auxiliary_loss_clip": 0.01134266, + "auxiliary_loss_mlp": 0.01036497, + "balance_loss_clip": 1.04739141, + "balance_loss_mlp": 1.02157176, + "epoch": 0.36927701788666767, + "flos": 26866339424640.0, + "grad_norm": 1.9467421027907348, + "language_loss": 0.74920088, + "learning_rate": 2.909212678216192e-06, + "loss": 0.77090847, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 0.86914062, + "router_z_loss_mlp": 0.14904785, + "step": 6142, + "time_per_iteration": 2.7522854804992676 + }, + { + "auxiliary_loss_clip": 0.01133534, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.04938936, + "balance_loss_mlp": 1.02237725, + "epoch": 0.36933714113933563, + "flos": 26643890792160.0, + "grad_norm": 1.687697027548512, + "language_loss": 0.77433181, + "learning_rate": 2.908865770392555e-06, + "loss": 0.79603028, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.13946533, + "step": 6143, + "time_per_iteration": 2.6387417316436768 + }, + { + "auxiliary_loss_clip": 0.01132381, + "auxiliary_loss_mlp": 0.01033927, + "balance_loss_clip": 1.04857898, + "balance_loss_mlp": 1.02069449, + "epoch": 0.3693972643920036, + "flos": 28909293219360.0, + "grad_norm": 1.9402612806408572, + "language_loss": 0.81828517, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.83994824, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.13220215, + "step": 6144, + "time_per_iteration": 2.7306113243103027 + }, + { + "auxiliary_loss_clip": 0.01134054, + "auxiliary_loss_mlp": 0.01039433, + "balance_loss_clip": 1.04739571, + "balance_loss_mlp": 1.02512753, + "epoch": 0.36945738764467156, + "flos": 27890024513760.0, + "grad_norm": 2.0238603974510845, + "language_loss": 0.77407992, + "learning_rate": 2.908171851365593e-06, + "loss": 0.79581475, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.14306641, + "step": 6145, + "time_per_iteration": 2.663754940032959 + }, + { + "auxiliary_loss_clip": 0.01136514, + "auxiliary_loss_mlp": 0.01033642, + "balance_loss_clip": 1.04981196, + "balance_loss_mlp": 1.01944423, + "epoch": 0.36951751089733953, + "flos": 20274672137760.0, + "grad_norm": 1.6974572747767758, + "language_loss": 0.77059817, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79229975, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.14196777, + "step": 6146, + "time_per_iteration": 2.8215441703796387 + }, + { + "auxiliary_loss_clip": 0.01136183, + "auxiliary_loss_mlp": 0.01045701, + "balance_loss_clip": 1.04894924, + "balance_loss_mlp": 1.03020334, + "epoch": 0.3695776341500075, + "flos": 23079607495200.0, + "grad_norm": 2.9354672740841763, + "language_loss": 0.80384827, + "learning_rate": 2.907477794586761e-06, + "loss": 0.82566714, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.15478516, + "step": 6147, + "time_per_iteration": 2.670353651046753 + }, + { + "auxiliary_loss_clip": 0.0113425, + "auxiliary_loss_mlp": 0.01039808, + "balance_loss_clip": 1.04676616, + "balance_loss_mlp": 1.02636755, + "epoch": 0.36963775740267546, + "flos": 25391031202080.0, + "grad_norm": 1.9510363886481614, + "language_loss": 0.83573186, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.85747242, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.13433838, + "step": 6148, + "time_per_iteration": 2.6637866497039795 + }, + { + "auxiliary_loss_clip": 0.01132581, + "auxiliary_loss_mlp": 0.01041764, + "balance_loss_clip": 1.04862618, + "balance_loss_mlp": 1.02704811, + "epoch": 0.3696978806553435, + "flos": 31801692683520.0, + "grad_norm": 2.3236596790279394, + "language_loss": 0.74145502, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76319849, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.14727783, + "step": 6149, + "time_per_iteration": 2.737948417663574 + }, + { + "auxiliary_loss_clip": 0.01137224, + "auxiliary_loss_mlp": 0.0104185, + "balance_loss_clip": 1.05014372, + "balance_loss_mlp": 1.02623391, + "epoch": 0.36975800390801145, + "flos": 32743261946880.0, + "grad_norm": 2.093360810695572, + "language_loss": 0.7152856, + "learning_rate": 2.906436451364054e-06, + "loss": 0.73707634, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.15625, + "step": 6150, + "time_per_iteration": 2.681264877319336 + }, + { + "auxiliary_loss_clip": 0.01134094, + "auxiliary_loss_mlp": 0.01041171, + "balance_loss_clip": 1.04927897, + "balance_loss_mlp": 1.02706265, + "epoch": 0.3698181271606794, + "flos": 25798253747040.0, + "grad_norm": 1.5672157174954107, + "language_loss": 0.81481183, + "learning_rate": 2.906089268194611e-06, + "loss": 0.83656448, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.14117432, + "step": 6151, + "time_per_iteration": 2.7135300636291504 + }, + { + "auxiliary_loss_clip": 0.01048537, + "auxiliary_loss_mlp": 0.01004205, + "balance_loss_clip": 1.02190781, + "balance_loss_mlp": 1.00270414, + "epoch": 0.3698782504133474, + "flos": 81440198418240.0, + "grad_norm": 0.8015092742519837, + "language_loss": 0.63134873, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65187609, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.26635742, + "router_z_loss_mlp": 0.01500702, + "step": 6152, + "time_per_iteration": 3.342862606048584 + }, + { + "auxiliary_loss_clip": 0.01132239, + "auxiliary_loss_mlp": 0.01035522, + "balance_loss_clip": 1.04976308, + "balance_loss_mlp": 1.02175307, + "epoch": 0.36993837366601534, + "flos": 29665683397440.0, + "grad_norm": 4.811804740942238, + "language_loss": 0.69977117, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.72144878, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13763428, + "step": 6153, + "time_per_iteration": 2.7177109718322754 + }, + { + "auxiliary_loss_clip": 0.0113796, + "auxiliary_loss_mlp": 0.01038629, + "balance_loss_clip": 1.0503372, + "balance_loss_mlp": 1.02400219, + "epoch": 0.3699984969186833, + "flos": 29711542608000.0, + "grad_norm": 1.8640103196445368, + "language_loss": 0.72581768, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74758357, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.1461792, + "step": 6154, + "time_per_iteration": 2.6701817512512207 + }, + { + "auxiliary_loss_clip": 0.01134854, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.0501163, + "balance_loss_mlp": 1.01863706, + "epoch": 0.37005862017135127, + "flos": 24195456695520.0, + "grad_norm": 1.7334948117931097, + "language_loss": 0.68226457, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70393896, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.1395874, + "step": 6155, + "time_per_iteration": 2.6747705936431885 + }, + { + "auxiliary_loss_clip": 0.01132971, + "auxiliary_loss_mlp": 0.01031433, + "balance_loss_clip": 1.04886019, + "balance_loss_mlp": 1.01759279, + "epoch": 0.37011874342401924, + "flos": 23883842230560.0, + "grad_norm": 1.8016388949816118, + "language_loss": 0.67603403, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.69767809, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13842773, + "step": 6156, + "time_per_iteration": 2.614027738571167 + }, + { + "auxiliary_loss_clip": 0.01131099, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.04785156, + "balance_loss_mlp": 1.02120733, + "epoch": 0.3701788666766872, + "flos": 24860938798080.0, + "grad_norm": 1.9354900526946281, + "language_loss": 0.82170796, + "learning_rate": 2.904005448099916e-06, + "loss": 0.84336007, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.12915039, + "step": 6157, + "time_per_iteration": 2.7558696269989014 + }, + { + "auxiliary_loss_clip": 0.01138523, + "auxiliary_loss_mlp": 0.01039598, + "balance_loss_clip": 1.05020022, + "balance_loss_mlp": 1.02431512, + "epoch": 0.37023898992935517, + "flos": 18718625676960.0, + "grad_norm": 2.422043137681169, + "language_loss": 0.76672268, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.78850383, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 0.88330078, + "router_z_loss_mlp": 0.1529541, + "step": 6158, + "time_per_iteration": 2.6452789306640625 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.04835892, + "balance_loss_mlp": 1.0196147, + "epoch": 0.37029911318202313, + "flos": 23884004299680.0, + "grad_norm": 2.106037133135756, + "language_loss": 0.68827778, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.70997864, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.14648438, + "step": 6159, + "time_per_iteration": 2.6807591915130615 + }, + { + "auxiliary_loss_clip": 0.01131544, + "auxiliary_loss_mlp": 0.01037461, + "balance_loss_clip": 1.04761815, + "balance_loss_mlp": 1.02428246, + "epoch": 0.3703592364346911, + "flos": 31986020905920.0, + "grad_norm": 1.7299636489766825, + "language_loss": 0.71021283, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.7319029, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13171387, + "step": 6160, + "time_per_iteration": 2.746274709701538 + }, + { + "auxiliary_loss_clip": 0.01127559, + "auxiliary_loss_mlp": 0.01032929, + "balance_loss_clip": 1.0461905, + "balance_loss_mlp": 1.02031052, + "epoch": 0.37041935968735906, + "flos": 24462873158400.0, + "grad_norm": 2.6987762238366613, + "language_loss": 0.79105794, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.81266284, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.1262207, + "step": 6161, + "time_per_iteration": 2.6165382862091064 + }, + { + "auxiliary_loss_clip": 0.01133665, + "auxiliary_loss_mlp": 0.01035151, + "balance_loss_clip": 1.0488286, + "balance_loss_mlp": 1.02050042, + "epoch": 0.3704794829400271, + "flos": 29448380459520.0, + "grad_norm": 1.8202062906208478, + "language_loss": 0.7939375, + "learning_rate": 2.902267988534295e-06, + "loss": 0.81562567, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.14642334, + "step": 6162, + "time_per_iteration": 2.6809887886047363 + }, + { + "auxiliary_loss_clip": 0.01132546, + "auxiliary_loss_mlp": 0.01037213, + "balance_loss_clip": 1.0483191, + "balance_loss_mlp": 1.02360559, + "epoch": 0.37053960619269505, + "flos": 18140121473760.0, + "grad_norm": 1.8672112588928935, + "language_loss": 0.79459447, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81629205, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.13586426, + "step": 6163, + "time_per_iteration": 2.659628391265869 + }, + { + "auxiliary_loss_clip": 0.01133963, + "auxiliary_loss_mlp": 0.01034655, + "balance_loss_clip": 1.04887152, + "balance_loss_mlp": 1.01992667, + "epoch": 0.370599729445363, + "flos": 26065346071680.0, + "grad_norm": 1.6156271683032855, + "language_loss": 0.68391442, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.70560062, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.14715576, + "step": 6164, + "time_per_iteration": 2.709503173828125 + }, + { + "auxiliary_loss_clip": 0.01136882, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.05125463, + "balance_loss_mlp": 1.02267873, + "epoch": 0.370659852698031, + "flos": 32737751596800.0, + "grad_norm": 4.143685369359268, + "language_loss": 0.83074057, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.85248232, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.14624023, + "step": 6165, + "time_per_iteration": 2.6807305812835693 + }, + { + "auxiliary_loss_clip": 0.01138833, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.05109501, + "balance_loss_mlp": 1.01767814, + "epoch": 0.37071997595069894, + "flos": 23304527681760.0, + "grad_norm": 1.6952712469282358, + "language_loss": 0.6932742, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.7149967, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 0.87744141, + "router_z_loss_mlp": 0.15734863, + "step": 6166, + "time_per_iteration": 2.7453320026397705 + }, + { + "auxiliary_loss_clip": 0.01047606, + "auxiliary_loss_mlp": 0.01002344, + "balance_loss_clip": 1.02087927, + "balance_loss_mlp": 1.00085413, + "epoch": 0.3707800992033669, + "flos": 63668368733760.0, + "grad_norm": 0.7938907938553094, + "language_loss": 0.56927645, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.58977592, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.01489258, + "step": 6167, + "time_per_iteration": 3.0995192527770996 + }, + { + "auxiliary_loss_clip": 0.01129503, + "auxiliary_loss_mlp": 0.01038354, + "balance_loss_clip": 1.04797637, + "balance_loss_mlp": 1.02503288, + "epoch": 0.3708402224560349, + "flos": 24284176837920.0, + "grad_norm": 2.4501087087244118, + "language_loss": 0.75141096, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77308953, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13348389, + "step": 6168, + "time_per_iteration": 2.6917083263397217 + }, + { + "auxiliary_loss_clip": 0.01131157, + "auxiliary_loss_mlp": 0.01034186, + "balance_loss_clip": 1.04726863, + "balance_loss_mlp": 1.02093554, + "epoch": 0.37090034570870284, + "flos": 24413610496320.0, + "grad_norm": 1.7268365153033873, + "language_loss": 0.73570716, + "learning_rate": 2.899834108519755e-06, + "loss": 0.75736058, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.13256836, + "step": 6169, + "time_per_iteration": 2.6874802112579346 + }, + { + "auxiliary_loss_clip": 0.01132154, + "auxiliary_loss_mlp": 0.01031117, + "balance_loss_clip": 1.05077899, + "balance_loss_mlp": 1.0177834, + "epoch": 0.3709604689613708, + "flos": 29449393391520.0, + "grad_norm": 1.5246173795135818, + "language_loss": 0.79606384, + "learning_rate": 2.899486274782127e-06, + "loss": 0.81769657, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13330078, + "step": 6170, + "time_per_iteration": 2.6940808296203613 + }, + { + "auxiliary_loss_clip": 0.01134968, + "auxiliary_loss_mlp": 0.01041483, + "balance_loss_clip": 1.04953575, + "balance_loss_mlp": 1.02659345, + "epoch": 0.37102059221403877, + "flos": 29134091854080.0, + "grad_norm": 1.640199564987419, + "language_loss": 0.76227111, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78403556, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.14880371, + "step": 6171, + "time_per_iteration": 2.734710693359375 + }, + { + "auxiliary_loss_clip": 0.01134902, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.05201137, + "balance_loss_mlp": 1.01715255, + "epoch": 0.37108071546670673, + "flos": 17694049207680.0, + "grad_norm": 2.3040133401577285, + "language_loss": 0.8046717, + "learning_rate": 2.898790504994232e-06, + "loss": 0.82633209, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.13983154, + "step": 6172, + "time_per_iteration": 4.0333592891693115 + }, + { + "auxiliary_loss_clip": 0.0113559, + "auxiliary_loss_mlp": 0.01035833, + "balance_loss_clip": 1.04972136, + "balance_loss_mlp": 1.02125335, + "epoch": 0.3711408387193747, + "flos": 42174986722560.0, + "grad_norm": 1.9180699793561578, + "language_loss": 0.59375048, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61546481, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.14575195, + "step": 6173, + "time_per_iteration": 4.188132286071777 + }, + { + "auxiliary_loss_clip": 0.01131735, + "auxiliary_loss_mlp": 0.01033587, + "balance_loss_clip": 1.04758477, + "balance_loss_mlp": 1.01993799, + "epoch": 0.37120096197204266, + "flos": 21122335046880.0, + "grad_norm": 1.8131446986500892, + "language_loss": 0.80950606, + "learning_rate": 2.898094598877435e-06, + "loss": 0.83115923, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.13635254, + "step": 6174, + "time_per_iteration": 2.664682626724243 + }, + { + "auxiliary_loss_clip": 0.01128075, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.0476985, + "balance_loss_mlp": 1.02248096, + "epoch": 0.37126108522471063, + "flos": 37417154783040.0, + "grad_norm": 1.9524525550119032, + "language_loss": 0.79806089, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.81969589, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.12939453, + "step": 6175, + "time_per_iteration": 2.738487720489502 + }, + { + "auxiliary_loss_clip": 0.0113497, + "auxiliary_loss_mlp": 0.01041624, + "balance_loss_clip": 1.05201662, + "balance_loss_mlp": 1.02823722, + "epoch": 0.37132120847737865, + "flos": 30695284009440.0, + "grad_norm": 1.8356844116222055, + "language_loss": 0.88788176, + "learning_rate": 2.89739855653729e-06, + "loss": 0.9096477, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13391113, + "step": 6176, + "time_per_iteration": 4.233124256134033 + }, + { + "auxiliary_loss_clip": 0.01132775, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.04922998, + "balance_loss_mlp": 1.02310193, + "epoch": 0.3713813317300466, + "flos": 25884907508160.0, + "grad_norm": 1.6118204061598969, + "language_loss": 0.73441195, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.7561031, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.13244629, + "step": 6177, + "time_per_iteration": 2.6852238178253174 + }, + { + "auxiliary_loss_clip": 0.01131583, + "auxiliary_loss_mlp": 0.01042886, + "balance_loss_clip": 1.04922009, + "balance_loss_mlp": 1.02927768, + "epoch": 0.3714414549827146, + "flos": 26376717432960.0, + "grad_norm": 1.91334091030957, + "language_loss": 0.75571203, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77745676, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.1361084, + "step": 6178, + "time_per_iteration": 2.6607604026794434 + }, + { + "auxiliary_loss_clip": 0.01136196, + "auxiliary_loss_mlp": 0.01039113, + "balance_loss_clip": 1.05366611, + "balance_loss_mlp": 1.02560687, + "epoch": 0.37150157823538255, + "flos": 24370384908960.0, + "grad_norm": 1.73995983557473, + "language_loss": 0.71594906, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.73770219, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13513184, + "step": 6179, + "time_per_iteration": 2.6885995864868164 + }, + { + "auxiliary_loss_clip": 0.01136071, + "auxiliary_loss_mlp": 0.01039189, + "balance_loss_clip": 1.05133796, + "balance_loss_mlp": 1.02464569, + "epoch": 0.3715617014880505, + "flos": 30335500848960.0, + "grad_norm": 1.9469653706419072, + "language_loss": 0.6983366, + "learning_rate": 2.896006063609283e-06, + "loss": 0.7200892, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.14550781, + "step": 6180, + "time_per_iteration": 4.17007040977478 + }, + { + "auxiliary_loss_clip": 0.01129672, + "auxiliary_loss_mlp": 0.01037008, + "balance_loss_clip": 1.04773903, + "balance_loss_mlp": 1.02349019, + "epoch": 0.3716218247407185, + "flos": 24951968425440.0, + "grad_norm": 1.893674284879873, + "language_loss": 0.7769618, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.79862869, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.13519287, + "step": 6181, + "time_per_iteration": 2.6217939853668213 + }, + { + "auxiliary_loss_clip": 0.01130786, + "auxiliary_loss_mlp": 0.01037381, + "balance_loss_clip": 1.04887807, + "balance_loss_mlp": 1.02336812, + "epoch": 0.37168194799338644, + "flos": 29448015804000.0, + "grad_norm": 2.804666900930182, + "language_loss": 0.78675491, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.80843651, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.14025879, + "step": 6182, + "time_per_iteration": 2.725618600845337 + }, + { + "auxiliary_loss_clip": 0.01052823, + "auxiliary_loss_mlp": 0.01005293, + "balance_loss_clip": 1.02579916, + "balance_loss_mlp": 1.00373435, + "epoch": 0.3717420712460544, + "flos": 82252374540480.0, + "grad_norm": 0.7825928503278606, + "language_loss": 0.57460749, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59518862, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.27026367, + "router_z_loss_mlp": 0.01557922, + "step": 6183, + "time_per_iteration": 3.271787643432617 + }, + { + "auxiliary_loss_clip": 0.01138549, + "auxiliary_loss_mlp": 0.01040334, + "balance_loss_clip": 1.05134678, + "balance_loss_mlp": 1.02607632, + "epoch": 0.37180219449872237, + "flos": 27304997028480.0, + "grad_norm": 1.7714993793344602, + "language_loss": 0.76724231, + "learning_rate": 2.894613027055066e-06, + "loss": 0.78903109, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.14263916, + "step": 6184, + "time_per_iteration": 2.692960262298584 + }, + { + "auxiliary_loss_clip": 0.01131654, + "auxiliary_loss_mlp": 0.0103794, + "balance_loss_clip": 1.05042315, + "balance_loss_mlp": 1.02451754, + "epoch": 0.37186231775139034, + "flos": 26685941378400.0, + "grad_norm": 1.8333562277937276, + "language_loss": 0.71998703, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74168289, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.13439941, + "step": 6185, + "time_per_iteration": 2.6558752059936523 + }, + { + "auxiliary_loss_clip": 0.01131332, + "auxiliary_loss_mlp": 0.01030369, + "balance_loss_clip": 1.04995465, + "balance_loss_mlp": 1.01648664, + "epoch": 0.3719224410040583, + "flos": 27351585550080.0, + "grad_norm": 1.5567457422613054, + "language_loss": 0.76875889, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.79037595, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.13879395, + "step": 6186, + "time_per_iteration": 2.732797384262085 + }, + { + "auxiliary_loss_clip": 0.01139978, + "auxiliary_loss_mlp": 0.010401, + "balance_loss_clip": 1.05186629, + "balance_loss_mlp": 1.02542543, + "epoch": 0.37198256425672627, + "flos": 30690300384000.0, + "grad_norm": 2.0756122134625326, + "language_loss": 0.83690619, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.85870695, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 0.88085938, + "router_z_loss_mlp": 0.14709473, + "step": 6187, + "time_per_iteration": 2.7240257263183594 + }, + { + "auxiliary_loss_clip": 0.01132487, + "auxiliary_loss_mlp": 0.01037299, + "balance_loss_clip": 1.05033064, + "balance_loss_mlp": 1.0235182, + "epoch": 0.37204268750939423, + "flos": 25793391673440.0, + "grad_norm": 2.012259269239897, + "language_loss": 0.84988797, + "learning_rate": 2.893219447719824e-06, + "loss": 0.87158585, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13769531, + "step": 6188, + "time_per_iteration": 2.7120373249053955 + }, + { + "auxiliary_loss_clip": 0.01135519, + "auxiliary_loss_mlp": 0.01037678, + "balance_loss_clip": 1.05236149, + "balance_loss_mlp": 1.02345085, + "epoch": 0.37210281076206225, + "flos": 26242745839200.0, + "grad_norm": 2.671587184141849, + "language_loss": 0.64835429, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67008632, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.14215088, + "step": 6189, + "time_per_iteration": 2.725133180618286 + }, + { + "auxiliary_loss_clip": 0.0113697, + "auxiliary_loss_mlp": 0.01034364, + "balance_loss_clip": 1.05323493, + "balance_loss_mlp": 1.02051175, + "epoch": 0.3721629340147302, + "flos": 21167586498240.0, + "grad_norm": 1.873813903368266, + "language_loss": 0.84198618, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.86369956, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13867188, + "step": 6190, + "time_per_iteration": 2.6089766025543213 + }, + { + "auxiliary_loss_clip": 0.0113796, + "auxiliary_loss_mlp": 0.01036204, + "balance_loss_clip": 1.05056965, + "balance_loss_mlp": 1.02253127, + "epoch": 0.3722230572673982, + "flos": 20050805400480.0, + "grad_norm": 3.2492140794376607, + "language_loss": 0.87729424, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.89903593, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.13684082, + "step": 6191, + "time_per_iteration": 2.6799025535583496 + }, + { + "auxiliary_loss_clip": 0.01135665, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.04939461, + "balance_loss_mlp": 1.01900756, + "epoch": 0.37228318052006615, + "flos": 27667859502240.0, + "grad_norm": 1.90937794847832, + "language_loss": 0.73905456, + "learning_rate": 2.891825326449073e-06, + "loss": 0.76075542, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.15411377, + "step": 6192, + "time_per_iteration": 2.739954710006714 + }, + { + "auxiliary_loss_clip": 0.01133727, + "auxiliary_loss_mlp": 0.01034116, + "balance_loss_clip": 1.05031657, + "balance_loss_mlp": 1.0209316, + "epoch": 0.3723433037727341, + "flos": 30829296120480.0, + "grad_norm": 5.372988223428152, + "language_loss": 0.80024946, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.82192791, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.13183594, + "step": 6193, + "time_per_iteration": 2.7735238075256348 + }, + { + "auxiliary_loss_clip": 0.01134719, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.04941642, + "balance_loss_mlp": 1.01959252, + "epoch": 0.3724034270254021, + "flos": 12841460051040.0, + "grad_norm": 2.1017243287251293, + "language_loss": 0.84078121, + "learning_rate": 2.891128062852194e-06, + "loss": 0.8624568, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.13244629, + "step": 6194, + "time_per_iteration": 2.639530658721924 + }, + { + "auxiliary_loss_clip": 0.01133333, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.04938126, + "balance_loss_mlp": 1.01892686, + "epoch": 0.37246355027807004, + "flos": 24728871516480.0, + "grad_norm": 2.4432496675713855, + "language_loss": 0.77438647, + "learning_rate": 2.890779380359646e-06, + "loss": 0.7960422, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.13311768, + "step": 6195, + "time_per_iteration": 2.670656442642212 + }, + { + "auxiliary_loss_clip": 0.01134313, + "auxiliary_loss_mlp": 0.01031744, + "balance_loss_clip": 1.05217028, + "balance_loss_mlp": 1.01821935, + "epoch": 0.372523673530738, + "flos": 23794514328960.0, + "grad_norm": 1.6319214422802513, + "language_loss": 0.79300654, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81466711, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13525391, + "step": 6196, + "time_per_iteration": 2.6476781368255615 + }, + { + "auxiliary_loss_clip": 0.01135167, + "auxiliary_loss_mlp": 0.01037007, + "balance_loss_clip": 1.05189908, + "balance_loss_mlp": 1.02396536, + "epoch": 0.372583796783406, + "flos": 20455434839520.0, + "grad_norm": 2.2624756630317444, + "language_loss": 0.83171797, + "learning_rate": 2.890081914052443e-06, + "loss": 0.85343975, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13061523, + "step": 6197, + "time_per_iteration": 2.708768129348755 + }, + { + "auxiliary_loss_clip": 0.01128862, + "auxiliary_loss_mlp": 0.01034382, + "balance_loss_clip": 1.04795408, + "balance_loss_mlp": 1.02003562, + "epoch": 0.37264392003607394, + "flos": 27623053740960.0, + "grad_norm": 1.5776028353426534, + "language_loss": 0.64356053, + "learning_rate": 2.889733130264237e-06, + "loss": 0.66519296, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.14337158, + "step": 6198, + "time_per_iteration": 2.6707518100738525 + }, + { + "auxiliary_loss_clip": 0.01129895, + "auxiliary_loss_mlp": 0.01043745, + "balance_loss_clip": 1.04868472, + "balance_loss_mlp": 1.03053093, + "epoch": 0.3727040432887419, + "flos": 24371316806400.0, + "grad_norm": 1.4605825692421375, + "language_loss": 0.73736775, + "learning_rate": 2.889384312737261e-06, + "loss": 0.75910413, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13220215, + "step": 6199, + "time_per_iteration": 2.722142457962036 + }, + { + "auxiliary_loss_clip": 0.01131138, + "auxiliary_loss_mlp": 0.01035452, + "balance_loss_clip": 1.04904401, + "balance_loss_mlp": 1.02212429, + "epoch": 0.37276416654140987, + "flos": 77973461680320.0, + "grad_norm": 1.8684707838602235, + "language_loss": 0.80673611, + "learning_rate": 2.889035461484742e-06, + "loss": 0.82840204, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.13330078, + "step": 6200, + "time_per_iteration": 3.103410482406616 + }, + { + "auxiliary_loss_clip": 0.01131114, + "auxiliary_loss_mlp": 0.01039813, + "balance_loss_clip": 1.04958403, + "balance_loss_mlp": 1.02648592, + "epoch": 0.37282428979407783, + "flos": 48548378656800.0, + "grad_norm": 1.9891479839840862, + "language_loss": 0.60902375, + "learning_rate": 2.88868657651991e-06, + "loss": 0.63073301, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13317871, + "step": 6201, + "time_per_iteration": 2.8031327724456787 + }, + { + "auxiliary_loss_clip": 0.01135969, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.05155814, + "balance_loss_mlp": 1.02237749, + "epoch": 0.37288441304674586, + "flos": 27709950605760.0, + "grad_norm": 2.7468812872292054, + "language_loss": 0.72861379, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75033134, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.13427734, + "step": 6202, + "time_per_iteration": 2.721731662750244 + }, + { + "auxiliary_loss_clip": 0.0113211, + "auxiliary_loss_mlp": 0.01035006, + "balance_loss_clip": 1.05061638, + "balance_loss_mlp": 1.02154136, + "epoch": 0.3729445362994138, + "flos": 22903423246080.0, + "grad_norm": 1.9234396091162034, + "language_loss": 0.735726, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.75739717, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13470459, + "step": 6203, + "time_per_iteration": 2.6302576065063477 + }, + { + "auxiliary_loss_clip": 0.01126809, + "auxiliary_loss_mlp": 0.01034891, + "balance_loss_clip": 1.04669929, + "balance_loss_mlp": 1.02324462, + "epoch": 0.3730046595520818, + "flos": 27401172350400.0, + "grad_norm": 1.9300962549215503, + "language_loss": 0.81805748, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.83967447, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.11651611, + "step": 6204, + "time_per_iteration": 2.7056941986083984 + }, + { + "auxiliary_loss_clip": 0.01135588, + "auxiliary_loss_mlp": 0.01037942, + "balance_loss_clip": 1.05078483, + "balance_loss_mlp": 1.02429271, + "epoch": 0.37306478280474975, + "flos": 29671153230240.0, + "grad_norm": 2.010426957234195, + "language_loss": 0.75395274, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77568805, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.13659668, + "step": 6205, + "time_per_iteration": 2.6677441596984863 + }, + { + "auxiliary_loss_clip": 0.01130133, + "auxiliary_loss_mlp": 0.01036207, + "balance_loss_clip": 1.04799092, + "balance_loss_mlp": 1.02215219, + "epoch": 0.3731249060574177, + "flos": 19297332466560.0, + "grad_norm": 1.8202598053888805, + "language_loss": 0.77682507, + "learning_rate": 2.886941646474128e-06, + "loss": 0.7984885, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.14050293, + "step": 6206, + "time_per_iteration": 2.660588026046753 + }, + { + "auxiliary_loss_clip": 0.01130955, + "auxiliary_loss_mlp": 0.01032298, + "balance_loss_clip": 1.04777122, + "balance_loss_mlp": 1.0180881, + "epoch": 0.3731850293100857, + "flos": 24194241177120.0, + "grad_norm": 4.105422889629379, + "language_loss": 0.93378067, + "learning_rate": 2.886592559513283e-06, + "loss": 0.9554131, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.14221191, + "step": 6207, + "time_per_iteration": 2.6186344623565674 + }, + { + "auxiliary_loss_clip": 0.01132871, + "auxiliary_loss_mlp": 0.01027095, + "balance_loss_clip": 1.04797423, + "balance_loss_mlp": 1.01391029, + "epoch": 0.37324515256275365, + "flos": 23259802955040.0, + "grad_norm": 2.0571845077030053, + "language_loss": 0.82646286, + "learning_rate": 2.886243438932759e-06, + "loss": 0.84806252, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.13165283, + "step": 6208, + "time_per_iteration": 2.661353349685669 + }, + { + "auxiliary_loss_clip": 0.01133227, + "auxiliary_loss_mlp": 0.0103219, + "balance_loss_clip": 1.04930735, + "balance_loss_mlp": 1.0175159, + "epoch": 0.3733052758154216, + "flos": 25263461338560.0, + "grad_norm": 2.021952403146259, + "language_loss": 0.72819996, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.74985421, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.14672852, + "step": 6209, + "time_per_iteration": 2.635411024093628 + }, + { + "auxiliary_loss_clip": 0.01131681, + "auxiliary_loss_mlp": 0.01035137, + "balance_loss_clip": 1.04926443, + "balance_loss_mlp": 1.01986659, + "epoch": 0.3733653990680896, + "flos": 24640110856800.0, + "grad_norm": 1.9027900295983662, + "language_loss": 0.7034868, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.72515494, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.15264893, + "step": 6210, + "time_per_iteration": 2.721388816833496 + }, + { + "auxiliary_loss_clip": 0.0113201, + "auxiliary_loss_mlp": 0.01030258, + "balance_loss_clip": 1.04715061, + "balance_loss_mlp": 1.01554108, + "epoch": 0.37342552232075754, + "flos": 24818442521760.0, + "grad_norm": 2.122595355145096, + "language_loss": 0.77528226, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.79690492, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.14727783, + "step": 6211, + "time_per_iteration": 4.158293008804321 + }, + { + "auxiliary_loss_clip": 0.0113334, + "auxiliary_loss_mlp": 0.01038123, + "balance_loss_clip": 1.04882121, + "balance_loss_mlp": 1.02475381, + "epoch": 0.3734856455734255, + "flos": 43340058067680.0, + "grad_norm": 1.6531346872000923, + "language_loss": 0.73073417, + "learning_rate": 2.884846620678668e-06, + "loss": 0.7524488, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.13366699, + "step": 6212, + "time_per_iteration": 4.342316150665283 + }, + { + "auxiliary_loss_clip": 0.01139079, + "auxiliary_loss_mlp": 0.0104193, + "balance_loss_clip": 1.04909539, + "balance_loss_mlp": 1.02785146, + "epoch": 0.37354576882609347, + "flos": 25797889091520.0, + "grad_norm": 2.1391327688041133, + "language_loss": 0.81854033, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84035045, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 0.89990234, + "router_z_loss_mlp": 0.140625, + "step": 6213, + "time_per_iteration": 2.6545228958129883 + }, + { + "auxiliary_loss_clip": 0.01133391, + "auxiliary_loss_mlp": 0.01038244, + "balance_loss_clip": 1.04919434, + "balance_loss_mlp": 1.02429652, + "epoch": 0.37360589207876144, + "flos": 26242705321920.0, + "grad_norm": 2.7674010735362686, + "language_loss": 0.78465599, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.80637234, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.13952637, + "step": 6214, + "time_per_iteration": 2.742037057876587 + }, + { + "auxiliary_loss_clip": 0.01127592, + "auxiliary_loss_mlp": 0.01039712, + "balance_loss_clip": 1.04630196, + "balance_loss_mlp": 1.02606225, + "epoch": 0.37366601533142946, + "flos": 46900816361280.0, + "grad_norm": 1.7976339569510558, + "language_loss": 0.85098541, + "learning_rate": 2.883798654630296e-06, + "loss": 0.87265849, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13653564, + "step": 6215, + "time_per_iteration": 2.7969086170196533 + }, + { + "auxiliary_loss_clip": 0.0113366, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.04788923, + "balance_loss_mlp": 1.0218972, + "epoch": 0.3737261385840974, + "flos": 22501873120320.0, + "grad_norm": 2.677104692731294, + "language_loss": 0.67973644, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.70143557, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 0.85742188, + "router_z_loss_mlp": 0.14349365, + "step": 6216, + "time_per_iteration": 4.087790012359619 + }, + { + "auxiliary_loss_clip": 0.01132199, + "auxiliary_loss_mlp": 0.01036336, + "balance_loss_clip": 1.04820812, + "balance_loss_mlp": 1.02194762, + "epoch": 0.3737862618367654, + "flos": 27979230863520.0, + "grad_norm": 2.2707099726958293, + "language_loss": 0.66040969, + "learning_rate": 2.883099843007303e-06, + "loss": 0.68209499, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.1439209, + "step": 6217, + "time_per_iteration": 2.706977605819702 + }, + { + "auxiliary_loss_clip": 0.01131442, + "auxiliary_loss_mlp": 0.01037216, + "balance_loss_clip": 1.04681551, + "balance_loss_mlp": 1.02325654, + "epoch": 0.37384638508943335, + "flos": 18802888918560.0, + "grad_norm": 1.7539419591542338, + "language_loss": 0.80493569, + "learning_rate": 2.88275038695833e-06, + "loss": 0.82662225, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.1395874, + "step": 6218, + "time_per_iteration": 2.618042230606079 + }, + { + "auxiliary_loss_clip": 0.01125273, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.04576373, + "balance_loss_mlp": 1.0191344, + "epoch": 0.3739065083421013, + "flos": 29627684539200.0, + "grad_norm": 1.3682171493835438, + "language_loss": 0.78759223, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.80916643, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13024902, + "step": 6219, + "time_per_iteration": 4.168856143951416 + }, + { + "auxiliary_loss_clip": 0.01128403, + "auxiliary_loss_mlp": 0.01039894, + "balance_loss_clip": 1.04821897, + "balance_loss_mlp": 1.02589941, + "epoch": 0.3739666315947693, + "flos": 28068275144160.0, + "grad_norm": 1.819599375271431, + "language_loss": 0.76823902, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.789922, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13995361, + "step": 6220, + "time_per_iteration": 2.7391209602355957 + }, + { + "auxiliary_loss_clip": 0.01128811, + "auxiliary_loss_mlp": 0.01036114, + "balance_loss_clip": 1.04541564, + "balance_loss_mlp": 1.02232707, + "epoch": 0.37402675484743725, + "flos": 23660947908000.0, + "grad_norm": 1.8873057393616577, + "language_loss": 0.82884133, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85049051, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13793945, + "step": 6221, + "time_per_iteration": 2.7431135177612305 + }, + { + "auxiliary_loss_clip": 0.01129857, + "auxiliary_loss_mlp": 0.01040047, + "balance_loss_clip": 1.04737997, + "balance_loss_mlp": 1.02680898, + "epoch": 0.3740868781001052, + "flos": 20897333825760.0, + "grad_norm": 2.0835352304155386, + "language_loss": 0.76221192, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.78391093, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13250732, + "step": 6222, + "time_per_iteration": 2.7682933807373047 + }, + { + "auxiliary_loss_clip": 0.01131386, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.04889286, + "balance_loss_mlp": 1.01734185, + "epoch": 0.3741470013527732, + "flos": 24457443842880.0, + "grad_norm": 1.696432234564071, + "language_loss": 0.70439267, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72601718, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.13745117, + "step": 6223, + "time_per_iteration": 2.6650915145874023 + }, + { + "auxiliary_loss_clip": 0.01133405, + "auxiliary_loss_mlp": 0.0103625, + "balance_loss_clip": 1.05140567, + "balance_loss_mlp": 1.02338183, + "epoch": 0.37420712460544114, + "flos": 45023958012960.0, + "grad_norm": 1.8794579557226343, + "language_loss": 0.68977594, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.71147251, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.12872314, + "step": 6224, + "time_per_iteration": 2.823540210723877 + }, + { + "auxiliary_loss_clip": 0.01129663, + "auxiliary_loss_mlp": 0.01034422, + "balance_loss_clip": 1.05007756, + "balance_loss_mlp": 1.02139282, + "epoch": 0.3742672478581091, + "flos": 27089963058240.0, + "grad_norm": 1.65348008046908, + "language_loss": 0.70018721, + "learning_rate": 2.880303258086228e-06, + "loss": 0.72182804, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13037109, + "step": 6225, + "time_per_iteration": 2.6720802783966064 + }, + { + "auxiliary_loss_clip": 0.01127649, + "auxiliary_loss_mlp": 0.01039399, + "balance_loss_clip": 1.04720926, + "balance_loss_mlp": 1.02521908, + "epoch": 0.3743273711107771, + "flos": 30116739288960.0, + "grad_norm": 2.0053348936926416, + "language_loss": 0.79208386, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81375438, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.14178467, + "step": 6226, + "time_per_iteration": 2.7305948734283447 + }, + { + "auxiliary_loss_clip": 0.01131394, + "auxiliary_loss_mlp": 0.01035737, + "balance_loss_clip": 1.0485667, + "balance_loss_mlp": 1.02150941, + "epoch": 0.37438749436344504, + "flos": 29846202995520.0, + "grad_norm": 1.8906896242581874, + "language_loss": 0.68267262, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70434397, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.14233398, + "step": 6227, + "time_per_iteration": 2.7035880088806152 + }, + { + "auxiliary_loss_clip": 0.01127743, + "auxiliary_loss_mlp": 0.01030376, + "balance_loss_clip": 1.04720378, + "balance_loss_mlp": 1.01717389, + "epoch": 0.374447617616113, + "flos": 26598963479040.0, + "grad_norm": 1.9067961643812634, + "language_loss": 0.82825708, + "learning_rate": 2.879253987586635e-06, + "loss": 0.84983832, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13214111, + "step": 6228, + "time_per_iteration": 2.6684818267822266 + }, + { + "auxiliary_loss_clip": 0.01129533, + "auxiliary_loss_mlp": 0.01035273, + "balance_loss_clip": 1.0492065, + "balance_loss_mlp": 1.02210653, + "epoch": 0.374507740868781, + "flos": 21924584435520.0, + "grad_norm": 1.6268765314877192, + "language_loss": 0.74542427, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.76707232, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13171387, + "step": 6229, + "time_per_iteration": 2.6725857257843018 + }, + { + "auxiliary_loss_clip": 0.01132508, + "auxiliary_loss_mlp": 0.01035707, + "balance_loss_clip": 1.0483532, + "balance_loss_mlp": 1.02168739, + "epoch": 0.374567864121449, + "flos": 19652132001600.0, + "grad_norm": 1.9220345147229345, + "language_loss": 0.83230537, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85398751, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.14025879, + "step": 6230, + "time_per_iteration": 2.6669371128082275 + }, + { + "auxiliary_loss_clip": 0.01132438, + "auxiliary_loss_mlp": 0.01036753, + "balance_loss_clip": 1.04950643, + "balance_loss_mlp": 1.02293611, + "epoch": 0.37462798737411696, + "flos": 31449081081600.0, + "grad_norm": 2.110760311010473, + "language_loss": 0.73525929, + "learning_rate": 2.878204417014456e-06, + "loss": 0.75695121, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13818359, + "step": 6231, + "time_per_iteration": 2.8147759437561035 + }, + { + "auxiliary_loss_clip": 0.01135917, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.05144787, + "balance_loss_mlp": 1.02374506, + "epoch": 0.3746881106267849, + "flos": 20321544280320.0, + "grad_norm": 2.164934365825449, + "language_loss": 0.73096025, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.75269687, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14001465, + "step": 6232, + "time_per_iteration": 2.651130437850952 + }, + { + "auxiliary_loss_clip": 0.01130497, + "auxiliary_loss_mlp": 0.01033166, + "balance_loss_clip": 1.04708743, + "balance_loss_mlp": 1.01918852, + "epoch": 0.3747482338794529, + "flos": 31942754801280.0, + "grad_norm": 1.9099268333813142, + "language_loss": 0.77047253, + "learning_rate": 2.877504536769561e-06, + "loss": 0.79210913, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.1395874, + "step": 6233, + "time_per_iteration": 2.769124746322632 + }, + { + "auxiliary_loss_clip": 0.01136472, + "auxiliary_loss_mlp": 0.01036606, + "balance_loss_clip": 1.05256844, + "balance_loss_mlp": 1.0228548, + "epoch": 0.37480835713212085, + "flos": 14667435046080.0, + "grad_norm": 1.8110474721886787, + "language_loss": 0.68840903, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71013981, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.13745117, + "step": 6234, + "time_per_iteration": 2.70369291305542 + }, + { + "auxiliary_loss_clip": 0.01129541, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.04831171, + "balance_loss_mlp": 1.02482736, + "epoch": 0.3748684803847888, + "flos": 24012911233440.0, + "grad_norm": 2.021830474567062, + "language_loss": 0.82371092, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.84538686, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13226318, + "step": 6235, + "time_per_iteration": 2.7150449752807617 + }, + { + "auxiliary_loss_clip": 0.0113511, + "auxiliary_loss_mlp": 0.01032548, + "balance_loss_clip": 1.05069244, + "balance_loss_mlp": 1.0192858, + "epoch": 0.3749286036374568, + "flos": 25040040291360.0, + "grad_norm": 1.863831751303996, + "language_loss": 0.78032565, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.80200219, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.13262939, + "step": 6236, + "time_per_iteration": 2.665403127670288 + }, + { + "auxiliary_loss_clip": 0.0113496, + "auxiliary_loss_mlp": 0.01043632, + "balance_loss_clip": 1.04846239, + "balance_loss_mlp": 1.02801526, + "epoch": 0.37498872689012475, + "flos": 25263785476800.0, + "grad_norm": 2.4863751502347395, + "language_loss": 0.73570824, + "learning_rate": 2.876104377085234e-06, + "loss": 0.75749415, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.15612793, + "step": 6237, + "time_per_iteration": 2.749377965927124 + }, + { + "auxiliary_loss_clip": 0.01132857, + "auxiliary_loss_mlp": 0.01034561, + "balance_loss_clip": 1.04676795, + "balance_loss_mlp": 1.02014816, + "epoch": 0.3750488501427927, + "flos": 26326198735200.0, + "grad_norm": 2.2313842225738236, + "language_loss": 0.93077731, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.95245147, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.14422607, + "step": 6238, + "time_per_iteration": 2.6556313037872314 + }, + { + "auxiliary_loss_clip": 0.01131869, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.04768264, + "balance_loss_mlp": 1.01665425, + "epoch": 0.3751089733954607, + "flos": 19430048024640.0, + "grad_norm": 1.936523567794105, + "language_loss": 0.70597291, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.72760057, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.14239502, + "step": 6239, + "time_per_iteration": 2.6846961975097656 + }, + { + "auxiliary_loss_clip": 0.01135282, + "auxiliary_loss_mlp": 0.01033196, + "balance_loss_clip": 1.0507772, + "balance_loss_mlp": 1.01917148, + "epoch": 0.37516909664812864, + "flos": 44275671290880.0, + "grad_norm": 2.1704375367661224, + "language_loss": 0.6529758, + "learning_rate": 2.875053908444895e-06, + "loss": 0.67466056, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.14013672, + "step": 6240, + "time_per_iteration": 2.77531361579895 + }, + { + "auxiliary_loss_clip": 0.01134985, + "auxiliary_loss_mlp": 0.01028584, + "balance_loss_clip": 1.04937673, + "balance_loss_mlp": 1.01551902, + "epoch": 0.3752292199007966, + "flos": 16491667798080.0, + "grad_norm": 2.355793124503267, + "language_loss": 0.75859064, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.78022635, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.1307373, + "step": 6241, + "time_per_iteration": 2.6791138648986816 + }, + { + "auxiliary_loss_clip": 0.01134578, + "auxiliary_loss_mlp": 0.01037356, + "balance_loss_clip": 1.04974174, + "balance_loss_mlp": 1.02253246, + "epoch": 0.3752893431534646, + "flos": 33187713521760.0, + "grad_norm": 2.278546976685281, + "language_loss": 0.83423823, + "learning_rate": 2.874353430085213e-06, + "loss": 0.85595757, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.14819336, + "step": 6242, + "time_per_iteration": 2.7723138332366943 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01039224, + "balance_loss_clip": 1.04938817, + "balance_loss_mlp": 1.02621818, + "epoch": 0.3753494664061326, + "flos": 36616161430080.0, + "grad_norm": 3.7720177906559216, + "language_loss": 0.68316478, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70490026, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.13012695, + "step": 6243, + "time_per_iteration": 2.7636404037475586 + }, + { + "auxiliary_loss_clip": 0.01132597, + "auxiliary_loss_mlp": 0.01038599, + "balance_loss_clip": 1.04935217, + "balance_loss_mlp": 1.0242461, + "epoch": 0.37540958965880056, + "flos": 29848836618720.0, + "grad_norm": 2.1892114422394364, + "language_loss": 0.83710039, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.85881239, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.14349365, + "step": 6244, + "time_per_iteration": 2.716485023498535 + }, + { + "auxiliary_loss_clip": 0.0112898, + "auxiliary_loss_mlp": 0.01030595, + "balance_loss_clip": 1.04767251, + "balance_loss_mlp": 1.01765537, + "epoch": 0.3754697129114685, + "flos": 20143253132640.0, + "grad_norm": 2.7655466782242133, + "language_loss": 0.83368123, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.85527694, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.1295166, + "step": 6245, + "time_per_iteration": 2.629962921142578 + }, + { + "auxiliary_loss_clip": 0.0113238, + "auxiliary_loss_mlp": 0.01036716, + "balance_loss_clip": 1.04932868, + "balance_loss_mlp": 1.02257824, + "epoch": 0.3755298361641365, + "flos": 23660704804320.0, + "grad_norm": 2.362584240332642, + "language_loss": 0.63715041, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.65884137, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14154053, + "step": 6246, + "time_per_iteration": 2.7131404876708984 + }, + { + "auxiliary_loss_clip": 0.01136083, + "auxiliary_loss_mlp": 0.01036894, + "balance_loss_clip": 1.04993129, + "balance_loss_mlp": 1.02306581, + "epoch": 0.37558995941680445, + "flos": 17961425153280.0, + "grad_norm": 2.042614346124288, + "language_loss": 0.74836266, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.77009243, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 0.86230469, + "router_z_loss_mlp": 0.13824463, + "step": 6247, + "time_per_iteration": 2.696197986602783 + }, + { + "auxiliary_loss_clip": 0.01132474, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.04821539, + "balance_loss_mlp": 1.01908147, + "epoch": 0.3756500826694724, + "flos": 26466085851840.0, + "grad_norm": 7.114185026389849, + "language_loss": 0.55215418, + "learning_rate": 2.872251199697598e-06, + "loss": 0.57380772, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.13793945, + "step": 6248, + "time_per_iteration": 2.707552671432495 + }, + { + "auxiliary_loss_clip": 0.01132096, + "auxiliary_loss_mlp": 0.01035844, + "balance_loss_clip": 1.04909062, + "balance_loss_mlp": 1.02211666, + "epoch": 0.3757102059221404, + "flos": 32342684235840.0, + "grad_norm": 1.959917881060541, + "language_loss": 0.84525132, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86693072, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.13726807, + "step": 6249, + "time_per_iteration": 2.7226791381835938 + }, + { + "auxiliary_loss_clip": 0.01128313, + "auxiliary_loss_mlp": 0.01029789, + "balance_loss_clip": 1.04654479, + "balance_loss_mlp": 1.01648569, + "epoch": 0.37577032917480835, + "flos": 45561586631040.0, + "grad_norm": 1.7626342785290035, + "language_loss": 0.67569619, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.69727719, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13311768, + "step": 6250, + "time_per_iteration": 4.315982818603516 + }, + { + "auxiliary_loss_clip": 0.01132356, + "auxiliary_loss_mlp": 0.01040559, + "balance_loss_clip": 1.0493927, + "balance_loss_mlp": 1.02734494, + "epoch": 0.3758304524274763, + "flos": 26732975590080.0, + "grad_norm": 2.233158044817563, + "language_loss": 0.77800906, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.79973817, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13214111, + "step": 6251, + "time_per_iteration": 2.687764883041382 + }, + { + "auxiliary_loss_clip": 0.01130828, + "auxiliary_loss_mlp": 0.01029146, + "balance_loss_clip": 1.04810429, + "balance_loss_mlp": 1.01611602, + "epoch": 0.3758905756801443, + "flos": 44623744957440.0, + "grad_norm": 1.9047452032774654, + "language_loss": 0.57654262, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.59814239, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13037109, + "step": 6252, + "time_per_iteration": 4.2692177295684814 + }, + { + "auxiliary_loss_clip": 0.01135711, + "auxiliary_loss_mlp": 0.01035774, + "balance_loss_clip": 1.04958296, + "balance_loss_mlp": 1.02156436, + "epoch": 0.37595069893281224, + "flos": 29930952444480.0, + "grad_norm": 1.951331655954917, + "language_loss": 0.89867502, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.92038989, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.14190674, + "step": 6253, + "time_per_iteration": 2.67519211769104 + }, + { + "auxiliary_loss_clip": 0.01133072, + "auxiliary_loss_mlp": 0.01030365, + "balance_loss_clip": 1.05206275, + "balance_loss_mlp": 1.01775301, + "epoch": 0.3760108221854802, + "flos": 20053763161920.0, + "grad_norm": 3.056728018975039, + "language_loss": 0.76755607, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.78919041, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.1262207, + "step": 6254, + "time_per_iteration": 2.6588222980499268 + }, + { + "auxiliary_loss_clip": 0.0113552, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.0505085, + "balance_loss_mlp": 1.02493238, + "epoch": 0.37607094543814823, + "flos": 16803808987680.0, + "grad_norm": 2.024263662955564, + "language_loss": 0.61596847, + "learning_rate": 2.869797092829169e-06, + "loss": 0.63771552, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.14251709, + "step": 6255, + "time_per_iteration": 4.0646281242370605 + }, + { + "auxiliary_loss_clip": 0.01135239, + "auxiliary_loss_mlp": 0.0103334, + "balance_loss_clip": 1.04846632, + "balance_loss_mlp": 1.01841533, + "epoch": 0.3761310686908162, + "flos": 24231267620640.0, + "grad_norm": 2.329378800973771, + "language_loss": 0.73723781, + "learning_rate": 2.869446374096135e-06, + "loss": 0.75892359, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.14916992, + "step": 6256, + "time_per_iteration": 2.6567561626434326 + }, + { + "auxiliary_loss_clip": 0.01135727, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.05089021, + "balance_loss_mlp": 1.02305508, + "epoch": 0.37619119194348416, + "flos": 15557918369760.0, + "grad_norm": 4.112249018094801, + "language_loss": 0.70314956, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72488272, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.14526367, + "step": 6257, + "time_per_iteration": 2.696021318435669 + }, + { + "auxiliary_loss_clip": 0.01133733, + "auxiliary_loss_mlp": 0.01027836, + "balance_loss_clip": 1.05031121, + "balance_loss_mlp": 1.01460958, + "epoch": 0.3762513151961521, + "flos": 21390521338080.0, + "grad_norm": 3.3053868737119463, + "language_loss": 0.84368062, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86529636, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.13226318, + "step": 6258, + "time_per_iteration": 4.029572248458862 + }, + { + "auxiliary_loss_clip": 0.01131388, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.04902005, + "balance_loss_mlp": 1.02760398, + "epoch": 0.3763114384488201, + "flos": 28818749799360.0, + "grad_norm": 1.8122408521531566, + "language_loss": 0.80779344, + "learning_rate": 2.868394020133277e-06, + "loss": 0.8295126, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.12927246, + "step": 6259, + "time_per_iteration": 2.7272422313690186 + }, + { + "auxiliary_loss_clip": 0.01135564, + "auxiliary_loss_mlp": 0.01038666, + "balance_loss_clip": 1.04931235, + "balance_loss_mlp": 1.02361023, + "epoch": 0.37637156170148806, + "flos": 31002360539040.0, + "grad_norm": 2.8569974719053435, + "language_loss": 0.71188486, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.73362708, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.1505127, + "step": 6260, + "time_per_iteration": 2.6785948276519775 + }, + { + "auxiliary_loss_clip": 0.01133075, + "auxiliary_loss_mlp": 0.01036577, + "balance_loss_clip": 1.04852653, + "balance_loss_mlp": 1.02226007, + "epoch": 0.376431684954156, + "flos": 28602824448960.0, + "grad_norm": 1.9912429694499223, + "language_loss": 0.77835083, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80004728, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.14318848, + "step": 6261, + "time_per_iteration": 2.6915929317474365 + }, + { + "auxiliary_loss_clip": 0.01136052, + "auxiliary_loss_mlp": 0.01039997, + "balance_loss_clip": 1.0495379, + "balance_loss_mlp": 1.02612114, + "epoch": 0.376491808206824, + "flos": 41737342050720.0, + "grad_norm": 1.6641020480216884, + "language_loss": 0.8059243, + "learning_rate": 2.867341369804132e-06, + "loss": 0.82768482, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.13861084, + "step": 6262, + "time_per_iteration": 2.805190086364746 + }, + { + "auxiliary_loss_clip": 0.0113004, + "auxiliary_loss_mlp": 0.01030599, + "balance_loss_clip": 1.04751396, + "balance_loss_mlp": 1.01706886, + "epoch": 0.37655193145949195, + "flos": 42935023455840.0, + "grad_norm": 2.4395245669568455, + "language_loss": 0.80614859, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82775497, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13519287, + "step": 6263, + "time_per_iteration": 2.7594170570373535 + }, + { + "auxiliary_loss_clip": 0.01133636, + "auxiliary_loss_mlp": 0.01041269, + "balance_loss_clip": 1.04946136, + "balance_loss_mlp": 1.02728605, + "epoch": 0.3766120547121599, + "flos": 20452720181760.0, + "grad_norm": 1.8582132892313472, + "language_loss": 0.79708409, + "learning_rate": 2.866639438447501e-06, + "loss": 0.81883311, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.13977051, + "step": 6264, + "time_per_iteration": 2.7082810401916504 + }, + { + "auxiliary_loss_clip": 0.01128967, + "auxiliary_loss_mlp": 0.01043147, + "balance_loss_clip": 1.04605675, + "balance_loss_mlp": 1.02903259, + "epoch": 0.3766721779648279, + "flos": 28736390869920.0, + "grad_norm": 1.9375613883592457, + "language_loss": 0.73533916, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.75706029, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.14111328, + "step": 6265, + "time_per_iteration": 2.65492582321167 + }, + { + "auxiliary_loss_clip": 0.01131491, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.05043316, + "balance_loss_mlp": 1.02337778, + "epoch": 0.37673230121749585, + "flos": 35544429197280.0, + "grad_norm": 1.8563336261605992, + "language_loss": 0.68869972, + "learning_rate": 2.865937375638654e-06, + "loss": 0.71037424, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.12585449, + "step": 6266, + "time_per_iteration": 2.7619431018829346 + }, + { + "auxiliary_loss_clip": 0.0113462, + "auxiliary_loss_mlp": 0.01035736, + "balance_loss_clip": 1.04716325, + "balance_loss_mlp": 1.0221405, + "epoch": 0.3767924244701638, + "flos": 34345370204640.0, + "grad_norm": 2.7624308259449304, + "language_loss": 0.62786353, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.64956707, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 0.87451172, + "router_z_loss_mlp": 0.13592529, + "step": 6267, + "time_per_iteration": 2.807100772857666 + }, + { + "auxiliary_loss_clip": 0.01053978, + "auxiliary_loss_mlp": 0.01004109, + "balance_loss_clip": 1.02549207, + "balance_loss_mlp": 1.00249684, + "epoch": 0.37685254772283183, + "flos": 74184953340960.0, + "grad_norm": 0.7145029243462658, + "language_loss": 0.58912426, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.60970521, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.28466797, + "router_z_loss_mlp": 0.0161438, + "step": 6268, + "time_per_iteration": 3.4391119480133057 + }, + { + "auxiliary_loss_clip": 0.01129983, + "auxiliary_loss_mlp": 0.01042348, + "balance_loss_clip": 1.04599619, + "balance_loss_mlp": 1.02816188, + "epoch": 0.3769126709754998, + "flos": 31765233481920.0, + "grad_norm": 3.9936670414656223, + "language_loss": 0.64964902, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67137229, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.1418457, + "step": 6269, + "time_per_iteration": 2.723101854324341 + }, + { + "auxiliary_loss_clip": 0.01131187, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.04955459, + "balance_loss_mlp": 1.01727152, + "epoch": 0.37697279422816776, + "flos": 28773052657920.0, + "grad_norm": 1.7754509752768295, + "language_loss": 0.70754886, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.72917551, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.14202881, + "step": 6270, + "time_per_iteration": 2.67390775680542 + }, + { + "auxiliary_loss_clip": 0.01052901, + "auxiliary_loss_mlp": 0.01002302, + "balance_loss_clip": 1.02440429, + "balance_loss_mlp": 1.00064087, + "epoch": 0.3770329174808357, + "flos": 79002744504480.0, + "grad_norm": 0.6987525731469765, + "language_loss": 0.56109524, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58164728, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.28491211, + "router_z_loss_mlp": 0.01663971, + "step": 6271, + "time_per_iteration": 3.2685089111328125 + }, + { + "auxiliary_loss_clip": 0.01125694, + "auxiliary_loss_mlp": 0.0103424, + "balance_loss_clip": 1.04406548, + "balance_loss_mlp": 1.02038801, + "epoch": 0.3770930407335037, + "flos": 26643566653920.0, + "grad_norm": 1.8016534853636388, + "language_loss": 0.80076987, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.82236922, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13842773, + "step": 6272, + "time_per_iteration": 2.6389224529266357 + }, + { + "auxiliary_loss_clip": 0.01125172, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.045748, + "balance_loss_mlp": 1.02203357, + "epoch": 0.37715316398617166, + "flos": 27757146886560.0, + "grad_norm": 1.7412682498134686, + "language_loss": 0.73842174, + "learning_rate": 2.863479122159103e-06, + "loss": 0.7600193, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12542725, + "step": 6273, + "time_per_iteration": 2.7524988651275635 + }, + { + "auxiliary_loss_clip": 0.01127658, + "auxiliary_loss_mlp": 0.01039372, + "balance_loss_clip": 1.04662895, + "balance_loss_mlp": 1.02613366, + "epoch": 0.3772132872388396, + "flos": 23079648012480.0, + "grad_norm": 1.7967484035286039, + "language_loss": 0.72060347, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.74227381, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13238525, + "step": 6274, + "time_per_iteration": 2.6380422115325928 + }, + { + "auxiliary_loss_clip": 0.01131946, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.04829216, + "balance_loss_mlp": 1.01996219, + "epoch": 0.3772734104915076, + "flos": 21165763220640.0, + "grad_norm": 3.4118545137073943, + "language_loss": 0.83753091, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.8591758, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.12585449, + "step": 6275, + "time_per_iteration": 2.7257702350616455 + }, + { + "auxiliary_loss_clip": 0.01126411, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.04781806, + "balance_loss_mlp": 1.01821077, + "epoch": 0.37733353374417555, + "flos": 39465173237760.0, + "grad_norm": 2.0808547790719243, + "language_loss": 0.7552979, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77686203, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.11804199, + "step": 6276, + "time_per_iteration": 2.738471269607544 + }, + { + "auxiliary_loss_clip": 0.01129559, + "auxiliary_loss_mlp": 0.0103314, + "balance_loss_clip": 1.04652655, + "balance_loss_mlp": 1.01929951, + "epoch": 0.3773936569968435, + "flos": 28505595677760.0, + "grad_norm": 2.365760184518967, + "language_loss": 0.85332334, + "learning_rate": 2.862073685241366e-06, + "loss": 0.87495029, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13842773, + "step": 6277, + "time_per_iteration": 2.667433500289917 + }, + { + "auxiliary_loss_clip": 0.01127402, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.04844689, + "balance_loss_mlp": 1.01774764, + "epoch": 0.3774537802495115, + "flos": 26194212488160.0, + "grad_norm": 2.731525597344517, + "language_loss": 0.78269827, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80427152, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12188721, + "step": 6278, + "time_per_iteration": 2.665130138397217 + }, + { + "auxiliary_loss_clip": 0.01132551, + "auxiliary_loss_mlp": 0.01035863, + "balance_loss_clip": 1.04833305, + "balance_loss_mlp": 1.02190948, + "epoch": 0.37751390350217945, + "flos": 30473240549760.0, + "grad_norm": 2.716733738250203, + "language_loss": 0.83058375, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.85226792, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.13964844, + "step": 6279, + "time_per_iteration": 2.706235647201538 + }, + { + "auxiliary_loss_clip": 0.01129824, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.04743505, + "balance_loss_mlp": 1.02150059, + "epoch": 0.3775740267548474, + "flos": 33945440770080.0, + "grad_norm": 2.301882733821091, + "language_loss": 0.75550199, + "learning_rate": 2.861019264262269e-06, + "loss": 0.77714097, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.12579346, + "step": 6280, + "time_per_iteration": 2.7811410427093506 + }, + { + "auxiliary_loss_clip": 0.01125331, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.0466187, + "balance_loss_mlp": 1.01823831, + "epoch": 0.3776341500075154, + "flos": 27534252564000.0, + "grad_norm": 1.4641504131450889, + "language_loss": 0.76081002, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78236711, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12127686, + "step": 6281, + "time_per_iteration": 2.647282838821411 + }, + { + "auxiliary_loss_clip": 0.0112588, + "auxiliary_loss_mlp": 0.0103112, + "balance_loss_clip": 1.04435849, + "balance_loss_mlp": 1.01844215, + "epoch": 0.3776942732601834, + "flos": 28158940116000.0, + "grad_norm": 1.81861127374491, + "language_loss": 0.84146494, + "learning_rate": 2.860316153670974e-06, + "loss": 0.8630349, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.12677002, + "step": 6282, + "time_per_iteration": 2.694046974182129 + }, + { + "auxiliary_loss_clip": 0.01125542, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.04586601, + "balance_loss_mlp": 1.01694059, + "epoch": 0.37775439651285136, + "flos": 26508379541760.0, + "grad_norm": 1.7283519547736665, + "language_loss": 0.69465482, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.71620953, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13000488, + "step": 6283, + "time_per_iteration": 2.7828385829925537 + }, + { + "auxiliary_loss_clip": 0.0112817, + "auxiliary_loss_mlp": 0.01037828, + "balance_loss_clip": 1.04817224, + "balance_loss_mlp": 1.02429795, + "epoch": 0.37781451976551933, + "flos": 29270170346400.0, + "grad_norm": 2.229016526752753, + "language_loss": 0.76232708, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78398705, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13543701, + "step": 6284, + "time_per_iteration": 2.74670672416687 + }, + { + "auxiliary_loss_clip": 0.01132137, + "auxiliary_loss_mlp": 0.01028496, + "balance_loss_clip": 1.04751873, + "balance_loss_mlp": 1.01499557, + "epoch": 0.3778746430181873, + "flos": 16751061839520.0, + "grad_norm": 2.5869835699950414, + "language_loss": 0.85351729, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.87512362, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.1350708, + "step": 6285, + "time_per_iteration": 2.6419548988342285 + }, + { + "auxiliary_loss_clip": 0.01128602, + "auxiliary_loss_mlp": 0.01035307, + "balance_loss_clip": 1.04606247, + "balance_loss_mlp": 1.02176476, + "epoch": 0.37793476627085526, + "flos": 23745940460640.0, + "grad_norm": 1.7770894248124192, + "language_loss": 0.84600681, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86764586, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.13549805, + "step": 6286, + "time_per_iteration": 2.7451860904693604 + }, + { + "auxiliary_loss_clip": 0.01125807, + "auxiliary_loss_mlp": 0.01036411, + "balance_loss_clip": 1.04517102, + "balance_loss_mlp": 1.02322626, + "epoch": 0.3779948895235232, + "flos": 13063989718080.0, + "grad_norm": 2.346544648589426, + "language_loss": 0.82079518, + "learning_rate": 2.858557806518775e-06, + "loss": 0.84241736, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13189697, + "step": 6287, + "time_per_iteration": 2.6142187118530273 + }, + { + "auxiliary_loss_clip": 0.01126539, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.04618716, + "balance_loss_mlp": 1.01892257, + "epoch": 0.3780550127761912, + "flos": 27222800168160.0, + "grad_norm": 2.696352857306129, + "language_loss": 0.72681499, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.74839944, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12988281, + "step": 6288, + "time_per_iteration": 2.6663401126861572 + }, + { + "auxiliary_loss_clip": 0.01129378, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.04849648, + "balance_loss_mlp": 1.02181029, + "epoch": 0.37811513602885916, + "flos": 35326194361920.0, + "grad_norm": 1.6884523326436156, + "language_loss": 0.75217617, + "learning_rate": 2.857854239668352e-06, + "loss": 0.77381808, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13006592, + "step": 6289, + "time_per_iteration": 4.16347074508667 + }, + { + "auxiliary_loss_clip": 0.01128382, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.04772496, + "balance_loss_mlp": 1.01805234, + "epoch": 0.3781752592815271, + "flos": 28202570876160.0, + "grad_norm": 1.8211950843069278, + "language_loss": 0.73917401, + "learning_rate": 2.857502407441593e-06, + "loss": 0.76076567, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12731934, + "step": 6290, + "time_per_iteration": 2.666041135787964 + }, + { + "auxiliary_loss_clip": 0.01132701, + "auxiliary_loss_mlp": 0.0103638, + "balance_loss_clip": 1.04730201, + "balance_loss_mlp": 1.02120471, + "epoch": 0.3782353825341951, + "flos": 24106128793920.0, + "grad_norm": 2.9818201686842345, + "language_loss": 0.7927568, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.81444764, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.15148926, + "step": 6291, + "time_per_iteration": 4.0822913646698 + }, + { + "auxiliary_loss_clip": 0.01130877, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.04732633, + "balance_loss_mlp": 1.01884437, + "epoch": 0.37829550578686305, + "flos": 26906890871520.0, + "grad_norm": 2.193570343094143, + "language_loss": 0.7603482, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.78198147, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.13604736, + "step": 6292, + "time_per_iteration": 2.6692464351654053 + }, + { + "auxiliary_loss_clip": 0.01128558, + "auxiliary_loss_mlp": 0.01038969, + "balance_loss_clip": 1.04681575, + "balance_loss_mlp": 1.02528334, + "epoch": 0.378355629039531, + "flos": 20096988749280.0, + "grad_norm": 1.8050597489514266, + "language_loss": 0.69827175, + "learning_rate": 2.856446715715224e-06, + "loss": 0.71994704, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13671875, + "step": 6293, + "time_per_iteration": 2.6230835914611816 + }, + { + "auxiliary_loss_clip": 0.01126045, + "auxiliary_loss_mlp": 0.01032956, + "balance_loss_clip": 1.04568744, + "balance_loss_mlp": 1.01995039, + "epoch": 0.378415752292199, + "flos": 24373261635840.0, + "grad_norm": 2.002449247349164, + "language_loss": 0.71359921, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.73518926, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13006592, + "step": 6294, + "time_per_iteration": 2.6655609607696533 + }, + { + "auxiliary_loss_clip": 0.01132462, + "auxiliary_loss_mlp": 0.01033421, + "balance_loss_clip": 1.04822087, + "balance_loss_mlp": 1.02023065, + "epoch": 0.378475875544867, + "flos": 17872340355360.0, + "grad_norm": 2.6732395907626865, + "language_loss": 0.82639182, + "learning_rate": 2.855742758826011e-06, + "loss": 0.84805065, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.13195801, + "step": 6295, + "time_per_iteration": 4.0996503829956055 + }, + { + "auxiliary_loss_clip": 0.0113121, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.0484165, + "balance_loss_mlp": 1.01817775, + "epoch": 0.37853599879753497, + "flos": 32520084003360.0, + "grad_norm": 1.8061750615086607, + "language_loss": 0.71762669, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.73924732, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.12683105, + "step": 6296, + "time_per_iteration": 2.695319414138794 + }, + { + "auxiliary_loss_clip": 0.01126092, + "auxiliary_loss_mlp": 0.01035621, + "balance_loss_clip": 1.04801321, + "balance_loss_mlp": 1.02285945, + "epoch": 0.37859612205020293, + "flos": 21123753151680.0, + "grad_norm": 2.05156844252185, + "language_loss": 0.77138144, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79299855, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12762451, + "step": 6297, + "time_per_iteration": 2.68984055519104 + }, + { + "auxiliary_loss_clip": 0.01128953, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.04726696, + "balance_loss_mlp": 1.02177656, + "epoch": 0.3786562453028709, + "flos": 22232633379840.0, + "grad_norm": 1.797154144022293, + "language_loss": 0.79552191, + "learning_rate": 2.854686580151684e-06, + "loss": 0.8171553, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.1260376, + "step": 6298, + "time_per_iteration": 4.095748662948608 + }, + { + "auxiliary_loss_clip": 0.01127806, + "auxiliary_loss_mlp": 0.01037585, + "balance_loss_clip": 1.04856348, + "balance_loss_mlp": 1.02512205, + "epoch": 0.37871636855553886, + "flos": 25886487682080.0, + "grad_norm": 2.1957688523359757, + "language_loss": 0.8431657, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86481965, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12481689, + "step": 6299, + "time_per_iteration": 2.72021484375 + }, + { + "auxiliary_loss_clip": 0.01131387, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.04900849, + "balance_loss_mlp": 1.01810598, + "epoch": 0.3787764918082068, + "flos": 25567215451200.0, + "grad_norm": 2.002667647921863, + "language_loss": 0.76457536, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.78619683, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.12664795, + "step": 6300, + "time_per_iteration": 2.6604413986206055 + }, + { + "auxiliary_loss_clip": 0.01134034, + "auxiliary_loss_mlp": 0.01035817, + "balance_loss_clip": 1.04838419, + "balance_loss_mlp": 1.02127361, + "epoch": 0.3788366150608748, + "flos": 21119296250880.0, + "grad_norm": 2.4071712979134343, + "language_loss": 0.82755661, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.8492552, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.14538574, + "step": 6301, + "time_per_iteration": 2.692638397216797 + }, + { + "auxiliary_loss_clip": 0.0112769, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.04577827, + "balance_loss_mlp": 1.02453184, + "epoch": 0.37889673831354276, + "flos": 29665197190080.0, + "grad_norm": 2.217060231067756, + "language_loss": 0.67889374, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.70053995, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.12408447, + "step": 6302, + "time_per_iteration": 2.7469210624694824 + }, + { + "auxiliary_loss_clip": 0.01127282, + "auxiliary_loss_mlp": 0.01038732, + "balance_loss_clip": 1.04717469, + "balance_loss_mlp": 1.02644122, + "epoch": 0.3789568615662107, + "flos": 32560230277440.0, + "grad_norm": 2.3398896673502447, + "language_loss": 0.68390346, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.70556355, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.1227417, + "step": 6303, + "time_per_iteration": 2.6839866638183594 + }, + { + "auxiliary_loss_clip": 0.01126508, + "auxiliary_loss_mlp": 0.01033833, + "balance_loss_clip": 1.04617381, + "balance_loss_mlp": 1.02158415, + "epoch": 0.3790169848188787, + "flos": 28779049215360.0, + "grad_norm": 1.7678417261586008, + "language_loss": 0.77916825, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.80077171, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12255859, + "step": 6304, + "time_per_iteration": 2.7123515605926514 + }, + { + "auxiliary_loss_clip": 0.0113377, + "auxiliary_loss_mlp": 0.01037519, + "balance_loss_clip": 1.04981375, + "balance_loss_mlp": 1.02394724, + "epoch": 0.37907710807154665, + "flos": 22497659323200.0, + "grad_norm": 2.629797196842333, + "language_loss": 0.80332649, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.82503939, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13568115, + "step": 6305, + "time_per_iteration": 2.6313979625701904 + }, + { + "auxiliary_loss_clip": 0.0105209, + "auxiliary_loss_mlp": 0.01015058, + "balance_loss_clip": 1.02500141, + "balance_loss_mlp": 1.01370406, + "epoch": 0.3791372313242146, + "flos": 61140128296320.0, + "grad_norm": 0.9812909275098829, + "language_loss": 0.64504766, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66571915, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.27124023, + "router_z_loss_mlp": 0.0135498, + "step": 6306, + "time_per_iteration": 3.2352190017700195 + }, + { + "auxiliary_loss_clip": 0.01131514, + "auxiliary_loss_mlp": 0.0104081, + "balance_loss_clip": 1.04832196, + "balance_loss_mlp": 1.02738118, + "epoch": 0.3791973545768826, + "flos": 29671355816640.0, + "grad_norm": 1.5976210297549573, + "language_loss": 0.73717427, + "learning_rate": 2.851516295441817e-06, + "loss": 0.75889748, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.13427734, + "step": 6307, + "time_per_iteration": 2.6706159114837646 + }, + { + "auxiliary_loss_clip": 0.01132444, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.04879713, + "balance_loss_mlp": 1.0261786, + "epoch": 0.3792574778295506, + "flos": 26331911671680.0, + "grad_norm": 1.6867634973946888, + "language_loss": 0.78448486, + "learning_rate": 2.851163879959112e-06, + "loss": 0.80620396, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13299561, + "step": 6308, + "time_per_iteration": 2.734699010848999 + }, + { + "auxiliary_loss_clip": 0.01126717, + "auxiliary_loss_mlp": 0.01036786, + "balance_loss_clip": 1.04527855, + "balance_loss_mlp": 1.02350557, + "epoch": 0.37931760108221857, + "flos": 27177386647680.0, + "grad_norm": 2.1465748570921432, + "language_loss": 0.72670197, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.74833703, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13275146, + "step": 6309, + "time_per_iteration": 2.643864631652832 + }, + { + "auxiliary_loss_clip": 0.01127815, + "auxiliary_loss_mlp": 0.01032786, + "balance_loss_clip": 1.04724121, + "balance_loss_mlp": 1.01989365, + "epoch": 0.37937772433488653, + "flos": 24017125030560.0, + "grad_norm": 1.5493588607490325, + "language_loss": 0.79041201, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.81201798, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12884521, + "step": 6310, + "time_per_iteration": 2.695868730545044 + }, + { + "auxiliary_loss_clip": 0.0112615, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.04471481, + "balance_loss_mlp": 1.02003968, + "epoch": 0.3794378475875545, + "flos": 23304932854560.0, + "grad_norm": 1.7814982355562698, + "language_loss": 0.76123416, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.78282142, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.12536621, + "step": 6311, + "time_per_iteration": 2.6736629009246826 + }, + { + "auxiliary_loss_clip": 0.01127138, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.04740524, + "balance_loss_mlp": 1.01702631, + "epoch": 0.37949797084022246, + "flos": 24818442521760.0, + "grad_norm": 1.5284577649091855, + "language_loss": 0.71262246, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.73418784, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.12365723, + "step": 6312, + "time_per_iteration": 2.6520161628723145 + }, + { + "auxiliary_loss_clip": 0.01049848, + "auxiliary_loss_mlp": 0.01000811, + "balance_loss_clip": 1.0229404, + "balance_loss_mlp": 0.99932444, + "epoch": 0.37955809409289043, + "flos": 78059270928960.0, + "grad_norm": 0.7761083158000117, + "language_loss": 0.56078428, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58129084, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 0.01485443, + "step": 6313, + "time_per_iteration": 3.372288227081299 + }, + { + "auxiliary_loss_clip": 0.01125326, + "auxiliary_loss_mlp": 0.01031672, + "balance_loss_clip": 1.04426432, + "balance_loss_mlp": 1.01932156, + "epoch": 0.3796182173455584, + "flos": 38486780117280.0, + "grad_norm": 1.9541027279029044, + "language_loss": 0.71609712, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73766708, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.12359619, + "step": 6314, + "time_per_iteration": 2.7503912448883057 + }, + { + "auxiliary_loss_clip": 0.01132191, + "auxiliary_loss_mlp": 0.01035378, + "balance_loss_clip": 1.04687309, + "balance_loss_mlp": 1.02191889, + "epoch": 0.37967834059822636, + "flos": 14801325605280.0, + "grad_norm": 2.0158467549622174, + "language_loss": 0.73187, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75354564, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.13470459, + "step": 6315, + "time_per_iteration": 2.694876194000244 + }, + { + "auxiliary_loss_clip": 0.01122763, + "auxiliary_loss_mlp": 0.01032129, + "balance_loss_clip": 1.04341793, + "balance_loss_mlp": 1.01937425, + "epoch": 0.3797384638508943, + "flos": 48014599180320.0, + "grad_norm": 2.0354487156793497, + "language_loss": 0.70883369, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.73038268, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12750244, + "step": 6316, + "time_per_iteration": 2.792905569076538 + }, + { + "auxiliary_loss_clip": 0.01126339, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.04655969, + "balance_loss_mlp": 1.0200901, + "epoch": 0.3797985871035623, + "flos": 41554148312160.0, + "grad_norm": 2.015349550495413, + "language_loss": 0.65586263, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67745018, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12329102, + "step": 6317, + "time_per_iteration": 2.764575481414795 + }, + { + "auxiliary_loss_clip": 0.01121896, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.04244471, + "balance_loss_mlp": 1.01761544, + "epoch": 0.37985871035623026, + "flos": 28335894193440.0, + "grad_norm": 2.283651454976399, + "language_loss": 0.85467994, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.8761974, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12237549, + "step": 6318, + "time_per_iteration": 2.639090061187744 + }, + { + "auxiliary_loss_clip": 0.01129087, + "auxiliary_loss_mlp": 0.01032818, + "balance_loss_clip": 1.04599881, + "balance_loss_mlp": 1.0187633, + "epoch": 0.3799188336088982, + "flos": 22103604894240.0, + "grad_norm": 2.4608786021839006, + "language_loss": 0.76026666, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.78188574, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.14056396, + "step": 6319, + "time_per_iteration": 2.7046144008636475 + }, + { + "auxiliary_loss_clip": 0.01127019, + "auxiliary_loss_mlp": 0.0102849, + "balance_loss_clip": 1.04638898, + "balance_loss_mlp": 1.01617026, + "epoch": 0.3799789568615662, + "flos": 26688210346080.0, + "grad_norm": 1.5170703423901228, + "language_loss": 0.63450974, + "learning_rate": 2.846932380444744e-06, + "loss": 0.65606481, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.12335205, + "step": 6320, + "time_per_iteration": 2.690699577331543 + }, + { + "auxiliary_loss_clip": 0.01126458, + "auxiliary_loss_mlp": 0.01032706, + "balance_loss_clip": 1.04537308, + "balance_loss_mlp": 1.01950943, + "epoch": 0.3800390801142342, + "flos": 40222414278720.0, + "grad_norm": 1.8609339979862152, + "language_loss": 0.71171188, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73330355, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13208008, + "step": 6321, + "time_per_iteration": 2.741398334503174 + }, + { + "auxiliary_loss_clip": 0.01127263, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.04410172, + "balance_loss_mlp": 1.015993, + "epoch": 0.38009920336690217, + "flos": 32832508813920.0, + "grad_norm": 1.9895748887087692, + "language_loss": 0.74907458, + "learning_rate": 2.846226680280859e-06, + "loss": 0.77063608, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.12896729, + "step": 6322, + "time_per_iteration": 2.7113800048828125 + }, + { + "auxiliary_loss_clip": 0.01125398, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.04474759, + "balance_loss_mlp": 1.01944017, + "epoch": 0.38015932661957014, + "flos": 27441440176320.0, + "grad_norm": 1.9782263303509477, + "language_loss": 0.85011035, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87169325, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13439941, + "step": 6323, + "time_per_iteration": 2.6746950149536133 + }, + { + "auxiliary_loss_clip": 0.01126726, + "auxiliary_loss_mlp": 0.01030948, + "balance_loss_clip": 1.04512167, + "balance_loss_mlp": 1.01701844, + "epoch": 0.3802194498722381, + "flos": 26821371594240.0, + "grad_norm": 2.3610541370196017, + "language_loss": 0.73291212, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75448889, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13946533, + "step": 6324, + "time_per_iteration": 2.697622299194336 + }, + { + "auxiliary_loss_clip": 0.01129556, + "auxiliary_loss_mlp": 0.0103023, + "balance_loss_clip": 1.04580641, + "balance_loss_mlp": 1.01686645, + "epoch": 0.38027957312490607, + "flos": 26020661862240.0, + "grad_norm": 2.5502194848604085, + "language_loss": 0.84461653, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86621439, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13378906, + "step": 6325, + "time_per_iteration": 2.6257731914520264 + }, + { + "auxiliary_loss_clip": 0.01125492, + "auxiliary_loss_mlp": 0.01027831, + "balance_loss_clip": 1.04506421, + "balance_loss_mlp": 1.0155946, + "epoch": 0.38033969637757403, + "flos": 20366350041600.0, + "grad_norm": 1.8217246632628314, + "language_loss": 0.7996437, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.82117701, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.12255859, + "step": 6326, + "time_per_iteration": 2.695371150970459 + }, + { + "auxiliary_loss_clip": 0.01125538, + "auxiliary_loss_mlp": 0.0103091, + "balance_loss_clip": 1.04494023, + "balance_loss_mlp": 1.01861334, + "epoch": 0.380399819630242, + "flos": 44185695112800.0, + "grad_norm": 1.7815612125762847, + "language_loss": 0.72823071, + "learning_rate": 2.844461868547842e-06, + "loss": 0.7497952, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12304688, + "step": 6327, + "time_per_iteration": 2.755248546600342 + }, + { + "auxiliary_loss_clip": 0.0112649, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.0459168, + "balance_loss_mlp": 1.01603675, + "epoch": 0.38045994288290996, + "flos": 25976666446560.0, + "grad_norm": 1.6482598805081103, + "language_loss": 0.83071566, + "learning_rate": 2.844108810081459e-06, + "loss": 0.85226893, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.12792969, + "step": 6328, + "time_per_iteration": 2.6740429401397705 + }, + { + "auxiliary_loss_clip": 0.0112306, + "auxiliary_loss_mlp": 0.01025014, + "balance_loss_clip": 1.04353189, + "balance_loss_mlp": 1.01219344, + "epoch": 0.38052006613557793, + "flos": 25530189007680.0, + "grad_norm": 1.416332534435864, + "language_loss": 0.61114967, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63263035, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12823486, + "step": 6329, + "time_per_iteration": 4.275714159011841 + }, + { + "auxiliary_loss_clip": 0.01126139, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.04568768, + "balance_loss_mlp": 1.0191437, + "epoch": 0.3805801893882459, + "flos": 25613439317280.0, + "grad_norm": 1.9666162802966145, + "language_loss": 0.55963302, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.58121234, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12658691, + "step": 6330, + "time_per_iteration": 4.097424507141113 + }, + { + "auxiliary_loss_clip": 0.01122195, + "auxiliary_loss_mlp": 0.01029127, + "balance_loss_clip": 1.04559803, + "balance_loss_mlp": 1.016801, + "epoch": 0.38064031264091386, + "flos": 31186850830560.0, + "grad_norm": 1.533650993060714, + "language_loss": 0.66341913, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.68493235, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12335205, + "step": 6331, + "time_per_iteration": 2.6976349353790283 + }, + { + "auxiliary_loss_clip": 0.01128492, + "auxiliary_loss_mlp": 0.01034315, + "balance_loss_clip": 1.04674864, + "balance_loss_mlp": 1.02099299, + "epoch": 0.3807004358935818, + "flos": 18407335350240.0, + "grad_norm": 1.532380920342363, + "language_loss": 0.75488532, + "learning_rate": 2.842696256262919e-06, + "loss": 0.7765134, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13323975, + "step": 6332, + "time_per_iteration": 2.661404848098755 + }, + { + "auxiliary_loss_clip": 0.01125105, + "auxiliary_loss_mlp": 0.01033512, + "balance_loss_clip": 1.04193425, + "balance_loss_mlp": 1.02026796, + "epoch": 0.3807605591462498, + "flos": 20010213436320.0, + "grad_norm": 1.9999903826961176, + "language_loss": 0.82072616, + "learning_rate": 2.842343037886987e-06, + "loss": 0.84231234, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.13238525, + "step": 6333, + "time_per_iteration": 2.5961360931396484 + }, + { + "auxiliary_loss_clip": 0.01125722, + "auxiliary_loss_mlp": 0.01027758, + "balance_loss_clip": 1.0448519, + "balance_loss_mlp": 1.01509809, + "epoch": 0.3808206823989178, + "flos": 35456357331360.0, + "grad_norm": 1.5485158549798712, + "language_loss": 0.86243975, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88397455, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.12664795, + "step": 6334, + "time_per_iteration": 4.163980722427368 + }, + { + "auxiliary_loss_clip": 0.01123877, + "auxiliary_loss_mlp": 0.01028748, + "balance_loss_clip": 1.04283881, + "balance_loss_mlp": 1.01571226, + "epoch": 0.3808808056515858, + "flos": 19163928114720.0, + "grad_norm": 3.1035884143455137, + "language_loss": 0.79339075, + "learning_rate": 2.841636505323321e-06, + "loss": 0.81491703, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13031006, + "step": 6335, + "time_per_iteration": 2.622345209121704 + }, + { + "auxiliary_loss_clip": 0.01126543, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.04401648, + "balance_loss_mlp": 1.02003467, + "epoch": 0.38094092890425374, + "flos": 25263744959520.0, + "grad_norm": 1.975043863761989, + "language_loss": 0.72987771, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.7514782, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13482666, + "step": 6336, + "time_per_iteration": 2.692509412765503 + }, + { + "auxiliary_loss_clip": 0.01122963, + "auxiliary_loss_mlp": 0.01029981, + "balance_loss_clip": 1.04209673, + "balance_loss_mlp": 1.01756573, + "epoch": 0.3810010521569217, + "flos": 25219060750080.0, + "grad_norm": 2.1046574971195655, + "language_loss": 0.69304931, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71457869, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.12420654, + "step": 6337, + "time_per_iteration": 4.114896535873413 + }, + { + "auxiliary_loss_clip": 0.01128163, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.04658103, + "balance_loss_mlp": 1.01643121, + "epoch": 0.38106117540958967, + "flos": 38837406372480.0, + "grad_norm": 1.781602662599361, + "language_loss": 0.63691568, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.65850079, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.13916016, + "step": 6338, + "time_per_iteration": 2.7663304805755615 + }, + { + "auxiliary_loss_clip": 0.01129739, + "auxiliary_loss_mlp": 0.01036514, + "balance_loss_clip": 1.04652596, + "balance_loss_mlp": 1.0226562, + "epoch": 0.38112129866225763, + "flos": 20625136323840.0, + "grad_norm": 1.785549635572871, + "language_loss": 0.68870795, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.71037048, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.13867188, + "step": 6339, + "time_per_iteration": 2.681643009185791 + }, + { + "auxiliary_loss_clip": 0.01129179, + "auxiliary_loss_mlp": 0.01036932, + "balance_loss_clip": 1.04653382, + "balance_loss_mlp": 1.02368212, + "epoch": 0.3811814219149256, + "flos": 25486598764800.0, + "grad_norm": 3.278589513429363, + "language_loss": 0.68014336, + "learning_rate": 2.839869615637177e-06, + "loss": 0.70180452, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13244629, + "step": 6340, + "time_per_iteration": 2.628234386444092 + }, + { + "auxiliary_loss_clip": 0.01131604, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.04671311, + "balance_loss_mlp": 1.02297354, + "epoch": 0.38124154516759357, + "flos": 20365863834240.0, + "grad_norm": 2.236102507934733, + "language_loss": 0.89643359, + "learning_rate": 2.839516142102522e-06, + "loss": 0.91812354, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.14416504, + "step": 6341, + "time_per_iteration": 2.660813093185425 + }, + { + "auxiliary_loss_clip": 0.01129911, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.04529071, + "balance_loss_mlp": 1.02149892, + "epoch": 0.38130166842026153, + "flos": 24015950029440.0, + "grad_norm": 2.1308008609900693, + "language_loss": 0.74892557, + "learning_rate": 2.83916263673333e-06, + "loss": 0.77058083, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.14123535, + "step": 6342, + "time_per_iteration": 2.646962881088257 + }, + { + "auxiliary_loss_clip": 0.01125527, + "auxiliary_loss_mlp": 0.01032515, + "balance_loss_clip": 1.0436995, + "balance_loss_mlp": 1.01926517, + "epoch": 0.3813617916729295, + "flos": 27086843227680.0, + "grad_norm": 1.8479949957122708, + "language_loss": 0.83579683, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85737717, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13232422, + "step": 6343, + "time_per_iteration": 2.6554598808288574 + }, + { + "auxiliary_loss_clip": 0.01126823, + "auxiliary_loss_mlp": 0.01034913, + "balance_loss_clip": 1.04348838, + "balance_loss_mlp": 1.02172828, + "epoch": 0.38142191492559746, + "flos": 23302137162240.0, + "grad_norm": 1.7999037939387423, + "language_loss": 0.77310967, + "learning_rate": 2.838455530544959e-06, + "loss": 0.79472703, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.13189697, + "step": 6344, + "time_per_iteration": 2.7470078468322754 + }, + { + "auxiliary_loss_clip": 0.01128824, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.04582572, + "balance_loss_mlp": 1.02386343, + "epoch": 0.3814820381782654, + "flos": 29404344526560.0, + "grad_norm": 1.9972464166380381, + "language_loss": 0.72869807, + "learning_rate": 2.838101929752593e-06, + "loss": 0.75036192, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13708496, + "step": 6345, + "time_per_iteration": 2.646503210067749 + }, + { + "auxiliary_loss_clip": 0.01124255, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.04362154, + "balance_loss_mlp": 1.01921701, + "epoch": 0.3815421614309334, + "flos": 19252607739840.0, + "grad_norm": 1.8584761352409114, + "language_loss": 0.69328213, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.71484411, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.1272583, + "step": 6346, + "time_per_iteration": 2.636857748031616 + }, + { + "auxiliary_loss_clip": 0.01130059, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.04766226, + "balance_loss_mlp": 1.01852584, + "epoch": 0.38160228468360136, + "flos": 24281826835680.0, + "grad_norm": 1.986195855593379, + "language_loss": 0.75691193, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.77853078, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.13299561, + "step": 6347, + "time_per_iteration": 2.6344895362854004 + }, + { + "auxiliary_loss_clip": 0.0112656, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.04368949, + "balance_loss_mlp": 1.02272308, + "epoch": 0.3816624079362694, + "flos": 23526692693280.0, + "grad_norm": 2.181292383408767, + "language_loss": 0.74807763, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.76969749, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.12701416, + "step": 6348, + "time_per_iteration": 2.681506633758545 + }, + { + "auxiliary_loss_clip": 0.01126759, + "auxiliary_loss_mlp": 0.01031646, + "balance_loss_clip": 1.04486537, + "balance_loss_mlp": 1.01843739, + "epoch": 0.38172253118893734, + "flos": 25841033644320.0, + "grad_norm": 1.9393695464376794, + "language_loss": 0.86980033, + "learning_rate": 2.836687208908142e-06, + "loss": 0.8913843, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13208008, + "step": 6349, + "time_per_iteration": 2.660893678665161 + }, + { + "auxiliary_loss_clip": 0.01127147, + "auxiliary_loss_mlp": 0.01037685, + "balance_loss_clip": 1.04468846, + "balance_loss_mlp": 1.02440524, + "epoch": 0.3817826544416053, + "flos": 21389022198720.0, + "grad_norm": 2.005375618804904, + "language_loss": 0.768273, + "learning_rate": 2.836333449345341e-06, + "loss": 0.78992128, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.1328125, + "step": 6350, + "time_per_iteration": 2.679924964904785 + }, + { + "auxiliary_loss_clip": 0.01127155, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.0450238, + "balance_loss_mlp": 1.01536155, + "epoch": 0.38184277769427327, + "flos": 19921290707520.0, + "grad_norm": 2.1309338533835667, + "language_loss": 0.75808048, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.77965009, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14447021, + "step": 6351, + "time_per_iteration": 2.6454427242279053 + }, + { + "auxiliary_loss_clip": 0.0113155, + "auxiliary_loss_mlp": 0.01035887, + "balance_loss_clip": 1.04746521, + "balance_loss_mlp": 1.02133727, + "epoch": 0.38190290094694124, + "flos": 37148117628960.0, + "grad_norm": 1.8389987155992888, + "language_loss": 0.74390858, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76558292, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.14562988, + "step": 6352, + "time_per_iteration": 2.747663736343384 + }, + { + "auxiliary_loss_clip": 0.0112508, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.04404569, + "balance_loss_mlp": 1.01921189, + "epoch": 0.3819630241996092, + "flos": 17338479844320.0, + "grad_norm": 1.937809314472539, + "language_loss": 0.64344192, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66500568, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.12072754, + "step": 6353, + "time_per_iteration": 2.629955530166626 + }, + { + "auxiliary_loss_clip": 0.01127071, + "auxiliary_loss_mlp": 0.01031908, + "balance_loss_clip": 1.04552555, + "balance_loss_mlp": 1.01929593, + "epoch": 0.38202314745227717, + "flos": 30517195448160.0, + "grad_norm": 1.765144747158167, + "language_loss": 0.83025551, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85184526, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.12597656, + "step": 6354, + "time_per_iteration": 2.7056052684783936 + }, + { + "auxiliary_loss_clip": 0.01127571, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.04785097, + "balance_loss_mlp": 1.01742029, + "epoch": 0.38208327070494513, + "flos": 25397838105120.0, + "grad_norm": 3.5188673882127475, + "language_loss": 0.80311996, + "learning_rate": 2.834564176091943e-06, + "loss": 0.82469249, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12261963, + "step": 6355, + "time_per_iteration": 2.6465747356414795 + }, + { + "auxiliary_loss_clip": 0.01127408, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.04633665, + "balance_loss_mlp": 1.01954341, + "epoch": 0.3821433939576131, + "flos": 27622770120000.0, + "grad_norm": 2.00201567612001, + "language_loss": 0.75434768, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.77594757, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13018799, + "step": 6356, + "time_per_iteration": 2.714846134185791 + }, + { + "auxiliary_loss_clip": 0.01127263, + "auxiliary_loss_mlp": 0.01031485, + "balance_loss_clip": 1.04543912, + "balance_loss_mlp": 1.01859856, + "epoch": 0.38220351721028106, + "flos": 32787176328000.0, + "grad_norm": 1.9439317119103439, + "language_loss": 0.81283927, + "learning_rate": 2.833856245169348e-06, + "loss": 0.83442676, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.12890625, + "step": 6357, + "time_per_iteration": 2.734344005584717 + }, + { + "auxiliary_loss_clip": 0.0113392, + "auxiliary_loss_mlp": 0.01038232, + "balance_loss_clip": 1.05034995, + "balance_loss_mlp": 1.023808, + "epoch": 0.38226364046294903, + "flos": 28513253443680.0, + "grad_norm": 1.8549953343500338, + "language_loss": 0.77663445, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.79835594, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.14416504, + "step": 6358, + "time_per_iteration": 2.656458616256714 + }, + { + "auxiliary_loss_clip": 0.01130295, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.04640341, + "balance_loss_mlp": 1.01990533, + "epoch": 0.382323763715617, + "flos": 23972278752000.0, + "grad_norm": 2.3163844209983586, + "language_loss": 0.78755867, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.80919516, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.13427734, + "step": 6359, + "time_per_iteration": 2.780707836151123 + }, + { + "auxiliary_loss_clip": 0.01129882, + "auxiliary_loss_mlp": 0.01034533, + "balance_loss_clip": 1.04864717, + "balance_loss_mlp": 1.02137864, + "epoch": 0.38238388696828496, + "flos": 66045660837120.0, + "grad_norm": 1.8095860265783832, + "language_loss": 0.69560659, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.7172507, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.13165283, + "step": 6360, + "time_per_iteration": 2.989898204803467 + }, + { + "auxiliary_loss_clip": 0.01127051, + "auxiliary_loss_mlp": 0.01028355, + "balance_loss_clip": 1.04626763, + "balance_loss_mlp": 1.01478922, + "epoch": 0.382444010220953, + "flos": 30427543408320.0, + "grad_norm": 1.5015205883958778, + "language_loss": 0.7865808, + "learning_rate": 2.83244000399261e-06, + "loss": 0.80813491, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13586426, + "step": 6361, + "time_per_iteration": 2.787132501602173 + }, + { + "auxiliary_loss_clip": 0.01123803, + "auxiliary_loss_mlp": 0.01036627, + "balance_loss_clip": 1.0452106, + "balance_loss_mlp": 1.02455056, + "epoch": 0.38250413347362094, + "flos": 51662011235040.0, + "grad_norm": 1.5438835208746564, + "language_loss": 0.65469897, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67630327, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12078857, + "step": 6362, + "time_per_iteration": 2.894564390182495 + }, + { + "auxiliary_loss_clip": 0.01126989, + "auxiliary_loss_mlp": 0.01030777, + "balance_loss_clip": 1.04523683, + "balance_loss_mlp": 1.01667476, + "epoch": 0.3825642567262889, + "flos": 19875836669760.0, + "grad_norm": 1.928800087402159, + "language_loss": 0.82071155, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.84228921, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.14105225, + "step": 6363, + "time_per_iteration": 2.7248032093048096 + }, + { + "auxiliary_loss_clip": 0.01128009, + "auxiliary_loss_mlp": 0.01036035, + "balance_loss_clip": 1.04804277, + "balance_loss_mlp": 1.02308273, + "epoch": 0.3826243799789569, + "flos": 55710446690880.0, + "grad_norm": 1.6767765829645163, + "language_loss": 0.58879507, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.61043549, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12957764, + "step": 6364, + "time_per_iteration": 2.87264347076416 + }, + { + "auxiliary_loss_clip": 0.0113233, + "auxiliary_loss_mlp": 0.01034687, + "balance_loss_clip": 1.04913187, + "balance_loss_mlp": 1.02082932, + "epoch": 0.38268450323162484, + "flos": 30873372570720.0, + "grad_norm": 10.232931366704044, + "language_loss": 0.69329166, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.71496189, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.1385498, + "step": 6365, + "time_per_iteration": 2.694786548614502 + }, + { + "auxiliary_loss_clip": 0.01129418, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.04432905, + "balance_loss_mlp": 1.01839542, + "epoch": 0.3827446264842928, + "flos": 26644376999520.0, + "grad_norm": 1.9179174135487753, + "language_loss": 0.73341477, + "learning_rate": 2.830668992382758e-06, + "loss": 0.75502849, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.13574219, + "step": 6366, + "time_per_iteration": 2.706151008605957 + }, + { + "auxiliary_loss_clip": 0.01129514, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.04750681, + "balance_loss_mlp": 1.02162457, + "epoch": 0.38280474973696077, + "flos": 31402208939040.0, + "grad_norm": 2.3832230200669478, + "language_loss": 0.68836164, + "learning_rate": 2.830314695509902e-06, + "loss": 0.71000904, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13623047, + "step": 6367, + "time_per_iteration": 2.703324556350708 + }, + { + "auxiliary_loss_clip": 0.01126858, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.04846597, + "balance_loss_mlp": 1.01699317, + "epoch": 0.38286487298962874, + "flos": 30378766953600.0, + "grad_norm": 2.0765713384351665, + "language_loss": 0.65081382, + "learning_rate": 2.82996036715143e-06, + "loss": 0.67238641, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13415527, + "step": 6368, + "time_per_iteration": 2.7499327659606934 + }, + { + "auxiliary_loss_clip": 0.01129736, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.04857504, + "balance_loss_mlp": 1.01927423, + "epoch": 0.3829249962422967, + "flos": 34830130122720.0, + "grad_norm": 1.3885469530465273, + "language_loss": 0.68371302, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70533544, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.13250732, + "step": 6369, + "time_per_iteration": 5.403347730636597 + }, + { + "auxiliary_loss_clip": 0.01128676, + "auxiliary_loss_mlp": 0.01040569, + "balance_loss_clip": 1.04795361, + "balance_loss_mlp": 1.0270865, + "epoch": 0.38298511949496467, + "flos": 26199196113600.0, + "grad_norm": 2.693156600621184, + "language_loss": 0.78776038, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.80945283, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13482666, + "step": 6370, + "time_per_iteration": 2.644068956375122 + }, + { + "auxiliary_loss_clip": 0.01127943, + "auxiliary_loss_mlp": 0.01037032, + "balance_loss_clip": 1.04765201, + "balance_loss_mlp": 1.02357292, + "epoch": 0.38304524274763263, + "flos": 38657413499040.0, + "grad_norm": 2.413097592298472, + "language_loss": 0.64409435, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.66574413, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13464355, + "step": 6371, + "time_per_iteration": 2.7928407192230225 + }, + { + "auxiliary_loss_clip": 0.01132196, + "auxiliary_loss_mlp": 0.01039891, + "balance_loss_clip": 1.04697812, + "balance_loss_mlp": 1.02556789, + "epoch": 0.3831053660003006, + "flos": 30599635412160.0, + "grad_norm": 1.998564971381234, + "language_loss": 0.73146659, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.75318748, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.14331055, + "step": 6372, + "time_per_iteration": 2.714735984802246 + }, + { + "auxiliary_loss_clip": 0.01131048, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.04732358, + "balance_loss_mlp": 1.01828671, + "epoch": 0.38316548925296856, + "flos": 28380821506560.0, + "grad_norm": 1.7173268965706636, + "language_loss": 0.84927845, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.87090814, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.13623047, + "step": 6373, + "time_per_iteration": 2.7188210487365723 + }, + { + "auxiliary_loss_clip": 0.01133008, + "auxiliary_loss_mlp": 0.01041653, + "balance_loss_clip": 1.04874229, + "balance_loss_mlp": 1.02786052, + "epoch": 0.3832256125056366, + "flos": 42004555927200.0, + "grad_norm": 2.565251592130731, + "language_loss": 0.75241172, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.77415824, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.13775635, + "step": 6374, + "time_per_iteration": 4.375730991363525 + }, + { + "auxiliary_loss_clip": 0.01132992, + "auxiliary_loss_mlp": 0.01040587, + "balance_loss_clip": 1.04902875, + "balance_loss_mlp": 1.02673459, + "epoch": 0.38328573575830455, + "flos": 26555008580640.0, + "grad_norm": 5.977525318713532, + "language_loss": 0.7582339, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.77996969, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.1383667, + "step": 6375, + "time_per_iteration": 2.703512191772461 + }, + { + "auxiliary_loss_clip": 0.01131336, + "auxiliary_loss_mlp": 0.01037261, + "balance_loss_clip": 1.04789901, + "balance_loss_mlp": 1.02373731, + "epoch": 0.3833458590109725, + "flos": 21207246564960.0, + "grad_norm": 2.2422557167274997, + "language_loss": 0.72829616, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.74998212, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13519287, + "step": 6376, + "time_per_iteration": 2.6582398414611816 + }, + { + "auxiliary_loss_clip": 0.01127649, + "auxiliary_loss_mlp": 0.01034504, + "balance_loss_clip": 1.04643798, + "balance_loss_mlp": 1.02023435, + "epoch": 0.3834059822636405, + "flos": 35904536496000.0, + "grad_norm": 1.6001066294461939, + "language_loss": 0.67837685, + "learning_rate": 2.826769997289796e-06, + "loss": 0.69999838, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.14251709, + "step": 6377, + "time_per_iteration": 4.204295635223389 + }, + { + "auxiliary_loss_clip": 0.01134291, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.04997134, + "balance_loss_mlp": 1.02636158, + "epoch": 0.38346610551630844, + "flos": 26198669388960.0, + "grad_norm": 2.656552620927847, + "language_loss": 0.73160356, + "learning_rate": 2.826415354814344e-06, + "loss": 0.75335443, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.14434814, + "step": 6378, + "time_per_iteration": 2.6515984535217285 + }, + { + "auxiliary_loss_clip": 0.01131243, + "auxiliary_loss_mlp": 0.0103846, + "balance_loss_clip": 1.04744387, + "balance_loss_mlp": 1.02495921, + "epoch": 0.3835262287689764, + "flos": 33630990095520.0, + "grad_norm": 1.9531245080754065, + "language_loss": 0.69123387, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.71293092, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.13500977, + "step": 6379, + "time_per_iteration": 2.7227649688720703 + }, + { + "auxiliary_loss_clip": 0.01130774, + "auxiliary_loss_mlp": 0.01039026, + "balance_loss_clip": 1.04926193, + "balance_loss_mlp": 1.02507901, + "epoch": 0.3835863520216444, + "flos": 18941641551360.0, + "grad_norm": 2.1816007167582967, + "language_loss": 0.83313155, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85482955, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13970947, + "step": 6380, + "time_per_iteration": 2.6008481979370117 + }, + { + "auxiliary_loss_clip": 0.01129133, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.04851842, + "balance_loss_mlp": 1.02254915, + "epoch": 0.38364647527431234, + "flos": 26728842827520.0, + "grad_norm": 1.7866599527182447, + "language_loss": 0.81094193, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83259708, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13842773, + "step": 6381, + "time_per_iteration": 2.704087257385254 + }, + { + "auxiliary_loss_clip": 0.01057085, + "auxiliary_loss_mlp": 0.01004199, + "balance_loss_clip": 1.0292002, + "balance_loss_mlp": 1.00275898, + "epoch": 0.3837065985269803, + "flos": 79966794507840.0, + "grad_norm": 0.8028819251708554, + "language_loss": 0.60479099, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.62540388, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.27905273, + "router_z_loss_mlp": 0.0144043, + "step": 6382, + "time_per_iteration": 3.2857208251953125 + }, + { + "auxiliary_loss_clip": 0.01134098, + "auxiliary_loss_mlp": 0.01035619, + "balance_loss_clip": 1.04932654, + "balance_loss_mlp": 1.02139163, + "epoch": 0.38376672177964827, + "flos": 34479341798400.0, + "grad_norm": 3.1168085669050583, + "language_loss": 0.66223061, + "learning_rate": 2.824641672639794e-06, + "loss": 0.68392783, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.14208984, + "step": 6383, + "time_per_iteration": 2.7139291763305664 + }, + { + "auxiliary_loss_clip": 0.01135985, + "auxiliary_loss_mlp": 0.01033565, + "balance_loss_clip": 1.05236888, + "balance_loss_mlp": 1.02004659, + "epoch": 0.38382684503231623, + "flos": 25174417057920.0, + "grad_norm": 2.8073435328220495, + "language_loss": 0.74773771, + "learning_rate": 2.824286842339587e-06, + "loss": 0.7694332, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13525391, + "step": 6384, + "time_per_iteration": 2.702852249145508 + }, + { + "auxiliary_loss_clip": 0.01130447, + "auxiliary_loss_mlp": 0.01038872, + "balance_loss_clip": 1.04967034, + "balance_loss_mlp": 1.02588463, + "epoch": 0.3838869682849842, + "flos": 23923340228160.0, + "grad_norm": 1.6692049012219468, + "language_loss": 0.76126516, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78295839, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.12994385, + "step": 6385, + "time_per_iteration": 2.695815086364746 + }, + { + "auxiliary_loss_clip": 0.01054682, + "auxiliary_loss_mlp": 0.01004281, + "balance_loss_clip": 1.02715242, + "balance_loss_mlp": 1.00283432, + "epoch": 0.38394709153765216, + "flos": 69010458330240.0, + "grad_norm": 0.92114785155254, + "language_loss": 0.67046905, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69105864, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 0.01444244, + "step": 6386, + "time_per_iteration": 3.165661334991455 + }, + { + "auxiliary_loss_clip": 0.01133729, + "auxiliary_loss_mlp": 0.01037189, + "balance_loss_clip": 1.05267966, + "balance_loss_mlp": 1.02431464, + "epoch": 0.3840072147903202, + "flos": 19387065540960.0, + "grad_norm": 1.726984064688701, + "language_loss": 0.72764444, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.74935365, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.12878418, + "step": 6387, + "time_per_iteration": 2.7134623527526855 + }, + { + "auxiliary_loss_clip": 0.01132384, + "auxiliary_loss_mlp": 0.01037172, + "balance_loss_clip": 1.05172515, + "balance_loss_mlp": 1.0241785, + "epoch": 0.38406733804298815, + "flos": 34432469655840.0, + "grad_norm": 1.687870541178825, + "language_loss": 0.81128341, + "learning_rate": 2.822867208702932e-06, + "loss": 0.83297896, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13000488, + "step": 6388, + "time_per_iteration": 2.788893222808838 + }, + { + "auxiliary_loss_clip": 0.01127695, + "auxiliary_loss_mlp": 0.01043451, + "balance_loss_clip": 1.04749489, + "balance_loss_mlp": 1.0311482, + "epoch": 0.3841274612956561, + "flos": 22236239417760.0, + "grad_norm": 1.8008855813274163, + "language_loss": 0.7631253, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78483677, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12304688, + "step": 6389, + "time_per_iteration": 2.626431465148926 + }, + { + "auxiliary_loss_clip": 0.01135668, + "auxiliary_loss_mlp": 0.01045347, + "balance_loss_clip": 1.05102038, + "balance_loss_mlp": 1.03096986, + "epoch": 0.3841875845483241, + "flos": 24151177658880.0, + "grad_norm": 1.6293828412456872, + "language_loss": 0.76363593, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.78544605, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.14379883, + "step": 6390, + "time_per_iteration": 2.6806278228759766 + }, + { + "auxiliary_loss_clip": 0.01137195, + "auxiliary_loss_mlp": 0.01049372, + "balance_loss_clip": 1.05090785, + "balance_loss_mlp": 1.03478122, + "epoch": 0.38424770780099204, + "flos": 36483283802880.0, + "grad_norm": 2.5563752378499296, + "language_loss": 0.69763982, + "learning_rate": 2.821802155794668e-06, + "loss": 0.71950549, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 0.86279297, + "router_z_loss_mlp": 0.14599609, + "step": 6391, + "time_per_iteration": 2.7329089641571045 + }, + { + "auxiliary_loss_clip": 0.01132544, + "auxiliary_loss_mlp": 0.01037227, + "balance_loss_clip": 1.0482614, + "balance_loss_mlp": 1.02309501, + "epoch": 0.38430783105366, + "flos": 25396946724960.0, + "grad_norm": 1.8271296024024584, + "language_loss": 0.84189463, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86359239, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.14135742, + "step": 6392, + "time_per_iteration": 2.6888434886932373 + }, + { + "auxiliary_loss_clip": 0.01133701, + "auxiliary_loss_mlp": 0.01036459, + "balance_loss_clip": 1.0507282, + "balance_loss_mlp": 1.02322698, + "epoch": 0.384367954306328, + "flos": 13419802185120.0, + "grad_norm": 3.015427558535707, + "language_loss": 0.60794204, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.62964362, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13226318, + "step": 6393, + "time_per_iteration": 2.657953977584839 + }, + { + "auxiliary_loss_clip": 0.01140952, + "auxiliary_loss_mlp": 0.01036648, + "balance_loss_clip": 1.05296111, + "balance_loss_mlp": 1.02169323, + "epoch": 0.38442807755899594, + "flos": 30917854193760.0, + "grad_norm": 2.3773064102457964, + "language_loss": 0.71205401, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73382998, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.1496582, + "step": 6394, + "time_per_iteration": 2.6953413486480713 + }, + { + "auxiliary_loss_clip": 0.01140512, + "auxiliary_loss_mlp": 0.01037768, + "balance_loss_clip": 1.05234957, + "balance_loss_mlp": 1.02279544, + "epoch": 0.3844882008116639, + "flos": 25709331018240.0, + "grad_norm": 2.4439441495237366, + "language_loss": 0.81155348, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.83333629, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 0.88183594, + "router_z_loss_mlp": 0.1496582, + "step": 6395, + "time_per_iteration": 2.6800296306610107 + }, + { + "auxiliary_loss_clip": 0.01139798, + "auxiliary_loss_mlp": 0.01042873, + "balance_loss_clip": 1.05444157, + "balance_loss_mlp": 1.02940226, + "epoch": 0.38454832406433187, + "flos": 21918993050880.0, + "grad_norm": 2.866388607962043, + "language_loss": 0.70522416, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.7270509, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.13458252, + "step": 6396, + "time_per_iteration": 2.614656925201416 + }, + { + "auxiliary_loss_clip": 0.01051045, + "auxiliary_loss_mlp": 0.0100257, + "balance_loss_clip": 1.02420211, + "balance_loss_mlp": 1.00118613, + "epoch": 0.38460844731699984, + "flos": 82881640545120.0, + "grad_norm": 0.8826355156442862, + "language_loss": 0.59714985, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61768603, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.26879883, + "router_z_loss_mlp": 0.01384735, + "step": 6397, + "time_per_iteration": 3.3707754611968994 + }, + { + "auxiliary_loss_clip": 0.01133784, + "auxiliary_loss_mlp": 0.01032456, + "balance_loss_clip": 1.0506438, + "balance_loss_mlp": 1.01824594, + "epoch": 0.3846685705696678, + "flos": 31541447779200.0, + "grad_norm": 1.932212921417396, + "language_loss": 0.8482846, + "learning_rate": 2.819315942271794e-06, + "loss": 0.86994696, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.14196777, + "step": 6398, + "time_per_iteration": 2.6686112880706787 + }, + { + "auxiliary_loss_clip": 0.01134247, + "auxiliary_loss_mlp": 0.01033378, + "balance_loss_clip": 1.05133724, + "balance_loss_mlp": 1.01971054, + "epoch": 0.38472869382233577, + "flos": 19876930636320.0, + "grad_norm": 3.7028430390586204, + "language_loss": 0.79401112, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.8156873, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13690186, + "step": 6399, + "time_per_iteration": 2.6503067016601562 + }, + { + "auxiliary_loss_clip": 0.01136433, + "auxiliary_loss_mlp": 0.01034427, + "balance_loss_clip": 1.05174053, + "balance_loss_mlp": 1.01982343, + "epoch": 0.38478881707500373, + "flos": 23614724041920.0, + "grad_norm": 2.3816070810686254, + "language_loss": 0.67015821, + "learning_rate": 2.818605315732038e-06, + "loss": 0.69186682, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.14587402, + "step": 6400, + "time_per_iteration": 2.653397560119629 + }, + { + "auxiliary_loss_clip": 0.01138868, + "auxiliary_loss_mlp": 0.01043292, + "balance_loss_clip": 1.05328918, + "balance_loss_mlp": 1.0291419, + "epoch": 0.38484894032767175, + "flos": 30335824987200.0, + "grad_norm": 1.8054279883161692, + "language_loss": 0.73179859, + "learning_rate": 2.81824995589303e-06, + "loss": 0.75362015, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.14135742, + "step": 6401, + "time_per_iteration": 2.7337961196899414 + }, + { + "auxiliary_loss_clip": 0.01135479, + "auxiliary_loss_mlp": 0.0104114, + "balance_loss_clip": 1.05118084, + "balance_loss_mlp": 1.02681112, + "epoch": 0.3849090635803397, + "flos": 17694656966880.0, + "grad_norm": 2.289192689307946, + "language_loss": 0.71902788, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.74079412, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.14331055, + "step": 6402, + "time_per_iteration": 2.641735553741455 + }, + { + "auxiliary_loss_clip": 0.01129824, + "auxiliary_loss_mlp": 0.01036115, + "balance_loss_clip": 1.0482769, + "balance_loss_mlp": 1.0226562, + "epoch": 0.3849691868330077, + "flos": 22592862230400.0, + "grad_norm": 2.0745410901812567, + "language_loss": 0.82813621, + "learning_rate": 2.817539143144128e-06, + "loss": 0.84979564, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13458252, + "step": 6403, + "time_per_iteration": 2.6538288593292236 + }, + { + "auxiliary_loss_clip": 0.01134763, + "auxiliary_loss_mlp": 0.01032405, + "balance_loss_clip": 1.05254626, + "balance_loss_mlp": 1.0181241, + "epoch": 0.38502931008567565, + "flos": 26376433812000.0, + "grad_norm": 1.9488609045658984, + "language_loss": 0.83294898, + "learning_rate": 2.817183690261189e-06, + "loss": 0.8546207, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.1428833, + "step": 6404, + "time_per_iteration": 2.6826088428497314 + }, + { + "auxiliary_loss_clip": 0.01137455, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.05153441, + "balance_loss_mlp": 1.02301943, + "epoch": 0.3850894333383436, + "flos": 31011801065280.0, + "grad_norm": 3.2517828942399123, + "language_loss": 0.69445038, + "learning_rate": 2.816828206390563e-06, + "loss": 0.71618706, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.13220215, + "step": 6405, + "time_per_iteration": 2.7098844051361084 + }, + { + "auxiliary_loss_clip": 0.01130554, + "auxiliary_loss_mlp": 0.01038048, + "balance_loss_clip": 1.04977381, + "balance_loss_mlp": 1.02501845, + "epoch": 0.3851495565910116, + "flos": 24682242477600.0, + "grad_norm": 2.21759060585236, + "language_loss": 0.79437268, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81605875, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13031006, + "step": 6406, + "time_per_iteration": 2.7338719367980957 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.05179131, + "balance_loss_mlp": 1.02109754, + "epoch": 0.38520967984367954, + "flos": 20143334167200.0, + "grad_norm": 4.573526682673254, + "language_loss": 0.83935714, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.86107779, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14685059, + "step": 6407, + "time_per_iteration": 2.5990614891052246 + }, + { + "auxiliary_loss_clip": 0.01048656, + "auxiliary_loss_mlp": 0.00999836, + "balance_loss_clip": 1.02202415, + "balance_loss_mlp": 0.99837124, + "epoch": 0.3852698030963475, + "flos": 74814786587520.0, + "grad_norm": 0.8380792618826144, + "language_loss": 0.64978909, + "learning_rate": 2.815761568987365e-06, + "loss": 0.67027402, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.26611328, + "router_z_loss_mlp": 0.01462555, + "step": 6408, + "time_per_iteration": 4.84579873085022 + }, + { + "auxiliary_loss_clip": 0.01137045, + "auxiliary_loss_mlp": 0.01041226, + "balance_loss_clip": 1.05213428, + "balance_loss_mlp": 1.02588403, + "epoch": 0.3853299263490155, + "flos": 27934303550400.0, + "grad_norm": 1.5177487831931158, + "language_loss": 0.7353192, + "learning_rate": 2.8154059613008e-06, + "loss": 0.75710189, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.15344238, + "step": 6409, + "time_per_iteration": 4.1002280712127686 + }, + { + "auxiliary_loss_clip": 0.01145636, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.05485296, + "balance_loss_mlp": 1.02894843, + "epoch": 0.38539004960168344, + "flos": 24461941260960.0, + "grad_norm": 2.493930414317088, + "language_loss": 0.70890903, + "learning_rate": 2.81505032269396e-06, + "loss": 0.73081088, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 0.90771484, + "router_z_loss_mlp": 0.15600586, + "step": 6410, + "time_per_iteration": 2.663539171218872 + }, + { + "auxiliary_loss_clip": 0.01046976, + "auxiliary_loss_mlp": 0.01001181, + "balance_loss_clip": 1.02044606, + "balance_loss_mlp": 0.99974084, + "epoch": 0.3854501728543514, + "flos": 83864531083680.0, + "grad_norm": 0.6737128183239286, + "language_loss": 0.60339296, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62387455, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.01438904, + "step": 6411, + "time_per_iteration": 3.3355772495269775 + }, + { + "auxiliary_loss_clip": 0.01131045, + "auxiliary_loss_mlp": 0.01027688, + "balance_loss_clip": 1.04760039, + "balance_loss_mlp": 1.01481318, + "epoch": 0.38551029610701937, + "flos": 24995882806560.0, + "grad_norm": 2.0159212445359986, + "language_loss": 0.7775079, + "learning_rate": 2.814338952773397e-06, + "loss": 0.79909527, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.12890625, + "step": 6412, + "time_per_iteration": 2.6740996837615967 + }, + { + "auxiliary_loss_clip": 0.01136934, + "auxiliary_loss_mlp": 0.01037633, + "balance_loss_clip": 1.04987788, + "balance_loss_mlp": 1.0220046, + "epoch": 0.38557041935968733, + "flos": 28641025893600.0, + "grad_norm": 1.7482821191883062, + "language_loss": 0.77947694, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.80122262, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.15625, + "step": 6413, + "time_per_iteration": 4.186856031417847 + }, + { + "auxiliary_loss_clip": 0.01045165, + "auxiliary_loss_mlp": 0.01002261, + "balance_loss_clip": 1.0187043, + "balance_loss_mlp": 1.0008086, + "epoch": 0.38563054261235535, + "flos": 78052990750560.0, + "grad_norm": 0.8182318408814718, + "language_loss": 0.61322033, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63369453, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.2644043, + "router_z_loss_mlp": 0.01451874, + "step": 6414, + "time_per_iteration": 3.1236181259155273 + }, + { + "auxiliary_loss_clip": 0.01137244, + "auxiliary_loss_mlp": 0.01035951, + "balance_loss_clip": 1.05091047, + "balance_loss_mlp": 1.02208138, + "epoch": 0.3856906658650233, + "flos": 29270210863680.0, + "grad_norm": 2.5181686009162205, + "language_loss": 0.76994675, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.79167873, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.13873291, + "step": 6415, + "time_per_iteration": 2.696483612060547 + }, + { + "auxiliary_loss_clip": 0.01129328, + "auxiliary_loss_mlp": 0.01030354, + "balance_loss_clip": 1.04986203, + "balance_loss_mlp": 1.01793826, + "epoch": 0.3857507891176913, + "flos": 30514642859520.0, + "grad_norm": 1.7729249361570003, + "language_loss": 0.79765242, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.81924927, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12414551, + "step": 6416, + "time_per_iteration": 2.738659143447876 + }, + { + "auxiliary_loss_clip": 0.01131378, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.04839921, + "balance_loss_mlp": 1.01826441, + "epoch": 0.38581091237035925, + "flos": 26279893834560.0, + "grad_norm": 1.888550679604655, + "language_loss": 0.79472911, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.81635362, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.12805176, + "step": 6417, + "time_per_iteration": 3.975827693939209 + }, + { + "auxiliary_loss_clip": 0.01132359, + "auxiliary_loss_mlp": 0.01038694, + "balance_loss_clip": 1.04948664, + "balance_loss_mlp": 1.02595067, + "epoch": 0.3858710356230272, + "flos": 21211622431200.0, + "grad_norm": 3.8747541710787807, + "language_loss": 0.80124998, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.82296056, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.12762451, + "step": 6418, + "time_per_iteration": 2.749969482421875 + }, + { + "auxiliary_loss_clip": 0.01128516, + "auxiliary_loss_mlp": 0.0103347, + "balance_loss_clip": 1.04704714, + "balance_loss_mlp": 1.02039862, + "epoch": 0.3859311588756952, + "flos": 24818361487200.0, + "grad_norm": 1.987878891130988, + "language_loss": 0.79742396, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.81904387, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13067627, + "step": 6419, + "time_per_iteration": 2.6751999855041504 + }, + { + "auxiliary_loss_clip": 0.01129123, + "auxiliary_loss_mlp": 0.01036424, + "balance_loss_clip": 1.04866314, + "balance_loss_mlp": 1.02155817, + "epoch": 0.38599128212836314, + "flos": 32117723532000.0, + "grad_norm": 2.255118566678123, + "language_loss": 0.6755054, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.6971609, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.14868164, + "step": 6420, + "time_per_iteration": 2.75213623046875 + }, + { + "auxiliary_loss_clip": 0.0113083, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.05051208, + "balance_loss_mlp": 1.02388752, + "epoch": 0.3860514053810311, + "flos": 16537810629600.0, + "grad_norm": 1.8963344164894282, + "language_loss": 0.81467253, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.8363474, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12786865, + "step": 6421, + "time_per_iteration": 2.612473249435425 + }, + { + "auxiliary_loss_clip": 0.01133208, + "auxiliary_loss_mlp": 0.01035036, + "balance_loss_clip": 1.04914522, + "balance_loss_mlp": 1.02130866, + "epoch": 0.3861115286336991, + "flos": 25568957694240.0, + "grad_norm": 2.4081805080774186, + "language_loss": 0.72379512, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.74547756, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13739014, + "step": 6422, + "time_per_iteration": 2.710158348083496 + }, + { + "auxiliary_loss_clip": 0.01129509, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.04968131, + "balance_loss_mlp": 1.02359188, + "epoch": 0.38617165188636704, + "flos": 19965448192320.0, + "grad_norm": 1.7877304278126178, + "language_loss": 0.66397172, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.68562776, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12518311, + "step": 6423, + "time_per_iteration": 2.6755776405334473 + }, + { + "auxiliary_loss_clip": 0.01137265, + "auxiliary_loss_mlp": 0.01038507, + "balance_loss_clip": 1.05308747, + "balance_loss_mlp": 1.02549505, + "epoch": 0.386231775139035, + "flos": 42448602329280.0, + "grad_norm": 2.705035589315453, + "language_loss": 0.68943214, + "learning_rate": 2.810068143123449e-06, + "loss": 0.71118987, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.13000488, + "step": 6424, + "time_per_iteration": 2.8008017539978027 + }, + { + "auxiliary_loss_clip": 0.01129933, + "auxiliary_loss_mlp": 0.01035992, + "balance_loss_clip": 1.04945827, + "balance_loss_mlp": 1.02262235, + "epoch": 0.38629189839170297, + "flos": 26510364888480.0, + "grad_norm": 1.4242392025892106, + "language_loss": 0.7216835, + "learning_rate": 2.809712042331429e-06, + "loss": 0.74334276, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13366699, + "step": 6425, + "time_per_iteration": 2.6501307487487793 + }, + { + "auxiliary_loss_clip": 0.01135084, + "auxiliary_loss_mlp": 0.01035759, + "balance_loss_clip": 1.04897404, + "balance_loss_mlp": 1.02262235, + "epoch": 0.38635202164437094, + "flos": 34073456323680.0, + "grad_norm": 3.6405694492903438, + "language_loss": 0.80152023, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.82322866, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.13146973, + "step": 6426, + "time_per_iteration": 2.7258872985839844 + }, + { + "auxiliary_loss_clip": 0.01133825, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.05054069, + "balance_loss_mlp": 1.02136445, + "epoch": 0.38641214489703896, + "flos": 28780669906560.0, + "grad_norm": 1.7058384386547738, + "language_loss": 0.74793708, + "learning_rate": 2.80899974864781e-06, + "loss": 0.7696231, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.13397217, + "step": 6427, + "time_per_iteration": 2.698671340942383 + }, + { + "auxiliary_loss_clip": 0.01132528, + "auxiliary_loss_mlp": 0.01041617, + "balance_loss_clip": 1.04995382, + "balance_loss_mlp": 1.02826595, + "epoch": 0.3864722681497069, + "flos": 15424230396960.0, + "grad_norm": 1.952599963305639, + "language_loss": 0.70164418, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.72338557, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.13354492, + "step": 6428, + "time_per_iteration": 2.7128570079803467 + }, + { + "auxiliary_loss_clip": 0.01134977, + "auxiliary_loss_mlp": 0.01046667, + "balance_loss_clip": 1.05081511, + "balance_loss_mlp": 1.03299344, + "epoch": 0.3865323914023749, + "flos": 21474379406880.0, + "grad_norm": 2.092441723045781, + "language_loss": 0.84637719, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.86819363, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13690186, + "step": 6429, + "time_per_iteration": 2.6675913333892822 + }, + { + "auxiliary_loss_clip": 0.01133169, + "auxiliary_loss_mlp": 0.01035777, + "balance_loss_clip": 1.05002832, + "balance_loss_mlp": 1.02219331, + "epoch": 0.38659251465504285, + "flos": 22547853882720.0, + "grad_norm": 3.2605907308874476, + "language_loss": 0.80976886, + "learning_rate": 2.807931078076015e-06, + "loss": 0.83145833, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13568115, + "step": 6430, + "time_per_iteration": 2.700026750564575 + }, + { + "auxiliary_loss_clip": 0.01046496, + "auxiliary_loss_mlp": 0.01004748, + "balance_loss_clip": 1.02001703, + "balance_loss_mlp": 1.00325167, + "epoch": 0.3866526379077108, + "flos": 78294401470080.0, + "grad_norm": 0.7199708271110594, + "language_loss": 0.58818698, + "learning_rate": 2.807574793260416e-06, + "loss": 0.60869944, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.26513672, + "router_z_loss_mlp": 0.01495361, + "step": 6431, + "time_per_iteration": 3.3120508193969727 + }, + { + "auxiliary_loss_clip": 0.01137097, + "auxiliary_loss_mlp": 0.01037374, + "balance_loss_clip": 1.05034626, + "balance_loss_mlp": 1.02271748, + "epoch": 0.3867127611603788, + "flos": 17556512093280.0, + "grad_norm": 3.15697960234432, + "language_loss": 0.79376256, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81550729, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.14654541, + "step": 6432, + "time_per_iteration": 2.6506094932556152 + }, + { + "auxiliary_loss_clip": 0.0113731, + "auxiliary_loss_mlp": 0.01040506, + "balance_loss_clip": 1.04837132, + "balance_loss_mlp": 1.0257597, + "epoch": 0.38677288441304675, + "flos": 24417135499680.0, + "grad_norm": 2.5634511830804345, + "language_loss": 0.80625534, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82803351, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 0.88916016, + "router_z_loss_mlp": 0.14758301, + "step": 6433, + "time_per_iteration": 2.6726276874542236 + }, + { + "auxiliary_loss_clip": 0.01135467, + "auxiliary_loss_mlp": 0.01035074, + "balance_loss_clip": 1.05101943, + "balance_loss_mlp": 1.02012527, + "epoch": 0.3868330076657147, + "flos": 27133593818400.0, + "grad_norm": 1.652284563382691, + "language_loss": 0.70799577, + "learning_rate": 2.806505755127765e-06, + "loss": 0.72970116, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.14953613, + "step": 6434, + "time_per_iteration": 2.6637887954711914 + }, + { + "auxiliary_loss_clip": 0.01136186, + "auxiliary_loss_mlp": 0.01038668, + "balance_loss_clip": 1.04824591, + "balance_loss_mlp": 1.02410638, + "epoch": 0.3868931309183827, + "flos": 20410548043680.0, + "grad_norm": 1.8957999552908928, + "language_loss": 0.77128732, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79303575, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 0.88037109, + "router_z_loss_mlp": 0.14556885, + "step": 6435, + "time_per_iteration": 2.893589735031128 + }, + { + "auxiliary_loss_clip": 0.0113113, + "auxiliary_loss_mlp": 0.01033074, + "balance_loss_clip": 1.04830897, + "balance_loss_mlp": 1.01896548, + "epoch": 0.38695325417105064, + "flos": 27667859502240.0, + "grad_norm": 1.7424913607018342, + "language_loss": 0.79934466, + "learning_rate": 2.805792910102915e-06, + "loss": 0.82098663, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.14099121, + "step": 6436, + "time_per_iteration": 2.680018424987793 + }, + { + "auxiliary_loss_clip": 0.01129933, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.04827845, + "balance_loss_mlp": 1.01987004, + "epoch": 0.3870133774237186, + "flos": 28201557944160.0, + "grad_norm": 1.810253862881334, + "language_loss": 0.76934093, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.79097199, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13317871, + "step": 6437, + "time_per_iteration": 2.6946425437927246 + }, + { + "auxiliary_loss_clip": 0.01131563, + "auxiliary_loss_mlp": 0.01034335, + "balance_loss_clip": 1.04883254, + "balance_loss_mlp": 1.02184176, + "epoch": 0.3870735006763866, + "flos": 21567718519200.0, + "grad_norm": 2.901714092220467, + "language_loss": 0.81450981, + "learning_rate": 2.805079942855074e-06, + "loss": 0.83616877, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.12493896, + "step": 6438, + "time_per_iteration": 2.6904618740081787 + }, + { + "auxiliary_loss_clip": 0.01134755, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.04985714, + "balance_loss_mlp": 1.01676834, + "epoch": 0.38713362392905454, + "flos": 28425667785120.0, + "grad_norm": 1.4067692674354249, + "language_loss": 0.75301492, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77467322, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.14294434, + "step": 6439, + "time_per_iteration": 2.677433490753174 + }, + { + "auxiliary_loss_clip": 0.01128811, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.04823852, + "balance_loss_mlp": 1.0170958, + "epoch": 0.38719374718172256, + "flos": 25662458875680.0, + "grad_norm": 1.9012214646353607, + "language_loss": 0.735116, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.75671291, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13787842, + "step": 6440, + "time_per_iteration": 2.7495737075805664 + }, + { + "auxiliary_loss_clip": 0.01134817, + "auxiliary_loss_mlp": 0.01035852, + "balance_loss_clip": 1.04813099, + "balance_loss_mlp": 1.02168989, + "epoch": 0.3872538704343905, + "flos": 23927797128960.0, + "grad_norm": 1.9657708520670347, + "language_loss": 0.8174336, + "learning_rate": 2.804010263051774e-06, + "loss": 0.8391403, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 0.86669922, + "router_z_loss_mlp": 0.1416626, + "step": 6441, + "time_per_iteration": 2.6434481143951416 + }, + { + "auxiliary_loss_clip": 0.01131677, + "auxiliary_loss_mlp": 0.01039814, + "balance_loss_clip": 1.04819942, + "balance_loss_mlp": 1.02672493, + "epoch": 0.3873139936870585, + "flos": 21389954096160.0, + "grad_norm": 2.3929726855835574, + "language_loss": 0.81295955, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.83467448, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13092041, + "step": 6442, + "time_per_iteration": 2.7084920406341553 + }, + { + "auxiliary_loss_clip": 0.01130991, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.04785144, + "balance_loss_mlp": 1.01825333, + "epoch": 0.38737411693972645, + "flos": 21702865114080.0, + "grad_norm": 1.6491354387664692, + "language_loss": 0.83838969, + "learning_rate": 2.803296990719624e-06, + "loss": 0.86002159, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13934326, + "step": 6443, + "time_per_iteration": 2.6284139156341553 + }, + { + "auxiliary_loss_clip": 0.01044054, + "auxiliary_loss_mlp": 0.01002762, + "balance_loss_clip": 1.0178833, + "balance_loss_mlp": 1.00131714, + "epoch": 0.3874342401923944, + "flos": 71144401235040.0, + "grad_norm": 0.7580898808047443, + "language_loss": 0.50205863, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52252674, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.01445007, + "step": 6444, + "time_per_iteration": 3.3339507579803467 + }, + { + "auxiliary_loss_clip": 0.01128274, + "auxiliary_loss_mlp": 0.01034581, + "balance_loss_clip": 1.04872537, + "balance_loss_mlp": 1.02121794, + "epoch": 0.3874943634450624, + "flos": 21611957038560.0, + "grad_norm": 1.731431867620868, + "language_loss": 0.78756356, + "learning_rate": 2.802583596543065e-06, + "loss": 0.80919206, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13378906, + "step": 6445, + "time_per_iteration": 2.627727746963501 + }, + { + "auxiliary_loss_clip": 0.01129007, + "auxiliary_loss_mlp": 0.01033167, + "balance_loss_clip": 1.04844129, + "balance_loss_mlp": 1.01967251, + "epoch": 0.38755448669773035, + "flos": 23482494691200.0, + "grad_norm": 1.9892450988522947, + "language_loss": 0.81580949, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83743119, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13513184, + "step": 6446, + "time_per_iteration": 2.726449728012085 + }, + { + "auxiliary_loss_clip": 0.01129854, + "auxiliary_loss_mlp": 0.01040776, + "balance_loss_clip": 1.04731631, + "balance_loss_mlp": 1.02734089, + "epoch": 0.3876146099503983, + "flos": 25129206123840.0, + "grad_norm": 1.9366829013486548, + "language_loss": 0.77209264, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79379892, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.13439941, + "step": 6447, + "time_per_iteration": 4.087037563323975 + }, + { + "auxiliary_loss_clip": 0.01129359, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.04842138, + "balance_loss_mlp": 1.02072489, + "epoch": 0.3876747332030663, + "flos": 23526773727840.0, + "grad_norm": 2.9659341263672125, + "language_loss": 0.76264644, + "learning_rate": 2.801513277056671e-06, + "loss": 0.78427786, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.1305542, + "step": 6448, + "time_per_iteration": 4.133332014083862 + }, + { + "auxiliary_loss_clip": 0.01129069, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.0482825, + "balance_loss_mlp": 1.01801586, + "epoch": 0.38773485645573424, + "flos": 23118335664480.0, + "grad_norm": 1.6448005524683393, + "language_loss": 0.76227832, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.78387773, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12860107, + "step": 6449, + "time_per_iteration": 2.6521713733673096 + }, + { + "auxiliary_loss_clip": 0.01132173, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.04585648, + "balance_loss_mlp": 1.0212667, + "epoch": 0.3877949797084022, + "flos": 28152295282080.0, + "grad_norm": 1.8749156882000317, + "language_loss": 0.78518045, + "learning_rate": 2.800799578742542e-06, + "loss": 0.80685514, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.14038086, + "step": 6450, + "time_per_iteration": 2.8176610469818115 + }, + { + "auxiliary_loss_clip": 0.01132833, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.04598999, + "balance_loss_mlp": 1.02256298, + "epoch": 0.3878551029610702, + "flos": 35503107922080.0, + "grad_norm": 4.9561460729074325, + "language_loss": 0.77717787, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.79887354, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 0.86767578, + "router_z_loss_mlp": 0.14172363, + "step": 6451, + "time_per_iteration": 2.7678582668304443 + }, + { + "auxiliary_loss_clip": 0.01125096, + "auxiliary_loss_mlp": 0.01028219, + "balance_loss_clip": 1.04600847, + "balance_loss_mlp": 1.01552343, + "epoch": 0.38791522621373814, + "flos": 25619841047520.0, + "grad_norm": 1.8091199896516337, + "language_loss": 0.76547575, + "learning_rate": 2.800085758962812e-06, + "loss": 0.78700888, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.1270752, + "step": 6452, + "time_per_iteration": 2.7428970336914062 + }, + { + "auxiliary_loss_clip": 0.01130413, + "auxiliary_loss_mlp": 0.01041701, + "balance_loss_clip": 1.04801631, + "balance_loss_mlp": 1.02882636, + "epoch": 0.3879753494664061, + "flos": 18896673720960.0, + "grad_norm": 1.55830277656379, + "language_loss": 0.79585725, + "learning_rate": 2.799728803557182e-06, + "loss": 0.81757838, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.12872314, + "step": 6453, + "time_per_iteration": 4.111170291900635 + }, + { + "auxiliary_loss_clip": 0.01137037, + "auxiliary_loss_mlp": 0.0103507, + "balance_loss_clip": 1.05042243, + "balance_loss_mlp": 1.02101541, + "epoch": 0.3880354727190741, + "flos": 26910415874880.0, + "grad_norm": 1.7915133042414093, + "language_loss": 0.71065587, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73237693, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.14044189, + "step": 6454, + "time_per_iteration": 2.693448781967163 + }, + { + "auxiliary_loss_clip": 0.01136934, + "auxiliary_loss_mlp": 0.01038854, + "balance_loss_clip": 1.05024791, + "balance_loss_mlp": 1.02417982, + "epoch": 0.3880955959717421, + "flos": 24819171832800.0, + "grad_norm": 1.6126089627041944, + "language_loss": 0.77473426, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.7964921, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.14685059, + "step": 6455, + "time_per_iteration": 2.683120012283325 + }, + { + "auxiliary_loss_clip": 0.01127055, + "auxiliary_loss_mlp": 0.01034134, + "balance_loss_clip": 1.04612589, + "balance_loss_mlp": 1.02047825, + "epoch": 0.38815571922441006, + "flos": 28156873734720.0, + "grad_norm": 1.603660136327544, + "language_loss": 0.76128697, + "learning_rate": 2.798657755439662e-06, + "loss": 0.78289878, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13653564, + "step": 6456, + "time_per_iteration": 4.160175323486328 + }, + { + "auxiliary_loss_clip": 0.01133296, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.04839337, + "balance_loss_mlp": 1.01890814, + "epoch": 0.388215842477078, + "flos": 25352100446400.0, + "grad_norm": 2.457833116856693, + "language_loss": 0.60487616, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.6265322, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.13397217, + "step": 6457, + "time_per_iteration": 2.655606985092163 + }, + { + "auxiliary_loss_clip": 0.01130904, + "auxiliary_loss_mlp": 0.01035949, + "balance_loss_clip": 1.04577637, + "balance_loss_mlp": 1.02154899, + "epoch": 0.388275965729746, + "flos": 24950955493440.0, + "grad_norm": 2.24470478578551, + "language_loss": 0.80415088, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82581943, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.14398193, + "step": 6458, + "time_per_iteration": 2.708507776260376 + }, + { + "auxiliary_loss_clip": 0.01132267, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.04763734, + "balance_loss_mlp": 1.02245891, + "epoch": 0.38833608898241395, + "flos": 34034849706240.0, + "grad_norm": 2.1515887248366257, + "language_loss": 0.81640816, + "learning_rate": 2.797586434755509e-06, + "loss": 0.83808994, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.13446045, + "step": 6459, + "time_per_iteration": 2.716505765914917 + }, + { + "auxiliary_loss_clip": 0.01127659, + "auxiliary_loss_mlp": 0.01033143, + "balance_loss_clip": 1.04721856, + "balance_loss_mlp": 1.02016127, + "epoch": 0.3883962122350819, + "flos": 22056732751680.0, + "grad_norm": 3.0120385458688244, + "language_loss": 0.61466932, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.63627738, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12994385, + "step": 6460, + "time_per_iteration": 2.7633681297302246 + }, + { + "auxiliary_loss_clip": 0.01131816, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.05018854, + "balance_loss_mlp": 1.02020085, + "epoch": 0.3884563354877499, + "flos": 28824219632160.0, + "grad_norm": 1.672234824923352, + "language_loss": 0.8601079, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88174993, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.12176514, + "step": 6461, + "time_per_iteration": 2.682936191558838 + }, + { + "auxiliary_loss_clip": 0.01131361, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.04716134, + "balance_loss_mlp": 1.01975441, + "epoch": 0.38851645874041785, + "flos": 33499409021280.0, + "grad_norm": 2.3075196700406617, + "language_loss": 0.70581293, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.72745681, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.1328125, + "step": 6462, + "time_per_iteration": 2.784363269805908 + }, + { + "auxiliary_loss_clip": 0.01129384, + "auxiliary_loss_mlp": 0.01036454, + "balance_loss_clip": 1.04546762, + "balance_loss_mlp": 1.02242303, + "epoch": 0.3885765819930858, + "flos": 30783720530880.0, + "grad_norm": 2.1249900461053772, + "language_loss": 0.75945008, + "learning_rate": 2.796157583816052e-06, + "loss": 0.78110844, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.14031982, + "step": 6463, + "time_per_iteration": 2.676258087158203 + }, + { + "auxiliary_loss_clip": 0.01135342, + "auxiliary_loss_mlp": 0.01042244, + "balance_loss_clip": 1.049752, + "balance_loss_mlp": 1.02741456, + "epoch": 0.3886367052457538, + "flos": 20677964506560.0, + "grad_norm": 2.297391259872665, + "language_loss": 0.69842547, + "learning_rate": 2.795800295571382e-06, + "loss": 0.72020125, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.1484375, + "step": 6464, + "time_per_iteration": 2.7305307388305664 + }, + { + "auxiliary_loss_clip": 0.01130022, + "auxiliary_loss_mlp": 0.01032087, + "balance_loss_clip": 1.04882312, + "balance_loss_mlp": 1.01839638, + "epoch": 0.38869682849842174, + "flos": 33134763787200.0, + "grad_norm": 2.175596033212069, + "language_loss": 0.69659817, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.71821922, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13696289, + "step": 6465, + "time_per_iteration": 2.719264030456543 + }, + { + "auxiliary_loss_clip": 0.01131099, + "auxiliary_loss_mlp": 0.01035627, + "balance_loss_clip": 1.04648376, + "balance_loss_mlp": 1.02166164, + "epoch": 0.3887569517510897, + "flos": 25702402563360.0, + "grad_norm": 2.3474464066017333, + "language_loss": 0.78262103, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80428827, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.13970947, + "step": 6466, + "time_per_iteration": 2.7044060230255127 + }, + { + "auxiliary_loss_clip": 0.01134944, + "auxiliary_loss_mlp": 0.0103876, + "balance_loss_clip": 1.05032873, + "balance_loss_mlp": 1.02483618, + "epoch": 0.38881707500375773, + "flos": 35993418707520.0, + "grad_norm": 1.5304428091620124, + "language_loss": 0.6939649, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71570194, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.13928223, + "step": 6467, + "time_per_iteration": 2.7971084117889404 + }, + { + "auxiliary_loss_clip": 0.01134501, + "auxiliary_loss_mlp": 0.01039532, + "balance_loss_clip": 1.05011702, + "balance_loss_mlp": 1.0248518, + "epoch": 0.3888771982564257, + "flos": 21339475915680.0, + "grad_norm": 2.409395364883989, + "language_loss": 0.83295012, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85469043, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.14666748, + "step": 6468, + "time_per_iteration": 2.6459760665893555 + }, + { + "auxiliary_loss_clip": 0.01129776, + "auxiliary_loss_mlp": 0.01033903, + "balance_loss_clip": 1.0470016, + "balance_loss_mlp": 1.02120113, + "epoch": 0.38893732150909366, + "flos": 26774499451680.0, + "grad_norm": 1.9550885314981306, + "language_loss": 0.84180653, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.86344332, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.12713623, + "step": 6469, + "time_per_iteration": 2.7304272651672363 + }, + { + "auxiliary_loss_clip": 0.01133233, + "auxiliary_loss_mlp": 0.01033817, + "balance_loss_clip": 1.05015135, + "balance_loss_mlp": 1.01994753, + "epoch": 0.3889974447617616, + "flos": 29622660396480.0, + "grad_norm": 1.8175878818724498, + "language_loss": 0.74851239, + "learning_rate": 2.793655932864273e-06, + "loss": 0.77018291, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13873291, + "step": 6470, + "time_per_iteration": 2.6786773204803467 + }, + { + "auxiliary_loss_clip": 0.01132569, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.04859328, + "balance_loss_mlp": 1.02189898, + "epoch": 0.3890575680144296, + "flos": 31319363802240.0, + "grad_norm": 1.650768911510791, + "language_loss": 0.74919295, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.77088571, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.14788818, + "step": 6471, + "time_per_iteration": 2.7666611671447754 + }, + { + "auxiliary_loss_clip": 0.01134123, + "auxiliary_loss_mlp": 0.0104468, + "balance_loss_clip": 1.0517602, + "balance_loss_mlp": 1.03051805, + "epoch": 0.38911769126709755, + "flos": 27888363305280.0, + "grad_norm": 2.000243744609043, + "language_loss": 0.68271327, + "learning_rate": 2.792940904386562e-06, + "loss": 0.70450127, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.14147949, + "step": 6472, + "time_per_iteration": 2.689335584640503 + }, + { + "auxiliary_loss_clip": 0.01133337, + "auxiliary_loss_mlp": 0.01043775, + "balance_loss_clip": 1.04956222, + "balance_loss_mlp": 1.03044772, + "epoch": 0.3891778145197655, + "flos": 31051461132000.0, + "grad_norm": 1.7237462734812932, + "language_loss": 0.76401097, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.7857821, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.13323975, + "step": 6473, + "time_per_iteration": 2.7492592334747314 + }, + { + "auxiliary_loss_clip": 0.01136115, + "auxiliary_loss_mlp": 0.01041174, + "balance_loss_clip": 1.05231428, + "balance_loss_mlp": 1.02755427, + "epoch": 0.3892379377724335, + "flos": 17116395867360.0, + "grad_norm": 1.8558108646065694, + "language_loss": 0.7094546, + "learning_rate": 2.792225755635257e-06, + "loss": 0.73122746, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13635254, + "step": 6474, + "time_per_iteration": 2.6881964206695557 + }, + { + "auxiliary_loss_clip": 0.01132406, + "auxiliary_loss_mlp": 0.01036801, + "balance_loss_clip": 1.04836941, + "balance_loss_mlp": 1.02421296, + "epoch": 0.38929806102510145, + "flos": 24596074923840.0, + "grad_norm": 1.4590106727990209, + "language_loss": 0.6893611, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.71105313, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.12591553, + "step": 6475, + "time_per_iteration": 2.7098822593688965 + }, + { + "auxiliary_loss_clip": 0.01142933, + "auxiliary_loss_mlp": 0.01044604, + "balance_loss_clip": 1.05463123, + "balance_loss_mlp": 1.0302211, + "epoch": 0.3893581842777694, + "flos": 27044590055040.0, + "grad_norm": 1.8548183298170344, + "language_loss": 0.75616652, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.77804184, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 0.88232422, + "router_z_loss_mlp": 0.14379883, + "step": 6476, + "time_per_iteration": 2.694999933242798 + }, + { + "auxiliary_loss_clip": 0.01047041, + "auxiliary_loss_mlp": 0.01001536, + "balance_loss_clip": 1.02061236, + "balance_loss_mlp": 1.00010204, + "epoch": 0.3894183075304374, + "flos": 82121522777280.0, + "grad_norm": 0.7810076421580028, + "language_loss": 0.58251375, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60299951, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.26538086, + "router_z_loss_mlp": 0.014328, + "step": 6477, + "time_per_iteration": 3.295849561691284 + }, + { + "auxiliary_loss_clip": 0.01136812, + "auxiliary_loss_mlp": 0.01039506, + "balance_loss_clip": 1.05188179, + "balance_loss_mlp": 1.02523124, + "epoch": 0.38947843078310534, + "flos": 22631914537920.0, + "grad_norm": 1.9336855936255164, + "language_loss": 0.78344977, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80521297, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.14294434, + "step": 6478, + "time_per_iteration": 2.699160575866699 + }, + { + "auxiliary_loss_clip": 0.01133204, + "auxiliary_loss_mlp": 0.0103816, + "balance_loss_clip": 1.04984736, + "balance_loss_mlp": 1.02513075, + "epoch": 0.3895385540357733, + "flos": 17822145795840.0, + "grad_norm": 2.204523517103074, + "language_loss": 0.82556832, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.84728199, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.13037109, + "step": 6479, + "time_per_iteration": 2.605208396911621 + }, + { + "auxiliary_loss_clip": 0.01132246, + "auxiliary_loss_mlp": 0.0103279, + "balance_loss_clip": 1.05021238, + "balance_loss_mlp": 1.01879525, + "epoch": 0.38959867728844133, + "flos": 24373666808640.0, + "grad_norm": 1.7787213851918866, + "language_loss": 0.80130291, + "learning_rate": 2.790079588824617e-06, + "loss": 0.82295334, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.13995361, + "step": 6480, + "time_per_iteration": 2.684624433517456 + }, + { + "auxiliary_loss_clip": 0.01129462, + "auxiliary_loss_mlp": 0.0102803, + "balance_loss_clip": 1.04838908, + "balance_loss_mlp": 1.01556611, + "epoch": 0.3896588005411093, + "flos": 27664942258080.0, + "grad_norm": 2.354718954390899, + "language_loss": 0.82900929, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85058421, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.12469482, + "step": 6481, + "time_per_iteration": 2.6563916206359863 + }, + { + "auxiliary_loss_clip": 0.01131147, + "auxiliary_loss_mlp": 0.01032787, + "balance_loss_clip": 1.05184674, + "balance_loss_mlp": 1.02054405, + "epoch": 0.38971892379377726, + "flos": 25619435874720.0, + "grad_norm": 1.5862884777599375, + "language_loss": 0.75561666, + "learning_rate": 2.789363960063863e-06, + "loss": 0.77725595, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12249756, + "step": 6482, + "time_per_iteration": 2.696316957473755 + }, + { + "auxiliary_loss_clip": 0.01133147, + "auxiliary_loss_mlp": 0.01035166, + "balance_loss_clip": 1.05002165, + "balance_loss_mlp": 1.02244687, + "epoch": 0.3897790470464452, + "flos": 27489446802720.0, + "grad_norm": 1.7882266256529427, + "language_loss": 0.79224873, + "learning_rate": 2.78900610077756e-06, + "loss": 0.81393182, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.12719727, + "step": 6483, + "time_per_iteration": 2.6502685546875 + }, + { + "auxiliary_loss_clip": 0.01132648, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.04849863, + "balance_loss_mlp": 1.01605582, + "epoch": 0.3898391702991132, + "flos": 31982131247040.0, + "grad_norm": 1.5922383631302874, + "language_loss": 0.80199438, + "learning_rate": 2.788648211572067e-06, + "loss": 0.82362843, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.14697266, + "step": 6484, + "time_per_iteration": 2.786694049835205 + }, + { + "auxiliary_loss_clip": 0.01134079, + "auxiliary_loss_mlp": 0.01043441, + "balance_loss_clip": 1.05225289, + "balance_loss_mlp": 1.02858758, + "epoch": 0.38989929355178116, + "flos": 25704590496480.0, + "grad_norm": 1.6176049725548403, + "language_loss": 0.77692628, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.79870152, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.1484375, + "step": 6485, + "time_per_iteration": 2.625056028366089 + }, + { + "auxiliary_loss_clip": 0.01133952, + "auxiliary_loss_mlp": 0.01033685, + "balance_loss_clip": 1.04925215, + "balance_loss_mlp": 1.01943362, + "epoch": 0.3899594168044491, + "flos": 31096104824160.0, + "grad_norm": 2.415661409140425, + "language_loss": 0.84927881, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.87095511, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.14263916, + "step": 6486, + "time_per_iteration": 4.12963080406189 + }, + { + "auxiliary_loss_clip": 0.0113469, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.04844594, + "balance_loss_mlp": 1.0209986, + "epoch": 0.3900195400571171, + "flos": 37997077091040.0, + "grad_norm": 2.0987650195771916, + "language_loss": 0.85491931, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.87661219, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.13604736, + "step": 6487, + "time_per_iteration": 4.084296464920044 + }, + { + "auxiliary_loss_clip": 0.01129811, + "auxiliary_loss_mlp": 0.01030899, + "balance_loss_clip": 1.04864979, + "balance_loss_mlp": 1.01690435, + "epoch": 0.39007966330978505, + "flos": 24684551962560.0, + "grad_norm": 1.8150439833941867, + "language_loss": 0.73338258, + "learning_rate": 2.787216355829633e-06, + "loss": 0.75498968, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.14025879, + "step": 6488, + "time_per_iteration": 2.7324063777923584 + }, + { + "auxiliary_loss_clip": 0.01137084, + "auxiliary_loss_mlp": 0.01038085, + "balance_loss_clip": 1.052737, + "balance_loss_mlp": 1.02478695, + "epoch": 0.390139786562453, + "flos": 27489730423680.0, + "grad_norm": 1.63105123037319, + "language_loss": 0.6905688, + "learning_rate": 2.786858317231779e-06, + "loss": 0.71232057, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.13287354, + "step": 6489, + "time_per_iteration": 2.6827845573425293 + }, + { + "auxiliary_loss_clip": 0.01126263, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.04753852, + "balance_loss_mlp": 1.02398157, + "epoch": 0.390199909815121, + "flos": 32253883058880.0, + "grad_norm": 1.751853727514991, + "language_loss": 0.80793214, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.82956314, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12878418, + "step": 6490, + "time_per_iteration": 2.743901014328003 + }, + { + "auxiliary_loss_clip": 0.01132752, + "auxiliary_loss_mlp": 0.01036144, + "balance_loss_clip": 1.04924631, + "balance_loss_mlp": 1.02285194, + "epoch": 0.39026003306778895, + "flos": 21078218079360.0, + "grad_norm": 1.800329807483142, + "language_loss": 0.89276922, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.91445822, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13299561, + "step": 6491, + "time_per_iteration": 2.6061997413635254 + }, + { + "auxiliary_loss_clip": 0.01131267, + "auxiliary_loss_mlp": 0.0103869, + "balance_loss_clip": 1.04701388, + "balance_loss_mlp": 1.02513635, + "epoch": 0.3903201563204569, + "flos": 29935530897120.0, + "grad_norm": 2.3504195429869967, + "language_loss": 0.7906422, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.81234181, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.13549805, + "step": 6492, + "time_per_iteration": 4.1259543895721436 + }, + { + "auxiliary_loss_clip": 0.01131465, + "auxiliary_loss_mlp": 0.01037319, + "balance_loss_clip": 1.04806352, + "balance_loss_mlp": 1.02442694, + "epoch": 0.39038027957312493, + "flos": 29002024572480.0, + "grad_norm": 4.160700796089832, + "language_loss": 0.73888314, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.760571, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.12908936, + "step": 6493, + "time_per_iteration": 2.6773574352264404 + }, + { + "auxiliary_loss_clip": 0.01135364, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.04819643, + "balance_loss_mlp": 1.02192378, + "epoch": 0.3904404028257929, + "flos": 17205075492480.0, + "grad_norm": 2.446628142654533, + "language_loss": 0.76251119, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78422189, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 0.87158203, + "router_z_loss_mlp": 0.13781738, + "step": 6494, + "time_per_iteration": 2.6507914066314697 + }, + { + "auxiliary_loss_clip": 0.01139811, + "auxiliary_loss_mlp": 0.01043248, + "balance_loss_clip": 1.04955804, + "balance_loss_mlp": 1.02837062, + "epoch": 0.39050052607846086, + "flos": 20633361331680.0, + "grad_norm": 1.8007406288291778, + "language_loss": 0.74491704, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76674759, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 0.90234375, + "router_z_loss_mlp": 0.14892578, + "step": 6495, + "time_per_iteration": 2.749729633331299 + }, + { + "auxiliary_loss_clip": 0.01134089, + "auxiliary_loss_mlp": 0.01043348, + "balance_loss_clip": 1.05006039, + "balance_loss_mlp": 1.02913833, + "epoch": 0.39056064933112883, + "flos": 31623401535840.0, + "grad_norm": 1.5933650276968667, + "language_loss": 0.67919832, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70097268, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.14215088, + "step": 6496, + "time_per_iteration": 4.004867076873779 + }, + { + "auxiliary_loss_clip": 0.01045307, + "auxiliary_loss_mlp": 0.01006, + "balance_loss_clip": 1.01907635, + "balance_loss_mlp": 1.00471807, + "epoch": 0.3906207725837968, + "flos": 73247071150080.0, + "grad_norm": 0.6648042744435784, + "language_loss": 0.53929615, + "learning_rate": 2.783992935430775e-06, + "loss": 0.55980927, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.26196289, + "router_z_loss_mlp": 0.01282501, + "step": 6497, + "time_per_iteration": 3.354552745819092 + }, + { + "auxiliary_loss_clip": 0.01130773, + "auxiliary_loss_mlp": 0.01035834, + "balance_loss_clip": 1.04800177, + "balance_loss_mlp": 1.02217257, + "epoch": 0.39068089583646476, + "flos": 25708115499840.0, + "grad_norm": 3.814980857506778, + "language_loss": 0.68786502, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.70953113, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13653564, + "step": 6498, + "time_per_iteration": 2.709951162338257 + }, + { + "auxiliary_loss_clip": 0.01045385, + "auxiliary_loss_mlp": 0.01006644, + "balance_loss_clip": 1.01923347, + "balance_loss_mlp": 1.0053004, + "epoch": 0.3907410190891327, + "flos": 85957274265120.0, + "grad_norm": 0.762317312431287, + "language_loss": 0.51919937, + "learning_rate": 2.783276292417936e-06, + "loss": 0.5397197, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.01344299, + "step": 6499, + "time_per_iteration": 3.3110344409942627 + }, + { + "auxiliary_loss_clip": 0.0113211, + "auxiliary_loss_mlp": 0.01042727, + "balance_loss_clip": 1.04662991, + "balance_loss_mlp": 1.02779031, + "epoch": 0.3908011423418007, + "flos": 34120612087200.0, + "grad_norm": 1.7244437655070495, + "language_loss": 0.74007064, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.76181906, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.14953613, + "step": 6500, + "time_per_iteration": 2.7998437881469727 + }, + { + "auxiliary_loss_clip": 0.01135271, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.05017471, + "balance_loss_mlp": 1.02124894, + "epoch": 0.39086126559446865, + "flos": 29848998687840.0, + "grad_norm": 2.3735872834004863, + "language_loss": 0.69210231, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.71380413, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.13665771, + "step": 6501, + "time_per_iteration": 2.7095649242401123 + }, + { + "auxiliary_loss_clip": 0.01129866, + "auxiliary_loss_mlp": 0.01034934, + "balance_loss_clip": 1.04790378, + "balance_loss_mlp": 1.02220201, + "epoch": 0.3909213888471366, + "flos": 20672251570080.0, + "grad_norm": 6.210294103251721, + "language_loss": 0.78821295, + "learning_rate": 2.782201105168287e-06, + "loss": 0.80986094, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.12731934, + "step": 6502, + "time_per_iteration": 2.717756509780884 + }, + { + "auxiliary_loss_clip": 0.01131795, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.05175376, + "balance_loss_mlp": 1.02318192, + "epoch": 0.3909815120998046, + "flos": 35726204831040.0, + "grad_norm": 2.2213796037069753, + "language_loss": 0.80064744, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.82232577, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12854004, + "step": 6503, + "time_per_iteration": 2.7115237712860107 + }, + { + "auxiliary_loss_clip": 0.01126756, + "auxiliary_loss_mlp": 0.0102922, + "balance_loss_clip": 1.04672837, + "balance_loss_mlp": 1.01653028, + "epoch": 0.39104163535247255, + "flos": 23124372739200.0, + "grad_norm": 1.8193342059936157, + "language_loss": 0.71365857, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.73521841, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12695312, + "step": 6504, + "time_per_iteration": 2.7130727767944336 + }, + { + "auxiliary_loss_clip": 0.01128742, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.04685593, + "balance_loss_mlp": 1.01850986, + "epoch": 0.3911017586051405, + "flos": 32120195086080.0, + "grad_norm": 1.8989850713770888, + "language_loss": 0.83363509, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.85524273, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13513184, + "step": 6505, + "time_per_iteration": 2.8388984203338623 + }, + { + "auxiliary_loss_clip": 0.01130595, + "auxiliary_loss_mlp": 0.0103341, + "balance_loss_clip": 1.04971838, + "balance_loss_mlp": 1.0193851, + "epoch": 0.3911618818578085, + "flos": 26644093378560.0, + "grad_norm": 2.010006403858482, + "language_loss": 0.71600169, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.73764175, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.14025879, + "step": 6506, + "time_per_iteration": 2.64609694480896 + }, + { + "auxiliary_loss_clip": 0.01127147, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.04862881, + "balance_loss_mlp": 1.02228904, + "epoch": 0.3912220051104765, + "flos": 19962044740800.0, + "grad_norm": 2.0126887355225693, + "language_loss": 0.7533446, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77496511, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12634277, + "step": 6507, + "time_per_iteration": 2.710646629333496 + }, + { + "auxiliary_loss_clip": 0.0104655, + "auxiliary_loss_mlp": 0.0100065, + "balance_loss_clip": 1.02041268, + "balance_loss_mlp": 0.99925339, + "epoch": 0.39128212836314447, + "flos": 86697295462080.0, + "grad_norm": 0.7570768868377044, + "language_loss": 0.56596208, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58643401, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.26123047, + "router_z_loss_mlp": 0.01396942, + "step": 6508, + "time_per_iteration": 3.389357805252075 + }, + { + "auxiliary_loss_clip": 0.01130706, + "auxiliary_loss_mlp": 0.01036983, + "balance_loss_clip": 1.04886985, + "balance_loss_mlp": 1.02405524, + "epoch": 0.39134225161581243, + "flos": 24809771823840.0, + "grad_norm": 2.017857755211109, + "language_loss": 0.76514858, + "learning_rate": 2.779691297413471e-06, + "loss": 0.78682542, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.12927246, + "step": 6509, + "time_per_iteration": 2.715610980987549 + }, + { + "auxiliary_loss_clip": 0.01131801, + "auxiliary_loss_mlp": 0.01040933, + "balance_loss_clip": 1.04786265, + "balance_loss_mlp": 1.0256145, + "epoch": 0.3914023748684804, + "flos": 20765671716960.0, + "grad_norm": 3.105081968378754, + "language_loss": 0.8266803, + "learning_rate": 2.779332635075825e-06, + "loss": 0.84840763, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.15332031, + "step": 6510, + "time_per_iteration": 2.67472767829895 + }, + { + "auxiliary_loss_clip": 0.01133181, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.04939592, + "balance_loss_mlp": 1.02057552, + "epoch": 0.39146249812114836, + "flos": 22457958739200.0, + "grad_norm": 1.9759799422391178, + "language_loss": 0.76822561, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.78989273, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.12957764, + "step": 6511, + "time_per_iteration": 2.7324795722961426 + }, + { + "auxiliary_loss_clip": 0.01046002, + "auxiliary_loss_mlp": 0.0100113, + "balance_loss_clip": 1.01983917, + "balance_loss_mlp": 0.99965972, + "epoch": 0.3915226213738163, + "flos": 82530244461600.0, + "grad_norm": 0.7236182237864104, + "language_loss": 0.57765949, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59813082, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.26196289, + "router_z_loss_mlp": 0.01470947, + "step": 6512, + "time_per_iteration": 3.3407349586486816 + }, + { + "auxiliary_loss_clip": 0.0113372, + "auxiliary_loss_mlp": 0.01033685, + "balance_loss_clip": 1.04935503, + "balance_loss_mlp": 1.01859868, + "epoch": 0.3915827446264843, + "flos": 32164149984480.0, + "grad_norm": 1.6638132646874233, + "language_loss": 0.69406366, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.7157377, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.15075684, + "step": 6513, + "time_per_iteration": 2.680006265640259 + }, + { + "auxiliary_loss_clip": 0.01136922, + "auxiliary_loss_mlp": 0.01039291, + "balance_loss_clip": 1.04995728, + "balance_loss_mlp": 1.02565897, + "epoch": 0.39164286787915226, + "flos": 26776160660160.0, + "grad_norm": 2.452081793869272, + "language_loss": 0.75775301, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.77951515, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.13635254, + "step": 6514, + "time_per_iteration": 2.6891555786132812 + }, + { + "auxiliary_loss_clip": 0.01133692, + "auxiliary_loss_mlp": 0.01036889, + "balance_loss_clip": 1.04955864, + "balance_loss_mlp": 1.02372289, + "epoch": 0.3917029911318202, + "flos": 20010699643680.0, + "grad_norm": 2.650055317631245, + "language_loss": 0.76909947, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.79080522, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.13165283, + "step": 6515, + "time_per_iteration": 2.6636948585510254 + }, + { + "auxiliary_loss_clip": 0.01129632, + "auxiliary_loss_mlp": 0.01039757, + "balance_loss_clip": 1.04940486, + "balance_loss_mlp": 1.02754378, + "epoch": 0.3917631143844882, + "flos": 31984157111040.0, + "grad_norm": 1.3301316626393167, + "language_loss": 0.79807031, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.81976414, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.12213135, + "step": 6516, + "time_per_iteration": 2.820300579071045 + }, + { + "auxiliary_loss_clip": 0.01136911, + "auxiliary_loss_mlp": 0.01042522, + "balance_loss_clip": 1.05257022, + "balance_loss_mlp": 1.02890873, + "epoch": 0.39182323763715615, + "flos": 22632886952640.0, + "grad_norm": 2.5596459512834007, + "language_loss": 0.70009392, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.7218883, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.13623047, + "step": 6517, + "time_per_iteration": 2.7163171768188477 + }, + { + "auxiliary_loss_clip": 0.01131529, + "auxiliary_loss_mlp": 0.01039657, + "balance_loss_clip": 1.04800367, + "balance_loss_mlp": 1.02596545, + "epoch": 0.3918833608898241, + "flos": 41870462781600.0, + "grad_norm": 1.7863615039902796, + "language_loss": 0.72371131, + "learning_rate": 2.776462273631956e-06, + "loss": 0.7454232, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.137146, + "step": 6518, + "time_per_iteration": 2.7946157455444336 + }, + { + "auxiliary_loss_clip": 0.01134943, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.05057907, + "balance_loss_mlp": 1.02320981, + "epoch": 0.3919434841424921, + "flos": 45075043952640.0, + "grad_norm": 1.5730769508028013, + "language_loss": 0.61632037, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.63803536, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.13366699, + "step": 6519, + "time_per_iteration": 2.820172071456909 + }, + { + "auxiliary_loss_clip": 0.01142059, + "auxiliary_loss_mlp": 0.01041335, + "balance_loss_clip": 1.05368459, + "balance_loss_mlp": 1.02651727, + "epoch": 0.3920036073951601, + "flos": 28686317862240.0, + "grad_norm": 2.0759146822980195, + "language_loss": 0.67007887, + "learning_rate": 2.775744388563563e-06, + "loss": 0.69191277, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 0.88427734, + "router_z_loss_mlp": 0.14807129, + "step": 6520, + "time_per_iteration": 2.7569892406463623 + }, + { + "auxiliary_loss_clip": 0.01130935, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.04920101, + "balance_loss_mlp": 1.02060449, + "epoch": 0.39206373064782807, + "flos": 22458931153920.0, + "grad_norm": 1.7779316503302602, + "language_loss": 0.78642482, + "learning_rate": 2.775385401898104e-06, + "loss": 0.80807638, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13604736, + "step": 6521, + "time_per_iteration": 2.664297580718994 + }, + { + "auxiliary_loss_clip": 0.01138923, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.0511688, + "balance_loss_mlp": 1.01991272, + "epoch": 0.39212385390049603, + "flos": 15023733720480.0, + "grad_norm": 2.2297978838241432, + "language_loss": 0.7008853, + "learning_rate": 2.775026385829952e-06, + "loss": 0.72263718, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.16345215, + "step": 6522, + "time_per_iteration": 2.646174907684326 + }, + { + "auxiliary_loss_clip": 0.01135568, + "auxiliary_loss_mlp": 0.0103773, + "balance_loss_clip": 1.05002594, + "balance_loss_mlp": 1.02403307, + "epoch": 0.392183977153164, + "flos": 24061282515360.0, + "grad_norm": 2.107609348841259, + "language_loss": 0.76919347, + "learning_rate": 2.774667340372722e-06, + "loss": 0.79092646, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.13702393, + "step": 6523, + "time_per_iteration": 2.744488000869751 + }, + { + "auxiliary_loss_clip": 0.01135907, + "auxiliary_loss_mlp": 0.01039622, + "balance_loss_clip": 1.05113745, + "balance_loss_mlp": 1.02579355, + "epoch": 0.39224410040583196, + "flos": 40444498255680.0, + "grad_norm": 3.2186876780292923, + "language_loss": 0.6172241, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.63897938, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.13830566, + "step": 6524, + "time_per_iteration": 2.7859182357788086 + }, + { + "auxiliary_loss_clip": 0.01135306, + "auxiliary_loss_mlp": 0.01037222, + "balance_loss_clip": 1.04991055, + "balance_loss_mlp": 1.0224098, + "epoch": 0.39230422365849993, + "flos": 33900716043360.0, + "grad_norm": 1.6101169892452514, + "language_loss": 0.73700953, + "learning_rate": 2.773949161345489e-06, + "loss": 0.75873482, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.14807129, + "step": 6525, + "time_per_iteration": 2.846193790435791 + }, + { + "auxiliary_loss_clip": 0.01135397, + "auxiliary_loss_mlp": 0.01042556, + "balance_loss_clip": 1.04999435, + "balance_loss_mlp": 1.02898979, + "epoch": 0.3923643469111679, + "flos": 21879981260640.0, + "grad_norm": 1.9612410775690488, + "language_loss": 0.81582916, + "learning_rate": 2.773590027802719e-06, + "loss": 0.8376087, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.13562012, + "step": 6526, + "time_per_iteration": 5.494038105010986 + }, + { + "auxiliary_loss_clip": 0.01132082, + "auxiliary_loss_mlp": 0.01038218, + "balance_loss_clip": 1.04803729, + "balance_loss_mlp": 1.02459264, + "epoch": 0.39242447016383586, + "flos": 29357877556800.0, + "grad_norm": 1.9207390818366543, + "language_loss": 0.69839954, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.72010255, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13635254, + "step": 6527, + "time_per_iteration": 2.6741135120391846 + }, + { + "auxiliary_loss_clip": 0.01134339, + "auxiliary_loss_mlp": 0.01033707, + "balance_loss_clip": 1.05157936, + "balance_loss_mlp": 1.02048659, + "epoch": 0.3924845934165038, + "flos": 13012174467360.0, + "grad_norm": 6.627815432418446, + "language_loss": 0.82676554, + "learning_rate": 2.772871672726965e-06, + "loss": 0.84844601, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13214111, + "step": 6528, + "time_per_iteration": 2.77370285987854 + }, + { + "auxiliary_loss_clip": 0.01133766, + "auxiliary_loss_mlp": 0.01038802, + "balance_loss_clip": 1.05277348, + "balance_loss_mlp": 1.02509296, + "epoch": 0.3925447166691718, + "flos": 38127361612320.0, + "grad_norm": 1.6721013740249615, + "language_loss": 0.68428302, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70600873, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.137146, + "step": 6529, + "time_per_iteration": 2.803272008895874 + }, + { + "auxiliary_loss_clip": 0.01134754, + "auxiliary_loss_mlp": 0.0104148, + "balance_loss_clip": 1.04990983, + "balance_loss_mlp": 1.0275985, + "epoch": 0.39260483992183975, + "flos": 35893799416800.0, + "grad_norm": 2.510540650330409, + "language_loss": 0.79593545, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.81769776, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.13873291, + "step": 6530, + "time_per_iteration": 2.8107879161834717 + }, + { + "auxiliary_loss_clip": 0.01129769, + "auxiliary_loss_mlp": 0.01039287, + "balance_loss_clip": 1.04771233, + "balance_loss_mlp": 1.02536941, + "epoch": 0.3926649631745077, + "flos": 27891199514880.0, + "grad_norm": 1.57941389165507, + "language_loss": 0.75904667, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.78073728, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13928223, + "step": 6531, + "time_per_iteration": 2.7277026176452637 + }, + { + "auxiliary_loss_clip": 0.01050141, + "auxiliary_loss_mlp": 0.01005947, + "balance_loss_clip": 1.0241363, + "balance_loss_mlp": 1.00448775, + "epoch": 0.3927250864271757, + "flos": 77964189573600.0, + "grad_norm": 0.8138242678091955, + "language_loss": 0.60369045, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62425137, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.26098633, + "router_z_loss_mlp": 0.01457977, + "step": 6532, + "time_per_iteration": 4.573643445968628 + }, + { + "auxiliary_loss_clip": 0.01049271, + "auxiliary_loss_mlp": 0.01003252, + "balance_loss_clip": 1.0232513, + "balance_loss_mlp": 1.00175548, + "epoch": 0.3927852096798437, + "flos": 84083333160960.0, + "grad_norm": 0.7789433120269642, + "language_loss": 0.55489981, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57542503, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.26025391, + "router_z_loss_mlp": 0.01495361, + "step": 6533, + "time_per_iteration": 3.3353586196899414 + }, + { + "auxiliary_loss_clip": 0.01137988, + "auxiliary_loss_mlp": 0.01038093, + "balance_loss_clip": 1.05239701, + "balance_loss_mlp": 1.02418745, + "epoch": 0.39284533293251167, + "flos": 36260186893920.0, + "grad_norm": 1.947378300326364, + "language_loss": 0.76175499, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.78351581, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.13928223, + "step": 6534, + "time_per_iteration": 2.765355110168457 + }, + { + "auxiliary_loss_clip": 0.01139954, + "auxiliary_loss_mlp": 0.01038692, + "balance_loss_clip": 1.05274343, + "balance_loss_mlp": 1.02423239, + "epoch": 0.39290545618517964, + "flos": 22637019715200.0, + "grad_norm": 2.3268297781375176, + "language_loss": 0.78117442, + "learning_rate": 2.770356507494851e-06, + "loss": 0.80296087, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.14465332, + "step": 6535, + "time_per_iteration": 4.0697548389434814 + }, + { + "auxiliary_loss_clip": 0.0113091, + "auxiliary_loss_mlp": 0.0103111, + "balance_loss_clip": 1.05008507, + "balance_loss_mlp": 1.01830053, + "epoch": 0.3929655794378476, + "flos": 32030137873440.0, + "grad_norm": 3.0666535315996426, + "language_loss": 0.68606114, + "learning_rate": 2.769997081218978e-06, + "loss": 0.7076813, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.12817383, + "step": 6536, + "time_per_iteration": 2.9190592765808105 + }, + { + "auxiliary_loss_clip": 0.01129926, + "auxiliary_loss_mlp": 0.01039966, + "balance_loss_clip": 1.05008817, + "balance_loss_mlp": 1.02703202, + "epoch": 0.39302570269051557, + "flos": 35725678106400.0, + "grad_norm": 1.7854640949684346, + "language_loss": 0.69369781, + "learning_rate": 2.769637625744738e-06, + "loss": 0.7153967, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.1295166, + "step": 6537, + "time_per_iteration": 2.7762985229492188 + }, + { + "auxiliary_loss_clip": 0.0113548, + "auxiliary_loss_mlp": 0.01038355, + "balance_loss_clip": 1.05171168, + "balance_loss_mlp": 1.02495003, + "epoch": 0.39308582594318353, + "flos": 21167140808160.0, + "grad_norm": 1.8688557192392914, + "language_loss": 0.79087591, + "learning_rate": 2.769278141085763e-06, + "loss": 0.81261432, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13415527, + "step": 6538, + "time_per_iteration": 2.679412364959717 + }, + { + "auxiliary_loss_clip": 0.01047153, + "auxiliary_loss_mlp": 0.01009887, + "balance_loss_clip": 1.02116442, + "balance_loss_mlp": 1.00843465, + "epoch": 0.3931459491958515, + "flos": 74441916345600.0, + "grad_norm": 0.8067203974704543, + "language_loss": 0.61906749, + "learning_rate": 2.768918627255683e-06, + "loss": 0.63963789, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.26000977, + "router_z_loss_mlp": 0.01451111, + "step": 6539, + "time_per_iteration": 3.1259639263153076 + }, + { + "auxiliary_loss_clip": 0.01131497, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.04882812, + "balance_loss_mlp": 1.01926875, + "epoch": 0.39320607244851946, + "flos": 47609159395680.0, + "grad_norm": 1.977604927186294, + "language_loss": 0.67924124, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.70088685, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13812256, + "step": 6540, + "time_per_iteration": 2.9273362159729004 + }, + { + "auxiliary_loss_clip": 0.01131316, + "auxiliary_loss_mlp": 0.01035553, + "balance_loss_clip": 1.04971194, + "balance_loss_mlp": 1.02233839, + "epoch": 0.3932661957011874, + "flos": 30114713424960.0, + "grad_norm": 1.851375278256756, + "language_loss": 0.7247138, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.74638247, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13226318, + "step": 6541, + "time_per_iteration": 2.770862102508545 + }, + { + "auxiliary_loss_clip": 0.0104566, + "auxiliary_loss_mlp": 0.01011964, + "balance_loss_clip": 1.01961005, + "balance_loss_mlp": 1.01055038, + "epoch": 0.3933263189538554, + "flos": 85533156014400.0, + "grad_norm": 0.8334258768841091, + "language_loss": 0.60317314, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62374938, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.26098633, + "router_z_loss_mlp": 0.01412201, + "step": 6542, + "time_per_iteration": 3.170410394668579 + }, + { + "auxiliary_loss_clip": 0.01132953, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.05026531, + "balance_loss_mlp": 1.02020788, + "epoch": 0.39338644220652336, + "flos": 27978339483360.0, + "grad_norm": 1.5404374626196815, + "language_loss": 0.82259393, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84425581, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13024902, + "step": 6543, + "time_per_iteration": 2.686765432357788 + }, + { + "auxiliary_loss_clip": 0.0112904, + "auxiliary_loss_mlp": 0.01034616, + "balance_loss_clip": 1.04724789, + "balance_loss_mlp": 1.02075183, + "epoch": 0.3934465654591913, + "flos": 37641669796800.0, + "grad_norm": 1.6103397073282988, + "language_loss": 0.68911874, + "learning_rate": 2.767120621015908e-06, + "loss": 0.71075523, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13861084, + "step": 6544, + "time_per_iteration": 2.794351577758789 + }, + { + "auxiliary_loss_clip": 0.01135234, + "auxiliary_loss_mlp": 0.01041978, + "balance_loss_clip": 1.05007184, + "balance_loss_mlp": 1.02735138, + "epoch": 0.3935066887118593, + "flos": 35675605098720.0, + "grad_norm": 2.0363503203158713, + "language_loss": 0.75005287, + "learning_rate": 2.76676093244553e-06, + "loss": 0.77182496, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.1463623, + "step": 6545, + "time_per_iteration": 2.723605155944824 + }, + { + "auxiliary_loss_clip": 0.01129082, + "auxiliary_loss_mlp": 0.01035432, + "balance_loss_clip": 1.05089211, + "balance_loss_mlp": 1.02370226, + "epoch": 0.3935668119645273, + "flos": 23348806718400.0, + "grad_norm": 1.4281849637164423, + "language_loss": 0.74570638, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.76735151, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.11730957, + "step": 6546, + "time_per_iteration": 2.7203402519226074 + }, + { + "auxiliary_loss_clip": 0.01135594, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.04818201, + "balance_loss_mlp": 1.02205074, + "epoch": 0.3936269352171953, + "flos": 22593145851360.0, + "grad_norm": 1.7554395326733108, + "language_loss": 0.81481886, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.83653438, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.13916016, + "step": 6547, + "time_per_iteration": 2.663872480392456 + }, + { + "auxiliary_loss_clip": 0.01130606, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.0477767, + "balance_loss_mlp": 1.01601505, + "epoch": 0.39368705846986324, + "flos": 19074802799520.0, + "grad_norm": 1.7917761971317454, + "language_loss": 0.8428601, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.86445826, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13201904, + "step": 6548, + "time_per_iteration": 2.680551528930664 + }, + { + "auxiliary_loss_clip": 0.0112958, + "auxiliary_loss_mlp": 0.01028473, + "balance_loss_clip": 1.0492177, + "balance_loss_mlp": 1.01614714, + "epoch": 0.3937471817225312, + "flos": 26021755828800.0, + "grad_norm": 1.7626975594067895, + "language_loss": 0.73107708, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.75265765, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12322998, + "step": 6549, + "time_per_iteration": 2.664431571960449 + }, + { + "auxiliary_loss_clip": 0.01133833, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.05079389, + "balance_loss_mlp": 1.02223074, + "epoch": 0.39380730497519917, + "flos": 25351816825440.0, + "grad_norm": 1.6762772039188059, + "language_loss": 0.77832472, + "learning_rate": 2.764962053731699e-06, + "loss": 0.80002201, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13647461, + "step": 6550, + "time_per_iteration": 2.71685791015625 + }, + { + "auxiliary_loss_clip": 0.01129221, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.04763889, + "balance_loss_mlp": 1.01770175, + "epoch": 0.39386742822786713, + "flos": 26368978632480.0, + "grad_norm": 1.6683684090660764, + "language_loss": 0.81130892, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83290809, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.12988281, + "step": 6551, + "time_per_iteration": 2.746206283569336 + }, + { + "auxiliary_loss_clip": 0.01131763, + "auxiliary_loss_mlp": 0.01037267, + "balance_loss_clip": 1.04845273, + "balance_loss_mlp": 1.02321827, + "epoch": 0.3939275514805351, + "flos": 15148426857120.0, + "grad_norm": 2.1825103475785967, + "language_loss": 0.80448949, + "learning_rate": 2.764242299098596e-06, + "loss": 0.8261798, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.14068604, + "step": 6552, + "time_per_iteration": 2.6947433948516846 + }, + { + "auxiliary_loss_clip": 0.01135021, + "auxiliary_loss_mlp": 0.01039856, + "balance_loss_clip": 1.05075097, + "balance_loss_mlp": 1.02684426, + "epoch": 0.39398767473320306, + "flos": 22637019715200.0, + "grad_norm": 1.7509672175111624, + "language_loss": 0.70824254, + "learning_rate": 2.763882378305003e-06, + "loss": 0.72999132, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.13024902, + "step": 6553, + "time_per_iteration": 2.6839418411254883 + }, + { + "auxiliary_loss_clip": 0.01131041, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.04923499, + "balance_loss_mlp": 1.02306843, + "epoch": 0.39404779798587103, + "flos": 35763960585600.0, + "grad_norm": 1.9226838509949378, + "language_loss": 0.64365298, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.66532975, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13555908, + "step": 6554, + "time_per_iteration": 2.707754135131836 + }, + { + "auxiliary_loss_clip": 0.01130613, + "auxiliary_loss_mlp": 0.01035891, + "balance_loss_clip": 1.04936504, + "balance_loss_mlp": 1.02352262, + "epoch": 0.394107921238539, + "flos": 42582533405760.0, + "grad_norm": 2.054215716227953, + "language_loss": 0.79717779, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.81884277, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.12371826, + "step": 6555, + "time_per_iteration": 2.8981125354766846 + }, + { + "auxiliary_loss_clip": 0.01135078, + "auxiliary_loss_mlp": 0.01034422, + "balance_loss_clip": 1.05100775, + "balance_loss_mlp": 1.0208087, + "epoch": 0.39416804449120696, + "flos": 30604821624000.0, + "grad_norm": 1.7434726906141482, + "language_loss": 0.7193253, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.74102026, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.13616943, + "step": 6556, + "time_per_iteration": 2.6610047817230225 + }, + { + "auxiliary_loss_clip": 0.01131011, + "auxiliary_loss_mlp": 0.01034202, + "balance_loss_clip": 1.04749727, + "balance_loss_mlp": 1.02066541, + "epoch": 0.3942281677438749, + "flos": 39421501960320.0, + "grad_norm": 1.838427055667474, + "language_loss": 0.8333751, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.8550272, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13537598, + "step": 6557, + "time_per_iteration": 2.772584915161133 + }, + { + "auxiliary_loss_clip": 0.01130843, + "auxiliary_loss_mlp": 0.01032455, + "balance_loss_clip": 1.04886115, + "balance_loss_mlp": 1.01910961, + "epoch": 0.3942882909965429, + "flos": 30428029615680.0, + "grad_norm": 2.0879202410690234, + "language_loss": 0.79887497, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.820508, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13342285, + "step": 6558, + "time_per_iteration": 2.66021728515625 + }, + { + "auxiliary_loss_clip": 0.01129882, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.04935062, + "balance_loss_mlp": 1.01972353, + "epoch": 0.39434841424921085, + "flos": 14488981829280.0, + "grad_norm": 1.6788490375857132, + "language_loss": 0.7074908, + "learning_rate": 2.761722245724792e-06, + "loss": 0.72911096, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.12408447, + "step": 6559, + "time_per_iteration": 2.6582729816436768 + }, + { + "auxiliary_loss_clip": 0.0113552, + "auxiliary_loss_mlp": 0.01039921, + "balance_loss_clip": 1.0484879, + "balance_loss_mlp": 1.02459109, + "epoch": 0.3944085375018789, + "flos": 19964232673920.0, + "grad_norm": 2.0513207753314875, + "language_loss": 0.80241293, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.82416737, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 0.87109375, + "router_z_loss_mlp": 0.15332031, + "step": 6560, + "time_per_iteration": 2.594978094100952 + }, + { + "auxiliary_loss_clip": 0.01135011, + "auxiliary_loss_mlp": 0.01038576, + "balance_loss_clip": 1.05072355, + "balance_loss_mlp": 1.02432418, + "epoch": 0.39446866075454684, + "flos": 12974175609120.0, + "grad_norm": 2.170078656799036, + "language_loss": 0.83236825, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.8541041, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.14239502, + "step": 6561, + "time_per_iteration": 2.670396566390991 + }, + { + "auxiliary_loss_clip": 0.01131715, + "auxiliary_loss_mlp": 0.01042944, + "balance_loss_clip": 1.04831803, + "balance_loss_mlp": 1.02966452, + "epoch": 0.3945287840072148, + "flos": 22192162967520.0, + "grad_norm": 2.2721229752641334, + "language_loss": 0.80138695, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.82313347, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.1328125, + "step": 6562, + "time_per_iteration": 2.6742446422576904 + }, + { + "auxiliary_loss_clip": 0.01129149, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.04932082, + "balance_loss_mlp": 1.01957917, + "epoch": 0.39458890725988277, + "flos": 28113445560960.0, + "grad_norm": 1.8420094303844703, + "language_loss": 0.81276011, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83437794, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.13061523, + "step": 6563, + "time_per_iteration": 2.710148572921753 + }, + { + "auxiliary_loss_clip": 0.01132485, + "auxiliary_loss_mlp": 0.01034818, + "balance_loss_clip": 1.05011129, + "balance_loss_mlp": 1.02101409, + "epoch": 0.39464903051255074, + "flos": 20937804238080.0, + "grad_norm": 2.350627410910342, + "language_loss": 0.69884509, + "learning_rate": 2.759921340790127e-06, + "loss": 0.72051811, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13806152, + "step": 6564, + "time_per_iteration": 2.7532174587249756 + }, + { + "auxiliary_loss_clip": 0.01133605, + "auxiliary_loss_mlp": 0.01039752, + "balance_loss_clip": 1.0496453, + "balance_loss_mlp": 1.02645397, + "epoch": 0.3947091537652187, + "flos": 19387349161920.0, + "grad_norm": 2.2619492213251964, + "language_loss": 0.83410656, + "learning_rate": 2.759561073299676e-06, + "loss": 0.85584009, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13299561, + "step": 6565, + "time_per_iteration": 4.102544546127319 + }, + { + "auxiliary_loss_clip": 0.01132703, + "auxiliary_loss_mlp": 0.0103483, + "balance_loss_clip": 1.04979789, + "balance_loss_mlp": 1.02181816, + "epoch": 0.39476927701788667, + "flos": 22629645570240.0, + "grad_norm": 1.7697174100064483, + "language_loss": 0.83245122, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.85412657, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.13012695, + "step": 6566, + "time_per_iteration": 4.0904295444488525 + }, + { + "auxiliary_loss_clip": 0.01139668, + "auxiliary_loss_mlp": 0.0103602, + "balance_loss_clip": 1.05165315, + "balance_loss_mlp": 1.02223325, + "epoch": 0.39482940027055463, + "flos": 27180749581920.0, + "grad_norm": 1.9090687925057956, + "language_loss": 0.77716786, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.79892474, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 0.87988281, + "router_z_loss_mlp": 0.13806152, + "step": 6567, + "time_per_iteration": 2.7264459133148193 + }, + { + "auxiliary_loss_clip": 0.01128002, + "auxiliary_loss_mlp": 0.01043393, + "balance_loss_clip": 1.04930043, + "balance_loss_mlp": 1.02999353, + "epoch": 0.3948895235232226, + "flos": 18006798156480.0, + "grad_norm": 2.380435560169668, + "language_loss": 0.80969489, + "learning_rate": 2.758480098067182e-06, + "loss": 0.83140886, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13391113, + "step": 6568, + "time_per_iteration": 2.6285605430603027 + }, + { + "auxiliary_loss_clip": 0.01130602, + "auxiliary_loss_mlp": 0.01037653, + "balance_loss_clip": 1.04845214, + "balance_loss_mlp": 1.0244689, + "epoch": 0.39494964677589056, + "flos": 27534860323200.0, + "grad_norm": 1.7741935048224293, + "language_loss": 0.84702778, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.8687104, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13195801, + "step": 6569, + "time_per_iteration": 2.7153046131134033 + }, + { + "auxiliary_loss_clip": 0.01135846, + "auxiliary_loss_mlp": 0.01036853, + "balance_loss_clip": 1.05372417, + "balance_loss_mlp": 1.02358532, + "epoch": 0.3950097700285585, + "flos": 28020187483200.0, + "grad_norm": 3.0710059323149697, + "language_loss": 0.74557781, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.76730478, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13262939, + "step": 6570, + "time_per_iteration": 2.676090955734253 + }, + { + "auxiliary_loss_clip": 0.0113504, + "auxiliary_loss_mlp": 0.01039161, + "balance_loss_clip": 1.05061603, + "balance_loss_mlp": 1.02577353, + "epoch": 0.3950698932812265, + "flos": 25130300090400.0, + "grad_norm": 1.5584570559119568, + "language_loss": 0.79547495, + "learning_rate": 2.757398863979922e-06, + "loss": 0.81721687, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.1340332, + "step": 6571, + "time_per_iteration": 4.12442421913147 + }, + { + "auxiliary_loss_clip": 0.01134735, + "auxiliary_loss_mlp": 0.01046422, + "balance_loss_clip": 1.05180311, + "balance_loss_mlp": 1.0328027, + "epoch": 0.39513001653389446, + "flos": 24862559489280.0, + "grad_norm": 1.823629689063114, + "language_loss": 0.78331298, + "learning_rate": 2.757038395157997e-06, + "loss": 0.80512452, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13616943, + "step": 6572, + "time_per_iteration": 2.757652521133423 + }, + { + "auxiliary_loss_clip": 0.01136512, + "auxiliary_loss_mlp": 0.01038353, + "balance_loss_clip": 1.05043077, + "balance_loss_mlp": 1.02402985, + "epoch": 0.3951901397865625, + "flos": 32291922434400.0, + "grad_norm": 1.9658441660688428, + "language_loss": 0.74663305, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.76838171, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.14318848, + "step": 6573, + "time_per_iteration": 2.6765453815460205 + }, + { + "auxiliary_loss_clip": 0.01132031, + "auxiliary_loss_mlp": 0.01035419, + "balance_loss_clip": 1.04987216, + "balance_loss_mlp": 1.02342093, + "epoch": 0.39525026303923044, + "flos": 53493172441920.0, + "grad_norm": 1.6120979081196951, + "language_loss": 0.67857027, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70024478, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.12011719, + "step": 6574, + "time_per_iteration": 2.9364256858825684 + }, + { + "auxiliary_loss_clip": 0.01136021, + "auxiliary_loss_mlp": 0.01036517, + "balance_loss_clip": 1.05089307, + "balance_loss_mlp": 1.02271235, + "epoch": 0.3953103862918984, + "flos": 22013223543360.0, + "grad_norm": 2.342629778960893, + "language_loss": 0.72090048, + "learning_rate": 2.755956816505072e-06, + "loss": 0.74262589, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.13812256, + "step": 6575, + "time_per_iteration": 4.043827772140503 + }, + { + "auxiliary_loss_clip": 0.01135282, + "auxiliary_loss_mlp": 0.01042243, + "balance_loss_clip": 1.0492332, + "balance_loss_mlp": 1.02818871, + "epoch": 0.3953705095445664, + "flos": 20714504742720.0, + "grad_norm": 2.73213977870154, + "language_loss": 0.73233628, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.75411153, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.14056396, + "step": 6576, + "time_per_iteration": 2.608184576034546 + }, + { + "auxiliary_loss_clip": 0.01132924, + "auxiliary_loss_mlp": 0.01040879, + "balance_loss_clip": 1.048841, + "balance_loss_mlp": 1.02829647, + "epoch": 0.39543063279723434, + "flos": 21245731630560.0, + "grad_norm": 11.356823147274142, + "language_loss": 0.83794886, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.85968691, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.12573242, + "step": 6577, + "time_per_iteration": 2.64862322807312 + }, + { + "auxiliary_loss_clip": 0.01134556, + "auxiliary_loss_mlp": 0.01038922, + "balance_loss_clip": 1.05257308, + "balance_loss_mlp": 1.02554095, + "epoch": 0.3954907560499023, + "flos": 27801466440480.0, + "grad_norm": 2.7378480979338904, + "language_loss": 0.89916486, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92089969, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13391113, + "step": 6578, + "time_per_iteration": 2.6611790657043457 + }, + { + "auxiliary_loss_clip": 0.01137449, + "auxiliary_loss_mlp": 0.01037576, + "balance_loss_clip": 1.0508846, + "balance_loss_mlp": 1.02271092, + "epoch": 0.39555087930257027, + "flos": 26776727902080.0, + "grad_norm": 3.1120470961473092, + "language_loss": 0.78318071, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.80493098, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 0.86621094, + "router_z_loss_mlp": 0.14880371, + "step": 6579, + "time_per_iteration": 2.7341694831848145 + }, + { + "auxiliary_loss_clip": 0.01137475, + "auxiliary_loss_mlp": 0.01035145, + "balance_loss_clip": 1.05114114, + "balance_loss_mlp": 1.0208993, + "epoch": 0.39561100255523823, + "flos": 24898451448960.0, + "grad_norm": 2.258762986405457, + "language_loss": 0.68835366, + "learning_rate": 2.754153612280037e-06, + "loss": 0.71007991, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.14245605, + "step": 6580, + "time_per_iteration": 2.622286319732666 + }, + { + "auxiliary_loss_clip": 0.01132911, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.0511651, + "balance_loss_mlp": 1.01845944, + "epoch": 0.3956711258079062, + "flos": 34123002606720.0, + "grad_norm": 5.90385036809347, + "language_loss": 0.58382839, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.60547066, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.12860107, + "step": 6581, + "time_per_iteration": 2.7524046897888184 + }, + { + "auxiliary_loss_clip": 0.01137128, + "auxiliary_loss_mlp": 0.01039991, + "balance_loss_clip": 1.05225945, + "balance_loss_mlp": 1.02605557, + "epoch": 0.39573124906057416, + "flos": 17605004927040.0, + "grad_norm": 1.8540249452570006, + "language_loss": 0.69864136, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.72041255, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.13934326, + "step": 6582, + "time_per_iteration": 2.6191232204437256 + }, + { + "auxiliary_loss_clip": 0.01137259, + "auxiliary_loss_mlp": 0.01034756, + "balance_loss_clip": 1.05245435, + "balance_loss_mlp": 1.02120209, + "epoch": 0.39579137231324213, + "flos": 22858779553920.0, + "grad_norm": 2.227038006034876, + "language_loss": 0.76288724, + "learning_rate": 2.753071346464642e-06, + "loss": 0.78460741, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.13562012, + "step": 6583, + "time_per_iteration": 2.729074478149414 + }, + { + "auxiliary_loss_clip": 0.0113488, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.05082679, + "balance_loss_mlp": 1.01916122, + "epoch": 0.3958514955659101, + "flos": 21568528864800.0, + "grad_norm": 1.8332017260838895, + "language_loss": 0.66085595, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.68252921, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.13311768, + "step": 6584, + "time_per_iteration": 2.623953342437744 + }, + { + "auxiliary_loss_clip": 0.01136549, + "auxiliary_loss_mlp": 0.01042384, + "balance_loss_clip": 1.04961026, + "balance_loss_mlp": 1.02743506, + "epoch": 0.39591161881857806, + "flos": 35764203689280.0, + "grad_norm": 4.0662448461518785, + "language_loss": 0.72781765, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.74960697, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 0.87011719, + "router_z_loss_mlp": 0.14941406, + "step": 6585, + "time_per_iteration": 2.7730305194854736 + }, + { + "auxiliary_loss_clip": 0.01133559, + "auxiliary_loss_mlp": 0.01034146, + "balance_loss_clip": 1.04920566, + "balance_loss_mlp": 1.02084184, + "epoch": 0.3959717420712461, + "flos": 31447581942240.0, + "grad_norm": 1.8759090497454591, + "language_loss": 0.73396552, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75564253, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.13317871, + "step": 6586, + "time_per_iteration": 2.780458688735962 + }, + { + "auxiliary_loss_clip": 0.01134102, + "auxiliary_loss_mlp": 0.01031759, + "balance_loss_clip": 1.05133426, + "balance_loss_mlp": 1.01793027, + "epoch": 0.39603186532391405, + "flos": 25531201939680.0, + "grad_norm": 1.6339511597762577, + "language_loss": 0.71460581, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73626435, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13824463, + "step": 6587, + "time_per_iteration": 2.7323954105377197 + }, + { + "auxiliary_loss_clip": 0.01050914, + "auxiliary_loss_mlp": 0.01002881, + "balance_loss_clip": 1.0241276, + "balance_loss_mlp": 1.00133169, + "epoch": 0.396091988576582, + "flos": 66966045913440.0, + "grad_norm": 0.9109357400843349, + "language_loss": 0.61181366, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63235164, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.26806641, + "router_z_loss_mlp": 0.0154953, + "step": 6588, + "time_per_iteration": 3.111246347427368 + }, + { + "auxiliary_loss_clip": 0.01135857, + "auxiliary_loss_mlp": 0.01038558, + "balance_loss_clip": 1.05091143, + "balance_loss_mlp": 1.02432477, + "epoch": 0.39615211182925, + "flos": 25261840647360.0, + "grad_norm": 2.009381234562714, + "language_loss": 0.81575495, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.83749908, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.14245605, + "step": 6589, + "time_per_iteration": 2.6508400440216064 + }, + { + "auxiliary_loss_clip": 0.01133097, + "auxiliary_loss_mlp": 0.01032905, + "balance_loss_clip": 1.04877245, + "balance_loss_mlp": 1.01836109, + "epoch": 0.39621223508191794, + "flos": 25617491045280.0, + "grad_norm": 1.8970371042633907, + "language_loss": 0.70366693, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.7253269, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.14550781, + "step": 6590, + "time_per_iteration": 2.7223751544952393 + }, + { + "auxiliary_loss_clip": 0.01134886, + "auxiliary_loss_mlp": 0.01042883, + "balance_loss_clip": 1.0506556, + "balance_loss_mlp": 1.02898335, + "epoch": 0.3962723583345859, + "flos": 28514793100320.0, + "grad_norm": 2.3891527999135147, + "language_loss": 0.75911504, + "learning_rate": 2.750184048805956e-06, + "loss": 0.78089273, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.13909912, + "step": 6591, + "time_per_iteration": 2.678605318069458 + }, + { + "auxiliary_loss_clip": 0.01134948, + "auxiliary_loss_mlp": 0.0104277, + "balance_loss_clip": 1.05000091, + "balance_loss_mlp": 1.02903676, + "epoch": 0.39633248158725387, + "flos": 30646750658400.0, + "grad_norm": 6.299850819592366, + "language_loss": 0.78011495, + "learning_rate": 2.749823008443152e-06, + "loss": 0.80189216, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.13726807, + "step": 6592, + "time_per_iteration": 2.700850486755371 + }, + { + "auxiliary_loss_clip": 0.01129321, + "auxiliary_loss_mlp": 0.0103303, + "balance_loss_clip": 1.04848552, + "balance_loss_mlp": 1.01933336, + "epoch": 0.39639260483992184, + "flos": 48550242451680.0, + "grad_norm": 1.699987874084373, + "language_loss": 0.69007599, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71169949, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.137146, + "step": 6593, + "time_per_iteration": 2.823471784591675 + }, + { + "auxiliary_loss_clip": 0.01134647, + "auxiliary_loss_mlp": 0.01041669, + "balance_loss_clip": 1.04832554, + "balance_loss_mlp": 1.0268271, + "epoch": 0.3964527280925898, + "flos": 21166695118080.0, + "grad_norm": 6.662180247696253, + "language_loss": 0.78015226, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.80191541, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 0.86328125, + "router_z_loss_mlp": 0.14831543, + "step": 6594, + "time_per_iteration": 2.6982784271240234 + }, + { + "auxiliary_loss_clip": 0.01044249, + "auxiliary_loss_mlp": 0.01001969, + "balance_loss_clip": 1.01781559, + "balance_loss_mlp": 1.00039387, + "epoch": 0.39651285134525777, + "flos": 87510119860800.0, + "grad_norm": 0.9499080152468681, + "language_loss": 0.62982684, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65028906, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.26489258, + "router_z_loss_mlp": 0.0157547, + "step": 6595, + "time_per_iteration": 3.3113958835601807 + }, + { + "auxiliary_loss_clip": 0.01137763, + "auxiliary_loss_mlp": 0.01044901, + "balance_loss_clip": 1.05099118, + "balance_loss_mlp": 1.03033352, + "epoch": 0.39657297459792573, + "flos": 31451957808480.0, + "grad_norm": 1.897697473467875, + "language_loss": 0.63603449, + "learning_rate": 2.748378562795223e-06, + "loss": 0.65786117, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.14562988, + "step": 6596, + "time_per_iteration": 2.68017315864563 + }, + { + "auxiliary_loss_clip": 0.0112949, + "auxiliary_loss_mlp": 0.01036393, + "balance_loss_clip": 1.04810166, + "balance_loss_mlp": 1.02271938, + "epoch": 0.3966330978505937, + "flos": 24728749964640.0, + "grad_norm": 2.0505964651719104, + "language_loss": 0.78590703, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.80756581, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13671875, + "step": 6597, + "time_per_iteration": 2.6633999347686768 + }, + { + "auxiliary_loss_clip": 0.01136758, + "auxiliary_loss_mlp": 0.01040924, + "balance_loss_clip": 1.05014181, + "balance_loss_mlp": 1.02596283, + "epoch": 0.39669322110326166, + "flos": 25174214471520.0, + "grad_norm": 2.093186077211768, + "language_loss": 0.67601395, + "learning_rate": 2.747656169644941e-06, + "loss": 0.69779074, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.1494751, + "step": 6598, + "time_per_iteration": 2.670128107070923 + }, + { + "auxiliary_loss_clip": 0.01130806, + "auxiliary_loss_mlp": 0.01043178, + "balance_loss_clip": 1.04716778, + "balance_loss_mlp": 1.03012443, + "epoch": 0.3967533443559297, + "flos": 26510243336640.0, + "grad_norm": 2.176356609999161, + "language_loss": 0.78938067, + "learning_rate": 2.747294930536157e-06, + "loss": 0.81112051, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13061523, + "step": 6599, + "time_per_iteration": 2.7114882469177246 + }, + { + "auxiliary_loss_clip": 0.01133978, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.05015504, + "balance_loss_mlp": 1.02021801, + "epoch": 0.39681346760859765, + "flos": 31096145341440.0, + "grad_norm": 2.5281112816611633, + "language_loss": 0.72890794, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.7506063, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.15649414, + "step": 6600, + "time_per_iteration": 2.6821205615997314 + }, + { + "auxiliary_loss_clip": 0.01128375, + "auxiliary_loss_mlp": 0.01032766, + "balance_loss_clip": 1.04560709, + "balance_loss_mlp": 1.01909852, + "epoch": 0.3968735908612656, + "flos": 25574954251680.0, + "grad_norm": 2.203565119264999, + "language_loss": 0.86205173, + "learning_rate": 2.746572367319791e-06, + "loss": 0.88366318, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13659668, + "step": 6601, + "time_per_iteration": 2.7385027408599854 + }, + { + "auxiliary_loss_clip": 0.01138995, + "auxiliary_loss_mlp": 0.01044415, + "balance_loss_clip": 1.0493418, + "balance_loss_mlp": 1.02829242, + "epoch": 0.3969337141139336, + "flos": 13064070752640.0, + "grad_norm": 2.6595107538035823, + "language_loss": 0.70221472, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.72404879, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 0.89697266, + "router_z_loss_mlp": 0.16143799, + "step": 6602, + "time_per_iteration": 2.6140944957733154 + }, + { + "auxiliary_loss_clip": 0.01131728, + "auxiliary_loss_mlp": 0.01043639, + "balance_loss_clip": 1.04655755, + "balance_loss_mlp": 1.02968538, + "epoch": 0.39699383736660154, + "flos": 21468504401280.0, + "grad_norm": 2.3381592411497993, + "language_loss": 0.83550948, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85726315, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.1395874, + "step": 6603, + "time_per_iteration": 2.6447336673736572 + }, + { + "auxiliary_loss_clip": 0.01126652, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.04455626, + "balance_loss_mlp": 1.02129602, + "epoch": 0.3970539606192695, + "flos": 21702176320320.0, + "grad_norm": 5.767487410483213, + "language_loss": 0.73032022, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.75193751, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13793945, + "step": 6604, + "time_per_iteration": 4.081082344055176 + }, + { + "auxiliary_loss_clip": 0.01127617, + "auxiliary_loss_mlp": 0.01038147, + "balance_loss_clip": 1.04762125, + "balance_loss_mlp": 1.02413404, + "epoch": 0.3971140838719375, + "flos": 30248603984160.0, + "grad_norm": 1.5724873823139265, + "language_loss": 0.82717848, + "learning_rate": 2.745126901275491e-06, + "loss": 0.84883618, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.14013672, + "step": 6605, + "time_per_iteration": 3.995631456375122 + }, + { + "auxiliary_loss_clip": 0.01128051, + "auxiliary_loss_mlp": 0.01029647, + "balance_loss_clip": 1.04684544, + "balance_loss_mlp": 1.01736283, + "epoch": 0.39717420712460544, + "flos": 29582311536000.0, + "grad_norm": 1.5908737061880935, + "language_loss": 0.73795283, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.75952983, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.12286377, + "step": 6606, + "time_per_iteration": 2.6743855476379395 + }, + { + "auxiliary_loss_clip": 0.01133974, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.0487783, + "balance_loss_mlp": 1.02131045, + "epoch": 0.3972343303772734, + "flos": 31584997504800.0, + "grad_norm": 1.7227301042141132, + "language_loss": 0.73980469, + "learning_rate": 2.744403998666805e-06, + "loss": 0.76149577, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.13824463, + "step": 6607, + "time_per_iteration": 2.744067430496216 + }, + { + "auxiliary_loss_clip": 0.01136478, + "auxiliary_loss_mlp": 0.01036338, + "balance_loss_clip": 1.05094123, + "balance_loss_mlp": 1.0225811, + "epoch": 0.39729445362994137, + "flos": 55672042659840.0, + "grad_norm": 1.7413026570315921, + "language_loss": 0.67784971, + "learning_rate": 2.744042505013797e-06, + "loss": 0.69957787, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.13775635, + "step": 6608, + "time_per_iteration": 2.8557920455932617 + }, + { + "auxiliary_loss_clip": 0.0113477, + "auxiliary_loss_mlp": 0.01045803, + "balance_loss_clip": 1.04864645, + "balance_loss_mlp": 1.03065705, + "epoch": 0.39735457688260933, + "flos": 24640637581440.0, + "grad_norm": 1.7876926501791508, + "language_loss": 0.74166459, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.76347029, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 0.86132812, + "router_z_loss_mlp": 0.15142822, + "step": 6609, + "time_per_iteration": 2.721660852432251 + }, + { + "auxiliary_loss_clip": 0.01131276, + "auxiliary_loss_mlp": 0.01033786, + "balance_loss_clip": 1.0473299, + "balance_loss_mlp": 1.01987433, + "epoch": 0.3974147001352773, + "flos": 28468933889760.0, + "grad_norm": 1.6927282874388203, + "language_loss": 0.71552539, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.737176, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.13916016, + "step": 6610, + "time_per_iteration": 2.6634583473205566 + }, + { + "auxiliary_loss_clip": 0.01124479, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.04422259, + "balance_loss_mlp": 1.01788294, + "epoch": 0.39747482338794526, + "flos": 26464667747040.0, + "grad_norm": 1.7407863009444193, + "language_loss": 0.79156238, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.81312078, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13464355, + "step": 6611, + "time_per_iteration": 4.1365978717803955 + }, + { + "auxiliary_loss_clip": 0.01130216, + "auxiliary_loss_mlp": 0.01039042, + "balance_loss_clip": 1.04759824, + "balance_loss_mlp": 1.0251298, + "epoch": 0.3975349466406133, + "flos": 37813680766080.0, + "grad_norm": 2.0445644261316853, + "language_loss": 0.7904731, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.81216568, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.13922119, + "step": 6612, + "time_per_iteration": 2.765576124191284 + }, + { + "auxiliary_loss_clip": 0.01041922, + "auxiliary_loss_mlp": 0.01001146, + "balance_loss_clip": 1.01564693, + "balance_loss_mlp": 0.99958205, + "epoch": 0.39759506989328125, + "flos": 77707186051680.0, + "grad_norm": 0.8459505325123808, + "language_loss": 0.64995944, + "learning_rate": 2.742234613810459e-06, + "loss": 0.67039013, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.26220703, + "router_z_loss_mlp": 0.01564789, + "step": 6613, + "time_per_iteration": 3.1708364486694336 + }, + { + "auxiliary_loss_clip": 0.0113011, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.04671061, + "balance_loss_mlp": 1.01875138, + "epoch": 0.3976551931459492, + "flos": 28913952706560.0, + "grad_norm": 3.739856159361873, + "language_loss": 0.71719831, + "learning_rate": 2.741872951078109e-06, + "loss": 0.73882508, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.13800049, + "step": 6614, + "time_per_iteration": 2.720494031906128 + }, + { + "auxiliary_loss_clip": 0.01129611, + "auxiliary_loss_mlp": 0.01033811, + "balance_loss_clip": 1.04633498, + "balance_loss_mlp": 1.01998234, + "epoch": 0.3977153163986172, + "flos": 19119730112640.0, + "grad_norm": 2.705980838482282, + "language_loss": 0.81374794, + "learning_rate": 2.741511260213862e-06, + "loss": 0.8353821, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.13842773, + "step": 6615, + "time_per_iteration": 3.976980686187744 + }, + { + "auxiliary_loss_clip": 0.01129877, + "auxiliary_loss_mlp": 0.01033643, + "balance_loss_clip": 1.04712725, + "balance_loss_mlp": 1.02075672, + "epoch": 0.39777543965128515, + "flos": 17161201628640.0, + "grad_norm": 2.157568600922542, + "language_loss": 0.67465997, + "learning_rate": 2.741149541231434e-06, + "loss": 0.69629514, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.12896729, + "step": 6616, + "time_per_iteration": 2.631422281265259 + }, + { + "auxiliary_loss_clip": 0.01132295, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.04713392, + "balance_loss_mlp": 1.02370214, + "epoch": 0.3978355629039531, + "flos": 28513253443680.0, + "grad_norm": 2.384387278989741, + "language_loss": 0.83475494, + "learning_rate": 2.740787794144541e-06, + "loss": 0.85645401, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.13916016, + "step": 6617, + "time_per_iteration": 2.6765732765197754 + }, + { + "auxiliary_loss_clip": 0.01125468, + "auxiliary_loss_mlp": 0.01039023, + "balance_loss_clip": 1.04668617, + "balance_loss_mlp": 1.02681565, + "epoch": 0.3978956861566211, + "flos": 23260289162400.0, + "grad_norm": 1.8573732536939174, + "language_loss": 0.7254622, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74710703, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12207031, + "step": 6618, + "time_per_iteration": 2.6519622802734375 + }, + { + "auxiliary_loss_clip": 0.01131693, + "auxiliary_loss_mlp": 0.01036761, + "balance_loss_clip": 1.04929149, + "balance_loss_mlp": 1.02173471, + "epoch": 0.39795580940928904, + "flos": 36884550307680.0, + "grad_norm": 1.6704151124348627, + "language_loss": 0.65456295, + "learning_rate": 2.740064215712231e-06, + "loss": 0.67624754, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.15032959, + "step": 6619, + "time_per_iteration": 2.806121587753296 + }, + { + "auxiliary_loss_clip": 0.01043248, + "auxiliary_loss_mlp": 0.01004082, + "balance_loss_clip": 1.01712155, + "balance_loss_mlp": 1.00250268, + "epoch": 0.398015932661957, + "flos": 85228348452480.0, + "grad_norm": 0.7663278494738497, + "language_loss": 0.58232296, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60279626, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.01580048, + "step": 6620, + "time_per_iteration": 3.2151029109954834 + }, + { + "auxiliary_loss_clip": 0.01131206, + "auxiliary_loss_mlp": 0.01035855, + "balance_loss_clip": 1.04952323, + "balance_loss_mlp": 1.02348673, + "epoch": 0.39807605591462497, + "flos": 24596439579360.0, + "grad_norm": 1.6597389407171341, + "language_loss": 0.78901625, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81068683, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.12371826, + "step": 6621, + "time_per_iteration": 2.695418119430542 + }, + { + "auxiliary_loss_clip": 0.01128099, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.04724813, + "balance_loss_mlp": 1.02128935, + "epoch": 0.39813617916729294, + "flos": 25797281332320.0, + "grad_norm": 5.240826741546326, + "language_loss": 0.78130019, + "learning_rate": 2.738978637623252e-06, + "loss": 0.80291992, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.12585449, + "step": 6622, + "time_per_iteration": 2.6874184608459473 + }, + { + "auxiliary_loss_clip": 0.01128286, + "auxiliary_loss_mlp": 0.0103793, + "balance_loss_clip": 1.04621458, + "balance_loss_mlp": 1.02404213, + "epoch": 0.3981963024199609, + "flos": 23169745742400.0, + "grad_norm": 1.6786480975489206, + "language_loss": 0.75440663, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77606881, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13897705, + "step": 6623, + "time_per_iteration": 2.671091079711914 + }, + { + "auxiliary_loss_clip": 0.01129342, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.04717946, + "balance_loss_mlp": 1.0257144, + "epoch": 0.39825642567262887, + "flos": 20225247406560.0, + "grad_norm": 1.7450739385677558, + "language_loss": 0.79464912, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.81633365, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.13421631, + "step": 6624, + "time_per_iteration": 2.6950109004974365 + }, + { + "auxiliary_loss_clip": 0.01135735, + "auxiliary_loss_mlp": 0.01044931, + "balance_loss_clip": 1.04926503, + "balance_loss_mlp": 1.02999377, + "epoch": 0.39831654892529683, + "flos": 27088423401600.0, + "grad_norm": 2.0475863133969163, + "language_loss": 0.83528459, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.85709125, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 0.86376953, + "router_z_loss_mlp": 0.14935303, + "step": 6625, + "time_per_iteration": 2.683717727661133 + }, + { + "auxiliary_loss_clip": 0.01125983, + "auxiliary_loss_mlp": 0.01040334, + "balance_loss_clip": 1.04528093, + "balance_loss_mlp": 1.02681017, + "epoch": 0.39837667217796485, + "flos": 12796694807040.0, + "grad_norm": 3.25774061754567, + "language_loss": 0.86349845, + "learning_rate": 2.737530807925321e-06, + "loss": 0.88516164, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13525391, + "step": 6626, + "time_per_iteration": 2.666775703430176 + }, + { + "auxiliary_loss_clip": 0.01129202, + "auxiliary_loss_mlp": 0.0103802, + "balance_loss_clip": 1.04755926, + "balance_loss_mlp": 1.02412033, + "epoch": 0.3984367954306328, + "flos": 21921302535840.0, + "grad_norm": 2.4414983561613504, + "language_loss": 0.83142722, + "learning_rate": 2.737168780548417e-06, + "loss": 0.85309935, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13903809, + "step": 6627, + "time_per_iteration": 2.6208901405334473 + }, + { + "auxiliary_loss_clip": 0.01125642, + "auxiliary_loss_mlp": 0.01039325, + "balance_loss_clip": 1.04545045, + "balance_loss_mlp": 1.02674794, + "epoch": 0.3984969186833008, + "flos": 27712219573440.0, + "grad_norm": 1.4969450727081322, + "language_loss": 0.82864046, + "learning_rate": 2.736806725217998e-06, + "loss": 0.85029006, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.12573242, + "step": 6628, + "time_per_iteration": 2.774991273880005 + }, + { + "auxiliary_loss_clip": 0.01130559, + "auxiliary_loss_mlp": 0.01045162, + "balance_loss_clip": 1.04771101, + "balance_loss_mlp": 1.0317626, + "epoch": 0.39855704193596875, + "flos": 28559841965280.0, + "grad_norm": 2.6256372997359514, + "language_loss": 0.71166992, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.73342717, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.13391113, + "step": 6629, + "time_per_iteration": 2.6801955699920654 + }, + { + "auxiliary_loss_clip": 0.01127555, + "auxiliary_loss_mlp": 0.01038263, + "balance_loss_clip": 1.04892683, + "balance_loss_mlp": 1.02494144, + "epoch": 0.3986171651886367, + "flos": 25931658098880.0, + "grad_norm": 2.0408587042633495, + "language_loss": 0.81328046, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.83493865, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13305664, + "step": 6630, + "time_per_iteration": 2.712247848510742 + }, + { + "auxiliary_loss_clip": 0.01130596, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.04776192, + "balance_loss_mlp": 1.01720929, + "epoch": 0.3986772884413047, + "flos": 15201498143520.0, + "grad_norm": 2.000883973605282, + "language_loss": 0.75078648, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.77239859, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13421631, + "step": 6631, + "time_per_iteration": 2.6125502586364746 + }, + { + "auxiliary_loss_clip": 0.01130919, + "auxiliary_loss_mlp": 0.01037968, + "balance_loss_clip": 1.04712105, + "balance_loss_mlp": 1.02424121, + "epoch": 0.39873741169397264, + "flos": 23972157200160.0, + "grad_norm": 1.632669294164728, + "language_loss": 0.71431077, + "learning_rate": 2.735358224635783e-06, + "loss": 0.73599964, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.13708496, + "step": 6632, + "time_per_iteration": 2.819680690765381 + }, + { + "auxiliary_loss_clip": 0.01126055, + "auxiliary_loss_mlp": 0.01036382, + "balance_loss_clip": 1.04595923, + "balance_loss_mlp": 1.02340055, + "epoch": 0.3987975349466406, + "flos": 26460291880800.0, + "grad_norm": 2.4517702584849665, + "language_loss": 0.7539261, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.77555048, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12982178, + "step": 6633, + "time_per_iteration": 2.6522953510284424 + }, + { + "auxiliary_loss_clip": 0.01127557, + "auxiliary_loss_mlp": 0.0102891, + "balance_loss_clip": 1.04477286, + "balance_loss_mlp": 1.01620841, + "epoch": 0.3988576581993086, + "flos": 29180315720160.0, + "grad_norm": 1.7749049527385823, + "language_loss": 0.8121714, + "learning_rate": 2.7346338069806e-06, + "loss": 0.83373606, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.12695312, + "step": 6634, + "time_per_iteration": 2.7215380668640137 + }, + { + "auxiliary_loss_clip": 0.01131137, + "auxiliary_loss_mlp": 0.01029165, + "balance_loss_clip": 1.04946256, + "balance_loss_mlp": 1.01568866, + "epoch": 0.39891778145197654, + "flos": 22146384791520.0, + "grad_norm": 1.8225065842053043, + "language_loss": 0.75126934, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.77287233, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13476562, + "step": 6635, + "time_per_iteration": 2.654104471206665 + }, + { + "auxiliary_loss_clip": 0.01137253, + "auxiliary_loss_mlp": 0.01041191, + "balance_loss_clip": 1.04964805, + "balance_loss_mlp": 1.02640915, + "epoch": 0.3989779047046445, + "flos": 27571522111200.0, + "grad_norm": 1.7971471014954437, + "language_loss": 0.65816754, + "learning_rate": 2.733909277895868e-06, + "loss": 0.67995197, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 0.87548828, + "router_z_loss_mlp": 0.14776611, + "step": 6636, + "time_per_iteration": 2.642232656478882 + }, + { + "auxiliary_loss_clip": 0.01129699, + "auxiliary_loss_mlp": 0.01039855, + "balance_loss_clip": 1.04882121, + "balance_loss_mlp": 1.02675939, + "epoch": 0.39903802795731247, + "flos": 22057664649120.0, + "grad_norm": 1.8175217288502696, + "language_loss": 0.81496876, + "learning_rate": 2.733546971601763e-06, + "loss": 0.83666432, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13098145, + "step": 6637, + "time_per_iteration": 2.682161569595337 + }, + { + "auxiliary_loss_clip": 0.01042261, + "auxiliary_loss_mlp": 0.0100234, + "balance_loss_clip": 1.01625705, + "balance_loss_mlp": 1.00082803, + "epoch": 0.39909815120998043, + "flos": 85952412191520.0, + "grad_norm": 0.7196324601045396, + "language_loss": 0.53233248, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55277848, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.26049805, + "router_z_loss_mlp": 0.01511383, + "step": 6638, + "time_per_iteration": 3.292327880859375 + }, + { + "auxiliary_loss_clip": 0.01130369, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.04789114, + "balance_loss_mlp": 1.02195907, + "epoch": 0.39915827446264845, + "flos": 22633899884640.0, + "grad_norm": 2.1300140096533666, + "language_loss": 0.75264078, + "learning_rate": 2.732822275578769e-06, + "loss": 0.77429366, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.12963867, + "step": 6639, + "time_per_iteration": 2.6741786003112793 + }, + { + "auxiliary_loss_clip": 0.01127415, + "auxiliary_loss_mlp": 0.01029447, + "balance_loss_clip": 1.04847121, + "balance_loss_mlp": 1.01724601, + "epoch": 0.3992183977153164, + "flos": 36481501042560.0, + "grad_norm": 1.5515661630976922, + "language_loss": 0.76025081, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.78181946, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12200928, + "step": 6640, + "time_per_iteration": 2.717414617538452 + }, + { + "auxiliary_loss_clip": 0.011284, + "auxiliary_loss_mlp": 0.01036391, + "balance_loss_clip": 1.04495263, + "balance_loss_mlp": 1.02293849, + "epoch": 0.3992785209679844, + "flos": 27534090494880.0, + "grad_norm": 2.462454926081158, + "language_loss": 0.8168292, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.83847719, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13452148, + "step": 6641, + "time_per_iteration": 2.676985263824463 + }, + { + "auxiliary_loss_clip": 0.01132469, + "auxiliary_loss_mlp": 0.01033076, + "balance_loss_clip": 1.04986537, + "balance_loss_mlp": 1.01968837, + "epoch": 0.39933864422065235, + "flos": 24016760375040.0, + "grad_norm": 2.1918939691932406, + "language_loss": 0.76653045, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.78818589, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13391113, + "step": 6642, + "time_per_iteration": 2.6191253662109375 + }, + { + "auxiliary_loss_clip": 0.01128002, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.04563904, + "balance_loss_mlp": 1.02018559, + "epoch": 0.3993987674733203, + "flos": 28111865387040.0, + "grad_norm": 1.9907632169867207, + "language_loss": 0.72587192, + "learning_rate": 2.731372550178393e-06, + "loss": 0.74749082, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.13720703, + "step": 6643, + "time_per_iteration": 2.66076397895813 + }, + { + "auxiliary_loss_clip": 0.01130708, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.04732931, + "balance_loss_mlp": 1.01794648, + "epoch": 0.3994588907259883, + "flos": 23660623769760.0, + "grad_norm": 1.7342129015418493, + "language_loss": 0.6646632, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68628424, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.13464355, + "step": 6644, + "time_per_iteration": 4.095121622085571 + }, + { + "auxiliary_loss_clip": 0.01127716, + "auxiliary_loss_mlp": 0.01035686, + "balance_loss_clip": 1.04528093, + "balance_loss_mlp": 1.02187514, + "epoch": 0.39951901397865625, + "flos": 16758314432640.0, + "grad_norm": 1.9866212904658263, + "language_loss": 0.77908427, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80071831, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.13793945, + "step": 6645, + "time_per_iteration": 4.028321743011475 + }, + { + "auxiliary_loss_clip": 0.01131736, + "auxiliary_loss_mlp": 0.01034156, + "balance_loss_clip": 1.04800653, + "balance_loss_mlp": 1.02026212, + "epoch": 0.3995791372313242, + "flos": 28780467320160.0, + "grad_norm": 1.9395185156903274, + "language_loss": 0.69970101, + "learning_rate": 2.73028496487595e-06, + "loss": 0.72135997, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.13885498, + "step": 6646, + "time_per_iteration": 2.724942684173584 + }, + { + "auxiliary_loss_clip": 0.01128339, + "auxiliary_loss_mlp": 0.01032157, + "balance_loss_clip": 1.04532456, + "balance_loss_mlp": 1.01820338, + "epoch": 0.3996392604839922, + "flos": 26059025376000.0, + "grad_norm": 2.1917842958555656, + "language_loss": 0.7214461, + "learning_rate": 2.729922381038513e-06, + "loss": 0.74305105, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.13970947, + "step": 6647, + "time_per_iteration": 2.7663521766662598 + }, + { + "auxiliary_loss_clip": 0.01124421, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.04523873, + "balance_loss_mlp": 1.02002859, + "epoch": 0.39969938373666014, + "flos": 31764909343680.0, + "grad_norm": 1.3431464614347526, + "language_loss": 0.74039298, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.76196283, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12530518, + "step": 6648, + "time_per_iteration": 2.6826324462890625 + }, + { + "auxiliary_loss_clip": 0.01129377, + "auxiliary_loss_mlp": 0.01031836, + "balance_loss_clip": 1.04653692, + "balance_loss_mlp": 1.01812744, + "epoch": 0.3997595069893281, + "flos": 24546366571680.0, + "grad_norm": 2.1996528389317453, + "language_loss": 0.65902466, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.68063676, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.13696289, + "step": 6649, + "time_per_iteration": 2.6509323120117188 + }, + { + "auxiliary_loss_clip": 0.01135621, + "auxiliary_loss_mlp": 0.01038491, + "balance_loss_clip": 1.05222058, + "balance_loss_mlp": 1.02453732, + "epoch": 0.39981963024199607, + "flos": 33901810009920.0, + "grad_norm": 1.6645232416458853, + "language_loss": 0.75590098, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77764213, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13970947, + "step": 6650, + "time_per_iteration": 4.207698106765747 + }, + { + "auxiliary_loss_clip": 0.01131576, + "auxiliary_loss_mlp": 0.01039315, + "balance_loss_clip": 1.04838836, + "balance_loss_mlp": 1.02576685, + "epoch": 0.39987975349466404, + "flos": 26776768419360.0, + "grad_norm": 1.5749002519454782, + "language_loss": 0.71602619, + "learning_rate": 2.728471769038975e-06, + "loss": 0.73773515, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.13543701, + "step": 6651, + "time_per_iteration": 2.659227132797241 + }, + { + "auxiliary_loss_clip": 0.0112953, + "auxiliary_loss_mlp": 0.01036778, + "balance_loss_clip": 1.04640174, + "balance_loss_mlp": 1.02339077, + "epoch": 0.39993987674733206, + "flos": 25263542373120.0, + "grad_norm": 1.9182136902234954, + "language_loss": 0.73419636, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75585943, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.1340332, + "step": 6652, + "time_per_iteration": 2.7036354541778564 + }, + { + "auxiliary_loss_clip": 0.01042416, + "auxiliary_loss_mlp": 0.01004967, + "balance_loss_clip": 1.01629901, + "balance_loss_mlp": 1.0035789, + "epoch": 0.4, + "flos": 75073734938880.0, + "grad_norm": 0.8700924742300055, + "language_loss": 0.606493, + "learning_rate": 2.727746297241862e-06, + "loss": 0.62696683, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 0.0138855, + "step": 6653, + "time_per_iteration": 3.1553707122802734 + }, + { + "auxiliary_loss_clip": 0.01129447, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_clip": 1.05095756, + "balance_loss_mlp": 1.02865314, + "epoch": 0.400060123252668, + "flos": 17695224208800.0, + "grad_norm": 3.187005482682286, + "language_loss": 0.6666739, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.68838263, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12774658, + "step": 6654, + "time_per_iteration": 2.7992379665374756 + }, + { + "auxiliary_loss_clip": 0.01128953, + "auxiliary_loss_mlp": 0.0103744, + "balance_loss_clip": 1.04759002, + "balance_loss_mlp": 1.02576315, + "epoch": 0.40012024650533595, + "flos": 23298085434240.0, + "grad_norm": 2.298893086404922, + "language_loss": 0.89780331, + "learning_rate": 2.7270207150599e-06, + "loss": 0.91946721, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.11676025, + "step": 6655, + "time_per_iteration": 4.110247611999512 + }, + { + "auxiliary_loss_clip": 0.01128193, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.05031896, + "balance_loss_mlp": 1.0188303, + "epoch": 0.4001803697580039, + "flos": 35814843938880.0, + "grad_norm": 1.886347860361059, + "language_loss": 0.73465788, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75624335, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.11523438, + "step": 6656, + "time_per_iteration": 2.701101779937744 + }, + { + "auxiliary_loss_clip": 0.01129946, + "auxiliary_loss_mlp": 0.01049391, + "balance_loss_clip": 1.047382, + "balance_loss_mlp": 1.03578377, + "epoch": 0.4002404930106719, + "flos": 25526988142560.0, + "grad_norm": 1.6695498572984255, + "language_loss": 0.73271072, + "learning_rate": 2.726295022603144e-06, + "loss": 0.75450408, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.1362915, + "step": 6657, + "time_per_iteration": 2.742136001586914 + }, + { + "auxiliary_loss_clip": 0.01131997, + "auxiliary_loss_mlp": 0.01039389, + "balance_loss_clip": 1.04902184, + "balance_loss_mlp": 1.0247618, + "epoch": 0.40030061626333985, + "flos": 34661036397600.0, + "grad_norm": 1.991528873160326, + "language_loss": 0.79744434, + "learning_rate": 2.725932135056117e-06, + "loss": 0.8191582, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.1463623, + "step": 6658, + "time_per_iteration": 2.781963586807251 + }, + { + "auxiliary_loss_clip": 0.0113028, + "auxiliary_loss_mlp": 0.0103922, + "balance_loss_clip": 1.04804158, + "balance_loss_mlp": 1.02619648, + "epoch": 0.4003607395160078, + "flos": 31630005852480.0, + "grad_norm": 2.3337173359322847, + "language_loss": 0.77776778, + "learning_rate": 2.72556921998167e-06, + "loss": 0.7994628, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.13024902, + "step": 6659, + "time_per_iteration": 2.750814199447632 + }, + { + "auxiliary_loss_clip": 0.01121564, + "auxiliary_loss_mlp": 0.010271, + "balance_loss_clip": 1.04637945, + "balance_loss_mlp": 1.01601386, + "epoch": 0.4004208627686758, + "flos": 25342740954720.0, + "grad_norm": 2.1094468614505852, + "language_loss": 0.72609615, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.74758279, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11096191, + "step": 6660, + "time_per_iteration": 2.771038770675659 + }, + { + "auxiliary_loss_clip": 0.01128823, + "auxiliary_loss_mlp": 0.01041788, + "balance_loss_clip": 1.04706645, + "balance_loss_mlp": 1.02922893, + "epoch": 0.40048098602134374, + "flos": 30116374633440.0, + "grad_norm": 2.7297268710428306, + "language_loss": 0.71543878, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.73714495, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.12548828, + "step": 6661, + "time_per_iteration": 2.6800403594970703 + }, + { + "auxiliary_loss_clip": 0.01135148, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_clip": 1.05154753, + "balance_loss_mlp": 1.02887475, + "epoch": 0.4005411092740117, + "flos": 28291372053120.0, + "grad_norm": 2.4600134704600896, + "language_loss": 0.75223899, + "learning_rate": 2.724480309731437e-06, + "loss": 0.77401114, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13208008, + "step": 6662, + "time_per_iteration": 2.716308116912842 + }, + { + "auxiliary_loss_clip": 0.01131532, + "auxiliary_loss_mlp": 0.01034249, + "balance_loss_clip": 1.04760885, + "balance_loss_mlp": 1.0212909, + "epoch": 0.4006012325266797, + "flos": 21381242880960.0, + "grad_norm": 2.74503104560282, + "language_loss": 0.66335279, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.68501055, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.1295166, + "step": 6663, + "time_per_iteration": 2.664954662322998 + }, + { + "auxiliary_loss_clip": 0.01129628, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.0474714, + "balance_loss_mlp": 1.0245465, + "epoch": 0.40066135577934764, + "flos": 24230457275040.0, + "grad_norm": 3.5115939853215647, + "language_loss": 0.85550541, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.87716866, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.12164307, + "step": 6664, + "time_per_iteration": 2.682800769805908 + }, + { + "auxiliary_loss_clip": 0.01130058, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.04858124, + "balance_loss_mlp": 1.02108467, + "epoch": 0.40072147903201566, + "flos": 22146506343360.0, + "grad_norm": 3.3189399924651206, + "language_loss": 0.84662759, + "learning_rate": 2.723391152229917e-06, + "loss": 0.86826789, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.12884521, + "step": 6665, + "time_per_iteration": 2.6213440895080566 + }, + { + "auxiliary_loss_clip": 0.01133261, + "auxiliary_loss_mlp": 0.01034616, + "balance_loss_clip": 1.05014038, + "balance_loss_mlp": 1.02133656, + "epoch": 0.4007816022846836, + "flos": 22770343032480.0, + "grad_norm": 1.828888465783704, + "language_loss": 0.78683937, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.80851817, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.1328125, + "step": 6666, + "time_per_iteration": 2.6595561504364014 + }, + { + "auxiliary_loss_clip": 0.01134171, + "auxiliary_loss_mlp": 0.01033629, + "balance_loss_clip": 1.05126143, + "balance_loss_mlp": 1.02003372, + "epoch": 0.4008417255373516, + "flos": 31363399735200.0, + "grad_norm": 1.958566905074169, + "language_loss": 0.73944962, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.76112759, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13598633, + "step": 6667, + "time_per_iteration": 2.7409074306488037 + }, + { + "auxiliary_loss_clip": 0.01132562, + "auxiliary_loss_mlp": 0.01048465, + "balance_loss_clip": 1.04898334, + "balance_loss_mlp": 1.03453517, + "epoch": 0.40090184879001955, + "flos": 27892131412320.0, + "grad_norm": 1.549884804706295, + "language_loss": 0.75537145, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.77718168, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13934326, + "step": 6668, + "time_per_iteration": 2.7629125118255615 + }, + { + "auxiliary_loss_clip": 0.01134557, + "auxiliary_loss_mlp": 0.01038033, + "balance_loss_clip": 1.05293131, + "balance_loss_mlp": 1.0247705, + "epoch": 0.4009619720426875, + "flos": 35459477161920.0, + "grad_norm": 1.9466800846677101, + "language_loss": 0.81915843, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84088439, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.1328125, + "step": 6669, + "time_per_iteration": 2.712226152420044 + }, + { + "auxiliary_loss_clip": 0.01043508, + "auxiliary_loss_mlp": 0.01002627, + "balance_loss_clip": 1.01739049, + "balance_loss_mlp": 1.00111794, + "epoch": 0.4010220952953555, + "flos": 80605946728800.0, + "grad_norm": 0.7024109461144963, + "language_loss": 0.53339696, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55385828, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 0.26147461, + "router_z_loss_mlp": 0.01509094, + "step": 6670, + "time_per_iteration": 3.4482150077819824 + }, + { + "auxiliary_loss_clip": 0.01129324, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.04781353, + "balance_loss_mlp": 1.02424145, + "epoch": 0.40108221854802345, + "flos": 36170940026880.0, + "grad_norm": 1.7821664743296033, + "language_loss": 0.89045173, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.91211963, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13226318, + "step": 6671, + "time_per_iteration": 2.77361798286438 + }, + { + "auxiliary_loss_clip": 0.01132228, + "auxiliary_loss_mlp": 0.01037492, + "balance_loss_clip": 1.04975629, + "balance_loss_mlp": 1.02381325, + "epoch": 0.4011423418006914, + "flos": 24318812761920.0, + "grad_norm": 1.866524328421824, + "language_loss": 0.78934348, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81104076, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13690186, + "step": 6672, + "time_per_iteration": 2.6597445011138916 + }, + { + "auxiliary_loss_clip": 0.01124979, + "auxiliary_loss_mlp": 0.01027646, + "balance_loss_clip": 1.04664779, + "balance_loss_mlp": 1.01548028, + "epoch": 0.4012024650533594, + "flos": 24506828056800.0, + "grad_norm": 2.9835706089722582, + "language_loss": 0.62750316, + "learning_rate": 2.72048552626888e-06, + "loss": 0.64902949, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12164307, + "step": 6673, + "time_per_iteration": 2.770979404449463 + }, + { + "auxiliary_loss_clip": 0.01129134, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.04833663, + "balance_loss_mlp": 1.02086258, + "epoch": 0.40126258830602735, + "flos": 26472649651200.0, + "grad_norm": 1.4780680239242405, + "language_loss": 0.80176473, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.82338905, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.12432861, + "step": 6674, + "time_per_iteration": 2.745922088623047 + }, + { + "auxiliary_loss_clip": 0.01132047, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.04810488, + "balance_loss_mlp": 1.01857364, + "epoch": 0.4013227115586953, + "flos": 14666665217760.0, + "grad_norm": 2.60236768345187, + "language_loss": 0.82518959, + "learning_rate": 2.719758846294294e-06, + "loss": 0.84682482, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.12902832, + "step": 6675, + "time_per_iteration": 2.6451563835144043 + }, + { + "auxiliary_loss_clip": 0.01127077, + "auxiliary_loss_mlp": 0.01033156, + "balance_loss_clip": 1.046278, + "balance_loss_mlp": 1.01966143, + "epoch": 0.4013828348113633, + "flos": 31051461132000.0, + "grad_norm": 1.801788610212234, + "language_loss": 0.93536508, + "learning_rate": 2.71939546536012e-06, + "loss": 0.95696735, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13494873, + "step": 6676, + "time_per_iteration": 2.725996732711792 + }, + { + "auxiliary_loss_clip": 0.01134846, + "auxiliary_loss_mlp": 0.01036903, + "balance_loss_clip": 1.04804325, + "balance_loss_mlp": 1.02286637, + "epoch": 0.40144295806403124, + "flos": 22675261677120.0, + "grad_norm": 2.0307197727866297, + "language_loss": 0.79305899, + "learning_rate": 2.719032057146399e-06, + "loss": 0.81477654, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 0.86865234, + "router_z_loss_mlp": 0.14025879, + "step": 6677, + "time_per_iteration": 2.7246053218841553 + }, + { + "auxiliary_loss_clip": 0.01131387, + "auxiliary_loss_mlp": 0.01035446, + "balance_loss_clip": 1.04981709, + "balance_loss_mlp": 1.02213001, + "epoch": 0.4015030813166992, + "flos": 27979636036320.0, + "grad_norm": 2.247119894264202, + "language_loss": 0.83450627, + "learning_rate": 2.71866862166691e-06, + "loss": 0.85617465, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.13317871, + "step": 6678, + "time_per_iteration": 2.7170801162719727 + }, + { + "auxiliary_loss_clip": 0.01127213, + "auxiliary_loss_mlp": 0.01037821, + "balance_loss_clip": 1.04736137, + "balance_loss_mlp": 1.0249759, + "epoch": 0.4015632045693672, + "flos": 25130097504000.0, + "grad_norm": 2.3548006188209527, + "language_loss": 0.63676637, + "learning_rate": 2.718305158935434e-06, + "loss": 0.65841675, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12841797, + "step": 6679, + "time_per_iteration": 2.6793227195739746 + }, + { + "auxiliary_loss_clip": 0.01126182, + "auxiliary_loss_mlp": 0.01029728, + "balance_loss_clip": 1.04696262, + "balance_loss_mlp": 1.01723444, + "epoch": 0.4016233278220352, + "flos": 28600595998560.0, + "grad_norm": 1.543308502458815, + "language_loss": 0.78932667, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.81088573, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12506104, + "step": 6680, + "time_per_iteration": 2.704662322998047 + }, + { + "auxiliary_loss_clip": 0.01133513, + "auxiliary_loss_mlp": 0.01045043, + "balance_loss_clip": 1.04862809, + "balance_loss_mlp": 1.03132772, + "epoch": 0.40168345107470316, + "flos": 26150541210720.0, + "grad_norm": 1.617760805630671, + "language_loss": 0.76063228, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.78241777, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.137146, + "step": 6681, + "time_per_iteration": 2.73431658744812 + }, + { + "auxiliary_loss_clip": 0.01132287, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.04946995, + "balance_loss_mlp": 1.02235866, + "epoch": 0.4017435743273711, + "flos": 27890551238400.0, + "grad_norm": 2.5137917656536577, + "language_loss": 0.64313799, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.66481137, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.1270752, + "step": 6682, + "time_per_iteration": 2.7232720851898193 + }, + { + "auxiliary_loss_clip": 0.01130658, + "auxiliary_loss_mlp": 0.01034703, + "balance_loss_clip": 1.04666328, + "balance_loss_mlp": 1.02201343, + "epoch": 0.4018036975800391, + "flos": 34925535616320.0, + "grad_norm": 1.7606146425124194, + "language_loss": 0.7297076, + "learning_rate": 2.716851035765337e-06, + "loss": 0.75136119, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.12695312, + "step": 6683, + "time_per_iteration": 5.8324363231658936 + }, + { + "auxiliary_loss_clip": 0.01129853, + "auxiliary_loss_mlp": 0.01039044, + "balance_loss_clip": 1.04809368, + "balance_loss_mlp": 1.02643764, + "epoch": 0.40186382083270705, + "flos": 32520327107040.0, + "grad_norm": 1.8217738274495368, + "language_loss": 0.73210597, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.75379497, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.12615967, + "step": 6684, + "time_per_iteration": 2.7024295330047607 + }, + { + "auxiliary_loss_clip": 0.01045498, + "auxiliary_loss_mlp": 0.01002938, + "balance_loss_clip": 1.01941681, + "balance_loss_mlp": 1.00139737, + "epoch": 0.401923944085375, + "flos": 72309553614720.0, + "grad_norm": 0.8115526941036967, + "language_loss": 0.60432935, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62481368, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 0.26123047, + "router_z_loss_mlp": 0.01540375, + "step": 6685, + "time_per_iteration": 3.369906187057495 + }, + { + "auxiliary_loss_clip": 0.01131327, + "auxiliary_loss_mlp": 0.01031916, + "balance_loss_clip": 1.04645467, + "balance_loss_mlp": 1.01908326, + "epoch": 0.401984067338043, + "flos": 20722162508640.0, + "grad_norm": 2.863412104200807, + "language_loss": 0.69908059, + "learning_rate": 2.715760157917357e-06, + "loss": 0.72071296, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.12841797, + "step": 6686, + "time_per_iteration": 2.673680305480957 + }, + { + "auxiliary_loss_clip": 0.01126804, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.0459764, + "balance_loss_mlp": 1.0191853, + "epoch": 0.40204419059071095, + "flos": 30471862962240.0, + "grad_norm": 1.4502453774401136, + "language_loss": 0.7484442, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.77002704, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.1229248, + "step": 6687, + "time_per_iteration": 2.712883234024048 + }, + { + "auxiliary_loss_clip": 0.01133406, + "auxiliary_loss_mlp": 0.01038006, + "balance_loss_clip": 1.05100358, + "balance_loss_mlp": 1.02493453, + "epoch": 0.4021043138433789, + "flos": 28647427623840.0, + "grad_norm": 1.8273971283801278, + "language_loss": 0.70558572, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.72729981, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.1307373, + "step": 6688, + "time_per_iteration": 2.7044355869293213 + }, + { + "auxiliary_loss_clip": 0.0113245, + "auxiliary_loss_mlp": 0.01039935, + "balance_loss_clip": 1.04761124, + "balance_loss_mlp": 1.02604723, + "epoch": 0.4021644370960469, + "flos": 31720427720640.0, + "grad_norm": 1.5961410755515457, + "language_loss": 0.64112169, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.66284549, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.13891602, + "step": 6689, + "time_per_iteration": 4.15885591506958 + }, + { + "auxiliary_loss_clip": 0.01130472, + "auxiliary_loss_mlp": 0.01031928, + "balance_loss_clip": 1.04649079, + "balance_loss_mlp": 1.01936913, + "epoch": 0.40222456034871484, + "flos": 16579699146720.0, + "grad_norm": 2.3373893232746537, + "language_loss": 0.73651409, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.75813806, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.12567139, + "step": 6690, + "time_per_iteration": 2.634429454803467 + }, + { + "auxiliary_loss_clip": 0.01129872, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.04775262, + "balance_loss_mlp": 1.01932216, + "epoch": 0.4022846836013828, + "flos": 29624524191360.0, + "grad_norm": 1.5101592264551726, + "language_loss": 0.74416572, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.76578856, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.13085938, + "step": 6691, + "time_per_iteration": 2.654965877532959 + }, + { + "auxiliary_loss_clip": 0.01132383, + "auxiliary_loss_mlp": 0.01042134, + "balance_loss_clip": 1.04858899, + "balance_loss_mlp": 1.02902675, + "epoch": 0.40234480685405083, + "flos": 24589470607200.0, + "grad_norm": 1.7028853857917743, + "language_loss": 0.72273111, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.74447626, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13110352, + "step": 6692, + "time_per_iteration": 2.649843215942383 + }, + { + "auxiliary_loss_clip": 0.01129376, + "auxiliary_loss_mlp": 0.01031674, + "balance_loss_clip": 1.04583549, + "balance_loss_mlp": 1.01903176, + "epoch": 0.4024049301067188, + "flos": 27979636036320.0, + "grad_norm": 2.306218319224954, + "language_loss": 0.83753133, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.85914177, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.12646484, + "step": 6693, + "time_per_iteration": 2.6890838146209717 + }, + { + "auxiliary_loss_clip": 0.01132186, + "auxiliary_loss_mlp": 0.01035781, + "balance_loss_clip": 1.04864419, + "balance_loss_mlp": 1.023067, + "epoch": 0.40246505335938676, + "flos": 43963084411200.0, + "grad_norm": 1.7849346578571899, + "language_loss": 0.70614874, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.72782838, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.12719727, + "step": 6694, + "time_per_iteration": 4.299107074737549 + }, + { + "auxiliary_loss_clip": 0.0112714, + "auxiliary_loss_mlp": 0.01037551, + "balance_loss_clip": 1.04543293, + "balance_loss_mlp": 1.02414548, + "epoch": 0.4025251766120547, + "flos": 25129246641120.0, + "grad_norm": 3.563749203552065, + "language_loss": 0.68049407, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.70214093, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13409424, + "step": 6695, + "time_per_iteration": 2.6874098777770996 + }, + { + "auxiliary_loss_clip": 0.0112614, + "auxiliary_loss_mlp": 0.01035003, + "balance_loss_clip": 1.04405069, + "balance_loss_mlp": 1.0224148, + "epoch": 0.4025852998647227, + "flos": 78735483760320.0, + "grad_norm": 1.9812760958472324, + "language_loss": 0.79847819, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.82008958, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.12591553, + "step": 6696, + "time_per_iteration": 3.1349165439605713 + }, + { + "auxiliary_loss_clip": 0.01130511, + "auxiliary_loss_mlp": 0.01037733, + "balance_loss_clip": 1.04781091, + "balance_loss_mlp": 1.02456021, + "epoch": 0.40264542311739066, + "flos": 25485545315520.0, + "grad_norm": 2.009610762109959, + "language_loss": 0.71207082, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.73375326, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13165283, + "step": 6697, + "time_per_iteration": 2.695721387863159 + }, + { + "auxiliary_loss_clip": 0.01126911, + "auxiliary_loss_mlp": 0.01036484, + "balance_loss_clip": 1.04611635, + "balance_loss_mlp": 1.02384162, + "epoch": 0.4027055463700586, + "flos": 32030948219040.0, + "grad_norm": 2.3186773241790437, + "language_loss": 0.61905974, + "learning_rate": 2.711394207496984e-06, + "loss": 0.64069366, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.12640381, + "step": 6698, + "time_per_iteration": 2.802436351776123 + }, + { + "auxiliary_loss_clip": 0.0113138, + "auxiliary_loss_mlp": 0.01037975, + "balance_loss_clip": 1.04763913, + "balance_loss_mlp": 1.02400947, + "epoch": 0.4027656696227266, + "flos": 25174457575200.0, + "grad_norm": 2.1054516045977167, + "language_loss": 0.76726669, + "learning_rate": 2.711030202621491e-06, + "loss": 0.78896028, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13964844, + "step": 6699, + "time_per_iteration": 2.6350510120391846 + }, + { + "auxiliary_loss_clip": 0.01126108, + "auxiliary_loss_mlp": 0.01031593, + "balance_loss_clip": 1.04589534, + "balance_loss_mlp": 1.0185281, + "epoch": 0.40282579287539455, + "flos": 27267079204800.0, + "grad_norm": 1.6145206335166145, + "language_loss": 0.7999022, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.82147914, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13061523, + "step": 6700, + "time_per_iteration": 2.688516616821289 + }, + { + "auxiliary_loss_clip": 0.01132884, + "auxiliary_loss_mlp": 0.01041469, + "balance_loss_clip": 1.04559076, + "balance_loss_mlp": 1.02654386, + "epoch": 0.4028859161280625, + "flos": 35722355689440.0, + "grad_norm": 1.700216628067911, + "language_loss": 0.74818039, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.76992393, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.14929199, + "step": 6701, + "time_per_iteration": 2.6892645359039307 + }, + { + "auxiliary_loss_clip": 0.01127997, + "auxiliary_loss_mlp": 0.01035958, + "balance_loss_clip": 1.04555655, + "balance_loss_mlp": 1.02350104, + "epoch": 0.4029460393807305, + "flos": 34925535616320.0, + "grad_norm": 1.6479284312434808, + "language_loss": 0.65829366, + "learning_rate": 2.709938026276208e-06, + "loss": 0.67993319, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.12451172, + "step": 6702, + "time_per_iteration": 2.7917497158050537 + }, + { + "auxiliary_loss_clip": 0.01132602, + "auxiliary_loss_mlp": 0.01041563, + "balance_loss_clip": 1.04806185, + "balance_loss_mlp": 1.02743101, + "epoch": 0.40300616263339845, + "flos": 27579017808000.0, + "grad_norm": 1.685059911045903, + "language_loss": 0.65967476, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.68141645, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.14135742, + "step": 6703, + "time_per_iteration": 2.6461734771728516 + }, + { + "auxiliary_loss_clip": 0.01134795, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.05031419, + "balance_loss_mlp": 1.02263355, + "epoch": 0.4030662858860664, + "flos": 31140302826240.0, + "grad_norm": 4.99931377770788, + "language_loss": 0.82235098, + "learning_rate": 2.709209774085071e-06, + "loss": 0.84405971, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.13464355, + "step": 6704, + "time_per_iteration": 2.7396841049194336 + }, + { + "auxiliary_loss_clip": 0.01132142, + "auxiliary_loss_mlp": 0.01035909, + "balance_loss_clip": 1.04663539, + "balance_loss_mlp": 1.02270043, + "epoch": 0.40312640913873443, + "flos": 28780750941120.0, + "grad_norm": 2.123777083608251, + "language_loss": 0.73378021, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.75546074, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.13214111, + "step": 6705, + "time_per_iteration": 2.7058677673339844 + }, + { + "auxiliary_loss_clip": 0.01126193, + "auxiliary_loss_mlp": 0.01035594, + "balance_loss_clip": 1.04535437, + "balance_loss_mlp": 1.02291584, + "epoch": 0.4031865323914024, + "flos": 24417459637920.0, + "grad_norm": 1.9101559939943387, + "language_loss": 0.66244018, + "learning_rate": 2.708481414320713e-06, + "loss": 0.68405801, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.12683105, + "step": 6706, + "time_per_iteration": 2.766923666000366 + }, + { + "auxiliary_loss_clip": 0.01129062, + "auxiliary_loss_mlp": 0.01036828, + "balance_loss_clip": 1.04604483, + "balance_loss_mlp": 1.02328587, + "epoch": 0.40324665564407036, + "flos": 26687683621440.0, + "grad_norm": 1.3189923673031694, + "language_loss": 0.71348417, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.73514307, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13537598, + "step": 6707, + "time_per_iteration": 2.738291025161743 + }, + { + "auxiliary_loss_clip": 0.01120615, + "auxiliary_loss_mlp": 0.01029577, + "balance_loss_clip": 1.0427593, + "balance_loss_mlp": 1.01611269, + "epoch": 0.4033067788967383, + "flos": 29137090132800.0, + "grad_norm": 1.6754001162398824, + "language_loss": 0.7989397, + "learning_rate": 2.707752947093611e-06, + "loss": 0.8204416, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.13476562, + "step": 6708, + "time_per_iteration": 2.7247471809387207 + }, + { + "auxiliary_loss_clip": 0.01128923, + "auxiliary_loss_mlp": 0.01038367, + "balance_loss_clip": 1.04281855, + "balance_loss_mlp": 1.02475357, + "epoch": 0.4033669021494063, + "flos": 21255617846880.0, + "grad_norm": 2.085690063049461, + "language_loss": 0.82803988, + "learning_rate": 2.70738867321606e-06, + "loss": 0.84971273, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.13635254, + "step": 6709, + "time_per_iteration": 2.6980268955230713 + }, + { + "auxiliary_loss_clip": 0.01131189, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.04728985, + "balance_loss_mlp": 1.02411985, + "epoch": 0.40342702540207426, + "flos": 36119894604480.0, + "grad_norm": 1.4688259447101095, + "language_loss": 0.71251583, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73420465, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.13580322, + "step": 6710, + "time_per_iteration": 2.7143521308898926 + }, + { + "auxiliary_loss_clip": 0.01127796, + "auxiliary_loss_mlp": 0.01036035, + "balance_loss_clip": 1.04494548, + "balance_loss_mlp": 1.02220702, + "epoch": 0.4034871486547422, + "flos": 13769374991040.0, + "grad_norm": 2.1947976785992194, + "language_loss": 0.85180712, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.87344539, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13824463, + "step": 6711, + "time_per_iteration": 2.681032657623291 + }, + { + "auxiliary_loss_clip": 0.0112878, + "auxiliary_loss_mlp": 0.01033375, + "balance_loss_clip": 1.04601836, + "balance_loss_mlp": 1.01929045, + "epoch": 0.4035472719074102, + "flos": 18977857649280.0, + "grad_norm": 3.025588502336598, + "language_loss": 0.76671433, + "learning_rate": 2.706295690693168e-06, + "loss": 0.7883358, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.14093018, + "step": 6712, + "time_per_iteration": 2.6279189586639404 + }, + { + "auxiliary_loss_clip": 0.01130657, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.04769981, + "balance_loss_mlp": 1.01893878, + "epoch": 0.40360739516007815, + "flos": 30108635832960.0, + "grad_norm": 2.036985658356139, + "language_loss": 0.783539, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.80516744, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13238525, + "step": 6713, + "time_per_iteration": 2.737008571624756 + }, + { + "auxiliary_loss_clip": 0.01129529, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.04466736, + "balance_loss_mlp": 1.01750565, + "epoch": 0.4036675184127461, + "flos": 21114879867360.0, + "grad_norm": 2.0043009027063055, + "language_loss": 0.88238412, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90398729, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.13293457, + "step": 6714, + "time_per_iteration": 2.6292309761047363 + }, + { + "auxiliary_loss_clip": 0.01127693, + "auxiliary_loss_mlp": 0.01042873, + "balance_loss_clip": 1.04453826, + "balance_loss_mlp": 1.02947998, + "epoch": 0.4037276416654141, + "flos": 24238155558240.0, + "grad_norm": 1.6009892277031124, + "language_loss": 0.69122076, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.71292639, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13397217, + "step": 6715, + "time_per_iteration": 2.678361654281616 + }, + { + "auxiliary_loss_clip": 0.01130953, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.04553282, + "balance_loss_mlp": 1.02105081, + "epoch": 0.40378776491808205, + "flos": 22324959560160.0, + "grad_norm": 2.0566000485127014, + "language_loss": 0.77582037, + "learning_rate": 2.704838005767892e-06, + "loss": 0.79747725, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.13690186, + "step": 6716, + "time_per_iteration": 2.7035040855407715 + }, + { + "auxiliary_loss_clip": 0.01127502, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.04592371, + "balance_loss_mlp": 1.02289104, + "epoch": 0.40384788817075, + "flos": 18362813209920.0, + "grad_norm": 2.003598115446434, + "language_loss": 0.75853431, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78016543, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.1272583, + "step": 6717, + "time_per_iteration": 2.7010855674743652 + }, + { + "auxiliary_loss_clip": 0.01041263, + "auxiliary_loss_mlp": 0.01003184, + "balance_loss_clip": 1.01466012, + "balance_loss_mlp": 1.00168371, + "epoch": 0.40390801142341803, + "flos": 75567044003040.0, + "grad_norm": 0.9383958278080066, + "language_loss": 0.60800803, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.62845248, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 0.26586914, + "router_z_loss_mlp": 0.01499176, + "step": 6718, + "time_per_iteration": 3.1754257678985596 + }, + { + "auxiliary_loss_clip": 0.01133783, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.04650879, + "balance_loss_mlp": 1.02004719, + "epoch": 0.403968134676086, + "flos": 27746855497440.0, + "grad_norm": 2.1133409723022814, + "language_loss": 0.74850345, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.77018726, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.14562988, + "step": 6719, + "time_per_iteration": 2.694938898086548 + }, + { + "auxiliary_loss_clip": 0.01129358, + "auxiliary_loss_mlp": 0.0103595, + "balance_loss_clip": 1.04612494, + "balance_loss_mlp": 1.02166915, + "epoch": 0.40402825792875396, + "flos": 24141615580800.0, + "grad_norm": 2.093525245189624, + "language_loss": 0.81453198, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.8361851, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.14282227, + "step": 6720, + "time_per_iteration": 2.717451333999634 + }, + { + "auxiliary_loss_clip": 0.01127791, + "auxiliary_loss_mlp": 0.01032472, + "balance_loss_clip": 1.04355741, + "balance_loss_mlp": 1.01873899, + "epoch": 0.40408838118142193, + "flos": 23927229887040.0, + "grad_norm": 1.9179089678847916, + "language_loss": 0.7714923, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.79309499, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.13726807, + "step": 6721, + "time_per_iteration": 2.7053470611572266 + }, + { + "auxiliary_loss_clip": 0.01125965, + "auxiliary_loss_mlp": 0.01030199, + "balance_loss_clip": 1.04661465, + "balance_loss_mlp": 1.0187366, + "epoch": 0.4041485044340899, + "flos": 29804962754880.0, + "grad_norm": 3.3919451579293938, + "language_loss": 0.72709596, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74865764, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.11456299, + "step": 6722, + "time_per_iteration": 2.7582461833953857 + }, + { + "auxiliary_loss_clip": 0.01126971, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.04544854, + "balance_loss_mlp": 1.02052426, + "epoch": 0.40420862768675786, + "flos": 20450856386880.0, + "grad_norm": 2.1371701613384855, + "language_loss": 0.65999919, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.68160254, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.128479, + "step": 6723, + "time_per_iteration": 5.6338725090026855 + }, + { + "auxiliary_loss_clip": 0.01130434, + "auxiliary_loss_mlp": 0.0104709, + "balance_loss_clip": 1.04610133, + "balance_loss_mlp": 1.03284478, + "epoch": 0.4042687509394258, + "flos": 27444438455040.0, + "grad_norm": 1.6329329349999595, + "language_loss": 0.7375052, + "learning_rate": 2.701921353880734e-06, + "loss": 0.7592805, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.14239502, + "step": 6724, + "time_per_iteration": 2.7002949714660645 + }, + { + "auxiliary_loss_clip": 0.01122027, + "auxiliary_loss_mlp": 0.01036237, + "balance_loss_clip": 1.04339552, + "balance_loss_mlp": 1.02330232, + "epoch": 0.4043288741920938, + "flos": 37016982244800.0, + "grad_norm": 1.938072636816385, + "language_loss": 0.74984133, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.771424, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.1293335, + "step": 6725, + "time_per_iteration": 2.777289628982544 + }, + { + "auxiliary_loss_clip": 0.01125026, + "auxiliary_loss_mlp": 0.01030779, + "balance_loss_clip": 1.04392374, + "balance_loss_mlp": 1.01687336, + "epoch": 0.40438899744476176, + "flos": 56556732012480.0, + "grad_norm": 3.598842500541182, + "language_loss": 0.76733863, + "learning_rate": 2.701191924463126e-06, + "loss": 0.78889668, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13909912, + "step": 6726, + "time_per_iteration": 2.921712636947632 + }, + { + "auxiliary_loss_clip": 0.01127618, + "auxiliary_loss_mlp": 0.01036824, + "balance_loss_clip": 1.0442698, + "balance_loss_mlp": 1.02297151, + "epoch": 0.4044491206974297, + "flos": 16269097613760.0, + "grad_norm": 2.0059491064477686, + "language_loss": 0.81041086, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.83205521, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.1383667, + "step": 6727, + "time_per_iteration": 2.6531388759613037 + }, + { + "auxiliary_loss_clip": 0.01127653, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.04514587, + "balance_loss_mlp": 1.02089417, + "epoch": 0.4045092439500977, + "flos": 14756155188480.0, + "grad_norm": 2.1276116620149166, + "language_loss": 0.85660195, + "learning_rate": 2.700462388688447e-06, + "loss": 0.87822068, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13330078, + "step": 6728, + "time_per_iteration": 4.1091530323028564 + }, + { + "auxiliary_loss_clip": 0.01131002, + "auxiliary_loss_mlp": 0.01035721, + "balance_loss_clip": 1.04842699, + "balance_loss_mlp": 1.0220542, + "epoch": 0.40456936720276565, + "flos": 25843424163840.0, + "grad_norm": 2.1293831528783556, + "language_loss": 0.81953752, + "learning_rate": 2.700097580951786e-06, + "loss": 0.84120476, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13653564, + "step": 6729, + "time_per_iteration": 2.703695058822632 + }, + { + "auxiliary_loss_clip": 0.0112801, + "auxiliary_loss_mlp": 0.01038397, + "balance_loss_clip": 1.04638803, + "balance_loss_mlp": 1.02537298, + "epoch": 0.4046294904554336, + "flos": 29181693307680.0, + "grad_norm": 1.9609889483158132, + "language_loss": 0.72948432, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.75114834, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13018799, + "step": 6730, + "time_per_iteration": 2.7578845024108887 + }, + { + "auxiliary_loss_clip": 0.01126399, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.04533076, + "balance_loss_mlp": 1.02081382, + "epoch": 0.4046896137081016, + "flos": 46455919096320.0, + "grad_norm": 2.00937294068768, + "language_loss": 0.67619824, + "learning_rate": 2.699367885848985e-06, + "loss": 0.69779927, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.12872314, + "step": 6731, + "time_per_iteration": 2.8719534873962402 + }, + { + "auxiliary_loss_clip": 0.01125156, + "auxiliary_loss_mlp": 0.01031506, + "balance_loss_clip": 1.04367709, + "balance_loss_mlp": 1.01932847, + "epoch": 0.4047497369607696, + "flos": 28818587730240.0, + "grad_norm": 2.091986140210573, + "language_loss": 0.73894298, + "learning_rate": 2.699002998510517e-06, + "loss": 0.76050961, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.12176514, + "step": 6732, + "time_per_iteration": 2.733287811279297 + }, + { + "auxiliary_loss_clip": 0.01124926, + "auxiliary_loss_mlp": 0.01030146, + "balance_loss_clip": 1.04491282, + "balance_loss_mlp": 1.01817179, + "epoch": 0.40480986021343757, + "flos": 15646760064000.0, + "grad_norm": 1.7445550748895198, + "language_loss": 0.77118856, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.79273927, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.11975098, + "step": 6733, + "time_per_iteration": 4.176142454147339 + }, + { + "auxiliary_loss_clip": 0.01130118, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.04465294, + "balance_loss_mlp": 1.02741051, + "epoch": 0.40486998346610553, + "flos": 29002956469920.0, + "grad_norm": 2.5567449370076827, + "language_loss": 0.76158988, + "learning_rate": 2.698273144328627e-06, + "loss": 0.78330809, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.1428833, + "step": 6734, + "time_per_iteration": 2.725189447402954 + }, + { + "auxiliary_loss_clip": 0.01130945, + "auxiliary_loss_mlp": 0.01032827, + "balance_loss_clip": 1.04609954, + "balance_loss_mlp": 1.01994669, + "epoch": 0.4049301067187735, + "flos": 27890105548320.0, + "grad_norm": 2.7616622099213846, + "language_loss": 0.65223658, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.67387432, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.12872314, + "step": 6735, + "time_per_iteration": 2.7229106426239014 + }, + { + "auxiliary_loss_clip": 0.01122285, + "auxiliary_loss_mlp": 0.01034132, + "balance_loss_clip": 1.04206693, + "balance_loss_mlp": 1.02243793, + "epoch": 0.40499022997144146, + "flos": 27801223336800.0, + "grad_norm": 1.8254206496402032, + "language_loss": 0.82934082, + "learning_rate": 2.697543184232387e-06, + "loss": 0.85090494, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.11694336, + "step": 6736, + "time_per_iteration": 2.7302966117858887 + }, + { + "auxiliary_loss_clip": 0.01130053, + "auxiliary_loss_mlp": 0.01037502, + "balance_loss_clip": 1.04481363, + "balance_loss_mlp": 1.02355504, + "epoch": 0.4050503532241094, + "flos": 28112837801760.0, + "grad_norm": 1.6044553146432758, + "language_loss": 0.75211763, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.77379322, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.13934326, + "step": 6737, + "time_per_iteration": 2.689321279525757 + }, + { + "auxiliary_loss_clip": 0.01126134, + "auxiliary_loss_mlp": 0.01042917, + "balance_loss_clip": 1.04474902, + "balance_loss_mlp": 1.02997708, + "epoch": 0.4051104764767774, + "flos": 20313927031680.0, + "grad_norm": 2.178914330795718, + "language_loss": 0.71813071, + "learning_rate": 2.696813118332519e-06, + "loss": 0.7398212, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.12945557, + "step": 6738, + "time_per_iteration": 2.7649824619293213 + }, + { + "auxiliary_loss_clip": 0.01122643, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.04192352, + "balance_loss_mlp": 1.02061117, + "epoch": 0.40517059972944536, + "flos": 19961234395200.0, + "grad_norm": 2.3685606699255253, + "language_loss": 0.7498076, + "learning_rate": 2.696448045740828e-06, + "loss": 0.77135623, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.11621094, + "step": 6739, + "time_per_iteration": 2.645303964614868 + }, + { + "auxiliary_loss_clip": 0.01127508, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.04461908, + "balance_loss_mlp": 1.02223468, + "epoch": 0.4052307229821133, + "flos": 35146606661280.0, + "grad_norm": 1.7454842281595377, + "language_loss": 0.74338472, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76500916, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.1272583, + "step": 6740, + "time_per_iteration": 2.7651896476745605 + }, + { + "auxiliary_loss_clip": 0.0112251, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.04314005, + "balance_loss_mlp": 1.02103496, + "epoch": 0.4052908462347813, + "flos": 26109746660160.0, + "grad_norm": 1.5776973290992133, + "language_loss": 0.76972276, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79128563, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12738037, + "step": 6741, + "time_per_iteration": 2.7174580097198486 + }, + { + "auxiliary_loss_clip": 0.01126269, + "auxiliary_loss_mlp": 0.01038985, + "balance_loss_clip": 1.04281092, + "balance_loss_mlp": 1.02524579, + "epoch": 0.40535096948744925, + "flos": 27355920899040.0, + "grad_norm": 1.9571314695343853, + "language_loss": 0.71077567, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.73242825, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13751221, + "step": 6742, + "time_per_iteration": 2.715524673461914 + }, + { + "auxiliary_loss_clip": 0.01126429, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.04425597, + "balance_loss_mlp": 1.01615572, + "epoch": 0.4054110927401172, + "flos": 20757082053600.0, + "grad_norm": 2.440097123886151, + "language_loss": 0.72574496, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.7473017, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.13092041, + "step": 6743, + "time_per_iteration": 2.6539134979248047 + }, + { + "auxiliary_loss_clip": 0.01127691, + "auxiliary_loss_mlp": 0.0103288, + "balance_loss_clip": 1.04320121, + "balance_loss_mlp": 1.01934385, + "epoch": 0.4054712159927852, + "flos": 26374691568960.0, + "grad_norm": 1.903696198364553, + "language_loss": 0.70467639, + "learning_rate": 2.694622286918588e-06, + "loss": 0.72628212, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.13531494, + "step": 6744, + "time_per_iteration": 2.6840837001800537 + }, + { + "auxiliary_loss_clip": 0.0112354, + "auxiliary_loss_mlp": 0.01035452, + "balance_loss_clip": 1.04282367, + "balance_loss_mlp": 1.02316737, + "epoch": 0.4055313392454532, + "flos": 31496885121600.0, + "grad_norm": 1.9076523592640604, + "language_loss": 0.80146754, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82305747, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.12280273, + "step": 6745, + "time_per_iteration": 2.706195116043091 + }, + { + "auxiliary_loss_clip": 0.0112697, + "auxiliary_loss_mlp": 0.0103438, + "balance_loss_clip": 1.04634643, + "balance_loss_mlp": 1.02138662, + "epoch": 0.40559146249812117, + "flos": 17249597632800.0, + "grad_norm": 1.7346806825590717, + "language_loss": 0.66743875, + "learning_rate": 2.693891798911731e-06, + "loss": 0.68905222, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.12994385, + "step": 6746, + "time_per_iteration": 2.6765360832214355 + }, + { + "auxiliary_loss_clip": 0.01123876, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.04332304, + "balance_loss_mlp": 1.01784444, + "epoch": 0.40565158575078913, + "flos": 50463560001600.0, + "grad_norm": 1.5763539364848207, + "language_loss": 0.57258779, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59412754, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12268066, + "step": 6747, + "time_per_iteration": 2.864387035369873 + }, + { + "auxiliary_loss_clip": 0.01127485, + "auxiliary_loss_mlp": 0.01038159, + "balance_loss_clip": 1.04561543, + "balance_loss_mlp": 1.02597618, + "epoch": 0.4057117090034571, + "flos": 34831062020160.0, + "grad_norm": 1.7824543011781504, + "language_loss": 0.84387922, + "learning_rate": 2.693161205655089e-06, + "loss": 0.86553568, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.12194824, + "step": 6748, + "time_per_iteration": 2.71901798248291 + }, + { + "auxiliary_loss_clip": 0.01127328, + "auxiliary_loss_mlp": 0.01039925, + "balance_loss_clip": 1.04412115, + "balance_loss_mlp": 1.02650785, + "epoch": 0.40577183225612506, + "flos": 21968417782080.0, + "grad_norm": 1.783073406532177, + "language_loss": 0.81713009, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.83880258, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.13415527, + "step": 6749, + "time_per_iteration": 2.759005546569824 + }, + { + "auxiliary_loss_clip": 0.01127955, + "auxiliary_loss_mlp": 0.01037057, + "balance_loss_clip": 1.04600239, + "balance_loss_mlp": 1.02449191, + "epoch": 0.40583195550879303, + "flos": 23838671813760.0, + "grad_norm": 2.1938877519863276, + "language_loss": 0.75651377, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.77816391, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.12561035, + "step": 6750, + "time_per_iteration": 2.665832042694092 + }, + { + "auxiliary_loss_clip": 0.01128102, + "auxiliary_loss_mlp": 0.01037198, + "balance_loss_clip": 1.04254413, + "balance_loss_mlp": 1.02400732, + "epoch": 0.405892078761461, + "flos": 27222192408960.0, + "grad_norm": 2.301763457230443, + "language_loss": 0.74229884, + "learning_rate": 2.692065118669195e-06, + "loss": 0.76395184, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.13195801, + "step": 6751, + "time_per_iteration": 2.6718692779541016 + }, + { + "auxiliary_loss_clip": 0.01128048, + "auxiliary_loss_mlp": 0.01035329, + "balance_loss_clip": 1.0461787, + "balance_loss_mlp": 1.02209044, + "epoch": 0.40595220201412896, + "flos": 31096428962400.0, + "grad_norm": 2.4581369517148546, + "language_loss": 0.66864038, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.69027412, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13244629, + "step": 6752, + "time_per_iteration": 2.764754056930542 + }, + { + "auxiliary_loss_clip": 0.01133505, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.04688454, + "balance_loss_mlp": 1.02128851, + "epoch": 0.4060123252667969, + "flos": 60836610936960.0, + "grad_norm": 1.8361516193961942, + "language_loss": 0.71024126, + "learning_rate": 2.691334262772948e-06, + "loss": 0.73192656, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.13739014, + "step": 6753, + "time_per_iteration": 2.9443163871765137 + }, + { + "auxiliary_loss_clip": 0.01127254, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.04269958, + "balance_loss_mlp": 1.0206995, + "epoch": 0.4060724485194649, + "flos": 25790190808320.0, + "grad_norm": 2.062462166153724, + "language_loss": 0.72169602, + "learning_rate": 2.690968795494699e-06, + "loss": 0.74330777, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.13208008, + "step": 6754, + "time_per_iteration": 2.6441965103149414 + }, + { + "auxiliary_loss_clip": 0.01128749, + "auxiliary_loss_mlp": 0.01038339, + "balance_loss_clip": 1.04421353, + "balance_loss_mlp": 1.02563167, + "epoch": 0.40613257177213286, + "flos": 26549700816960.0, + "grad_norm": 2.1280532196409636, + "language_loss": 0.82708979, + "learning_rate": 2.690603302014844e-06, + "loss": 0.8487606, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.1270752, + "step": 6755, + "time_per_iteration": 2.697695016860962 + }, + { + "auxiliary_loss_clip": 0.0113022, + "auxiliary_loss_mlp": 0.01036485, + "balance_loss_clip": 1.04475868, + "balance_loss_mlp": 1.02261472, + "epoch": 0.4061926950248008, + "flos": 31182880137120.0, + "grad_norm": 1.6354770975527695, + "language_loss": 0.70454907, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.72621608, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.13879395, + "step": 6756, + "time_per_iteration": 2.7289414405822754 + }, + { + "auxiliary_loss_clip": 0.01129326, + "auxiliary_loss_mlp": 0.01040644, + "balance_loss_clip": 1.04325414, + "balance_loss_mlp": 1.02679801, + "epoch": 0.4062528182774688, + "flos": 28913912189280.0, + "grad_norm": 2.1340743567692293, + "language_loss": 0.78940964, + "learning_rate": 2.689872236505755e-06, + "loss": 0.8111093, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 0.86181641, + "router_z_loss_mlp": 0.1385498, + "step": 6757, + "time_per_iteration": 2.70710825920105 + }, + { + "auxiliary_loss_clip": 0.01130085, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.04701543, + "balance_loss_mlp": 1.01676083, + "epoch": 0.4063129415301368, + "flos": 26510202819360.0, + "grad_norm": 1.9930686777281892, + "language_loss": 0.78736532, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.80896378, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.13006592, + "step": 6758, + "time_per_iteration": 2.751056432723999 + }, + { + "auxiliary_loss_clip": 0.01126465, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.04501176, + "balance_loss_mlp": 1.0195756, + "epoch": 0.40637306478280477, + "flos": 15602764648320.0, + "grad_norm": 2.4032530527968845, + "language_loss": 0.88999963, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.91158736, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.12738037, + "step": 6759, + "time_per_iteration": 2.662856340408325 + }, + { + "auxiliary_loss_clip": 0.01128333, + "auxiliary_loss_mlp": 0.01030354, + "balance_loss_clip": 1.04558599, + "balance_loss_mlp": 1.0174737, + "epoch": 0.40643318803547274, + "flos": 29314976107680.0, + "grad_norm": 1.8116262530645444, + "language_loss": 0.64365923, + "learning_rate": 2.688775442076598e-06, + "loss": 0.66524607, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.12896729, + "step": 6760, + "time_per_iteration": 2.7443413734436035 + }, + { + "auxiliary_loss_clip": 0.01128409, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.04479504, + "balance_loss_mlp": 1.01669812, + "epoch": 0.4064933112881407, + "flos": 31228374692160.0, + "grad_norm": 1.9250329656934413, + "language_loss": 0.7495774, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77116299, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13452148, + "step": 6761, + "time_per_iteration": 2.697312831878662 + }, + { + "auxiliary_loss_clip": 0.01122095, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.04406595, + "balance_loss_mlp": 1.02340078, + "epoch": 0.40655343454080867, + "flos": 26910821047680.0, + "grad_norm": 1.477610211406112, + "language_loss": 0.70056951, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72215211, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12774658, + "step": 6762, + "time_per_iteration": 5.566957473754883 + }, + { + "auxiliary_loss_clip": 0.01126669, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.04522991, + "balance_loss_mlp": 1.01863194, + "epoch": 0.40661355779347663, + "flos": 32297837957280.0, + "grad_norm": 1.847754961008662, + "language_loss": 0.73338181, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.75495863, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.12371826, + "step": 6763, + "time_per_iteration": 2.7965736389160156 + }, + { + "auxiliary_loss_clip": 0.01130073, + "auxiliary_loss_mlp": 0.01033265, + "balance_loss_clip": 1.04504442, + "balance_loss_mlp": 1.01955569, + "epoch": 0.4066736810461446, + "flos": 16180701609600.0, + "grad_norm": 2.3749546006910016, + "language_loss": 0.69004703, + "learning_rate": 2.687312683911033e-06, + "loss": 0.71168041, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.13696289, + "step": 6764, + "time_per_iteration": 2.6536436080932617 + }, + { + "auxiliary_loss_clip": 0.01131875, + "auxiliary_loss_mlp": 0.01040179, + "balance_loss_clip": 1.04652131, + "balance_loss_mlp": 1.02512336, + "epoch": 0.40673380429881256, + "flos": 35146890282240.0, + "grad_norm": 2.2624665423966155, + "language_loss": 0.90951926, + "learning_rate": 2.686946929177557e-06, + "loss": 0.93123978, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.1505127, + "step": 6765, + "time_per_iteration": 2.7595012187957764 + }, + { + "auxiliary_loss_clip": 0.01132246, + "auxiliary_loss_mlp": 0.01033407, + "balance_loss_clip": 1.04496837, + "balance_loss_mlp": 1.01974511, + "epoch": 0.4067939275514805, + "flos": 15246182352960.0, + "grad_norm": 3.5082820882083867, + "language_loss": 0.78704411, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.80870062, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 0.87304688, + "router_z_loss_mlp": 0.13653564, + "step": 6766, + "time_per_iteration": 2.661129951477051 + }, + { + "auxiliary_loss_clip": 0.01128473, + "auxiliary_loss_mlp": 0.01034713, + "balance_loss_clip": 1.04290366, + "balance_loss_mlp": 1.02137947, + "epoch": 0.4068540508041485, + "flos": 22903666349760.0, + "grad_norm": 2.1125814188081278, + "language_loss": 0.76637506, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.7880069, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.13336182, + "step": 6767, + "time_per_iteration": 2.6751632690429688 + }, + { + "auxiliary_loss_clip": 0.01127581, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.04667997, + "balance_loss_mlp": 1.01910055, + "epoch": 0.40691417405681646, + "flos": 34791928678080.0, + "grad_norm": 2.368809563002208, + "language_loss": 0.77377927, + "learning_rate": 2.685849508738034e-06, + "loss": 0.79537451, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.12841797, + "step": 6768, + "time_per_iteration": 2.772359848022461 + }, + { + "auxiliary_loss_clip": 0.01126999, + "auxiliary_loss_mlp": 0.01030374, + "balance_loss_clip": 1.04441273, + "balance_loss_mlp": 1.01760101, + "epoch": 0.4069742973094844, + "flos": 25397311380480.0, + "grad_norm": 2.218707580065378, + "language_loss": 0.86880934, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.89038306, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.12774658, + "step": 6769, + "time_per_iteration": 4.091512441635132 + }, + { + "auxiliary_loss_clip": 0.01128686, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.04846931, + "balance_loss_mlp": 1.02078605, + "epoch": 0.4070344205621524, + "flos": 26198142664320.0, + "grad_norm": 1.727151697417425, + "language_loss": 0.81021863, + "learning_rate": 2.685117765051156e-06, + "loss": 0.83183563, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12213135, + "step": 6770, + "time_per_iteration": 2.714648962020874 + }, + { + "auxiliary_loss_clip": 0.01130296, + "auxiliary_loss_mlp": 0.01028982, + "balance_loss_clip": 1.04520237, + "balance_loss_mlp": 1.01504004, + "epoch": 0.4070945438148204, + "flos": 32738886080640.0, + "grad_norm": 1.8700995962154352, + "language_loss": 0.8039422, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.82553494, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.13934326, + "step": 6771, + "time_per_iteration": 2.7330071926116943 + }, + { + "auxiliary_loss_clip": 0.01126801, + "auxiliary_loss_mlp": 0.01039809, + "balance_loss_clip": 1.04443872, + "balance_loss_mlp": 1.02699995, + "epoch": 0.4071546670674884, + "flos": 32158315496160.0, + "grad_norm": 1.8798065367073344, + "language_loss": 0.76354325, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.7852093, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.12823486, + "step": 6772, + "time_per_iteration": 2.731651782989502 + }, + { + "auxiliary_loss_clip": 0.011275, + "auxiliary_loss_mlp": 0.01038681, + "balance_loss_clip": 1.04361582, + "balance_loss_mlp": 1.02486515, + "epoch": 0.40721479032015634, + "flos": 21835742741280.0, + "grad_norm": 1.8489026838893352, + "language_loss": 0.81359684, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83525866, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.13806152, + "step": 6773, + "time_per_iteration": 4.095489740371704 + }, + { + "auxiliary_loss_clip": 0.01049232, + "auxiliary_loss_mlp": 0.0100592, + "balance_loss_clip": 1.02259839, + "balance_loss_mlp": 1.00445461, + "epoch": 0.4072749135728243, + "flos": 60831633661920.0, + "grad_norm": 0.8267678823275313, + "language_loss": 0.64356673, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66411823, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 0.26635742, + "router_z_loss_mlp": 0.01464081, + "step": 6774, + "time_per_iteration": 3.203451633453369 + }, + { + "auxiliary_loss_clip": 0.01128779, + "auxiliary_loss_mlp": 0.01029429, + "balance_loss_clip": 1.04383326, + "balance_loss_mlp": 1.01631618, + "epoch": 0.40733503682549227, + "flos": 33633704753280.0, + "grad_norm": 2.4790231214564016, + "language_loss": 0.72263777, + "learning_rate": 2.683287951431446e-06, + "loss": 0.74421984, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.13128662, + "step": 6775, + "time_per_iteration": 2.7162649631500244 + }, + { + "auxiliary_loss_clip": 0.01128939, + "auxiliary_loss_mlp": 0.01041592, + "balance_loss_clip": 1.04453588, + "balance_loss_mlp": 1.02852035, + "epoch": 0.40739516007816023, + "flos": 26999500672800.0, + "grad_norm": 1.4289102357997951, + "language_loss": 0.77948189, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.80118716, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.13092041, + "step": 6776, + "time_per_iteration": 2.7505929470062256 + }, + { + "auxiliary_loss_clip": 0.01131168, + "auxiliary_loss_mlp": 0.01041486, + "balance_loss_clip": 1.04559684, + "balance_loss_mlp": 1.02799737, + "epoch": 0.4074552833308282, + "flos": 29092567992480.0, + "grad_norm": 4.106173338439407, + "language_loss": 0.78879881, + "learning_rate": 2.682555844513981e-06, + "loss": 0.81052542, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.13482666, + "step": 6777, + "time_per_iteration": 2.6966588497161865 + }, + { + "auxiliary_loss_clip": 0.01048201, + "auxiliary_loss_mlp": 0.01001726, + "balance_loss_clip": 1.02146363, + "balance_loss_mlp": 1.0003041, + "epoch": 0.40751540658349616, + "flos": 70770483894240.0, + "grad_norm": 0.6890321450900512, + "language_loss": 0.53161299, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55211222, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 0.01422882, + "step": 6778, + "time_per_iteration": 3.316371202468872 + }, + { + "auxiliary_loss_clip": 0.01129352, + "auxiliary_loss_mlp": 0.01041885, + "balance_loss_clip": 1.04658294, + "balance_loss_mlp": 1.02805102, + "epoch": 0.40757552983616413, + "flos": 25886244578400.0, + "grad_norm": 2.298656377774213, + "language_loss": 0.8245759, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.8462882, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.1383667, + "step": 6779, + "time_per_iteration": 2.6628477573394775 + }, + { + "auxiliary_loss_clip": 0.01128031, + "auxiliary_loss_mlp": 0.01039065, + "balance_loss_clip": 1.04496789, + "balance_loss_mlp": 1.02559483, + "epoch": 0.4076356530888321, + "flos": 32742816256800.0, + "grad_norm": 1.6520259313268013, + "language_loss": 0.76369202, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78536296, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13470459, + "step": 6780, + "time_per_iteration": 2.7554233074188232 + }, + { + "auxiliary_loss_clip": 0.01124313, + "auxiliary_loss_mlp": 0.01033474, + "balance_loss_clip": 1.04455328, + "balance_loss_mlp": 1.02109969, + "epoch": 0.40769577634150006, + "flos": 14889519023040.0, + "grad_norm": 2.212574843481533, + "language_loss": 0.66124487, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.68282276, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12365723, + "step": 6781, + "time_per_iteration": 2.646373987197876 + }, + { + "auxiliary_loss_clip": 0.01128347, + "auxiliary_loss_mlp": 0.01036807, + "balance_loss_clip": 1.04496062, + "balance_loss_mlp": 1.02281201, + "epoch": 0.407755899594168, + "flos": 41067322012800.0, + "grad_norm": 1.8825960226200067, + "language_loss": 0.71607089, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.73772252, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13995361, + "step": 6782, + "time_per_iteration": 2.8141634464263916 + }, + { + "auxiliary_loss_clip": 0.01129828, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.04541337, + "balance_loss_mlp": 1.02131999, + "epoch": 0.407816022846836, + "flos": 24595953372000.0, + "grad_norm": 1.7110143859449822, + "language_loss": 0.82108271, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.84272403, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.12988281, + "step": 6783, + "time_per_iteration": 2.7689948081970215 + }, + { + "auxiliary_loss_clip": 0.01128087, + "auxiliary_loss_mlp": 0.0103895, + "balance_loss_clip": 1.04495025, + "balance_loss_mlp": 1.0249722, + "epoch": 0.40787614609950396, + "flos": 25842451749120.0, + "grad_norm": 1.5238388465852153, + "language_loss": 0.80468154, + "learning_rate": 2.679992655730283e-06, + "loss": 0.82635194, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13983154, + "step": 6784, + "time_per_iteration": 2.6748244762420654 + }, + { + "auxiliary_loss_clip": 0.01133538, + "auxiliary_loss_mlp": 0.01039461, + "balance_loss_clip": 1.04573607, + "balance_loss_mlp": 1.02454805, + "epoch": 0.407936269352172, + "flos": 25039432532160.0, + "grad_norm": 2.0804309350295735, + "language_loss": 0.66167045, + "learning_rate": 2.679626382651386e-06, + "loss": 0.68340045, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 0.87744141, + "router_z_loss_mlp": 0.14916992, + "step": 6785, + "time_per_iteration": 2.7434749603271484 + }, + { + "auxiliary_loss_clip": 0.01127127, + "auxiliary_loss_mlp": 0.01035768, + "balance_loss_clip": 1.04410529, + "balance_loss_mlp": 1.02292323, + "epoch": 0.40799639260483994, + "flos": 24549121746720.0, + "grad_norm": 1.9487133764524727, + "language_loss": 0.79708934, + "learning_rate": 2.679260083800989e-06, + "loss": 0.81871831, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.128479, + "step": 6786, + "time_per_iteration": 2.6432945728302 + }, + { + "auxiliary_loss_clip": 0.01129023, + "auxiliary_loss_mlp": 0.010365, + "balance_loss_clip": 1.0464232, + "balance_loss_mlp": 1.02429271, + "epoch": 0.4080565158575079, + "flos": 25620894496800.0, + "grad_norm": 1.7669724617012683, + "language_loss": 0.81721675, + "learning_rate": 2.678893759192982e-06, + "loss": 0.83887196, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.12207031, + "step": 6787, + "time_per_iteration": 2.7240169048309326 + }, + { + "auxiliary_loss_clip": 0.01124247, + "auxiliary_loss_mlp": 0.01036552, + "balance_loss_clip": 1.04342246, + "balance_loss_mlp": 1.02355874, + "epoch": 0.40811663911017587, + "flos": 23571417420000.0, + "grad_norm": 1.884997134095727, + "language_loss": 0.67663705, + "learning_rate": 2.678527408841255e-06, + "loss": 0.69824505, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13000488, + "step": 6788, + "time_per_iteration": 2.645143747329712 + }, + { + "auxiliary_loss_clip": 0.01126541, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.04431581, + "balance_loss_mlp": 1.02734244, + "epoch": 0.40817676236284384, + "flos": 49572671505120.0, + "grad_norm": 1.9448114250570097, + "language_loss": 0.66135633, + "learning_rate": 2.678161032759701e-06, + "loss": 0.68303269, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.13751221, + "step": 6789, + "time_per_iteration": 2.8777787685394287 + }, + { + "auxiliary_loss_clip": 0.0112734, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.04440689, + "balance_loss_mlp": 1.01929069, + "epoch": 0.4082368856155118, + "flos": 25041053223360.0, + "grad_norm": 2.2479231252636214, + "language_loss": 0.60785758, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.62945926, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.13531494, + "step": 6790, + "time_per_iteration": 2.6369035243988037 + }, + { + "auxiliary_loss_clip": 0.01130141, + "auxiliary_loss_mlp": 0.01036865, + "balance_loss_clip": 1.04854131, + "balance_loss_mlp": 1.02341843, + "epoch": 0.40829700886817977, + "flos": 13944303204480.0, + "grad_norm": 3.747776828657643, + "language_loss": 0.69922948, + "learning_rate": 2.677428203462683e-06, + "loss": 0.72089958, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13452148, + "step": 6791, + "time_per_iteration": 2.6413371562957764 + }, + { + "auxiliary_loss_clip": 0.01044015, + "auxiliary_loss_mlp": 0.01002378, + "balance_loss_clip": 1.01763201, + "balance_loss_mlp": 1.00089848, + "epoch": 0.40835713212084773, + "flos": 82157374219680.0, + "grad_norm": 0.7499467905643251, + "language_loss": 0.59640121, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61686516, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 0.26416016, + "router_z_loss_mlp": 0.01477814, + "step": 6792, + "time_per_iteration": 3.2986462116241455 + }, + { + "auxiliary_loss_clip": 0.01133346, + "auxiliary_loss_mlp": 0.01046066, + "balance_loss_clip": 1.04904974, + "balance_loss_mlp": 1.03153408, + "epoch": 0.4084172553735157, + "flos": 26554157717760.0, + "grad_norm": 1.7471422460642185, + "language_loss": 0.79974949, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.82154363, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.14538574, + "step": 6793, + "time_per_iteration": 2.681130886077881 + }, + { + "auxiliary_loss_clip": 0.01130087, + "auxiliary_loss_mlp": 0.01035082, + "balance_loss_clip": 1.04551792, + "balance_loss_mlp": 1.0212121, + "epoch": 0.40847737862618366, + "flos": 33455129984640.0, + "grad_norm": 1.7841411667674365, + "language_loss": 0.85092485, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87257648, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.13867188, + "step": 6794, + "time_per_iteration": 2.739809274673462 + }, + { + "auxiliary_loss_clip": 0.01129605, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.04676199, + "balance_loss_mlp": 1.02480674, + "epoch": 0.4085375018788516, + "flos": 22680123750720.0, + "grad_norm": 2.0413984736364243, + "language_loss": 0.80043173, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.82210898, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.13317871, + "step": 6795, + "time_per_iteration": 2.675640821456909 + }, + { + "auxiliary_loss_clip": 0.01131811, + "auxiliary_loss_mlp": 0.0103633, + "balance_loss_clip": 1.0450325, + "balance_loss_mlp": 1.02201843, + "epoch": 0.4085976251315196, + "flos": 18805279438080.0, + "grad_norm": 2.6018071554221143, + "language_loss": 0.6981324, + "learning_rate": 2.675595680920792e-06, + "loss": 0.71981382, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 0.86816406, + "router_z_loss_mlp": 0.14331055, + "step": 6796, + "time_per_iteration": 2.6674444675445557 + }, + { + "auxiliary_loss_clip": 0.01127571, + "auxiliary_loss_mlp": 0.01037327, + "balance_loss_clip": 1.04465294, + "balance_loss_mlp": 1.02433324, + "epoch": 0.40865774838418756, + "flos": 25931982237120.0, + "grad_norm": 2.041779743188373, + "language_loss": 0.77831417, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.79996312, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.13000488, + "step": 6797, + "time_per_iteration": 2.7626898288726807 + }, + { + "auxiliary_loss_clip": 0.011274, + "auxiliary_loss_mlp": 0.01046116, + "balance_loss_clip": 1.0439918, + "balance_loss_mlp": 1.0328238, + "epoch": 0.4087178716368556, + "flos": 16804376229600.0, + "grad_norm": 2.18820856729773, + "language_loss": 0.85748124, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.87921643, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13293457, + "step": 6798, + "time_per_iteration": 2.770535707473755 + }, + { + "auxiliary_loss_clip": 0.01125011, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.04516041, + "balance_loss_mlp": 1.02272785, + "epoch": 0.40877799488952354, + "flos": 28823652390240.0, + "grad_norm": 1.4549113485982172, + "language_loss": 0.84197199, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86356974, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.12036133, + "step": 6799, + "time_per_iteration": 2.6791775226593018 + }, + { + "auxiliary_loss_clip": 0.01129597, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.04657745, + "balance_loss_mlp": 1.02314425, + "epoch": 0.4088381181421915, + "flos": 25525570037760.0, + "grad_norm": 2.051882541773208, + "language_loss": 0.83052135, + "learning_rate": 2.6741292016681e-06, + "loss": 0.85218883, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.14001465, + "step": 6800, + "time_per_iteration": 2.6700143814086914 + }, + { + "auxiliary_loss_clip": 0.01129071, + "auxiliary_loss_mlp": 0.01040794, + "balance_loss_clip": 1.04505384, + "balance_loss_mlp": 1.02732921, + "epoch": 0.4088982413948595, + "flos": 16225183232640.0, + "grad_norm": 2.0070861889371754, + "language_loss": 0.74761337, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.76931202, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13464355, + "step": 6801, + "time_per_iteration": 2.6558117866516113 + }, + { + "auxiliary_loss_clip": 0.01129742, + "auxiliary_loss_mlp": 0.01034278, + "balance_loss_clip": 1.04537272, + "balance_loss_mlp": 1.02053308, + "epoch": 0.40895836464752744, + "flos": 18629257258080.0, + "grad_norm": 2.052568489678103, + "language_loss": 0.80169034, + "learning_rate": 2.673395808607861e-06, + "loss": 0.82333052, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.13745117, + "step": 6802, + "time_per_iteration": 5.529289245605469 + }, + { + "auxiliary_loss_clip": 0.01132013, + "auxiliary_loss_mlp": 0.01038635, + "balance_loss_clip": 1.04659033, + "balance_loss_mlp": 1.02391279, + "epoch": 0.4090184879001954, + "flos": 17694211276800.0, + "grad_norm": 2.2025170593639762, + "language_loss": 0.76164514, + "learning_rate": 2.673029073767934e-06, + "loss": 0.7833516, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.1472168, + "step": 6803, + "time_per_iteration": 2.676882266998291 + }, + { + "auxiliary_loss_clip": 0.01129159, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.04594481, + "balance_loss_mlp": 1.02169228, + "epoch": 0.40907861115286337, + "flos": 16937821098720.0, + "grad_norm": 1.810241882189324, + "language_loss": 0.78045619, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.80209482, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13012695, + "step": 6804, + "time_per_iteration": 2.6576340198516846 + }, + { + "auxiliary_loss_clip": 0.01132088, + "auxiliary_loss_mlp": 0.01039771, + "balance_loss_clip": 1.04468226, + "balance_loss_mlp": 1.02683711, + "epoch": 0.40913873440553133, + "flos": 34212127921920.0, + "grad_norm": 1.7902833695965525, + "language_loss": 0.75439119, + "learning_rate": 2.672295527537998e-06, + "loss": 0.77610981, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 0.87353516, + "router_z_loss_mlp": 0.12945557, + "step": 6805, + "time_per_iteration": 2.706704616546631 + }, + { + "auxiliary_loss_clip": 0.0113148, + "auxiliary_loss_mlp": 0.01038244, + "balance_loss_clip": 1.04675698, + "balance_loss_mlp": 1.02530408, + "epoch": 0.4091988576581993, + "flos": 26378459676000.0, + "grad_norm": 1.6633613269253749, + "language_loss": 0.79371762, + "learning_rate": 2.671928716175804e-06, + "loss": 0.81541485, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.1293335, + "step": 6806, + "time_per_iteration": 2.72554612159729 + }, + { + "auxiliary_loss_clip": 0.01129281, + "auxiliary_loss_mlp": 0.0103135, + "balance_loss_clip": 1.04469216, + "balance_loss_mlp": 1.01829624, + "epoch": 0.40925898091086726, + "flos": 30779223112800.0, + "grad_norm": 2.2822367776478645, + "language_loss": 0.71229887, + "learning_rate": 2.671561879334007e-06, + "loss": 0.7339052, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.1305542, + "step": 6807, + "time_per_iteration": 2.6743175983428955 + }, + { + "auxiliary_loss_clip": 0.0104211, + "auxiliary_loss_mlp": 0.01000166, + "balance_loss_clip": 1.01574171, + "balance_loss_mlp": 0.99866527, + "epoch": 0.40931910416353523, + "flos": 84110189767200.0, + "grad_norm": 0.8244519559854083, + "language_loss": 0.588287, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.60870969, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 0.2644043, + "router_z_loss_mlp": 0.01499939, + "step": 6808, + "time_per_iteration": 4.832051038742065 + }, + { + "auxiliary_loss_clip": 0.01127317, + "auxiliary_loss_mlp": 0.01036467, + "balance_loss_clip": 1.04534388, + "balance_loss_mlp": 1.02455211, + "epoch": 0.4093792274162032, + "flos": 24635653956000.0, + "grad_norm": 1.7308205901731835, + "language_loss": 0.54262918, + "learning_rate": 2.670828129267242e-06, + "loss": 0.56426704, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.11920166, + "step": 6809, + "time_per_iteration": 2.702357769012451 + }, + { + "auxiliary_loss_clip": 0.01127941, + "auxiliary_loss_mlp": 0.01027481, + "balance_loss_clip": 1.04648995, + "balance_loss_mlp": 1.01522684, + "epoch": 0.40943935066887116, + "flos": 30784490359200.0, + "grad_norm": 3.054588540488955, + "language_loss": 0.8324784, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85403264, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.12268066, + "step": 6810, + "time_per_iteration": 2.691538095474243 + }, + { + "auxiliary_loss_clip": 0.01132874, + "auxiliary_loss_mlp": 0.01036098, + "balance_loss_clip": 1.04905629, + "balance_loss_mlp": 1.02185225, + "epoch": 0.4094994739215392, + "flos": 28376648226720.0, + "grad_norm": 3.2032558405858036, + "language_loss": 0.77688789, + "learning_rate": 2.670094277448999e-06, + "loss": 0.79857761, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.14233398, + "step": 6811, + "time_per_iteration": 2.710202217102051 + }, + { + "auxiliary_loss_clip": 0.01130296, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.04612207, + "balance_loss_mlp": 1.01812911, + "epoch": 0.40955959717420715, + "flos": 21209677601760.0, + "grad_norm": 1.6900848492773444, + "language_loss": 0.70000106, + "learning_rate": 2.669727313417857e-06, + "loss": 0.7216233, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.13806152, + "step": 6812, + "time_per_iteration": 2.6239776611328125 + }, + { + "auxiliary_loss_clip": 0.01125332, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.04375958, + "balance_loss_mlp": 1.02223635, + "epoch": 0.4096197204268751, + "flos": 30605956107840.0, + "grad_norm": 2.368645671748739, + "language_loss": 0.6623593, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68397081, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13586426, + "step": 6813, + "time_per_iteration": 4.133495092391968 + }, + { + "auxiliary_loss_clip": 0.01127663, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.04580402, + "balance_loss_mlp": 1.01980805, + "epoch": 0.4096798436795431, + "flos": 37323248428800.0, + "grad_norm": 2.6506568195417834, + "language_loss": 0.73441243, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.75601983, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.1328125, + "step": 6814, + "time_per_iteration": 2.7774243354797363 + }, + { + "auxiliary_loss_clip": 0.0113086, + "auxiliary_loss_mlp": 0.0103176, + "balance_loss_clip": 1.04616475, + "balance_loss_mlp": 1.01868904, + "epoch": 0.40973996693221104, + "flos": 29448177873120.0, + "grad_norm": 2.04014409569925, + "language_loss": 0.66124105, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68286729, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.13079834, + "step": 6815, + "time_per_iteration": 2.6807026863098145 + }, + { + "auxiliary_loss_clip": 0.011273, + "auxiliary_loss_mlp": 0.01037847, + "balance_loss_clip": 1.0485673, + "balance_loss_mlp": 1.0251509, + "epoch": 0.409800090184879, + "flos": 29271547933920.0, + "grad_norm": 1.6582771435527663, + "language_loss": 0.76743954, + "learning_rate": 2.668259203471188e-06, + "loss": 0.78909099, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12701416, + "step": 6816, + "time_per_iteration": 2.727837085723877 + }, + { + "auxiliary_loss_clip": 0.01130357, + "auxiliary_loss_mlp": 0.01034856, + "balance_loss_clip": 1.04765058, + "balance_loss_mlp": 1.02198708, + "epoch": 0.40986021343754697, + "flos": 19698193798560.0, + "grad_norm": 2.1229953862836406, + "language_loss": 0.81405008, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.83570218, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.12866211, + "step": 6817, + "time_per_iteration": 2.6264631748199463 + }, + { + "auxiliary_loss_clip": 0.01132797, + "auxiliary_loss_mlp": 0.0103831, + "balance_loss_clip": 1.04699612, + "balance_loss_mlp": 1.02377796, + "epoch": 0.40992033669021494, + "flos": 30250103123520.0, + "grad_norm": 1.8755663477206146, + "language_loss": 0.80046862, + "learning_rate": 2.667524996399444e-06, + "loss": 0.82217968, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 0.85888672, + "router_z_loss_mlp": 0.14532471, + "step": 6818, + "time_per_iteration": 2.695265769958496 + }, + { + "auxiliary_loss_clip": 0.0112561, + "auxiliary_loss_mlp": 0.0103534, + "balance_loss_clip": 1.04439569, + "balance_loss_mlp": 1.02256072, + "epoch": 0.4099804599428829, + "flos": 36169886577600.0, + "grad_norm": 1.6515202620359652, + "language_loss": 0.6625219, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68413138, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12786865, + "step": 6819, + "time_per_iteration": 2.7456181049346924 + }, + { + "auxiliary_loss_clip": 0.01132574, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.04591513, + "balance_loss_mlp": 1.0223887, + "epoch": 0.41004058319555087, + "flos": 30293814918240.0, + "grad_norm": 1.7144501927784603, + "language_loss": 0.8515799, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.87326944, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 0.14013672, + "step": 6820, + "time_per_iteration": 2.7173399925231934 + }, + { + "auxiliary_loss_clip": 0.01126321, + "auxiliary_loss_mlp": 0.01029895, + "balance_loss_clip": 1.04588473, + "balance_loss_mlp": 1.01711607, + "epoch": 0.41010070644821883, + "flos": 31405450321440.0, + "grad_norm": 1.8858245686646447, + "language_loss": 0.70991015, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.73147225, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12774658, + "step": 6821, + "time_per_iteration": 2.6816515922546387 + }, + { + "auxiliary_loss_clip": 0.01128086, + "auxiliary_loss_mlp": 0.01032207, + "balance_loss_clip": 1.04602909, + "balance_loss_mlp": 1.01983941, + "epoch": 0.4101608297008868, + "flos": 27266836101120.0, + "grad_norm": 1.9929009758753178, + "language_loss": 0.74742055, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.76902342, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.12371826, + "step": 6822, + "time_per_iteration": 2.719846487045288 + }, + { + "auxiliary_loss_clip": 0.01128734, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.04656971, + "balance_loss_mlp": 1.0198226, + "epoch": 0.41022095295355476, + "flos": 26777497730400.0, + "grad_norm": 2.251690280740331, + "language_loss": 0.75837934, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.77999932, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13446045, + "step": 6823, + "time_per_iteration": 2.7234764099121094 + }, + { + "auxiliary_loss_clip": 0.01134281, + "auxiliary_loss_mlp": 0.01038441, + "balance_loss_clip": 1.04650164, + "balance_loss_mlp": 1.02429044, + "epoch": 0.4102810762062228, + "flos": 33496005569760.0, + "grad_norm": 2.061966800030391, + "language_loss": 0.72919786, + "learning_rate": 2.665321768127001e-06, + "loss": 0.75092506, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 0.87744141, + "router_z_loss_mlp": 0.14160156, + "step": 6824, + "time_per_iteration": 2.7265546321868896 + }, + { + "auxiliary_loss_clip": 0.01130389, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.04442763, + "balance_loss_mlp": 1.01918817, + "epoch": 0.41034119945889075, + "flos": 29894290656480.0, + "grad_norm": 1.8567570971737788, + "language_loss": 0.7168023, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.73843372, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.13568115, + "step": 6825, + "time_per_iteration": 2.7552196979522705 + }, + { + "auxiliary_loss_clip": 0.01127385, + "auxiliary_loss_mlp": 0.0103902, + "balance_loss_clip": 1.04588461, + "balance_loss_mlp": 1.02707505, + "epoch": 0.4104013227115587, + "flos": 29715432266880.0, + "grad_norm": 1.7807299311158051, + "language_loss": 0.84607565, + "learning_rate": 2.664587156721768e-06, + "loss": 0.86773968, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.11950684, + "step": 6826, + "time_per_iteration": 2.68497633934021 + }, + { + "auxiliary_loss_clip": 0.0112535, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.04536009, + "balance_loss_mlp": 1.02205515, + "epoch": 0.4104614459642267, + "flos": 28955111912640.0, + "grad_norm": 1.8240438137659707, + "language_loss": 0.66973853, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.6913448, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13226318, + "step": 6827, + "time_per_iteration": 2.7241384983062744 + }, + { + "auxiliary_loss_clip": 0.01124162, + "auxiliary_loss_mlp": 0.01028466, + "balance_loss_clip": 1.04373264, + "balance_loss_mlp": 1.01590693, + "epoch": 0.41052156921689464, + "flos": 27000635156640.0, + "grad_norm": 1.8903913496821838, + "language_loss": 0.72041309, + "learning_rate": 2.663852444511689e-06, + "loss": 0.74193937, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12573242, + "step": 6828, + "time_per_iteration": 2.66563081741333 + }, + { + "auxiliary_loss_clip": 0.01131465, + "auxiliary_loss_mlp": 0.01041081, + "balance_loss_clip": 1.04562187, + "balance_loss_mlp": 1.02700281, + "epoch": 0.4105816924695626, + "flos": 24506909091360.0, + "grad_norm": 1.820416998054516, + "language_loss": 0.83445698, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.85618246, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.14086914, + "step": 6829, + "time_per_iteration": 2.699317216873169 + }, + { + "auxiliary_loss_clip": 0.01125169, + "auxiliary_loss_mlp": 0.01033666, + "balance_loss_clip": 1.04394197, + "balance_loss_mlp": 1.02101207, + "epoch": 0.4106418157222306, + "flos": 22057178441760.0, + "grad_norm": 1.518782457657793, + "language_loss": 0.89920688, + "learning_rate": 2.663117631608206e-06, + "loss": 0.9207952, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.12646484, + "step": 6830, + "time_per_iteration": 2.650867223739624 + }, + { + "auxiliary_loss_clip": 0.01129244, + "auxiliary_loss_mlp": 0.01028742, + "balance_loss_clip": 1.04686546, + "balance_loss_mlp": 1.01595688, + "epoch": 0.41070193897489854, + "flos": 26415283533120.0, + "grad_norm": 1.9077587656283208, + "language_loss": 0.65308356, + "learning_rate": 2.662750187431268e-06, + "loss": 0.67466342, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.12780762, + "step": 6831, + "time_per_iteration": 2.729674816131592 + }, + { + "auxiliary_loss_clip": 0.01124674, + "auxiliary_loss_mlp": 0.01034943, + "balance_loss_clip": 1.04433966, + "balance_loss_mlp": 1.02268839, + "epoch": 0.4107620622275665, + "flos": 32516923655520.0, + "grad_norm": 1.8530443302060333, + "language_loss": 0.69373721, + "learning_rate": 2.662382718122776e-06, + "loss": 0.71533334, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12261963, + "step": 6832, + "time_per_iteration": 2.67085337638855 + }, + { + "auxiliary_loss_clip": 0.01123117, + "auxiliary_loss_mlp": 0.01030139, + "balance_loss_clip": 1.04316831, + "balance_loss_mlp": 1.01783633, + "epoch": 0.41082218548023447, + "flos": 22859468347680.0, + "grad_norm": 2.2216784629345674, + "language_loss": 0.74170351, + "learning_rate": 2.662015223696666e-06, + "loss": 0.76323611, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12316895, + "step": 6833, + "time_per_iteration": 2.6730029582977295 + }, + { + "auxiliary_loss_clip": 0.01130384, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.04471135, + "balance_loss_mlp": 1.02152801, + "epoch": 0.41088230873290243, + "flos": 27935154413280.0, + "grad_norm": 3.846134201678985, + "language_loss": 0.72813362, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.74979448, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 0.85546875, + "router_z_loss_mlp": 0.14172363, + "step": 6834, + "time_per_iteration": 2.6715307235717773 + }, + { + "auxiliary_loss_clip": 0.01129091, + "auxiliary_loss_mlp": 0.01040846, + "balance_loss_clip": 1.04445314, + "balance_loss_mlp": 1.02790546, + "epoch": 0.4109424319855704, + "flos": 29623349190240.0, + "grad_norm": 2.2330162995468053, + "language_loss": 0.71352828, + "learning_rate": 2.661280159547329e-06, + "loss": 0.73522764, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.12939453, + "step": 6835, + "time_per_iteration": 2.677339553833008 + }, + { + "auxiliary_loss_clip": 0.01129086, + "auxiliary_loss_mlp": 0.01034531, + "balance_loss_clip": 1.04549813, + "balance_loss_mlp": 1.02058315, + "epoch": 0.41100255523823837, + "flos": 15824402935200.0, + "grad_norm": 2.067676686650788, + "language_loss": 0.87261766, + "learning_rate": 2.660912589851978e-06, + "loss": 0.89425379, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.1395874, + "step": 6836, + "time_per_iteration": 2.6614489555358887 + }, + { + "auxiliary_loss_clip": 0.01125978, + "auxiliary_loss_mlp": 0.01033428, + "balance_loss_clip": 1.04565763, + "balance_loss_mlp": 1.02037418, + "epoch": 0.4110626784909064, + "flos": 28242838702080.0, + "grad_norm": 1.8305823493552373, + "language_loss": 0.6899904, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.71158445, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13067627, + "step": 6837, + "time_per_iteration": 2.6462597846984863 + }, + { + "auxiliary_loss_clip": 0.01127146, + "auxiliary_loss_mlp": 0.01034716, + "balance_loss_clip": 1.04434407, + "balance_loss_mlp": 1.02148342, + "epoch": 0.41112280174357435, + "flos": 27756579644640.0, + "grad_norm": 2.128538619725729, + "language_loss": 0.75241554, + "learning_rate": 2.660177375289599e-06, + "loss": 0.77403414, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13250732, + "step": 6838, + "time_per_iteration": 2.681211471557617 + }, + { + "auxiliary_loss_clip": 0.01125335, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.04401088, + "balance_loss_mlp": 1.02108622, + "epoch": 0.4111829249962423, + "flos": 25749517809600.0, + "grad_norm": 1.9438048877942318, + "language_loss": 0.82202649, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84362477, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.1340332, + "step": 6839, + "time_per_iteration": 2.6475653648376465 + }, + { + "auxiliary_loss_clip": 0.01124437, + "auxiliary_loss_mlp": 0.01033569, + "balance_loss_clip": 1.04265082, + "balance_loss_mlp": 1.02091503, + "epoch": 0.4112430482489103, + "flos": 26241570838080.0, + "grad_norm": 1.8564728009419422, + "language_loss": 0.8032577, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.82483774, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.12652588, + "step": 6840, + "time_per_iteration": 2.683366537094116 + }, + { + "auxiliary_loss_clip": 0.0112314, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.04372072, + "balance_loss_mlp": 1.01745665, + "epoch": 0.41130317150157825, + "flos": 23879587916160.0, + "grad_norm": 1.9201702384815422, + "language_loss": 0.68104345, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.7025708, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12158203, + "step": 6841, + "time_per_iteration": 4.094921112060547 + }, + { + "auxiliary_loss_clip": 0.01048068, + "auxiliary_loss_mlp": 0.01013092, + "balance_loss_clip": 1.02162623, + "balance_loss_mlp": 1.01172924, + "epoch": 0.4113632947542462, + "flos": 76118691600000.0, + "grad_norm": 0.7637712556383307, + "language_loss": 0.59704506, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61765665, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 0.26513672, + "router_z_loss_mlp": 0.01364136, + "step": 6842, + "time_per_iteration": 4.901157855987549 + }, + { + "auxiliary_loss_clip": 0.01123263, + "auxiliary_loss_mlp": 0.01033259, + "balance_loss_clip": 1.04454851, + "balance_loss_mlp": 1.02118301, + "epoch": 0.4114234180069142, + "flos": 16982950998240.0, + "grad_norm": 2.2481462460879613, + "language_loss": 0.69961405, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.72117925, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12072754, + "step": 6843, + "time_per_iteration": 2.7065725326538086 + }, + { + "auxiliary_loss_clip": 0.01046866, + "auxiliary_loss_mlp": 0.01007147, + "balance_loss_clip": 1.02051997, + "balance_loss_mlp": 1.00583482, + "epoch": 0.41148354125958214, + "flos": 79225517275200.0, + "grad_norm": 0.7226276890494644, + "language_loss": 0.53630805, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55684823, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 0.26391602, + "router_z_loss_mlp": 0.01313019, + "step": 6844, + "time_per_iteration": 3.23414945602417 + }, + { + "auxiliary_loss_clip": 0.01124016, + "auxiliary_loss_mlp": 0.01036316, + "balance_loss_clip": 1.044765, + "balance_loss_mlp": 1.02385283, + "epoch": 0.4115436645122501, + "flos": 22853228686560.0, + "grad_norm": 1.8204908228702692, + "language_loss": 0.66096759, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68257087, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12475586, + "step": 6845, + "time_per_iteration": 2.659701347351074 + }, + { + "auxiliary_loss_clip": 0.01125346, + "auxiliary_loss_mlp": 0.01029845, + "balance_loss_clip": 1.04687762, + "balance_loss_mlp": 1.01739347, + "epoch": 0.41160378776491807, + "flos": 19831517115840.0, + "grad_norm": 2.022123577140635, + "language_loss": 0.70477235, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72632432, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12445068, + "step": 6846, + "time_per_iteration": 2.7489655017852783 + }, + { + "auxiliary_loss_clip": 0.01123333, + "auxiliary_loss_mlp": 0.0103329, + "balance_loss_clip": 1.04285908, + "balance_loss_mlp": 1.02029622, + "epoch": 0.41166391101758604, + "flos": 34129161233280.0, + "grad_norm": 1.3967866667462974, + "language_loss": 0.65048265, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.67204893, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13000488, + "step": 6847, + "time_per_iteration": 4.206215143203735 + }, + { + "auxiliary_loss_clip": 0.01124097, + "auxiliary_loss_mlp": 0.01038956, + "balance_loss_clip": 1.04274011, + "balance_loss_mlp": 1.02599239, + "epoch": 0.411724034270254, + "flos": 41648054666400.0, + "grad_norm": 1.6499466231444915, + "language_loss": 0.70812941, + "learning_rate": 2.656499802669069e-06, + "loss": 0.72975993, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.12988281, + "step": 6848, + "time_per_iteration": 2.7632718086242676 + }, + { + "auxiliary_loss_clip": 0.01045674, + "auxiliary_loss_mlp": 0.01000405, + "balance_loss_clip": 1.01895237, + "balance_loss_mlp": 0.99896824, + "epoch": 0.41178415752292197, + "flos": 82880141405760.0, + "grad_norm": 0.8923569903404626, + "language_loss": 0.5628652, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.58332598, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 0.2668457, + "router_z_loss_mlp": 0.01435852, + "step": 6849, + "time_per_iteration": 3.357913017272949 + }, + { + "auxiliary_loss_clip": 0.01126581, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.04670548, + "balance_loss_mlp": 1.01894236, + "epoch": 0.41184428077558993, + "flos": 41870908471680.0, + "grad_norm": 1.5925354708165462, + "language_loss": 0.75893492, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.78052151, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13134766, + "step": 6850, + "time_per_iteration": 2.7667012214660645 + }, + { + "auxiliary_loss_clip": 0.01123222, + "auxiliary_loss_mlp": 0.0103027, + "balance_loss_clip": 1.04367256, + "balance_loss_mlp": 1.01760375, + "epoch": 0.41190440402825795, + "flos": 43250122406880.0, + "grad_norm": 1.5182587528802742, + "language_loss": 0.68047786, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.70201278, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12670898, + "step": 6851, + "time_per_iteration": 2.7833805084228516 + }, + { + "auxiliary_loss_clip": 0.01130952, + "auxiliary_loss_mlp": 0.01042195, + "balance_loss_clip": 1.04584384, + "balance_loss_mlp": 1.02741933, + "epoch": 0.4119645272809259, + "flos": 25441104209760.0, + "grad_norm": 2.335678251931197, + "language_loss": 0.79311204, + "learning_rate": 2.655028075792743e-06, + "loss": 0.81484354, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.14788818, + "step": 6852, + "time_per_iteration": 4.080576181411743 + }, + { + "auxiliary_loss_clip": 0.01130444, + "auxiliary_loss_mlp": 0.01032372, + "balance_loss_clip": 1.04568172, + "balance_loss_mlp": 1.01887774, + "epoch": 0.4120246505335939, + "flos": 33631921992960.0, + "grad_norm": 2.102667011162146, + "language_loss": 0.77766019, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.79928839, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.13513184, + "step": 6853, + "time_per_iteration": 2.707627296447754 + }, + { + "auxiliary_loss_clip": 0.01132764, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.04750729, + "balance_loss_mlp": 1.02320719, + "epoch": 0.41208477378626185, + "flos": 46143818424000.0, + "grad_norm": 1.916350672463684, + "language_loss": 0.65642786, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.67812693, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 0.85253906, + "router_z_loss_mlp": 0.1394043, + "step": 6854, + "time_per_iteration": 2.8046011924743652 + }, + { + "auxiliary_loss_clip": 0.01124407, + "auxiliary_loss_mlp": 0.01031841, + "balance_loss_clip": 1.04412043, + "balance_loss_mlp": 1.01954412, + "epoch": 0.4121448970389298, + "flos": 28602662379840.0, + "grad_norm": 1.6766075849284805, + "language_loss": 0.83184528, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.8534078, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.1229248, + "step": 6855, + "time_per_iteration": 2.6366500854492188 + }, + { + "auxiliary_loss_clip": 0.01124156, + "auxiliary_loss_mlp": 0.01038201, + "balance_loss_clip": 1.0452075, + "balance_loss_mlp": 1.02603602, + "epoch": 0.4122050202915978, + "flos": 26021148069600.0, + "grad_norm": 1.6655026780853879, + "language_loss": 0.79014218, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.81176579, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12158203, + "step": 6856, + "time_per_iteration": 2.680675983428955 + }, + { + "auxiliary_loss_clip": 0.01129711, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.04750121, + "balance_loss_mlp": 1.01867247, + "epoch": 0.41226514354426574, + "flos": 21117351421440.0, + "grad_norm": 2.3951214564797105, + "language_loss": 0.79427594, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.81588405, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.12420654, + "step": 6857, + "time_per_iteration": 2.59657883644104 + }, + { + "auxiliary_loss_clip": 0.01125986, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.04296625, + "balance_loss_mlp": 1.02238297, + "epoch": 0.4123252667969337, + "flos": 21522791206080.0, + "grad_norm": 1.9145774090771996, + "language_loss": 0.70730007, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.72891665, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13299561, + "step": 6858, + "time_per_iteration": 2.7012829780578613 + }, + { + "auxiliary_loss_clip": 0.01125666, + "auxiliary_loss_mlp": 0.01039549, + "balance_loss_clip": 1.04526031, + "balance_loss_mlp": 1.02625096, + "epoch": 0.4123853900496017, + "flos": 56649746986560.0, + "grad_norm": 1.566578805846834, + "language_loss": 0.59102869, + "learning_rate": 2.652451598005391e-06, + "loss": 0.61268085, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13287354, + "step": 6859, + "time_per_iteration": 2.898758888244629 + }, + { + "auxiliary_loss_clip": 0.01127834, + "auxiliary_loss_mlp": 0.01039547, + "balance_loss_clip": 1.04431283, + "balance_loss_mlp": 1.02639842, + "epoch": 0.41244551330226964, + "flos": 21567110760000.0, + "grad_norm": 2.3658149782751243, + "language_loss": 0.7350499, + "learning_rate": 2.652083430674264e-06, + "loss": 0.75672376, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13140869, + "step": 6860, + "time_per_iteration": 2.615556001663208 + }, + { + "auxiliary_loss_clip": 0.0112493, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.04474378, + "balance_loss_mlp": 1.02246416, + "epoch": 0.4125056365549376, + "flos": 22810327237440.0, + "grad_norm": 1.6858042944782068, + "language_loss": 0.73923194, + "learning_rate": 2.651715238616068e-06, + "loss": 0.76082718, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.12145996, + "step": 6861, + "time_per_iteration": 2.6509950160980225 + }, + { + "auxiliary_loss_clip": 0.01126407, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.04632092, + "balance_loss_mlp": 1.02270961, + "epoch": 0.41256575980760557, + "flos": 21835621189440.0, + "grad_norm": 2.2021419329227063, + "language_loss": 0.80299294, + "learning_rate": 2.651347021844765e-06, + "loss": 0.82460475, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.1206665, + "step": 6862, + "time_per_iteration": 2.613607406616211 + }, + { + "auxiliary_loss_clip": 0.01128346, + "auxiliary_loss_mlp": 0.01035397, + "balance_loss_clip": 1.04655516, + "balance_loss_mlp": 1.02319574, + "epoch": 0.41262588306027354, + "flos": 26821736249760.0, + "grad_norm": 2.1182372443661728, + "language_loss": 0.75611252, + "learning_rate": 2.650978780374318e-06, + "loss": 0.7777499, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.12194824, + "step": 6863, + "time_per_iteration": 2.6728897094726562 + }, + { + "auxiliary_loss_clip": 0.01048733, + "auxiliary_loss_mlp": 0.01003718, + "balance_loss_clip": 1.02222347, + "balance_loss_mlp": 1.00221491, + "epoch": 0.41268600631294156, + "flos": 65097696193920.0, + "grad_norm": 0.7043442366538392, + "language_loss": 0.52739906, + "learning_rate": 2.650610514218691e-06, + "loss": 0.54792356, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 0.26513672, + "router_z_loss_mlp": 0.01502228, + "step": 6864, + "time_per_iteration": 3.2163498401641846 + }, + { + "auxiliary_loss_clip": 0.01131355, + "auxiliary_loss_mlp": 0.01037424, + "balance_loss_clip": 1.04597688, + "balance_loss_mlp": 1.02372122, + "epoch": 0.4127461295656095, + "flos": 29755700092800.0, + "grad_norm": 1.696749943168778, + "language_loss": 0.72786438, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.74955213, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.13708496, + "step": 6865, + "time_per_iteration": 2.735903263092041 + }, + { + "auxiliary_loss_clip": 0.01047346, + "auxiliary_loss_mlp": 0.010035, + "balance_loss_clip": 1.02075076, + "balance_loss_mlp": 1.00198746, + "epoch": 0.4128062528182775, + "flos": 87495898295520.0, + "grad_norm": 0.9225828916869886, + "language_loss": 0.66599059, + "learning_rate": 2.649873907907753e-06, + "loss": 0.686499, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 0.01512146, + "step": 6866, + "time_per_iteration": 3.1645071506500244 + }, + { + "auxiliary_loss_clip": 0.01125217, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.04331517, + "balance_loss_mlp": 1.02751625, + "epoch": 0.41286637607094545, + "flos": 21780159383520.0, + "grad_norm": 2.116273285963606, + "language_loss": 0.80721426, + "learning_rate": 2.649505567780375e-06, + "loss": 0.82886887, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.12713623, + "step": 6867, + "time_per_iteration": 2.6462841033935547 + }, + { + "auxiliary_loss_clip": 0.01130467, + "auxiliary_loss_mlp": 0.01033612, + "balance_loss_clip": 1.04620326, + "balance_loss_mlp": 1.02074957, + "epoch": 0.4129264993236134, + "flos": 31176640476000.0, + "grad_norm": 2.558856668666268, + "language_loss": 0.77219588, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.79383659, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.12860107, + "step": 6868, + "time_per_iteration": 2.7102749347686768 + }, + { + "auxiliary_loss_clip": 0.01043462, + "auxiliary_loss_mlp": 0.01001441, + "balance_loss_clip": 1.01702189, + "balance_loss_mlp": 0.99996829, + "epoch": 0.4129866225762814, + "flos": 79820789281920.0, + "grad_norm": 0.8390491468290878, + "language_loss": 0.57895672, + "learning_rate": 2.64876881365164e-06, + "loss": 0.59940577, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 0.26464844, + "router_z_loss_mlp": 0.01472473, + "step": 6869, + "time_per_iteration": 3.014003276824951 + }, + { + "auxiliary_loss_clip": 0.01125092, + "auxiliary_loss_mlp": 0.01032079, + "balance_loss_clip": 1.04471684, + "balance_loss_mlp": 1.01885831, + "epoch": 0.41304674582894935, + "flos": 35236380252960.0, + "grad_norm": 1.763396024214472, + "language_loss": 0.74986434, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77143598, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13220215, + "step": 6870, + "time_per_iteration": 2.782554864883423 + }, + { + "auxiliary_loss_clip": 0.01127793, + "auxiliary_loss_mlp": 0.01041651, + "balance_loss_clip": 1.04460001, + "balance_loss_mlp": 1.02785826, + "epoch": 0.4131068690816173, + "flos": 27933128549280.0, + "grad_norm": 1.5110301320258595, + "language_loss": 0.83506036, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.85675478, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13793945, + "step": 6871, + "time_per_iteration": 2.7483174800872803 + }, + { + "auxiliary_loss_clip": 0.01128985, + "auxiliary_loss_mlp": 0.01039714, + "balance_loss_clip": 1.04618835, + "balance_loss_mlp": 1.02669597, + "epoch": 0.4131669923342853, + "flos": 31805420273280.0, + "grad_norm": 2.1556372919102222, + "language_loss": 0.68369883, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.7053858, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13024902, + "step": 6872, + "time_per_iteration": 2.770781993865967 + }, + { + "auxiliary_loss_clip": 0.01129571, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.04630804, + "balance_loss_mlp": 1.02145886, + "epoch": 0.41322711558695324, + "flos": 23482170552960.0, + "grad_norm": 2.0471875429901885, + "language_loss": 0.7592802, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.78091961, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.12921143, + "step": 6873, + "time_per_iteration": 2.639626979827881 + }, + { + "auxiliary_loss_clip": 0.01130557, + "auxiliary_loss_mlp": 0.01034479, + "balance_loss_clip": 1.04629803, + "balance_loss_mlp": 1.02075791, + "epoch": 0.4132872388396212, + "flos": 27667575881280.0, + "grad_norm": 2.1497780897264596, + "language_loss": 0.83224177, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.85389209, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.13726807, + "step": 6874, + "time_per_iteration": 2.7298290729522705 + }, + { + "auxiliary_loss_clip": 0.01127621, + "auxiliary_loss_mlp": 0.01037526, + "balance_loss_clip": 1.04401577, + "balance_loss_mlp": 1.02387691, + "epoch": 0.4133473620922892, + "flos": 24587890433280.0, + "grad_norm": 1.9686746256969918, + "language_loss": 0.71330667, + "learning_rate": 2.646557961279436e-06, + "loss": 0.73495817, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.13653564, + "step": 6875, + "time_per_iteration": 2.634157180786133 + }, + { + "auxiliary_loss_clip": 0.01121795, + "auxiliary_loss_mlp": 0.0103618, + "balance_loss_clip": 1.04426765, + "balance_loss_mlp": 1.02394378, + "epoch": 0.41340748534495714, + "flos": 29581379638560.0, + "grad_norm": 1.5033523085276579, + "language_loss": 0.82542229, + "learning_rate": 2.646189399991154e-06, + "loss": 0.84700203, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12243652, + "step": 6876, + "time_per_iteration": 2.7717108726501465 + }, + { + "auxiliary_loss_clip": 0.01130902, + "auxiliary_loss_mlp": 0.01038478, + "balance_loss_clip": 1.04449058, + "balance_loss_mlp": 1.02398205, + "epoch": 0.41346760859762516, + "flos": 17560887959520.0, + "grad_norm": 3.951504473335, + "language_loss": 0.65823847, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.67993224, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 0.86474609, + "router_z_loss_mlp": 0.14501953, + "step": 6877, + "time_per_iteration": 2.682964563369751 + }, + { + "auxiliary_loss_clip": 0.01125341, + "auxiliary_loss_mlp": 0.01029325, + "balance_loss_clip": 1.04415584, + "balance_loss_mlp": 1.01634932, + "epoch": 0.4135277318502931, + "flos": 27444924662400.0, + "grad_norm": 2.357441980788114, + "language_loss": 0.76444775, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.78599441, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.12988281, + "step": 6878, + "time_per_iteration": 2.669599771499634 + }, + { + "auxiliary_loss_clip": 0.01126553, + "auxiliary_loss_mlp": 0.01032741, + "balance_loss_clip": 1.04480028, + "balance_loss_mlp": 1.01913309, + "epoch": 0.4135878551029611, + "flos": 27355556243520.0, + "grad_norm": 1.9525562829364933, + "language_loss": 0.80374372, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.82533669, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13604736, + "step": 6879, + "time_per_iteration": 2.733330249786377 + }, + { + "auxiliary_loss_clip": 0.01126718, + "auxiliary_loss_mlp": 0.01031324, + "balance_loss_clip": 1.04531932, + "balance_loss_mlp": 1.01801443, + "epoch": 0.41364797835562905, + "flos": 33010840478880.0, + "grad_norm": 1.8469438720427482, + "language_loss": 0.84726286, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.86884332, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13317871, + "step": 6880, + "time_per_iteration": 2.703150510787964 + }, + { + "auxiliary_loss_clip": 0.01126899, + "auxiliary_loss_mlp": 0.01028601, + "balance_loss_clip": 1.04389572, + "balance_loss_mlp": 1.01529145, + "epoch": 0.413708101608297, + "flos": 28025171108640.0, + "grad_norm": 1.7006714285796416, + "language_loss": 0.70549589, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.7270509, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.13317871, + "step": 6881, + "time_per_iteration": 5.598753213882446 + }, + { + "auxiliary_loss_clip": 0.01125026, + "auxiliary_loss_mlp": 0.01031247, + "balance_loss_clip": 1.04607391, + "balance_loss_mlp": 1.01900375, + "epoch": 0.413768224860965, + "flos": 16269624338400.0, + "grad_norm": 1.7479680832462934, + "language_loss": 0.81602859, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.83759123, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12243652, + "step": 6882, + "time_per_iteration": 2.67667555809021 + }, + { + "auxiliary_loss_clip": 0.01130265, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_clip": 1.04597211, + "balance_loss_mlp": 1.0266068, + "epoch": 0.41382834811363295, + "flos": 25397230345920.0, + "grad_norm": 47.71827342623753, + "language_loss": 0.69768429, + "learning_rate": 2.643608785656077e-06, + "loss": 0.7194066, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.15338135, + "step": 6883, + "time_per_iteration": 2.6376419067382812 + }, + { + "auxiliary_loss_clip": 0.01125988, + "auxiliary_loss_mlp": 0.01033081, + "balance_loss_clip": 1.0442872, + "balance_loss_mlp": 1.02027202, + "epoch": 0.4138884713663009, + "flos": 25218128852640.0, + "grad_norm": 2.0313297902405245, + "language_loss": 0.75595212, + "learning_rate": 2.643240028730663e-06, + "loss": 0.77754289, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.12811279, + "step": 6884, + "time_per_iteration": 2.720111846923828 + }, + { + "auxiliary_loss_clip": 0.01127061, + "auxiliary_loss_mlp": 0.01040459, + "balance_loss_clip": 1.04378688, + "balance_loss_mlp": 1.0274713, + "epoch": 0.4139485946189689, + "flos": 35455830606720.0, + "grad_norm": 1.410915697706342, + "language_loss": 0.75770986, + "learning_rate": 2.642871247413523e-06, + "loss": 0.77938503, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.12994385, + "step": 6885, + "time_per_iteration": 2.6787221431732178 + }, + { + "auxiliary_loss_clip": 0.01128717, + "auxiliary_loss_mlp": 0.01034766, + "balance_loss_clip": 1.04497242, + "balance_loss_mlp": 1.02209973, + "epoch": 0.41400871787163684, + "flos": 29804598099360.0, + "grad_norm": 1.6795973258436054, + "language_loss": 0.69555175, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.71718663, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.12683105, + "step": 6886, + "time_per_iteration": 2.7285492420196533 + }, + { + "auxiliary_loss_clip": 0.01129417, + "auxiliary_loss_mlp": 0.01035627, + "balance_loss_clip": 1.04662919, + "balance_loss_mlp": 1.02257371, + "epoch": 0.4140688411243048, + "flos": 23749951671360.0, + "grad_norm": 1.4859336608239375, + "language_loss": 0.75200236, + "learning_rate": 2.642133611660002e-06, + "loss": 0.77365285, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13061523, + "step": 6887, + "time_per_iteration": 4.267370939254761 + }, + { + "auxiliary_loss_clip": 0.01125378, + "auxiliary_loss_mlp": 0.01028876, + "balance_loss_clip": 1.04379845, + "balance_loss_mlp": 1.0154649, + "epoch": 0.4141289643769728, + "flos": 23566838967360.0, + "grad_norm": 4.3070082100202995, + "language_loss": 0.70173085, + "learning_rate": 2.641764757251592e-06, + "loss": 0.72327334, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13415527, + "step": 6888, + "time_per_iteration": 2.7435526847839355 + }, + { + "auxiliary_loss_clip": 0.0112393, + "auxiliary_loss_mlp": 0.01035011, + "balance_loss_clip": 1.04328442, + "balance_loss_mlp": 1.02214873, + "epoch": 0.41418908762964074, + "flos": 20409940284480.0, + "grad_norm": 1.9198153204889545, + "language_loss": 0.7626459, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.7842353, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12884521, + "step": 6889, + "time_per_iteration": 2.658653497695923 + }, + { + "auxiliary_loss_clip": 0.01126544, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.0464232, + "balance_loss_mlp": 1.02178288, + "epoch": 0.41424921088230876, + "flos": 30868024289760.0, + "grad_norm": 1.4705504299811978, + "language_loss": 0.80292964, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.82453793, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12506104, + "step": 6890, + "time_per_iteration": 2.6863794326782227 + }, + { + "auxiliary_loss_clip": 0.01124996, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.04470539, + "balance_loss_mlp": 1.02441144, + "epoch": 0.4143093341349767, + "flos": 25576007700960.0, + "grad_norm": 1.7106146485577807, + "language_loss": 0.74568492, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.76731765, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13873291, + "step": 6891, + "time_per_iteration": 2.7090632915496826 + }, + { + "auxiliary_loss_clip": 0.01129649, + "auxiliary_loss_mlp": 0.01036616, + "balance_loss_clip": 1.04573381, + "balance_loss_mlp": 1.02240586, + "epoch": 0.4143694573876447, + "flos": 26866136838240.0, + "grad_norm": 1.775459180733546, + "language_loss": 0.84213221, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.86379492, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.14202881, + "step": 6892, + "time_per_iteration": 4.219520092010498 + }, + { + "auxiliary_loss_clip": 0.01126395, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.04701984, + "balance_loss_mlp": 1.01833057, + "epoch": 0.41442958064031266, + "flos": 43559386869600.0, + "grad_norm": 3.9246190679218858, + "language_loss": 0.70510882, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72668111, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12512207, + "step": 6893, + "time_per_iteration": 2.745958089828491 + }, + { + "auxiliary_loss_clip": 0.01125363, + "auxiliary_loss_mlp": 0.01031941, + "balance_loss_clip": 1.04520011, + "balance_loss_mlp": 1.01917386, + "epoch": 0.4144897038929806, + "flos": 34524633767040.0, + "grad_norm": 1.5322391652884697, + "language_loss": 0.72579587, + "learning_rate": 2.639551120239279e-06, + "loss": 0.74736887, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12756348, + "step": 6894, + "time_per_iteration": 2.7172484397888184 + }, + { + "auxiliary_loss_clip": 0.01128007, + "auxiliary_loss_mlp": 0.01033019, + "balance_loss_clip": 1.04546821, + "balance_loss_mlp": 1.02036476, + "epoch": 0.4145498271456486, + "flos": 14213056737600.0, + "grad_norm": 2.7184935209845458, + "language_loss": 0.62859416, + "learning_rate": 2.63918209577416e-06, + "loss": 0.65020442, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.12652588, + "step": 6895, + "time_per_iteration": 2.608365058898926 + }, + { + "auxiliary_loss_clip": 0.01126369, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.04634285, + "balance_loss_mlp": 1.02302063, + "epoch": 0.41460995039831655, + "flos": 33233410663200.0, + "grad_norm": 1.4329293997033141, + "language_loss": 0.70391691, + "learning_rate": 2.638813047071192e-06, + "loss": 0.7255373, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12640381, + "step": 6896, + "time_per_iteration": 2.7055795192718506 + }, + { + "auxiliary_loss_clip": 0.0112696, + "auxiliary_loss_mlp": 0.01040325, + "balance_loss_clip": 1.04495227, + "balance_loss_mlp": 1.02656245, + "epoch": 0.4146700736509845, + "flos": 31630856715360.0, + "grad_norm": 1.7521066779544279, + "language_loss": 0.72990644, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.75157928, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13775635, + "step": 6897, + "time_per_iteration": 2.7203900814056396 + }, + { + "auxiliary_loss_clip": 0.01127026, + "auxiliary_loss_mlp": 0.0104379, + "balance_loss_clip": 1.04710865, + "balance_loss_mlp": 1.03090382, + "epoch": 0.4147301969036525, + "flos": 32742492118560.0, + "grad_norm": 1.68501278215953, + "language_loss": 0.84365565, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.86536384, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12884521, + "step": 6898, + "time_per_iteration": 2.6753315925598145 + }, + { + "auxiliary_loss_clip": 0.01125851, + "auxiliary_loss_mlp": 0.0103732, + "balance_loss_clip": 1.04307079, + "balance_loss_mlp": 1.02411163, + "epoch": 0.41479032015632045, + "flos": 24768410031360.0, + "grad_norm": 1.9895244029294092, + "language_loss": 0.74862242, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.77025414, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.13208008, + "step": 6899, + "time_per_iteration": 2.733088254928589 + }, + { + "auxiliary_loss_clip": 0.01130503, + "auxiliary_loss_mlp": 0.01036292, + "balance_loss_clip": 1.04505956, + "balance_loss_mlp": 1.02213025, + "epoch": 0.4148504434089884, + "flos": 30828728878560.0, + "grad_norm": 2.672850396804459, + "language_loss": 0.7594738, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.7811417, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.1416626, + "step": 6900, + "time_per_iteration": 2.6832375526428223 + }, + { + "auxiliary_loss_clip": 0.01128125, + "auxiliary_loss_mlp": 0.01038037, + "balance_loss_clip": 1.04739773, + "balance_loss_mlp": 1.02453041, + "epoch": 0.4149105666616564, + "flos": 15645301441920.0, + "grad_norm": 2.1620458343192923, + "language_loss": 0.79789591, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.81955755, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13482666, + "step": 6901, + "time_per_iteration": 2.6700961589813232 + }, + { + "auxiliary_loss_clip": 0.01126206, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.045362, + "balance_loss_mlp": 1.02039659, + "epoch": 0.41497068991432434, + "flos": 20455029666720.0, + "grad_norm": 1.7412957382534546, + "language_loss": 0.69521034, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.71680748, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13140869, + "step": 6902, + "time_per_iteration": 2.609739065170288 + }, + { + "auxiliary_loss_clip": 0.01125538, + "auxiliary_loss_mlp": 0.01045389, + "balance_loss_clip": 1.04657328, + "balance_loss_mlp": 1.03256202, + "epoch": 0.4150308131669923, + "flos": 21964852261440.0, + "grad_norm": 1.6702634842585693, + "language_loss": 0.83665496, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.85836422, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.1282959, + "step": 6903, + "time_per_iteration": 2.646057367324829 + }, + { + "auxiliary_loss_clip": 0.01132147, + "auxiliary_loss_mlp": 0.01043892, + "balance_loss_clip": 1.04695725, + "balance_loss_mlp": 1.02884197, + "epoch": 0.41509093641966033, + "flos": 36661372364160.0, + "grad_norm": 1.966514442013, + "language_loss": 0.67806315, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.6998235, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.1505127, + "step": 6904, + "time_per_iteration": 2.6845896244049072 + }, + { + "auxiliary_loss_clip": 0.01129254, + "auxiliary_loss_mlp": 0.01041807, + "balance_loss_clip": 1.04468453, + "balance_loss_mlp": 1.02805638, + "epoch": 0.4151510596723283, + "flos": 29626752641760.0, + "grad_norm": 1.7672789557134296, + "language_loss": 0.77535689, + "learning_rate": 2.635490520350643e-06, + "loss": 0.79706752, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.1373291, + "step": 6905, + "time_per_iteration": 2.692457437515259 + }, + { + "auxiliary_loss_clip": 0.01130528, + "auxiliary_loss_mlp": 0.01031627, + "balance_loss_clip": 1.04691195, + "balance_loss_mlp": 1.01832902, + "epoch": 0.41521118292499626, + "flos": 28646009519040.0, + "grad_norm": 2.3644884363396783, + "language_loss": 0.68526375, + "learning_rate": 2.635121230039025e-06, + "loss": 0.70688534, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.13293457, + "step": 6906, + "time_per_iteration": 2.6340689659118652 + }, + { + "auxiliary_loss_clip": 0.0112489, + "auxiliary_loss_mlp": 0.01035058, + "balance_loss_clip": 1.04512239, + "balance_loss_mlp": 1.02271414, + "epoch": 0.4152713061776642, + "flos": 26997839464320.0, + "grad_norm": 2.6494198533160502, + "language_loss": 0.66953063, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.69113016, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12365723, + "step": 6907, + "time_per_iteration": 2.645552158355713 + }, + { + "auxiliary_loss_clip": 0.01128976, + "auxiliary_loss_mlp": 0.01036854, + "balance_loss_clip": 1.04712057, + "balance_loss_mlp": 1.02409279, + "epoch": 0.4153314294303322, + "flos": 25931050339680.0, + "grad_norm": 2.3112175067701126, + "language_loss": 0.77253687, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.79419523, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.12774658, + "step": 6908, + "time_per_iteration": 2.6574411392211914 + }, + { + "auxiliary_loss_clip": 0.01044021, + "auxiliary_loss_mlp": 0.01003848, + "balance_loss_clip": 1.01777482, + "balance_loss_mlp": 1.00236034, + "epoch": 0.41539155268300015, + "flos": 70674268055040.0, + "grad_norm": 0.7837713044335299, + "language_loss": 0.64910251, + "learning_rate": 2.634013214657026e-06, + "loss": 0.66958117, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 0.26220703, + "router_z_loss_mlp": 0.01486969, + "step": 6909, + "time_per_iteration": 3.23695707321167 + }, + { + "auxiliary_loss_clip": 0.01127552, + "auxiliary_loss_mlp": 0.0103963, + "balance_loss_clip": 1.04675102, + "balance_loss_mlp": 1.02676713, + "epoch": 0.4154516759356681, + "flos": 26727424722720.0, + "grad_norm": 1.473497674229731, + "language_loss": 0.86769992, + "learning_rate": 2.633643828093996e-06, + "loss": 0.88937175, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.12878418, + "step": 6910, + "time_per_iteration": 2.7251107692718506 + }, + { + "auxiliary_loss_clip": 0.0104295, + "auxiliary_loss_mlp": 0.01002108, + "balance_loss_clip": 1.01674628, + "balance_loss_mlp": 1.00063217, + "epoch": 0.4155117991883361, + "flos": 82771891934400.0, + "grad_norm": 0.804227898637137, + "language_loss": 0.62103528, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64148581, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 0.26220703, + "router_z_loss_mlp": 0.01475525, + "step": 6911, + "time_per_iteration": 3.2282235622406006 + }, + { + "auxiliary_loss_clip": 0.01133991, + "auxiliary_loss_mlp": 0.01037645, + "balance_loss_clip": 1.04735339, + "balance_loss_mlp": 1.02401948, + "epoch": 0.41557192244100405, + "flos": 17427281021280.0, + "grad_norm": 2.9274236289704327, + "language_loss": 0.88093138, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.90264773, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.1362915, + "step": 6912, + "time_per_iteration": 2.6771841049194336 + }, + { + "auxiliary_loss_clip": 0.01129088, + "auxiliary_loss_mlp": 0.01033685, + "balance_loss_clip": 1.04755759, + "balance_loss_mlp": 1.0214119, + "epoch": 0.415632045693672, + "flos": 29848836618720.0, + "grad_norm": 2.531286234922768, + "language_loss": 0.62489367, + "learning_rate": 2.632535524293914e-06, + "loss": 0.64652145, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.12280273, + "step": 6913, + "time_per_iteration": 2.6703577041625977 + }, + { + "auxiliary_loss_clip": 0.01125041, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.04554045, + "balance_loss_mlp": 1.02151442, + "epoch": 0.41569216894634, + "flos": 24545151053280.0, + "grad_norm": 1.767205873312915, + "language_loss": 0.75097769, + "learning_rate": 2.632166041703586e-06, + "loss": 0.77256578, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12255859, + "step": 6914, + "time_per_iteration": 2.6779162883758545 + }, + { + "auxiliary_loss_clip": 0.01128405, + "auxiliary_loss_mlp": 0.0103971, + "balance_loss_clip": 1.04496717, + "balance_loss_mlp": 1.02666271, + "epoch": 0.41575229219900794, + "flos": 29040226017120.0, + "grad_norm": 2.1443419870737217, + "language_loss": 0.87830573, + "learning_rate": 2.631796535141458e-06, + "loss": 0.89998686, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.13049316, + "step": 6915, + "time_per_iteration": 2.648834228515625 + }, + { + "auxiliary_loss_clip": 0.01130391, + "auxiliary_loss_mlp": 0.01042143, + "balance_loss_clip": 1.04767954, + "balance_loss_mlp": 1.02914906, + "epoch": 0.4158124154516759, + "flos": 28197263112480.0, + "grad_norm": 2.724436986555096, + "language_loss": 0.71234584, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.73407114, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.12994385, + "step": 6916, + "time_per_iteration": 2.655418634414673 + }, + { + "auxiliary_loss_clip": 0.01132237, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.04760277, + "balance_loss_mlp": 1.0159936, + "epoch": 0.41587253870434393, + "flos": 29581622742240.0, + "grad_norm": 2.3808726094298094, + "language_loss": 0.71933347, + "learning_rate": 2.631057450157852e-06, + "loss": 0.74094719, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.13146973, + "step": 6917, + "time_per_iteration": 2.683584213256836 + }, + { + "auxiliary_loss_clip": 0.01125337, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.04364061, + "balance_loss_mlp": 1.01713336, + "epoch": 0.4159326619570119, + "flos": 29140331515200.0, + "grad_norm": 1.6262880813085392, + "language_loss": 0.8102113, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.83176303, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.12695312, + "step": 6918, + "time_per_iteration": 2.717536687850952 + }, + { + "auxiliary_loss_clip": 0.01131062, + "auxiliary_loss_mlp": 0.01036016, + "balance_loss_clip": 1.0475769, + "balance_loss_mlp": 1.02194953, + "epoch": 0.41599278520967986, + "flos": 49573968058080.0, + "grad_norm": 1.5468721483272658, + "language_loss": 0.70099407, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72266489, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.14080811, + "step": 6919, + "time_per_iteration": 2.841616153717041 + }, + { + "auxiliary_loss_clip": 0.01129924, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.04731476, + "balance_loss_mlp": 1.01808548, + "epoch": 0.4160529084623478, + "flos": 22235510106720.0, + "grad_norm": 1.7945827374093135, + "language_loss": 0.81157207, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.83318627, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.13421631, + "step": 6920, + "time_per_iteration": 2.5966475009918213 + }, + { + "auxiliary_loss_clip": 0.0113182, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.04760456, + "balance_loss_mlp": 1.02131748, + "epoch": 0.4161130317150158, + "flos": 16669877911200.0, + "grad_norm": 3.0163289307261385, + "language_loss": 0.65189171, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.67356306, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.13983154, + "step": 6921, + "time_per_iteration": 4.129891633987427 + }, + { + "auxiliary_loss_clip": 0.01128416, + "auxiliary_loss_mlp": 0.01035373, + "balance_loss_clip": 1.04623735, + "balance_loss_mlp": 1.02217042, + "epoch": 0.41617315496768376, + "flos": 19740852144000.0, + "grad_norm": 1.9936179460053955, + "language_loss": 0.80112839, + "learning_rate": 2.629209319173274e-06, + "loss": 0.8227663, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.13208008, + "step": 6922, + "time_per_iteration": 2.6410934925079346 + }, + { + "auxiliary_loss_clip": 0.01129039, + "auxiliary_loss_mlp": 0.01033954, + "balance_loss_clip": 1.04526329, + "balance_loss_mlp": 1.02111483, + "epoch": 0.4162332782203517, + "flos": 31985980388640.0, + "grad_norm": 1.6856310145054996, + "language_loss": 0.67562455, + "learning_rate": 2.628839621341247e-06, + "loss": 0.69725454, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.12841797, + "step": 6923, + "time_per_iteration": 2.715595245361328 + }, + { + "auxiliary_loss_clip": 0.01128206, + "auxiliary_loss_mlp": 0.01041224, + "balance_loss_clip": 1.0460242, + "balance_loss_mlp": 1.02746725, + "epoch": 0.4162934014730197, + "flos": 34390702690560.0, + "grad_norm": 1.9915045775223559, + "language_loss": 0.76368266, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.78537703, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13751221, + "step": 6924, + "time_per_iteration": 2.7027595043182373 + }, + { + "auxiliary_loss_clip": 0.01129542, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.04547358, + "balance_loss_mlp": 1.02251279, + "epoch": 0.41635352472568765, + "flos": 24239614180320.0, + "grad_norm": 1.8889115240959728, + "language_loss": 0.73324496, + "learning_rate": 2.62810015415423e-06, + "loss": 0.75489312, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.12756348, + "step": 6925, + "time_per_iteration": 2.6174776554107666 + }, + { + "auxiliary_loss_clip": 0.01125433, + "auxiliary_loss_mlp": 0.01032211, + "balance_loss_clip": 1.04301262, + "balance_loss_mlp": 1.01964605, + "epoch": 0.4164136479783556, + "flos": 18224101094400.0, + "grad_norm": 2.253418503656651, + "language_loss": 0.83879793, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.86037433, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.12561035, + "step": 6926, + "time_per_iteration": 4.0470967292785645 + }, + { + "auxiliary_loss_clip": 0.01122903, + "auxiliary_loss_mlp": 0.01035834, + "balance_loss_clip": 1.0439707, + "balance_loss_mlp": 1.02367461, + "epoch": 0.4164737712310236, + "flos": 26549174092320.0, + "grad_norm": 1.755296653535295, + "language_loss": 0.86290181, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.88448918, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.121521, + "step": 6927, + "time_per_iteration": 2.61364483833313 + }, + { + "auxiliary_loss_clip": 0.01125274, + "auxiliary_loss_mlp": 0.01035249, + "balance_loss_clip": 1.04388189, + "balance_loss_mlp": 1.02126002, + "epoch": 0.41653389448369155, + "flos": 25307213650560.0, + "grad_norm": 2.2362347670694023, + "language_loss": 0.71833217, + "learning_rate": 2.626990774776604e-06, + "loss": 0.73993742, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.14001465, + "step": 6928, + "time_per_iteration": 2.6433775424957275 + }, + { + "auxiliary_loss_clip": 0.01123576, + "auxiliary_loss_mlp": 0.01033445, + "balance_loss_clip": 1.04278278, + "balance_loss_mlp": 1.02002168, + "epoch": 0.4165940177363595, + "flos": 30472308652320.0, + "grad_norm": 2.071356599593642, + "language_loss": 0.77875125, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.80032146, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13421631, + "step": 6929, + "time_per_iteration": 2.656702995300293 + }, + { + "auxiliary_loss_clip": 0.01125509, + "auxiliary_loss_mlp": 0.01029601, + "balance_loss_clip": 1.04402804, + "balance_loss_mlp": 1.01750112, + "epoch": 0.41665414098902753, + "flos": 25040648050560.0, + "grad_norm": 1.9082297773392616, + "language_loss": 0.71044922, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.73200035, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.12097168, + "step": 6930, + "time_per_iteration": 2.6644890308380127 + }, + { + "auxiliary_loss_clip": 0.01124081, + "auxiliary_loss_mlp": 0.01034954, + "balance_loss_clip": 1.04117918, + "balance_loss_mlp": 1.02185857, + "epoch": 0.4167142642416955, + "flos": 24017043996000.0, + "grad_norm": 1.8258424201280838, + "language_loss": 0.80924928, + "learning_rate": 2.625881181419007e-06, + "loss": 0.83083963, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.13098145, + "step": 6931, + "time_per_iteration": 4.070663928985596 + }, + { + "auxiliary_loss_clip": 0.01120533, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.04040027, + "balance_loss_mlp": 1.0203464, + "epoch": 0.41677438749436346, + "flos": 28997284050720.0, + "grad_norm": 2.055022981520642, + "language_loss": 0.79515904, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.81669945, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13165283, + "step": 6932, + "time_per_iteration": 2.672327756881714 + }, + { + "auxiliary_loss_clip": 0.01127014, + "auxiliary_loss_mlp": 0.01034352, + "balance_loss_clip": 1.04403901, + "balance_loss_mlp": 1.02049947, + "epoch": 0.41683451074703143, + "flos": 37107079974720.0, + "grad_norm": 2.233876761532908, + "language_loss": 0.82085621, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.84246981, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13842773, + "step": 6933, + "time_per_iteration": 2.8497512340545654 + }, + { + "auxiliary_loss_clip": 0.01128385, + "auxiliary_loss_mlp": 0.01032828, + "balance_loss_clip": 1.04375076, + "balance_loss_mlp": 1.01863575, + "epoch": 0.4168946339996994, + "flos": 26242664804640.0, + "grad_norm": 5.304628048005776, + "language_loss": 0.76699054, + "learning_rate": 2.624771374460121e-06, + "loss": 0.78860265, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.14202881, + "step": 6934, + "time_per_iteration": 2.7007877826690674 + }, + { + "auxiliary_loss_clip": 0.01128893, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.04619384, + "balance_loss_mlp": 1.01733875, + "epoch": 0.41695475725236736, + "flos": 21522588619680.0, + "grad_norm": 4.144616090907945, + "language_loss": 0.67628872, + "learning_rate": 2.624401391405668e-06, + "loss": 0.69788325, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.13220215, + "step": 6935, + "time_per_iteration": 2.644223690032959 + }, + { + "auxiliary_loss_clip": 0.01127232, + "auxiliary_loss_mlp": 0.01039858, + "balance_loss_clip": 1.04559243, + "balance_loss_mlp": 1.02659035, + "epoch": 0.4170148805050353, + "flos": 19118879249760.0, + "grad_norm": 2.3028740016483047, + "language_loss": 0.73628962, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.75796056, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13275146, + "step": 6936, + "time_per_iteration": 2.615964651107788 + }, + { + "auxiliary_loss_clip": 0.01124557, + "auxiliary_loss_mlp": 0.01032798, + "balance_loss_clip": 1.04402339, + "balance_loss_mlp": 1.02015567, + "epoch": 0.4170750037577033, + "flos": 18496744286400.0, + "grad_norm": 2.6003995475541286, + "language_loss": 0.73534971, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.75692326, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.12646484, + "step": 6937, + "time_per_iteration": 2.6365199089050293 + }, + { + "auxiliary_loss_clip": 0.0112347, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.04299998, + "balance_loss_mlp": 1.02323067, + "epoch": 0.41713512701037125, + "flos": 35103664694880.0, + "grad_norm": 1.463122849655875, + "language_loss": 0.84140378, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.86299807, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12731934, + "step": 6938, + "time_per_iteration": 2.7202517986297607 + }, + { + "auxiliary_loss_clip": 0.01129514, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.04517972, + "balance_loss_mlp": 1.02035761, + "epoch": 0.4171952502630392, + "flos": 34479787488480.0, + "grad_norm": 1.8556613701011884, + "language_loss": 0.73987353, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.76151109, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.13891602, + "step": 6939, + "time_per_iteration": 2.6646640300750732 + }, + { + "auxiliary_loss_clip": 0.01126804, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.04527104, + "balance_loss_mlp": 1.02008152, + "epoch": 0.4172553735157072, + "flos": 29982727177920.0, + "grad_norm": 2.2666195970148, + "language_loss": 0.74869645, + "learning_rate": 2.622551121253579e-06, + "loss": 0.77030081, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13549805, + "step": 6940, + "time_per_iteration": 2.6858632564544678 + }, + { + "auxiliary_loss_clip": 0.01127199, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.04499722, + "balance_loss_mlp": 1.02328384, + "epoch": 0.41731549676837515, + "flos": 33002453401920.0, + "grad_norm": 3.024385213747718, + "language_loss": 0.71392667, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73555529, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.12384033, + "step": 6941, + "time_per_iteration": 2.6415367126464844 + }, + { + "auxiliary_loss_clip": 0.01129165, + "auxiliary_loss_mlp": 0.01038371, + "balance_loss_clip": 1.04544485, + "balance_loss_mlp": 1.02469742, + "epoch": 0.4173756200210431, + "flos": 34647868281600.0, + "grad_norm": 2.41658148822408, + "language_loss": 0.73732376, + "learning_rate": 2.621810847844104e-06, + "loss": 0.75899923, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.13671875, + "step": 6942, + "time_per_iteration": 2.7105367183685303 + }, + { + "auxiliary_loss_clip": 0.01132096, + "auxiliary_loss_mlp": 0.01037233, + "balance_loss_clip": 1.04692197, + "balance_loss_mlp": 1.02367854, + "epoch": 0.41743574327371114, + "flos": 27481626967680.0, + "grad_norm": 4.137765380446346, + "language_loss": 0.71970713, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.74140048, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.13543701, + "step": 6943, + "time_per_iteration": 2.6709532737731934 + }, + { + "auxiliary_loss_clip": 0.01131669, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.0473454, + "balance_loss_mlp": 1.01736152, + "epoch": 0.4174958665263791, + "flos": 36745554571200.0, + "grad_norm": 1.9429385503151149, + "language_loss": 0.63996029, + "learning_rate": 2.621070480118111e-06, + "loss": 0.66158795, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.1373291, + "step": 6944, + "time_per_iteration": 2.751906156539917 + }, + { + "auxiliary_loss_clip": 0.01127443, + "auxiliary_loss_mlp": 0.01034352, + "balance_loss_clip": 1.04550862, + "balance_loss_mlp": 1.02134061, + "epoch": 0.41755598977904707, + "flos": 30826986635520.0, + "grad_norm": 1.7925708428714888, + "language_loss": 0.70151067, + "learning_rate": 2.620700260921513e-06, + "loss": 0.72312862, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13006592, + "step": 6945, + "time_per_iteration": 2.671812057495117 + }, + { + "auxiliary_loss_clip": 0.01122504, + "auxiliary_loss_mlp": 0.01039935, + "balance_loss_clip": 1.04097438, + "balance_loss_mlp": 1.02545714, + "epoch": 0.41761611303171503, + "flos": 24194808419040.0, + "grad_norm": 1.7055025246702193, + "language_loss": 0.80627692, + "learning_rate": 2.620330018187899e-06, + "loss": 0.82790136, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.14477539, + "step": 6946, + "time_per_iteration": 2.7336721420288086 + }, + { + "auxiliary_loss_clip": 0.01127865, + "auxiliary_loss_mlp": 0.01034376, + "balance_loss_clip": 1.04654825, + "balance_loss_mlp": 1.02204394, + "epoch": 0.417676236284383, + "flos": 18941114826720.0, + "grad_norm": 2.1968229882478365, + "language_loss": 0.76959991, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.79122227, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.12335205, + "step": 6947, + "time_per_iteration": 2.595292091369629 + }, + { + "auxiliary_loss_clip": 0.01128063, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.04519391, + "balance_loss_mlp": 1.02163291, + "epoch": 0.41773635953705096, + "flos": 39687419283840.0, + "grad_norm": 10.382350298818828, + "language_loss": 0.71759665, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.73922783, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.13439941, + "step": 6948, + "time_per_iteration": 2.7516210079193115 + }, + { + "auxiliary_loss_clip": 0.0112208, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.04224133, + "balance_loss_mlp": 1.02122879, + "epoch": 0.4177964827897189, + "flos": 28602864966240.0, + "grad_norm": 1.4577407681029366, + "language_loss": 0.76783681, + "learning_rate": 2.619219148905362e-06, + "loss": 0.78940052, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.13061523, + "step": 6949, + "time_per_iteration": 2.6526377201080322 + }, + { + "auxiliary_loss_clip": 0.01131114, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.04716778, + "balance_loss_mlp": 1.02368283, + "epoch": 0.4178566060423869, + "flos": 27758281370400.0, + "grad_norm": 1.632169068136412, + "language_loss": 0.81947672, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.84115708, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.13262939, + "step": 6950, + "time_per_iteration": 2.6597282886505127 + }, + { + "auxiliary_loss_clip": 0.01123739, + "auxiliary_loss_mlp": 0.01036156, + "balance_loss_clip": 1.04571772, + "balance_loss_mlp": 1.0240438, + "epoch": 0.41791672929505486, + "flos": 31765517102880.0, + "grad_norm": 1.2766310998079453, + "language_loss": 0.76053441, + "learning_rate": 2.618478451956007e-06, + "loss": 0.78213334, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12109375, + "step": 6951, + "time_per_iteration": 2.7207300662994385 + }, + { + "auxiliary_loss_clip": 0.01131601, + "auxiliary_loss_mlp": 0.0103591, + "balance_loss_clip": 1.04651546, + "balance_loss_mlp": 1.02191496, + "epoch": 0.4179768525477228, + "flos": 23878088776800.0, + "grad_norm": 1.6473060971211342, + "language_loss": 0.73367906, + "learning_rate": 2.61810806829516e-06, + "loss": 0.75535417, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.13995361, + "step": 6952, + "time_per_iteration": 2.6111133098602295 + }, + { + "auxiliary_loss_clip": 0.0113025, + "auxiliary_loss_mlp": 0.01037607, + "balance_loss_clip": 1.04765654, + "balance_loss_mlp": 1.02463675, + "epoch": 0.4180369758003908, + "flos": 21699785800800.0, + "grad_norm": 2.586077832970016, + "language_loss": 0.71991932, + "learning_rate": 2.617737661195593e-06, + "loss": 0.74159789, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.12976074, + "step": 6953, + "time_per_iteration": 2.6257948875427246 + }, + { + "auxiliary_loss_clip": 0.01127133, + "auxiliary_loss_mlp": 0.01035715, + "balance_loss_clip": 1.0468061, + "balance_loss_mlp": 1.02159429, + "epoch": 0.41809709905305875, + "flos": 25576169770080.0, + "grad_norm": 1.885730442823766, + "language_loss": 0.76089221, + "learning_rate": 2.617367230671353e-06, + "loss": 0.78252065, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.14117432, + "step": 6954, + "time_per_iteration": 2.6617813110351562 + }, + { + "auxiliary_loss_clip": 0.01127595, + "auxiliary_loss_mlp": 0.01039053, + "balance_loss_clip": 1.04489052, + "balance_loss_mlp": 1.02496839, + "epoch": 0.4181572223057267, + "flos": 26866055803680.0, + "grad_norm": 2.3432657530809764, + "language_loss": 0.84399819, + "learning_rate": 2.616996776736485e-06, + "loss": 0.86566472, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.14086914, + "step": 6955, + "time_per_iteration": 2.684980869293213 + }, + { + "auxiliary_loss_clip": 0.01126913, + "auxiliary_loss_mlp": 0.0103839, + "balance_loss_clip": 1.04645944, + "balance_loss_mlp": 1.02528358, + "epoch": 0.4182173455583947, + "flos": 32025680972640.0, + "grad_norm": 1.6220455801473355, + "language_loss": 0.83125281, + "learning_rate": 2.616626299405037e-06, + "loss": 0.85290587, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13104248, + "step": 6956, + "time_per_iteration": 2.6585748195648193 + }, + { + "auxiliary_loss_clip": 0.01130559, + "auxiliary_loss_mlp": 0.0103725, + "balance_loss_clip": 1.04689336, + "balance_loss_mlp": 1.02357697, + "epoch": 0.4182774688110627, + "flos": 18048484087200.0, + "grad_norm": 2.255114435482355, + "language_loss": 0.71647906, + "learning_rate": 2.616255798691059e-06, + "loss": 0.73815715, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13684082, + "step": 6957, + "time_per_iteration": 2.645109176635742 + }, + { + "auxiliary_loss_clip": 0.01128257, + "auxiliary_loss_mlp": 0.01040778, + "balance_loss_clip": 1.04705143, + "balance_loss_mlp": 1.02863097, + "epoch": 0.41833759206373067, + "flos": 24906798008640.0, + "grad_norm": 2.035363647980615, + "language_loss": 0.75620341, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.77789372, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.12145996, + "step": 6958, + "time_per_iteration": 2.631624698638916 + }, + { + "auxiliary_loss_clip": 0.01126127, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.04469824, + "balance_loss_mlp": 1.021065, + "epoch": 0.41839771531639863, + "flos": 28866027114720.0, + "grad_norm": 1.7677964441557636, + "language_loss": 0.76786458, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.78946972, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13317871, + "step": 6959, + "time_per_iteration": 2.6608188152313232 + }, + { + "auxiliary_loss_clip": 0.0112778, + "auxiliary_loss_mlp": 0.01037204, + "balance_loss_clip": 1.0456419, + "balance_loss_mlp": 1.02317309, + "epoch": 0.4184578385690666, + "flos": 24104102929920.0, + "grad_norm": 1.772406640624198, + "language_loss": 0.76872659, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.79037642, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.14025879, + "step": 6960, + "time_per_iteration": 5.502792119979858 + }, + { + "auxiliary_loss_clip": 0.01123671, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.04613757, + "balance_loss_mlp": 1.01854694, + "epoch": 0.41851796182173456, + "flos": 24639989304960.0, + "grad_norm": 1.9192495562309297, + "language_loss": 0.75948739, + "learning_rate": 2.614773562290835e-06, + "loss": 0.78103262, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12304688, + "step": 6961, + "time_per_iteration": 2.6316447257995605 + }, + { + "auxiliary_loss_clip": 0.01049337, + "auxiliary_loss_mlp": 0.01001962, + "balance_loss_clip": 1.02349615, + "balance_loss_mlp": 1.00060737, + "epoch": 0.41857808507440253, + "flos": 72013943475360.0, + "grad_norm": 0.7743682485816422, + "language_loss": 0.54718012, + "learning_rate": 2.61440294487496e-06, + "loss": 0.56769311, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 0.25854492, + "router_z_loss_mlp": 0.01355743, + "step": 6962, + "time_per_iteration": 3.2241756916046143 + }, + { + "auxiliary_loss_clip": 0.0113051, + "auxiliary_loss_mlp": 0.01037862, + "balance_loss_clip": 1.04708827, + "balance_loss_mlp": 1.02419448, + "epoch": 0.4186382083270705, + "flos": 22547691813600.0, + "grad_norm": 1.8843304675005519, + "language_loss": 0.85307574, + "learning_rate": 2.614032304160864e-06, + "loss": 0.8747595, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13659668, + "step": 6963, + "time_per_iteration": 2.6504383087158203 + }, + { + "auxiliary_loss_clip": 0.01128853, + "auxiliary_loss_mlp": 0.01034127, + "balance_loss_clip": 1.04752815, + "balance_loss_mlp": 1.02093053, + "epoch": 0.41869833157973846, + "flos": 26330615118720.0, + "grad_norm": 1.4874284961437165, + "language_loss": 0.70227736, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.72390723, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13208008, + "step": 6964, + "time_per_iteration": 2.6197433471679688 + }, + { + "auxiliary_loss_clip": 0.01126518, + "auxiliary_loss_mlp": 0.01035758, + "balance_loss_clip": 1.04658318, + "balance_loss_mlp": 1.02281165, + "epoch": 0.4187584548324064, + "flos": 43339531343040.0, + "grad_norm": 1.6934418044228072, + "language_loss": 0.71324289, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.73486567, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12945557, + "step": 6965, + "time_per_iteration": 2.788390874862671 + }, + { + "auxiliary_loss_clip": 0.01126722, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.04732561, + "balance_loss_mlp": 1.01869655, + "epoch": 0.4188185780850744, + "flos": 22764265440480.0, + "grad_norm": 1.6812226757943802, + "language_loss": 0.72140199, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.74297547, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.11920166, + "step": 6966, + "time_per_iteration": 4.051756381988525 + }, + { + "auxiliary_loss_clip": 0.0113358, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.04843879, + "balance_loss_mlp": 1.01724362, + "epoch": 0.41887870133774235, + "flos": 49216413348000.0, + "grad_norm": 1.9640091598844893, + "language_loss": 0.71065831, + "learning_rate": 2.612549508603375e-06, + "loss": 0.73230594, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.13946533, + "step": 6967, + "time_per_iteration": 2.80131459236145 + }, + { + "auxiliary_loss_clip": 0.01044638, + "auxiliary_loss_mlp": 0.01000912, + "balance_loss_clip": 1.01892173, + "balance_loss_mlp": 0.99952132, + "epoch": 0.4189388245904103, + "flos": 74886448955040.0, + "grad_norm": 0.6701432808352603, + "language_loss": 0.46194488, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48240036, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 0.25732422, + "router_z_loss_mlp": 0.01391602, + "step": 6968, + "time_per_iteration": 3.2562170028686523 + }, + { + "auxiliary_loss_clip": 0.01130554, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.04622102, + "balance_loss_mlp": 1.02039099, + "epoch": 0.4189989478430783, + "flos": 34429754998080.0, + "grad_norm": 1.8421408168109066, + "language_loss": 0.75271368, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.77436429, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.14105225, + "step": 6969, + "time_per_iteration": 2.6667845249176025 + }, + { + "auxiliary_loss_clip": 0.01125858, + "auxiliary_loss_mlp": 0.01033498, + "balance_loss_clip": 1.0450772, + "balance_loss_mlp": 1.02144623, + "epoch": 0.4190590710957463, + "flos": 29976163378560.0, + "grad_norm": 1.9531758896620928, + "language_loss": 0.81281841, + "learning_rate": 2.611437167992705e-06, + "loss": 0.83441198, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.12060547, + "step": 6970, + "time_per_iteration": 2.694134473800659 + }, + { + "auxiliary_loss_clip": 0.01126472, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.04640746, + "balance_loss_mlp": 1.0190624, + "epoch": 0.41911919434841427, + "flos": 26510202819360.0, + "grad_norm": 2.9377297242165668, + "language_loss": 0.83345115, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.85504234, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13580322, + "step": 6971, + "time_per_iteration": 4.010248184204102 + }, + { + "auxiliary_loss_clip": 0.01128079, + "auxiliary_loss_mlp": 0.01032111, + "balance_loss_clip": 1.04888058, + "balance_loss_mlp": 1.01802647, + "epoch": 0.41917931760108224, + "flos": 21477337168320.0, + "grad_norm": 1.60497077705697, + "language_loss": 0.74463516, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.76623702, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.14086914, + "step": 6972, + "time_per_iteration": 2.6689624786376953 + }, + { + "auxiliary_loss_clip": 0.01125848, + "auxiliary_loss_mlp": 0.01029347, + "balance_loss_clip": 1.04555702, + "balance_loss_mlp": 1.01694345, + "epoch": 0.4192394408537502, + "flos": 46145236528800.0, + "grad_norm": 1.5308613298039364, + "language_loss": 0.73025817, + "learning_rate": 2.610324618710212e-06, + "loss": 0.75181007, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.12408447, + "step": 6973, + "time_per_iteration": 2.8207507133483887 + }, + { + "auxiliary_loss_clip": 0.01135359, + "auxiliary_loss_mlp": 0.01040395, + "balance_loss_clip": 1.04929304, + "balance_loss_mlp": 1.02691817, + "epoch": 0.41929956410641817, + "flos": 28196695870560.0, + "grad_norm": 2.1275358685889314, + "language_loss": 0.748694, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77045155, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.13482666, + "step": 6974, + "time_per_iteration": 2.650418996810913 + }, + { + "auxiliary_loss_clip": 0.01126524, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.04567862, + "balance_loss_mlp": 1.01799262, + "epoch": 0.41935968735908613, + "flos": 27484989901920.0, + "grad_norm": 1.7233428419906731, + "language_loss": 0.73002988, + "learning_rate": 2.609582803447259e-06, + "loss": 0.75159931, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.12438965, + "step": 6975, + "time_per_iteration": 2.7000222206115723 + }, + { + "auxiliary_loss_clip": 0.01127803, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.04823971, + "balance_loss_mlp": 1.02025771, + "epoch": 0.4194198106117541, + "flos": 32787824604480.0, + "grad_norm": 1.472593398668525, + "language_loss": 0.81036943, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.83197957, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12957764, + "step": 6976, + "time_per_iteration": 2.679760456085205 + }, + { + "auxiliary_loss_clip": 0.01127125, + "auxiliary_loss_mlp": 0.01025155, + "balance_loss_clip": 1.04511869, + "balance_loss_mlp": 1.01280522, + "epoch": 0.41947993386442206, + "flos": 24284825114400.0, + "grad_norm": 4.414304464187838, + "language_loss": 0.68002117, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.70154399, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.12359619, + "step": 6977, + "time_per_iteration": 2.703524112701416 + }, + { + "auxiliary_loss_clip": 0.01129608, + "auxiliary_loss_mlp": 0.01036002, + "balance_loss_clip": 1.04764175, + "balance_loss_mlp": 1.02318072, + "epoch": 0.41954005711709, + "flos": 21211622431200.0, + "grad_norm": 2.7228746865085687, + "language_loss": 0.80893064, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.83058673, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.12817383, + "step": 6978, + "time_per_iteration": 2.5957443714141846 + }, + { + "auxiliary_loss_clip": 0.01129792, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.04569817, + "balance_loss_mlp": 1.01682305, + "epoch": 0.419600180369758, + "flos": 30509983372320.0, + "grad_norm": 1.8921618381554492, + "language_loss": 0.82490289, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.84650159, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.13262939, + "step": 6979, + "time_per_iteration": 2.6808245182037354 + }, + { + "auxiliary_loss_clip": 0.01125686, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.04514265, + "balance_loss_mlp": 1.01658463, + "epoch": 0.41966030362242596, + "flos": 21211500879360.0, + "grad_norm": 1.9373545574128426, + "language_loss": 0.83746648, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.85901755, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12835693, + "step": 6980, + "time_per_iteration": 2.7333486080169678 + }, + { + "auxiliary_loss_clip": 0.01130642, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.04767585, + "balance_loss_mlp": 1.0198679, + "epoch": 0.4197204268750939, + "flos": 27035797805280.0, + "grad_norm": 2.353494280485676, + "language_loss": 0.79819417, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81982636, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.12713623, + "step": 6981, + "time_per_iteration": 2.7384636402130127 + }, + { + "auxiliary_loss_clip": 0.01124962, + "auxiliary_loss_mlp": 0.01031416, + "balance_loss_clip": 1.04627609, + "balance_loss_mlp": 1.01901841, + "epoch": 0.4197805501277619, + "flos": 26947806973920.0, + "grad_norm": 1.6847624829621546, + "language_loss": 0.84329456, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.86485839, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12390137, + "step": 6982, + "time_per_iteration": 2.6691577434539795 + }, + { + "auxiliary_loss_clip": 0.01129329, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.04561806, + "balance_loss_mlp": 1.01851821, + "epoch": 0.4198406733804299, + "flos": 32253072713280.0, + "grad_norm": 1.9998013056832327, + "language_loss": 0.56525135, + "learning_rate": 2.606614618903214e-06, + "loss": 0.58685929, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.1295166, + "step": 6983, + "time_per_iteration": 2.6626532077789307 + }, + { + "auxiliary_loss_clip": 0.01127385, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.04664898, + "balance_loss_mlp": 1.02289379, + "epoch": 0.4199007966330979, + "flos": 15289813113120.0, + "grad_norm": 2.001624733813338, + "language_loss": 0.82113755, + "learning_rate": 2.606243492174471e-06, + "loss": 0.84276491, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.12445068, + "step": 6984, + "time_per_iteration": 2.6540119647979736 + }, + { + "auxiliary_loss_clip": 0.01125973, + "auxiliary_loss_mlp": 0.01028109, + "balance_loss_clip": 1.04579449, + "balance_loss_mlp": 1.0150373, + "epoch": 0.41996091988576584, + "flos": 26555211167040.0, + "grad_norm": 1.703121704306216, + "language_loss": 0.78797615, + "learning_rate": 2.605872342456914e-06, + "loss": 0.80951691, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13061523, + "step": 6985, + "time_per_iteration": 2.6549415588378906 + }, + { + "auxiliary_loss_clip": 0.01128634, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.04346323, + "balance_loss_mlp": 1.02205026, + "epoch": 0.4200210431384338, + "flos": 32386963272480.0, + "grad_norm": 1.656806539228207, + "language_loss": 0.78201079, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.8036449, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.12731934, + "step": 6986, + "time_per_iteration": 2.7421586513519287 + }, + { + "auxiliary_loss_clip": 0.01123648, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.04519832, + "balance_loss_mlp": 1.01788568, + "epoch": 0.42008116639110177, + "flos": 32696916528960.0, + "grad_norm": 1.4843109017345828, + "language_loss": 0.72347867, + "learning_rate": 2.605129974111655e-06, + "loss": 0.74501318, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.11914062, + "step": 6987, + "time_per_iteration": 2.7144460678100586 + }, + { + "auxiliary_loss_clip": 0.01127677, + "auxiliary_loss_mlp": 0.01039002, + "balance_loss_clip": 1.04673219, + "balance_loss_mlp": 1.02680659, + "epoch": 0.42014128964376973, + "flos": 39153923428320.0, + "grad_norm": 1.6593593388776, + "language_loss": 0.75016075, + "learning_rate": 2.604758755512104e-06, + "loss": 0.77182752, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.12188721, + "step": 6988, + "time_per_iteration": 2.7871665954589844 + }, + { + "auxiliary_loss_clip": 0.01129125, + "auxiliary_loss_mlp": 0.01037035, + "balance_loss_clip": 1.04458189, + "balance_loss_mlp": 1.02357602, + "epoch": 0.4202014128964377, + "flos": 32295852610560.0, + "grad_norm": 1.5529445469475702, + "language_loss": 0.74248844, + "learning_rate": 2.60438751398004e-06, + "loss": 0.76415008, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.13458252, + "step": 6989, + "time_per_iteration": 2.660606622695923 + }, + { + "auxiliary_loss_clip": 0.01129234, + "auxiliary_loss_mlp": 0.01030829, + "balance_loss_clip": 1.04656792, + "balance_loss_mlp": 1.01772809, + "epoch": 0.42026153614910566, + "flos": 16352185854240.0, + "grad_norm": 2.7914762352587856, + "language_loss": 0.71216196, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.73376262, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13122559, + "step": 6990, + "time_per_iteration": 2.6613149642944336 + }, + { + "auxiliary_loss_clip": 0.01044535, + "auxiliary_loss_mlp": 0.01010764, + "balance_loss_clip": 1.01848292, + "balance_loss_mlp": 1.00948453, + "epoch": 0.42032165940177363, + "flos": 73517728995360.0, + "grad_norm": 0.8374933239330162, + "language_loss": 0.60465622, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62520921, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 0.26049805, + "router_z_loss_mlp": 0.01280212, + "step": 6991, + "time_per_iteration": 3.1358449459075928 + }, + { + "auxiliary_loss_clip": 0.01129953, + "auxiliary_loss_mlp": 0.0103642, + "balance_loss_clip": 1.04737127, + "balance_loss_mlp": 1.02334833, + "epoch": 0.4203817826544416, + "flos": 29938205037600.0, + "grad_norm": 1.6268424964656578, + "language_loss": 0.83085889, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85252261, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13067627, + "step": 6992, + "time_per_iteration": 2.7797632217407227 + }, + { + "auxiliary_loss_clip": 0.01044788, + "auxiliary_loss_mlp": 0.01012076, + "balance_loss_clip": 1.01868987, + "balance_loss_mlp": 1.01074696, + "epoch": 0.42044190590710956, + "flos": 71773748274240.0, + "grad_norm": 0.8094646027553248, + "language_loss": 0.65611529, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.6766839, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 0.26147461, + "router_z_loss_mlp": 0.01329803, + "step": 6993, + "time_per_iteration": 3.2393219470977783 + }, + { + "auxiliary_loss_clip": 0.01131578, + "auxiliary_loss_mlp": 0.01036891, + "balance_loss_clip": 1.04591084, + "balance_loss_mlp": 1.02258611, + "epoch": 0.4205020291597775, + "flos": 20054938163040.0, + "grad_norm": 2.0417808665400234, + "language_loss": 0.83792222, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.85960686, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.14306641, + "step": 6994, + "time_per_iteration": 2.5947961807250977 + }, + { + "auxiliary_loss_clip": 0.01125161, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.04632568, + "balance_loss_mlp": 1.01961493, + "epoch": 0.4205621524124455, + "flos": 22458444946560.0, + "grad_norm": 1.7346098203422533, + "language_loss": 0.78288192, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.80445009, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12054443, + "step": 6995, + "time_per_iteration": 2.6650731563568115 + }, + { + "auxiliary_loss_clip": 0.01120211, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.04224885, + "balance_loss_mlp": 1.01934743, + "epoch": 0.4206222756651135, + "flos": 31140059722560.0, + "grad_norm": 1.6111073584222737, + "language_loss": 0.80336577, + "learning_rate": 2.60178818232786e-06, + "loss": 0.82488191, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.1206665, + "step": 6996, + "time_per_iteration": 2.6575934886932373 + }, + { + "auxiliary_loss_clip": 0.01126641, + "auxiliary_loss_mlp": 0.01029562, + "balance_loss_clip": 1.04541087, + "balance_loss_mlp": 1.01720572, + "epoch": 0.4206823989177815, + "flos": 18673981984800.0, + "grad_norm": 1.992485387956217, + "language_loss": 0.75948435, + "learning_rate": 2.601416757842559e-06, + "loss": 0.78104633, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.12347412, + "step": 6997, + "time_per_iteration": 2.7153685092926025 + }, + { + "auxiliary_loss_clip": 0.01123873, + "auxiliary_loss_mlp": 0.01036303, + "balance_loss_clip": 1.0419209, + "balance_loss_mlp": 1.02403009, + "epoch": 0.42074252217044944, + "flos": 18979275754080.0, + "grad_norm": 4.3914941270576895, + "language_loss": 0.7515524, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.77315414, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.1227417, + "step": 6998, + "time_per_iteration": 2.593144178390503 + }, + { + "auxiliary_loss_clip": 0.01130126, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.04697871, + "balance_loss_mlp": 1.02024281, + "epoch": 0.4208026454231174, + "flos": 31903175769120.0, + "grad_norm": 1.7693274584038299, + "language_loss": 0.76084208, + "learning_rate": 2.60067384046869e-06, + "loss": 0.78247476, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.12908936, + "step": 6999, + "time_per_iteration": 4.203633785247803 + }, + { + "auxiliary_loss_clip": 0.01125265, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_clip": 1.04522002, + "balance_loss_mlp": 1.02340603, + "epoch": 0.42086276867578537, + "flos": 28736107248960.0, + "grad_norm": 3.474555419422558, + "language_loss": 0.64124709, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66286331, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.12945557, + "step": 7000, + "time_per_iteration": 4.207256317138672 + }, + { + "auxiliary_loss_clip": 0.01127088, + "auxiliary_loss_mlp": 0.01035945, + "balance_loss_clip": 1.04581928, + "balance_loss_mlp": 1.02298141, + "epoch": 0.42092289192845334, + "flos": 22101862651200.0, + "grad_norm": 1.8789009683691766, + "language_loss": 0.76543367, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.78706402, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.12963867, + "step": 7001, + "time_per_iteration": 2.6977603435516357 + }, + { + "auxiliary_loss_clip": 0.01125628, + "auxiliary_loss_mlp": 0.01031987, + "balance_loss_clip": 1.04563999, + "balance_loss_mlp": 1.01983333, + "epoch": 0.4209830151811213, + "flos": 24412435495200.0, + "grad_norm": 2.044933949019262, + "language_loss": 0.86777806, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.88935423, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12158203, + "step": 7002, + "time_per_iteration": 2.626382350921631 + }, + { + "auxiliary_loss_clip": 0.01124053, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.04495454, + "balance_loss_mlp": 1.02178276, + "epoch": 0.42104313843378927, + "flos": 26819062109280.0, + "grad_norm": 1.9643265784975075, + "language_loss": 0.67649597, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.69807106, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.11663818, + "step": 7003, + "time_per_iteration": 2.7303249835968018 + }, + { + "auxiliary_loss_clip": 0.01127556, + "auxiliary_loss_mlp": 0.0103297, + "balance_loss_clip": 1.04466283, + "balance_loss_mlp": 1.01980925, + "epoch": 0.42110326168645723, + "flos": 31047449921280.0, + "grad_norm": 3.1738229161161344, + "language_loss": 0.77593863, + "learning_rate": 2.598816148672344e-06, + "loss": 0.79754388, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.1317749, + "step": 7004, + "time_per_iteration": 2.6655187606811523 + }, + { + "auxiliary_loss_clip": 0.01124165, + "auxiliary_loss_mlp": 0.01036371, + "balance_loss_clip": 1.04617202, + "balance_loss_mlp": 1.02332401, + "epoch": 0.4211633849391252, + "flos": 21746860529760.0, + "grad_norm": 1.7853364480968186, + "language_loss": 0.67810202, + "learning_rate": 2.59844454213521e-06, + "loss": 0.69970739, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.13061523, + "step": 7005, + "time_per_iteration": 2.6932666301727295 + }, + { + "auxiliary_loss_clip": 0.01125804, + "auxiliary_loss_mlp": 0.01033026, + "balance_loss_clip": 1.04424381, + "balance_loss_mlp": 1.02039576, + "epoch": 0.42122350819179316, + "flos": 19868178903840.0, + "grad_norm": 1.7811119506284336, + "language_loss": 0.72969741, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.75128573, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.12628174, + "step": 7006, + "time_per_iteration": 3.969564199447632 + }, + { + "auxiliary_loss_clip": 0.01126995, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.04403675, + "balance_loss_mlp": 1.01854324, + "epoch": 0.4212836314444611, + "flos": 23971873579200.0, + "grad_norm": 1.9197155207167949, + "language_loss": 0.70743382, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.7290166, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.12744141, + "step": 7007, + "time_per_iteration": 2.669642686843872 + }, + { + "auxiliary_loss_clip": 0.01125558, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.0450232, + "balance_loss_mlp": 1.02102959, + "epoch": 0.4213437546971291, + "flos": 22413760737120.0, + "grad_norm": 1.9771930041673365, + "language_loss": 0.82407892, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84566778, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12286377, + "step": 7008, + "time_per_iteration": 2.651829481124878 + }, + { + "auxiliary_loss_clip": 0.0112471, + "auxiliary_loss_mlp": 0.01038204, + "balance_loss_clip": 1.04316247, + "balance_loss_mlp": 1.02516866, + "epoch": 0.42140387794979706, + "flos": 33806080378080.0, + "grad_norm": 1.8070584353312782, + "language_loss": 0.71812218, + "learning_rate": 2.596957889196831e-06, + "loss": 0.73975134, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13049316, + "step": 7009, + "time_per_iteration": 2.6900336742401123 + }, + { + "auxiliary_loss_clip": 0.01126789, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.04459214, + "balance_loss_mlp": 1.01930976, + "epoch": 0.4214640012024651, + "flos": 34345694342880.0, + "grad_norm": 1.9787317371214317, + "language_loss": 0.66118503, + "learning_rate": 2.596586169335243e-06, + "loss": 0.68277359, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.12768555, + "step": 7010, + "time_per_iteration": 4.123753547668457 + }, + { + "auxiliary_loss_clip": 0.01123636, + "auxiliary_loss_mlp": 0.01030077, + "balance_loss_clip": 1.04398274, + "balance_loss_mlp": 1.01717889, + "epoch": 0.42152412445513304, + "flos": 28061994965760.0, + "grad_norm": 1.6646718760163999, + "language_loss": 0.72598505, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.74752223, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12908936, + "step": 7011, + "time_per_iteration": 2.6423146724700928 + }, + { + "auxiliary_loss_clip": 0.0104711, + "auxiliary_loss_mlp": 0.01010297, + "balance_loss_clip": 1.02127361, + "balance_loss_mlp": 1.00898683, + "epoch": 0.421584247707801, + "flos": 77787964807200.0, + "grad_norm": 0.810601289449329, + "language_loss": 0.54367691, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56425101, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 0.25805664, + "router_z_loss_mlp": 0.0131073, + "step": 7012, + "time_per_iteration": 3.1871328353881836 + }, + { + "auxiliary_loss_clip": 0.01127667, + "auxiliary_loss_mlp": 0.01030201, + "balance_loss_clip": 1.04590726, + "balance_loss_mlp": 1.01703477, + "epoch": 0.421644370960469, + "flos": 29669167883520.0, + "grad_norm": 1.6009529001051332, + "language_loss": 0.78560483, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.8071835, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13165283, + "step": 7013, + "time_per_iteration": 2.7832446098327637 + }, + { + "auxiliary_loss_clip": 0.01127215, + "auxiliary_loss_mlp": 0.01035124, + "balance_loss_clip": 1.04440701, + "balance_loss_mlp": 1.0219934, + "epoch": 0.42170449421313694, + "flos": 28602581345280.0, + "grad_norm": 1.8900708843060272, + "language_loss": 0.81154108, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83316445, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13146973, + "step": 7014, + "time_per_iteration": 2.6823525428771973 + }, + { + "auxiliary_loss_clip": 0.01125725, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.04374671, + "balance_loss_mlp": 1.02013576, + "epoch": 0.4217646174658049, + "flos": 28913547533760.0, + "grad_norm": 2.130750452048755, + "language_loss": 0.77765954, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.79924345, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.12542725, + "step": 7015, + "time_per_iteration": 2.756805181503296 + }, + { + "auxiliary_loss_clip": 0.01128509, + "auxiliary_loss_mlp": 0.01038004, + "balance_loss_clip": 1.04514396, + "balance_loss_mlp": 1.02467048, + "epoch": 0.42182474071847287, + "flos": 30470809512960.0, + "grad_norm": 1.51077172486921, + "language_loss": 0.8183018, + "learning_rate": 2.594355375584368e-06, + "loss": 0.83996689, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.13330078, + "step": 7016, + "time_per_iteration": 2.672609806060791 + }, + { + "auxiliary_loss_clip": 0.01128827, + "auxiliary_loss_mlp": 0.01033167, + "balance_loss_clip": 1.04626203, + "balance_loss_mlp": 1.0197798, + "epoch": 0.42188486397114083, + "flos": 27889902961920.0, + "grad_norm": 1.9319352782971144, + "language_loss": 0.68037188, + "learning_rate": 2.593983497660586e-06, + "loss": 0.70199186, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.13378906, + "step": 7017, + "time_per_iteration": 2.7376675605773926 + }, + { + "auxiliary_loss_clip": 0.01044679, + "auxiliary_loss_mlp": 0.00999907, + "balance_loss_clip": 1.01887727, + "balance_loss_mlp": 0.99869573, + "epoch": 0.4219449872238088, + "flos": 81723295068480.0, + "grad_norm": 0.6906659210888672, + "language_loss": 0.59427524, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61472106, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 0.25805664, + "router_z_loss_mlp": 0.01210785, + "step": 7018, + "time_per_iteration": 3.346855401992798 + }, + { + "auxiliary_loss_clip": 0.01127273, + "auxiliary_loss_mlp": 0.01033975, + "balance_loss_clip": 1.04391456, + "balance_loss_mlp": 1.02146983, + "epoch": 0.42200511047647676, + "flos": 16002815634720.0, + "grad_norm": 3.5229343383799816, + "language_loss": 0.75318003, + "learning_rate": 2.593239674255382e-06, + "loss": 0.77479249, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.12518311, + "step": 7019, + "time_per_iteration": 2.6960666179656982 + }, + { + "auxiliary_loss_clip": 0.01123991, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.04385746, + "balance_loss_mlp": 1.02235436, + "epoch": 0.42206523372914473, + "flos": 17071549588800.0, + "grad_norm": 2.003359484799376, + "language_loss": 0.69165981, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71325636, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13299561, + "step": 7020, + "time_per_iteration": 2.6788113117218018 + }, + { + "auxiliary_loss_clip": 0.01124415, + "auxiliary_loss_mlp": 0.01032573, + "balance_loss_clip": 1.04640234, + "balance_loss_mlp": 1.02060437, + "epoch": 0.4221253569818127, + "flos": 26774377899840.0, + "grad_norm": 1.8743515440100857, + "language_loss": 0.80861568, + "learning_rate": 2.592495760867347e-06, + "loss": 0.83018565, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.11968994, + "step": 7021, + "time_per_iteration": 2.666597604751587 + }, + { + "auxiliary_loss_clip": 0.01126827, + "auxiliary_loss_mlp": 0.01030845, + "balance_loss_clip": 1.04518485, + "balance_loss_mlp": 1.01808965, + "epoch": 0.42218548023448066, + "flos": 39282749327520.0, + "grad_norm": 1.575586582770612, + "language_loss": 0.70039022, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.72196692, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.12756348, + "step": 7022, + "time_per_iteration": 2.7540524005889893 + }, + { + "auxiliary_loss_clip": 0.01123015, + "auxiliary_loss_mlp": 0.01032858, + "balance_loss_clip": 1.04668307, + "balance_loss_mlp": 1.02202773, + "epoch": 0.4222456034871487, + "flos": 36751794232320.0, + "grad_norm": 1.8047148029944056, + "language_loss": 0.67739183, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69895053, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.10827637, + "step": 7023, + "time_per_iteration": 2.7043871879577637 + }, + { + "auxiliary_loss_clip": 0.01123363, + "auxiliary_loss_mlp": 0.01037169, + "balance_loss_clip": 1.04633152, + "balance_loss_mlp": 1.0243839, + "epoch": 0.42230572673981664, + "flos": 27000554122080.0, + "grad_norm": 1.5739624821772829, + "language_loss": 0.6909647, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71256995, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12799072, + "step": 7024, + "time_per_iteration": 2.73442006111145 + }, + { + "auxiliary_loss_clip": 0.01125317, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.04528856, + "balance_loss_mlp": 1.02676666, + "epoch": 0.4223658499924846, + "flos": 26911671910560.0, + "grad_norm": 1.6186661448376856, + "language_loss": 0.76760906, + "learning_rate": 2.591007664594147e-06, + "loss": 0.78925431, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12432861, + "step": 7025, + "time_per_iteration": 2.7096753120422363 + }, + { + "auxiliary_loss_clip": 0.01123469, + "auxiliary_loss_mlp": 0.01033995, + "balance_loss_clip": 1.04498601, + "balance_loss_mlp": 1.02230668, + "epoch": 0.4224259732451526, + "flos": 24905339386560.0, + "grad_norm": 1.962664691519916, + "language_loss": 0.7991066, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.82068121, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.11688232, + "step": 7026, + "time_per_iteration": 2.7267649173736572 + }, + { + "auxiliary_loss_clip": 0.01044185, + "auxiliary_loss_mlp": 0.01016808, + "balance_loss_clip": 1.01808321, + "balance_loss_mlp": 1.01552248, + "epoch": 0.42248609649782054, + "flos": 76685283722880.0, + "grad_norm": 0.7238648811629704, + "language_loss": 0.61902165, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.63963157, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 0.26147461, + "router_z_loss_mlp": 0.01285553, + "step": 7027, + "time_per_iteration": 3.3363633155822754 + }, + { + "auxiliary_loss_clip": 0.01124149, + "auxiliary_loss_mlp": 0.01035666, + "balance_loss_clip": 1.04413128, + "balance_loss_mlp": 1.02308357, + "epoch": 0.4225462197504885, + "flos": 32030664598080.0, + "grad_norm": 2.1265939077888287, + "language_loss": 0.70852101, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.73011917, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.12579346, + "step": 7028, + "time_per_iteration": 2.723416328430176 + }, + { + "auxiliary_loss_clip": 0.01125918, + "auxiliary_loss_mlp": 0.0104403, + "balance_loss_clip": 1.04412639, + "balance_loss_mlp": 1.03053594, + "epoch": 0.42260634300315647, + "flos": 25040891154240.0, + "grad_norm": 1.8146282777001157, + "language_loss": 0.82409537, + "learning_rate": 2.589519209743846e-06, + "loss": 0.84579492, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13494873, + "step": 7029, + "time_per_iteration": 2.683023452758789 + }, + { + "auxiliary_loss_clip": 0.01130543, + "auxiliary_loss_mlp": 0.01039862, + "balance_loss_clip": 1.04677296, + "balance_loss_mlp": 1.02603388, + "epoch": 0.42266646625582444, + "flos": 29671558403040.0, + "grad_norm": 2.003928125289474, + "language_loss": 0.75596178, + "learning_rate": 2.589147040109424e-06, + "loss": 0.77766585, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.13824463, + "step": 7030, + "time_per_iteration": 2.6474862098693848 + }, + { + "auxiliary_loss_clip": 0.01124467, + "auxiliary_loss_mlp": 0.01033578, + "balance_loss_clip": 1.04304647, + "balance_loss_mlp": 1.01987445, + "epoch": 0.4227265895084924, + "flos": 29534466978720.0, + "grad_norm": 2.911056105628908, + "language_loss": 0.86502552, + "learning_rate": 2.588774848134486e-06, + "loss": 0.88660592, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.137146, + "step": 7031, + "time_per_iteration": 2.7280526161193848 + }, + { + "auxiliary_loss_clip": 0.01126151, + "auxiliary_loss_mlp": 0.01034903, + "balance_loss_clip": 1.04517949, + "balance_loss_mlp": 1.02078867, + "epoch": 0.42278671276116037, + "flos": 20633037193440.0, + "grad_norm": 1.9773502625081867, + "language_loss": 0.73335594, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.7549665, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.14086914, + "step": 7032, + "time_per_iteration": 2.622093439102173 + }, + { + "auxiliary_loss_clip": 0.01125247, + "auxiliary_loss_mlp": 0.01039291, + "balance_loss_clip": 1.04225838, + "balance_loss_mlp": 1.02657139, + "epoch": 0.42284683601382833, + "flos": 31006979508960.0, + "grad_norm": 1.5985817455289058, + "language_loss": 0.69916391, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.72080922, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.1272583, + "step": 7033, + "time_per_iteration": 2.706662893295288 + }, + { + "auxiliary_loss_clip": 0.01126626, + "auxiliary_loss_mlp": 0.01031563, + "balance_loss_clip": 1.04471135, + "balance_loss_mlp": 1.01827157, + "epoch": 0.4229069592664963, + "flos": 28113931768320.0, + "grad_norm": 2.1307385400805727, + "language_loss": 0.90403795, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.92561984, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13305664, + "step": 7034, + "time_per_iteration": 2.64041805267334 + }, + { + "auxiliary_loss_clip": 0.01121301, + "auxiliary_loss_mlp": 0.01035866, + "balance_loss_clip": 1.04198432, + "balance_loss_mlp": 1.02366519, + "epoch": 0.42296708251916426, + "flos": 31807972861920.0, + "grad_norm": 1.700849398072338, + "language_loss": 0.77527219, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79684389, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12200928, + "step": 7035, + "time_per_iteration": 2.730905532836914 + }, + { + "auxiliary_loss_clip": 0.01126306, + "auxiliary_loss_mlp": 0.0104232, + "balance_loss_clip": 1.0443989, + "balance_loss_mlp": 1.02917743, + "epoch": 0.4230272057718323, + "flos": 23743549941120.0, + "grad_norm": 1.7985684540944515, + "language_loss": 0.82259774, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.84428394, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13146973, + "step": 7036, + "time_per_iteration": 2.5854110717773438 + }, + { + "auxiliary_loss_clip": 0.01123117, + "auxiliary_loss_mlp": 0.01034135, + "balance_loss_clip": 1.04486394, + "balance_loss_mlp": 1.02168941, + "epoch": 0.42308732902450025, + "flos": 27312209104320.0, + "grad_norm": 1.8110269837590434, + "language_loss": 0.70534122, + "learning_rate": 2.58654122792447e-06, + "loss": 0.72691381, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12438965, + "step": 7037, + "time_per_iteration": 2.7171437740325928 + }, + { + "auxiliary_loss_clip": 0.01123243, + "auxiliary_loss_mlp": 0.01034517, + "balance_loss_clip": 1.04230464, + "balance_loss_mlp": 1.02117789, + "epoch": 0.4231474522771682, + "flos": 25618747080960.0, + "grad_norm": 1.5669699568468125, + "language_loss": 0.77800566, + "learning_rate": 2.586168879961155e-06, + "loss": 0.79958332, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13360596, + "step": 7038, + "time_per_iteration": 2.676013231277466 + }, + { + "auxiliary_loss_clip": 0.01129977, + "auxiliary_loss_mlp": 0.01041731, + "balance_loss_clip": 1.04509521, + "balance_loss_mlp": 1.02775979, + "epoch": 0.4232075755298362, + "flos": 18273161170080.0, + "grad_norm": 3.7408902553289214, + "language_loss": 0.67182231, + "learning_rate": 2.585796509770259e-06, + "loss": 0.69353938, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.13964844, + "step": 7039, + "time_per_iteration": 5.548948049545288 + }, + { + "auxiliary_loss_clip": 0.0112914, + "auxiliary_loss_mlp": 0.01035657, + "balance_loss_clip": 1.04424119, + "balance_loss_mlp": 1.02233529, + "epoch": 0.42326769878250414, + "flos": 29935328310720.0, + "grad_norm": 1.6024015684102078, + "language_loss": 0.75691366, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.77856165, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.13323975, + "step": 7040, + "time_per_iteration": 2.678117036819458 + }, + { + "auxiliary_loss_clip": 0.01125878, + "auxiliary_loss_mlp": 0.01028692, + "balance_loss_clip": 1.04373753, + "balance_loss_mlp": 1.01564431, + "epoch": 0.4233278220351721, + "flos": 32787662535360.0, + "grad_norm": 1.857852813512283, + "language_loss": 0.65383434, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.67537999, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13049316, + "step": 7041, + "time_per_iteration": 2.71636962890625 + }, + { + "auxiliary_loss_clip": 0.01127501, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.04402423, + "balance_loss_mlp": 1.01939583, + "epoch": 0.4233879452878401, + "flos": 52244486131680.0, + "grad_norm": 1.6591977520117176, + "language_loss": 0.73859358, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.76019728, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.13470459, + "step": 7042, + "time_per_iteration": 2.8182196617126465 + }, + { + "auxiliary_loss_clip": 0.01123407, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.04435837, + "balance_loss_mlp": 1.02007639, + "epoch": 0.42344806854050804, + "flos": 30784773980160.0, + "grad_norm": 1.333482230848988, + "language_loss": 0.82159346, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84315342, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12518311, + "step": 7043, + "time_per_iteration": 2.7020061016082764 + }, + { + "auxiliary_loss_clip": 0.01127367, + "auxiliary_loss_mlp": 0.01039027, + "balance_loss_clip": 1.04651105, + "balance_loss_mlp": 1.02435279, + "epoch": 0.423508191793176, + "flos": 27795388848480.0, + "grad_norm": 2.764570996051263, + "language_loss": 0.65256023, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.67422414, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.14672852, + "step": 7044, + "time_per_iteration": 2.619650363922119 + }, + { + "auxiliary_loss_clip": 0.01133471, + "auxiliary_loss_mlp": 0.01043208, + "balance_loss_clip": 1.0495497, + "balance_loss_mlp": 1.02878356, + "epoch": 0.42356831504584397, + "flos": 42266178419040.0, + "grad_norm": 1.8611117190913413, + "language_loss": 0.75432676, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.7760936, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.144104, + "step": 7045, + "time_per_iteration": 4.156236410140991 + }, + { + "auxiliary_loss_clip": 0.0112511, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.04637837, + "balance_loss_mlp": 1.02088439, + "epoch": 0.42362843829851193, + "flos": 21471948370080.0, + "grad_norm": 2.972483498329293, + "language_loss": 0.80805963, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.82964826, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12890625, + "step": 7046, + "time_per_iteration": 2.7098307609558105 + }, + { + "auxiliary_loss_clip": 0.0112798, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.04468238, + "balance_loss_mlp": 1.02202678, + "epoch": 0.4236885615511799, + "flos": 27534779288640.0, + "grad_norm": 1.678370743002373, + "language_loss": 0.76583058, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.78745943, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.12884521, + "step": 7047, + "time_per_iteration": 2.6496024131774902 + }, + { + "auxiliary_loss_clip": 0.0112632, + "auxiliary_loss_mlp": 0.01033207, + "balance_loss_clip": 1.04705441, + "balance_loss_mlp": 1.02074933, + "epoch": 0.42374868480384786, + "flos": 32298202612800.0, + "grad_norm": 1.7134984024624242, + "language_loss": 0.68123615, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70283139, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12469482, + "step": 7048, + "time_per_iteration": 2.70756459236145 + }, + { + "auxiliary_loss_clip": 0.01128775, + "auxiliary_loss_mlp": 0.01036291, + "balance_loss_clip": 1.0466783, + "balance_loss_mlp": 1.02222455, + "epoch": 0.4238088080565159, + "flos": 24856279310880.0, + "grad_norm": 2.0977702497862447, + "language_loss": 0.78423208, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.80588275, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.140625, + "step": 7049, + "time_per_iteration": 2.648268461227417 + }, + { + "auxiliary_loss_clip": 0.01130903, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.04769075, + "balance_loss_mlp": 1.02636826, + "epoch": 0.42386893130918385, + "flos": 25834875017760.0, + "grad_norm": 1.8125152828801554, + "language_loss": 0.82413357, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.84583986, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.13366699, + "step": 7050, + "time_per_iteration": 3.964246988296509 + }, + { + "auxiliary_loss_clip": 0.01127828, + "auxiliary_loss_mlp": 0.01034604, + "balance_loss_clip": 1.04463243, + "balance_loss_mlp": 1.02156854, + "epoch": 0.4239290545618518, + "flos": 21567637484640.0, + "grad_norm": 1.9556467801995283, + "language_loss": 0.73868299, + "learning_rate": 2.581326338868687e-06, + "loss": 0.76030731, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.13049316, + "step": 7051, + "time_per_iteration": 2.6170132160186768 + }, + { + "auxiliary_loss_clip": 0.01126488, + "auxiliary_loss_mlp": 0.01032637, + "balance_loss_clip": 1.04516196, + "balance_loss_mlp": 1.02000666, + "epoch": 0.4239891778145198, + "flos": 29669046331680.0, + "grad_norm": 1.539652247437972, + "language_loss": 0.86359686, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.8851881, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.12646484, + "step": 7052, + "time_per_iteration": 2.658477783203125 + }, + { + "auxiliary_loss_clip": 0.01128436, + "auxiliary_loss_mlp": 0.01042229, + "balance_loss_clip": 1.04615963, + "balance_loss_mlp": 1.02866268, + "epoch": 0.42404930106718774, + "flos": 25085413294560.0, + "grad_norm": 2.585725923842254, + "language_loss": 0.72554457, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.74725115, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13568115, + "step": 7053, + "time_per_iteration": 2.680939197540283 + }, + { + "auxiliary_loss_clip": 0.01128937, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.04816842, + "balance_loss_mlp": 1.01917696, + "epoch": 0.4241094243198557, + "flos": 27220612235040.0, + "grad_norm": 1.4639812609436875, + "language_loss": 0.82251942, + "learning_rate": 2.580208299200704e-06, + "loss": 0.84413064, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13024902, + "step": 7054, + "time_per_iteration": 2.631070852279663 + }, + { + "auxiliary_loss_clip": 0.01047184, + "auxiliary_loss_mlp": 0.01003565, + "balance_loss_clip": 1.02141929, + "balance_loss_mlp": 1.00225687, + "epoch": 0.4241695475725237, + "flos": 86162219432640.0, + "grad_norm": 0.7942330155685918, + "language_loss": 0.60455358, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62506109, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.01308441, + "step": 7055, + "time_per_iteration": 3.3215322494506836 + }, + { + "auxiliary_loss_clip": 0.01129155, + "auxiliary_loss_mlp": 0.0103644, + "balance_loss_clip": 1.04698932, + "balance_loss_mlp": 1.02258182, + "epoch": 0.42422967082519164, + "flos": 17115828625440.0, + "grad_norm": 2.6207199249753166, + "language_loss": 0.77124959, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.79290557, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13861084, + "step": 7056, + "time_per_iteration": 2.6091184616088867 + }, + { + "auxiliary_loss_clip": 0.01132177, + "auxiliary_loss_mlp": 0.01034104, + "balance_loss_clip": 1.04684496, + "balance_loss_mlp": 1.01945913, + "epoch": 0.4242897940778596, + "flos": 27266228341920.0, + "grad_norm": 2.0451870642456544, + "language_loss": 0.8379426, + "learning_rate": 2.579090061518714e-06, + "loss": 0.85960543, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.1463623, + "step": 7057, + "time_per_iteration": 2.728379964828491 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01037056, + "balance_loss_clip": 1.04659486, + "balance_loss_mlp": 1.02328706, + "epoch": 0.42434991733052757, + "flos": 27572170387680.0, + "grad_norm": 2.3036926717027484, + "language_loss": 0.83614588, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.85782796, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.13769531, + "step": 7058, + "time_per_iteration": 2.6289901733398438 + }, + { + "auxiliary_loss_clip": 0.01127783, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.04782784, + "balance_loss_mlp": 1.01726758, + "epoch": 0.42441004058319554, + "flos": 24417905328000.0, + "grad_norm": 1.6926831284679114, + "language_loss": 0.80230832, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.8238799, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12115479, + "step": 7059, + "time_per_iteration": 2.750715494155884 + }, + { + "auxiliary_loss_clip": 0.01128925, + "auxiliary_loss_mlp": 0.01036975, + "balance_loss_clip": 1.04579127, + "balance_loss_mlp": 1.02199626, + "epoch": 0.4244701638358635, + "flos": 13598417471040.0, + "grad_norm": 2.0762115749948284, + "language_loss": 0.70286715, + "learning_rate": 2.57797162620435e-06, + "loss": 0.72452617, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.14990234, + "step": 7060, + "time_per_iteration": 2.5951969623565674 + }, + { + "auxiliary_loss_clip": 0.01128212, + "auxiliary_loss_mlp": 0.0103195, + "balance_loss_clip": 1.04703808, + "balance_loss_mlp": 1.01905203, + "epoch": 0.42453028708853147, + "flos": 29270170346400.0, + "grad_norm": 1.5834890485918667, + "language_loss": 0.76381171, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78541338, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.12896729, + "step": 7061, + "time_per_iteration": 2.725623607635498 + }, + { + "auxiliary_loss_clip": 0.01132294, + "auxiliary_loss_mlp": 0.01039165, + "balance_loss_clip": 1.04849195, + "balance_loss_mlp": 1.02480662, + "epoch": 0.42459041034119943, + "flos": 22459214774880.0, + "grad_norm": 2.0742378871151868, + "language_loss": 0.72545284, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.74716747, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.14349365, + "step": 7062, + "time_per_iteration": 2.6260926723480225 + }, + { + "auxiliary_loss_clip": 0.01127918, + "auxiliary_loss_mlp": 0.01040442, + "balance_loss_clip": 1.04528785, + "balance_loss_mlp": 1.02752566, + "epoch": 0.42465053359386745, + "flos": 25573171491360.0, + "grad_norm": 2.560180233326195, + "language_loss": 0.66122901, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68291259, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.12915039, + "step": 7063, + "time_per_iteration": 2.6924729347229004 + }, + { + "auxiliary_loss_clip": 0.01123969, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.04511762, + "balance_loss_mlp": 1.01854241, + "epoch": 0.4247106568465354, + "flos": 40397464044000.0, + "grad_norm": 2.7370218016040706, + "language_loss": 0.78716803, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.80871868, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12524414, + "step": 7064, + "time_per_iteration": 2.792452573776245 + }, + { + "auxiliary_loss_clip": 0.01128383, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.04615402, + "balance_loss_mlp": 1.02230811, + "epoch": 0.4247707800992034, + "flos": 24461292984480.0, + "grad_norm": 2.0583512937555084, + "language_loss": 0.75550067, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.77714348, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.13592529, + "step": 7065, + "time_per_iteration": 2.662238836288452 + }, + { + "auxiliary_loss_clip": 0.0112959, + "auxiliary_loss_mlp": 0.01039491, + "balance_loss_clip": 1.0483942, + "balance_loss_mlp": 1.02518034, + "epoch": 0.42483090335187135, + "flos": 27314275485600.0, + "grad_norm": 1.3718081158970346, + "language_loss": 0.72297311, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74466395, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.14294434, + "step": 7066, + "time_per_iteration": 2.6860241889953613 + }, + { + "auxiliary_loss_clip": 0.01129477, + "auxiliary_loss_mlp": 0.0103939, + "balance_loss_clip": 1.04579175, + "balance_loss_mlp": 1.02474475, + "epoch": 0.4248910266045393, + "flos": 26060078825280.0, + "grad_norm": 2.372829912589596, + "language_loss": 0.79685605, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.81854475, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.14648438, + "step": 7067, + "time_per_iteration": 2.671271800994873 + }, + { + "auxiliary_loss_clip": 0.01046103, + "auxiliary_loss_mlp": 0.01002445, + "balance_loss_clip": 1.02041483, + "balance_loss_mlp": 1.00107837, + "epoch": 0.4249511498572073, + "flos": 78103144792800.0, + "grad_norm": 0.916745228599912, + "language_loss": 0.63499182, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65547729, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 0.25732422, + "router_z_loss_mlp": 0.01366425, + "step": 7068, + "time_per_iteration": 3.267163038253784 + }, + { + "auxiliary_loss_clip": 0.01128371, + "auxiliary_loss_mlp": 0.01034709, + "balance_loss_clip": 1.04614234, + "balance_loss_mlp": 1.02004671, + "epoch": 0.42501127310987524, + "flos": 23924717815680.0, + "grad_norm": 1.6715462345278114, + "language_loss": 0.72081208, + "learning_rate": 2.574615138284361e-06, + "loss": 0.74244285, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14654541, + "step": 7069, + "time_per_iteration": 2.649066925048828 + }, + { + "auxiliary_loss_clip": 0.01130521, + "auxiliary_loss_mlp": 0.0103404, + "balance_loss_clip": 1.04731762, + "balance_loss_mlp": 1.01983571, + "epoch": 0.4250713963625432, + "flos": 23748006841920.0, + "grad_norm": 1.9274347041610678, + "language_loss": 0.79332101, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.81496656, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14208984, + "step": 7070, + "time_per_iteration": 2.636030673980713 + }, + { + "auxiliary_loss_clip": 0.01128394, + "auxiliary_loss_mlp": 0.0103362, + "balance_loss_clip": 1.04560447, + "balance_loss_mlp": 1.01976788, + "epoch": 0.4251315196152112, + "flos": 30917489538240.0, + "grad_norm": 2.5871252097881596, + "language_loss": 0.7025283, + "learning_rate": 2.573869012032795e-06, + "loss": 0.72414845, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.13842773, + "step": 7071, + "time_per_iteration": 2.7206931114196777 + }, + { + "auxiliary_loss_clip": 0.01126015, + "auxiliary_loss_mlp": 0.01031995, + "balance_loss_clip": 1.04443383, + "balance_loss_mlp": 1.01867354, + "epoch": 0.42519164286787914, + "flos": 32163987915360.0, + "grad_norm": 3.9880186127064063, + "language_loss": 0.71253765, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73411775, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13330078, + "step": 7072, + "time_per_iteration": 2.781445264816284 + }, + { + "auxiliary_loss_clip": 0.01130233, + "auxiliary_loss_mlp": 0.0103405, + "balance_loss_clip": 1.04621387, + "balance_loss_mlp": 1.0211457, + "epoch": 0.4252517661205471, + "flos": 31763572273440.0, + "grad_norm": 1.6306331209816756, + "language_loss": 0.81427956, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.83592242, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.12908936, + "step": 7073, + "time_per_iteration": 2.7212841510772705 + }, + { + "auxiliary_loss_clip": 0.01125465, + "auxiliary_loss_mlp": 0.0102823, + "balance_loss_clip": 1.04561758, + "balance_loss_mlp": 1.01569486, + "epoch": 0.42531188937321507, + "flos": 15512626401120.0, + "grad_norm": 3.038600579705482, + "language_loss": 0.91306043, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.93459743, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12536621, + "step": 7074, + "time_per_iteration": 2.6086277961730957 + }, + { + "auxiliary_loss_clip": 0.01131187, + "auxiliary_loss_mlp": 0.01034066, + "balance_loss_clip": 1.04601789, + "balance_loss_mlp": 1.01917672, + "epoch": 0.42537201262588303, + "flos": 26956315602720.0, + "grad_norm": 1.8558461585344983, + "language_loss": 0.64166576, + "learning_rate": 2.572376498508805e-06, + "loss": 0.66331828, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 0.8515625, + "router_z_loss_mlp": 0.14874268, + "step": 7075, + "time_per_iteration": 2.6615445613861084 + }, + { + "auxiliary_loss_clip": 0.01121349, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.04371881, + "balance_loss_mlp": 1.01767349, + "epoch": 0.42543213587855105, + "flos": 28068599282400.0, + "grad_norm": 1.8554911358922432, + "language_loss": 0.73976517, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.76127917, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.1237793, + "step": 7076, + "time_per_iteration": 2.683396100997925 + }, + { + "auxiliary_loss_clip": 0.01128697, + "auxiliary_loss_mlp": 0.01040176, + "balance_loss_clip": 1.04507113, + "balance_loss_mlp": 1.02599633, + "epoch": 0.425492259131219, + "flos": 30605996625120.0, + "grad_norm": 6.446184648109934, + "language_loss": 0.78740609, + "learning_rate": 2.571630111462766e-06, + "loss": 0.80909479, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.14172363, + "step": 7077, + "time_per_iteration": 2.731677532196045 + }, + { + "auxiliary_loss_clip": 0.01123633, + "auxiliary_loss_mlp": 0.01029716, + "balance_loss_clip": 1.04567873, + "balance_loss_mlp": 1.01800382, + "epoch": 0.425552382383887, + "flos": 27841653231840.0, + "grad_norm": 1.6056072496280729, + "language_loss": 0.72747457, + "learning_rate": 2.571256885418265e-06, + "loss": 0.74900806, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.1171875, + "step": 7078, + "time_per_iteration": 5.457165002822876 + }, + { + "auxiliary_loss_clip": 0.01124932, + "auxiliary_loss_mlp": 0.01038295, + "balance_loss_clip": 1.04616773, + "balance_loss_mlp": 1.02574229, + "epoch": 0.42561250563655495, + "flos": 16537527008640.0, + "grad_norm": 2.2509023664146772, + "language_loss": 0.79680848, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.81844074, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12561035, + "step": 7079, + "time_per_iteration": 2.600496530532837 + }, + { + "auxiliary_loss_clip": 0.01126887, + "auxiliary_loss_mlp": 0.01032466, + "balance_loss_clip": 1.04719543, + "balance_loss_mlp": 1.02018166, + "epoch": 0.4256726288892229, + "flos": 57324588580800.0, + "grad_norm": 1.4646470286844355, + "language_loss": 0.71892691, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.74052042, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.1229248, + "step": 7080, + "time_per_iteration": 2.9098215103149414 + }, + { + "auxiliary_loss_clip": 0.01124532, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.04380107, + "balance_loss_mlp": 1.01898718, + "epoch": 0.4257327521418909, + "flos": 28780467320160.0, + "grad_norm": 3.2008022684954405, + "language_loss": 0.80446798, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.826033, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12982178, + "step": 7081, + "time_per_iteration": 2.687238931655884 + }, + { + "auxiliary_loss_clip": 0.01120784, + "auxiliary_loss_mlp": 0.01028982, + "balance_loss_clip": 1.04375458, + "balance_loss_mlp": 1.01673925, + "epoch": 0.42579287539455885, + "flos": 23171447468160.0, + "grad_norm": 2.243962951322105, + "language_loss": 0.81616163, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.83765936, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12249756, + "step": 7082, + "time_per_iteration": 2.7139062881469727 + }, + { + "auxiliary_loss_clip": 0.0112605, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.04498959, + "balance_loss_mlp": 1.0222702, + "epoch": 0.4258529986472268, + "flos": 30739563046080.0, + "grad_norm": 2.4382952298963936, + "language_loss": 0.69907892, + "learning_rate": 2.569390430547065e-06, + "loss": 0.72069329, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13122559, + "step": 7083, + "time_per_iteration": 2.8132007122039795 + }, + { + "auxiliary_loss_clip": 0.01043138, + "auxiliary_loss_mlp": 0.01001051, + "balance_loss_clip": 1.0175941, + "balance_loss_mlp": 0.9997108, + "epoch": 0.4259131218998948, + "flos": 85375001954880.0, + "grad_norm": 0.8801129864453285, + "language_loss": 0.67089391, + "learning_rate": 2.569017074742173e-06, + "loss": 0.6913358, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 0.25561523, + "router_z_loss_mlp": 0.01339722, + "step": 7084, + "time_per_iteration": 3.396878719329834 + }, + { + "auxiliary_loss_clip": 0.01124792, + "auxiliary_loss_mlp": 0.01041053, + "balance_loss_clip": 1.045017, + "balance_loss_mlp": 1.02740359, + "epoch": 0.42597324515256274, + "flos": 21969430714080.0, + "grad_norm": 2.3106563544411785, + "language_loss": 0.78497577, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.80663419, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.1362915, + "step": 7085, + "time_per_iteration": 4.059384107589722 + }, + { + "auxiliary_loss_clip": 0.01131549, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_clip": 1.04705358, + "balance_loss_mlp": 1.02896595, + "epoch": 0.4260333684052307, + "flos": 18496339113600.0, + "grad_norm": 2.8165857787190847, + "language_loss": 0.7632857, + "learning_rate": 2.568270298414995e-06, + "loss": 0.78503501, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.144104, + "step": 7086, + "time_per_iteration": 2.645545482635498 + }, + { + "auxiliary_loss_clip": 0.01125518, + "auxiliary_loss_mlp": 0.01035997, + "balance_loss_clip": 1.044541, + "balance_loss_mlp": 1.02238929, + "epoch": 0.42609349165789867, + "flos": 18228598512480.0, + "grad_norm": 1.9215869902533318, + "language_loss": 0.79956532, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82118046, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13598633, + "step": 7087, + "time_per_iteration": 2.6602377891540527 + }, + { + "auxiliary_loss_clip": 0.01127424, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.04539168, + "balance_loss_mlp": 1.0178628, + "epoch": 0.42615361491056664, + "flos": 28958069674080.0, + "grad_norm": 1.5589030377975108, + "language_loss": 0.66158074, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.68317121, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13763428, + "step": 7088, + "time_per_iteration": 2.711181402206421 + }, + { + "auxiliary_loss_clip": 0.01127288, + "auxiliary_loss_mlp": 0.01035965, + "balance_loss_clip": 1.04488516, + "balance_loss_mlp": 1.02260184, + "epoch": 0.42621373816323466, + "flos": 30427827029280.0, + "grad_norm": 2.6006218480470045, + "language_loss": 0.68777937, + "learning_rate": 2.56714997234313e-06, + "loss": 0.70941192, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.13378906, + "step": 7089, + "time_per_iteration": 2.68866229057312 + }, + { + "auxiliary_loss_clip": 0.01126086, + "auxiliary_loss_mlp": 0.01035792, + "balance_loss_clip": 1.04252756, + "balance_loss_mlp": 1.02266729, + "epoch": 0.4262738614159026, + "flos": 16536838214880.0, + "grad_norm": 2.470720701539669, + "language_loss": 0.7360397, + "learning_rate": 2.566776487287525e-06, + "loss": 0.75765848, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13128662, + "step": 7090, + "time_per_iteration": 4.05888557434082 + }, + { + "auxiliary_loss_clip": 0.01128513, + "auxiliary_loss_mlp": 0.01041632, + "balance_loss_clip": 1.04349625, + "balance_loss_mlp": 1.02888894, + "epoch": 0.4263339846685706, + "flos": 36300576271680.0, + "grad_norm": 1.7986138077947171, + "language_loss": 0.75304443, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.77474594, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.12756348, + "step": 7091, + "time_per_iteration": 2.710179567337036 + }, + { + "auxiliary_loss_clip": 0.01120366, + "auxiliary_loss_mlp": 0.0103118, + "balance_loss_clip": 1.04322267, + "balance_loss_mlp": 1.01918173, + "epoch": 0.42639410792123855, + "flos": 20541075668640.0, + "grad_norm": 1.765371237563759, + "language_loss": 0.82544315, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84695864, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.11993408, + "step": 7092, + "time_per_iteration": 2.6635782718658447 + }, + { + "auxiliary_loss_clip": 0.01131978, + "auxiliary_loss_mlp": 0.01038319, + "balance_loss_clip": 1.04621077, + "balance_loss_mlp": 1.02535546, + "epoch": 0.4264542311739065, + "flos": 35097830206560.0, + "grad_norm": 1.680316648508108, + "language_loss": 0.73868698, + "learning_rate": 2.565655903224038e-06, + "loss": 0.76038992, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 0.85791016, + "router_z_loss_mlp": 0.1295166, + "step": 7093, + "time_per_iteration": 2.7238175868988037 + }, + { + "auxiliary_loss_clip": 0.01124688, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.04342318, + "balance_loss_mlp": 1.01931596, + "epoch": 0.4265143544265745, + "flos": 30156277803840.0, + "grad_norm": 2.2511128690779585, + "language_loss": 0.69981706, + "learning_rate": 2.565282332284532e-06, + "loss": 0.72139049, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13323975, + "step": 7094, + "time_per_iteration": 2.6654322147369385 + }, + { + "auxiliary_loss_clip": 0.01127045, + "auxiliary_loss_mlp": 0.01037177, + "balance_loss_clip": 1.04436517, + "balance_loss_mlp": 1.02334237, + "epoch": 0.42657447767924245, + "flos": 26681524994880.0, + "grad_norm": 1.8656335725546225, + "language_loss": 0.81579697, + "learning_rate": 2.564908739909464e-06, + "loss": 0.83743918, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13824463, + "step": 7095, + "time_per_iteration": 2.6663477420806885 + }, + { + "auxiliary_loss_clip": 0.01126551, + "auxiliary_loss_mlp": 0.01039594, + "balance_loss_clip": 1.04432893, + "balance_loss_mlp": 1.02674961, + "epoch": 0.4266346009319104, + "flos": 26198102147040.0, + "grad_norm": 1.8076836384225048, + "language_loss": 0.80638313, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.82804459, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.128479, + "step": 7096, + "time_per_iteration": 2.630079746246338 + }, + { + "auxiliary_loss_clip": 0.01129481, + "auxiliary_loss_mlp": 0.01035963, + "balance_loss_clip": 1.0453546, + "balance_loss_mlp": 1.02243853, + "epoch": 0.4266947241845784, + "flos": 31139249376960.0, + "grad_norm": 2.452695640122381, + "language_loss": 0.65748847, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.67914295, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.13537598, + "step": 7097, + "time_per_iteration": 2.6896159648895264 + }, + { + "auxiliary_loss_clip": 0.01121857, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.04212737, + "balance_loss_mlp": 1.01840281, + "epoch": 0.42675484743724634, + "flos": 32386152926880.0, + "grad_norm": 1.6818247524447245, + "language_loss": 0.74290258, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76443744, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.13226318, + "step": 7098, + "time_per_iteration": 2.6681301593780518 + }, + { + "auxiliary_loss_clip": 0.01124429, + "auxiliary_loss_mlp": 0.01030281, + "balance_loss_clip": 1.04424417, + "balance_loss_mlp": 1.01764441, + "epoch": 0.4268149706899143, + "flos": 28202327772480.0, + "grad_norm": 1.947610456637254, + "language_loss": 0.74576825, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.76731527, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12646484, + "step": 7099, + "time_per_iteration": 2.667776584625244 + }, + { + "auxiliary_loss_clip": 0.01127444, + "auxiliary_loss_mlp": 0.01037639, + "balance_loss_clip": 1.04521859, + "balance_loss_mlp": 1.02463281, + "epoch": 0.4268750939425823, + "flos": 27707114396160.0, + "grad_norm": 2.11897389119998, + "language_loss": 0.83182037, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.85347122, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13018799, + "step": 7100, + "time_per_iteration": 2.6327426433563232 + }, + { + "auxiliary_loss_clip": 0.01125515, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.04325235, + "balance_loss_mlp": 1.0182786, + "epoch": 0.42693521719525024, + "flos": 30962781506880.0, + "grad_norm": 1.3896892751962608, + "language_loss": 0.82177734, + "learning_rate": 2.562666736305627e-06, + "loss": 0.84334302, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.12774658, + "step": 7101, + "time_per_iteration": 2.8418493270874023 + }, + { + "auxiliary_loss_clip": 0.01130541, + "auxiliary_loss_mlp": 0.0103271, + "balance_loss_clip": 1.0464251, + "balance_loss_mlp": 1.01913834, + "epoch": 0.42699534044791826, + "flos": 22147276171680.0, + "grad_norm": 2.386302345157594, + "language_loss": 0.72709233, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.74872482, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13574219, + "step": 7102, + "time_per_iteration": 2.6031241416931152 + }, + { + "auxiliary_loss_clip": 0.0112235, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.0449456, + "balance_loss_mlp": 1.01958513, + "epoch": 0.4270554637005862, + "flos": 16714359534240.0, + "grad_norm": 1.8699666259805179, + "language_loss": 0.82913578, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.85068208, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12695312, + "step": 7103, + "time_per_iteration": 2.700131416320801 + }, + { + "auxiliary_loss_clip": 0.01127752, + "auxiliary_loss_mlp": 0.01030834, + "balance_loss_clip": 1.04453266, + "balance_loss_mlp": 1.01733947, + "epoch": 0.4271155869532542, + "flos": 21345472473120.0, + "grad_norm": 2.0607091969811435, + "language_loss": 0.7343545, + "learning_rate": 2.561545446271294e-06, + "loss": 0.75594032, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.1348877, + "step": 7104, + "time_per_iteration": 2.629518985748291 + }, + { + "auxiliary_loss_clip": 0.01126016, + "auxiliary_loss_mlp": 0.01027261, + "balance_loss_clip": 1.04477918, + "balance_loss_mlp": 1.01479733, + "epoch": 0.42717571020592215, + "flos": 39599266383360.0, + "grad_norm": 2.182114572484396, + "language_loss": 0.76254666, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.78407943, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12463379, + "step": 7105, + "time_per_iteration": 2.7946579456329346 + }, + { + "auxiliary_loss_clip": 0.01129252, + "auxiliary_loss_mlp": 0.01032523, + "balance_loss_clip": 1.04723001, + "balance_loss_mlp": 1.02014947, + "epoch": 0.4272358334585901, + "flos": 19831679184960.0, + "grad_norm": 2.4310507961166503, + "language_loss": 0.76703787, + "learning_rate": 2.560797813088819e-06, + "loss": 0.78865564, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.1237793, + "step": 7106, + "time_per_iteration": 2.695535898208618 + }, + { + "auxiliary_loss_clip": 0.01125475, + "auxiliary_loss_mlp": 0.01032752, + "balance_loss_clip": 1.04453444, + "balance_loss_mlp": 1.0202353, + "epoch": 0.4272959567112581, + "flos": 29529888526080.0, + "grad_norm": 1.9321194351782012, + "language_loss": 0.79707861, + "learning_rate": 2.560423964592229e-06, + "loss": 0.81866097, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.12518311, + "step": 7107, + "time_per_iteration": 2.668100357055664 + }, + { + "auxiliary_loss_clip": 0.01125924, + "auxiliary_loss_mlp": 0.01032668, + "balance_loss_clip": 1.04646742, + "balance_loss_mlp": 1.02020454, + "epoch": 0.42735607996392605, + "flos": 34121787088320.0, + "grad_norm": 1.4065166565150364, + "language_loss": 0.68121672, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70280254, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12475586, + "step": 7108, + "time_per_iteration": 2.7243804931640625 + }, + { + "auxiliary_loss_clip": 0.0112628, + "auxiliary_loss_mlp": 0.01035646, + "balance_loss_clip": 1.0458976, + "balance_loss_mlp": 1.02306926, + "epoch": 0.427416203216594, + "flos": 24765371235360.0, + "grad_norm": 1.7734759647721463, + "language_loss": 0.71300364, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.7346229, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12579346, + "step": 7109, + "time_per_iteration": 2.6444621086120605 + }, + { + "auxiliary_loss_clip": 0.01127807, + "auxiliary_loss_mlp": 0.01032198, + "balance_loss_clip": 1.04585874, + "balance_loss_mlp": 1.01786911, + "epoch": 0.427476326469262, + "flos": 32877476644320.0, + "grad_norm": 1.9366272213603841, + "language_loss": 0.64393318, + "learning_rate": 2.559302291651174e-06, + "loss": 0.66553319, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.14318848, + "step": 7110, + "time_per_iteration": 2.7433741092681885 + }, + { + "auxiliary_loss_clip": 0.01128705, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.04667664, + "balance_loss_mlp": 1.021456, + "epoch": 0.42753644972192995, + "flos": 31363035079680.0, + "grad_norm": 1.8465028758761948, + "language_loss": 0.76455033, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.7861954, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.14337158, + "step": 7111, + "time_per_iteration": 2.677231788635254 + }, + { + "auxiliary_loss_clip": 0.01129975, + "auxiliary_loss_mlp": 0.01030095, + "balance_loss_clip": 1.04735422, + "balance_loss_mlp": 1.01707172, + "epoch": 0.4275965729745979, + "flos": 22900181863680.0, + "grad_norm": 1.9386168526394392, + "language_loss": 0.73004019, + "learning_rate": 2.558554403622845e-06, + "loss": 0.75164092, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13024902, + "step": 7112, + "time_per_iteration": 2.6846911907196045 + }, + { + "auxiliary_loss_clip": 0.01125049, + "auxiliary_loss_mlp": 0.01033933, + "balance_loss_clip": 1.04597998, + "balance_loss_mlp": 1.02140987, + "epoch": 0.4276566962272659, + "flos": 28998053879040.0, + "grad_norm": 1.5896935989173153, + "language_loss": 0.71548176, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.73707151, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12518311, + "step": 7113, + "time_per_iteration": 2.6589787006378174 + }, + { + "auxiliary_loss_clip": 0.01130283, + "auxiliary_loss_mlp": 0.01041026, + "balance_loss_clip": 1.04764211, + "balance_loss_mlp": 1.0274719, + "epoch": 0.42771681947993384, + "flos": 27445491904320.0, + "grad_norm": 1.740940654703152, + "language_loss": 0.61599159, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.63770473, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13555908, + "step": 7114, + "time_per_iteration": 2.687166690826416 + }, + { + "auxiliary_loss_clip": 0.01133326, + "auxiliary_loss_mlp": 0.0104022, + "balance_loss_clip": 1.04748082, + "balance_loss_mlp": 1.02533054, + "epoch": 0.42777694273260186, + "flos": 30560056380000.0, + "grad_norm": 1.8215927602097168, + "language_loss": 0.64735734, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.66909283, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.14898682, + "step": 7115, + "time_per_iteration": 2.704089641571045 + }, + { + "auxiliary_loss_clip": 0.01126474, + "auxiliary_loss_mlp": 0.01034286, + "balance_loss_clip": 1.04518831, + "balance_loss_mlp": 1.02178669, + "epoch": 0.4278370659852698, + "flos": 22770626653440.0, + "grad_norm": 1.7206808822752389, + "language_loss": 0.73991144, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.76151896, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.125, + "step": 7116, + "time_per_iteration": 2.6587259769439697 + }, + { + "auxiliary_loss_clip": 0.01123663, + "auxiliary_loss_mlp": 0.01038388, + "balance_loss_clip": 1.04468524, + "balance_loss_mlp": 1.02658033, + "epoch": 0.4278971892379378, + "flos": 33322090288320.0, + "grad_norm": 1.8783472944376027, + "language_loss": 0.69390368, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.7155242, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.11804199, + "step": 7117, + "time_per_iteration": 4.210463762283325 + }, + { + "auxiliary_loss_clip": 0.0112887, + "auxiliary_loss_mlp": 0.01034536, + "balance_loss_clip": 1.04813802, + "balance_loss_mlp": 1.02173305, + "epoch": 0.42795731249060576, + "flos": 15726931060320.0, + "grad_norm": 2.6759349078115187, + "language_loss": 0.70071602, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.72235, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12805176, + "step": 7118, + "time_per_iteration": 4.105860948562622 + }, + { + "auxiliary_loss_clip": 0.01128692, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.04799175, + "balance_loss_mlp": 1.02796543, + "epoch": 0.4280174357432737, + "flos": 40757449790880.0, + "grad_norm": 1.9539524269527668, + "language_loss": 0.74838066, + "learning_rate": 2.55593612908444e-06, + "loss": 0.77008426, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13720703, + "step": 7119, + "time_per_iteration": 2.732276678085327 + }, + { + "auxiliary_loss_clip": 0.01126574, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.04634857, + "balance_loss_mlp": 1.01885843, + "epoch": 0.4280775589959417, + "flos": 22279667591520.0, + "grad_norm": 1.9585779368584415, + "language_loss": 0.74645126, + "learning_rate": 2.555562005426573e-06, + "loss": 0.76803273, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12719727, + "step": 7120, + "time_per_iteration": 2.651488780975342 + }, + { + "auxiliary_loss_clip": 0.01127983, + "auxiliary_loss_mlp": 0.01036379, + "balance_loss_clip": 1.04743266, + "balance_loss_mlp": 1.02410054, + "epoch": 0.42813768224860965, + "flos": 26199277148160.0, + "grad_norm": 1.7783354028479375, + "language_loss": 0.76939487, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.79103845, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.12286377, + "step": 7121, + "time_per_iteration": 2.6363320350646973 + }, + { + "auxiliary_loss_clip": 0.0112523, + "auxiliary_loss_mlp": 0.01035456, + "balance_loss_clip": 1.04576659, + "balance_loss_mlp": 1.02329612, + "epoch": 0.4281978055012776, + "flos": 19119284422560.0, + "grad_norm": 1.7261081179090731, + "language_loss": 0.85644865, + "learning_rate": 2.554813694924126e-06, + "loss": 0.87805545, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.1217041, + "step": 7122, + "time_per_iteration": 2.6582417488098145 + }, + { + "auxiliary_loss_clip": 0.01123916, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.04490685, + "balance_loss_mlp": 1.01917243, + "epoch": 0.4282579287539456, + "flos": 21612038073120.0, + "grad_norm": 2.0799087179288565, + "language_loss": 0.81179827, + "learning_rate": 2.554439508107921e-06, + "loss": 0.8333568, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12756348, + "step": 7123, + "time_per_iteration": 2.6270997524261475 + }, + { + "auxiliary_loss_clip": 0.01126605, + "auxiliary_loss_mlp": 0.0103475, + "balance_loss_clip": 1.04733407, + "balance_loss_mlp": 1.02138042, + "epoch": 0.42831805200661355, + "flos": 23526814245120.0, + "grad_norm": 1.6196072134588266, + "language_loss": 0.80931294, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83092654, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13372803, + "step": 7124, + "time_per_iteration": 4.042130708694458 + }, + { + "auxiliary_loss_clip": 0.01124698, + "auxiliary_loss_mlp": 0.01037811, + "balance_loss_clip": 1.044963, + "balance_loss_mlp": 1.02412629, + "epoch": 0.4283781752592815, + "flos": 24150894037920.0, + "grad_norm": 1.8173691455439593, + "language_loss": 0.80807412, + "learning_rate": 2.553691071416498e-06, + "loss": 0.82969928, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13690186, + "step": 7125, + "time_per_iteration": 2.6852502822875977 + }, + { + "auxiliary_loss_clip": 0.01125458, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.04682326, + "balance_loss_mlp": 1.02135634, + "epoch": 0.4284382985119495, + "flos": 20142888477120.0, + "grad_norm": 1.9912750412301994, + "language_loss": 0.74855065, + "learning_rate": 2.553316821569659e-06, + "loss": 0.77013886, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12023926, + "step": 7126, + "time_per_iteration": 2.6186256408691406 + }, + { + "auxiliary_loss_clip": 0.01127277, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.04631376, + "balance_loss_mlp": 1.01920891, + "epoch": 0.42849842176461744, + "flos": 28468488199680.0, + "grad_norm": 2.874582706289201, + "language_loss": 0.814031, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.83562863, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.1328125, + "step": 7127, + "time_per_iteration": 2.6846506595611572 + }, + { + "auxiliary_loss_clip": 0.01125979, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.04550624, + "balance_loss_mlp": 1.02289104, + "epoch": 0.4285585450172854, + "flos": 21078056010240.0, + "grad_norm": 1.9014516205492658, + "language_loss": 0.76424527, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.78586137, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.12744141, + "step": 7128, + "time_per_iteration": 2.633281946182251 + }, + { + "auxiliary_loss_clip": 0.01127939, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.04625976, + "balance_loss_mlp": 1.02265501, + "epoch": 0.42861866826995343, + "flos": 29626306951680.0, + "grad_norm": 1.945417735341725, + "language_loss": 0.74258363, + "learning_rate": 2.552193946194937e-06, + "loss": 0.76422191, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13250732, + "step": 7129, + "time_per_iteration": 4.013166427612305 + }, + { + "auxiliary_loss_clip": 0.01128963, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.04841912, + "balance_loss_mlp": 1.01886344, + "epoch": 0.4286787915226214, + "flos": 29715958991520.0, + "grad_norm": 1.6895936459896834, + "language_loss": 0.77685553, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.79846156, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12768555, + "step": 7130, + "time_per_iteration": 2.731283664703369 + }, + { + "auxiliary_loss_clip": 0.01131875, + "auxiliary_loss_mlp": 0.01036898, + "balance_loss_clip": 1.04958725, + "balance_loss_mlp": 1.02372575, + "epoch": 0.42873891477528936, + "flos": 18851746407840.0, + "grad_norm": 2.1085028380062107, + "language_loss": 0.73136038, + "learning_rate": 2.551445257891886e-06, + "loss": 0.75304812, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.1317749, + "step": 7131, + "time_per_iteration": 2.6365954875946045 + }, + { + "auxiliary_loss_clip": 0.01130002, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.04816532, + "balance_loss_mlp": 1.019835, + "epoch": 0.4287990380279573, + "flos": 21523601551680.0, + "grad_norm": 2.4503730212474686, + "language_loss": 0.77519983, + "learning_rate": 2.551070882366973e-06, + "loss": 0.79683059, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13244629, + "step": 7132, + "time_per_iteration": 2.699824094772339 + }, + { + "auxiliary_loss_clip": 0.01127915, + "auxiliary_loss_mlp": 0.01035721, + "balance_loss_clip": 1.04614449, + "balance_loss_mlp": 1.02241099, + "epoch": 0.4288591612806253, + "flos": 33144001727040.0, + "grad_norm": 1.5537566019600269, + "language_loss": 0.78514791, + "learning_rate": 2.550696485945397e-06, + "loss": 0.80678427, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13299561, + "step": 7133, + "time_per_iteration": 2.6940181255340576 + }, + { + "auxiliary_loss_clip": 0.01130325, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.047979, + "balance_loss_mlp": 1.02328372, + "epoch": 0.42891928453329325, + "flos": 20942463725280.0, + "grad_norm": 1.898597055705124, + "language_loss": 0.75044996, + "learning_rate": 2.550322068641355e-06, + "loss": 0.77211273, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.12677002, + "step": 7134, + "time_per_iteration": 2.651601552963257 + }, + { + "auxiliary_loss_clip": 0.01123186, + "auxiliary_loss_mlp": 0.01032796, + "balance_loss_clip": 1.04435933, + "balance_loss_mlp": 1.02095854, + "epoch": 0.4289794077859612, + "flos": 22191636242880.0, + "grad_norm": 1.8525229518549473, + "language_loss": 0.83827055, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.85983038, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.1184082, + "step": 7135, + "time_per_iteration": 2.617295026779175 + }, + { + "auxiliary_loss_clip": 0.01121445, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.04293978, + "balance_loss_mlp": 1.01866257, + "epoch": 0.4290395310386292, + "flos": 34479665936640.0, + "grad_norm": 3.070106635504399, + "language_loss": 0.75094211, + "learning_rate": 2.549573171442666e-06, + "loss": 0.77246565, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12255859, + "step": 7136, + "time_per_iteration": 2.7785160541534424 + }, + { + "auxiliary_loss_clip": 0.01125104, + "auxiliary_loss_mlp": 0.0103614, + "balance_loss_clip": 1.04294372, + "balance_loss_mlp": 1.02335453, + "epoch": 0.42909965429129715, + "flos": 19786589802720.0, + "grad_norm": 2.3247988871405063, + "language_loss": 0.79116023, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.81277263, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.12805176, + "step": 7137, + "time_per_iteration": 2.6346116065979004 + }, + { + "auxiliary_loss_clip": 0.01128907, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.04664564, + "balance_loss_mlp": 1.01921558, + "epoch": 0.4291597775439651, + "flos": 28202489841600.0, + "grad_norm": 2.3105874952622827, + "language_loss": 0.76515937, + "learning_rate": 2.548824190884499e-06, + "loss": 0.7867713, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13067627, + "step": 7138, + "time_per_iteration": 2.74404239654541 + }, + { + "auxiliary_loss_clip": 0.01044183, + "auxiliary_loss_mlp": 0.0100337, + "balance_loss_clip": 1.01871884, + "balance_loss_mlp": 1.00217748, + "epoch": 0.4292199007966331, + "flos": 82420698437280.0, + "grad_norm": 0.7917431062549694, + "language_loss": 0.56218743, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58266306, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 0.25439453, + "router_z_loss_mlp": 0.01190948, + "step": 7139, + "time_per_iteration": 3.1290853023529053 + }, + { + "auxiliary_loss_clip": 0.01122065, + "auxiliary_loss_mlp": 0.01041638, + "balance_loss_clip": 1.04465652, + "balance_loss_mlp": 1.03031933, + "epoch": 0.42928002404930105, + "flos": 28064912209920.0, + "grad_norm": 1.6614801859725465, + "language_loss": 0.80843425, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.83007133, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.11328125, + "step": 7140, + "time_per_iteration": 2.7722432613372803 + }, + { + "auxiliary_loss_clip": 0.01125093, + "auxiliary_loss_mlp": 0.01031093, + "balance_loss_clip": 1.04406214, + "balance_loss_mlp": 1.01789641, + "epoch": 0.429340147301969, + "flos": 14086580840640.0, + "grad_norm": 1.8309088259944672, + "language_loss": 0.81740856, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.83897042, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13201904, + "step": 7141, + "time_per_iteration": 2.7811880111694336 + }, + { + "auxiliary_loss_clip": 0.01130166, + "auxiliary_loss_mlp": 0.01034346, + "balance_loss_clip": 1.04661822, + "balance_loss_mlp": 1.02124465, + "epoch": 0.42940027055463703, + "flos": 30828728878560.0, + "grad_norm": 1.6679826687013155, + "language_loss": 0.86400712, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88565224, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.13092041, + "step": 7142, + "time_per_iteration": 2.6767642498016357 + }, + { + "auxiliary_loss_clip": 0.01125267, + "auxiliary_loss_mlp": 0.01031924, + "balance_loss_clip": 1.04816258, + "balance_loss_mlp": 1.01921654, + "epoch": 0.429460393807305, + "flos": 29047194989280.0, + "grad_norm": 2.2050101587188253, + "language_loss": 0.78575885, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.80733079, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12713623, + "step": 7143, + "time_per_iteration": 2.7296431064605713 + }, + { + "auxiliary_loss_clip": 0.01125085, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.04577971, + "balance_loss_mlp": 1.02298236, + "epoch": 0.42952051705997296, + "flos": 16982586342720.0, + "grad_norm": 1.8858076343824688, + "language_loss": 0.77155483, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.79315931, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.1237793, + "step": 7144, + "time_per_iteration": 2.6001265048980713 + }, + { + "auxiliary_loss_clip": 0.01124963, + "auxiliary_loss_mlp": 0.01029973, + "balance_loss_clip": 1.04455638, + "balance_loss_mlp": 1.01719427, + "epoch": 0.4295806403126409, + "flos": 32653569389760.0, + "grad_norm": 2.148519775491064, + "language_loss": 0.73455083, + "learning_rate": 2.54620210411532e-06, + "loss": 0.75610018, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12774658, + "step": 7145, + "time_per_iteration": 2.730762243270874 + }, + { + "auxiliary_loss_clip": 0.01127752, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.04536438, + "balance_loss_mlp": 1.01749182, + "epoch": 0.4296407635653089, + "flos": 25567093899360.0, + "grad_norm": 2.321892800058301, + "language_loss": 0.7903862, + "learning_rate": 2.545827437329352e-06, + "loss": 0.81196886, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13024902, + "step": 7146, + "time_per_iteration": 2.6287264823913574 + }, + { + "auxiliary_loss_clip": 0.01122074, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.04427147, + "balance_loss_mlp": 1.01748013, + "epoch": 0.42970088681797686, + "flos": 19342624435200.0, + "grad_norm": 2.3654871681364846, + "language_loss": 0.829355, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.85086882, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.1182251, + "step": 7147, + "time_per_iteration": 2.6973068714141846 + }, + { + "auxiliary_loss_clip": 0.0112902, + "auxiliary_loss_mlp": 0.01039062, + "balance_loss_clip": 1.04824352, + "balance_loss_mlp": 1.02494788, + "epoch": 0.4297610100706448, + "flos": 27311439276000.0, + "grad_norm": 2.233009265289916, + "language_loss": 0.87012088, + "learning_rate": 2.545078041678131e-06, + "loss": 0.89180171, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.14111328, + "step": 7148, + "time_per_iteration": 2.6284618377685547 + }, + { + "auxiliary_loss_clip": 0.01126137, + "auxiliary_loss_mlp": 0.01031346, + "balance_loss_clip": 1.04606414, + "balance_loss_mlp": 1.01883471, + "epoch": 0.4298211333233128, + "flos": 34075765808640.0, + "grad_norm": 1.633898203313379, + "language_loss": 0.77977705, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.80135185, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.12518311, + "step": 7149, + "time_per_iteration": 2.699876070022583 + }, + { + "auxiliary_loss_clip": 0.01122842, + "auxiliary_loss_mlp": 0.01033057, + "balance_loss_clip": 1.04446793, + "balance_loss_mlp": 1.02067113, + "epoch": 0.42988125657598075, + "flos": 29804476547520.0, + "grad_norm": 2.564941568491051, + "language_loss": 0.79738557, + "learning_rate": 2.544328563349256e-06, + "loss": 0.81894457, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12384033, + "step": 7150, + "time_per_iteration": 2.6661229133605957 + }, + { + "auxiliary_loss_clip": 0.01130555, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.04699934, + "balance_loss_mlp": 1.02387393, + "epoch": 0.4299413798286487, + "flos": 19340193398400.0, + "grad_norm": 1.6713488798089065, + "language_loss": 0.75021112, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.77189863, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.14343262, + "step": 7151, + "time_per_iteration": 2.6647398471832275 + }, + { + "auxiliary_loss_clip": 0.01128511, + "auxiliary_loss_mlp": 0.01032761, + "balance_loss_clip": 1.04589128, + "balance_loss_mlp": 1.01941001, + "epoch": 0.4300015030813167, + "flos": 27222232926240.0, + "grad_norm": 3.0266796531485665, + "language_loss": 0.70976865, + "learning_rate": 2.543579002456406e-06, + "loss": 0.73138136, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13336182, + "step": 7152, + "time_per_iteration": 2.6830718517303467 + }, + { + "auxiliary_loss_clip": 0.01122798, + "auxiliary_loss_mlp": 0.01033136, + "balance_loss_clip": 1.04229093, + "balance_loss_mlp": 1.02047014, + "epoch": 0.43006162633398465, + "flos": 42583222199520.0, + "grad_norm": 1.6916683560415442, + "language_loss": 0.70878798, + "learning_rate": 2.54320419108402e-06, + "loss": 0.73034734, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12677002, + "step": 7153, + "time_per_iteration": 2.811256170272827 + }, + { + "auxiliary_loss_clip": 0.0112746, + "auxiliary_loss_mlp": 0.01033145, + "balance_loss_clip": 1.04515839, + "balance_loss_mlp": 1.02031839, + "epoch": 0.4301217495866526, + "flos": 19475947752480.0, + "grad_norm": 2.3757920300610027, + "language_loss": 0.78002858, + "learning_rate": 2.542829359113276e-06, + "loss": 0.80163467, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.12841797, + "step": 7154, + "time_per_iteration": 2.692765951156616 + }, + { + "auxiliary_loss_clip": 0.01122475, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.04380751, + "balance_loss_mlp": 1.01982045, + "epoch": 0.43018187283932063, + "flos": 22903504280640.0, + "grad_norm": 1.6203070322354731, + "language_loss": 0.78780317, + "learning_rate": 2.542454506558389e-06, + "loss": 0.80935001, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12371826, + "step": 7155, + "time_per_iteration": 2.6412575244903564 + }, + { + "auxiliary_loss_clip": 0.01123123, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.0450666, + "balance_loss_mlp": 1.01887023, + "epoch": 0.4302419960919886, + "flos": 24589592159040.0, + "grad_norm": 1.9546252334476855, + "language_loss": 0.8902818, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.9118247, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12286377, + "step": 7156, + "time_per_iteration": 2.7115700244903564 + }, + { + "auxiliary_loss_clip": 0.01127306, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.04521275, + "balance_loss_mlp": 1.01850975, + "epoch": 0.43030211934465656, + "flos": 32253680472480.0, + "grad_norm": 1.8388081838094967, + "language_loss": 0.83017087, + "learning_rate": 2.541704739753042e-06, + "loss": 0.85176218, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.13323975, + "step": 7157, + "time_per_iteration": 5.611353397369385 + }, + { + "auxiliary_loss_clip": 0.01131999, + "auxiliary_loss_mlp": 0.01030686, + "balance_loss_clip": 1.04908776, + "balance_loss_mlp": 1.01774573, + "epoch": 0.43036224259732453, + "flos": 29760075959040.0, + "grad_norm": 1.8366494991002449, + "language_loss": 0.71755862, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.73918545, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.12963867, + "step": 7158, + "time_per_iteration": 2.675466537475586 + }, + { + "auxiliary_loss_clip": 0.01127063, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.04626942, + "balance_loss_mlp": 1.01856256, + "epoch": 0.4304223658499925, + "flos": 20989295350560.0, + "grad_norm": 1.7820540239019542, + "language_loss": 0.827595, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.84917974, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.12860107, + "step": 7159, + "time_per_iteration": 2.6954634189605713 + }, + { + "auxiliary_loss_clip": 0.01127321, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.04674029, + "balance_loss_mlp": 1.02031207, + "epoch": 0.43048248910266046, + "flos": 18184238441280.0, + "grad_norm": 2.8451395408575806, + "language_loss": 0.83250946, + "learning_rate": 2.54057993551933e-06, + "loss": 0.85411364, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.12786865, + "step": 7160, + "time_per_iteration": 2.615281105041504 + }, + { + "auxiliary_loss_clip": 0.01131053, + "auxiliary_loss_mlp": 0.01039597, + "balance_loss_clip": 1.04725444, + "balance_loss_mlp": 1.02494633, + "epoch": 0.4305426123553284, + "flos": 26331830637120.0, + "grad_norm": 2.1960556216315954, + "language_loss": 0.76659119, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.78829777, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.14648438, + "step": 7161, + "time_per_iteration": 2.702794313430786 + }, + { + "auxiliary_loss_clip": 0.01126249, + "auxiliary_loss_mlp": 0.01037019, + "balance_loss_clip": 1.04532111, + "balance_loss_mlp": 1.02439427, + "epoch": 0.4306027356079964, + "flos": 27578288496960.0, + "grad_norm": 3.15410723574156, + "language_loss": 0.72821915, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.74985188, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.1260376, + "step": 7162, + "time_per_iteration": 2.6261796951293945 + }, + { + "auxiliary_loss_clip": 0.01047604, + "auxiliary_loss_mlp": 0.01007456, + "balance_loss_clip": 1.02245784, + "balance_loss_mlp": 1.00623083, + "epoch": 0.43066285886066435, + "flos": 86236231802400.0, + "grad_norm": 0.7963343364195896, + "language_loss": 0.59039247, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.61094308, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 0.25170898, + "router_z_loss_mlp": 0.01223755, + "step": 7163, + "time_per_iteration": 4.673052549362183 + }, + { + "auxiliary_loss_clip": 0.01122235, + "auxiliary_loss_mlp": 0.01029034, + "balance_loss_clip": 1.04324865, + "balance_loss_mlp": 1.0175302, + "epoch": 0.4307229821133323, + "flos": 32605319659680.0, + "grad_norm": 1.9063990210765969, + "language_loss": 0.7942428, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81575549, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.11499023, + "step": 7164, + "time_per_iteration": 2.7075366973876953 + }, + { + "auxiliary_loss_clip": 0.01128118, + "auxiliary_loss_mlp": 0.01039192, + "balance_loss_clip": 1.04456902, + "balance_loss_mlp": 1.0263834, + "epoch": 0.4307831053660003, + "flos": 31941782386560.0, + "grad_norm": 1.9917962834538043, + "language_loss": 0.67697954, + "learning_rate": 2.538704852009177e-06, + "loss": 0.69865263, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.12817383, + "step": 7165, + "time_per_iteration": 2.682811737060547 + }, + { + "auxiliary_loss_clip": 0.01125887, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.04621077, + "balance_loss_mlp": 1.03459227, + "epoch": 0.43084322861866825, + "flos": 23074866973440.0, + "grad_norm": 2.2801474198538343, + "language_loss": 0.75002009, + "learning_rate": 2.538329773967034e-06, + "loss": 0.77174568, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12078857, + "step": 7166, + "time_per_iteration": 2.689145565032959 + }, + { + "auxiliary_loss_clip": 0.01125006, + "auxiliary_loss_mlp": 0.0103742, + "balance_loss_clip": 1.04617608, + "balance_loss_mlp": 1.02569604, + "epoch": 0.4309033518713362, + "flos": 32253883058880.0, + "grad_norm": 1.8930900317136108, + "language_loss": 0.71744716, + "learning_rate": 2.537954675511372e-06, + "loss": 0.73907149, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.1171875, + "step": 7167, + "time_per_iteration": 2.7173306941986084 + }, + { + "auxiliary_loss_clip": 0.01120936, + "auxiliary_loss_mlp": 0.01034376, + "balance_loss_clip": 1.04467785, + "balance_loss_mlp": 1.0225445, + "epoch": 0.43096347512400424, + "flos": 25884704921760.0, + "grad_norm": 1.5510503796020223, + "language_loss": 0.78475916, + "learning_rate": 2.537579556656414e-06, + "loss": 0.8063122, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11834717, + "step": 7168, + "time_per_iteration": 2.678133487701416 + }, + { + "auxiliary_loss_clip": 0.01127, + "auxiliary_loss_mlp": 0.01039149, + "balance_loss_clip": 1.04645908, + "balance_loss_mlp": 1.02687013, + "epoch": 0.4310235983766722, + "flos": 20182426992000.0, + "grad_norm": 2.1553245963901375, + "language_loss": 0.82583493, + "learning_rate": 2.537204417416387e-06, + "loss": 0.84749639, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.1227417, + "step": 7169, + "time_per_iteration": 3.8907861709594727 + }, + { + "auxiliary_loss_clip": 0.01045828, + "auxiliary_loss_mlp": 0.01001143, + "balance_loss_clip": 1.02052236, + "balance_loss_mlp": 0.99985462, + "epoch": 0.43108372162934017, + "flos": 79039689913440.0, + "grad_norm": 0.6728405159213325, + "language_loss": 0.60755444, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.6280241, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.01288605, + "step": 7170, + "time_per_iteration": 3.43021821975708 + }, + { + "auxiliary_loss_clip": 0.01126738, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.04686284, + "balance_loss_mlp": 1.01902008, + "epoch": 0.43114384488200813, + "flos": 16403920070400.0, + "grad_norm": 2.2618895151757017, + "language_loss": 0.75704455, + "learning_rate": 2.536454077838021e-06, + "loss": 0.77862048, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.1182251, + "step": 7171, + "time_per_iteration": 2.6315104961395264 + }, + { + "auxiliary_loss_clip": 0.01124739, + "auxiliary_loss_mlp": 0.0103066, + "balance_loss_clip": 1.04630983, + "balance_loss_mlp": 1.01866722, + "epoch": 0.4312039681346761, + "flos": 32075470359360.0, + "grad_norm": 1.5384397063599748, + "language_loss": 0.77734113, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.79889512, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.11987305, + "step": 7172, + "time_per_iteration": 2.7134859561920166 + }, + { + "auxiliary_loss_clip": 0.01126779, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.04531598, + "balance_loss_mlp": 1.02029276, + "epoch": 0.43126409138734406, + "flos": 24863572421280.0, + "grad_norm": 1.6738059702007315, + "language_loss": 0.76786804, + "learning_rate": 2.535703656890086e-06, + "loss": 0.78947842, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13970947, + "step": 7173, + "time_per_iteration": 2.6512608528137207 + }, + { + "auxiliary_loss_clip": 0.01124459, + "auxiliary_loss_mlp": 0.0102976, + "balance_loss_clip": 1.04584968, + "balance_loss_mlp": 1.01755881, + "epoch": 0.431324214640012, + "flos": 26995124806560.0, + "grad_norm": 1.473072833367343, + "language_loss": 0.76786834, + "learning_rate": 2.5353284159381e-06, + "loss": 0.78941053, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12194824, + "step": 7174, + "time_per_iteration": 2.729112148284912 + }, + { + "auxiliary_loss_clip": 0.01127323, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.04529452, + "balance_loss_mlp": 1.0197556, + "epoch": 0.43138433789268, + "flos": 18584532531360.0, + "grad_norm": 1.5677754130981771, + "language_loss": 0.82527602, + "learning_rate": 2.534953154686407e-06, + "loss": 0.84688461, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.13763428, + "step": 7175, + "time_per_iteration": 2.6849935054779053 + }, + { + "auxiliary_loss_clip": 0.01130249, + "auxiliary_loss_mlp": 0.01034949, + "balance_loss_clip": 1.04654932, + "balance_loss_mlp": 1.02153802, + "epoch": 0.43144446114534796, + "flos": 22146830481600.0, + "grad_norm": 2.6696418534953903, + "language_loss": 0.73813045, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.75978243, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13427734, + "step": 7176, + "time_per_iteration": 2.7550976276397705 + }, + { + "auxiliary_loss_clip": 0.01128384, + "auxiliary_loss_mlp": 0.01032583, + "balance_loss_clip": 1.04596305, + "balance_loss_mlp": 1.01944566, + "epoch": 0.4315045843980159, + "flos": 27979230863520.0, + "grad_norm": 1.6225728702448168, + "language_loss": 0.73339778, + "learning_rate": 2.534202571340819e-06, + "loss": 0.75500739, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.13140869, + "step": 7177, + "time_per_iteration": 2.655109167098999 + }, + { + "auxiliary_loss_clip": 0.0113252, + "auxiliary_loss_mlp": 0.01039225, + "balance_loss_clip": 1.04483974, + "balance_loss_mlp": 1.02440155, + "epoch": 0.4315647076506839, + "flos": 26999379120960.0, + "grad_norm": 2.0301228131678117, + "language_loss": 0.81551003, + "learning_rate": 2.533827249275387e-06, + "loss": 0.83722746, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.14837646, + "step": 7178, + "time_per_iteration": 2.7861673831939697 + }, + { + "auxiliary_loss_clip": 0.01122602, + "auxiliary_loss_mlp": 0.0103193, + "balance_loss_clip": 1.04669809, + "balance_loss_mlp": 1.01992583, + "epoch": 0.43162483090335185, + "flos": 32788594432800.0, + "grad_norm": 1.630578778704649, + "language_loss": 0.84085757, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.86240292, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12011719, + "step": 7179, + "time_per_iteration": 2.708282470703125 + }, + { + "auxiliary_loss_clip": 0.01125231, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.04526722, + "balance_loss_mlp": 1.0212599, + "epoch": 0.4316849541560198, + "flos": 16976751854400.0, + "grad_norm": 1.6674949471702072, + "language_loss": 0.75153381, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.77312386, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12524414, + "step": 7180, + "time_per_iteration": 2.6726436614990234 + }, + { + "auxiliary_loss_clip": 0.01126012, + "auxiliary_loss_mlp": 0.01033083, + "balance_loss_clip": 1.04467809, + "balance_loss_mlp": 1.0199641, + "epoch": 0.4317450774086878, + "flos": 20053925231040.0, + "grad_norm": 1.912648943182666, + "language_loss": 0.8189373, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.84052825, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13116455, + "step": 7181, + "time_per_iteration": 2.5874502658843994 + }, + { + "auxiliary_loss_clip": 0.01128756, + "auxiliary_loss_mlp": 0.01035889, + "balance_loss_clip": 1.04623318, + "balance_loss_mlp": 1.02244186, + "epoch": 0.4318052006613558, + "flos": 25081523635680.0, + "grad_norm": 1.9213283998155297, + "language_loss": 0.88853031, + "learning_rate": 2.532325758728165e-06, + "loss": 0.91017675, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.13458252, + "step": 7182, + "time_per_iteration": 2.6667919158935547 + }, + { + "auxiliary_loss_clip": 0.01125923, + "auxiliary_loss_mlp": 0.01033614, + "balance_loss_clip": 1.04691124, + "balance_loss_mlp": 1.02188349, + "epoch": 0.43186532391402377, + "flos": 27845259269760.0, + "grad_norm": 1.842632085971817, + "language_loss": 0.75563657, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.77723193, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.11730957, + "step": 7183, + "time_per_iteration": 2.6449248790740967 + }, + { + "auxiliary_loss_clip": 0.01126011, + "auxiliary_loss_mlp": 0.01030009, + "balance_loss_clip": 1.04552114, + "balance_loss_mlp": 1.01735544, + "epoch": 0.43192544716669173, + "flos": 31184581862880.0, + "grad_norm": 1.7351529720770484, + "language_loss": 0.77272451, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.7942847, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.12658691, + "step": 7184, + "time_per_iteration": 2.6849558353424072 + }, + { + "auxiliary_loss_clip": 0.01122466, + "auxiliary_loss_mlp": 0.01035559, + "balance_loss_clip": 1.04638505, + "balance_loss_mlp": 1.02335167, + "epoch": 0.4319855704193597, + "flos": 37771913800800.0, + "grad_norm": 1.7456132762096217, + "language_loss": 0.73140174, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75298196, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12207031, + "step": 7185, + "time_per_iteration": 2.808319091796875 + }, + { + "auxiliary_loss_clip": 0.01128493, + "auxiliary_loss_mlp": 0.01040107, + "balance_loss_clip": 1.04501903, + "balance_loss_mlp": 1.02671981, + "epoch": 0.43204569367202766, + "flos": 29576841703200.0, + "grad_norm": 2.4538051386524544, + "language_loss": 0.75357878, + "learning_rate": 2.530823945207421e-06, + "loss": 0.7752648, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.1338501, + "step": 7186, + "time_per_iteration": 2.680298328399658 + }, + { + "auxiliary_loss_clip": 0.01127575, + "auxiliary_loss_mlp": 0.01037758, + "balance_loss_clip": 1.04647636, + "balance_loss_mlp": 1.02494895, + "epoch": 0.43210581692469563, + "flos": 22459903568640.0, + "grad_norm": 2.365194407112426, + "language_loss": 0.76051086, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.78216422, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.12811279, + "step": 7187, + "time_per_iteration": 2.702970266342163 + }, + { + "auxiliary_loss_clip": 0.01046979, + "auxiliary_loss_mlp": 0.01010417, + "balance_loss_clip": 1.02168858, + "balance_loss_mlp": 1.00921726, + "epoch": 0.4321659401773636, + "flos": 60803237399040.0, + "grad_norm": 0.8589960185821907, + "language_loss": 0.68150949, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70208341, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 0.25292969, + "router_z_loss_mlp": 0.01199341, + "step": 7188, + "time_per_iteration": 3.271343469619751 + }, + { + "auxiliary_loss_clip": 0.01122462, + "auxiliary_loss_mlp": 0.01032699, + "balance_loss_clip": 1.04558468, + "balance_loss_mlp": 1.02047408, + "epoch": 0.43222606343003156, + "flos": 20900048483520.0, + "grad_norm": 2.4937975698922066, + "language_loss": 0.78681934, + "learning_rate": 2.529697373663614e-06, + "loss": 0.80837101, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12225342, + "step": 7189, + "time_per_iteration": 2.6878719329833984 + }, + { + "auxiliary_loss_clip": 0.01128245, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.04372287, + "balance_loss_mlp": 1.03047347, + "epoch": 0.4322861866826995, + "flos": 27759456371520.0, + "grad_norm": 1.739324734418148, + "language_loss": 0.7159884, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73770928, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.13378906, + "step": 7190, + "time_per_iteration": 2.67724871635437 + }, + { + "auxiliary_loss_clip": 0.01121361, + "auxiliary_loss_mlp": 0.01031251, + "balance_loss_clip": 1.04295826, + "balance_loss_mlp": 1.01918101, + "epoch": 0.4323463099353675, + "flos": 34034322981600.0, + "grad_norm": 1.924450037697317, + "language_loss": 0.79691148, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.81843758, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12060547, + "step": 7191, + "time_per_iteration": 2.791541576385498 + }, + { + "auxiliary_loss_clip": 0.01123387, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.04433036, + "balance_loss_mlp": 1.01986396, + "epoch": 0.43240643318803546, + "flos": 26374570017120.0, + "grad_norm": 1.557667578786596, + "language_loss": 0.7500453, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.77160323, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12530518, + "step": 7192, + "time_per_iteration": 2.6866931915283203 + }, + { + "auxiliary_loss_clip": 0.01126838, + "auxiliary_loss_mlp": 0.0103638, + "balance_loss_clip": 1.04689264, + "balance_loss_mlp": 1.02280784, + "epoch": 0.4324665564407034, + "flos": 21425278813920.0, + "grad_norm": 1.938457715221518, + "language_loss": 0.78645056, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.8080827, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.13574219, + "step": 7193, + "time_per_iteration": 2.6715667247772217 + }, + { + "auxiliary_loss_clip": 0.01124302, + "auxiliary_loss_mlp": 0.01040475, + "balance_loss_clip": 1.04381597, + "balance_loss_mlp": 1.02672958, + "epoch": 0.4325266796933714, + "flos": 22454798391360.0, + "grad_norm": 1.7980801548999088, + "language_loss": 0.75422859, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.7758764, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13745117, + "step": 7194, + "time_per_iteration": 2.6711442470550537 + }, + { + "auxiliary_loss_clip": 0.01125796, + "auxiliary_loss_mlp": 0.01033213, + "balance_loss_clip": 1.04597831, + "balance_loss_mlp": 1.02024317, + "epoch": 0.4325868029460394, + "flos": 27533766356640.0, + "grad_norm": 2.3131114773987718, + "language_loss": 0.59340817, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.61499828, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12963867, + "step": 7195, + "time_per_iteration": 2.657533884048462 + }, + { + "auxiliary_loss_clip": 0.01128837, + "auxiliary_loss_mlp": 0.010339, + "balance_loss_clip": 1.04599392, + "balance_loss_mlp": 1.01988137, + "epoch": 0.43264692619870737, + "flos": 17821740623040.0, + "grad_norm": 2.005173722793929, + "language_loss": 0.65613621, + "learning_rate": 2.527068004376515e-06, + "loss": 0.67776358, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.14031982, + "step": 7196, + "time_per_iteration": 2.6568362712860107 + }, + { + "auxiliary_loss_clip": 0.01131294, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.04717445, + "balance_loss_mlp": 1.02071786, + "epoch": 0.43270704945137534, + "flos": 26236911350880.0, + "grad_norm": 2.2304496003815446, + "language_loss": 0.72143334, + "learning_rate": 2.526692300132797e-06, + "loss": 0.74309349, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.14007568, + "step": 7197, + "time_per_iteration": 4.1583638191223145 + }, + { + "auxiliary_loss_clip": 0.01124102, + "auxiliary_loss_mlp": 0.01037805, + "balance_loss_clip": 1.046332, + "balance_loss_mlp": 1.02475762, + "epoch": 0.4327671727040433, + "flos": 30695567630400.0, + "grad_norm": 1.4709237939937128, + "language_loss": 0.72616655, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.74778557, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.1305542, + "step": 7198, + "time_per_iteration": 2.7138659954071045 + }, + { + "auxiliary_loss_clip": 0.01123532, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.04380262, + "balance_loss_mlp": 1.01455784, + "epoch": 0.43282729595671127, + "flos": 31051055959200.0, + "grad_norm": 2.2407253233657776, + "language_loss": 0.81191218, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83342135, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.1282959, + "step": 7199, + "time_per_iteration": 2.7431271076202393 + }, + { + "auxiliary_loss_clip": 0.01128591, + "auxiliary_loss_mlp": 0.01034096, + "balance_loss_clip": 1.04749298, + "balance_loss_mlp": 1.0214839, + "epoch": 0.43288741920937923, + "flos": 29439709761600.0, + "grad_norm": 2.3035742001490096, + "language_loss": 0.68747067, + "learning_rate": 2.525565067625286e-06, + "loss": 0.70909756, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.1262207, + "step": 7200, + "time_per_iteration": 2.681608200073242 + }, + { + "auxiliary_loss_clip": 0.01128574, + "auxiliary_loss_mlp": 0.01038264, + "balance_loss_clip": 1.04695725, + "balance_loss_mlp": 1.02479362, + "epoch": 0.4329475424620472, + "flos": 23434123409280.0, + "grad_norm": 2.2024552290131396, + "language_loss": 0.87147832, + "learning_rate": 2.525189283578157e-06, + "loss": 0.89314669, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13470459, + "step": 7201, + "time_per_iteration": 2.6995837688446045 + }, + { + "auxiliary_loss_clip": 0.01134683, + "auxiliary_loss_mlp": 0.01037935, + "balance_loss_clip": 1.05014014, + "balance_loss_mlp": 1.02326584, + "epoch": 0.43300766571471516, + "flos": 27623499431040.0, + "grad_norm": 2.8260497039638537, + "language_loss": 0.64403641, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.66576266, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.14654541, + "step": 7202, + "time_per_iteration": 2.6566827297210693 + }, + { + "auxiliary_loss_clip": 0.01128571, + "auxiliary_loss_mlp": 0.01027355, + "balance_loss_clip": 1.04699385, + "balance_loss_mlp": 1.01519573, + "epoch": 0.4330677889673831, + "flos": 26992855838880.0, + "grad_norm": 1.8219592886262903, + "language_loss": 0.81586587, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.83742511, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.12164307, + "step": 7203, + "time_per_iteration": 4.106656551361084 + }, + { + "auxiliary_loss_clip": 0.01134107, + "auxiliary_loss_mlp": 0.01039986, + "balance_loss_clip": 1.04868472, + "balance_loss_mlp": 1.02616382, + "epoch": 0.4331279122200511, + "flos": 28335164882400.0, + "grad_norm": 2.0509115665118247, + "language_loss": 0.81360233, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.83534324, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.13830566, + "step": 7204, + "time_per_iteration": 2.7325022220611572 + }, + { + "auxiliary_loss_clip": 0.01128297, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.04643905, + "balance_loss_mlp": 1.02428651, + "epoch": 0.43318803547271906, + "flos": 22280396902560.0, + "grad_norm": 1.9814672150270733, + "language_loss": 0.73702788, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.75867915, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.12542725, + "step": 7205, + "time_per_iteration": 2.649141788482666 + }, + { + "auxiliary_loss_clip": 0.01131345, + "auxiliary_loss_mlp": 0.01037429, + "balance_loss_clip": 1.05268145, + "balance_loss_mlp": 1.02447081, + "epoch": 0.433248158725387, + "flos": 33456426537600.0, + "grad_norm": 1.8882194633182072, + "language_loss": 0.74621254, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.76790029, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12963867, + "step": 7206, + "time_per_iteration": 2.7602126598358154 + }, + { + "auxiliary_loss_clip": 0.01128742, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_clip": 1.04727602, + "balance_loss_mlp": 1.02210951, + "epoch": 0.433308281978055, + "flos": 28331194188960.0, + "grad_norm": 2.902434239063608, + "language_loss": 0.78282034, + "learning_rate": 2.522934161574342e-06, + "loss": 0.80445933, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.1305542, + "step": 7207, + "time_per_iteration": 2.6774373054504395 + }, + { + "auxiliary_loss_clip": 0.01135482, + "auxiliary_loss_mlp": 0.01033994, + "balance_loss_clip": 1.05032563, + "balance_loss_mlp": 1.01983237, + "epoch": 0.433368405230723, + "flos": 19342705469760.0, + "grad_norm": 2.372616893569539, + "language_loss": 0.80984843, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83154321, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.14172363, + "step": 7208, + "time_per_iteration": 2.677128791809082 + }, + { + "auxiliary_loss_clip": 0.01133423, + "auxiliary_loss_mlp": 0.01031578, + "balance_loss_clip": 1.05188346, + "balance_loss_mlp": 1.01889443, + "epoch": 0.433428528483391, + "flos": 23215523918400.0, + "grad_norm": 2.1286722448172664, + "language_loss": 0.70169473, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.7233448, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.12689209, + "step": 7209, + "time_per_iteration": 4.12802791595459 + }, + { + "auxiliary_loss_clip": 0.01127546, + "auxiliary_loss_mlp": 0.01034632, + "balance_loss_clip": 1.04705942, + "balance_loss_mlp": 1.02097631, + "epoch": 0.43348865173605894, + "flos": 30161504532960.0, + "grad_norm": 1.48582474079697, + "language_loss": 0.81558847, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.8372103, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13653564, + "step": 7210, + "time_per_iteration": 2.71637225151062 + }, + { + "auxiliary_loss_clip": 0.01129582, + "auxiliary_loss_mlp": 0.01037295, + "balance_loss_clip": 1.04828238, + "balance_loss_mlp": 1.02442062, + "epoch": 0.4335487749887269, + "flos": 26955181118880.0, + "grad_norm": 1.7349497625948498, + "language_loss": 0.81852925, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.84019792, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12884521, + "step": 7211, + "time_per_iteration": 2.844754695892334 + }, + { + "auxiliary_loss_clip": 0.01128456, + "auxiliary_loss_mlp": 0.01032337, + "balance_loss_clip": 1.04697132, + "balance_loss_mlp": 1.02095246, + "epoch": 0.43360889824139487, + "flos": 27311803931520.0, + "grad_norm": 2.5484811492466477, + "language_loss": 0.74810743, + "learning_rate": 2.521054347790029e-06, + "loss": 0.76971543, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.11383057, + "step": 7212, + "time_per_iteration": 2.64593768119812 + }, + { + "auxiliary_loss_clip": 0.01131565, + "auxiliary_loss_mlp": 0.01032756, + "balance_loss_clip": 1.05006564, + "balance_loss_mlp": 1.02024508, + "epoch": 0.43366902149406283, + "flos": 21387968749440.0, + "grad_norm": 1.9313348039936342, + "language_loss": 0.76675212, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.78839529, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.125, + "step": 7213, + "time_per_iteration": 2.6464855670928955 + }, + { + "auxiliary_loss_clip": 0.01129333, + "auxiliary_loss_mlp": 0.01034794, + "balance_loss_clip": 1.04760265, + "balance_loss_mlp": 1.02261639, + "epoch": 0.4337291447467308, + "flos": 23211674776800.0, + "grad_norm": 2.2201815960450557, + "language_loss": 0.65024424, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67188549, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.12182617, + "step": 7214, + "time_per_iteration": 2.663285732269287 + }, + { + "auxiliary_loss_clip": 0.01124944, + "auxiliary_loss_mlp": 0.01037546, + "balance_loss_clip": 1.047158, + "balance_loss_mlp": 1.0251298, + "epoch": 0.43378926799939876, + "flos": 33231627902880.0, + "grad_norm": 2.6488178907040476, + "language_loss": 0.71472138, + "learning_rate": 2.519926222304191e-06, + "loss": 0.7363463, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12408447, + "step": 7215, + "time_per_iteration": 2.7291512489318848 + }, + { + "auxiliary_loss_clip": 0.01127621, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.04868603, + "balance_loss_mlp": 1.02281785, + "epoch": 0.43384939125206673, + "flos": 19475421027840.0, + "grad_norm": 1.868349218447997, + "language_loss": 0.74770617, + "learning_rate": 2.519550141025255e-06, + "loss": 0.76933819, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12762451, + "step": 7216, + "time_per_iteration": 2.609340190887451 + }, + { + "auxiliary_loss_clip": 0.01139082, + "auxiliary_loss_mlp": 0.01040701, + "balance_loss_clip": 1.05155659, + "balance_loss_mlp": 1.02656245, + "epoch": 0.4339095145047347, + "flos": 26592723817920.0, + "grad_norm": 2.701477552551681, + "language_loss": 0.75814903, + "learning_rate": 2.519174040044927e-06, + "loss": 0.77994674, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 0.87597656, + "router_z_loss_mlp": 0.14135742, + "step": 7217, + "time_per_iteration": 2.6637895107269287 + }, + { + "auxiliary_loss_clip": 0.01130972, + "auxiliary_loss_mlp": 0.01037104, + "balance_loss_clip": 1.04928279, + "balance_loss_mlp": 1.02394903, + "epoch": 0.43396963775740266, + "flos": 17338398809760.0, + "grad_norm": 1.7767720526850102, + "language_loss": 0.73939383, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.76107454, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13153076, + "step": 7218, + "time_per_iteration": 2.6344385147094727 + }, + { + "auxiliary_loss_clip": 0.0113385, + "auxiliary_loss_mlp": 0.01032063, + "balance_loss_clip": 1.05020463, + "balance_loss_mlp": 1.01903939, + "epoch": 0.4340297610100706, + "flos": 24061160963520.0, + "grad_norm": 1.7122713191273407, + "language_loss": 0.68586659, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.70752567, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13024902, + "step": 7219, + "time_per_iteration": 2.687749147415161 + }, + { + "auxiliary_loss_clip": 0.01130705, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.05021286, + "balance_loss_mlp": 1.01989198, + "epoch": 0.4340898842627386, + "flos": 23126925327840.0, + "grad_norm": 1.6135113655586333, + "language_loss": 0.77205044, + "learning_rate": 2.518045619038202e-06, + "loss": 0.79368484, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12835693, + "step": 7220, + "time_per_iteration": 2.6771795749664307 + }, + { + "auxiliary_loss_clip": 0.01131049, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.04803371, + "balance_loss_mlp": 1.02020645, + "epoch": 0.4341500075154066, + "flos": 26866825632000.0, + "grad_norm": 1.9941229599682642, + "language_loss": 0.69647896, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.71812475, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.13348389, + "step": 7221, + "time_per_iteration": 2.7572221755981445 + }, + { + "auxiliary_loss_clip": 0.01132035, + "auxiliary_loss_mlp": 0.01039298, + "balance_loss_clip": 1.04762626, + "balance_loss_mlp": 1.02640545, + "epoch": 0.4342101307680746, + "flos": 28775969902080.0, + "grad_norm": 2.325868602289637, + "language_loss": 0.65105414, + "learning_rate": 2.51729324012157e-06, + "loss": 0.67276746, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 0.84375, + "router_z_loss_mlp": 0.12884521, + "step": 7222, + "time_per_iteration": 2.7624404430389404 + }, + { + "auxiliary_loss_clip": 0.01129112, + "auxiliary_loss_mlp": 0.01033527, + "balance_loss_clip": 1.04764748, + "balance_loss_mlp": 1.02004492, + "epoch": 0.43427025402074254, + "flos": 21924341331840.0, + "grad_norm": 2.476080846525853, + "language_loss": 0.72688735, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.74851382, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13482666, + "step": 7223, + "time_per_iteration": 2.6679928302764893 + }, + { + "auxiliary_loss_clip": 0.01130532, + "auxiliary_loss_mlp": 0.0102947, + "balance_loss_clip": 1.0464046, + "balance_loss_mlp": 1.01567745, + "epoch": 0.4343303772734105, + "flos": 32075146221120.0, + "grad_norm": 1.9795535466931315, + "language_loss": 0.93562526, + "learning_rate": 2.516540782741694e-06, + "loss": 0.9572252, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13769531, + "step": 7224, + "time_per_iteration": 2.720211982727051 + }, + { + "auxiliary_loss_clip": 0.01129215, + "auxiliary_loss_mlp": 0.01037008, + "balance_loss_clip": 1.04829752, + "balance_loss_mlp": 1.02380586, + "epoch": 0.43439050052607847, + "flos": 32743302464160.0, + "grad_norm": 1.6510340396735712, + "language_loss": 0.60982394, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.63148612, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13183594, + "step": 7225, + "time_per_iteration": 2.706582546234131 + }, + { + "auxiliary_loss_clip": 0.01128457, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.04736269, + "balance_loss_mlp": 1.02204609, + "epoch": 0.43445062377874644, + "flos": 26109503556480.0, + "grad_norm": 1.8074934842371966, + "language_loss": 0.77192199, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.79356557, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13873291, + "step": 7226, + "time_per_iteration": 2.672043800354004 + }, + { + "auxiliary_loss_clip": 0.01128848, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.04867017, + "balance_loss_mlp": 1.01855922, + "epoch": 0.4345107470314144, + "flos": 24284298389760.0, + "grad_norm": 1.54884909519258, + "language_loss": 0.84426177, + "learning_rate": 2.515411949802964e-06, + "loss": 0.8658638, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.12805176, + "step": 7227, + "time_per_iteration": 2.6466128826141357 + }, + { + "auxiliary_loss_clip": 0.01128814, + "auxiliary_loss_mlp": 0.01040202, + "balance_loss_clip": 1.04837465, + "balance_loss_mlp": 1.02681446, + "epoch": 0.43457087028408237, + "flos": 32253558920640.0, + "grad_norm": 2.9252426494301162, + "language_loss": 0.76704752, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.78873771, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13378906, + "step": 7228, + "time_per_iteration": 2.750427722930908 + }, + { + "auxiliary_loss_clip": 0.01132024, + "auxiliary_loss_mlp": 0.01039435, + "balance_loss_clip": 1.05026793, + "balance_loss_mlp": 1.02589858, + "epoch": 0.43463099353675033, + "flos": 38887236276480.0, + "grad_norm": 1.6742607497746858, + "language_loss": 0.80477268, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.82648724, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13543701, + "step": 7229, + "time_per_iteration": 2.736016035079956 + }, + { + "auxiliary_loss_clip": 0.01130957, + "auxiliary_loss_mlp": 0.01042573, + "balance_loss_clip": 1.04724717, + "balance_loss_mlp": 1.02874494, + "epoch": 0.4346911167894183, + "flos": 29982240970560.0, + "grad_norm": 1.9132987172160154, + "language_loss": 0.81496686, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.83670217, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13830566, + "step": 7230, + "time_per_iteration": 2.733095407485962 + }, + { + "auxiliary_loss_clip": 0.01136682, + "auxiliary_loss_mlp": 0.01043016, + "balance_loss_clip": 1.05124807, + "balance_loss_mlp": 1.02922916, + "epoch": 0.43475124004208626, + "flos": 20853946169280.0, + "grad_norm": 4.254838928215658, + "language_loss": 0.77139908, + "learning_rate": 2.513906565661973e-06, + "loss": 0.79319608, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.13787842, + "step": 7231, + "time_per_iteration": 2.634354591369629 + }, + { + "auxiliary_loss_clip": 0.01128319, + "auxiliary_loss_mlp": 0.01035363, + "balance_loss_clip": 1.04837453, + "balance_loss_mlp": 1.023067, + "epoch": 0.4348113632947542, + "flos": 31853264830560.0, + "grad_norm": 1.4955340366188634, + "language_loss": 0.68560576, + "learning_rate": 2.513530170872575e-06, + "loss": 0.70724261, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12298584, + "step": 7232, + "time_per_iteration": 2.727155923843384 + }, + { + "auxiliary_loss_clip": 0.01133567, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.04865456, + "balance_loss_mlp": 1.02151275, + "epoch": 0.4348714865474222, + "flos": 41732844632640.0, + "grad_norm": 1.7533401878215322, + "language_loss": 0.7182008, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.73989314, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.14154053, + "step": 7233, + "time_per_iteration": 2.8000051975250244 + }, + { + "auxiliary_loss_clip": 0.0113544, + "auxiliary_loss_mlp": 0.01041383, + "balance_loss_clip": 1.05007529, + "balance_loss_mlp": 1.02675557, + "epoch": 0.43493160980009016, + "flos": 38483174079360.0, + "grad_norm": 1.8168833275930991, + "language_loss": 0.74445319, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.7662214, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 0.85449219, + "router_z_loss_mlp": 0.1463623, + "step": 7234, + "time_per_iteration": 2.792245864868164 + }, + { + "auxiliary_loss_clip": 0.0113659, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.04899406, + "balance_loss_mlp": 1.02262878, + "epoch": 0.4349917330527582, + "flos": 29359903420800.0, + "grad_norm": 1.9299193811652047, + "language_loss": 0.59002995, + "learning_rate": 2.512400869722782e-06, + "loss": 0.61176324, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 0.87646484, + "router_z_loss_mlp": 0.14105225, + "step": 7235, + "time_per_iteration": 2.800696849822998 + }, + { + "auxiliary_loss_clip": 0.01129003, + "auxiliary_loss_mlp": 0.01033429, + "balance_loss_clip": 1.04674125, + "balance_loss_mlp": 1.02008915, + "epoch": 0.43505185630542614, + "flos": 37240443809280.0, + "grad_norm": 1.8588789761303335, + "language_loss": 0.77602708, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79765141, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13336182, + "step": 7236, + "time_per_iteration": 5.595293998718262 + }, + { + "auxiliary_loss_clip": 0.01126657, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.04722941, + "balance_loss_mlp": 1.01849771, + "epoch": 0.4351119795580941, + "flos": 19200346799040.0, + "grad_norm": 2.434546338323873, + "language_loss": 0.81432223, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.8359071, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13317871, + "step": 7237, + "time_per_iteration": 2.639411211013794 + }, + { + "auxiliary_loss_clip": 0.01129482, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.04818869, + "balance_loss_mlp": 1.02273381, + "epoch": 0.4351721028107621, + "flos": 22856591620800.0, + "grad_norm": 2.318397476918772, + "language_loss": 0.63096666, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65262246, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.13366699, + "step": 7238, + "time_per_iteration": 2.703061580657959 + }, + { + "auxiliary_loss_clip": 0.01125379, + "auxiliary_loss_mlp": 0.01038261, + "balance_loss_clip": 1.04608154, + "balance_loss_mlp": 1.02529752, + "epoch": 0.43523222606343004, + "flos": 30782383460640.0, + "grad_norm": 2.401458025038376, + "language_loss": 0.86274815, + "learning_rate": 2.510894862898928e-06, + "loss": 0.88438457, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12976074, + "step": 7239, + "time_per_iteration": 2.670872449874878 + }, + { + "auxiliary_loss_clip": 0.01133022, + "auxiliary_loss_mlp": 0.01032082, + "balance_loss_clip": 1.05193686, + "balance_loss_mlp": 1.01938617, + "epoch": 0.435292349316098, + "flos": 27710112674880.0, + "grad_norm": 1.5376874566970897, + "language_loss": 0.72445118, + "learning_rate": 2.510518312724309e-06, + "loss": 0.74610221, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.12719727, + "step": 7240, + "time_per_iteration": 2.7088286876678467 + }, + { + "auxiliary_loss_clip": 0.01134151, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.05020833, + "balance_loss_mlp": 1.01848459, + "epoch": 0.43535247256876597, + "flos": 31452160394880.0, + "grad_norm": 2.0463633448753873, + "language_loss": 0.8215822, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.84324658, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.13793945, + "step": 7241, + "time_per_iteration": 2.6909377574920654 + }, + { + "auxiliary_loss_clip": 0.01135432, + "auxiliary_loss_mlp": 0.01037142, + "balance_loss_clip": 1.04930627, + "balance_loss_mlp": 1.02264023, + "epoch": 0.43541259582143393, + "flos": 21387806680320.0, + "grad_norm": 2.5487334543938616, + "language_loss": 0.79391974, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.81564546, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 0.86083984, + "router_z_loss_mlp": 0.14501953, + "step": 7242, + "time_per_iteration": 4.227131366729736 + }, + { + "auxiliary_loss_clip": 0.01132437, + "auxiliary_loss_mlp": 0.01032676, + "balance_loss_clip": 1.04689181, + "balance_loss_mlp": 1.01901436, + "epoch": 0.4354727190741019, + "flos": 18540780219360.0, + "grad_norm": 2.593482411954218, + "language_loss": 0.68189925, + "learning_rate": 2.509388546104138e-06, + "loss": 0.70355034, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.13665771, + "step": 7243, + "time_per_iteration": 2.7364206314086914 + }, + { + "auxiliary_loss_clip": 0.01130442, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.05109262, + "balance_loss_mlp": 1.01796877, + "epoch": 0.43553284232676986, + "flos": 20315628757440.0, + "grad_norm": 1.5590186637152783, + "language_loss": 0.81302661, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83463609, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12542725, + "step": 7244, + "time_per_iteration": 2.6941261291503906 + }, + { + "auxiliary_loss_clip": 0.01132103, + "auxiliary_loss_mlp": 0.0102916, + "balance_loss_clip": 1.05073929, + "balance_loss_mlp": 1.01684022, + "epoch": 0.43559296557943783, + "flos": 28554291097920.0, + "grad_norm": 1.988708190094881, + "language_loss": 0.73658562, + "learning_rate": 2.508635271753234e-06, + "loss": 0.75819826, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.12322998, + "step": 7245, + "time_per_iteration": 2.7022006511688232 + }, + { + "auxiliary_loss_clip": 0.01131798, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.05015898, + "balance_loss_mlp": 1.02299643, + "epoch": 0.4356530888321058, + "flos": 27355920899040.0, + "grad_norm": 1.61810876255576, + "language_loss": 0.76822865, + "learning_rate": 2.508258605639389e-06, + "loss": 0.78990144, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.12481689, + "step": 7246, + "time_per_iteration": 2.706470489501953 + }, + { + "auxiliary_loss_clip": 0.01132567, + "auxiliary_loss_mlp": 0.0104052, + "balance_loss_clip": 1.05000341, + "balance_loss_mlp": 1.02713251, + "epoch": 0.43571321208477376, + "flos": 26376150191040.0, + "grad_norm": 1.8055725719749016, + "language_loss": 0.85434496, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.87607586, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13378906, + "step": 7247, + "time_per_iteration": 2.684509515762329 + }, + { + "auxiliary_loss_clip": 0.01131937, + "auxiliary_loss_mlp": 0.01036953, + "balance_loss_clip": 1.05028665, + "balance_loss_mlp": 1.02460301, + "epoch": 0.4357733353374418, + "flos": 29269724656320.0, + "grad_norm": 1.8658247979087972, + "language_loss": 0.72453851, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74622738, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.12359619, + "step": 7248, + "time_per_iteration": 4.152439832687378 + }, + { + "auxiliary_loss_clip": 0.01131103, + "auxiliary_loss_mlp": 0.01036537, + "balance_loss_clip": 1.04977131, + "balance_loss_mlp": 1.02300072, + "epoch": 0.43583345859010975, + "flos": 30828445257600.0, + "grad_norm": 2.064102977644865, + "language_loss": 0.87157691, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.89325333, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13531494, + "step": 7249, + "time_per_iteration": 2.6883797645568848 + }, + { + "auxiliary_loss_clip": 0.01134608, + "auxiliary_loss_mlp": 0.01043106, + "balance_loss_clip": 1.05027413, + "balance_loss_mlp": 1.03062439, + "epoch": 0.4358935818427777, + "flos": 28914479431200.0, + "grad_norm": 1.8229099339443966, + "language_loss": 0.81842661, + "learning_rate": 2.506751748594683e-06, + "loss": 0.84020376, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.12493896, + "step": 7250, + "time_per_iteration": 2.7554514408111572 + }, + { + "auxiliary_loss_clip": 0.01138345, + "auxiliary_loss_mlp": 0.0103541, + "balance_loss_clip": 1.05537748, + "balance_loss_mlp": 1.02190363, + "epoch": 0.4359537050954457, + "flos": 36036765846720.0, + "grad_norm": 2.005832402199543, + "language_loss": 0.84553719, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.8672747, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.13513184, + "step": 7251, + "time_per_iteration": 2.7462825775146484 + }, + { + "auxiliary_loss_clip": 0.0112577, + "auxiliary_loss_mlp": 0.01037433, + "balance_loss_clip": 1.04612291, + "balance_loss_mlp": 1.02354503, + "epoch": 0.43601382834811364, + "flos": 27712462677120.0, + "grad_norm": 1.7713634448591515, + "language_loss": 0.69619006, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.71782207, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.13873291, + "step": 7252, + "time_per_iteration": 2.731017589569092 + }, + { + "auxiliary_loss_clip": 0.01130421, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.0508585, + "balance_loss_mlp": 1.02184606, + "epoch": 0.4360739516007816, + "flos": 23304203543520.0, + "grad_norm": 1.6499666528652814, + "language_loss": 0.83607739, + "learning_rate": 2.505621403992348e-06, + "loss": 0.85773313, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.13305664, + "step": 7253, + "time_per_iteration": 2.711470365524292 + }, + { + "auxiliary_loss_clip": 0.0113059, + "auxiliary_loss_mlp": 0.01040601, + "balance_loss_clip": 1.04966116, + "balance_loss_mlp": 1.027035, + "epoch": 0.43613407485344957, + "flos": 28558302308640.0, + "grad_norm": 1.4848217843278655, + "language_loss": 0.70595932, + "learning_rate": 2.505244584092757e-06, + "loss": 0.72767127, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13580322, + "step": 7254, + "time_per_iteration": 2.6980926990509033 + }, + { + "auxiliary_loss_clip": 0.01130958, + "auxiliary_loss_mlp": 0.01035683, + "balance_loss_clip": 1.05134702, + "balance_loss_mlp": 1.02277303, + "epoch": 0.43619419810611754, + "flos": 27622851154560.0, + "grad_norm": 1.8122015080734197, + "language_loss": 0.81237715, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.83404362, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12915039, + "step": 7255, + "time_per_iteration": 2.9560108184814453 + }, + { + "auxiliary_loss_clip": 0.01128599, + "auxiliary_loss_mlp": 0.01038254, + "balance_loss_clip": 1.04724312, + "balance_loss_mlp": 1.02533126, + "epoch": 0.4362543213587855, + "flos": 24462467985600.0, + "grad_norm": 1.6634610981188194, + "language_loss": 0.77842307, + "learning_rate": 2.504490886831089e-06, + "loss": 0.80009162, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.12921143, + "step": 7256, + "time_per_iteration": 2.6856250762939453 + }, + { + "auxiliary_loss_clip": 0.01131751, + "auxiliary_loss_mlp": 0.0103539, + "balance_loss_clip": 1.05101371, + "balance_loss_mlp": 1.02191961, + "epoch": 0.43631444461145347, + "flos": 26065143485280.0, + "grad_norm": 1.5759425348271061, + "language_loss": 0.76467818, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.78634959, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13476562, + "step": 7257, + "time_per_iteration": 2.7213938236236572 + }, + { + "auxiliary_loss_clip": 0.01127738, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.04632592, + "balance_loss_mlp": 1.02199674, + "epoch": 0.43637456786412143, + "flos": 27355313139840.0, + "grad_norm": 1.7049840366863034, + "language_loss": 0.73339379, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.75503051, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13934326, + "step": 7258, + "time_per_iteration": 2.649212598800659 + }, + { + "auxiliary_loss_clip": 0.01129896, + "auxiliary_loss_mlp": 0.01035174, + "balance_loss_clip": 1.0474546, + "balance_loss_mlp": 1.02235293, + "epoch": 0.4364346911167894, + "flos": 34835924093760.0, + "grad_norm": 1.7518431771762428, + "language_loss": 0.76885509, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.79050577, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.12817383, + "step": 7259, + "time_per_iteration": 2.7350523471832275 + }, + { + "auxiliary_loss_clip": 0.010506, + "auxiliary_loss_mlp": 0.01006438, + "balance_loss_clip": 1.02497542, + "balance_loss_mlp": 1.0050416, + "epoch": 0.43649481436945736, + "flos": 76457162671200.0, + "grad_norm": 0.7480610779460817, + "language_loss": 0.56930715, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.58987755, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 0.25683594, + "router_z_loss_mlp": 0.01396179, + "step": 7260, + "time_per_iteration": 3.2594101428985596 + }, + { + "auxiliary_loss_clip": 0.01131183, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_clip": 1.04710817, + "balance_loss_mlp": 1.03107429, + "epoch": 0.4365549376221254, + "flos": 37329285503520.0, + "grad_norm": 2.0195407004470374, + "language_loss": 0.71138716, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.7331481, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13830566, + "step": 7261, + "time_per_iteration": 2.7863831520080566 + }, + { + "auxiliary_loss_clip": 0.01131056, + "auxiliary_loss_mlp": 0.01044012, + "balance_loss_clip": 1.04792511, + "balance_loss_mlp": 1.0302968, + "epoch": 0.43661506087479335, + "flos": 20945502521280.0, + "grad_norm": 1.8456135702338248, + "language_loss": 0.69521922, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.71696991, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13708496, + "step": 7262, + "time_per_iteration": 2.6918601989746094 + }, + { + "auxiliary_loss_clip": 0.01123793, + "auxiliary_loss_mlp": 0.0103454, + "balance_loss_clip": 1.04702461, + "balance_loss_mlp": 1.02280998, + "epoch": 0.4366751841274613, + "flos": 26903122764480.0, + "grad_norm": 1.6416795147450332, + "language_loss": 0.79569471, + "learning_rate": 2.501852344559726e-06, + "loss": 0.81727809, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11737061, + "step": 7263, + "time_per_iteration": 2.658456325531006 + }, + { + "auxiliary_loss_clip": 0.01131009, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_clip": 1.0504427, + "balance_loss_mlp": 1.03397632, + "epoch": 0.4367353073801293, + "flos": 19519619029920.0, + "grad_norm": 1.7341520057340583, + "language_loss": 0.75529754, + "learning_rate": 2.50147533371401e-06, + "loss": 0.7770806, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13317871, + "step": 7264, + "time_per_iteration": 2.6857900619506836 + }, + { + "auxiliary_loss_clip": 0.01124618, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.04461718, + "balance_loss_mlp": 1.02175224, + "epoch": 0.43679543063279724, + "flos": 46634412830400.0, + "grad_norm": 2.1419171124728447, + "language_loss": 0.61459595, + "learning_rate": 2.501098303852298e-06, + "loss": 0.63619637, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.13665771, + "step": 7265, + "time_per_iteration": 2.8131189346313477 + }, + { + "auxiliary_loss_clip": 0.01124054, + "auxiliary_loss_mlp": 0.01038075, + "balance_loss_clip": 1.04568911, + "balance_loss_mlp": 1.02621329, + "epoch": 0.4368555538854652, + "flos": 18538551768960.0, + "grad_norm": 1.8648919761394476, + "language_loss": 0.72631788, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.74793911, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.11853027, + "step": 7266, + "time_per_iteration": 2.7221450805664062 + }, + { + "auxiliary_loss_clip": 0.01130271, + "auxiliary_loss_mlp": 0.01041405, + "balance_loss_clip": 1.04805899, + "balance_loss_mlp": 1.02783298, + "epoch": 0.4369156771381332, + "flos": 28149742693440.0, + "grad_norm": 2.047945407427171, + "language_loss": 0.8238337, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.84555054, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.13574219, + "step": 7267, + "time_per_iteration": 2.716564416885376 + }, + { + "auxiliary_loss_clip": 0.0112393, + "auxiliary_loss_mlp": 0.01032881, + "balance_loss_clip": 1.04502082, + "balance_loss_mlp": 1.02065039, + "epoch": 0.43697580039080114, + "flos": 28603027035360.0, + "grad_norm": 3.0884818643247787, + "language_loss": 0.74520189, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.76677001, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12219238, + "step": 7268, + "time_per_iteration": 2.7029941082000732 + }, + { + "auxiliary_loss_clip": 0.0113257, + "auxiliary_loss_mlp": 0.01041817, + "balance_loss_clip": 1.04794073, + "balance_loss_mlp": 1.0280782, + "epoch": 0.4370359236434691, + "flos": 22591525160160.0, + "grad_norm": 3.3724849527837018, + "language_loss": 0.79718637, + "learning_rate": 2.499589994531454e-06, + "loss": 0.81893021, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.13751221, + "step": 7269, + "time_per_iteration": 2.63553786277771 + }, + { + "auxiliary_loss_clip": 0.01128099, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.04746437, + "balance_loss_mlp": 1.02643669, + "epoch": 0.43709604689613707, + "flos": 28336258848960.0, + "grad_norm": 1.767661777781811, + "language_loss": 0.74897003, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77064478, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12945557, + "step": 7270, + "time_per_iteration": 2.691253423690796 + }, + { + "auxiliary_loss_clip": 0.01127858, + "auxiliary_loss_mlp": 0.01035825, + "balance_loss_clip": 1.04624343, + "balance_loss_mlp": 1.02259243, + "epoch": 0.43715617014880503, + "flos": 29045736367200.0, + "grad_norm": 1.807483948448789, + "language_loss": 0.79820204, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.81983888, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13220215, + "step": 7271, + "time_per_iteration": 2.6673576831817627 + }, + { + "auxiliary_loss_clip": 0.01046178, + "auxiliary_loss_mlp": 0.01001663, + "balance_loss_clip": 1.02075982, + "balance_loss_mlp": 1.00026488, + "epoch": 0.437216293401473, + "flos": 75579766428960.0, + "grad_norm": 0.7151401919601048, + "language_loss": 0.54889035, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.56936878, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 0.01397705, + "step": 7272, + "time_per_iteration": 3.372917652130127 + }, + { + "auxiliary_loss_clip": 0.01130631, + "auxiliary_loss_mlp": 0.01046122, + "balance_loss_clip": 1.04825926, + "balance_loss_mlp": 1.0331583, + "epoch": 0.43727641665414096, + "flos": 26822789699040.0, + "grad_norm": 1.729713764677597, + "language_loss": 0.69748163, + "learning_rate": 2.498081382098581e-06, + "loss": 0.71924913, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.12963867, + "step": 7273, + "time_per_iteration": 2.7014682292938232 + }, + { + "auxiliary_loss_clip": 0.01129999, + "auxiliary_loss_mlp": 0.01038202, + "balance_loss_clip": 1.04732072, + "balance_loss_mlp": 1.02475524, + "epoch": 0.437336539906809, + "flos": 48237047812800.0, + "grad_norm": 1.9681733622878417, + "language_loss": 0.75177181, + "learning_rate": 2.497704181736367e-06, + "loss": 0.77345383, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13439941, + "step": 7274, + "time_per_iteration": 2.819507360458374 + }, + { + "auxiliary_loss_clip": 0.01124145, + "auxiliary_loss_mlp": 0.0103306, + "balance_loss_clip": 1.04417133, + "balance_loss_mlp": 1.02144885, + "epoch": 0.43739666315947695, + "flos": 21300990850080.0, + "grad_norm": 1.7824342189465059, + "language_loss": 0.8037858, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82535779, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.1161499, + "step": 7275, + "time_per_iteration": 2.653824806213379 + }, + { + "auxiliary_loss_clip": 0.01128797, + "auxiliary_loss_mlp": 0.01035052, + "balance_loss_clip": 1.04862273, + "balance_loss_mlp": 1.02223694, + "epoch": 0.4374567864121449, + "flos": 19960991291520.0, + "grad_norm": 2.083318988440588, + "language_loss": 0.8097403, + "learning_rate": 2.496949724407266e-06, + "loss": 0.83137876, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.12805176, + "step": 7276, + "time_per_iteration": 5.569474935531616 + }, + { + "auxiliary_loss_clip": 0.01132957, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.04705846, + "balance_loss_mlp": 1.0191555, + "epoch": 0.4375169096648129, + "flos": 37324099291680.0, + "grad_norm": 1.9596604730833758, + "language_loss": 0.72885644, + "learning_rate": 2.496572467468988e-06, + "loss": 0.75051188, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 0.859375, + "router_z_loss_mlp": 0.13439941, + "step": 7277, + "time_per_iteration": 2.755066394805908 + }, + { + "auxiliary_loss_clip": 0.01127251, + "auxiliary_loss_mlp": 0.01039796, + "balance_loss_clip": 1.04641294, + "balance_loss_mlp": 1.02622378, + "epoch": 0.43757703291748085, + "flos": 37284722845920.0, + "grad_norm": 1.884839811719251, + "language_loss": 0.72831672, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.74998719, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13580322, + "step": 7278, + "time_per_iteration": 2.7970423698425293 + }, + { + "auxiliary_loss_clip": 0.01126546, + "auxiliary_loss_mlp": 0.01042376, + "balance_loss_clip": 1.04691446, + "balance_loss_mlp": 1.03053892, + "epoch": 0.4376371561701488, + "flos": 26109179418240.0, + "grad_norm": 1.5360165746050436, + "language_loss": 0.66183078, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.68351996, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.11834717, + "step": 7279, + "time_per_iteration": 2.7286250591278076 + }, + { + "auxiliary_loss_clip": 0.01134324, + "auxiliary_loss_mlp": 0.01038694, + "balance_loss_clip": 1.04783177, + "balance_loss_mlp": 1.02542043, + "epoch": 0.4376972794228168, + "flos": 28558261791360.0, + "grad_norm": 5.95790203166967, + "language_loss": 0.81724352, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.83897376, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 0.86572266, + "router_z_loss_mlp": 0.13275146, + "step": 7280, + "time_per_iteration": 2.686086654663086 + }, + { + "auxiliary_loss_clip": 0.01123073, + "auxiliary_loss_mlp": 0.01036698, + "balance_loss_clip": 1.04451561, + "balance_loss_mlp": 1.02425814, + "epoch": 0.43775740267548474, + "flos": 27934222515840.0, + "grad_norm": 1.6181583566867668, + "language_loss": 0.77027941, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.79187709, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12432861, + "step": 7281, + "time_per_iteration": 2.7257938385009766 + }, + { + "auxiliary_loss_clip": 0.01125293, + "auxiliary_loss_mlp": 0.01039874, + "balance_loss_clip": 1.04461145, + "balance_loss_mlp": 1.02768481, + "epoch": 0.4378175259281527, + "flos": 28424087611200.0, + "grad_norm": 2.1601610407589678, + "language_loss": 0.75683463, + "learning_rate": 2.494685900612569e-06, + "loss": 0.77848631, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.12188721, + "step": 7282, + "time_per_iteration": 4.066568374633789 + }, + { + "auxiliary_loss_clip": 0.01129014, + "auxiliary_loss_mlp": 0.01039588, + "balance_loss_clip": 1.04735923, + "balance_loss_mlp": 1.02693367, + "epoch": 0.43787764918082067, + "flos": 29135023751520.0, + "grad_norm": 1.9821345098385659, + "language_loss": 0.8465414, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.86822748, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.12664795, + "step": 7283, + "time_per_iteration": 2.710893392562866 + }, + { + "auxiliary_loss_clip": 0.01132495, + "auxiliary_loss_mlp": 0.01035455, + "balance_loss_clip": 1.04721928, + "balance_loss_mlp": 1.02188945, + "epoch": 0.43793777243348864, + "flos": 29270616036480.0, + "grad_norm": 1.7817323125150493, + "language_loss": 0.80495167, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82663119, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.13568115, + "step": 7284, + "time_per_iteration": 2.698931932449341 + }, + { + "auxiliary_loss_clip": 0.01127061, + "auxiliary_loss_mlp": 0.01038812, + "balance_loss_clip": 1.04620993, + "balance_loss_mlp": 1.02618814, + "epoch": 0.4379978956861566, + "flos": 22725010546560.0, + "grad_norm": 2.04030257220524, + "language_loss": 0.80253512, + "learning_rate": 2.493553735281787e-06, + "loss": 0.82419378, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.12609863, + "step": 7285, + "time_per_iteration": 2.6516613960266113 + }, + { + "auxiliary_loss_clip": 0.01124008, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.0438391, + "balance_loss_mlp": 1.01708817, + "epoch": 0.43805801893882457, + "flos": 26821817284320.0, + "grad_norm": 2.013257695451107, + "language_loss": 0.74728179, + "learning_rate": 2.493176309387897e-06, + "loss": 0.76882237, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.12957764, + "step": 7286, + "time_per_iteration": 2.734644889831543 + }, + { + "auxiliary_loss_clip": 0.01127191, + "auxiliary_loss_mlp": 0.01029198, + "balance_loss_clip": 1.04410458, + "balance_loss_mlp": 1.01598418, + "epoch": 0.43811814219149253, + "flos": 32205876432480.0, + "grad_norm": 1.5341153028787329, + "language_loss": 0.73460472, + "learning_rate": 2.492798864792712e-06, + "loss": 0.7561686, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.13220215, + "step": 7287, + "time_per_iteration": 4.019006013870239 + }, + { + "auxiliary_loss_clip": 0.0112878, + "auxiliary_loss_mlp": 0.01041601, + "balance_loss_clip": 1.04638314, + "balance_loss_mlp": 1.02775526, + "epoch": 0.43817826544416055, + "flos": 21345148334880.0, + "grad_norm": 1.821219119102915, + "language_loss": 0.82552111, + "learning_rate": 2.492421401510545e-06, + "loss": 0.84722489, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.13848877, + "step": 7288, + "time_per_iteration": 2.6951255798339844 + }, + { + "auxiliary_loss_clip": 0.01126653, + "auxiliary_loss_mlp": 0.01031732, + "balance_loss_clip": 1.04248261, + "balance_loss_mlp": 1.01851785, + "epoch": 0.4382383886968285, + "flos": 26333167707360.0, + "grad_norm": 2.111497143420479, + "language_loss": 0.84249139, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86407524, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.13232422, + "step": 7289, + "time_per_iteration": 2.664203643798828 + }, + { + "auxiliary_loss_clip": 0.01131452, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.045748, + "balance_loss_mlp": 1.02237141, + "epoch": 0.4382985119494965, + "flos": 34073577875520.0, + "grad_norm": 10.639111034754958, + "language_loss": 0.78357846, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.80524755, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 0.85644531, + "router_z_loss_mlp": 0.13092041, + "step": 7290, + "time_per_iteration": 2.752501964569092 + }, + { + "auxiliary_loss_clip": 0.01128049, + "auxiliary_loss_mlp": 0.01039231, + "balance_loss_clip": 1.04712403, + "balance_loss_mlp": 1.0268389, + "epoch": 0.43835863520216445, + "flos": 30428272719360.0, + "grad_norm": 1.8527199316944756, + "language_loss": 0.7812537, + "learning_rate": 2.491288899685288e-06, + "loss": 0.80292654, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.1239624, + "step": 7291, + "time_per_iteration": 2.7062036991119385 + }, + { + "auxiliary_loss_clip": 0.01127744, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.04463696, + "balance_loss_mlp": 1.01914525, + "epoch": 0.4384187584548324, + "flos": 40890327418080.0, + "grad_norm": 1.6146828137849496, + "language_loss": 0.6523006, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.6739037, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.13415527, + "step": 7292, + "time_per_iteration": 2.7819602489471436 + }, + { + "auxiliary_loss_clip": 0.01129654, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.04569077, + "balance_loss_mlp": 1.01703572, + "epoch": 0.4384788817075004, + "flos": 29225405102400.0, + "grad_norm": 1.8238596637523237, + "language_loss": 0.74409497, + "learning_rate": 2.49053380529597e-06, + "loss": 0.7656858, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.12390137, + "step": 7293, + "time_per_iteration": 2.6884233951568604 + }, + { + "auxiliary_loss_clip": 0.01127841, + "auxiliary_loss_mlp": 0.01039487, + "balance_loss_clip": 1.04552364, + "balance_loss_mlp": 1.02586174, + "epoch": 0.43853900496016834, + "flos": 23303960439840.0, + "grad_norm": 2.687237672819263, + "language_loss": 0.78619242, + "learning_rate": 2.490156230192516e-06, + "loss": 0.80786568, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.1362915, + "step": 7294, + "time_per_iteration": 2.6318788528442383 + }, + { + "auxiliary_loss_clip": 0.01129211, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.04606295, + "balance_loss_mlp": 1.02260053, + "epoch": 0.4385991282128363, + "flos": 16136989814880.0, + "grad_norm": 1.711404770846045, + "language_loss": 0.73234355, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.75398993, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.1282959, + "step": 7295, + "time_per_iteration": 2.658163070678711 + }, + { + "auxiliary_loss_clip": 0.01129876, + "auxiliary_loss_mlp": 0.0103956, + "balance_loss_clip": 1.04668629, + "balance_loss_mlp": 1.0259701, + "epoch": 0.4386592514655043, + "flos": 17472613507200.0, + "grad_norm": 3.292983518395702, + "language_loss": 0.75507057, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.77676493, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.13586426, + "step": 7296, + "time_per_iteration": 2.6210687160491943 + }, + { + "auxiliary_loss_clip": 0.01126871, + "auxiliary_loss_mlp": 0.01030143, + "balance_loss_clip": 1.04462469, + "balance_loss_mlp": 1.0167501, + "epoch": 0.43871937471817224, + "flos": 27801709544160.0, + "grad_norm": 1.4992496198575491, + "language_loss": 0.69155395, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71312416, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.1340332, + "step": 7297, + "time_per_iteration": 2.7046408653259277 + }, + { + "auxiliary_loss_clip": 0.01125316, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.04491699, + "balance_loss_mlp": 1.02103782, + "epoch": 0.4387794979708402, + "flos": 34254624198240.0, + "grad_norm": 1.6080004079754533, + "language_loss": 0.70711255, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.72869843, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12243652, + "step": 7298, + "time_per_iteration": 2.6941277980804443 + }, + { + "auxiliary_loss_clip": 0.01127386, + "auxiliary_loss_mlp": 0.01027791, + "balance_loss_clip": 1.04676998, + "balance_loss_mlp": 1.01477361, + "epoch": 0.43883962122350817, + "flos": 32030218908000.0, + "grad_norm": 1.6461904416436444, + "language_loss": 0.72768283, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74923462, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13024902, + "step": 7299, + "time_per_iteration": 2.6998212337493896 + }, + { + "auxiliary_loss_clip": 0.01127889, + "auxiliary_loss_mlp": 0.01044899, + "balance_loss_clip": 1.04468107, + "balance_loss_mlp": 1.0306778, + "epoch": 0.43889974447617613, + "flos": 31583944055520.0, + "grad_norm": 1.7027894593174286, + "language_loss": 0.77188164, + "learning_rate": 2.487890389750719e-06, + "loss": 0.7936095, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.14221191, + "step": 7300, + "time_per_iteration": 2.6652612686157227 + }, + { + "auxiliary_loss_clip": 0.01127791, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.04496431, + "balance_loss_mlp": 1.02052891, + "epoch": 0.43895986772884416, + "flos": 30562203795840.0, + "grad_norm": 1.8058519885432118, + "language_loss": 0.70844764, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.73006666, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13580322, + "step": 7301, + "time_per_iteration": 2.804990530014038 + }, + { + "auxiliary_loss_clip": 0.01131528, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.04770875, + "balance_loss_mlp": 1.01881456, + "epoch": 0.4390199909815121, + "flos": 31719050133120.0, + "grad_norm": 2.2377417340898234, + "language_loss": 0.70862687, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.73026514, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.1348877, + "step": 7302, + "time_per_iteration": 2.707292318344116 + }, + { + "auxiliary_loss_clip": 0.01128222, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.04654574, + "balance_loss_mlp": 1.02238953, + "epoch": 0.4390801142341801, + "flos": 35413698985920.0, + "grad_norm": 1.7099656064752244, + "language_loss": 0.82444721, + "learning_rate": 2.486757219574983e-06, + "loss": 0.84608078, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.12756348, + "step": 7303, + "time_per_iteration": 2.7267346382141113 + }, + { + "auxiliary_loss_clip": 0.0113303, + "auxiliary_loss_mlp": 0.01040173, + "balance_loss_clip": 1.04709697, + "balance_loss_mlp": 1.02579641, + "epoch": 0.43914023748684805, + "flos": 40802660724960.0, + "grad_norm": 2.5531348070773228, + "language_loss": 0.68934762, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.71107966, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 0.85986328, + "router_z_loss_mlp": 0.14398193, + "step": 7304, + "time_per_iteration": 2.811769962310791 + }, + { + "auxiliary_loss_clip": 0.01125074, + "auxiliary_loss_mlp": 0.01036064, + "balance_loss_clip": 1.04513967, + "balance_loss_mlp": 1.02323151, + "epoch": 0.439200360739516, + "flos": 42136906829760.0, + "grad_norm": 1.859741063550875, + "language_loss": 0.78034174, + "learning_rate": 2.486001680477873e-06, + "loss": 0.80195308, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.1282959, + "step": 7305, + "time_per_iteration": 2.787425994873047 + }, + { + "auxiliary_loss_clip": 0.0112668, + "auxiliary_loss_mlp": 0.01035066, + "balance_loss_clip": 1.04526615, + "balance_loss_mlp": 1.02171409, + "epoch": 0.439260483992184, + "flos": 26732124727200.0, + "grad_norm": 2.5535944327021474, + "language_loss": 0.68657386, + "learning_rate": 2.485623883278308e-06, + "loss": 0.70819139, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13360596, + "step": 7306, + "time_per_iteration": 2.744098663330078 + }, + { + "auxiliary_loss_clip": 0.01127049, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.04436684, + "balance_loss_mlp": 1.01590729, + "epoch": 0.43932060724485195, + "flos": 25619881564800.0, + "grad_norm": 1.5585548444007702, + "language_loss": 0.62618411, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.64774632, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.13256836, + "step": 7307, + "time_per_iteration": 2.659329652786255 + }, + { + "auxiliary_loss_clip": 0.01130072, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.04618859, + "balance_loss_mlp": 1.01865792, + "epoch": 0.4393807304975199, + "flos": 21654939522240.0, + "grad_norm": 2.145486772023591, + "language_loss": 0.71846879, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.74008411, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.12799072, + "step": 7308, + "time_per_iteration": 2.7023696899414062 + }, + { + "auxiliary_loss_clip": 0.01130263, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.04416466, + "balance_loss_mlp": 1.02568173, + "epoch": 0.4394408537501879, + "flos": 27489487320000.0, + "grad_norm": 1.7371878345375877, + "language_loss": 0.76563179, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.78732324, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.13195801, + "step": 7309, + "time_per_iteration": 2.736682891845703 + }, + { + "auxiliary_loss_clip": 0.0112425, + "auxiliary_loss_mlp": 0.0103211, + "balance_loss_clip": 1.04572546, + "balance_loss_mlp": 1.01968801, + "epoch": 0.43950097700285584, + "flos": 28602540828000.0, + "grad_norm": 1.8769552420161633, + "language_loss": 0.70562029, + "learning_rate": 2.484112510474251e-06, + "loss": 0.72718382, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12432861, + "step": 7310, + "time_per_iteration": 2.691373825073242 + }, + { + "auxiliary_loss_clip": 0.01129581, + "auxiliary_loss_mlp": 0.01037267, + "balance_loss_clip": 1.04597068, + "balance_loss_mlp": 1.02381384, + "epoch": 0.4395611002555238, + "flos": 28285740151200.0, + "grad_norm": 14.964542831185025, + "language_loss": 0.76079178, + "learning_rate": 2.483734621343429e-06, + "loss": 0.78246021, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13464355, + "step": 7311, + "time_per_iteration": 2.6522579193115234 + }, + { + "auxiliary_loss_clip": 0.01130089, + "auxiliary_loss_mlp": 0.0103404, + "balance_loss_clip": 1.04697061, + "balance_loss_mlp": 1.02187443, + "epoch": 0.43962122350819177, + "flos": 26999946362880.0, + "grad_norm": 1.895084003375787, + "language_loss": 0.8131777, + "learning_rate": 2.483356713869341e-06, + "loss": 0.83481902, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.1217041, + "step": 7312, + "time_per_iteration": 2.689262866973877 + }, + { + "auxiliary_loss_clip": 0.01124414, + "auxiliary_loss_mlp": 0.01033471, + "balance_loss_clip": 1.04335356, + "balance_loss_mlp": 1.02045977, + "epoch": 0.43968134676085974, + "flos": 21256347157920.0, + "grad_norm": 2.409923611601358, + "language_loss": 0.85475999, + "learning_rate": 2.482978788066318e-06, + "loss": 0.87633884, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.13012695, + "step": 7313, + "time_per_iteration": 2.6608877182006836 + }, + { + "auxiliary_loss_clip": 0.01130174, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.04577172, + "balance_loss_mlp": 1.02381206, + "epoch": 0.43974147001352776, + "flos": 23125952913120.0, + "grad_norm": 2.0500084510415775, + "language_loss": 0.67629826, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.69796747, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.12927246, + "step": 7314, + "time_per_iteration": 2.688084602355957 + }, + { + "auxiliary_loss_clip": 0.0113052, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.04660928, + "balance_loss_mlp": 1.02014339, + "epoch": 0.4398015932661957, + "flos": 23126925327840.0, + "grad_norm": 1.8164528437301535, + "language_loss": 0.76981646, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.79145288, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.13000488, + "step": 7315, + "time_per_iteration": 4.04370379447937 + }, + { + "auxiliary_loss_clip": 0.01127976, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.04615128, + "balance_loss_mlp": 1.01930606, + "epoch": 0.4398617165188637, + "flos": 29529685939680.0, + "grad_norm": 2.430603637637235, + "language_loss": 0.74493861, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.76653683, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.12530518, + "step": 7316, + "time_per_iteration": 4.171077251434326 + }, + { + "auxiliary_loss_clip": 0.01126929, + "auxiliary_loss_mlp": 0.01037083, + "balance_loss_clip": 1.04585612, + "balance_loss_mlp": 1.02453017, + "epoch": 0.43992183977153165, + "flos": 27133350714720.0, + "grad_norm": 2.6887607925328774, + "language_loss": 0.64922643, + "learning_rate": 2.481466901851506e-06, + "loss": 0.67086655, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.12554932, + "step": 7317, + "time_per_iteration": 2.6386260986328125 + }, + { + "auxiliary_loss_clip": 0.01130889, + "auxiliary_loss_mlp": 0.01036628, + "balance_loss_clip": 1.04798746, + "balance_loss_mlp": 1.02416468, + "epoch": 0.4399819630241996, + "flos": 22814541034560.0, + "grad_norm": 1.8768664664304384, + "language_loss": 0.79877967, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.82045484, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.12481689, + "step": 7318, + "time_per_iteration": 2.666532278060913 + }, + { + "auxiliary_loss_clip": 0.01132443, + "auxiliary_loss_mlp": 0.0104599, + "balance_loss_clip": 1.04784632, + "balance_loss_mlp": 1.03250098, + "epoch": 0.4400420862768676, + "flos": 29137819443840.0, + "grad_norm": 1.7493718239399532, + "language_loss": 0.79809517, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.81987953, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.1348877, + "step": 7319, + "time_per_iteration": 2.6982014179229736 + }, + { + "auxiliary_loss_clip": 0.01128641, + "auxiliary_loss_mlp": 0.01041308, + "balance_loss_clip": 1.0462091, + "balance_loss_mlp": 1.02758718, + "epoch": 0.44010220952953555, + "flos": 34212127921920.0, + "grad_norm": 1.6865960992519136, + "language_loss": 0.79599315, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.81769264, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13720703, + "step": 7320, + "time_per_iteration": 2.741460084915161 + }, + { + "auxiliary_loss_clip": 0.01127642, + "auxiliary_loss_mlp": 0.01037436, + "balance_loss_clip": 1.0462898, + "balance_loss_mlp": 1.02490747, + "epoch": 0.4401623327822035, + "flos": 29003199573600.0, + "grad_norm": 1.5981437306045307, + "language_loss": 0.69874191, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.7203927, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.12524414, + "step": 7321, + "time_per_iteration": 2.6574997901916504 + }, + { + "auxiliary_loss_clip": 0.01049564, + "auxiliary_loss_mlp": 0.01006574, + "balance_loss_clip": 1.02406716, + "balance_loss_mlp": 1.00508237, + "epoch": 0.4402224560348715, + "flos": 86358581287200.0, + "grad_norm": 0.8768549122432915, + "language_loss": 0.56948316, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.5900445, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 0.25463867, + "router_z_loss_mlp": 0.01489258, + "step": 7322, + "time_per_iteration": 4.770050287246704 + }, + { + "auxiliary_loss_clip": 0.01125748, + "auxiliary_loss_mlp": 0.01033475, + "balance_loss_clip": 1.0459168, + "balance_loss_mlp": 1.02203131, + "epoch": 0.44028257928753944, + "flos": 27932925962880.0, + "grad_norm": 1.608728811809173, + "language_loss": 0.76242983, + "learning_rate": 2.479198525097822e-06, + "loss": 0.78402197, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.11437988, + "step": 7323, + "time_per_iteration": 2.764573097229004 + }, + { + "auxiliary_loss_clip": 0.01128596, + "auxiliary_loss_mlp": 0.01037026, + "balance_loss_clip": 1.04579282, + "balance_loss_mlp": 1.02391934, + "epoch": 0.4403427025402074, + "flos": 21836147914080.0, + "grad_norm": 1.683284466189995, + "language_loss": 0.80430257, + "learning_rate": 2.478820398622511e-06, + "loss": 0.82595885, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13116455, + "step": 7324, + "time_per_iteration": 2.6291258335113525 + }, + { + "auxiliary_loss_clip": 0.01047194, + "auxiliary_loss_mlp": 0.01003009, + "balance_loss_clip": 1.02168334, + "balance_loss_mlp": 1.00150394, + "epoch": 0.4404028257928754, + "flos": 84880274785920.0, + "grad_norm": 0.6673237473222002, + "language_loss": 0.54586363, + "learning_rate": 2.478442253990283e-06, + "loss": 0.56636572, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 0.25488281, + "router_z_loss_mlp": 0.01504517, + "step": 7325, + "time_per_iteration": 3.277090072631836 + }, + { + "auxiliary_loss_clip": 0.01126186, + "auxiliary_loss_mlp": 0.01026574, + "balance_loss_clip": 1.04769826, + "balance_loss_mlp": 1.01499844, + "epoch": 0.44046294904554334, + "flos": 25531080387840.0, + "grad_norm": 1.6496028109987824, + "language_loss": 0.69615364, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.71768129, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.11566162, + "step": 7326, + "time_per_iteration": 2.6892852783203125 + }, + { + "auxiliary_loss_clip": 0.0112441, + "auxiliary_loss_mlp": 0.010305, + "balance_loss_clip": 1.04547346, + "balance_loss_mlp": 1.01789379, + "epoch": 0.44052307229821136, + "flos": 28825597219680.0, + "grad_norm": 1.6556173621172852, + "language_loss": 0.76388192, + "learning_rate": 2.477685910312432e-06, + "loss": 0.78543103, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.12597656, + "step": 7327, + "time_per_iteration": 3.918982744216919 + }, + { + "auxiliary_loss_clip": 0.01124206, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.04477024, + "balance_loss_mlp": 1.02057409, + "epoch": 0.4405831955508793, + "flos": 21472961302080.0, + "grad_norm": 3.51852824128155, + "language_loss": 0.84072387, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.8622973, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12554932, + "step": 7328, + "time_per_iteration": 2.678689956665039 + }, + { + "auxiliary_loss_clip": 0.01125737, + "auxiliary_loss_mlp": 0.0102866, + "balance_loss_clip": 1.04592466, + "balance_loss_mlp": 1.01574922, + "epoch": 0.4406433188035473, + "flos": 26190363346560.0, + "grad_norm": 2.103503965074467, + "language_loss": 0.77513856, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.79668254, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12921143, + "step": 7329, + "time_per_iteration": 2.729336738586426 + }, + { + "auxiliary_loss_clip": 0.01127857, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.04441857, + "balance_loss_mlp": 1.01950002, + "epoch": 0.44070344205621526, + "flos": 27666562949280.0, + "grad_norm": 1.61413519443254, + "language_loss": 0.73540676, + "learning_rate": 2.476551258977278e-06, + "loss": 0.75700927, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.12908936, + "step": 7330, + "time_per_iteration": 2.708570718765259 + }, + { + "auxiliary_loss_clip": 0.01125485, + "auxiliary_loss_mlp": 0.01034013, + "balance_loss_clip": 1.04558969, + "balance_loss_mlp": 1.02190173, + "epoch": 0.4407635653088832, + "flos": 28603148587200.0, + "grad_norm": 1.9739166416579184, + "language_loss": 0.74094319, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.76253814, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.12121582, + "step": 7331, + "time_per_iteration": 2.7088842391967773 + }, + { + "auxiliary_loss_clip": 0.01121465, + "auxiliary_loss_mlp": 0.01030073, + "balance_loss_clip": 1.04259264, + "balance_loss_mlp": 1.01818752, + "epoch": 0.4408236885615512, + "flos": 29311370069760.0, + "grad_norm": 1.6608048331875558, + "language_loss": 0.76019728, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78171259, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.11883545, + "step": 7332, + "time_per_iteration": 2.6763367652893066 + }, + { + "auxiliary_loss_clip": 0.01125235, + "auxiliary_loss_mlp": 0.0104014, + "balance_loss_clip": 1.0447849, + "balance_loss_mlp": 1.02813518, + "epoch": 0.44088381181421915, + "flos": 15467739605280.0, + "grad_norm": 2.127760210832892, + "language_loss": 0.73479998, + "learning_rate": 2.475416445004285e-06, + "loss": 0.75645375, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.11999512, + "step": 7333, + "time_per_iteration": 2.636624574661255 + }, + { + "auxiliary_loss_clip": 0.01122944, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.04584873, + "balance_loss_mlp": 1.02376783, + "epoch": 0.4409439350668871, + "flos": 29980660796640.0, + "grad_norm": 2.3193943466300926, + "language_loss": 0.79592842, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.81751657, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12103271, + "step": 7334, + "time_per_iteration": 2.6910383701324463 + }, + { + "auxiliary_loss_clip": 0.0113232, + "auxiliary_loss_mlp": 0.01034865, + "balance_loss_clip": 1.04462254, + "balance_loss_mlp": 1.01979721, + "epoch": 0.4410040583195551, + "flos": 27660931047360.0, + "grad_norm": 2.651798526681132, + "language_loss": 0.75794172, + "learning_rate": 2.47465981219252e-06, + "loss": 0.77961361, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 0.87695312, + "router_z_loss_mlp": 0.15063477, + "step": 7335, + "time_per_iteration": 2.6918857097625732 + }, + { + "auxiliary_loss_clip": 0.01127599, + "auxiliary_loss_mlp": 0.01037437, + "balance_loss_clip": 1.04612803, + "balance_loss_mlp": 1.02456844, + "epoch": 0.44106418157222305, + "flos": 13241592072000.0, + "grad_norm": 1.8795521222457376, + "language_loss": 0.72139543, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74304581, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.12872314, + "step": 7336, + "time_per_iteration": 2.6183862686157227 + }, + { + "auxiliary_loss_clip": 0.01127426, + "auxiliary_loss_mlp": 0.0104072, + "balance_loss_clip": 1.0439918, + "balance_loss_mlp": 1.02727282, + "epoch": 0.441124304824891, + "flos": 26510972647680.0, + "grad_norm": 2.1953131748939536, + "language_loss": 0.62480569, + "learning_rate": 2.473903107384165e-06, + "loss": 0.64648718, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13458252, + "step": 7337, + "time_per_iteration": 2.6806609630584717 + }, + { + "auxiliary_loss_clip": 0.01043736, + "auxiliary_loss_mlp": 0.01007154, + "balance_loss_clip": 1.01825476, + "balance_loss_mlp": 1.00564873, + "epoch": 0.441184428077559, + "flos": 77140634446080.0, + "grad_norm": 0.7402890584571232, + "language_loss": 0.5264048, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54691362, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 0.25415039, + "router_z_loss_mlp": 0.01504517, + "step": 7338, + "time_per_iteration": 3.3236892223358154 + }, + { + "auxiliary_loss_clip": 0.01129835, + "auxiliary_loss_mlp": 0.01043408, + "balance_loss_clip": 1.0446701, + "balance_loss_mlp": 1.02941298, + "epoch": 0.44124455133022694, + "flos": 25841479334400.0, + "grad_norm": 2.213884529093244, + "language_loss": 0.70128739, + "learning_rate": 2.473146330693997e-06, + "loss": 0.72301984, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 0.85205078, + "router_z_loss_mlp": 0.13995361, + "step": 7339, + "time_per_iteration": 2.670077085494995 + }, + { + "auxiliary_loss_clip": 0.01122076, + "auxiliary_loss_mlp": 0.01037418, + "balance_loss_clip": 1.04561639, + "balance_loss_mlp": 1.02587199, + "epoch": 0.4413046745828949, + "flos": 21301801195680.0, + "grad_norm": 1.5914663300584244, + "language_loss": 0.69473171, + "learning_rate": 2.472767915429105e-06, + "loss": 0.71632665, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11541748, + "step": 7340, + "time_per_iteration": 2.62502384185791 + }, + { + "auxiliary_loss_clip": 0.01041994, + "auxiliary_loss_mlp": 0.01004848, + "balance_loss_clip": 1.01650286, + "balance_loss_mlp": 1.00329554, + "epoch": 0.4413647978355629, + "flos": 75148962827040.0, + "grad_norm": 0.8944749463307977, + "language_loss": 0.64046657, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66093504, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 0.25415039, + "router_z_loss_mlp": 0.01551056, + "step": 7341, + "time_per_iteration": 3.106851816177368 + }, + { + "auxiliary_loss_clip": 0.01121677, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.0435586, + "balance_loss_mlp": 1.02008343, + "epoch": 0.4414249210882309, + "flos": 33589020543840.0, + "grad_norm": 1.8515633720729359, + "language_loss": 0.73263162, + "learning_rate": 2.47201103113145e-06, + "loss": 0.75417578, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12652588, + "step": 7342, + "time_per_iteration": 2.7370519638061523 + }, + { + "auxiliary_loss_clip": 0.01120989, + "auxiliary_loss_mlp": 0.01035756, + "balance_loss_clip": 1.04043067, + "balance_loss_mlp": 1.02228594, + "epoch": 0.44148504434089886, + "flos": 28691990281440.0, + "grad_norm": 1.802566706393674, + "language_loss": 0.80301517, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.82458264, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13470459, + "step": 7343, + "time_per_iteration": 2.6824090480804443 + }, + { + "auxiliary_loss_clip": 0.01119563, + "auxiliary_loss_mlp": 0.01030753, + "balance_loss_clip": 1.04019284, + "balance_loss_mlp": 1.01858187, + "epoch": 0.4415451675935668, + "flos": 26332681500000.0, + "grad_norm": 1.5896384731697295, + "language_loss": 0.7642749, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.78577805, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12182617, + "step": 7344, + "time_per_iteration": 2.7455713748931885 + }, + { + "auxiliary_loss_clip": 0.01038725, + "auxiliary_loss_mlp": 0.01000787, + "balance_loss_clip": 1.01325381, + "balance_loss_mlp": 0.9993223, + "epoch": 0.4416052908462348, + "flos": 71999924496480.0, + "grad_norm": 0.7932327578470235, + "language_loss": 0.63825852, + "learning_rate": 2.470875570480556e-06, + "loss": 0.65865368, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 0.25439453, + "router_z_loss_mlp": 0.01463318, + "step": 7345, + "time_per_iteration": 2.9641525745391846 + }, + { + "auxiliary_loss_clip": 0.01127163, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.045403, + "balance_loss_mlp": 1.01854515, + "epoch": 0.44166541409890275, + "flos": 32162245672320.0, + "grad_norm": 1.5493523235102717, + "language_loss": 0.85700548, + "learning_rate": 2.470497047866489e-06, + "loss": 0.87859255, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13006592, + "step": 7346, + "time_per_iteration": 2.9065990447998047 + }, + { + "auxiliary_loss_clip": 0.01125468, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.04423594, + "balance_loss_mlp": 1.02340841, + "epoch": 0.4417255373515707, + "flos": 24639178959360.0, + "grad_norm": 1.7156915695564867, + "language_loss": 0.80226403, + "learning_rate": 2.470118507411128e-06, + "loss": 0.82388663, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.1338501, + "step": 7347, + "time_per_iteration": 2.647977828979492 + }, + { + "auxiliary_loss_clip": 0.01123282, + "auxiliary_loss_mlp": 0.01033156, + "balance_loss_clip": 1.04345942, + "balance_loss_mlp": 1.02032876, + "epoch": 0.4417856606042387, + "flos": 21827274629760.0, + "grad_norm": 2.3894974843121823, + "language_loss": 0.82774854, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.8493129, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.1282959, + "step": 7348, + "time_per_iteration": 2.6807165145874023 + }, + { + "auxiliary_loss_clip": 0.01128619, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.0460856, + "balance_loss_mlp": 1.01821208, + "epoch": 0.44184578385690665, + "flos": 34122394847520.0, + "grad_norm": 1.8606393646005455, + "language_loss": 0.70060807, + "learning_rate": 2.469361373033938e-06, + "loss": 0.7222054, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.12902832, + "step": 7349, + "time_per_iteration": 2.7139532566070557 + }, + { + "auxiliary_loss_clip": 0.01124364, + "auxiliary_loss_mlp": 0.01035031, + "balance_loss_clip": 1.04263163, + "balance_loss_mlp": 1.02156675, + "epoch": 0.4419059071095746, + "flos": 28513739651040.0, + "grad_norm": 1.8967296778952978, + "language_loss": 0.74186647, + "learning_rate": 2.468982779140819e-06, + "loss": 0.76346046, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13470459, + "step": 7350, + "time_per_iteration": 2.6801135540008545 + }, + { + "auxiliary_loss_clip": 0.01124907, + "auxiliary_loss_mlp": 0.01032196, + "balance_loss_clip": 1.04412925, + "balance_loss_mlp": 1.01978028, + "epoch": 0.4419660303622426, + "flos": 18317480724000.0, + "grad_norm": 2.91073935110089, + "language_loss": 0.80692923, + "learning_rate": 2.468604167463827e-06, + "loss": 0.82850027, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12420654, + "step": 7351, + "time_per_iteration": 2.654719591140747 + }, + { + "auxiliary_loss_clip": 0.01119714, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.0424391, + "balance_loss_mlp": 1.02250576, + "epoch": 0.44202615361491054, + "flos": 30959256503520.0, + "grad_norm": 1.7064339961419208, + "language_loss": 0.73188937, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.75342441, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.11291504, + "step": 7352, + "time_per_iteration": 2.7263095378875732 + }, + { + "auxiliary_loss_clip": 0.0112282, + "auxiliary_loss_mlp": 0.0102864, + "balance_loss_clip": 1.04305446, + "balance_loss_mlp": 1.01630211, + "epoch": 0.4420862768675785, + "flos": 30116253081600.0, + "grad_norm": 1.8434380490963012, + "language_loss": 0.87095481, + "learning_rate": 2.467846890815649e-06, + "loss": 0.89246941, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12341309, + "step": 7353, + "time_per_iteration": 2.7769689559936523 + }, + { + "auxiliary_loss_clip": 0.01127397, + "auxiliary_loss_mlp": 0.01036316, + "balance_loss_clip": 1.04604888, + "balance_loss_mlp": 1.02440059, + "epoch": 0.44214640012024653, + "flos": 23829312322080.0, + "grad_norm": 3.089959826100693, + "language_loss": 0.7602753, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.78191245, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.11914062, + "step": 7354, + "time_per_iteration": 4.09035325050354 + }, + { + "auxiliary_loss_clip": 0.01120993, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.04347515, + "balance_loss_mlp": 1.01928818, + "epoch": 0.4422065233729145, + "flos": 58031716096800.0, + "grad_norm": 1.9954074495670828, + "language_loss": 0.64856446, + "learning_rate": 2.467089543204268e-06, + "loss": 0.67008317, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.11590576, + "step": 7355, + "time_per_iteration": 4.107704162597656 + }, + { + "auxiliary_loss_clip": 0.01128692, + "auxiliary_loss_mlp": 0.01036654, + "balance_loss_clip": 1.0444777, + "balance_loss_mlp": 1.02295685, + "epoch": 0.44226664662558246, + "flos": 23526692693280.0, + "grad_norm": 1.976754182246262, + "language_loss": 0.77804178, + "learning_rate": 2.466710842823274e-06, + "loss": 0.79969525, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.13696289, + "step": 7356, + "time_per_iteration": 2.7997500896453857 + }, + { + "auxiliary_loss_clip": 0.01128962, + "auxiliary_loss_mlp": 0.01038314, + "balance_loss_clip": 1.04673123, + "balance_loss_mlp": 1.02529037, + "epoch": 0.4423267698782504, + "flos": 21746009666880.0, + "grad_norm": 1.678270998521568, + "language_loss": 0.77060294, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.79227567, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13018799, + "step": 7357, + "time_per_iteration": 2.6380209922790527 + }, + { + "auxiliary_loss_clip": 0.01124531, + "auxiliary_loss_mlp": 0.01037271, + "balance_loss_clip": 1.04483676, + "balance_loss_mlp": 1.02381194, + "epoch": 0.4423868931309184, + "flos": 35636026066560.0, + "grad_norm": 1.547382582982812, + "language_loss": 0.73413968, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75575775, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13458252, + "step": 7358, + "time_per_iteration": 2.8378961086273193 + }, + { + "auxiliary_loss_clip": 0.01126692, + "auxiliary_loss_mlp": 0.01034242, + "balance_loss_clip": 1.04530311, + "balance_loss_mlp": 1.0222795, + "epoch": 0.44244701638358636, + "flos": 36256986028800.0, + "grad_norm": 7.166862104282797, + "language_loss": 0.75489604, + "learning_rate": 2.465574635551405e-06, + "loss": 0.77650535, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.11962891, + "step": 7359, + "time_per_iteration": 2.697680950164795 + }, + { + "auxiliary_loss_clip": 0.01126861, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.04597783, + "balance_loss_mlp": 1.02017546, + "epoch": 0.4425071396362543, + "flos": 27979433449920.0, + "grad_norm": 1.7275572591063664, + "language_loss": 0.69906282, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.72066599, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13299561, + "step": 7360, + "time_per_iteration": 2.652954339981079 + }, + { + "auxiliary_loss_clip": 0.01123284, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.04304576, + "balance_loss_mlp": 1.01934481, + "epoch": 0.4425672628889223, + "flos": 24150569899680.0, + "grad_norm": 2.572702710611074, + "language_loss": 0.69736922, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.71892446, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12890625, + "step": 7361, + "time_per_iteration": 4.032711029052734 + }, + { + "auxiliary_loss_clip": 0.01125841, + "auxiliary_loss_mlp": 0.01034915, + "balance_loss_clip": 1.04469824, + "balance_loss_mlp": 1.02125955, + "epoch": 0.44262738614159025, + "flos": 16670161532160.0, + "grad_norm": 1.9948429128197842, + "language_loss": 0.826231, + "learning_rate": 2.464438269387809e-06, + "loss": 0.84783864, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.13647461, + "step": 7362, + "time_per_iteration": 2.6047518253326416 + }, + { + "auxiliary_loss_clip": 0.0113083, + "auxiliary_loss_mlp": 0.01039261, + "balance_loss_clip": 1.04587626, + "balance_loss_mlp": 1.02529609, + "epoch": 0.4426875093942582, + "flos": 17338803982560.0, + "grad_norm": 2.5803560453884153, + "language_loss": 0.74857539, + "learning_rate": 2.464059445424366e-06, + "loss": 0.77027631, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.13952637, + "step": 7363, + "time_per_iteration": 2.6596872806549072 + }, + { + "auxiliary_loss_clip": 0.01037867, + "auxiliary_loss_mlp": 0.01006419, + "balance_loss_clip": 1.01251698, + "balance_loss_mlp": 1.00492358, + "epoch": 0.4427476326469262, + "flos": 85556818105920.0, + "grad_norm": 0.6814141078367443, + "language_loss": 0.55682528, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57726812, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 0.25341797, + "router_z_loss_mlp": 0.01494598, + "step": 7364, + "time_per_iteration": 3.3294107913970947 + }, + { + "auxiliary_loss_clip": 0.01121954, + "auxiliary_loss_mlp": 0.01032242, + "balance_loss_clip": 1.04260826, + "balance_loss_mlp": 1.02067852, + "epoch": 0.44280775589959415, + "flos": 31048705956960.0, + "grad_norm": 1.8113822390382373, + "language_loss": 0.74109483, + "learning_rate": 2.463301744720305e-06, + "loss": 0.76263678, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.11553955, + "step": 7365, + "time_per_iteration": 2.6710715293884277 + }, + { + "auxiliary_loss_clip": 0.01121412, + "auxiliary_loss_mlp": 0.01036903, + "balance_loss_clip": 1.04254341, + "balance_loss_mlp": 1.02368855, + "epoch": 0.4428678791522621, + "flos": 27890551238400.0, + "grad_norm": 2.1108708505927876, + "language_loss": 0.74048781, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.76207101, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13226318, + "step": 7366, + "time_per_iteration": 3.9482181072235107 + }, + { + "auxiliary_loss_clip": 0.01123747, + "auxiliary_loss_mlp": 0.01031928, + "balance_loss_clip": 1.04402709, + "balance_loss_mlp": 1.01914883, + "epoch": 0.44292800240493013, + "flos": 31496763569760.0, + "grad_norm": 2.5067923316036924, + "language_loss": 0.73307061, + "learning_rate": 2.46254397374245e-06, + "loss": 0.75462735, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12774658, + "step": 7367, + "time_per_iteration": 2.696760416030884 + }, + { + "auxiliary_loss_clip": 0.01124291, + "auxiliary_loss_mlp": 0.01038869, + "balance_loss_clip": 1.04500508, + "balance_loss_mlp": 1.02630484, + "epoch": 0.4429881256575981, + "flos": 39555959761440.0, + "grad_norm": 3.0564842751153907, + "language_loss": 0.7381683, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.75979996, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12548828, + "step": 7368, + "time_per_iteration": 2.789734363555908 + }, + { + "auxiliary_loss_clip": 0.01123777, + "auxiliary_loss_mlp": 0.01029804, + "balance_loss_clip": 1.04512429, + "balance_loss_mlp": 1.01764488, + "epoch": 0.44304824891026606, + "flos": 27044306434080.0, + "grad_norm": 1.7851287485324598, + "language_loss": 0.79889977, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.82043558, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.1217041, + "step": 7369, + "time_per_iteration": 2.6308958530426025 + }, + { + "auxiliary_loss_clip": 0.01120327, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.04285455, + "balance_loss_mlp": 1.01736236, + "epoch": 0.443108372162934, + "flos": 30918461952960.0, + "grad_norm": 2.0590029516946045, + "language_loss": 0.72127366, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74277204, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12158203, + "step": 7370, + "time_per_iteration": 2.670039415359497 + }, + { + "auxiliary_loss_clip": 0.01123025, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.04299068, + "balance_loss_mlp": 1.01746821, + "epoch": 0.443168495415602, + "flos": 28468690786080.0, + "grad_norm": 3.391812714563697, + "language_loss": 0.70809269, + "learning_rate": 2.461028221425126e-06, + "loss": 0.7296201, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12243652, + "step": 7371, + "time_per_iteration": 2.658343553543091 + }, + { + "auxiliary_loss_clip": 0.01119412, + "auxiliary_loss_mlp": 0.01027054, + "balance_loss_clip": 1.04209375, + "balance_loss_mlp": 1.01620603, + "epoch": 0.44322861866826996, + "flos": 26687764656000.0, + "grad_norm": 5.373829222320418, + "language_loss": 0.6760354, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.69750005, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.10852051, + "step": 7372, + "time_per_iteration": 2.632434606552124 + }, + { + "auxiliary_loss_clip": 0.0112413, + "auxiliary_loss_mlp": 0.01030117, + "balance_loss_clip": 1.04338157, + "balance_loss_mlp": 1.01670599, + "epoch": 0.4432887419209379, + "flos": 24506949608640.0, + "grad_norm": 2.487474522028106, + "language_loss": 0.83066046, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.85220289, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13409424, + "step": 7373, + "time_per_iteration": 2.6979823112487793 + }, + { + "auxiliary_loss_clip": 0.0103778, + "auxiliary_loss_mlp": 0.01000347, + "balance_loss_clip": 1.01272702, + "balance_loss_mlp": 0.99875182, + "epoch": 0.4433488651736059, + "flos": 85457603988000.0, + "grad_norm": 0.7704931713755038, + "language_loss": 0.55236191, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57274318, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 0.25012207, + "router_z_loss_mlp": 0.01596832, + "step": 7374, + "time_per_iteration": 3.2849278450012207 + }, + { + "auxiliary_loss_clip": 0.0112267, + "auxiliary_loss_mlp": 0.01038408, + "balance_loss_clip": 1.04476631, + "balance_loss_mlp": 1.02514029, + "epoch": 0.44340898842627385, + "flos": 19868219421120.0, + "grad_norm": 3.0153831607144586, + "language_loss": 0.82527554, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.8468864, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.13262939, + "step": 7375, + "time_per_iteration": 2.6488935947418213 + }, + { + "auxiliary_loss_clip": 0.01123458, + "auxiliary_loss_mlp": 0.01028256, + "balance_loss_clip": 1.0428555, + "balance_loss_mlp": 1.01606691, + "epoch": 0.4434691116789418, + "flos": 20270215236960.0, + "grad_norm": 1.9465485421758528, + "language_loss": 0.83899015, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.86050725, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.12188721, + "step": 7376, + "time_per_iteration": 2.601456880569458 + }, + { + "auxiliary_loss_clip": 0.01122259, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.04420507, + "balance_loss_mlp": 1.02028394, + "epoch": 0.4435292349316098, + "flos": 23260815887040.0, + "grad_norm": 2.3831822325641556, + "language_loss": 0.77404529, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.79559374, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12310791, + "step": 7377, + "time_per_iteration": 2.6455297470092773 + }, + { + "auxiliary_loss_clip": 0.01120093, + "auxiliary_loss_mlp": 0.01028941, + "balance_loss_clip": 1.04407048, + "balance_loss_mlp": 1.01700234, + "epoch": 0.44358935818427775, + "flos": 25930969305120.0, + "grad_norm": 1.934546962313862, + "language_loss": 0.75797212, + "learning_rate": 2.458374982357057e-06, + "loss": 0.77946246, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11956787, + "step": 7378, + "time_per_iteration": 2.6406519412994385 + }, + { + "auxiliary_loss_clip": 0.01123042, + "auxiliary_loss_mlp": 0.01037751, + "balance_loss_clip": 1.04340172, + "balance_loss_mlp": 1.02534151, + "epoch": 0.4436494814369457, + "flos": 15246141835680.0, + "grad_norm": 3.0150110607529514, + "language_loss": 0.68840563, + "learning_rate": 2.457995878562982e-06, + "loss": 0.71001351, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12408447, + "step": 7379, + "time_per_iteration": 2.6493725776672363 + }, + { + "auxiliary_loss_clip": 0.01123559, + "auxiliary_loss_mlp": 0.01036096, + "balance_loss_clip": 1.0449996, + "balance_loss_mlp": 1.02388358, + "epoch": 0.44370960468961373, + "flos": 28869997808160.0, + "grad_norm": 1.7939429649518994, + "language_loss": 0.73200309, + "learning_rate": 2.457616757401656e-06, + "loss": 0.7535997, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12225342, + "step": 7380, + "time_per_iteration": 2.6829674243927 + }, + { + "auxiliary_loss_clip": 0.01124638, + "auxiliary_loss_mlp": 0.01029736, + "balance_loss_clip": 1.04494774, + "balance_loss_mlp": 1.01733196, + "epoch": 0.4437697279422817, + "flos": 39555919244160.0, + "grad_norm": 2.175196498932218, + "language_loss": 0.64987886, + "learning_rate": 2.457237618887458e-06, + "loss": 0.6714226, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12390137, + "step": 7381, + "time_per_iteration": 2.749384641647339 + }, + { + "auxiliary_loss_clip": 0.01124419, + "auxiliary_loss_mlp": 0.01034522, + "balance_loss_clip": 1.04516995, + "balance_loss_mlp": 1.02207661, + "epoch": 0.44382985119494966, + "flos": 22101254892000.0, + "grad_norm": 2.434098469234979, + "language_loss": 0.81316304, + "learning_rate": 2.456858463034763e-06, + "loss": 0.83475244, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12445068, + "step": 7382, + "time_per_iteration": 2.633809804916382 + }, + { + "auxiliary_loss_clip": 0.011274, + "auxiliary_loss_mlp": 0.01040381, + "balance_loss_clip": 1.0470171, + "balance_loss_mlp": 1.02829945, + "epoch": 0.44388997444761763, + "flos": 37551450515040.0, + "grad_norm": 1.7367970068556486, + "language_loss": 0.65381509, + "learning_rate": 2.456479289857949e-06, + "loss": 0.67549288, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12097168, + "step": 7383, + "time_per_iteration": 2.716611623764038 + }, + { + "auxiliary_loss_clip": 0.01127199, + "auxiliary_loss_mlp": 0.01034563, + "balance_loss_clip": 1.04531431, + "balance_loss_mlp": 1.02140188, + "epoch": 0.4439500977002856, + "flos": 24817875279840.0, + "grad_norm": 2.658074280909578, + "language_loss": 0.76683068, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.78844827, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13165283, + "step": 7384, + "time_per_iteration": 2.635502338409424 + }, + { + "auxiliary_loss_clip": 0.01127543, + "auxiliary_loss_mlp": 0.01032232, + "balance_loss_clip": 1.04589987, + "balance_loss_mlp": 1.01924396, + "epoch": 0.44401022095295356, + "flos": 24857413794720.0, + "grad_norm": 1.6234262904702874, + "language_loss": 0.80919909, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83079678, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.12982178, + "step": 7385, + "time_per_iteration": 2.6370439529418945 + }, + { + "auxiliary_loss_clip": 0.01124094, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.04322743, + "balance_loss_mlp": 1.01929164, + "epoch": 0.4440703442056215, + "flos": 24685362308160.0, + "grad_norm": 1.6234305646185896, + "language_loss": 0.81418204, + "learning_rate": 2.455341666526582e-06, + "loss": 0.83575392, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13800049, + "step": 7386, + "time_per_iteration": 2.684802770614624 + }, + { + "auxiliary_loss_clip": 0.01127574, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.04513192, + "balance_loss_mlp": 1.02074313, + "epoch": 0.4441304674582895, + "flos": 48192890328000.0, + "grad_norm": 1.8285476820571631, + "language_loss": 0.69771832, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.71933639, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.1348877, + "step": 7387, + "time_per_iteration": 2.7758231163024902 + }, + { + "auxiliary_loss_clip": 0.0112398, + "auxiliary_loss_mlp": 0.01039136, + "balance_loss_clip": 1.04285944, + "balance_loss_mlp": 1.02602875, + "epoch": 0.44419059071095746, + "flos": 18094788987840.0, + "grad_norm": 3.7312256219872943, + "language_loss": 0.72001213, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.74164331, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.13116455, + "step": 7388, + "time_per_iteration": 2.630160331726074 + }, + { + "auxiliary_loss_clip": 0.01124327, + "auxiliary_loss_mlp": 0.01032063, + "balance_loss_clip": 1.04250455, + "balance_loss_mlp": 1.01865172, + "epoch": 0.4442507139636254, + "flos": 27622891671840.0, + "grad_norm": 1.5295019766970148, + "language_loss": 0.68606257, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.70762646, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13421631, + "step": 7389, + "time_per_iteration": 2.628602981567383 + }, + { + "auxiliary_loss_clip": 0.01121246, + "auxiliary_loss_mlp": 0.01030403, + "balance_loss_clip": 1.04191959, + "balance_loss_mlp": 1.01807666, + "epoch": 0.4443108372162934, + "flos": 46723700214720.0, + "grad_norm": 2.0266519812843855, + "language_loss": 0.7483865, + "learning_rate": 2.453824593752788e-06, + "loss": 0.769903, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12347412, + "step": 7390, + "time_per_iteration": 2.8021841049194336 + }, + { + "auxiliary_loss_clip": 0.01122531, + "auxiliary_loss_mlp": 0.01032471, + "balance_loss_clip": 1.04438663, + "balance_loss_mlp": 1.01987624, + "epoch": 0.44437096046896135, + "flos": 21657127455360.0, + "grad_norm": 1.9927745299222017, + "language_loss": 0.8123008, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.83385086, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12597656, + "step": 7391, + "time_per_iteration": 2.6300179958343506 + }, + { + "auxiliary_loss_clip": 0.01123603, + "auxiliary_loss_mlp": 0.01033258, + "balance_loss_clip": 1.04486918, + "balance_loss_mlp": 1.02040148, + "epoch": 0.4444310837216293, + "flos": 16756004947680.0, + "grad_norm": 2.049952594080533, + "language_loss": 0.73398262, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.75555122, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12866211, + "step": 7392, + "time_per_iteration": 2.7115437984466553 + }, + { + "auxiliary_loss_clip": 0.01120196, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.04057515, + "balance_loss_mlp": 1.01645708, + "epoch": 0.44449120697429734, + "flos": 30517803207360.0, + "grad_norm": 1.6196855519405384, + "language_loss": 0.79830778, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.81979167, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.11737061, + "step": 7393, + "time_per_iteration": 2.720893383026123 + }, + { + "auxiliary_loss_clip": 0.01126077, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.04310286, + "balance_loss_mlp": 1.02057052, + "epoch": 0.4445513302269653, + "flos": 39866966467200.0, + "grad_norm": 2.2273957972909106, + "language_loss": 0.80721754, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.82881474, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.1307373, + "step": 7394, + "time_per_iteration": 4.3155364990234375 + }, + { + "auxiliary_loss_clip": 0.01120638, + "auxiliary_loss_mlp": 0.01036763, + "balance_loss_clip": 1.04303694, + "balance_loss_mlp": 1.02500272, + "epoch": 0.44461145347963327, + "flos": 14221767952800.0, + "grad_norm": 2.0116348194891946, + "language_loss": 0.79196978, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.81354386, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.11773682, + "step": 7395, + "time_per_iteration": 2.6808674335479736 + }, + { + "auxiliary_loss_clip": 0.01122324, + "auxiliary_loss_mlp": 0.01034838, + "balance_loss_clip": 1.04301167, + "balance_loss_mlp": 1.02234495, + "epoch": 0.44467157673230123, + "flos": 25486072040160.0, + "grad_norm": 2.0968737756285636, + "language_loss": 0.68821537, + "learning_rate": 2.451548468607584e-06, + "loss": 0.70978701, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12493896, + "step": 7396, + "time_per_iteration": 2.6504757404327393 + }, + { + "auxiliary_loss_clip": 0.01125501, + "auxiliary_loss_mlp": 0.0102835, + "balance_loss_clip": 1.04378569, + "balance_loss_mlp": 1.01582694, + "epoch": 0.4447316999849692, + "flos": 22630374881280.0, + "grad_norm": 4.2665050016634005, + "language_loss": 0.80868864, + "learning_rate": 2.451169054403126e-06, + "loss": 0.8302272, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.12506104, + "step": 7397, + "time_per_iteration": 2.6329174041748047 + }, + { + "auxiliary_loss_clip": 0.0112135, + "auxiliary_loss_mlp": 0.0103182, + "balance_loss_clip": 1.04338098, + "balance_loss_mlp": 1.01961255, + "epoch": 0.44479182323763716, + "flos": 29003280608160.0, + "grad_norm": 4.445287310086177, + "language_loss": 0.67209077, + "learning_rate": 2.450789623090293e-06, + "loss": 0.69362247, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12200928, + "step": 7398, + "time_per_iteration": 2.675572156906128 + }, + { + "auxiliary_loss_clip": 0.01121709, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.04361701, + "balance_loss_mlp": 1.02172685, + "epoch": 0.44485194649030513, + "flos": 20186600271840.0, + "grad_norm": 1.8195960356660388, + "language_loss": 0.69606173, + "learning_rate": 2.450410174683472e-06, + "loss": 0.71761787, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12182617, + "step": 7399, + "time_per_iteration": 2.6522157192230225 + }, + { + "auxiliary_loss_clip": 0.01121362, + "auxiliary_loss_mlp": 0.01035784, + "balance_loss_clip": 1.04351616, + "balance_loss_mlp": 1.02395225, + "epoch": 0.4449120697429731, + "flos": 27577923841440.0, + "grad_norm": 1.8006337778300685, + "language_loss": 0.7260707, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.74764216, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.1184082, + "step": 7400, + "time_per_iteration": 4.1410956382751465 + }, + { + "auxiliary_loss_clip": 0.01121747, + "auxiliary_loss_mlp": 0.01031267, + "balance_loss_clip": 1.04329169, + "balance_loss_mlp": 1.01923323, + "epoch": 0.44497219299564106, + "flos": 24410490665760.0, + "grad_norm": 1.6133533225061627, + "language_loss": 0.84691447, + "learning_rate": 2.449651226645422e-06, + "loss": 0.86844468, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12036133, + "step": 7401, + "time_per_iteration": 2.622612714767456 + }, + { + "auxiliary_loss_clip": 0.01118821, + "auxiliary_loss_mlp": 0.01032727, + "balance_loss_clip": 1.04261351, + "balance_loss_mlp": 1.02138984, + "epoch": 0.445032316248309, + "flos": 31230441073440.0, + "grad_norm": 1.6080050086817776, + "language_loss": 0.83574343, + "learning_rate": 2.449271727042973e-06, + "loss": 0.85725898, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11328125, + "step": 7402, + "time_per_iteration": 2.7591259479522705 + }, + { + "auxiliary_loss_clip": 0.01124371, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.04455674, + "balance_loss_mlp": 1.01846647, + "epoch": 0.445092439500977, + "flos": 25929956373120.0, + "grad_norm": 1.9797635457252656, + "language_loss": 0.76894093, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.79048944, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12023926, + "step": 7403, + "time_per_iteration": 2.6067922115325928 + }, + { + "auxiliary_loss_clip": 0.01042949, + "auxiliary_loss_mlp": 0.01005532, + "balance_loss_clip": 1.0177002, + "balance_loss_mlp": 1.0039649, + "epoch": 0.44515256275364495, + "flos": 70485077759040.0, + "grad_norm": 0.7461820927085165, + "language_loss": 0.60009843, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62058324, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 0.25244141, + "router_z_loss_mlp": 0.01567078, + "step": 7404, + "time_per_iteration": 3.2756073474884033 + }, + { + "auxiliary_loss_clip": 0.01128039, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.04623747, + "balance_loss_mlp": 1.02145004, + "epoch": 0.4452126860063129, + "flos": 19029916003680.0, + "grad_norm": 1.6730792029819752, + "language_loss": 0.82161772, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.84323812, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.12548828, + "step": 7405, + "time_per_iteration": 3.929830551147461 + }, + { + "auxiliary_loss_clip": 0.01121389, + "auxiliary_loss_mlp": 0.01028581, + "balance_loss_clip": 1.04265237, + "balance_loss_mlp": 1.01667237, + "epoch": 0.4452728092589809, + "flos": 26377568295840.0, + "grad_norm": 2.41361771625748, + "language_loss": 0.75249231, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.77399212, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.11901855, + "step": 7406, + "time_per_iteration": 2.6853816509246826 + }, + { + "auxiliary_loss_clip": 0.01121061, + "auxiliary_loss_mlp": 0.0102883, + "balance_loss_clip": 1.04416335, + "balance_loss_mlp": 1.01743984, + "epoch": 0.4453329325116489, + "flos": 35993540259360.0, + "grad_norm": 1.7823217322049512, + "language_loss": 0.65534317, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67684203, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11383057, + "step": 7407, + "time_per_iteration": 2.7444231510162354 + }, + { + "auxiliary_loss_clip": 0.01127703, + "auxiliary_loss_mlp": 0.01031941, + "balance_loss_clip": 1.04680705, + "balance_loss_mlp": 1.02028275, + "epoch": 0.44539305576431687, + "flos": 26064940898880.0, + "grad_norm": 1.5754827561199343, + "language_loss": 0.67823869, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.69983506, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.11657715, + "step": 7408, + "time_per_iteration": 2.6405580043792725 + }, + { + "auxiliary_loss_clip": 0.01123257, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.04311895, + "balance_loss_mlp": 1.01887012, + "epoch": 0.44545317901698483, + "flos": 50551672384800.0, + "grad_norm": 4.7263486355090025, + "language_loss": 0.71836376, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.73991346, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12854004, + "step": 7409, + "time_per_iteration": 2.8600058555603027 + }, + { + "auxiliary_loss_clip": 0.01125645, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.04472303, + "balance_loss_mlp": 1.01967502, + "epoch": 0.4455133022696528, + "flos": 26911590876000.0, + "grad_norm": 1.9761635643775335, + "language_loss": 0.65538526, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.67697227, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.1338501, + "step": 7410, + "time_per_iteration": 2.6685450077056885 + }, + { + "auxiliary_loss_clip": 0.01129419, + "auxiliary_loss_mlp": 0.01034514, + "balance_loss_clip": 1.04597843, + "balance_loss_mlp": 1.02194369, + "epoch": 0.44557342552232077, + "flos": 28646860381920.0, + "grad_norm": 1.9133049616896884, + "language_loss": 0.74241233, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.76405168, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.12573242, + "step": 7411, + "time_per_iteration": 2.6753289699554443 + }, + { + "auxiliary_loss_clip": 0.01122976, + "auxiliary_loss_mlp": 0.01028026, + "balance_loss_clip": 1.04608512, + "balance_loss_mlp": 1.01654661, + "epoch": 0.44563354877498873, + "flos": 23348117924640.0, + "grad_norm": 1.810731208898278, + "language_loss": 0.78988087, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81139094, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11480713, + "step": 7412, + "time_per_iteration": 2.7324652671813965 + }, + { + "auxiliary_loss_clip": 0.01124236, + "auxiliary_loss_mlp": 0.01036606, + "balance_loss_clip": 1.04352236, + "balance_loss_mlp": 1.02445316, + "epoch": 0.4456936720276567, + "flos": 16620777318240.0, + "grad_norm": 4.69207604383668, + "language_loss": 0.79608291, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.81769133, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12164307, + "step": 7413, + "time_per_iteration": 2.6025705337524414 + }, + { + "auxiliary_loss_clip": 0.01122023, + "auxiliary_loss_mlp": 0.01030479, + "balance_loss_clip": 1.04443455, + "balance_loss_mlp": 1.01882648, + "epoch": 0.44575379528032466, + "flos": 17953888939200.0, + "grad_norm": 1.8926942667798803, + "language_loss": 0.76488948, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.7864145, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.11657715, + "step": 7414, + "time_per_iteration": 2.6111202239990234 + }, + { + "auxiliary_loss_clip": 0.01123337, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.0450449, + "balance_loss_mlp": 1.01880836, + "epoch": 0.4458139185329926, + "flos": 29493348289920.0, + "grad_norm": 1.5922132550093189, + "language_loss": 0.83552426, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.85706234, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.11657715, + "step": 7415, + "time_per_iteration": 2.689514636993408 + }, + { + "auxiliary_loss_clip": 0.01123048, + "auxiliary_loss_mlp": 0.01037673, + "balance_loss_clip": 1.04410982, + "balance_loss_mlp": 1.02602601, + "epoch": 0.4458740417856606, + "flos": 26554400821440.0, + "grad_norm": 2.4324365787029576, + "language_loss": 0.83881044, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.8604176, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.11645508, + "step": 7416, + "time_per_iteration": 2.7261085510253906 + }, + { + "auxiliary_loss_clip": 0.01124245, + "auxiliary_loss_mlp": 0.01032037, + "balance_loss_clip": 1.0436095, + "balance_loss_mlp": 1.01962757, + "epoch": 0.44593416503832856, + "flos": 25708237051680.0, + "grad_norm": 1.5581087599399857, + "language_loss": 0.81102979, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.83259261, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.12408447, + "step": 7417, + "time_per_iteration": 2.629807472229004 + }, + { + "auxiliary_loss_clip": 0.01123215, + "auxiliary_loss_mlp": 0.01036292, + "balance_loss_clip": 1.04260087, + "balance_loss_mlp": 1.02404308, + "epoch": 0.4459942882909965, + "flos": 27577397116800.0, + "grad_norm": 2.128744084522234, + "language_loss": 0.80615699, + "learning_rate": 2.443197426237077e-06, + "loss": 0.82775199, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.12249756, + "step": 7418, + "time_per_iteration": 2.6708827018737793 + }, + { + "auxiliary_loss_clip": 0.01124931, + "auxiliary_loss_mlp": 0.010287, + "balance_loss_clip": 1.04448175, + "balance_loss_mlp": 1.01670766, + "epoch": 0.4460544115436645, + "flos": 32342481649440.0, + "grad_norm": 1.650046184379376, + "language_loss": 0.77233255, + "learning_rate": 2.442817638972991e-06, + "loss": 0.7938689, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.11981201, + "step": 7419, + "time_per_iteration": 2.697822093963623 + }, + { + "auxiliary_loss_clip": 0.01121659, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.04298031, + "balance_loss_mlp": 1.0208149, + "epoch": 0.4461145347963325, + "flos": 21480132860640.0, + "grad_norm": 1.508109168168989, + "language_loss": 0.72162694, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.74316734, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.11566162, + "step": 7420, + "time_per_iteration": 2.6548545360565186 + }, + { + "auxiliary_loss_clip": 0.01121528, + "auxiliary_loss_mlp": 0.01027941, + "balance_loss_clip": 1.04452002, + "balance_loss_mlp": 1.01603794, + "epoch": 0.44617465804900047, + "flos": 33273516420000.0, + "grad_norm": 1.5147937533561369, + "language_loss": 0.74586904, + "learning_rate": 2.442058014084156e-06, + "loss": 0.76736367, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11895752, + "step": 7421, + "time_per_iteration": 2.68316650390625 + }, + { + "auxiliary_loss_clip": 0.01120202, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.04509473, + "balance_loss_mlp": 1.02116597, + "epoch": 0.44623478130166844, + "flos": 21702703044960.0, + "grad_norm": 1.8911174113127807, + "language_loss": 0.76498425, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78651154, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11364746, + "step": 7422, + "time_per_iteration": 2.637758255004883 + }, + { + "auxiliary_loss_clip": 0.011215, + "auxiliary_loss_mlp": 0.01030935, + "balance_loss_clip": 1.04311788, + "balance_loss_mlp": 1.01909184, + "epoch": 0.4462949045543364, + "flos": 28068477730560.0, + "grad_norm": 1.7460076659704997, + "language_loss": 0.65195155, + "learning_rate": 2.441298322143784e-06, + "loss": 0.67347592, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.11846924, + "step": 7423, + "time_per_iteration": 2.6582860946655273 + }, + { + "auxiliary_loss_clip": 0.01118888, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.04264045, + "balance_loss_mlp": 1.02095914, + "epoch": 0.44635502780700437, + "flos": 21744064837440.0, + "grad_norm": 1.5839715738829343, + "language_loss": 0.78955328, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.81106079, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.10894775, + "step": 7424, + "time_per_iteration": 2.5998430252075195 + }, + { + "auxiliary_loss_clip": 0.01118187, + "auxiliary_loss_mlp": 0.01029095, + "balance_loss_clip": 1.0425024, + "balance_loss_mlp": 1.01813364, + "epoch": 0.44641515105967233, + "flos": 32565213902880.0, + "grad_norm": 1.3643028737239025, + "language_loss": 0.80174428, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.82321703, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.10974121, + "step": 7425, + "time_per_iteration": 2.7609803676605225 + }, + { + "auxiliary_loss_clip": 0.01122648, + "auxiliary_loss_mlp": 0.010296, + "balance_loss_clip": 1.04500878, + "balance_loss_mlp": 1.01815593, + "epoch": 0.4464752743123403, + "flos": 23078229907680.0, + "grad_norm": 1.5318195462713198, + "language_loss": 0.77495074, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.79647326, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.11444092, + "step": 7426, + "time_per_iteration": 2.6861202716827393 + }, + { + "auxiliary_loss_clip": 0.01120586, + "auxiliary_loss_mlp": 0.01024785, + "balance_loss_clip": 1.04143548, + "balance_loss_mlp": 1.01359749, + "epoch": 0.44653539756500826, + "flos": 36082422470880.0, + "grad_norm": 1.6079344570405085, + "language_loss": 0.64464408, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.66609776, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.11193848, + "step": 7427, + "time_per_iteration": 2.818075656890869 + }, + { + "auxiliary_loss_clip": 0.01121371, + "auxiliary_loss_mlp": 0.01030637, + "balance_loss_clip": 1.04506314, + "balance_loss_mlp": 1.01949739, + "epoch": 0.44659552081767623, + "flos": 26196684042240.0, + "grad_norm": 1.7068092549307488, + "language_loss": 0.75173503, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77325505, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.1114502, + "step": 7428, + "time_per_iteration": 2.635397434234619 + }, + { + "auxiliary_loss_clip": 0.0111975, + "auxiliary_loss_mlp": 0.01029742, + "balance_loss_clip": 1.04259527, + "balance_loss_mlp": 1.01753485, + "epoch": 0.4466556440703442, + "flos": 21880264881600.0, + "grad_norm": 1.8195506682622762, + "language_loss": 0.77458394, + "learning_rate": 2.439018845165806e-06, + "loss": 0.79607892, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12194824, + "step": 7429, + "time_per_iteration": 2.653425455093384 + }, + { + "auxiliary_loss_clip": 0.0112427, + "auxiliary_loss_mlp": 0.01028274, + "balance_loss_clip": 1.04436076, + "balance_loss_mlp": 1.01594782, + "epoch": 0.44671576732301216, + "flos": 25755190228800.0, + "grad_norm": 1.715592187684504, + "language_loss": 0.9111042, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93262964, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12335205, + "step": 7430, + "time_per_iteration": 2.6635544300079346 + }, + { + "auxiliary_loss_clip": 0.01125136, + "auxiliary_loss_mlp": 0.01031564, + "balance_loss_clip": 1.04338038, + "balance_loss_mlp": 1.01871371, + "epoch": 0.4467758905756801, + "flos": 28686277344960.0, + "grad_norm": 1.8953041492155043, + "language_loss": 0.79999048, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.8215574, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.12854004, + "step": 7431, + "time_per_iteration": 2.6674911975860596 + }, + { + "auxiliary_loss_clip": 0.01123318, + "auxiliary_loss_mlp": 0.01031996, + "balance_loss_clip": 1.0433867, + "balance_loss_mlp": 1.01947904, + "epoch": 0.4468360138283481, + "flos": 22859144209440.0, + "grad_norm": 4.800857985742876, + "language_loss": 0.7973277, + "learning_rate": 2.437878881739204e-06, + "loss": 0.8188808, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12530518, + "step": 7432, + "time_per_iteration": 2.611415386199951 + }, + { + "auxiliary_loss_clip": 0.01123572, + "auxiliary_loss_mlp": 0.01037465, + "balance_loss_clip": 1.04319, + "balance_loss_mlp": 1.02531171, + "epoch": 0.4468961370810161, + "flos": 28647062968320.0, + "grad_norm": 2.3252409753727807, + "language_loss": 0.76762915, + "learning_rate": 2.437498860702301e-06, + "loss": 0.78923947, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.1217041, + "step": 7433, + "time_per_iteration": 3.989657163619995 + }, + { + "auxiliary_loss_clip": 0.01117638, + "auxiliary_loss_mlp": 0.01028563, + "balance_loss_clip": 1.04180992, + "balance_loss_mlp": 1.01850224, + "epoch": 0.4469562603336841, + "flos": 36698885015040.0, + "grad_norm": 3.3265162209103463, + "language_loss": 0.77337754, + "learning_rate": 2.437118823075398e-06, + "loss": 0.79483956, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.1005249, + "step": 7434, + "time_per_iteration": 4.128244876861572 + }, + { + "auxiliary_loss_clip": 0.01125431, + "auxiliary_loss_mlp": 0.01030905, + "balance_loss_clip": 1.04593766, + "balance_loss_mlp": 1.01905549, + "epoch": 0.44701638358635204, + "flos": 27401415454080.0, + "grad_norm": 1.7281018273631945, + "language_loss": 0.64547968, + "learning_rate": 2.436738768872905e-06, + "loss": 0.66704309, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.11853027, + "step": 7435, + "time_per_iteration": 2.6503100395202637 + }, + { + "auxiliary_loss_clip": 0.01125512, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.0451715, + "balance_loss_mlp": 1.01844919, + "epoch": 0.44707650683902, + "flos": 29355973244640.0, + "grad_norm": 1.5514752985733151, + "language_loss": 0.83553797, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.85710073, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12316895, + "step": 7436, + "time_per_iteration": 2.658235788345337 + }, + { + "auxiliary_loss_clip": 0.01124637, + "auxiliary_loss_mlp": 0.01036774, + "balance_loss_clip": 1.04408252, + "balance_loss_mlp": 1.0237143, + "epoch": 0.44713663009168797, + "flos": 29000890088640.0, + "grad_norm": 1.7048210478112689, + "language_loss": 0.79539406, + "learning_rate": 2.435978610798798e-06, + "loss": 0.81700814, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.1305542, + "step": 7437, + "time_per_iteration": 2.686572790145874 + }, + { + "auxiliary_loss_clip": 0.01125364, + "auxiliary_loss_mlp": 0.01034508, + "balance_loss_clip": 1.0444876, + "balance_loss_mlp": 1.02237797, + "epoch": 0.44719675334435594, + "flos": 29894290656480.0, + "grad_norm": 1.88972035681742, + "language_loss": 0.71908116, + "learning_rate": 2.435598506956009e-06, + "loss": 0.74067992, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.12127686, + "step": 7438, + "time_per_iteration": 2.7561538219451904 + }, + { + "auxiliary_loss_clip": 0.01123356, + "auxiliary_loss_mlp": 0.01038667, + "balance_loss_clip": 1.04262471, + "balance_loss_mlp": 1.02583456, + "epoch": 0.4472568765970239, + "flos": 36341127718560.0, + "grad_norm": 1.6355832597439948, + "language_loss": 0.67346585, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.69508606, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.12841797, + "step": 7439, + "time_per_iteration": 2.7873055934906006 + }, + { + "auxiliary_loss_clip": 0.01125286, + "auxiliary_loss_mlp": 0.01035767, + "balance_loss_clip": 1.04407907, + "balance_loss_mlp": 1.02271938, + "epoch": 0.44731699984969187, + "flos": 30070029215520.0, + "grad_norm": 1.6971521846864681, + "language_loss": 0.73647642, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.75808698, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13049316, + "step": 7440, + "time_per_iteration": 4.1198954582214355 + }, + { + "auxiliary_loss_clip": 0.01121806, + "auxiliary_loss_mlp": 0.01035493, + "balance_loss_clip": 1.04176271, + "balance_loss_mlp": 1.0234468, + "epoch": 0.44737712310235983, + "flos": 35943021561600.0, + "grad_norm": 1.8659001330965888, + "language_loss": 0.74260402, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.76417708, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12042236, + "step": 7441, + "time_per_iteration": 2.6722428798675537 + }, + { + "auxiliary_loss_clip": 0.01125249, + "auxiliary_loss_mlp": 0.01037007, + "balance_loss_clip": 1.04425991, + "balance_loss_mlp": 1.02431762, + "epoch": 0.4474372463550278, + "flos": 30379779885600.0, + "grad_norm": 1.9647544165939168, + "language_loss": 0.75098395, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.77260649, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.12701416, + "step": 7442, + "time_per_iteration": 2.668210029602051 + }, + { + "auxiliary_loss_clip": 0.01125474, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.04232025, + "balance_loss_mlp": 1.02225113, + "epoch": 0.44749736960769576, + "flos": 40489263499680.0, + "grad_norm": 2.417008312636826, + "language_loss": 0.74583185, + "learning_rate": 2.433697740261273e-06, + "loss": 0.76743525, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.12628174, + "step": 7443, + "time_per_iteration": 2.699733018875122 + }, + { + "auxiliary_loss_clip": 0.01119734, + "auxiliary_loss_mlp": 0.01029031, + "balance_loss_clip": 1.04021776, + "balance_loss_mlp": 1.01621008, + "epoch": 0.4475574928603637, + "flos": 25711640503200.0, + "grad_norm": 1.8608511963261734, + "language_loss": 0.77986467, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.80135238, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12817383, + "step": 7444, + "time_per_iteration": 2.6689746379852295 + }, + { + "auxiliary_loss_clip": 0.0111903, + "auxiliary_loss_mlp": 0.01032064, + "balance_loss_clip": 1.04125214, + "balance_loss_mlp": 1.01991093, + "epoch": 0.4476176161130317, + "flos": 26677635336000.0, + "grad_norm": 7.524789951985288, + "language_loss": 0.84830022, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.86981112, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.121521, + "step": 7445, + "time_per_iteration": 3.8908164501190186 + }, + { + "auxiliary_loss_clip": 0.01125662, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.04511714, + "balance_loss_mlp": 1.01904917, + "epoch": 0.4476777393656997, + "flos": 27489649389120.0, + "grad_norm": 3.1801115173083048, + "language_loss": 0.64156151, + "learning_rate": 2.432557082778765e-06, + "loss": 0.66314107, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13232422, + "step": 7446, + "time_per_iteration": 2.6664795875549316 + }, + { + "auxiliary_loss_clip": 0.01036431, + "auxiliary_loss_mlp": 0.01010517, + "balance_loss_clip": 1.0114193, + "balance_loss_mlp": 1.00912285, + "epoch": 0.4477378626183677, + "flos": 59809569264000.0, + "grad_norm": 0.7356836796244566, + "language_loss": 0.50255346, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52302295, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.01394653, + "step": 7447, + "time_per_iteration": 3.0983774662017822 + }, + { + "auxiliary_loss_clip": 0.01036177, + "auxiliary_loss_mlp": 0.01005556, + "balance_loss_clip": 1.01103806, + "balance_loss_mlp": 1.00416613, + "epoch": 0.44779798587103564, + "flos": 68991381041760.0, + "grad_norm": 0.7821026413976431, + "language_loss": 0.5933513, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61376864, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 0.25146484, + "router_z_loss_mlp": 0.01390076, + "step": 7448, + "time_per_iteration": 3.3014581203460693 + }, + { + "auxiliary_loss_clip": 0.01121122, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.0421598, + "balance_loss_mlp": 1.02124858, + "epoch": 0.4478581091237036, + "flos": 56739642130080.0, + "grad_norm": 4.895176318416068, + "language_loss": 0.59049308, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61203301, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.11627197, + "step": 7449, + "time_per_iteration": 2.881847620010376 + }, + { + "auxiliary_loss_clip": 0.01123487, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.04390979, + "balance_loss_mlp": 1.01827753, + "epoch": 0.4479182323763716, + "flos": 25397676036000.0, + "grad_norm": 2.2026604378497296, + "language_loss": 0.79878783, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.82031751, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.11199951, + "step": 7450, + "time_per_iteration": 2.628631353378296 + }, + { + "auxiliary_loss_clip": 0.01121864, + "auxiliary_loss_mlp": 0.01034038, + "balance_loss_clip": 1.04364681, + "balance_loss_mlp": 1.021873, + "epoch": 0.44797835562903954, + "flos": 17382515777280.0, + "grad_norm": 2.184548036160941, + "language_loss": 0.79731798, + "learning_rate": 2.430655659114697e-06, + "loss": 0.81887704, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12164307, + "step": 7451, + "time_per_iteration": 2.6980979442596436 + }, + { + "auxiliary_loss_clip": 0.01036064, + "auxiliary_loss_mlp": 0.01000071, + "balance_loss_clip": 1.01091564, + "balance_loss_mlp": 0.99866819, + "epoch": 0.4480384788817075, + "flos": 77525207831520.0, + "grad_norm": 0.828814811761949, + "language_loss": 0.62854588, + "learning_rate": 2.430275325332681e-06, + "loss": 0.64890718, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.01403809, + "step": 7452, + "time_per_iteration": 3.331901788711548 + }, + { + "auxiliary_loss_clip": 0.01122867, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.04372644, + "balance_loss_mlp": 1.01960099, + "epoch": 0.44809860213437547, + "flos": 26420672331360.0, + "grad_norm": 1.9866055202028263, + "language_loss": 0.62660062, + "learning_rate": 2.429894975234582e-06, + "loss": 0.64815545, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13012695, + "step": 7453, + "time_per_iteration": 2.657689332962036 + }, + { + "auxiliary_loss_clip": 0.01036748, + "auxiliary_loss_mlp": 0.01000744, + "balance_loss_clip": 1.01153803, + "balance_loss_mlp": 0.99937332, + "epoch": 0.44815872538704343, + "flos": 84427111995840.0, + "grad_norm": 0.7819974932948474, + "language_loss": 0.5711534, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59152836, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 0.25219727, + "router_z_loss_mlp": 0.01372528, + "step": 7454, + "time_per_iteration": 3.131824493408203 + }, + { + "auxiliary_loss_clip": 0.01122897, + "auxiliary_loss_mlp": 0.01028641, + "balance_loss_clip": 1.04290152, + "balance_loss_mlp": 1.01705396, + "epoch": 0.4482188486397114, + "flos": 15372415146240.0, + "grad_norm": 2.3142665009880448, + "language_loss": 0.75294113, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.7744565, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.11584473, + "step": 7455, + "time_per_iteration": 2.6164143085479736 + }, + { + "auxiliary_loss_clip": 0.01120503, + "auxiliary_loss_mlp": 0.01027225, + "balance_loss_clip": 1.04205215, + "balance_loss_mlp": 1.01568508, + "epoch": 0.44827897189237936, + "flos": 41558848316640.0, + "grad_norm": 1.85330559139536, + "language_loss": 0.75937194, + "learning_rate": 2.428753827188016e-06, + "loss": 0.78084922, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.11541748, + "step": 7456, + "time_per_iteration": 2.8041892051696777 + }, + { + "auxiliary_loss_clip": 0.01121683, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.0451467, + "balance_loss_mlp": 1.02282119, + "epoch": 0.44833909514504733, + "flos": 30686005552320.0, + "grad_norm": 1.9816990207192138, + "language_loss": 0.7582323, + "learning_rate": 2.428373411969818e-06, + "loss": 0.77979577, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.1184082, + "step": 7457, + "time_per_iteration": 2.669534683227539 + }, + { + "auxiliary_loss_clip": 0.01123298, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.04351568, + "balance_loss_mlp": 1.01610839, + "epoch": 0.4483992183977153, + "flos": 19742513352480.0, + "grad_norm": 2.0727095170839887, + "language_loss": 0.67649448, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.6980136, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12512207, + "step": 7458, + "time_per_iteration": 2.6579394340515137 + }, + { + "auxiliary_loss_clip": 0.01124735, + "auxiliary_loss_mlp": 0.01029571, + "balance_loss_clip": 1.04351783, + "balance_loss_mlp": 1.01658916, + "epoch": 0.44845934165038326, + "flos": 21653805038400.0, + "grad_norm": 1.9577220118257639, + "language_loss": 0.71438992, + "learning_rate": 2.427612532815961e-06, + "loss": 0.73593295, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12994385, + "step": 7459, + "time_per_iteration": 2.632784128189087 + }, + { + "auxiliary_loss_clip": 0.01120629, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.0420711, + "balance_loss_mlp": 1.01948643, + "epoch": 0.4485194649030513, + "flos": 26644255447680.0, + "grad_norm": 1.5358724478245347, + "language_loss": 0.69624054, + "learning_rate": 2.427232068909154e-06, + "loss": 0.71776021, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.11834717, + "step": 7460, + "time_per_iteration": 2.6734704971313477 + }, + { + "auxiliary_loss_clip": 0.01120366, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.04121208, + "balance_loss_mlp": 1.01700926, + "epoch": 0.44857958815571924, + "flos": 24507071160480.0, + "grad_norm": 2.038612649763204, + "language_loss": 0.77087617, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.79237223, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12231445, + "step": 7461, + "time_per_iteration": 2.641770839691162 + }, + { + "auxiliary_loss_clip": 0.01123358, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.04296446, + "balance_loss_mlp": 1.01839554, + "epoch": 0.4486397114083872, + "flos": 33011164617120.0, + "grad_norm": 10.25962453666023, + "language_loss": 0.68123811, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.70277363, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.11804199, + "step": 7462, + "time_per_iteration": 2.7388877868652344 + }, + { + "auxiliary_loss_clip": 0.01036001, + "auxiliary_loss_mlp": 0.01008948, + "balance_loss_clip": 1.01133966, + "balance_loss_mlp": 1.00771689, + "epoch": 0.4486998346610552, + "flos": 82146677657760.0, + "grad_norm": 0.7476294062409795, + "language_loss": 0.54427129, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56472075, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 0.2467041, + "router_z_loss_mlp": 0.01229858, + "step": 7463, + "time_per_iteration": 3.252927303314209 + }, + { + "auxiliary_loss_clip": 0.01121296, + "auxiliary_loss_mlp": 0.0103023, + "balance_loss_clip": 1.04277492, + "balance_loss_mlp": 1.01861894, + "epoch": 0.44875995791372314, + "flos": 33722992137600.0, + "grad_norm": 1.9300589730978204, + "language_loss": 0.76114297, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.78265822, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.11608887, + "step": 7464, + "time_per_iteration": 2.6710615158081055 + }, + { + "auxiliary_loss_clip": 0.01119241, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.04268074, + "balance_loss_mlp": 1.01875174, + "epoch": 0.4488200811663911, + "flos": 15869289731040.0, + "grad_norm": 2.3093292613526955, + "language_loss": 0.7429443, + "learning_rate": 2.425329506653441e-06, + "loss": 0.76443297, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.10882568, + "step": 7465, + "time_per_iteration": 2.6717894077301025 + }, + { + "auxiliary_loss_clip": 0.01127546, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.04458308, + "balance_loss_mlp": 1.02044439, + "epoch": 0.44888020441905907, + "flos": 33543890644320.0, + "grad_norm": 1.995732861906853, + "language_loss": 0.80056411, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82218021, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.1362915, + "step": 7466, + "time_per_iteration": 2.6656365394592285 + }, + { + "auxiliary_loss_clip": 0.01122438, + "auxiliary_loss_mlp": 0.0103415, + "balance_loss_clip": 1.04297256, + "balance_loss_mlp": 1.02209187, + "epoch": 0.44894032767172704, + "flos": 22280315868000.0, + "grad_norm": 3.167328039583604, + "language_loss": 0.79729688, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.8188628, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.1204834, + "step": 7467, + "time_per_iteration": 2.674992322921753 + }, + { + "auxiliary_loss_clip": 0.01116605, + "auxiliary_loss_mlp": 0.01029202, + "balance_loss_clip": 1.04203486, + "balance_loss_mlp": 1.01752555, + "epoch": 0.449000450924395, + "flos": 26331830637120.0, + "grad_norm": 1.8683144520134736, + "language_loss": 0.74927378, + "learning_rate": 2.424187775642129e-06, + "loss": 0.77073181, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11682129, + "step": 7468, + "time_per_iteration": 2.6197619438171387 + }, + { + "auxiliary_loss_clip": 0.01117594, + "auxiliary_loss_mlp": 0.01025572, + "balance_loss_clip": 1.0405705, + "balance_loss_mlp": 1.01440251, + "epoch": 0.44906057417706297, + "flos": 21924260297280.0, + "grad_norm": 2.1454089947236596, + "language_loss": 0.70436406, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.72579569, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11169434, + "step": 7469, + "time_per_iteration": 2.66180682182312 + }, + { + "auxiliary_loss_clip": 0.01122894, + "auxiliary_loss_mlp": 0.01034191, + "balance_loss_clip": 1.04337955, + "balance_loss_mlp": 1.02227569, + "epoch": 0.44912069742973093, + "flos": 24461779191840.0, + "grad_norm": 2.4775465478006975, + "language_loss": 0.71809649, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.7396673, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.11920166, + "step": 7470, + "time_per_iteration": 2.64311146736145 + }, + { + "auxiliary_loss_clip": 0.01121497, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.04216409, + "balance_loss_mlp": 1.01802135, + "epoch": 0.4491808206823989, + "flos": 25665092498880.0, + "grad_norm": 2.189793391539259, + "language_loss": 0.77069426, + "learning_rate": 2.423045899863634e-06, + "loss": 0.79221386, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12445068, + "step": 7471, + "time_per_iteration": 2.674994707107544 + }, + { + "auxiliary_loss_clip": 0.01121263, + "auxiliary_loss_mlp": 0.01035119, + "balance_loss_clip": 1.04343009, + "balance_loss_mlp": 1.02361512, + "epoch": 0.44924094393506686, + "flos": 28024644384000.0, + "grad_norm": 1.7088886771043252, + "language_loss": 0.7007333, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72229713, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.11499023, + "step": 7472, + "time_per_iteration": 2.671590566635132 + }, + { + "auxiliary_loss_clip": 0.01036596, + "auxiliary_loss_mlp": 0.01002254, + "balance_loss_clip": 1.0118866, + "balance_loss_mlp": 1.0009377, + "epoch": 0.4493010671877349, + "flos": 72276295278240.0, + "grad_norm": 0.7390727112986726, + "language_loss": 0.61714935, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.6375379, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.01316071, + "step": 7473, + "time_per_iteration": 6.081997871398926 + }, + { + "auxiliary_loss_clip": 0.01120544, + "auxiliary_loss_mlp": 0.01034514, + "balance_loss_clip": 1.04261732, + "balance_loss_mlp": 1.02202725, + "epoch": 0.44936119044040285, + "flos": 21969349679520.0, + "grad_norm": 1.882275129909697, + "language_loss": 0.77944779, + "learning_rate": 2.421903879707657e-06, + "loss": 0.80099845, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12475586, + "step": 7474, + "time_per_iteration": 2.6172068119049072 + }, + { + "auxiliary_loss_clip": 0.01118824, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.04237592, + "balance_loss_mlp": 1.0221982, + "epoch": 0.4494213136930708, + "flos": 25931536547040.0, + "grad_norm": 2.0259500864592934, + "language_loss": 0.72467619, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.74620861, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12207031, + "step": 7475, + "time_per_iteration": 2.6648969650268555 + }, + { + "auxiliary_loss_clip": 0.01120293, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.04171228, + "balance_loss_mlp": 1.01594162, + "epoch": 0.4494814369457388, + "flos": 33456750675840.0, + "grad_norm": 1.9500200919363957, + "language_loss": 0.76907372, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.79055262, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.11663818, + "step": 7476, + "time_per_iteration": 2.6815967559814453 + }, + { + "auxiliary_loss_clip": 0.01122285, + "auxiliary_loss_mlp": 0.01035453, + "balance_loss_clip": 1.0413847, + "balance_loss_mlp": 1.02243495, + "epoch": 0.44954156019840674, + "flos": 27886985717760.0, + "grad_norm": 3.2388586663037824, + "language_loss": 0.71986192, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.74143934, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13018799, + "step": 7477, + "time_per_iteration": 2.6998867988586426 + }, + { + "auxiliary_loss_clip": 0.01125781, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.04459691, + "balance_loss_mlp": 1.01874912, + "epoch": 0.4496016834510747, + "flos": 20989133281440.0, + "grad_norm": 2.073571916381106, + "language_loss": 0.67801023, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.69957894, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.12347412, + "step": 7478, + "time_per_iteration": 2.6155240535736084 + }, + { + "auxiliary_loss_clip": 0.01118924, + "auxiliary_loss_mlp": 0.01032065, + "balance_loss_clip": 1.04256511, + "balance_loss_mlp": 1.02073431, + "epoch": 0.4496618067037427, + "flos": 23081957497440.0, + "grad_norm": 2.06477508013447, + "language_loss": 0.89347017, + "learning_rate": 2.420000193000779e-06, + "loss": 0.91498011, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11328125, + "step": 7479, + "time_per_iteration": 2.66245698928833 + }, + { + "auxiliary_loss_clip": 0.01125054, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.04592657, + "balance_loss_mlp": 1.01999807, + "epoch": 0.44972192995641064, + "flos": 25662985600320.0, + "grad_norm": 1.772872386040172, + "language_loss": 0.7566483, + "learning_rate": 2.419619407822302e-06, + "loss": 0.77822399, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12524414, + "step": 7480, + "time_per_iteration": 4.2555506229400635 + }, + { + "auxiliary_loss_clip": 0.01123913, + "auxiliary_loss_mlp": 0.01031987, + "balance_loss_clip": 1.0440079, + "balance_loss_mlp": 1.01972604, + "epoch": 0.4497820532090786, + "flos": 25352262515520.0, + "grad_norm": 2.189516470830786, + "language_loss": 0.79702091, + "learning_rate": 2.419238606731815e-06, + "loss": 0.81857991, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12280273, + "step": 7481, + "time_per_iteration": 2.654680013656616 + }, + { + "auxiliary_loss_clip": 0.01118044, + "auxiliary_loss_mlp": 0.0102876, + "balance_loss_clip": 1.04308176, + "balance_loss_mlp": 1.0162549, + "epoch": 0.44984217646174657, + "flos": 41246990748000.0, + "grad_norm": 1.7707436960111305, + "language_loss": 0.68529761, + "learning_rate": 2.418857789743758e-06, + "loss": 0.70676565, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12506104, + "step": 7482, + "time_per_iteration": 2.7918150424957275 + }, + { + "auxiliary_loss_clip": 0.01120704, + "auxiliary_loss_mlp": 0.01038849, + "balance_loss_clip": 1.04187977, + "balance_loss_mlp": 1.02645111, + "epoch": 0.44990229971441453, + "flos": 18935604476640.0, + "grad_norm": 2.496514856712482, + "language_loss": 0.84800541, + "learning_rate": 2.418476956872571e-06, + "loss": 0.86960089, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.1239624, + "step": 7483, + "time_per_iteration": 2.6437530517578125 + }, + { + "auxiliary_loss_clip": 0.01128352, + "auxiliary_loss_mlp": 0.01038043, + "balance_loss_clip": 1.04609179, + "balance_loss_mlp": 1.02563381, + "epoch": 0.4499624229670825, + "flos": 36438275455200.0, + "grad_norm": 2.3094821505172685, + "language_loss": 0.8069222, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.82858616, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.12414551, + "step": 7484, + "time_per_iteration": 4.197648525238037 + }, + { + "auxiliary_loss_clip": 0.01125979, + "auxiliary_loss_mlp": 0.01029342, + "balance_loss_clip": 1.04210591, + "balance_loss_mlp": 1.01608002, + "epoch": 0.45002254621975046, + "flos": 22590390676320.0, + "grad_norm": 6.060391431046983, + "language_loss": 0.75153404, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.77308726, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.13269043, + "step": 7485, + "time_per_iteration": 2.633601427078247 + }, + { + "auxiliary_loss_clip": 0.01035709, + "auxiliary_loss_mlp": 0.01001144, + "balance_loss_clip": 1.01125455, + "balance_loss_mlp": 0.99977964, + "epoch": 0.4500826694724185, + "flos": 85928750100000.0, + "grad_norm": 0.7919820092635292, + "language_loss": 0.5867399, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60710847, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 0.24499512, + "router_z_loss_mlp": 0.01365662, + "step": 7486, + "time_per_iteration": 3.2688724994659424 + }, + { + "auxiliary_loss_clip": 0.01122244, + "auxiliary_loss_mlp": 0.01030213, + "balance_loss_clip": 1.04411983, + "balance_loss_mlp": 1.01777911, + "epoch": 0.45014279272508645, + "flos": 19253053429920.0, + "grad_norm": 2.6390460016634205, + "language_loss": 0.83487415, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.85639876, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12445068, + "step": 7487, + "time_per_iteration": 2.652522325515747 + }, + { + "auxiliary_loss_clip": 0.01120352, + "auxiliary_loss_mlp": 0.01031757, + "balance_loss_clip": 1.04281378, + "balance_loss_mlp": 1.01984215, + "epoch": 0.4502029159777544, + "flos": 26599895376480.0, + "grad_norm": 1.480105960374357, + "language_loss": 0.77214372, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79366481, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.11914062, + "step": 7488, + "time_per_iteration": 2.6678829193115234 + }, + { + "auxiliary_loss_clip": 0.01127974, + "auxiliary_loss_mlp": 0.01034777, + "balance_loss_clip": 1.04443073, + "balance_loss_mlp": 1.02162766, + "epoch": 0.4502630392304224, + "flos": 35103300039360.0, + "grad_norm": 2.177406537866526, + "language_loss": 0.71827388, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.73990142, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13140869, + "step": 7489, + "time_per_iteration": 2.722144365310669 + }, + { + "auxiliary_loss_clip": 0.01126614, + "auxiliary_loss_mlp": 0.01031324, + "balance_loss_clip": 1.04572833, + "balance_loss_mlp": 1.0178535, + "epoch": 0.45032316248309034, + "flos": 19333427012640.0, + "grad_norm": 3.1329007387848584, + "language_loss": 0.69685197, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.71843135, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13476562, + "step": 7490, + "time_per_iteration": 2.61322283744812 + }, + { + "auxiliary_loss_clip": 0.01036487, + "auxiliary_loss_mlp": 0.01003119, + "balance_loss_clip": 1.0119648, + "balance_loss_mlp": 1.00185251, + "epoch": 0.4503832857357583, + "flos": 70591868608320.0, + "grad_norm": 0.7520788021304171, + "language_loss": 0.56648177, + "learning_rate": 2.415429723843495e-06, + "loss": 0.58687782, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 0.24560547, + "router_z_loss_mlp": 0.01266479, + "step": 7491, + "time_per_iteration": 3.2284128665924072 + }, + { + "auxiliary_loss_clip": 0.01118346, + "auxiliary_loss_mlp": 0.0102961, + "balance_loss_clip": 1.04248714, + "balance_loss_mlp": 1.01769447, + "epoch": 0.4504434089884263, + "flos": 29136360821760.0, + "grad_norm": 1.798643481855774, + "language_loss": 0.79281044, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81428999, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11932373, + "step": 7492, + "time_per_iteration": 2.678372859954834 + }, + { + "auxiliary_loss_clip": 0.0112726, + "auxiliary_loss_mlp": 0.01036189, + "balance_loss_clip": 1.04522514, + "balance_loss_mlp": 1.02335668, + "epoch": 0.45050353224109424, + "flos": 21701203905600.0, + "grad_norm": 2.282265331102709, + "language_loss": 0.92698419, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.94861865, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.12835693, + "step": 7493, + "time_per_iteration": 2.6079704761505127 + }, + { + "auxiliary_loss_clip": 0.01036022, + "auxiliary_loss_mlp": 0.01002112, + "balance_loss_clip": 1.01134062, + "balance_loss_mlp": 1.00084519, + "epoch": 0.4505636554937622, + "flos": 79391167031520.0, + "grad_norm": 0.8156820716464616, + "language_loss": 0.62871873, + "learning_rate": 2.4142867511336e-06, + "loss": 0.64910007, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 0.24682617, + "router_z_loss_mlp": 0.01266479, + "step": 7494, + "time_per_iteration": 3.3604538440704346 + }, + { + "auxiliary_loss_clip": 0.01121755, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.04480481, + "balance_loss_mlp": 1.01925206, + "epoch": 0.45062377874643017, + "flos": 27088707022560.0, + "grad_norm": 1.5878744743230508, + "language_loss": 0.82446325, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.84599197, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.11877441, + "step": 7495, + "time_per_iteration": 2.6734297275543213 + }, + { + "auxiliary_loss_clip": 0.01123136, + "auxiliary_loss_mlp": 0.01030991, + "balance_loss_clip": 1.04370582, + "balance_loss_mlp": 1.01823592, + "epoch": 0.45068390199909814, + "flos": 45966621242880.0, + "grad_norm": 1.7272352872661845, + "language_loss": 0.85737813, + "learning_rate": 2.41352469075395e-06, + "loss": 0.87891936, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12744141, + "step": 7496, + "time_per_iteration": 2.900843620300293 + }, + { + "auxiliary_loss_clip": 0.01124802, + "auxiliary_loss_mlp": 0.01027825, + "balance_loss_clip": 1.04518723, + "balance_loss_mlp": 1.01509356, + "epoch": 0.4507440252517661, + "flos": 27214332056640.0, + "grad_norm": 1.9593228138617207, + "language_loss": 0.76471251, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.78623885, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12738037, + "step": 7497, + "time_per_iteration": 2.6492741107940674 + }, + { + "auxiliary_loss_clip": 0.01123729, + "auxiliary_loss_mlp": 0.01031699, + "balance_loss_clip": 1.04271388, + "balance_loss_mlp": 1.0189203, + "epoch": 0.45080414850443407, + "flos": 16091900432640.0, + "grad_norm": 3.2845709624628827, + "language_loss": 0.74795574, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.76951003, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.12780762, + "step": 7498, + "time_per_iteration": 2.6415977478027344 + }, + { + "auxiliary_loss_clip": 0.01124592, + "auxiliary_loss_mlp": 0.01037725, + "balance_loss_clip": 1.04430139, + "balance_loss_mlp": 1.02477324, + "epoch": 0.4508642717571021, + "flos": 26777335661280.0, + "grad_norm": 1.9786185950327964, + "language_loss": 0.70267904, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72430223, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12957764, + "step": 7499, + "time_per_iteration": 2.6958253383636475 + }, + { + "auxiliary_loss_clip": 0.01125264, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.04387295, + "balance_loss_mlp": 1.02114201, + "epoch": 0.45092439500977005, + "flos": 28513253443680.0, + "grad_norm": 1.9285281628938418, + "language_loss": 0.77109337, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79268664, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.12939453, + "step": 7500, + "time_per_iteration": 2.6534221172332764 + }, + { + "auxiliary_loss_clip": 0.01122662, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.04330945, + "balance_loss_mlp": 1.01836598, + "epoch": 0.450984518262438, + "flos": 25347603028320.0, + "grad_norm": 2.246244585050854, + "language_loss": 0.62291551, + "learning_rate": 2.411619265641992e-06, + "loss": 0.64444816, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12225342, + "step": 7501, + "time_per_iteration": 2.736239194869995 + }, + { + "auxiliary_loss_clip": 0.01126576, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.04473174, + "balance_loss_mlp": 1.02008367, + "epoch": 0.451044641515106, + "flos": 21606811344000.0, + "grad_norm": 2.460947931347251, + "language_loss": 0.84965163, + "learning_rate": 2.411238133735863e-06, + "loss": 0.87124991, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.1315918, + "step": 7502, + "time_per_iteration": 2.619957208633423 + }, + { + "auxiliary_loss_clip": 0.01120986, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.04428649, + "balance_loss_mlp": 1.01759028, + "epoch": 0.45110476476777395, + "flos": 25129692331200.0, + "grad_norm": 1.3763274955451048, + "language_loss": 0.79688334, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.81838948, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12042236, + "step": 7503, + "time_per_iteration": 2.7789862155914307 + }, + { + "auxiliary_loss_clip": 0.01123195, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.04608727, + "balance_loss_mlp": 1.02005172, + "epoch": 0.4511648880204419, + "flos": 19564424791200.0, + "grad_norm": 1.850966995007521, + "language_loss": 0.80903935, + "learning_rate": 2.410475823155484e-06, + "loss": 0.83059961, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12786865, + "step": 7504, + "time_per_iteration": 2.721912384033203 + }, + { + "auxiliary_loss_clip": 0.01122608, + "auxiliary_loss_mlp": 0.01031605, + "balance_loss_clip": 1.04511321, + "balance_loss_mlp": 1.01956487, + "epoch": 0.4512250112731099, + "flos": 29260081543680.0, + "grad_norm": 1.589617384015028, + "language_loss": 0.63434529, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.65588748, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12042236, + "step": 7505, + "time_per_iteration": 2.6862261295318604 + }, + { + "auxiliary_loss_clip": 0.01036886, + "auxiliary_loss_mlp": 0.01000584, + "balance_loss_clip": 1.01220906, + "balance_loss_mlp": 0.999259, + "epoch": 0.45128513452577784, + "flos": 87198302809440.0, + "grad_norm": 0.833217415871163, + "language_loss": 0.58826351, + "learning_rate": 2.409713450313968e-06, + "loss": 0.60863823, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 0.24719238, + "router_z_loss_mlp": 0.01326752, + "step": 7506, + "time_per_iteration": 3.340740203857422 + }, + { + "auxiliary_loss_clip": 0.01121639, + "auxiliary_loss_mlp": 0.01031485, + "balance_loss_clip": 1.04459798, + "balance_loss_mlp": 1.01898646, + "epoch": 0.4513452577784458, + "flos": 26955383705280.0, + "grad_norm": 1.6415277741818453, + "language_loss": 0.79166579, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81319708, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12506104, + "step": 7507, + "time_per_iteration": 2.659898519515991 + }, + { + "auxiliary_loss_clip": 0.01124368, + "auxiliary_loss_mlp": 0.01032522, + "balance_loss_clip": 1.04400527, + "balance_loss_mlp": 1.01890874, + "epoch": 0.4514053810311138, + "flos": 29622700913760.0, + "grad_norm": 2.0023753468855405, + "language_loss": 0.73903042, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.76059932, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.1361084, + "step": 7508, + "time_per_iteration": 2.730839490890503 + }, + { + "auxiliary_loss_clip": 0.01122644, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.04648411, + "balance_loss_mlp": 1.02176905, + "epoch": 0.45146550428378174, + "flos": 21826018594080.0, + "grad_norm": 2.1503487173287086, + "language_loss": 0.789621, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.81118631, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12121582, + "step": 7509, + "time_per_iteration": 2.629976987838745 + }, + { + "auxiliary_loss_clip": 0.01123908, + "auxiliary_loss_mlp": 0.01032913, + "balance_loss_clip": 1.04620683, + "balance_loss_mlp": 1.02102196, + "epoch": 0.4515256275364497, + "flos": 29582149466880.0, + "grad_norm": 1.7680077708110895, + "language_loss": 0.72881234, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75038052, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.11895752, + "step": 7510, + "time_per_iteration": 2.701340436935425 + }, + { + "auxiliary_loss_clip": 0.01123144, + "auxiliary_loss_mlp": 0.01031659, + "balance_loss_clip": 1.04245949, + "balance_loss_mlp": 1.01831317, + "epoch": 0.45158575078911767, + "flos": 25174052402400.0, + "grad_norm": 3.1431971980752818, + "language_loss": 0.76698208, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.78853011, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13360596, + "step": 7511, + "time_per_iteration": 2.6805129051208496 + }, + { + "auxiliary_loss_clip": 0.01123224, + "auxiliary_loss_mlp": 0.01032414, + "balance_loss_clip": 1.04317629, + "balance_loss_mlp": 1.01903868, + "epoch": 0.45164587404178563, + "flos": 28465084748160.0, + "grad_norm": 1.803667376265719, + "language_loss": 0.78696918, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.80852556, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13366699, + "step": 7512, + "time_per_iteration": 3.9065494537353516 + }, + { + "auxiliary_loss_clip": 0.0112737, + "auxiliary_loss_mlp": 0.01033657, + "balance_loss_clip": 1.04385698, + "balance_loss_mlp": 1.02004313, + "epoch": 0.45170599729445365, + "flos": 29048288955840.0, + "grad_norm": 2.428592083628295, + "language_loss": 0.87359428, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.89520454, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13604736, + "step": 7513, + "time_per_iteration": 4.110200643539429 + }, + { + "auxiliary_loss_clip": 0.01116605, + "auxiliary_loss_mlp": 0.01031709, + "balance_loss_clip": 1.04308844, + "balance_loss_mlp": 1.01999068, + "epoch": 0.4517661205471216, + "flos": 28690815280320.0, + "grad_norm": 1.7441009955179105, + "language_loss": 0.67507744, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69656062, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11712646, + "step": 7514, + "time_per_iteration": 2.649945020675659 + }, + { + "auxiliary_loss_clip": 0.01126858, + "auxiliary_loss_mlp": 0.01030544, + "balance_loss_clip": 1.04744649, + "balance_loss_mlp": 1.01672745, + "epoch": 0.4518262437997896, + "flos": 28691949764160.0, + "grad_norm": 1.8177540416750093, + "language_loss": 0.69832504, + "learning_rate": 2.406282005146318e-06, + "loss": 0.71989906, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.1383667, + "step": 7515, + "time_per_iteration": 2.6844403743743896 + }, + { + "auxiliary_loss_clip": 0.01126368, + "auxiliary_loss_mlp": 0.01039247, + "balance_loss_clip": 1.04362023, + "balance_loss_mlp": 1.02538323, + "epoch": 0.45188636705245755, + "flos": 17776124516160.0, + "grad_norm": 2.3541503197159623, + "language_loss": 0.8220464, + "learning_rate": 2.405900656236963e-06, + "loss": 0.84370255, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13873291, + "step": 7516, + "time_per_iteration": 2.6224725246429443 + }, + { + "auxiliary_loss_clip": 0.01120645, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.04369092, + "balance_loss_mlp": 1.01991582, + "epoch": 0.4519464903051255, + "flos": 24283852699680.0, + "grad_norm": 1.7913495594690076, + "language_loss": 0.65537167, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.67690623, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12902832, + "step": 7517, + "time_per_iteration": 2.666240692138672 + }, + { + "auxiliary_loss_clip": 0.0111991, + "auxiliary_loss_mlp": 0.01026977, + "balance_loss_clip": 1.04408312, + "balance_loss_mlp": 1.01554489, + "epoch": 0.4520066135577935, + "flos": 22993723562400.0, + "grad_norm": 2.4349248172908156, + "language_loss": 0.62753606, + "learning_rate": 2.405137912257333e-06, + "loss": 0.64900488, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11431885, + "step": 7518, + "time_per_iteration": 2.657071113586426 + }, + { + "auxiliary_loss_clip": 0.01121234, + "auxiliary_loss_mlp": 0.01032967, + "balance_loss_clip": 1.04296815, + "balance_loss_mlp": 1.02092659, + "epoch": 0.45206673681046144, + "flos": 58833398243520.0, + "grad_norm": 1.3866218824918033, + "language_loss": 0.59411865, + "learning_rate": 2.404756517215982e-06, + "loss": 0.61566067, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12030029, + "step": 7519, + "time_per_iteration": 4.428168773651123 + }, + { + "auxiliary_loss_clip": 0.01124172, + "auxiliary_loss_mlp": 0.01036437, + "balance_loss_clip": 1.0448128, + "balance_loss_mlp": 1.02330637, + "epoch": 0.4521268600631294, + "flos": 29092486957920.0, + "grad_norm": 1.634260459421256, + "language_loss": 0.72973484, + "learning_rate": 2.404375106826223e-06, + "loss": 0.75134099, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13146973, + "step": 7520, + "time_per_iteration": 2.731553554534912 + }, + { + "auxiliary_loss_clip": 0.01122943, + "auxiliary_loss_mlp": 0.01035826, + "balance_loss_clip": 1.04395723, + "balance_loss_mlp": 1.02373779, + "epoch": 0.4521869833157974, + "flos": 22992872699520.0, + "grad_norm": 3.236062906355862, + "language_loss": 0.75579137, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.77737904, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.12109375, + "step": 7521, + "time_per_iteration": 2.611292600631714 + }, + { + "auxiliary_loss_clip": 0.01127693, + "auxiliary_loss_mlp": 0.01036928, + "balance_loss_clip": 1.04489708, + "balance_loss_mlp": 1.0233798, + "epoch": 0.45224710656846534, + "flos": 24145748343360.0, + "grad_norm": 5.844234702726907, + "language_loss": 0.67918158, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.70082778, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13555908, + "step": 7522, + "time_per_iteration": 2.718351364135742 + }, + { + "auxiliary_loss_clip": 0.01119523, + "auxiliary_loss_mlp": 0.01034688, + "balance_loss_clip": 1.04139209, + "balance_loss_mlp": 1.02180743, + "epoch": 0.4523072298211333, + "flos": 34478815073760.0, + "grad_norm": 1.4887516065014588, + "language_loss": 0.60747725, + "learning_rate": 2.403230783711134e-06, + "loss": 0.62901938, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12884521, + "step": 7523, + "time_per_iteration": 2.7105846405029297 + }, + { + "auxiliary_loss_clip": 0.01126634, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.04336667, + "balance_loss_mlp": 1.02408838, + "epoch": 0.45236735307380127, + "flos": 13642453404000.0, + "grad_norm": 2.2400818908125766, + "language_loss": 0.78019989, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.80184424, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 0.83251953, + "router_z_loss_mlp": 0.13702393, + "step": 7524, + "time_per_iteration": 3.950432062149048 + }, + { + "auxiliary_loss_clip": 0.0112173, + "auxiliary_loss_mlp": 0.01036293, + "balance_loss_clip": 1.04328728, + "balance_loss_mlp": 1.0238539, + "epoch": 0.45242747632646924, + "flos": 27578410048800.0, + "grad_norm": 1.6159129271804915, + "language_loss": 0.63748401, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.65906429, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12432861, + "step": 7525, + "time_per_iteration": 2.6858251094818115 + }, + { + "auxiliary_loss_clip": 0.01120161, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.04317689, + "balance_loss_mlp": 1.02240956, + "epoch": 0.45248759957913726, + "flos": 22276466726400.0, + "grad_norm": 1.542457143349266, + "language_loss": 0.79146498, + "learning_rate": 2.402086322981083e-06, + "loss": 0.81300712, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.11651611, + "step": 7526, + "time_per_iteration": 2.6090433597564697 + }, + { + "auxiliary_loss_clip": 0.01120936, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.0434432, + "balance_loss_mlp": 1.01796854, + "epoch": 0.4525477228318052, + "flos": 27394284412800.0, + "grad_norm": 1.8318445452401049, + "language_loss": 0.80713737, + "learning_rate": 2.40170480555747e-06, + "loss": 0.82865447, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12811279, + "step": 7527, + "time_per_iteration": 2.717643976211548 + }, + { + "auxiliary_loss_clip": 0.0112071, + "auxiliary_loss_mlp": 0.01027488, + "balance_loss_clip": 1.04368734, + "balance_loss_mlp": 1.01500094, + "epoch": 0.4526078460844732, + "flos": 36173249511840.0, + "grad_norm": 1.558279276776192, + "language_loss": 0.65530509, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67678708, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12487793, + "step": 7528, + "time_per_iteration": 2.7014153003692627 + }, + { + "auxiliary_loss_clip": 0.01118521, + "auxiliary_loss_mlp": 0.01030777, + "balance_loss_clip": 1.04247332, + "balance_loss_mlp": 1.01861143, + "epoch": 0.45266796933714115, + "flos": 28113283491840.0, + "grad_norm": 1.7481847724795563, + "language_loss": 0.75652814, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.7780211, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.121521, + "step": 7529, + "time_per_iteration": 2.7829511165618896 + }, + { + "auxiliary_loss_clip": 0.01122025, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.04323757, + "balance_loss_mlp": 1.01990247, + "epoch": 0.4527280925898091, + "flos": 17605126478880.0, + "grad_norm": 2.4330503195322497, + "language_loss": 0.73161668, + "learning_rate": 2.400560161948384e-06, + "loss": 0.75315648, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12060547, + "step": 7530, + "time_per_iteration": 2.657233715057373 + }, + { + "auxiliary_loss_clip": 0.01123133, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.04453301, + "balance_loss_mlp": 1.01897717, + "epoch": 0.4527882158424771, + "flos": 27974328272640.0, + "grad_norm": 1.5914697903747246, + "language_loss": 0.76237786, + "learning_rate": 2.400178583680834e-06, + "loss": 0.78391778, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.11877441, + "step": 7531, + "time_per_iteration": 2.6443099975585938 + }, + { + "auxiliary_loss_clip": 0.01119361, + "auxiliary_loss_mlp": 0.01035171, + "balance_loss_clip": 1.04363275, + "balance_loss_mlp": 1.02259445, + "epoch": 0.45284833909514505, + "flos": 31183163758080.0, + "grad_norm": 1.6673283930222014, + "language_loss": 0.67026317, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.69180846, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12579346, + "step": 7532, + "time_per_iteration": 2.7356765270233154 + }, + { + "auxiliary_loss_clip": 0.0112186, + "auxiliary_loss_mlp": 0.01036767, + "balance_loss_clip": 1.04394484, + "balance_loss_mlp": 1.02457237, + "epoch": 0.452908462347813, + "flos": 22146506343360.0, + "grad_norm": 2.1448738814645596, + "language_loss": 0.78368849, + "learning_rate": 2.399415381635768e-06, + "loss": 0.80527472, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12188721, + "step": 7533, + "time_per_iteration": 2.6778323650360107 + }, + { + "auxiliary_loss_clip": 0.01125323, + "auxiliary_loss_mlp": 0.0103415, + "balance_loss_clip": 1.04129958, + "balance_loss_mlp": 1.02035165, + "epoch": 0.452968585600481, + "flos": 23260248645120.0, + "grad_norm": 1.7035657750580158, + "language_loss": 0.83319151, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.85478628, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13818359, + "step": 7534, + "time_per_iteration": 2.679340362548828 + }, + { + "auxiliary_loss_clip": 0.01124611, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.04383397, + "balance_loss_mlp": 1.02130604, + "epoch": 0.45302870885314894, + "flos": 26907903803520.0, + "grad_norm": 1.71496463679682, + "language_loss": 0.76535869, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.78695017, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13226318, + "step": 7535, + "time_per_iteration": 2.6340765953063965 + }, + { + "auxiliary_loss_clip": 0.01119382, + "auxiliary_loss_mlp": 0.01031564, + "balance_loss_clip": 1.04242063, + "balance_loss_mlp": 1.0198102, + "epoch": 0.4530888321058169, + "flos": 24863045696640.0, + "grad_norm": 2.798837029805808, + "language_loss": 0.80425429, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82576376, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11755371, + "step": 7536, + "time_per_iteration": 2.674541711807251 + }, + { + "auxiliary_loss_clip": 0.01122875, + "auxiliary_loss_mlp": 0.0103196, + "balance_loss_clip": 1.04289269, + "balance_loss_mlp": 1.01932931, + "epoch": 0.4531489553584849, + "flos": 18095639850720.0, + "grad_norm": 1.8456859040300702, + "language_loss": 0.75819463, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.77974296, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12646484, + "step": 7537, + "time_per_iteration": 2.595047950744629 + }, + { + "auxiliary_loss_clip": 0.01126954, + "auxiliary_loss_mlp": 0.01032957, + "balance_loss_clip": 1.04683256, + "balance_loss_mlp": 1.02123916, + "epoch": 0.45320907861115284, + "flos": 26777376178560.0, + "grad_norm": 2.161740733575745, + "language_loss": 0.7575624, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.77916157, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.1171875, + "step": 7538, + "time_per_iteration": 2.707932472229004 + }, + { + "auxiliary_loss_clip": 0.0104245, + "auxiliary_loss_mlp": 0.01005039, + "balance_loss_clip": 1.01732337, + "balance_loss_mlp": 1.00361347, + "epoch": 0.45326920186382086, + "flos": 80839010888640.0, + "grad_norm": 0.7807067702137943, + "language_loss": 0.62391156, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64438641, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 0.25146484, + "router_z_loss_mlp": 0.01425171, + "step": 7539, + "time_per_iteration": 3.3011181354522705 + }, + { + "auxiliary_loss_clip": 0.01122203, + "auxiliary_loss_mlp": 0.0103743, + "balance_loss_clip": 1.04425514, + "balance_loss_mlp": 1.02515125, + "epoch": 0.4533293251164888, + "flos": 17917267668480.0, + "grad_norm": 1.699446380629724, + "language_loss": 0.65688962, + "learning_rate": 2.396743698142872e-06, + "loss": 0.67848599, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.1227417, + "step": 7540, + "time_per_iteration": 2.6528420448303223 + }, + { + "auxiliary_loss_clip": 0.0112773, + "auxiliary_loss_mlp": 0.01037722, + "balance_loss_clip": 1.04554844, + "balance_loss_mlp": 1.02444792, + "epoch": 0.4533894483691568, + "flos": 27578207462400.0, + "grad_norm": 2.1898218856603204, + "language_loss": 0.85191429, + "learning_rate": 2.396361968778424e-06, + "loss": 0.87356877, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.13269043, + "step": 7541, + "time_per_iteration": 2.745305299758911 + }, + { + "auxiliary_loss_clip": 0.01122825, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.04379821, + "balance_loss_mlp": 1.01677656, + "epoch": 0.45344957162182475, + "flos": 42405012086400.0, + "grad_norm": 2.6122789723346913, + "language_loss": 0.7680124, + "learning_rate": 2.395980224383889e-06, + "loss": 0.78952789, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.11956787, + "step": 7542, + "time_per_iteration": 2.8287405967712402 + }, + { + "auxiliary_loss_clip": 0.01120711, + "auxiliary_loss_mlp": 0.01027436, + "balance_loss_clip": 1.04211843, + "balance_loss_mlp": 1.01446009, + "epoch": 0.4535096948744927, + "flos": 28736228800800.0, + "grad_norm": 1.5128345708558208, + "language_loss": 0.80305672, + "learning_rate": 2.395598464973746e-06, + "loss": 0.82453823, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12976074, + "step": 7543, + "time_per_iteration": 2.7046570777893066 + }, + { + "auxiliary_loss_clip": 0.01126154, + "auxiliary_loss_mlp": 0.0103535, + "balance_loss_clip": 1.04570341, + "balance_loss_mlp": 1.02295828, + "epoch": 0.4535698181271607, + "flos": 31185675829440.0, + "grad_norm": 3.7182326267509462, + "language_loss": 0.75994653, + "learning_rate": 2.395216690562469e-06, + "loss": 0.78156155, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12402344, + "step": 7544, + "time_per_iteration": 2.6991653442382812 + }, + { + "auxiliary_loss_clip": 0.0112453, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.04525352, + "balance_loss_mlp": 1.02245069, + "epoch": 0.45362994137982865, + "flos": 30339512059680.0, + "grad_norm": 2.1941449292231434, + "language_loss": 0.75414371, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77572989, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.11639404, + "step": 7545, + "time_per_iteration": 2.69828200340271 + }, + { + "auxiliary_loss_clip": 0.01122863, + "auxiliary_loss_mlp": 0.01029642, + "balance_loss_clip": 1.04379821, + "balance_loss_mlp": 1.01726818, + "epoch": 0.4536900646324966, + "flos": 37591637306400.0, + "grad_norm": 1.9491086752585458, + "language_loss": 0.72538239, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74690741, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12384033, + "step": 7546, + "time_per_iteration": 2.714318037033081 + }, + { + "auxiliary_loss_clip": 0.01125435, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.04369032, + "balance_loss_mlp": 1.01594925, + "epoch": 0.4537501878851646, + "flos": 28558180756800.0, + "grad_norm": 6.854964234316971, + "language_loss": 0.75536847, + "learning_rate": 2.394071277466609e-06, + "loss": 0.77691597, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13360596, + "step": 7547, + "time_per_iteration": 2.729224681854248 + }, + { + "auxiliary_loss_clip": 0.01125232, + "auxiliary_loss_mlp": 0.01027141, + "balance_loss_clip": 1.04458189, + "balance_loss_mlp": 1.01457071, + "epoch": 0.45381031113783254, + "flos": 22145695997760.0, + "grad_norm": 2.1044834301012743, + "language_loss": 0.69710886, + "learning_rate": 2.393689443195573e-06, + "loss": 0.71863258, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.12585449, + "step": 7548, + "time_per_iteration": 2.6210920810699463 + }, + { + "auxiliary_loss_clip": 0.01120828, + "auxiliary_loss_mlp": 0.01031745, + "balance_loss_clip": 1.04177356, + "balance_loss_mlp": 1.01935887, + "epoch": 0.4538704343905005, + "flos": 30916314537120.0, + "grad_norm": 2.553160472242855, + "language_loss": 0.72583252, + "learning_rate": 2.393307593995794e-06, + "loss": 0.7473582, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12390137, + "step": 7549, + "time_per_iteration": 2.6870291233062744 + }, + { + "auxiliary_loss_clip": 0.01120843, + "auxiliary_loss_mlp": 0.01026314, + "balance_loss_clip": 1.04311633, + "balance_loss_mlp": 1.0145061, + "epoch": 0.4539305576431685, + "flos": 35058737381760.0, + "grad_norm": 2.088646614331819, + "language_loss": 0.64592195, + "learning_rate": 2.392925729881751e-06, + "loss": 0.66739351, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11810303, + "step": 7550, + "time_per_iteration": 2.6854567527770996 + }, + { + "auxiliary_loss_clip": 0.01120869, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.04480672, + "balance_loss_mlp": 1.01802993, + "epoch": 0.45399068089583644, + "flos": 27445370352480.0, + "grad_norm": 1.6506101845545678, + "language_loss": 0.68676066, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.70826805, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11834717, + "step": 7551, + "time_per_iteration": 4.132959842681885 + }, + { + "auxiliary_loss_clip": 0.01120869, + "auxiliary_loss_mlp": 0.01031108, + "balance_loss_clip": 1.04098487, + "balance_loss_mlp": 1.01860917, + "epoch": 0.45405080414850446, + "flos": 15732238824000.0, + "grad_norm": 1.9016650865270082, + "language_loss": 0.78670037, + "learning_rate": 2.392161956968798e-06, + "loss": 0.80822015, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.125, + "step": 7552, + "time_per_iteration": 2.6538145542144775 + }, + { + "auxiliary_loss_clip": 0.01043789, + "auxiliary_loss_mlp": 0.01001638, + "balance_loss_clip": 1.01894283, + "balance_loss_mlp": 1.00026655, + "epoch": 0.4541109274011724, + "flos": 72928528230240.0, + "grad_norm": 0.8205574987144199, + "language_loss": 0.57773083, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59818506, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.01373291, + "step": 7553, + "time_per_iteration": 4.682817220687866 + }, + { + "auxiliary_loss_clip": 0.01118198, + "auxiliary_loss_mlp": 0.01031154, + "balance_loss_clip": 1.04137707, + "balance_loss_mlp": 1.0198884, + "epoch": 0.4541710506538404, + "flos": 34790713159680.0, + "grad_norm": 1.3764269016883104, + "language_loss": 0.76842517, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.78991866, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11273193, + "step": 7554, + "time_per_iteration": 2.6966586112976074 + }, + { + "auxiliary_loss_clip": 0.01124898, + "auxiliary_loss_mlp": 0.01031529, + "balance_loss_clip": 1.04419935, + "balance_loss_mlp": 1.01838636, + "epoch": 0.45423117390650836, + "flos": 21567596967360.0, + "grad_norm": 2.293372364972089, + "language_loss": 0.77101558, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.79257989, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13146973, + "step": 7555, + "time_per_iteration": 2.697470188140869 + }, + { + "auxiliary_loss_clip": 0.01119766, + "auxiliary_loss_mlp": 0.01029976, + "balance_loss_clip": 1.04299235, + "balance_loss_mlp": 1.01779342, + "epoch": 0.4542912971591763, + "flos": 34254907819200.0, + "grad_norm": 1.3297279255403214, + "language_loss": 0.72472149, + "learning_rate": 2.390634232808903e-06, + "loss": 0.74621892, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12176514, + "step": 7556, + "time_per_iteration": 2.7040092945098877 + }, + { + "auxiliary_loss_clip": 0.01127061, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.04555488, + "balance_loss_mlp": 1.01936281, + "epoch": 0.4543514204118443, + "flos": 27664537085280.0, + "grad_norm": 2.2681782344261485, + "language_loss": 0.63114125, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.65273345, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.12805176, + "step": 7557, + "time_per_iteration": 2.7159552574157715 + }, + { + "auxiliary_loss_clip": 0.01045249, + "auxiliary_loss_mlp": 0.01003193, + "balance_loss_clip": 1.02031839, + "balance_loss_mlp": 1.00188649, + "epoch": 0.45441154366451225, + "flos": 71034942595680.0, + "grad_norm": 0.6760307702264372, + "language_loss": 0.57578385, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.5962683, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01307678, + "step": 7558, + "time_per_iteration": 4.609283685684204 + }, + { + "auxiliary_loss_clip": 0.0112581, + "auxiliary_loss_mlp": 0.01028601, + "balance_loss_clip": 1.04555392, + "balance_loss_mlp": 1.01519585, + "epoch": 0.4544716669171802, + "flos": 20455475356800.0, + "grad_norm": 4.762866529234361, + "language_loss": 0.56359941, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.58514351, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.1340332, + "step": 7559, + "time_per_iteration": 2.629920244216919 + }, + { + "auxiliary_loss_clip": 0.01122286, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.04473042, + "balance_loss_mlp": 1.02113199, + "epoch": 0.4545317901698482, + "flos": 19431587681280.0, + "grad_norm": 2.0943014454542292, + "language_loss": 0.71627736, + "learning_rate": 2.389106271642792e-06, + "loss": 0.73783815, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12658691, + "step": 7560, + "time_per_iteration": 2.6460745334625244 + }, + { + "auxiliary_loss_clip": 0.01128481, + "auxiliary_loss_mlp": 0.01030738, + "balance_loss_clip": 1.0461688, + "balance_loss_mlp": 1.01811945, + "epoch": 0.45459191342251615, + "flos": 21523804138080.0, + "grad_norm": 2.255693269635447, + "language_loss": 0.6867944, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.70838666, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.12609863, + "step": 7561, + "time_per_iteration": 2.634756088256836 + }, + { + "auxiliary_loss_clip": 0.01117886, + "auxiliary_loss_mlp": 0.01027533, + "balance_loss_clip": 1.04238343, + "balance_loss_mlp": 1.01644075, + "epoch": 0.4546520366751841, + "flos": 19738704728160.0, + "grad_norm": 2.729468165721827, + "language_loss": 0.85131019, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.87276435, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11102295, + "step": 7562, + "time_per_iteration": 2.6799800395965576 + }, + { + "auxiliary_loss_clip": 0.01119772, + "auxiliary_loss_mlp": 0.01031929, + "balance_loss_clip": 1.04405427, + "balance_loss_mlp": 1.01990032, + "epoch": 0.4547121599278521, + "flos": 24101064133920.0, + "grad_norm": 2.533003494998053, + "language_loss": 0.89806348, + "learning_rate": 2.38796014579055e-06, + "loss": 0.91958046, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12036133, + "step": 7563, + "time_per_iteration": 3.9542248249053955 + }, + { + "auxiliary_loss_clip": 0.01121267, + "auxiliary_loss_mlp": 0.01035555, + "balance_loss_clip": 1.04277527, + "balance_loss_mlp": 1.02231693, + "epoch": 0.45477228318052004, + "flos": 24327726563520.0, + "grad_norm": 2.1366997073990057, + "language_loss": 0.71608412, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.7376523, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.13244629, + "step": 7564, + "time_per_iteration": 2.630126953125 + }, + { + "auxiliary_loss_clip": 0.01122852, + "auxiliary_loss_mlp": 0.01035192, + "balance_loss_clip": 1.04341793, + "balance_loss_mlp": 1.02265096, + "epoch": 0.454832406433188, + "flos": 25976058687360.0, + "grad_norm": 3.027152033803579, + "language_loss": 0.68270957, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.70428997, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12542725, + "step": 7565, + "time_per_iteration": 2.7059109210968018 + }, + { + "auxiliary_loss_clip": 0.01118977, + "auxiliary_loss_mlp": 0.01028155, + "balance_loss_clip": 1.04180837, + "balance_loss_mlp": 1.0161624, + "epoch": 0.45489252968585603, + "flos": 29893682897280.0, + "grad_norm": 1.7212735243873911, + "language_loss": 0.80104721, + "learning_rate": 2.386813887534922e-06, + "loss": 0.82251847, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11999512, + "step": 7566, + "time_per_iteration": 2.6657044887542725 + }, + { + "auxiliary_loss_clip": 0.01123457, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.04298508, + "balance_loss_mlp": 1.01533067, + "epoch": 0.454952652938524, + "flos": 20855972033280.0, + "grad_norm": 1.6521988206371832, + "language_loss": 0.73683369, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.75835335, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13189697, + "step": 7567, + "time_per_iteration": 2.6635780334472656 + }, + { + "auxiliary_loss_clip": 0.01124793, + "auxiliary_loss_mlp": 0.01038762, + "balance_loss_clip": 1.04426372, + "balance_loss_mlp": 1.02595329, + "epoch": 0.45501277619119196, + "flos": 33715739544480.0, + "grad_norm": 1.5887604302209115, + "language_loss": 0.81220961, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83384514, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12817383, + "step": 7568, + "time_per_iteration": 2.694683074951172 + }, + { + "auxiliary_loss_clip": 0.01127426, + "auxiliary_loss_mlp": 0.01039993, + "balance_loss_clip": 1.04443264, + "balance_loss_mlp": 1.0262661, + "epoch": 0.4550728994438599, + "flos": 24373059049440.0, + "grad_norm": 2.7896650680215687, + "language_loss": 0.79951346, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.82118767, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13726807, + "step": 7569, + "time_per_iteration": 2.681159019470215 + }, + { + "auxiliary_loss_clip": 0.01128046, + "auxiliary_loss_mlp": 0.0103148, + "balance_loss_clip": 1.04596734, + "balance_loss_mlp": 1.01812899, + "epoch": 0.4551330226965279, + "flos": 31807324585440.0, + "grad_norm": 1.4911843443549102, + "language_loss": 0.75159842, + "learning_rate": 2.385285337909412e-06, + "loss": 0.77319372, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13354492, + "step": 7570, + "time_per_iteration": 2.6757330894470215 + }, + { + "auxiliary_loss_clip": 0.01122408, + "auxiliary_loss_mlp": 0.01036288, + "balance_loss_clip": 1.04554784, + "balance_loss_mlp": 1.0236938, + "epoch": 0.45519314594919585, + "flos": 40000451853600.0, + "grad_norm": 1.6791404910102457, + "language_loss": 0.74987572, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.77146268, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12591553, + "step": 7571, + "time_per_iteration": 2.786987066268921 + }, + { + "auxiliary_loss_clip": 0.0111921, + "auxiliary_loss_mlp": 0.01030816, + "balance_loss_clip": 1.04348731, + "balance_loss_mlp": 1.0188235, + "epoch": 0.4552532692018638, + "flos": 23394341790720.0, + "grad_norm": 1.4832554026590654, + "language_loss": 0.81266564, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83416587, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11993408, + "step": 7572, + "time_per_iteration": 2.664794683456421 + }, + { + "auxiliary_loss_clip": 0.01126955, + "auxiliary_loss_mlp": 0.01038778, + "balance_loss_clip": 1.04485369, + "balance_loss_mlp": 1.0250628, + "epoch": 0.4553133924545318, + "flos": 31759155889920.0, + "grad_norm": 1.9730328166489257, + "language_loss": 0.73159754, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.75325477, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.137146, + "step": 7573, + "time_per_iteration": 2.8446149826049805 + }, + { + "auxiliary_loss_clip": 0.01127417, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.04538643, + "balance_loss_mlp": 1.01944494, + "epoch": 0.45537351570719975, + "flos": 37414318573440.0, + "grad_norm": 2.0590054669183786, + "language_loss": 0.75035763, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.77197015, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.14379883, + "step": 7574, + "time_per_iteration": 2.7276573181152344 + }, + { + "auxiliary_loss_clip": 0.01124997, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.04409099, + "balance_loss_mlp": 1.01985407, + "epoch": 0.4554336389598677, + "flos": 29716283129760.0, + "grad_norm": 2.0704011781635785, + "language_loss": 0.71263254, + "learning_rate": 2.383374322259915e-06, + "loss": 0.73420835, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.12731934, + "step": 7575, + "time_per_iteration": 2.7631516456604004 + }, + { + "auxiliary_loss_clip": 0.0112211, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.04302454, + "balance_loss_mlp": 1.01958513, + "epoch": 0.4554937622125357, + "flos": 25084846052640.0, + "grad_norm": 3.7677086594087035, + "language_loss": 0.72776651, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.7493155, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13208008, + "step": 7576, + "time_per_iteration": 2.7004611492156982 + }, + { + "auxiliary_loss_clip": 0.0112118, + "auxiliary_loss_mlp": 0.01035365, + "balance_loss_clip": 1.04347515, + "balance_loss_mlp": 1.02226412, + "epoch": 0.45555388546520365, + "flos": 27846069615360.0, + "grad_norm": 1.8032067852189915, + "language_loss": 0.66395026, + "learning_rate": 2.382609814135511e-06, + "loss": 0.68551576, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13116455, + "step": 7577, + "time_per_iteration": 2.7133007049560547 + }, + { + "auxiliary_loss_clip": 0.01126271, + "auxiliary_loss_mlp": 0.01041595, + "balance_loss_clip": 1.04628491, + "balance_loss_mlp": 1.02699828, + "epoch": 0.4556140087178716, + "flos": 26510162302080.0, + "grad_norm": 2.1664929259452648, + "language_loss": 0.75050431, + "learning_rate": 2.382227538303157e-06, + "loss": 0.77218294, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.14587402, + "step": 7578, + "time_per_iteration": 2.6771950721740723 + }, + { + "auxiliary_loss_clip": 0.01123958, + "auxiliary_loss_mlp": 0.01030104, + "balance_loss_clip": 1.04480004, + "balance_loss_mlp": 1.01771235, + "epoch": 0.45567413197053963, + "flos": 31718969098560.0, + "grad_norm": 5.04844272730953, + "language_loss": 0.70048636, + "learning_rate": 2.381845247976697e-06, + "loss": 0.72202694, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.1239624, + "step": 7579, + "time_per_iteration": 2.673499822616577 + }, + { + "auxiliary_loss_clip": 0.01121251, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.0439384, + "balance_loss_mlp": 1.02233934, + "epoch": 0.4557342552232076, + "flos": 26280825732000.0, + "grad_norm": 1.6609280498763475, + "language_loss": 0.784477, + "learning_rate": 2.381462943170627e-06, + "loss": 0.80603719, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12438965, + "step": 7580, + "time_per_iteration": 2.7261147499084473 + }, + { + "auxiliary_loss_clip": 0.01123369, + "auxiliary_loss_mlp": 0.01030989, + "balance_loss_clip": 1.0449928, + "balance_loss_mlp": 1.01826382, + "epoch": 0.45579437847587556, + "flos": 48812229599040.0, + "grad_norm": 1.6960701875969786, + "language_loss": 0.68769717, + "learning_rate": 2.381080623899444e-06, + "loss": 0.70924073, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.1272583, + "step": 7581, + "time_per_iteration": 2.7992496490478516 + }, + { + "auxiliary_loss_clip": 0.0111854, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.04130983, + "balance_loss_mlp": 1.01877499, + "epoch": 0.4558545017285435, + "flos": 37996104676320.0, + "grad_norm": 1.6789483346153127, + "language_loss": 0.7372998, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75879991, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12689209, + "step": 7582, + "time_per_iteration": 2.758409261703491 + }, + { + "auxiliary_loss_clip": 0.0112759, + "auxiliary_loss_mlp": 0.01045442, + "balance_loss_clip": 1.04635561, + "balance_loss_mlp": 1.03168547, + "epoch": 0.4559146249812115, + "flos": 26509919198400.0, + "grad_norm": 2.3066862563504977, + "language_loss": 0.72745526, + "learning_rate": 2.380315942019729e-06, + "loss": 0.74918556, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13745117, + "step": 7583, + "time_per_iteration": 2.762964963912964 + }, + { + "auxiliary_loss_clip": 0.01126699, + "auxiliary_loss_mlp": 0.01033333, + "balance_loss_clip": 1.04360986, + "balance_loss_mlp": 1.02019644, + "epoch": 0.45597474823387946, + "flos": 29048288955840.0, + "grad_norm": 1.754292491296048, + "language_loss": 0.72643596, + "learning_rate": 2.379933579440195e-06, + "loss": 0.74803632, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.13122559, + "step": 7584, + "time_per_iteration": 2.732895851135254 + }, + { + "auxiliary_loss_clip": 0.01122267, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.04410768, + "balance_loss_mlp": 1.01928556, + "epoch": 0.4560348714865474, + "flos": 38932163589600.0, + "grad_norm": 7.517323936522329, + "language_loss": 0.68029118, + "learning_rate": 2.379551202453541e-06, + "loss": 0.70183694, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.13043213, + "step": 7585, + "time_per_iteration": 2.813086986541748 + }, + { + "auxiliary_loss_clip": 0.01123073, + "auxiliary_loss_mlp": 0.01029768, + "balance_loss_clip": 1.04425633, + "balance_loss_mlp": 1.01757932, + "epoch": 0.4560949947392154, + "flos": 26904621903840.0, + "grad_norm": 1.5710605250426448, + "language_loss": 0.76367855, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78520691, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12188721, + "step": 7586, + "time_per_iteration": 2.687694787979126 + }, + { + "auxiliary_loss_clip": 0.01120952, + "auxiliary_loss_mlp": 0.01027815, + "balance_loss_clip": 1.04366517, + "balance_loss_mlp": 1.01603699, + "epoch": 0.45615511799188335, + "flos": 29982848729760.0, + "grad_norm": 1.707705701801608, + "language_loss": 0.78205836, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80354595, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.11773682, + "step": 7587, + "time_per_iteration": 2.753190040588379 + }, + { + "auxiliary_loss_clip": 0.01128174, + "auxiliary_loss_mlp": 0.01035878, + "balance_loss_clip": 1.04395914, + "balance_loss_mlp": 1.02294946, + "epoch": 0.4562152412445513, + "flos": 22367172215520.0, + "grad_norm": 2.4333129515565073, + "language_loss": 0.69117105, + "learning_rate": 2.378403985195863e-06, + "loss": 0.71281159, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.12927246, + "step": 7588, + "time_per_iteration": 2.6692276000976562 + }, + { + "auxiliary_loss_clip": 0.01120995, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.04342055, + "balance_loss_mlp": 1.02149463, + "epoch": 0.4562753644972193, + "flos": 16492356591840.0, + "grad_norm": 1.8028757826371618, + "language_loss": 0.79311609, + "learning_rate": 2.378021550725735e-06, + "loss": 0.81466532, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12445068, + "step": 7589, + "time_per_iteration": 2.671755313873291 + }, + { + "auxiliary_loss_clip": 0.01122823, + "auxiliary_loss_mlp": 0.01034315, + "balance_loss_clip": 1.04386759, + "balance_loss_mlp": 1.02175593, + "epoch": 0.45633548774988725, + "flos": 36167415023520.0, + "grad_norm": 2.5137719911195995, + "language_loss": 0.62307143, + "learning_rate": 2.377639101920992e-06, + "loss": 0.64464277, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12567139, + "step": 7590, + "time_per_iteration": 2.6634714603424072 + }, + { + "auxiliary_loss_clip": 0.01122221, + "auxiliary_loss_mlp": 0.01037374, + "balance_loss_clip": 1.04269898, + "balance_loss_mlp": 1.02545929, + "epoch": 0.4563956110025552, + "flos": 27129663642240.0, + "grad_norm": 1.850780229377361, + "language_loss": 0.73045671, + "learning_rate": 2.377256638796135e-06, + "loss": 0.75205266, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.11914062, + "step": 7591, + "time_per_iteration": 3.909592390060425 + }, + { + "auxiliary_loss_clip": 0.01126578, + "auxiliary_loss_mlp": 0.01043873, + "balance_loss_clip": 1.04604709, + "balance_loss_mlp": 1.03000283, + "epoch": 0.45645573425522323, + "flos": 20855485825920.0, + "grad_norm": 2.169527110521689, + "language_loss": 0.77069217, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.79239666, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13867188, + "step": 7592, + "time_per_iteration": 4.0881125926971436 + }, + { + "auxiliary_loss_clip": 0.01121199, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.04193568, + "balance_loss_mlp": 1.02097964, + "epoch": 0.4565158575078912, + "flos": 24810176996640.0, + "grad_norm": 2.4838376497543067, + "language_loss": 0.69450909, + "learning_rate": 2.376491669644098e-06, + "loss": 0.71605641, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12554932, + "step": 7593, + "time_per_iteration": 2.629844903945923 + }, + { + "auxiliary_loss_clip": 0.01115663, + "auxiliary_loss_mlp": 0.01027811, + "balance_loss_clip": 1.04059768, + "balance_loss_mlp": 1.01649189, + "epoch": 0.45657598076055916, + "flos": 29264822065440.0, + "grad_norm": 2.083953208293509, + "language_loss": 0.84019268, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.8616274, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11315918, + "step": 7594, + "time_per_iteration": 2.645853042602539 + }, + { + "auxiliary_loss_clip": 0.01046081, + "auxiliary_loss_mlp": 0.0100112, + "balance_loss_clip": 1.02110183, + "balance_loss_mlp": 0.99981093, + "epoch": 0.45663610401322713, + "flos": 84639836481120.0, + "grad_norm": 0.794089162226116, + "language_loss": 0.52719873, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54767072, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 0.24975586, + "router_z_loss_mlp": 0.01309967, + "step": 7595, + "time_per_iteration": 3.29195499420166 + }, + { + "auxiliary_loss_clip": 0.01127686, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.04485679, + "balance_loss_mlp": 1.01799965, + "epoch": 0.4566962272658951, + "flos": 18485399448000.0, + "grad_norm": 4.767289559794493, + "language_loss": 0.87547421, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.89706504, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13421631, + "step": 7596, + "time_per_iteration": 2.5759708881378174 + }, + { + "auxiliary_loss_clip": 0.01126423, + "auxiliary_loss_mlp": 0.01036789, + "balance_loss_clip": 1.04628229, + "balance_loss_mlp": 1.02469492, + "epoch": 0.45675635051856306, + "flos": 22814541034560.0, + "grad_norm": 1.9038245275561545, + "language_loss": 0.77295214, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79458427, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12097168, + "step": 7597, + "time_per_iteration": 2.6865313053131104 + }, + { + "auxiliary_loss_clip": 0.01124196, + "auxiliary_loss_mlp": 0.01031088, + "balance_loss_clip": 1.04435885, + "balance_loss_mlp": 1.0183382, + "epoch": 0.456816473771231, + "flos": 23303717336160.0, + "grad_norm": 2.943563642412706, + "language_loss": 0.78464693, + "learning_rate": 2.374578997177314e-06, + "loss": 0.80619979, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12762451, + "step": 7598, + "time_per_iteration": 2.838980197906494 + }, + { + "auxiliary_loss_clip": 0.01122992, + "auxiliary_loss_mlp": 0.01027557, + "balance_loss_clip": 1.04464459, + "balance_loss_mlp": 1.01552868, + "epoch": 0.456876597023899, + "flos": 35325546085440.0, + "grad_norm": 2.675997863103062, + "language_loss": 0.71506172, + "learning_rate": 2.374196420013712e-06, + "loss": 0.7365672, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12023926, + "step": 7599, + "time_per_iteration": 4.1495466232299805 + }, + { + "auxiliary_loss_clip": 0.0111871, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.04114461, + "balance_loss_mlp": 1.02036059, + "epoch": 0.45693672027656695, + "flos": 28418536743840.0, + "grad_norm": 2.2128672513271637, + "language_loss": 0.69790024, + "learning_rate": 2.373813828660544e-06, + "loss": 0.71941674, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12573242, + "step": 7600, + "time_per_iteration": 2.6753838062286377 + }, + { + "auxiliary_loss_clip": 0.01121637, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.0435257, + "balance_loss_mlp": 1.0212971, + "epoch": 0.4569968435292349, + "flos": 25085170190880.0, + "grad_norm": 2.1472701626005457, + "language_loss": 0.78347379, + "learning_rate": 2.373431223132319e-06, + "loss": 0.80502093, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.11773682, + "step": 7601, + "time_per_iteration": 2.6336886882781982 + }, + { + "auxiliary_loss_clip": 0.01124022, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.04349017, + "balance_loss_mlp": 1.01977122, + "epoch": 0.4570569667819029, + "flos": 50374637272800.0, + "grad_norm": 2.1213324982995667, + "language_loss": 0.71369505, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.73525465, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12176514, + "step": 7602, + "time_per_iteration": 2.844564437866211 + }, + { + "auxiliary_loss_clip": 0.01123731, + "auxiliary_loss_mlp": 0.01028382, + "balance_loss_clip": 1.04408252, + "balance_loss_mlp": 1.01441705, + "epoch": 0.45711709003457085, + "flos": 31764139515360.0, + "grad_norm": 1.7197533638755804, + "language_loss": 0.72858143, + "learning_rate": 2.372665969608729e-06, + "loss": 0.75010252, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.13964844, + "step": 7603, + "time_per_iteration": 3.979126214981079 + }, + { + "auxiliary_loss_clip": 0.01123867, + "auxiliary_loss_mlp": 0.01036458, + "balance_loss_clip": 1.04406404, + "balance_loss_mlp": 1.02265394, + "epoch": 0.4571772132872388, + "flos": 27037945221120.0, + "grad_norm": 1.7437721019057073, + "language_loss": 0.83054411, + "learning_rate": 2.372283321642383e-06, + "loss": 0.85214734, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13818359, + "step": 7604, + "time_per_iteration": 2.6379568576812744 + }, + { + "auxiliary_loss_clip": 0.0113063, + "auxiliary_loss_mlp": 0.01037736, + "balance_loss_clip": 1.04710841, + "balance_loss_mlp": 1.02390814, + "epoch": 0.45723733653990684, + "flos": 29137252201920.0, + "grad_norm": 1.8100677440510387, + "language_loss": 0.85547173, + "learning_rate": 2.371900659559016e-06, + "loss": 0.87715542, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13842773, + "step": 7605, + "time_per_iteration": 2.684962511062622 + }, + { + "auxiliary_loss_clip": 0.01125332, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.04301238, + "balance_loss_mlp": 1.02139723, + "epoch": 0.4572974597925748, + "flos": 20586853844640.0, + "grad_norm": 2.347054226073421, + "language_loss": 0.73227644, + "learning_rate": 2.371517983373138e-06, + "loss": 0.75387657, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13262939, + "step": 7606, + "time_per_iteration": 2.6364328861236572 + }, + { + "auxiliary_loss_clip": 0.0112485, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.04395747, + "balance_loss_mlp": 1.02179921, + "epoch": 0.45735758304524277, + "flos": 16803525366720.0, + "grad_norm": 2.760154033676832, + "language_loss": 0.8017754, + "learning_rate": 2.371135293099262e-06, + "loss": 0.82337558, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13366699, + "step": 7607, + "time_per_iteration": 2.644473075866699 + }, + { + "auxiliary_loss_clip": 0.01127085, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.04747033, + "balance_loss_mlp": 1.02458656, + "epoch": 0.45741770629791073, + "flos": 25747654014720.0, + "grad_norm": 2.384767985677861, + "language_loss": 0.8097446, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.83139235, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.13092041, + "step": 7608, + "time_per_iteration": 2.6743266582489014 + }, + { + "auxiliary_loss_clip": 0.01122561, + "auxiliary_loss_mlp": 0.01037926, + "balance_loss_clip": 1.04290462, + "balance_loss_mlp": 1.02377558, + "epoch": 0.4574778295505787, + "flos": 28202530358880.0, + "grad_norm": 1.9530752796944155, + "language_loss": 0.68433392, + "learning_rate": 2.370369870345559e-06, + "loss": 0.70593882, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.14147949, + "step": 7609, + "time_per_iteration": 2.7270498275756836 + }, + { + "auxiliary_loss_clip": 0.01124215, + "auxiliary_loss_mlp": 0.01035494, + "balance_loss_clip": 1.04463422, + "balance_loss_mlp": 1.02229166, + "epoch": 0.45753795280324666, + "flos": 29715756405120.0, + "grad_norm": 1.994382698082489, + "language_loss": 0.80935323, + "learning_rate": 2.369987137894757e-06, + "loss": 0.83095032, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13201904, + "step": 7610, + "time_per_iteration": 2.6831655502319336 + }, + { + "auxiliary_loss_clip": 0.01124563, + "auxiliary_loss_mlp": 0.01035676, + "balance_loss_clip": 1.04301631, + "balance_loss_mlp": 1.02199042, + "epoch": 0.4575980760559146, + "flos": 20320733934720.0, + "grad_norm": 3.270612861747667, + "language_loss": 0.82429773, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.84590012, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13671875, + "step": 7611, + "time_per_iteration": 2.7261407375335693 + }, + { + "auxiliary_loss_clip": 0.01127217, + "auxiliary_loss_mlp": 0.01027127, + "balance_loss_clip": 1.04685032, + "balance_loss_mlp": 1.01360226, + "epoch": 0.4576581993085826, + "flos": 43821009361440.0, + "grad_norm": 2.9551160436041153, + "language_loss": 0.73745418, + "learning_rate": 2.369221630917819e-06, + "loss": 0.75899762, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13525391, + "step": 7612, + "time_per_iteration": 2.7555651664733887 + }, + { + "auxiliary_loss_clip": 0.01123094, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.04276431, + "balance_loss_mlp": 1.01960874, + "epoch": 0.45771832256125056, + "flos": 24502452190560.0, + "grad_norm": 3.20839516055077, + "language_loss": 0.84782439, + "learning_rate": 2.368838856420711e-06, + "loss": 0.8693825, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13104248, + "step": 7613, + "time_per_iteration": 2.6687002182006836 + }, + { + "auxiliary_loss_clip": 0.01124031, + "auxiliary_loss_mlp": 0.0103394, + "balance_loss_clip": 1.04420376, + "balance_loss_mlp": 1.0206362, + "epoch": 0.4577784458139185, + "flos": 13109443755840.0, + "grad_norm": 2.6879532140729845, + "language_loss": 0.7564435, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.77802324, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13305664, + "step": 7614, + "time_per_iteration": 2.6151583194732666 + }, + { + "auxiliary_loss_clip": 0.01121072, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.04324019, + "balance_loss_mlp": 1.02016413, + "epoch": 0.4578385690665865, + "flos": 26731476450720.0, + "grad_norm": 1.6037365848751928, + "language_loss": 0.75108039, + "learning_rate": 2.368073265481791e-06, + "loss": 0.77261978, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12695312, + "step": 7615, + "time_per_iteration": 2.7124435901641846 + }, + { + "auxiliary_loss_clip": 0.01049677, + "auxiliary_loss_mlp": 0.01000874, + "balance_loss_clip": 1.02466512, + "balance_loss_mlp": 0.99961448, + "epoch": 0.45789869231925445, + "flos": 79020039032640.0, + "grad_norm": 0.7776251398781432, + "language_loss": 0.57625115, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.5967567, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 0.24975586, + "router_z_loss_mlp": 0.01259613, + "step": 7616, + "time_per_iteration": 3.214707374572754 + }, + { + "auxiliary_loss_clip": 0.01122503, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.04319513, + "balance_loss_mlp": 1.02359164, + "epoch": 0.4579588155719224, + "flos": 19698558454080.0, + "grad_norm": 2.2538683232689443, + "language_loss": 0.71311331, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.73470521, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13104248, + "step": 7617, + "time_per_iteration": 2.626408576965332 + }, + { + "auxiliary_loss_clip": 0.01126494, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.04596794, + "balance_loss_mlp": 1.01835287, + "epoch": 0.45801893882459044, + "flos": 26107113036960.0, + "grad_norm": 1.8895146737770523, + "language_loss": 0.75994015, + "learning_rate": 2.36692477442939e-06, + "loss": 0.78151822, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12969971, + "step": 7618, + "time_per_iteration": 2.6654207706451416 + }, + { + "auxiliary_loss_clip": 0.01129318, + "auxiliary_loss_mlp": 0.01036692, + "balance_loss_clip": 1.04748559, + "balance_loss_mlp": 1.02434802, + "epoch": 0.4580790620772584, + "flos": 23838469227360.0, + "grad_norm": 2.406102285659623, + "language_loss": 0.76752585, + "learning_rate": 2.366541916231585e-06, + "loss": 0.78918594, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.12347412, + "step": 7619, + "time_per_iteration": 2.6547982692718506 + }, + { + "auxiliary_loss_clip": 0.01122752, + "auxiliary_loss_mlp": 0.01035831, + "balance_loss_clip": 1.04537451, + "balance_loss_mlp": 1.02383232, + "epoch": 0.45813918532992637, + "flos": 20231973275040.0, + "grad_norm": 1.9351133821930606, + "language_loss": 0.71633577, + "learning_rate": 2.366159044134473e-06, + "loss": 0.7379216, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12011719, + "step": 7620, + "time_per_iteration": 2.619065046310425 + }, + { + "auxiliary_loss_clip": 0.01121113, + "auxiliary_loss_mlp": 0.01030755, + "balance_loss_clip": 1.04330873, + "balance_loss_mlp": 1.01828551, + "epoch": 0.45819930858259433, + "flos": 51531159471840.0, + "grad_norm": 1.6310251736144696, + "language_loss": 0.78231114, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.80382979, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12463379, + "step": 7621, + "time_per_iteration": 2.8757495880126953 + }, + { + "auxiliary_loss_clip": 0.01052954, + "auxiliary_loss_mlp": 0.01005666, + "balance_loss_clip": 1.02769232, + "balance_loss_mlp": 1.00434244, + "epoch": 0.4582594318352623, + "flos": 77745346979040.0, + "grad_norm": 0.7879922336900442, + "language_loss": 0.64992613, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67051232, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.25292969, + "router_z_loss_mlp": 0.01324463, + "step": 7622, + "time_per_iteration": 3.268710136413574 + }, + { + "auxiliary_loss_clip": 0.01125415, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.04494572, + "balance_loss_mlp": 1.01928425, + "epoch": 0.45831955508793026, + "flos": 32787297879840.0, + "grad_norm": 1.9950623084773795, + "language_loss": 0.79758608, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.81917381, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.14074707, + "step": 7623, + "time_per_iteration": 2.7090022563934326 + }, + { + "auxiliary_loss_clip": 0.01128922, + "auxiliary_loss_mlp": 0.01034251, + "balance_loss_clip": 1.04624033, + "balance_loss_mlp": 1.02160335, + "epoch": 0.45837967834059823, + "flos": 22859063174880.0, + "grad_norm": 1.9697539634431995, + "language_loss": 0.70126927, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.72290099, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.12652588, + "step": 7624, + "time_per_iteration": 2.6375341415405273 + }, + { + "auxiliary_loss_clip": 0.01123925, + "auxiliary_loss_mlp": 0.0103616, + "balance_loss_clip": 1.04258907, + "balance_loss_mlp": 1.02356601, + "epoch": 0.4584398015932662, + "flos": 25841641403520.0, + "grad_norm": 2.8222983649095243, + "language_loss": 0.72851133, + "learning_rate": 2.364244475667491e-06, + "loss": 0.75011218, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.1260376, + "step": 7625, + "time_per_iteration": 2.7442541122436523 + }, + { + "auxiliary_loss_clip": 0.01127691, + "auxiliary_loss_mlp": 0.01035781, + "balance_loss_clip": 1.04690027, + "balance_loss_mlp": 1.02355003, + "epoch": 0.45849992484593416, + "flos": 24147976793760.0, + "grad_norm": 2.0041085953661137, + "language_loss": 0.78260052, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80423522, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.12237549, + "step": 7626, + "time_per_iteration": 2.6481268405914307 + }, + { + "auxiliary_loss_clip": 0.01130117, + "auxiliary_loss_mlp": 0.01041339, + "balance_loss_clip": 1.04704189, + "balance_loss_mlp": 1.02789211, + "epoch": 0.4585600480986021, + "flos": 22235874762240.0, + "grad_norm": 1.9170936949668629, + "language_loss": 0.84639096, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.86810553, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.13452148, + "step": 7627, + "time_per_iteration": 2.6536436080932617 + }, + { + "auxiliary_loss_clip": 0.0113074, + "auxiliary_loss_mlp": 0.01037584, + "balance_loss_clip": 1.04639769, + "balance_loss_mlp": 1.02441776, + "epoch": 0.4586201713512701, + "flos": 35413658468640.0, + "grad_norm": 1.627826379571658, + "language_loss": 0.69073486, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71241808, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.1315918, + "step": 7628, + "time_per_iteration": 2.7113144397735596 + }, + { + "auxiliary_loss_clip": 0.01123396, + "auxiliary_loss_mlp": 0.01030224, + "balance_loss_clip": 1.04434443, + "balance_loss_mlp": 1.01693797, + "epoch": 0.45868029460393805, + "flos": 28558504895040.0, + "grad_norm": 2.17963714536835, + "language_loss": 0.78396779, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.80550396, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13275146, + "step": 7629, + "time_per_iteration": 2.683535099029541 + }, + { + "auxiliary_loss_clip": 0.01130694, + "auxiliary_loss_mlp": 0.01040153, + "balance_loss_clip": 1.04533339, + "balance_loss_mlp": 1.02606273, + "epoch": 0.458740417856606, + "flos": 22234416140160.0, + "grad_norm": 2.6430224094388888, + "language_loss": 0.79473674, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.81644523, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.14093018, + "step": 7630, + "time_per_iteration": 4.081950902938843 + }, + { + "auxiliary_loss_clip": 0.01129343, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.04464304, + "balance_loss_mlp": 1.02012205, + "epoch": 0.458800541109274, + "flos": 42181915177440.0, + "grad_norm": 2.196997898417026, + "language_loss": 0.71873629, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.74036026, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.12945557, + "step": 7631, + "time_per_iteration": 4.148262977600098 + }, + { + "auxiliary_loss_clip": 0.01131598, + "auxiliary_loss_mlp": 0.01039898, + "balance_loss_clip": 1.04903388, + "balance_loss_mlp": 1.02674317, + "epoch": 0.458860664361942, + "flos": 21612159624960.0, + "grad_norm": 2.299270670375495, + "language_loss": 0.71385825, + "learning_rate": 2.361563500108531e-06, + "loss": 0.73557317, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.13153076, + "step": 7632, + "time_per_iteration": 2.695683240890503 + }, + { + "auxiliary_loss_clip": 0.01131463, + "auxiliary_loss_mlp": 0.01031672, + "balance_loss_clip": 1.04673719, + "balance_loss_mlp": 1.01705742, + "epoch": 0.45892078761460997, + "flos": 22502683465920.0, + "grad_norm": 2.5791581558233516, + "language_loss": 0.69169062, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.71332192, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 0.84716797, + "router_z_loss_mlp": 0.1463623, + "step": 7633, + "time_per_iteration": 2.687922716140747 + }, + { + "auxiliary_loss_clip": 0.01128083, + "auxiliary_loss_mlp": 0.01041871, + "balance_loss_clip": 1.04704595, + "balance_loss_mlp": 1.02844167, + "epoch": 0.45898091086727794, + "flos": 27665874155520.0, + "grad_norm": 1.5175645444119947, + "language_loss": 0.80891728, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.83061683, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.13433838, + "step": 7634, + "time_per_iteration": 2.6651434898376465 + }, + { + "auxiliary_loss_clip": 0.01132159, + "auxiliary_loss_mlp": 0.01037462, + "balance_loss_clip": 1.04685378, + "balance_loss_mlp": 1.02318037, + "epoch": 0.4590410341199459, + "flos": 26421036986880.0, + "grad_norm": 1.880743213534732, + "language_loss": 0.81538594, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.83708215, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 0.85351562, + "router_z_loss_mlp": 0.14282227, + "step": 7635, + "time_per_iteration": 2.681865930557251 + }, + { + "auxiliary_loss_clip": 0.01127321, + "auxiliary_loss_mlp": 0.01041877, + "balance_loss_clip": 1.04807305, + "balance_loss_mlp": 1.02836418, + "epoch": 0.45910115737261387, + "flos": 44582059026720.0, + "grad_norm": 1.8222783577420363, + "language_loss": 0.65130591, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.67299783, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13513184, + "step": 7636, + "time_per_iteration": 2.7882344722747803 + }, + { + "auxiliary_loss_clip": 0.01127708, + "auxiliary_loss_mlp": 0.01029933, + "balance_loss_clip": 1.04869497, + "balance_loss_mlp": 1.01676071, + "epoch": 0.45916128062528183, + "flos": 29797993782720.0, + "grad_norm": 1.6852575710464905, + "language_loss": 0.80248564, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82406205, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13165283, + "step": 7637, + "time_per_iteration": 2.681434154510498 + }, + { + "auxiliary_loss_clip": 0.01134701, + "auxiliary_loss_mlp": 0.01037773, + "balance_loss_clip": 1.05086684, + "balance_loss_mlp": 1.02382565, + "epoch": 0.4592214038779498, + "flos": 28335651089760.0, + "grad_norm": 2.2776910160772808, + "language_loss": 0.75120771, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.77293247, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.1394043, + "step": 7638, + "time_per_iteration": 4.18053126335144 + }, + { + "auxiliary_loss_clip": 0.01126843, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.04797375, + "balance_loss_mlp": 1.02207196, + "epoch": 0.45928152713061776, + "flos": 23393734031520.0, + "grad_norm": 1.6610012864287647, + "language_loss": 0.73759854, + "learning_rate": 2.358881852733989e-06, + "loss": 0.75921291, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12524414, + "step": 7639, + "time_per_iteration": 2.663421630859375 + }, + { + "auxiliary_loss_clip": 0.0113004, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.04740596, + "balance_loss_mlp": 1.0221591, + "epoch": 0.4593416503832857, + "flos": 27350977790880.0, + "grad_norm": 1.6679023867257807, + "language_loss": 0.68141824, + "learning_rate": 2.358498705700346e-06, + "loss": 0.70307422, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13409424, + "step": 7640, + "time_per_iteration": 2.668623924255371 + }, + { + "auxiliary_loss_clip": 0.01129489, + "auxiliary_loss_mlp": 0.01035261, + "balance_loss_clip": 1.0455749, + "balance_loss_mlp": 1.02214837, + "epoch": 0.4594017736359537, + "flos": 23037516391680.0, + "grad_norm": 1.636688628708122, + "language_loss": 0.75262499, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.77427244, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.13122559, + "step": 7641, + "time_per_iteration": 2.6143710613250732 + }, + { + "auxiliary_loss_clip": 0.0112884, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.0460856, + "balance_loss_mlp": 1.01778197, + "epoch": 0.45946189688862166, + "flos": 25037082529920.0, + "grad_norm": 1.7326353144647868, + "language_loss": 0.74842238, + "learning_rate": 2.357732370864668e-06, + "loss": 0.77002788, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13928223, + "step": 7642, + "time_per_iteration": 4.100606441497803 + }, + { + "auxiliary_loss_clip": 0.01049833, + "auxiliary_loss_mlp": 0.0100261, + "balance_loss_clip": 1.02479088, + "balance_loss_mlp": 1.00120902, + "epoch": 0.4595220201412896, + "flos": 74928904714080.0, + "grad_norm": 0.8405026637574502, + "language_loss": 0.58115268, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60167706, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.01399994, + "step": 7643, + "time_per_iteration": 3.0336835384368896 + }, + { + "auxiliary_loss_clip": 0.01132245, + "auxiliary_loss_mlp": 0.01032474, + "balance_loss_clip": 1.04522467, + "balance_loss_mlp": 1.01946282, + "epoch": 0.4595821433939576, + "flos": 28468690786080.0, + "grad_norm": 1.5611405105719338, + "language_loss": 0.93105668, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95270383, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 0.86962891, + "router_z_loss_mlp": 0.13018799, + "step": 7644, + "time_per_iteration": 2.6876039505004883 + }, + { + "auxiliary_loss_clip": 0.01130658, + "auxiliary_loss_mlp": 0.0103766, + "balance_loss_clip": 1.04681313, + "balance_loss_mlp": 1.02393937, + "epoch": 0.4596422666466256, + "flos": 17426916365760.0, + "grad_norm": 2.060077772377743, + "language_loss": 0.82297021, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.84465337, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.137146, + "step": 7645, + "time_per_iteration": 2.6599254608154297 + }, + { + "auxiliary_loss_clip": 0.01048448, + "auxiliary_loss_mlp": 0.01000547, + "balance_loss_clip": 1.02339196, + "balance_loss_mlp": 0.99911827, + "epoch": 0.4597023898992936, + "flos": 80200669013280.0, + "grad_norm": 0.7598272553400262, + "language_loss": 0.59871626, + "learning_rate": 2.356199538526593e-06, + "loss": 0.61920625, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 0.25024414, + "router_z_loss_mlp": 0.0142746, + "step": 7646, + "time_per_iteration": 3.170488119125366 + }, + { + "auxiliary_loss_clip": 0.01127915, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.04571223, + "balance_loss_mlp": 1.01794815, + "epoch": 0.45976251315196154, + "flos": 32832508813920.0, + "grad_norm": 1.5534302679491447, + "language_loss": 0.72338903, + "learning_rate": 2.355816296637939e-06, + "loss": 0.74498272, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13500977, + "step": 7647, + "time_per_iteration": 2.7474920749664307 + }, + { + "auxiliary_loss_clip": 0.01128238, + "auxiliary_loss_mlp": 0.01036201, + "balance_loss_clip": 1.04392219, + "balance_loss_mlp": 1.02282548, + "epoch": 0.4598226364046295, + "flos": 32479573073760.0, + "grad_norm": 1.832255868884949, + "language_loss": 0.66345835, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.6851027, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13360596, + "step": 7648, + "time_per_iteration": 2.708683490753174 + }, + { + "auxiliary_loss_clip": 0.01127301, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.04509377, + "balance_loss_mlp": 1.01954937, + "epoch": 0.45988275965729747, + "flos": 29758414750560.0, + "grad_norm": 1.4742242967266976, + "language_loss": 0.7859298, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.80753273, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13439941, + "step": 7649, + "time_per_iteration": 2.734917640686035 + }, + { + "auxiliary_loss_clip": 0.01126377, + "auxiliary_loss_mlp": 0.01034629, + "balance_loss_clip": 1.04473948, + "balance_loss_mlp": 1.02140844, + "epoch": 0.45994288290996543, + "flos": 29938893831360.0, + "grad_norm": 1.7954687130171496, + "language_loss": 0.69139111, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.71300119, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13220215, + "step": 7650, + "time_per_iteration": 2.674206256866455 + }, + { + "auxiliary_loss_clip": 0.01134634, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.04739881, + "balance_loss_mlp": 1.02119982, + "epoch": 0.4600030061626334, + "flos": 18095437264320.0, + "grad_norm": 2.251398857692123, + "language_loss": 0.84133554, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86304629, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 0.87255859, + "router_z_loss_mlp": 0.15258789, + "step": 7651, + "time_per_iteration": 2.7082936763763428 + }, + { + "auxiliary_loss_clip": 0.01128163, + "auxiliary_loss_mlp": 0.01030867, + "balance_loss_clip": 1.04714656, + "balance_loss_mlp": 1.01742041, + "epoch": 0.46006312941530136, + "flos": 22102348858560.0, + "grad_norm": 2.7707200377786116, + "language_loss": 0.75384533, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.77543557, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13439941, + "step": 7652, + "time_per_iteration": 2.6436495780944824 + }, + { + "auxiliary_loss_clip": 0.01127628, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.04389918, + "balance_loss_mlp": 1.01559234, + "epoch": 0.46012325266796933, + "flos": 26816266416960.0, + "grad_norm": 1.721119514167909, + "language_loss": 0.75883776, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.78040659, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.13659668, + "step": 7653, + "time_per_iteration": 2.733293294906616 + }, + { + "auxiliary_loss_clip": 0.01136502, + "auxiliary_loss_mlp": 0.01036941, + "balance_loss_clip": 1.04914308, + "balance_loss_mlp": 1.0216701, + "epoch": 0.4601833759206373, + "flos": 18629702948160.0, + "grad_norm": 2.258907296004605, + "language_loss": 0.66169327, + "learning_rate": 2.353133226438741e-06, + "loss": 0.68342769, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 0.87402344, + "router_z_loss_mlp": 0.15270996, + "step": 7654, + "time_per_iteration": 2.652956247329712 + }, + { + "auxiliary_loss_clip": 0.01126572, + "auxiliary_loss_mlp": 0.01035123, + "balance_loss_clip": 1.04390454, + "balance_loss_mlp": 1.02156317, + "epoch": 0.46024349917330526, + "flos": 33054552273600.0, + "grad_norm": 3.005964130871407, + "language_loss": 0.79101551, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81263244, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13574219, + "step": 7655, + "time_per_iteration": 2.7316677570343018 + }, + { + "auxiliary_loss_clip": 0.01122929, + "auxiliary_loss_mlp": 0.01029853, + "balance_loss_clip": 1.04323971, + "balance_loss_mlp": 1.01640618, + "epoch": 0.4603036224259732, + "flos": 29849930585280.0, + "grad_norm": 2.810515186251172, + "language_loss": 0.67639554, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.6979233, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13452148, + "step": 7656, + "time_per_iteration": 2.680454730987549 + }, + { + "auxiliary_loss_clip": 0.01125323, + "auxiliary_loss_mlp": 0.0103599, + "balance_loss_clip": 1.04351366, + "balance_loss_mlp": 1.02278185, + "epoch": 0.4603637456786412, + "flos": 34299713580480.0, + "grad_norm": 1.7959330769324842, + "language_loss": 0.81128788, + "learning_rate": 2.351983138057098e-06, + "loss": 0.832901, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13214111, + "step": 7657, + "time_per_iteration": 2.684006690979004 + }, + { + "auxiliary_loss_clip": 0.01126738, + "auxiliary_loss_mlp": 0.01033736, + "balance_loss_clip": 1.04412735, + "balance_loss_mlp": 1.01952052, + "epoch": 0.4604238689313092, + "flos": 29711258987040.0, + "grad_norm": 2.5075468064647226, + "language_loss": 0.70568502, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.72728968, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.14215088, + "step": 7658, + "time_per_iteration": 2.6895039081573486 + }, + { + "auxiliary_loss_clip": 0.01043221, + "auxiliary_loss_mlp": 0.01007655, + "balance_loss_clip": 1.01827812, + "balance_loss_mlp": 1.00633478, + "epoch": 0.4604839921839772, + "flos": 65410283073600.0, + "grad_norm": 0.9629430587780694, + "language_loss": 0.62071097, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64121974, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 0.24963379, + "router_z_loss_mlp": 0.01320648, + "step": 7659, + "time_per_iteration": 3.365414619445801 + }, + { + "auxiliary_loss_clip": 0.01126602, + "auxiliary_loss_mlp": 0.01035877, + "balance_loss_clip": 1.04634249, + "balance_loss_mlp": 1.02195919, + "epoch": 0.46054411543664514, + "flos": 38130197821920.0, + "grad_norm": 2.372452378799534, + "language_loss": 0.68846524, + "learning_rate": 2.350832929550336e-06, + "loss": 0.71008998, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13928223, + "step": 7660, + "time_per_iteration": 2.7444186210632324 + }, + { + "auxiliary_loss_clip": 0.0112499, + "auxiliary_loss_mlp": 0.01037665, + "balance_loss_clip": 1.04278708, + "balance_loss_mlp": 1.02377105, + "epoch": 0.4606042386893131, + "flos": 29398834176480.0, + "grad_norm": 1.7371238657062544, + "language_loss": 0.76758432, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.78921092, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.13885498, + "step": 7661, + "time_per_iteration": 2.676057815551758 + }, + { + "auxiliary_loss_clip": 0.01124622, + "auxiliary_loss_mlp": 0.01038164, + "balance_loss_clip": 1.04506433, + "balance_loss_mlp": 1.02503884, + "epoch": 0.46066436194198107, + "flos": 32431525930080.0, + "grad_norm": 1.98752779436029, + "language_loss": 0.7483874, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.77001524, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13134766, + "step": 7662, + "time_per_iteration": 2.7264015674591064 + }, + { + "auxiliary_loss_clip": 0.01132346, + "auxiliary_loss_mlp": 0.01036568, + "balance_loss_clip": 1.04450083, + "balance_loss_mlp": 1.02177405, + "epoch": 0.46072448519464904, + "flos": 21690507343680.0, + "grad_norm": 3.6196813347932504, + "language_loss": 0.78608626, + "learning_rate": 2.349682601310998e-06, + "loss": 0.80777538, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 0.87792969, + "router_z_loss_mlp": 0.14801025, + "step": 7663, + "time_per_iteration": 2.6542043685913086 + }, + { + "auxiliary_loss_clip": 0.0112453, + "auxiliary_loss_mlp": 0.01034267, + "balance_loss_clip": 1.04407763, + "balance_loss_mlp": 1.02055776, + "epoch": 0.460784608447317, + "flos": 18407618971200.0, + "grad_norm": 2.178943068643905, + "language_loss": 0.73054868, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.75213665, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13708496, + "step": 7664, + "time_per_iteration": 2.7579541206359863 + }, + { + "auxiliary_loss_clip": 0.01128418, + "auxiliary_loss_mlp": 0.0103674, + "balance_loss_clip": 1.04557228, + "balance_loss_mlp": 1.02377629, + "epoch": 0.46084473169998497, + "flos": 22681258234560.0, + "grad_norm": 1.545853958701788, + "language_loss": 0.72351032, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74516189, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.12963867, + "step": 7665, + "time_per_iteration": 2.7920868396759033 + }, + { + "auxiliary_loss_clip": 0.01127913, + "auxiliary_loss_mlp": 0.01031019, + "balance_loss_clip": 1.04435825, + "balance_loss_mlp": 1.0174588, + "epoch": 0.46090485495265293, + "flos": 23787991046880.0, + "grad_norm": 1.9707594066535579, + "language_loss": 0.78417742, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80576676, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.13568115, + "step": 7666, + "time_per_iteration": 2.65547251701355 + }, + { + "auxiliary_loss_clip": 0.01124684, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.04289949, + "balance_loss_mlp": 1.02188265, + "epoch": 0.4609649782053209, + "flos": 40712887133280.0, + "grad_norm": 2.622904555262135, + "language_loss": 0.74168873, + "learning_rate": 2.348148644753088e-06, + "loss": 0.76330185, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.14758301, + "step": 7667, + "time_per_iteration": 2.858564615249634 + }, + { + "auxiliary_loss_clip": 0.01124698, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.04256976, + "balance_loss_mlp": 1.02326608, + "epoch": 0.46102510145798886, + "flos": 29003402160000.0, + "grad_norm": 1.5078483953885518, + "language_loss": 0.76075506, + "learning_rate": 2.347765122572676e-06, + "loss": 0.78236246, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.12762451, + "step": 7668, + "time_per_iteration": 2.661984920501709 + }, + { + "auxiliary_loss_clip": 0.01124327, + "auxiliary_loss_mlp": 0.01034126, + "balance_loss_clip": 1.0465188, + "balance_loss_mlp": 1.02120423, + "epoch": 0.4610852247106568, + "flos": 28424573818560.0, + "grad_norm": 1.7102467166117152, + "language_loss": 0.78228831, + "learning_rate": 2.347381587204975e-06, + "loss": 0.80387282, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12915039, + "step": 7669, + "time_per_iteration": 2.7228617668151855 + }, + { + "auxiliary_loss_clip": 0.01126827, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.04354274, + "balance_loss_mlp": 1.01734352, + "epoch": 0.4611453479633248, + "flos": 31051380097440.0, + "grad_norm": 2.606340243313205, + "language_loss": 0.82377213, + "learning_rate": 2.34699803866453e-06, + "loss": 0.8453455, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13171387, + "step": 7670, + "time_per_iteration": 4.008751153945923 + }, + { + "auxiliary_loss_clip": 0.01124384, + "auxiliary_loss_mlp": 0.0103212, + "balance_loss_clip": 1.0431869, + "balance_loss_mlp": 1.01878059, + "epoch": 0.4612054712159928, + "flos": 25794850295520.0, + "grad_norm": 2.06152566404557, + "language_loss": 0.63646793, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.65803295, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13336182, + "step": 7671, + "time_per_iteration": 4.141568660736084 + }, + { + "auxiliary_loss_clip": 0.01042266, + "auxiliary_loss_mlp": 0.01000426, + "balance_loss_clip": 1.01737106, + "balance_loss_mlp": 0.99915522, + "epoch": 0.4612655944686608, + "flos": 85364832117600.0, + "grad_norm": 0.7711460901466392, + "language_loss": 0.55837083, + "learning_rate": 2.346230902123583e-06, + "loss": 0.5787977, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.24914551, + "router_z_loss_mlp": 0.01270294, + "step": 7672, + "time_per_iteration": 3.41119122505188 + }, + { + "auxiliary_loss_clip": 0.01130642, + "auxiliary_loss_mlp": 0.01034052, + "balance_loss_clip": 1.0465188, + "balance_loss_mlp": 1.02040243, + "epoch": 0.46132571772132874, + "flos": 20544398085600.0, + "grad_norm": 2.298197332522711, + "language_loss": 0.71389687, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.73554373, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.13641357, + "step": 7673, + "time_per_iteration": 2.6702027320861816 + }, + { + "auxiliary_loss_clip": 0.01124608, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.04484129, + "balance_loss_mlp": 1.01987636, + "epoch": 0.4613858409739967, + "flos": 43695303292800.0, + "grad_norm": 1.9761388093954464, + "language_loss": 0.70542479, + "learning_rate": 2.345463713066195e-06, + "loss": 0.72700095, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13122559, + "step": 7674, + "time_per_iteration": 2.7463018894195557 + }, + { + "auxiliary_loss_clip": 0.01124431, + "auxiliary_loss_mlp": 0.01037659, + "balance_loss_clip": 1.04174185, + "balance_loss_mlp": 1.02393794, + "epoch": 0.4614459642266647, + "flos": 43033224641760.0, + "grad_norm": 1.3527228215491445, + "language_loss": 0.65357006, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.67519099, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.13726807, + "step": 7675, + "time_per_iteration": 2.82501482963562 + }, + { + "auxiliary_loss_clip": 0.01042751, + "auxiliary_loss_mlp": 0.01000162, + "balance_loss_clip": 1.01779342, + "balance_loss_mlp": 0.99889719, + "epoch": 0.46150608747933264, + "flos": 81393934034880.0, + "grad_norm": 1.152252520322945, + "language_loss": 0.58533072, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60575986, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 0.24987793, + "router_z_loss_mlp": 0.01264191, + "step": 7676, + "time_per_iteration": 3.2534964084625244 + }, + { + "auxiliary_loss_clip": 0.01042385, + "auxiliary_loss_mlp": 0.01000783, + "balance_loss_clip": 1.01736677, + "balance_loss_mlp": 0.99949348, + "epoch": 0.4615662107320006, + "flos": 68124269838240.0, + "grad_norm": 0.8266772818920084, + "language_loss": 0.62713933, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64757109, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01290131, + "step": 7677, + "time_per_iteration": 4.568461179733276 + }, + { + "auxiliary_loss_clip": 0.01125398, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.04444492, + "balance_loss_mlp": 1.01777554, + "epoch": 0.46162633398466857, + "flos": 18892621992960.0, + "grad_norm": 2.347182005855398, + "language_loss": 0.76556158, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.7871232, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.12963867, + "step": 7678, + "time_per_iteration": 2.699960708618164 + }, + { + "auxiliary_loss_clip": 0.01130244, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.04714942, + "balance_loss_mlp": 1.02067757, + "epoch": 0.46168645723733653, + "flos": 24417905328000.0, + "grad_norm": 2.4171099857350726, + "language_loss": 0.66777611, + "learning_rate": 2.343545511426974e-06, + "loss": 0.68942225, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.13684082, + "step": 7679, + "time_per_iteration": 2.6360297203063965 + }, + { + "auxiliary_loss_clip": 0.01127377, + "auxiliary_loss_mlp": 0.0103878, + "balance_loss_clip": 1.04586458, + "balance_loss_mlp": 1.02610779, + "epoch": 0.4617465804900045, + "flos": 24768693652320.0, + "grad_norm": 2.3428328873424147, + "language_loss": 0.69951332, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.7211749, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.12689209, + "step": 7680, + "time_per_iteration": 2.683401107788086 + }, + { + "auxiliary_loss_clip": 0.01135835, + "auxiliary_loss_mlp": 0.01042708, + "balance_loss_clip": 1.05104995, + "balance_loss_mlp": 1.02889717, + "epoch": 0.46180670374267246, + "flos": 27267119722080.0, + "grad_norm": 1.8741293378408288, + "language_loss": 0.63340092, + "learning_rate": 2.342778139478487e-06, + "loss": 0.65518636, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.13818359, + "step": 7681, + "time_per_iteration": 2.6852316856384277 + }, + { + "auxiliary_loss_clip": 0.01125929, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.0457027, + "balance_loss_mlp": 1.02153587, + "epoch": 0.46186682699534043, + "flos": 24277248383040.0, + "grad_norm": 2.497093300772725, + "language_loss": 0.6702925, + "learning_rate": 2.342394433999697e-06, + "loss": 0.6918925, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.12542725, + "step": 7682, + "time_per_iteration": 3.9821321964263916 + }, + { + "auxiliary_loss_clip": 0.0112745, + "auxiliary_loss_mlp": 0.01036402, + "balance_loss_clip": 1.04492748, + "balance_loss_mlp": 1.02313375, + "epoch": 0.4619269502480084, + "flos": 38441569183200.0, + "grad_norm": 2.127397401446139, + "language_loss": 0.73921418, + "learning_rate": 2.342010715537275e-06, + "loss": 0.76085275, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.1328125, + "step": 7683, + "time_per_iteration": 2.763925075531006 + }, + { + "auxiliary_loss_clip": 0.01129057, + "auxiliary_loss_mlp": 0.01039145, + "balance_loss_clip": 1.04708934, + "balance_loss_mlp": 1.02603233, + "epoch": 0.46198707350067636, + "flos": 30516952344480.0, + "grad_norm": 1.8937755253994388, + "language_loss": 0.76764315, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.78932512, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13110352, + "step": 7684, + "time_per_iteration": 2.645235061645508 + }, + { + "auxiliary_loss_clip": 0.01134351, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.04772043, + "balance_loss_mlp": 1.02660966, + "epoch": 0.4620471967533444, + "flos": 22320543176640.0, + "grad_norm": 1.9572766172516993, + "language_loss": 0.79876113, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.82050836, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 0.86523438, + "router_z_loss_mlp": 0.13757324, + "step": 7685, + "time_per_iteration": 2.7003233432769775 + }, + { + "auxiliary_loss_clip": 0.01126949, + "auxiliary_loss_mlp": 0.01042097, + "balance_loss_clip": 1.04728961, + "balance_loss_mlp": 1.02826905, + "epoch": 0.46210732000601235, + "flos": 41469682484160.0, + "grad_norm": 2.8843972343865167, + "language_loss": 0.67014605, + "learning_rate": 2.340859482393731e-06, + "loss": 0.69183648, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13830566, + "step": 7686, + "time_per_iteration": 2.7295455932617188 + }, + { + "auxiliary_loss_clip": 0.01128841, + "auxiliary_loss_mlp": 0.01034308, + "balance_loss_clip": 1.04449403, + "balance_loss_mlp": 1.02080774, + "epoch": 0.4621674432586803, + "flos": 30516871309920.0, + "grad_norm": 2.3328507087929085, + "language_loss": 0.73752213, + "learning_rate": 2.340475712142296e-06, + "loss": 0.7591536, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.13513184, + "step": 7687, + "time_per_iteration": 2.727041006088257 + }, + { + "auxiliary_loss_clip": 0.01129784, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.0483737, + "balance_loss_mlp": 1.0199213, + "epoch": 0.4622275665113483, + "flos": 26862004075680.0, + "grad_norm": 2.3129301720184574, + "language_loss": 0.74511003, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.76673925, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.13226318, + "step": 7688, + "time_per_iteration": 2.698593854904175 + }, + { + "auxiliary_loss_clip": 0.01125226, + "auxiliary_loss_mlp": 0.01035469, + "balance_loss_clip": 1.04399538, + "balance_loss_mlp": 1.0217483, + "epoch": 0.46228768976401624, + "flos": 29357067211200.0, + "grad_norm": 1.7319806185956017, + "language_loss": 0.78234589, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.80395281, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.1373291, + "step": 7689, + "time_per_iteration": 2.7219314575195312 + }, + { + "auxiliary_loss_clip": 0.011302, + "auxiliary_loss_mlp": 0.01039536, + "balance_loss_clip": 1.04548049, + "balance_loss_mlp": 1.02569604, + "epoch": 0.4623478130166842, + "flos": 32520529693440.0, + "grad_norm": 2.217387138921935, + "language_loss": 0.57035637, + "learning_rate": 2.339324323980964e-06, + "loss": 0.59205371, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.13842773, + "step": 7690, + "time_per_iteration": 2.7079806327819824 + }, + { + "auxiliary_loss_clip": 0.01126677, + "auxiliary_loss_mlp": 0.01039727, + "balance_loss_clip": 1.04419112, + "balance_loss_mlp": 1.02580976, + "epoch": 0.46240793626935217, + "flos": 25085413294560.0, + "grad_norm": 2.843004812768351, + "language_loss": 0.82305479, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.84471881, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.13909912, + "step": 7691, + "time_per_iteration": 2.633932590484619 + }, + { + "auxiliary_loss_clip": 0.01129555, + "auxiliary_loss_mlp": 0.01031882, + "balance_loss_clip": 1.04780269, + "balance_loss_mlp": 1.01928735, + "epoch": 0.46246805952202014, + "flos": 27401010281280.0, + "grad_norm": 1.4901213590039677, + "language_loss": 0.75285494, + "learning_rate": 2.338556667513091e-06, + "loss": 0.77446926, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.12597656, + "step": 7692, + "time_per_iteration": 2.779848575592041 + }, + { + "auxiliary_loss_clip": 0.01128346, + "auxiliary_loss_mlp": 0.01038471, + "balance_loss_clip": 1.04494572, + "balance_loss_mlp": 1.02440405, + "epoch": 0.4625281827746881, + "flos": 42758717654880.0, + "grad_norm": 1.795502550036219, + "language_loss": 0.74000669, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76167488, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.14074707, + "step": 7693, + "time_per_iteration": 2.7907938957214355 + }, + { + "auxiliary_loss_clip": 0.0113009, + "auxiliary_loss_mlp": 0.01039536, + "balance_loss_clip": 1.04857421, + "balance_loss_mlp": 1.02638769, + "epoch": 0.46258830602735607, + "flos": 25708074982560.0, + "grad_norm": 1.6669867417384543, + "language_loss": 0.85464621, + "learning_rate": 2.337788959692808e-06, + "loss": 0.87634242, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.13153076, + "step": 7694, + "time_per_iteration": 2.6974477767944336 + }, + { + "auxiliary_loss_clip": 0.01131161, + "auxiliary_loss_mlp": 0.0104342, + "balance_loss_clip": 1.04753542, + "balance_loss_mlp": 1.03049171, + "epoch": 0.46264842928002403, + "flos": 31941903938400.0, + "grad_norm": 2.0097201032314262, + "language_loss": 0.78588796, + "learning_rate": 2.337405086561902e-06, + "loss": 0.80763382, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.12939453, + "step": 7695, + "time_per_iteration": 2.6754164695739746 + }, + { + "auxiliary_loss_clip": 0.0112386, + "auxiliary_loss_mlp": 0.01034369, + "balance_loss_clip": 1.04432726, + "balance_loss_mlp": 1.02143443, + "epoch": 0.462708552532692, + "flos": 20455394322240.0, + "grad_norm": 1.717045429111915, + "language_loss": 0.72244525, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.74402755, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12957764, + "step": 7696, + "time_per_iteration": 2.6398744583129883 + }, + { + "auxiliary_loss_clip": 0.01129169, + "auxiliary_loss_mlp": 0.01045177, + "balance_loss_clip": 1.04766405, + "balance_loss_mlp": 1.03127146, + "epoch": 0.46276867578535996, + "flos": 18986447312640.0, + "grad_norm": 1.6300950535492824, + "language_loss": 0.69181561, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.71355915, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.13916016, + "step": 7697, + "time_per_iteration": 2.6281869411468506 + }, + { + "auxiliary_loss_clip": 0.01129989, + "auxiliary_loss_mlp": 0.01030307, + "balance_loss_clip": 1.04939055, + "balance_loss_mlp": 1.01725948, + "epoch": 0.462828799038028, + "flos": 27351788136480.0, + "grad_norm": 2.000512141312716, + "language_loss": 0.84237528, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.86397821, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13043213, + "step": 7698, + "time_per_iteration": 2.682974338531494 + }, + { + "auxiliary_loss_clip": 0.01126701, + "auxiliary_loss_mlp": 0.01033433, + "balance_loss_clip": 1.04570258, + "balance_loss_mlp": 1.02089798, + "epoch": 0.46288892229069595, + "flos": 25709128431840.0, + "grad_norm": 2.0317588408051583, + "language_loss": 0.71249759, + "learning_rate": 2.335869466239502e-06, + "loss": 0.73409891, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.12530518, + "step": 7699, + "time_per_iteration": 2.667611598968506 + }, + { + "auxiliary_loss_clip": 0.01128948, + "auxiliary_loss_mlp": 0.01035111, + "balance_loss_clip": 1.044186, + "balance_loss_mlp": 1.02125275, + "epoch": 0.4629490455433639, + "flos": 28291372053120.0, + "grad_norm": 1.9233010105900126, + "language_loss": 0.71699536, + "learning_rate": 2.335485529281996e-06, + "loss": 0.73863596, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.1385498, + "step": 7700, + "time_per_iteration": 2.709470748901367 + }, + { + "auxiliary_loss_clip": 0.0112635, + "auxiliary_loss_mlp": 0.01034276, + "balance_loss_clip": 1.04556465, + "balance_loss_mlp": 1.0213418, + "epoch": 0.4630091687960319, + "flos": 22983756311520.0, + "grad_norm": 2.210824545310911, + "language_loss": 0.72604126, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.74764752, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.1293335, + "step": 7701, + "time_per_iteration": 2.6656103134155273 + }, + { + "auxiliary_loss_clip": 0.0113056, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.0464958, + "balance_loss_mlp": 1.01974559, + "epoch": 0.46306929204869984, + "flos": 47476119699360.0, + "grad_norm": 3.3926165795152188, + "language_loss": 0.6548605, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.67650259, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.13916016, + "step": 7702, + "time_per_iteration": 2.7639613151550293 + }, + { + "auxiliary_loss_clip": 0.01124683, + "auxiliary_loss_mlp": 0.01031164, + "balance_loss_clip": 1.04583728, + "balance_loss_mlp": 1.01849759, + "epoch": 0.4631294153013678, + "flos": 23969969267040.0, + "grad_norm": 2.5245128688535687, + "language_loss": 0.73087931, + "learning_rate": 2.33433364213785e-06, + "loss": 0.75243783, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12652588, + "step": 7703, + "time_per_iteration": 2.7577056884765625 + }, + { + "auxiliary_loss_clip": 0.01130856, + "auxiliary_loss_mlp": 0.01032533, + "balance_loss_clip": 1.04711485, + "balance_loss_mlp": 1.01888311, + "epoch": 0.4631895385540358, + "flos": 30027330352800.0, + "grad_norm": 1.798100310152613, + "language_loss": 0.68935478, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.71098876, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13665771, + "step": 7704, + "time_per_iteration": 2.644228458404541 + }, + { + "auxiliary_loss_clip": 0.01128833, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.04539371, + "balance_loss_mlp": 1.01896954, + "epoch": 0.46324966180670374, + "flos": 32116548530880.0, + "grad_norm": 4.0918719163289285, + "language_loss": 0.80819654, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.82980847, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13378906, + "step": 7705, + "time_per_iteration": 2.784498929977417 + }, + { + "auxiliary_loss_clip": 0.01128334, + "auxiliary_loss_mlp": 0.0103099, + "balance_loss_clip": 1.04483938, + "balance_loss_mlp": 1.01865232, + "epoch": 0.4633097850593717, + "flos": 23477592100320.0, + "grad_norm": 2.121336407517763, + "language_loss": 0.77764589, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.79923916, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.12341309, + "step": 7706, + "time_per_iteration": 2.744001865386963 + }, + { + "auxiliary_loss_clip": 0.01122842, + "auxiliary_loss_mlp": 0.01030354, + "balance_loss_clip": 1.04539752, + "balance_loss_mlp": 1.01744962, + "epoch": 0.46336990831203967, + "flos": 27800291439360.0, + "grad_norm": 1.8541333486773748, + "language_loss": 0.70233238, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.72386432, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12896729, + "step": 7707, + "time_per_iteration": 2.7449331283569336 + }, + { + "auxiliary_loss_clip": 0.01129535, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.04473484, + "balance_loss_mlp": 1.01862311, + "epoch": 0.46343003156470763, + "flos": 46626592995360.0, + "grad_norm": 2.084110634013301, + "language_loss": 0.61346412, + "learning_rate": 2.332413576865791e-06, + "loss": 0.63509244, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.14666748, + "step": 7708, + "time_per_iteration": 2.7601656913757324 + }, + { + "auxiliary_loss_clip": 0.01128103, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.04644442, + "balance_loss_mlp": 1.01819086, + "epoch": 0.4634901548173756, + "flos": 38972634001920.0, + "grad_norm": 2.3945183279595326, + "language_loss": 0.77314174, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.79473889, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13421631, + "step": 7709, + "time_per_iteration": 4.192160606384277 + }, + { + "auxiliary_loss_clip": 0.01130483, + "auxiliary_loss_mlp": 0.01038199, + "balance_loss_clip": 1.04739583, + "balance_loss_mlp": 1.02512813, + "epoch": 0.46355027807004356, + "flos": 24504842710080.0, + "grad_norm": 1.7558365301796623, + "language_loss": 0.77180213, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.79348892, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.13061523, + "step": 7710, + "time_per_iteration": 4.151703834533691 + }, + { + "auxiliary_loss_clip": 0.01130716, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.04686165, + "balance_loss_mlp": 1.01941729, + "epoch": 0.4636104013227116, + "flos": 29359943938080.0, + "grad_norm": 2.1762081224810332, + "language_loss": 0.73163497, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75327194, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.13568115, + "step": 7711, + "time_per_iteration": 2.824791193008423 + }, + { + "auxiliary_loss_clip": 0.01127947, + "auxiliary_loss_mlp": 0.0104361, + "balance_loss_clip": 1.04757237, + "balance_loss_mlp": 1.02977014, + "epoch": 0.46367052457537955, + "flos": 29180963996640.0, + "grad_norm": 1.3961696808413018, + "language_loss": 0.71556675, + "learning_rate": 2.33087729766797e-06, + "loss": 0.7372824, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.1385498, + "step": 7712, + "time_per_iteration": 2.7115423679351807 + }, + { + "auxiliary_loss_clip": 0.01134562, + "auxiliary_loss_mlp": 0.01038713, + "balance_loss_clip": 1.04850125, + "balance_loss_mlp": 1.02417505, + "epoch": 0.4637306478280475, + "flos": 32209482470400.0, + "grad_norm": 1.893719371658695, + "language_loss": 0.73034894, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.75208169, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.14544678, + "step": 7713, + "time_per_iteration": 2.759263515472412 + }, + { + "auxiliary_loss_clip": 0.01131498, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.04724598, + "balance_loss_mlp": 1.01738536, + "epoch": 0.4637907710807155, + "flos": 26821047456000.0, + "grad_norm": 2.200558222527619, + "language_loss": 0.58652192, + "learning_rate": 2.3301090827294e-06, + "loss": 0.60814738, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.13659668, + "step": 7714, + "time_per_iteration": 2.626596689224243 + }, + { + "auxiliary_loss_clip": 0.01124311, + "auxiliary_loss_mlp": 0.01032727, + "balance_loss_clip": 1.04406548, + "balance_loss_mlp": 1.01964974, + "epoch": 0.46385089433338345, + "flos": 15157057037760.0, + "grad_norm": 2.331519115595584, + "language_loss": 0.70080769, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.72237808, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.1307373, + "step": 7715, + "time_per_iteration": 2.7359142303466797 + }, + { + "auxiliary_loss_clip": 0.01132546, + "auxiliary_loss_mlp": 0.0103534, + "balance_loss_clip": 1.04701018, + "balance_loss_mlp": 1.02181566, + "epoch": 0.4639110175860514, + "flos": 29182220032320.0, + "grad_norm": 1.8962485307364898, + "language_loss": 0.68221474, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.70389354, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.13519287, + "step": 7716, + "time_per_iteration": 2.649275302886963 + }, + { + "auxiliary_loss_clip": 0.01129589, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.0451591, + "balance_loss_mlp": 1.01629579, + "epoch": 0.4639711408387194, + "flos": 30872035500480.0, + "grad_norm": 1.5034881720378017, + "language_loss": 0.81160176, + "learning_rate": 2.328956666474691e-06, + "loss": 0.83320332, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 0.84326172, + "router_z_loss_mlp": 0.14263916, + "step": 7717, + "time_per_iteration": 4.1514081954956055 + }, + { + "auxiliary_loss_clip": 0.01127101, + "auxiliary_loss_mlp": 0.01033331, + "balance_loss_clip": 1.04456782, + "balance_loss_mlp": 1.02030158, + "epoch": 0.46403126409138734, + "flos": 25883043713280.0, + "grad_norm": 1.7612746112872273, + "language_loss": 0.73415911, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.75576341, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.13043213, + "step": 7718, + "time_per_iteration": 2.6918673515319824 + }, + { + "auxiliary_loss_clip": 0.01127579, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.04686141, + "balance_loss_mlp": 1.01988733, + "epoch": 0.4640913873440553, + "flos": 43739987502240.0, + "grad_norm": 1.803818719388648, + "language_loss": 0.70323306, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.7248435, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13586426, + "step": 7719, + "time_per_iteration": 2.785341501235962 + }, + { + "auxiliary_loss_clip": 0.01133408, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.04840541, + "balance_loss_mlp": 1.02031803, + "epoch": 0.46415151059672327, + "flos": 23387048680320.0, + "grad_norm": 2.854408607863661, + "language_loss": 0.86599243, + "learning_rate": 2.327804137953357e-06, + "loss": 0.88766605, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 0.84960938, + "router_z_loss_mlp": 0.13616943, + "step": 7720, + "time_per_iteration": 2.6668074131011963 + }, + { + "auxiliary_loss_clip": 0.01047017, + "auxiliary_loss_mlp": 0.01003619, + "balance_loss_clip": 1.02209187, + "balance_loss_mlp": 1.00226974, + "epoch": 0.46421163384939124, + "flos": 71884226196000.0, + "grad_norm": 0.7242362688105863, + "language_loss": 0.55037701, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57088339, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 0.24914551, + "router_z_loss_mlp": 0.0134964, + "step": 7721, + "time_per_iteration": 4.660841226577759 + }, + { + "auxiliary_loss_clip": 0.01128237, + "auxiliary_loss_mlp": 0.01034094, + "balance_loss_clip": 1.04784238, + "balance_loss_mlp": 1.02037895, + "epoch": 0.4642717571020592, + "flos": 24595953372000.0, + "grad_norm": 2.097667839517329, + "language_loss": 0.79699898, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.81862223, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13702393, + "step": 7722, + "time_per_iteration": 2.690613269805908 + }, + { + "auxiliary_loss_clip": 0.01129999, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.04604959, + "balance_loss_mlp": 1.02047026, + "epoch": 0.46433188035472717, + "flos": 30561717588480.0, + "grad_norm": 1.7972493973660297, + "language_loss": 0.77900922, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.80064726, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13330078, + "step": 7723, + "time_per_iteration": 2.8164050579071045 + }, + { + "auxiliary_loss_clip": 0.01125882, + "auxiliary_loss_mlp": 0.01028821, + "balance_loss_clip": 1.04617786, + "balance_loss_mlp": 1.01610744, + "epoch": 0.4643920036073952, + "flos": 34258392305280.0, + "grad_norm": 1.908974370438021, + "language_loss": 0.68256032, + "learning_rate": 2.326267259301118e-06, + "loss": 0.70410728, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12713623, + "step": 7724, + "time_per_iteration": 2.7150468826293945 + }, + { + "auxiliary_loss_clip": 0.01127781, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.04627705, + "balance_loss_mlp": 1.01789868, + "epoch": 0.46445212686006315, + "flos": 22414125392640.0, + "grad_norm": 2.389173208917317, + "language_loss": 0.67327285, + "learning_rate": 2.325883008671415e-06, + "loss": 0.69485933, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.12976074, + "step": 7725, + "time_per_iteration": 2.740787982940674 + }, + { + "auxiliary_loss_clip": 0.01123326, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.04544735, + "balance_loss_mlp": 1.02200913, + "epoch": 0.4645122501127311, + "flos": 38707729610400.0, + "grad_norm": 1.7461450111965302, + "language_loss": 0.65061557, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.67218548, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.11663818, + "step": 7726, + "time_per_iteration": 2.711221694946289 + }, + { + "auxiliary_loss_clip": 0.01130684, + "auxiliary_loss_mlp": 0.01031694, + "balance_loss_clip": 1.04976392, + "balance_loss_mlp": 1.01887953, + "epoch": 0.4645723733653991, + "flos": 29002996987200.0, + "grad_norm": 2.0638752485470504, + "language_loss": 0.75021946, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.77184325, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.12817383, + "step": 7727, + "time_per_iteration": 2.7349700927734375 + }, + { + "auxiliary_loss_clip": 0.01126679, + "auxiliary_loss_mlp": 0.01034108, + "balance_loss_clip": 1.04578269, + "balance_loss_mlp": 1.02069104, + "epoch": 0.46463249661806705, + "flos": 40445187049440.0, + "grad_norm": 1.98534252260672, + "language_loss": 0.79029703, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.81190485, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.1340332, + "step": 7728, + "time_per_iteration": 2.7504806518554688 + }, + { + "auxiliary_loss_clip": 0.01129037, + "auxiliary_loss_mlp": 0.01034674, + "balance_loss_clip": 1.04655933, + "balance_loss_mlp": 1.02169776, + "epoch": 0.464692619870735, + "flos": 22324675939200.0, + "grad_norm": 2.291182787566627, + "language_loss": 0.76548004, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78711712, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.12982178, + "step": 7729, + "time_per_iteration": 2.635810375213623 + }, + { + "auxiliary_loss_clip": 0.01131747, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.05043101, + "balance_loss_mlp": 1.01979113, + "epoch": 0.464752743123403, + "flos": 27623134775520.0, + "grad_norm": 2.9479975702565504, + "language_loss": 0.79921436, + "learning_rate": 2.323961570451588e-06, + "loss": 0.82085437, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.12475586, + "step": 7730, + "time_per_iteration": 2.690173864364624 + }, + { + "auxiliary_loss_clip": 0.01128227, + "auxiliary_loss_mlp": 0.01035639, + "balance_loss_clip": 1.04738355, + "balance_loss_mlp": 1.0222106, + "epoch": 0.46481286637607094, + "flos": 24906919560480.0, + "grad_norm": 1.8788309329295676, + "language_loss": 0.7713213, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.79295999, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13421631, + "step": 7731, + "time_per_iteration": 2.6664836406707764 + }, + { + "auxiliary_loss_clip": 0.0112522, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.04472983, + "balance_loss_mlp": 1.01779556, + "epoch": 0.4648729896287389, + "flos": 41824076846400.0, + "grad_norm": 1.8547131060561568, + "language_loss": 0.66001201, + "learning_rate": 2.323192909069061e-06, + "loss": 0.68156445, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12237549, + "step": 7732, + "time_per_iteration": 2.7912206649780273 + }, + { + "auxiliary_loss_clip": 0.01132227, + "auxiliary_loss_mlp": 0.01036571, + "balance_loss_clip": 1.04672956, + "balance_loss_mlp": 1.02164602, + "epoch": 0.4649331128814069, + "flos": 26017177376160.0, + "grad_norm": 2.499185186548485, + "language_loss": 0.73041952, + "learning_rate": 2.32280855998725e-06, + "loss": 0.7521075, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 0.85498047, + "router_z_loss_mlp": 0.14904785, + "step": 7733, + "time_per_iteration": 2.6507279872894287 + }, + { + "auxiliary_loss_clip": 0.01049333, + "auxiliary_loss_mlp": 0.01006318, + "balance_loss_clip": 1.0243386, + "balance_loss_mlp": 1.00486803, + "epoch": 0.46499323613407484, + "flos": 71148777101280.0, + "grad_norm": 1.2416496097013199, + "language_loss": 0.51976109, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.54031765, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 0.24987793, + "router_z_loss_mlp": 0.01448822, + "step": 7734, + "time_per_iteration": 3.3363630771636963 + }, + { + "auxiliary_loss_clip": 0.01129583, + "auxiliary_loss_mlp": 0.01034096, + "balance_loss_clip": 1.04848289, + "balance_loss_mlp": 1.02099514, + "epoch": 0.4650533593867428, + "flos": 13286843523360.0, + "grad_norm": 2.546548988250672, + "language_loss": 0.75838542, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.7800222, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13110352, + "step": 7735, + "time_per_iteration": 2.6087563037872314 + }, + { + "auxiliary_loss_clip": 0.01124771, + "auxiliary_loss_mlp": 0.01037554, + "balance_loss_clip": 1.04514074, + "balance_loss_mlp": 1.02456069, + "epoch": 0.46511348263941077, + "flos": 24372978014880.0, + "grad_norm": 1.913668865835424, + "language_loss": 0.6984309, + "learning_rate": 2.321655439354519e-06, + "loss": 0.72005415, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13000488, + "step": 7736, + "time_per_iteration": 2.679471969604492 + }, + { + "auxiliary_loss_clip": 0.01125195, + "auxiliary_loss_mlp": 0.01034706, + "balance_loss_clip": 1.0467236, + "balance_loss_mlp": 1.02172422, + "epoch": 0.46517360589207873, + "flos": 24011938818720.0, + "grad_norm": 1.7079913550566328, + "language_loss": 0.71974301, + "learning_rate": 2.321271041396427e-06, + "loss": 0.74134201, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12988281, + "step": 7737, + "time_per_iteration": 2.6815338134765625 + }, + { + "auxiliary_loss_clip": 0.01136147, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.05261385, + "balance_loss_mlp": 1.02474821, + "epoch": 0.46523372914474675, + "flos": 20587826259360.0, + "grad_norm": 1.9544377219533544, + "language_loss": 0.83710808, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.85885459, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.13751221, + "step": 7738, + "time_per_iteration": 2.627525568008423 + }, + { + "auxiliary_loss_clip": 0.01047538, + "auxiliary_loss_mlp": 0.01001067, + "balance_loss_clip": 1.02241302, + "balance_loss_mlp": 0.99964195, + "epoch": 0.4652938523974147, + "flos": 65204973250560.0, + "grad_norm": 0.7688770575273448, + "language_loss": 0.57846355, + "learning_rate": 2.320502208946932e-06, + "loss": 0.59894967, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.25170898, + "router_z_loss_mlp": 0.01423645, + "step": 7739, + "time_per_iteration": 3.2883987426757812 + }, + { + "auxiliary_loss_clip": 0.01128995, + "auxiliary_loss_mlp": 0.0103456, + "balance_loss_clip": 1.04784644, + "balance_loss_mlp": 1.02188182, + "epoch": 0.4653539756500827, + "flos": 18584856669600.0, + "grad_norm": 2.0382571294112615, + "language_loss": 0.84898508, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.87062067, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.12677002, + "step": 7740, + "time_per_iteration": 2.606282949447632 + }, + { + "auxiliary_loss_clip": 0.01126314, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.04657018, + "balance_loss_mlp": 1.02218997, + "epoch": 0.46541409890275065, + "flos": 28958393812320.0, + "grad_norm": 1.5681936053426446, + "language_loss": 0.7563107, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.77793336, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13757324, + "step": 7741, + "time_per_iteration": 2.684483289718628 + }, + { + "auxiliary_loss_clip": 0.01132001, + "auxiliary_loss_mlp": 0.01034809, + "balance_loss_clip": 1.04788733, + "balance_loss_mlp": 1.02167809, + "epoch": 0.4654742221554186, + "flos": 25438227482880.0, + "grad_norm": 4.537330911994236, + "language_loss": 0.80763191, + "learning_rate": 2.319348869158064e-06, + "loss": 0.82930005, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.13140869, + "step": 7742, + "time_per_iteration": 2.641021490097046 + }, + { + "auxiliary_loss_clip": 0.01130249, + "auxiliary_loss_mlp": 0.0103957, + "balance_loss_clip": 1.0462029, + "balance_loss_mlp": 1.02549124, + "epoch": 0.4655343454080866, + "flos": 25264028580480.0, + "grad_norm": 1.7494165733963707, + "language_loss": 0.72680259, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.7485007, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.14074707, + "step": 7743, + "time_per_iteration": 2.669365644454956 + }, + { + "auxiliary_loss_clip": 0.01128717, + "auxiliary_loss_mlp": 0.01030303, + "balance_loss_clip": 1.04693949, + "balance_loss_mlp": 1.01665354, + "epoch": 0.46559446866075455, + "flos": 23171285399040.0, + "grad_norm": 1.9132181454034034, + "language_loss": 0.71452439, + "learning_rate": 2.318579915392483e-06, + "loss": 0.73611462, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13641357, + "step": 7744, + "time_per_iteration": 2.6204872131347656 + }, + { + "auxiliary_loss_clip": 0.0112573, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.04614151, + "balance_loss_mlp": 1.01836157, + "epoch": 0.4656545919134225, + "flos": 42093519173280.0, + "grad_norm": 1.5731617120194383, + "language_loss": 0.84920144, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87076783, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12542725, + "step": 7745, + "time_per_iteration": 2.756333827972412 + }, + { + "auxiliary_loss_clip": 0.01125206, + "auxiliary_loss_mlp": 0.01038223, + "balance_loss_clip": 1.04653907, + "balance_loss_mlp": 1.02469301, + "epoch": 0.4657147151660905, + "flos": 29664589430880.0, + "grad_norm": 1.3471801246038493, + "language_loss": 0.73227823, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75391257, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.13519287, + "step": 7746, + "time_per_iteration": 2.6846680641174316 + }, + { + "auxiliary_loss_clip": 0.01127672, + "auxiliary_loss_mlp": 0.01037196, + "balance_loss_clip": 1.04771161, + "balance_loss_mlp": 1.02401125, + "epoch": 0.46577483841875844, + "flos": 71744818936320.0, + "grad_norm": 1.726371285829067, + "language_loss": 0.6956991, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.71734774, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.1317749, + "step": 7747, + "time_per_iteration": 3.012437343597412 + }, + { + "auxiliary_loss_clip": 0.01126307, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.04589546, + "balance_loss_mlp": 1.01841557, + "epoch": 0.4658349616714264, + "flos": 38219890379040.0, + "grad_norm": 1.9214203252185382, + "language_loss": 0.67633021, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69791079, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13342285, + "step": 7748, + "time_per_iteration": 2.7526347637176514 + }, + { + "auxiliary_loss_clip": 0.01131298, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.04635656, + "balance_loss_mlp": 1.02110207, + "epoch": 0.46589508492409437, + "flos": 18132747328800.0, + "grad_norm": 1.950653533809699, + "language_loss": 0.63966888, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.66134119, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 0.85009766, + "router_z_loss_mlp": 0.14819336, + "step": 7749, + "time_per_iteration": 5.394685983657837 + }, + { + "auxiliary_loss_clip": 0.01132776, + "auxiliary_loss_mlp": 0.01036317, + "balance_loss_clip": 1.04966545, + "balance_loss_mlp": 1.02242303, + "epoch": 0.46595520817676234, + "flos": 15735156068160.0, + "grad_norm": 2.0674060475937877, + "language_loss": 0.74059606, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.7622869, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.13885498, + "step": 7750, + "time_per_iteration": 2.624803066253662 + }, + { + "auxiliary_loss_clip": 0.01129859, + "auxiliary_loss_mlp": 0.01028845, + "balance_loss_clip": 1.04623556, + "balance_loss_mlp": 1.01483154, + "epoch": 0.46601533142943036, + "flos": 40089617686080.0, + "grad_norm": 1.8645490568456995, + "language_loss": 0.74152851, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.76311553, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 0.83642578, + "router_z_loss_mlp": 0.14019775, + "step": 7751, + "time_per_iteration": 2.7621517181396484 + }, + { + "auxiliary_loss_clip": 0.01131459, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.04772615, + "balance_loss_mlp": 1.02192307, + "epoch": 0.4660754546820983, + "flos": 24364996110720.0, + "grad_norm": 1.9855275591022938, + "language_loss": 0.73926157, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.76093727, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.14172363, + "step": 7752, + "time_per_iteration": 2.6072683334350586 + }, + { + "auxiliary_loss_clip": 0.01134217, + "auxiliary_loss_mlp": 0.01035503, + "balance_loss_clip": 1.0483743, + "balance_loss_mlp": 1.02141261, + "epoch": 0.4661355779347663, + "flos": 32565375972000.0, + "grad_norm": 3.092102291982823, + "language_loss": 0.68920767, + "learning_rate": 2.315119027142644e-06, + "loss": 0.71090484, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 0.85839844, + "router_z_loss_mlp": 0.14080811, + "step": 7753, + "time_per_iteration": 2.716285467147827 + }, + { + "auxiliary_loss_clip": 0.01125729, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.04626441, + "balance_loss_mlp": 1.01885378, + "epoch": 0.46619570118743425, + "flos": 25574792182560.0, + "grad_norm": 1.8125175065989416, + "language_loss": 0.7292679, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.75084597, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13220215, + "step": 7754, + "time_per_iteration": 2.638335704803467 + }, + { + "auxiliary_loss_clip": 0.01128399, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.04491305, + "balance_loss_mlp": 1.01773834, + "epoch": 0.4662558244401022, + "flos": 29805003272160.0, + "grad_norm": 1.667018030046509, + "language_loss": 0.78937447, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.81097245, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.13659668, + "step": 7755, + "time_per_iteration": 2.703683853149414 + }, + { + "auxiliary_loss_clip": 0.01124254, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.04510283, + "balance_loss_mlp": 1.0174005, + "epoch": 0.4663159476927702, + "flos": 25130543194080.0, + "grad_norm": 1.7977243366224767, + "language_loss": 0.72346568, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.74501026, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12786865, + "step": 7756, + "time_per_iteration": 4.125009775161743 + }, + { + "auxiliary_loss_clip": 0.01126891, + "auxiliary_loss_mlp": 0.01030035, + "balance_loss_clip": 1.04658854, + "balance_loss_mlp": 1.01704717, + "epoch": 0.46637607094543815, + "flos": 31313448279360.0, + "grad_norm": 1.7624193696504646, + "language_loss": 0.7821027, + "learning_rate": 2.313580543272274e-06, + "loss": 0.80367196, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12976074, + "step": 7757, + "time_per_iteration": 2.8059396743774414 + }, + { + "auxiliary_loss_clip": 0.01125384, + "auxiliary_loss_mlp": 0.01029201, + "balance_loss_clip": 1.04466081, + "balance_loss_mlp": 1.01590908, + "epoch": 0.4664361941981061, + "flos": 29619824186880.0, + "grad_norm": 1.8600967235097152, + "language_loss": 0.66745353, + "learning_rate": 2.313195892540705e-06, + "loss": 0.68899935, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13293457, + "step": 7758, + "time_per_iteration": 2.756649971008301 + }, + { + "auxiliary_loss_clip": 0.01127965, + "auxiliary_loss_mlp": 0.01038741, + "balance_loss_clip": 1.04698694, + "balance_loss_mlp": 1.02530015, + "epoch": 0.4664963174507741, + "flos": 22458688050240.0, + "grad_norm": 1.7653935946664892, + "language_loss": 0.74849635, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.77016348, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13433838, + "step": 7759, + "time_per_iteration": 2.618682861328125 + }, + { + "auxiliary_loss_clip": 0.01125662, + "auxiliary_loss_mlp": 0.01035759, + "balance_loss_clip": 1.04560828, + "balance_loss_mlp": 1.02333784, + "epoch": 0.46655644070344204, + "flos": 27399956832000.0, + "grad_norm": 1.555668197506209, + "language_loss": 0.78014237, + "learning_rate": 2.312426555462893e-06, + "loss": 0.80175656, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12426758, + "step": 7760, + "time_per_iteration": 2.745530843734741 + }, + { + "auxiliary_loss_clip": 0.01120229, + "auxiliary_loss_mlp": 0.01033279, + "balance_loss_clip": 1.04219151, + "balance_loss_mlp": 1.01980793, + "epoch": 0.46661656395611, + "flos": 16848533714400.0, + "grad_norm": 1.6629290562560168, + "language_loss": 0.7417233, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76325834, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13464355, + "step": 7761, + "time_per_iteration": 3.984941005706787 + }, + { + "auxiliary_loss_clip": 0.01130469, + "auxiliary_loss_mlp": 0.01036558, + "balance_loss_clip": 1.04625559, + "balance_loss_mlp": 1.02127492, + "epoch": 0.466676687208778, + "flos": 26420145606720.0, + "grad_norm": 1.7720876289000456, + "language_loss": 0.78900337, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.81067359, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.1529541, + "step": 7762, + "time_per_iteration": 2.6209428310394287 + }, + { + "auxiliary_loss_clip": 0.01041866, + "auxiliary_loss_mlp": 0.00999287, + "balance_loss_clip": 1.01680005, + "balance_loss_mlp": 0.99794924, + "epoch": 0.46673681046144594, + "flos": 83626118642880.0, + "grad_norm": 0.8123129221099479, + "language_loss": 0.59844309, + "learning_rate": 2.311272461028297e-06, + "loss": 0.61885464, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 0.25024414, + "router_z_loss_mlp": 0.01338959, + "step": 7763, + "time_per_iteration": 3.330094575881958 + }, + { + "auxiliary_loss_clip": 0.01127838, + "auxiliary_loss_mlp": 0.01037943, + "balance_loss_clip": 1.04318535, + "balance_loss_mlp": 1.02297056, + "epoch": 0.46679693371411396, + "flos": 19296522120960.0, + "grad_norm": 3.5742626390280257, + "language_loss": 0.79210496, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.81376278, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.14959717, + "step": 7764, + "time_per_iteration": 2.6168601512908936 + }, + { + "auxiliary_loss_clip": 0.01122995, + "auxiliary_loss_mlp": 0.01035017, + "balance_loss_clip": 1.04406738, + "balance_loss_mlp": 1.0228157, + "epoch": 0.4668570569667819, + "flos": 22591403608320.0, + "grad_norm": 2.268161090761867, + "language_loss": 0.72295249, + "learning_rate": 2.310503005696839e-06, + "loss": 0.74453264, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12207031, + "step": 7765, + "time_per_iteration": 2.628178358078003 + }, + { + "auxiliary_loss_clip": 0.01124452, + "auxiliary_loss_mlp": 0.01036124, + "balance_loss_clip": 1.04216659, + "balance_loss_mlp": 1.02306461, + "epoch": 0.4669171802194499, + "flos": 23436027721440.0, + "grad_norm": 2.5780681931270277, + "language_loss": 0.78141487, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.8030206, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.13061523, + "step": 7766, + "time_per_iteration": 2.5967609882354736 + }, + { + "auxiliary_loss_clip": 0.01122654, + "auxiliary_loss_mlp": 0.0103404, + "balance_loss_clip": 1.0418967, + "balance_loss_mlp": 1.02136803, + "epoch": 0.46697730347211786, + "flos": 14978725372800.0, + "grad_norm": 3.3761640712713445, + "language_loss": 0.64637941, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.66794634, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.12683105, + "step": 7767, + "time_per_iteration": 2.6231346130371094 + }, + { + "auxiliary_loss_clip": 0.01125783, + "auxiliary_loss_mlp": 0.01034708, + "balance_loss_clip": 1.04450679, + "balance_loss_mlp": 1.02190495, + "epoch": 0.4670374267247858, + "flos": 28780791458400.0, + "grad_norm": 2.623336820149115, + "language_loss": 0.74111104, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.76271594, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.12799072, + "step": 7768, + "time_per_iteration": 2.7005465030670166 + }, + { + "auxiliary_loss_clip": 0.0112492, + "auxiliary_loss_mlp": 0.01032153, + "balance_loss_clip": 1.04318702, + "balance_loss_mlp": 1.0197258, + "epoch": 0.4670975499774538, + "flos": 19512609540480.0, + "grad_norm": 10.794963626509386, + "language_loss": 0.70696801, + "learning_rate": 2.308963953858982e-06, + "loss": 0.72853875, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.12420654, + "step": 7769, + "time_per_iteration": 2.6514341831207275 + }, + { + "auxiliary_loss_clip": 0.01124508, + "auxiliary_loss_mlp": 0.01031891, + "balance_loss_clip": 1.04253149, + "balance_loss_mlp": 1.01943398, + "epoch": 0.46715767323012175, + "flos": 18763350403680.0, + "grad_norm": 1.880502224627563, + "language_loss": 0.81573141, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83729541, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.12463379, + "step": 7770, + "time_per_iteration": 2.670544147491455 + }, + { + "auxiliary_loss_clip": 0.01043127, + "auxiliary_loss_mlp": 0.01007563, + "balance_loss_clip": 1.01823068, + "balance_loss_mlp": 1.00627971, + "epoch": 0.4672177964827897, + "flos": 73519714342080.0, + "grad_norm": 0.7971249989724023, + "language_loss": 0.55647367, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57698059, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01284027, + "step": 7771, + "time_per_iteration": 3.260108232498169 + }, + { + "auxiliary_loss_clip": 0.01123097, + "auxiliary_loss_mlp": 0.0103797, + "balance_loss_clip": 1.04292417, + "balance_loss_mlp": 1.02495885, + "epoch": 0.4672779197354577, + "flos": 33722222309280.0, + "grad_norm": 4.135062538329543, + "language_loss": 0.66129243, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.68290305, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13024902, + "step": 7772, + "time_per_iteration": 2.7316205501556396 + }, + { + "auxiliary_loss_clip": 0.01123121, + "auxiliary_loss_mlp": 0.01032715, + "balance_loss_clip": 1.04387605, + "balance_loss_mlp": 1.01995993, + "epoch": 0.46733804298812565, + "flos": 38308164831360.0, + "grad_norm": 1.8881024021297739, + "language_loss": 0.63781667, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.65937501, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12750244, + "step": 7773, + "time_per_iteration": 2.7231218814849854 + }, + { + "auxiliary_loss_clip": 0.01124472, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.04254913, + "balance_loss_mlp": 1.02213717, + "epoch": 0.4673981662407936, + "flos": 23794554846240.0, + "grad_norm": 1.9522273717928715, + "language_loss": 0.79923689, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.82084113, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13812256, + "step": 7774, + "time_per_iteration": 2.665497064590454 + }, + { + "auxiliary_loss_clip": 0.0112648, + "auxiliary_loss_mlp": 0.01027788, + "balance_loss_clip": 1.04361594, + "balance_loss_mlp": 1.01490772, + "epoch": 0.4674582894934616, + "flos": 25040769602400.0, + "grad_norm": 1.6880862879395035, + "language_loss": 0.77485061, + "learning_rate": 2.306655024915726e-06, + "loss": 0.79639328, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.12896729, + "step": 7775, + "time_per_iteration": 2.631115674972534 + }, + { + "auxiliary_loss_clip": 0.01122175, + "auxiliary_loss_mlp": 0.01029548, + "balance_loss_clip": 1.04312193, + "balance_loss_mlp": 1.01673913, + "epoch": 0.46751841274612954, + "flos": 26955505257120.0, + "grad_norm": 2.2469239056058488, + "language_loss": 0.69567192, + "learning_rate": 2.306270162640694e-06, + "loss": 0.71718919, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12811279, + "step": 7776, + "time_per_iteration": 2.671855926513672 + }, + { + "auxiliary_loss_clip": 0.01125232, + "auxiliary_loss_mlp": 0.0103331, + "balance_loss_clip": 1.04417396, + "balance_loss_mlp": 1.02085829, + "epoch": 0.46757853599879756, + "flos": 32921593611840.0, + "grad_norm": 1.767730093020481, + "language_loss": 0.73809922, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.75968468, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.12457275, + "step": 7777, + "time_per_iteration": 2.6898186206817627 + }, + { + "auxiliary_loss_clip": 0.01122637, + "auxiliary_loss_mlp": 0.01029614, + "balance_loss_clip": 1.0421164, + "balance_loss_mlp": 1.01712716, + "epoch": 0.4676386592514655, + "flos": 29448664080480.0, + "grad_norm": 2.1971840505738265, + "language_loss": 0.69951725, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.72103977, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12493896, + "step": 7778, + "time_per_iteration": 2.8014283180236816 + }, + { + "auxiliary_loss_clip": 0.01125822, + "auxiliary_loss_mlp": 0.01041658, + "balance_loss_clip": 1.0442574, + "balance_loss_mlp": 1.02784777, + "epoch": 0.4676987825041335, + "flos": 31095659134080.0, + "grad_norm": 1.5512639557060808, + "language_loss": 0.73269451, + "learning_rate": 2.305115506191206e-06, + "loss": 0.75436926, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13818359, + "step": 7779, + "time_per_iteration": 2.7307913303375244 + }, + { + "auxiliary_loss_clip": 0.01122787, + "auxiliary_loss_mlp": 0.01037521, + "balance_loss_clip": 1.0432055, + "balance_loss_mlp": 1.02527809, + "epoch": 0.46775890575680146, + "flos": 26777740834080.0, + "grad_norm": 1.6452644990743015, + "language_loss": 0.72627234, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74787539, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12243652, + "step": 7780, + "time_per_iteration": 2.664625406265259 + }, + { + "auxiliary_loss_clip": 0.01126405, + "auxiliary_loss_mlp": 0.01036117, + "balance_loss_clip": 1.04207206, + "balance_loss_mlp": 1.02287912, + "epoch": 0.4678190290094694, + "flos": 30784085186400.0, + "grad_norm": 2.3887059943850035, + "language_loss": 0.73841143, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.76003659, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.13226318, + "step": 7781, + "time_per_iteration": 2.739715814590454 + }, + { + "auxiliary_loss_clip": 0.01127819, + "auxiliary_loss_mlp": 0.01033654, + "balance_loss_clip": 1.04323721, + "balance_loss_mlp": 1.020136, + "epoch": 0.4678791522621374, + "flos": 39374953956000.0, + "grad_norm": 1.884278337598032, + "language_loss": 0.62940365, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.65101838, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.13513184, + "step": 7782, + "time_per_iteration": 2.8019983768463135 + }, + { + "auxiliary_loss_clip": 0.01128292, + "auxiliary_loss_mlp": 0.01045532, + "balance_loss_clip": 1.04440033, + "balance_loss_mlp": 1.0322578, + "epoch": 0.46793927551480535, + "flos": 33003223230240.0, + "grad_norm": 1.920455906495583, + "language_loss": 0.63256371, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.654302, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.13269043, + "step": 7783, + "time_per_iteration": 2.7830772399902344 + }, + { + "auxiliary_loss_clip": 0.01131802, + "auxiliary_loss_mlp": 0.01036929, + "balance_loss_clip": 1.04574823, + "balance_loss_mlp": 1.0226599, + "epoch": 0.4679993987674733, + "flos": 21301274471040.0, + "grad_norm": 3.712769096596484, + "language_loss": 0.68110728, + "learning_rate": 2.303190847569801e-06, + "loss": 0.70279455, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 0.86035156, + "router_z_loss_mlp": 0.14282227, + "step": 7784, + "time_per_iteration": 2.6604461669921875 + }, + { + "auxiliary_loss_clip": 0.01123013, + "auxiliary_loss_mlp": 0.01029991, + "balance_loss_clip": 1.04236794, + "balance_loss_mlp": 1.01785588, + "epoch": 0.4680595220201413, + "flos": 20944651658400.0, + "grad_norm": 2.1689945515474887, + "language_loss": 0.83736229, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.85889232, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.121521, + "step": 7785, + "time_per_iteration": 2.703489303588867 + }, + { + "auxiliary_loss_clip": 0.01125883, + "auxiliary_loss_mlp": 0.01035348, + "balance_loss_clip": 1.04346347, + "balance_loss_mlp": 1.0223484, + "epoch": 0.46811964527280925, + "flos": 13821230759040.0, + "grad_norm": 2.037994953813759, + "language_loss": 0.76956522, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.79117757, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.12994385, + "step": 7786, + "time_per_iteration": 2.638490676879883 + }, + { + "auxiliary_loss_clip": 0.01120943, + "auxiliary_loss_mlp": 0.0102832, + "balance_loss_clip": 1.04253399, + "balance_loss_mlp": 1.01607084, + "epoch": 0.4681797685254772, + "flos": 29625415571520.0, + "grad_norm": 2.2578867915764267, + "language_loss": 0.7410506, + "learning_rate": 2.302035914315856e-06, + "loss": 0.7625432, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12255859, + "step": 7787, + "time_per_iteration": 2.7335429191589355 + }, + { + "auxiliary_loss_clip": 0.01125347, + "auxiliary_loss_mlp": 0.01041008, + "balance_loss_clip": 1.0445528, + "balance_loss_mlp": 1.02811551, + "epoch": 0.4682398917781452, + "flos": 38620630159200.0, + "grad_norm": 1.7356777014060167, + "language_loss": 0.65299404, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.67465758, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.12896729, + "step": 7788, + "time_per_iteration": 4.137760639190674 + }, + { + "auxiliary_loss_clip": 0.01123177, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.04363298, + "balance_loss_mlp": 1.01892281, + "epoch": 0.46830001503081314, + "flos": 34301374788960.0, + "grad_norm": 1.7199438056966014, + "language_loss": 0.64349854, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.66503668, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.11706543, + "step": 7789, + "time_per_iteration": 4.371373176574707 + }, + { + "auxiliary_loss_clip": 0.01047293, + "auxiliary_loss_mlp": 0.01002044, + "balance_loss_clip": 1.02209306, + "balance_loss_mlp": 1.00070989, + "epoch": 0.4683601382834811, + "flos": 70626950222400.0, + "grad_norm": 0.6985336130832225, + "language_loss": 0.61841482, + "learning_rate": 2.300880877982825e-06, + "loss": 0.63890815, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 0.25219727, + "router_z_loss_mlp": 0.01335144, + "step": 7790, + "time_per_iteration": 3.3518190383911133 + }, + { + "auxiliary_loss_clip": 0.01124138, + "auxiliary_loss_mlp": 0.01032819, + "balance_loss_clip": 1.04483318, + "balance_loss_mlp": 1.020087, + "epoch": 0.46842026153614913, + "flos": 26688250863360.0, + "grad_norm": 1.6754286601258144, + "language_loss": 0.79164577, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81321537, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12738037, + "step": 7791, + "time_per_iteration": 2.66898512840271 + }, + { + "auxiliary_loss_clip": 0.01126736, + "auxiliary_loss_mlp": 0.01035914, + "balance_loss_clip": 1.04569864, + "balance_loss_mlp": 1.02301526, + "epoch": 0.4684803847888171, + "flos": 30384277303680.0, + "grad_norm": 5.931848022360093, + "language_loss": 0.74654949, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.76817596, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.12908936, + "step": 7792, + "time_per_iteration": 2.649634599685669 + }, + { + "auxiliary_loss_clip": 0.0112347, + "auxiliary_loss_mlp": 0.0103117, + "balance_loss_clip": 1.04433894, + "balance_loss_mlp": 1.01916575, + "epoch": 0.46854050804148506, + "flos": 32032609427520.0, + "grad_norm": 1.5378749865558843, + "language_loss": 0.68289465, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70444107, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12011719, + "step": 7793, + "time_per_iteration": 2.6960160732269287 + }, + { + "auxiliary_loss_clip": 0.01125269, + "auxiliary_loss_mlp": 0.0102758, + "balance_loss_clip": 1.04670036, + "balance_loss_mlp": 1.01583767, + "epoch": 0.468600631294153, + "flos": 26331790119840.0, + "grad_norm": 1.6475271853671916, + "language_loss": 0.74022198, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.76175046, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.11749268, + "step": 7794, + "time_per_iteration": 2.640202045440674 + }, + { + "auxiliary_loss_clip": 0.01127772, + "auxiliary_loss_mlp": 0.0103266, + "balance_loss_clip": 1.04792762, + "balance_loss_mlp": 1.0199585, + "epoch": 0.468660754546821, + "flos": 31586050954080.0, + "grad_norm": 1.7006312119145328, + "language_loss": 0.63102531, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.65262967, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.1270752, + "step": 7795, + "time_per_iteration": 2.694817543029785 + }, + { + "auxiliary_loss_clip": 0.01121976, + "auxiliary_loss_mlp": 0.01027996, + "balance_loss_clip": 1.04415464, + "balance_loss_mlp": 1.0156101, + "epoch": 0.46872087779948896, + "flos": 43288323851520.0, + "grad_norm": 2.2021303087824067, + "language_loss": 0.68366313, + "learning_rate": 2.298570497656304e-06, + "loss": 0.70516294, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12390137, + "step": 7796, + "time_per_iteration": 4.198785305023193 + }, + { + "auxiliary_loss_clip": 0.01124637, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.04467607, + "balance_loss_mlp": 1.01695609, + "epoch": 0.4687810010521569, + "flos": 32209360918560.0, + "grad_norm": 1.882902818560631, + "language_loss": 0.70093852, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.7224766, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12213135, + "step": 7797, + "time_per_iteration": 2.657684087753296 + }, + { + "auxiliary_loss_clip": 0.01127063, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.04627085, + "balance_loss_mlp": 1.01893079, + "epoch": 0.4688411243048249, + "flos": 24370911633600.0, + "grad_norm": 3.2278535981168806, + "language_loss": 0.67176473, + "learning_rate": 2.297800280150454e-06, + "loss": 0.69335771, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13305664, + "step": 7798, + "time_per_iteration": 2.6605770587921143 + }, + { + "auxiliary_loss_clip": 0.01046856, + "auxiliary_loss_mlp": 0.01001937, + "balance_loss_clip": 1.02173591, + "balance_loss_mlp": 1.00060678, + "epoch": 0.46890124755749285, + "flos": 78064700244480.0, + "grad_norm": 0.9430722193534152, + "language_loss": 0.6453563, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.6658442, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 0.2512207, + "router_z_loss_mlp": 0.01331329, + "step": 7799, + "time_per_iteration": 3.4793150424957275 + }, + { + "auxiliary_loss_clip": 0.01122047, + "auxiliary_loss_mlp": 0.01025589, + "balance_loss_clip": 1.04318917, + "balance_loss_mlp": 1.01367354, + "epoch": 0.4689613708101608, + "flos": 29003199573600.0, + "grad_norm": 1.4110419767854547, + "language_loss": 0.72503769, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.74651408, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.11920166, + "step": 7800, + "time_per_iteration": 2.703601121902466 + }, + { + "auxiliary_loss_clip": 0.01122635, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.04596043, + "balance_loss_mlp": 1.01906741, + "epoch": 0.4690214940628288, + "flos": 30247712604000.0, + "grad_norm": 1.8943768445345144, + "language_loss": 0.72245777, + "learning_rate": 2.296644869233568e-06, + "loss": 0.74398947, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11480713, + "step": 7801, + "time_per_iteration": 4.053922176361084 + }, + { + "auxiliary_loss_clip": 0.01130345, + "auxiliary_loss_mlp": 0.01039539, + "balance_loss_clip": 1.04740226, + "balance_loss_mlp": 1.0255022, + "epoch": 0.46908161731549675, + "flos": 22057786200960.0, + "grad_norm": 2.005325381291288, + "language_loss": 0.62198246, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.64368129, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.14050293, + "step": 7802, + "time_per_iteration": 2.683342456817627 + }, + { + "auxiliary_loss_clip": 0.01124974, + "auxiliary_loss_mlp": 0.01035492, + "balance_loss_clip": 1.04357028, + "balance_loss_mlp": 1.0231179, + "epoch": 0.4691417405681647, + "flos": 31363602321600.0, + "grad_norm": 3.189030811757304, + "language_loss": 0.73527765, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.75688237, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.1237793, + "step": 7803, + "time_per_iteration": 2.7493698596954346 + }, + { + "auxiliary_loss_clip": 0.01121246, + "auxiliary_loss_mlp": 0.01035486, + "balance_loss_clip": 1.04272246, + "balance_loss_mlp": 1.02355886, + "epoch": 0.46920186382083273, + "flos": 21300869298240.0, + "grad_norm": 1.5429144101030456, + "language_loss": 0.77137053, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.79293787, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.1192627, + "step": 7804, + "time_per_iteration": 2.683307647705078 + }, + { + "auxiliary_loss_clip": 0.01122436, + "auxiliary_loss_mlp": 0.01029399, + "balance_loss_clip": 1.04385114, + "balance_loss_mlp": 1.01716197, + "epoch": 0.4692619870735007, + "flos": 24818199418080.0, + "grad_norm": 2.2717469453292414, + "language_loss": 0.77129334, + "learning_rate": 2.295104163929305e-06, + "loss": 0.79281169, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12243652, + "step": 7805, + "time_per_iteration": 2.617908000946045 + }, + { + "auxiliary_loss_clip": 0.01130933, + "auxiliary_loss_mlp": 0.0103813, + "balance_loss_clip": 1.04615545, + "balance_loss_mlp": 1.02477241, + "epoch": 0.46932211032616866, + "flos": 35992486810080.0, + "grad_norm": 1.7285631570183795, + "language_loss": 0.83149672, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.85318732, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.13360596, + "step": 7806, + "time_per_iteration": 2.7381274700164795 + }, + { + "auxiliary_loss_clip": 0.01125973, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.0439918, + "balance_loss_mlp": 1.02232528, + "epoch": 0.4693822335788366, + "flos": 44186140802880.0, + "grad_norm": 1.68523865021617, + "language_loss": 0.77147305, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79308665, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13079834, + "step": 7807, + "time_per_iteration": 2.7737114429473877 + }, + { + "auxiliary_loss_clip": 0.01125745, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.0451293, + "balance_loss_mlp": 1.02089298, + "epoch": 0.4694423568315046, + "flos": 24817672693440.0, + "grad_norm": 2.7343774297022843, + "language_loss": 0.51596427, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.53756428, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.1338501, + "step": 7808, + "time_per_iteration": 2.655749797821045 + }, + { + "auxiliary_loss_clip": 0.01043393, + "auxiliary_loss_mlp": 0.01001712, + "balance_loss_clip": 1.01824319, + "balance_loss_mlp": 1.00040221, + "epoch": 0.46950248008417256, + "flos": 78492418182720.0, + "grad_norm": 0.7886096023751563, + "language_loss": 0.57839578, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59884691, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 0.0131073, + "step": 7809, + "time_per_iteration": 3.133024215698242 + }, + { + "auxiliary_loss_clip": 0.01128687, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.04684734, + "balance_loss_mlp": 1.02483535, + "epoch": 0.4695626033368405, + "flos": 23428653576480.0, + "grad_norm": 3.5080684719657294, + "language_loss": 0.71795583, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.73961747, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.12646484, + "step": 7810, + "time_per_iteration": 2.715303659439087 + }, + { + "auxiliary_loss_clip": 0.01126399, + "auxiliary_loss_mlp": 0.01041968, + "balance_loss_clip": 1.04468358, + "balance_loss_mlp": 1.02958226, + "epoch": 0.4696227265895085, + "flos": 28067829454080.0, + "grad_norm": 1.843145503505596, + "language_loss": 0.81123412, + "learning_rate": 2.29279277055369e-06, + "loss": 0.83291781, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.12384033, + "step": 7811, + "time_per_iteration": 2.852843999862671 + }, + { + "auxiliary_loss_clip": 0.01124814, + "auxiliary_loss_mlp": 0.0103297, + "balance_loss_clip": 1.04319024, + "balance_loss_mlp": 1.0202322, + "epoch": 0.46968284984217645, + "flos": 26687035344960.0, + "grad_norm": 1.8301821458838887, + "language_loss": 0.80774039, + "learning_rate": 2.292407499379644e-06, + "loss": 0.82931828, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.12744141, + "step": 7812, + "time_per_iteration": 2.6880710124969482 + }, + { + "auxiliary_loss_clip": 0.01122463, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.04488349, + "balance_loss_mlp": 1.01908088, + "epoch": 0.4697429730948444, + "flos": 24373423704960.0, + "grad_norm": 8.427034434065233, + "language_loss": 0.74184132, + "learning_rate": 2.292022217117477e-06, + "loss": 0.76337856, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12176514, + "step": 7813, + "time_per_iteration": 2.6947431564331055 + }, + { + "auxiliary_loss_clip": 0.01122943, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.04356718, + "balance_loss_mlp": 1.01725268, + "epoch": 0.4698030963475124, + "flos": 18493421869440.0, + "grad_norm": 2.417961759166119, + "language_loss": 0.84475553, + "learning_rate": 2.291636923781798e-06, + "loss": 0.86628509, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12756348, + "step": 7814, + "time_per_iteration": 2.601818323135376 + }, + { + "auxiliary_loss_clip": 0.01119469, + "auxiliary_loss_mlp": 0.01035696, + "balance_loss_clip": 1.04189146, + "balance_loss_mlp": 1.02366829, + "epoch": 0.46986321960018035, + "flos": 18487749450240.0, + "grad_norm": 1.8331034602314118, + "language_loss": 0.816746, + "learning_rate": 2.291251619387217e-06, + "loss": 0.83829761, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12036133, + "step": 7815, + "time_per_iteration": 2.692383050918579 + }, + { + "auxiliary_loss_clip": 0.01123626, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.0433228, + "balance_loss_mlp": 1.02185178, + "epoch": 0.4699233428528483, + "flos": 28198032940800.0, + "grad_norm": 2.2157888580650345, + "language_loss": 0.77755272, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.79914105, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13336182, + "step": 7816, + "time_per_iteration": 2.646308183670044 + }, + { + "auxiliary_loss_clip": 0.01041346, + "auxiliary_loss_mlp": 0.01000951, + "balance_loss_clip": 1.01625085, + "balance_loss_mlp": 0.99963337, + "epoch": 0.46998346610551633, + "flos": 83102144348160.0, + "grad_norm": 0.83984373765927, + "language_loss": 0.59029317, + "learning_rate": 2.290480977479796e-06, + "loss": 0.6107161, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 0.25097656, + "router_z_loss_mlp": 0.01317596, + "step": 7817, + "time_per_iteration": 3.2675459384918213 + }, + { + "auxiliary_loss_clip": 0.01119687, + "auxiliary_loss_mlp": 0.01030509, + "balance_loss_clip": 1.04340076, + "balance_loss_mlp": 1.018713, + "epoch": 0.4700435893581843, + "flos": 29443477868640.0, + "grad_norm": 2.6013564501762887, + "language_loss": 0.79304063, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.81454265, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.11804199, + "step": 7818, + "time_per_iteration": 2.6312429904937744 + }, + { + "auxiliary_loss_clip": 0.0112301, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.04256463, + "balance_loss_mlp": 1.01754093, + "epoch": 0.47010371261085226, + "flos": 24588255088800.0, + "grad_norm": 2.038136130699, + "language_loss": 0.84268796, + "learning_rate": 2.289710291512104e-06, + "loss": 0.86421287, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.1194458, + "step": 7819, + "time_per_iteration": 2.781662940979004 + }, + { + "auxiliary_loss_clip": 0.01125672, + "auxiliary_loss_mlp": 0.01032533, + "balance_loss_clip": 1.04380405, + "balance_loss_mlp": 1.01978385, + "epoch": 0.47016383586352023, + "flos": 18451695421440.0, + "grad_norm": 2.098242660904179, + "language_loss": 0.76279581, + "learning_rate": 2.289324932042186e-06, + "loss": 0.78437781, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.12750244, + "step": 7820, + "time_per_iteration": 2.6036996841430664 + }, + { + "auxiliary_loss_clip": 0.01122354, + "auxiliary_loss_mlp": 0.01037355, + "balance_loss_clip": 1.04525042, + "balance_loss_mlp": 1.02459335, + "epoch": 0.4702239591161882, + "flos": 16536838214880.0, + "grad_norm": 1.9282398829041019, + "language_loss": 0.74334407, + "learning_rate": 2.288939561601039e-06, + "loss": 0.76494116, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12774658, + "step": 7821, + "time_per_iteration": 2.6260766983032227 + }, + { + "auxiliary_loss_clip": 0.01121906, + "auxiliary_loss_mlp": 0.01039041, + "balance_loss_clip": 1.04385662, + "balance_loss_mlp": 1.02750146, + "epoch": 0.47028408236885616, + "flos": 29622822465600.0, + "grad_norm": 1.8705460329749264, + "language_loss": 0.88961756, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.91122699, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.11541748, + "step": 7822, + "time_per_iteration": 2.651993989944458 + }, + { + "auxiliary_loss_clip": 0.01122667, + "auxiliary_loss_mlp": 0.01032149, + "balance_loss_clip": 1.04416513, + "balance_loss_mlp": 1.02028203, + "epoch": 0.4703442056215241, + "flos": 27890227100160.0, + "grad_norm": 1.5283879008282824, + "language_loss": 0.79770333, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.81925148, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.11871338, + "step": 7823, + "time_per_iteration": 2.6832642555236816 + }, + { + "auxiliary_loss_clip": 0.01042233, + "auxiliary_loss_mlp": 0.010014, + "balance_loss_clip": 1.01715732, + "balance_loss_mlp": 1.00004208, + "epoch": 0.4704043288741921, + "flos": 84486422943360.0, + "grad_norm": 0.6974225347833337, + "language_loss": 0.56663042, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.58706671, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01358795, + "step": 7824, + "time_per_iteration": 3.34788179397583 + }, + { + "auxiliary_loss_clip": 0.01123999, + "auxiliary_loss_mlp": 0.01038116, + "balance_loss_clip": 1.04317939, + "balance_loss_mlp": 1.02468669, + "epoch": 0.47046445212686006, + "flos": 22013547681600.0, + "grad_norm": 2.736328850032893, + "language_loss": 0.80806899, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.82969022, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13409424, + "step": 7825, + "time_per_iteration": 2.6434166431427 + }, + { + "auxiliary_loss_clip": 0.01125523, + "auxiliary_loss_mlp": 0.01028429, + "balance_loss_clip": 1.04515862, + "balance_loss_mlp": 1.01554859, + "epoch": 0.470524575379528, + "flos": 29225850792480.0, + "grad_norm": 2.1378984509261794, + "language_loss": 0.66815221, + "learning_rate": 2.287012545338324e-06, + "loss": 0.68969172, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12884521, + "step": 7826, + "time_per_iteration": 2.6891090869903564 + }, + { + "auxiliary_loss_clip": 0.01123859, + "auxiliary_loss_mlp": 0.01034691, + "balance_loss_clip": 1.04315174, + "balance_loss_mlp": 1.02208495, + "epoch": 0.470584698632196, + "flos": 22102024720320.0, + "grad_norm": 2.0083711577294925, + "language_loss": 0.83599269, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.85757816, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.1260376, + "step": 7827, + "time_per_iteration": 2.6576178073883057 + }, + { + "auxiliary_loss_clip": 0.01041782, + "auxiliary_loss_mlp": 0.01001974, + "balance_loss_clip": 1.016711, + "balance_loss_mlp": 1.00051844, + "epoch": 0.47064482188486395, + "flos": 69859296240480.0, + "grad_norm": 0.8199329899964549, + "language_loss": 0.55687815, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57731569, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 0.25061035, + "router_z_loss_mlp": 0.01455688, + "step": 7828, + "time_per_iteration": 6.107317924499512 + }, + { + "auxiliary_loss_clip": 0.0112073, + "auxiliary_loss_mlp": 0.01029446, + "balance_loss_clip": 1.04255474, + "balance_loss_mlp": 1.01728666, + "epoch": 0.4707049451375319, + "flos": 21835378085760.0, + "grad_norm": 2.322399878304838, + "language_loss": 0.81061262, + "learning_rate": 2.285856204861245e-06, + "loss": 0.83211434, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12176514, + "step": 7829, + "time_per_iteration": 2.664811134338379 + }, + { + "auxiliary_loss_clip": 0.01125748, + "auxiliary_loss_mlp": 0.01031664, + "balance_loss_clip": 1.04641986, + "balance_loss_mlp": 1.01978481, + "epoch": 0.47076506839019994, + "flos": 30788906742720.0, + "grad_norm": 1.4259397679253725, + "language_loss": 0.75629079, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.77786487, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.11877441, + "step": 7830, + "time_per_iteration": 2.7221386432647705 + }, + { + "auxiliary_loss_clip": 0.01125435, + "auxiliary_loss_mlp": 0.01030617, + "balance_loss_clip": 1.04690075, + "balance_loss_mlp": 1.01737928, + "epoch": 0.4708251916428679, + "flos": 16447429278720.0, + "grad_norm": 3.101484857247387, + "language_loss": 0.79080272, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.81236321, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.13238525, + "step": 7831, + "time_per_iteration": 2.599439859390259 + }, + { + "auxiliary_loss_clip": 0.01129027, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.04459906, + "balance_loss_mlp": 1.01806879, + "epoch": 0.47088531489553587, + "flos": 36787726709280.0, + "grad_norm": 1.727654672559475, + "language_loss": 0.75581688, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.77741838, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.13061523, + "step": 7832, + "time_per_iteration": 2.7183709144592285 + }, + { + "auxiliary_loss_clip": 0.01124223, + "auxiliary_loss_mlp": 0.01026919, + "balance_loss_clip": 1.04512668, + "balance_loss_mlp": 1.01515889, + "epoch": 0.47094543814820383, + "flos": 26598436754400.0, + "grad_norm": 1.6072897465911165, + "language_loss": 0.74517465, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.76668596, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.11773682, + "step": 7833, + "time_per_iteration": 2.6319804191589355 + }, + { + "auxiliary_loss_clip": 0.01123123, + "auxiliary_loss_mlp": 0.01034645, + "balance_loss_clip": 1.04404736, + "balance_loss_mlp": 1.02166271, + "epoch": 0.4710055614008718, + "flos": 28068477730560.0, + "grad_norm": 1.6188199114936015, + "language_loss": 0.75862908, + "learning_rate": 2.283928754133762e-06, + "loss": 0.7802068, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12982178, + "step": 7834, + "time_per_iteration": 2.6495165824890137 + }, + { + "auxiliary_loss_clip": 0.01123451, + "auxiliary_loss_mlp": 0.01030406, + "balance_loss_clip": 1.04493856, + "balance_loss_mlp": 1.01824713, + "epoch": 0.47106568465353976, + "flos": 52154753057280.0, + "grad_norm": 1.7734478312354653, + "language_loss": 0.66348565, + "learning_rate": 2.283543231629972e-06, + "loss": 0.6850242, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12164307, + "step": 7835, + "time_per_iteration": 2.819342613220215 + }, + { + "auxiliary_loss_clip": 0.01041385, + "auxiliary_loss_mlp": 0.00998975, + "balance_loss_clip": 1.01616788, + "balance_loss_mlp": 0.99750894, + "epoch": 0.4711258079062077, + "flos": 83651759730720.0, + "grad_norm": 0.8787323640927557, + "language_loss": 0.62125242, + "learning_rate": 2.283157698374194e-06, + "loss": 0.64165598, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 0.25317383, + "router_z_loss_mlp": 0.01464844, + "step": 7836, + "time_per_iteration": 4.647252082824707 + }, + { + "auxiliary_loss_clip": 0.01127863, + "auxiliary_loss_mlp": 0.01029989, + "balance_loss_clip": 1.0441761, + "balance_loss_mlp": 1.01744246, + "epoch": 0.4711859311588757, + "flos": 31050407682720.0, + "grad_norm": 1.8210199837884549, + "language_loss": 0.69218528, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.71376377, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.12542725, + "step": 7837, + "time_per_iteration": 2.711402416229248 + }, + { + "auxiliary_loss_clip": 0.0112725, + "auxiliary_loss_mlp": 0.01035533, + "balance_loss_clip": 1.04613876, + "balance_loss_mlp": 1.02175272, + "epoch": 0.47124605441154366, + "flos": 26822384526240.0, + "grad_norm": 1.752077801938391, + "language_loss": 0.65923321, + "learning_rate": 2.282386599665153e-06, + "loss": 0.680861, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13781738, + "step": 7838, + "time_per_iteration": 2.6460018157958984 + }, + { + "auxiliary_loss_clip": 0.01127017, + "auxiliary_loss_mlp": 0.0103003, + "balance_loss_clip": 1.044276, + "balance_loss_mlp": 1.01697683, + "epoch": 0.4713061776642116, + "flos": 31008802786560.0, + "grad_norm": 3.539371718859781, + "language_loss": 0.77193809, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.79350853, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.13049316, + "step": 7839, + "time_per_iteration": 2.7000274658203125 + }, + { + "auxiliary_loss_clip": 0.01120404, + "auxiliary_loss_mlp": 0.01028559, + "balance_loss_clip": 1.04255128, + "balance_loss_mlp": 1.01655447, + "epoch": 0.4713663009168796, + "flos": 32387327928000.0, + "grad_norm": 2.612548391724266, + "language_loss": 0.72873431, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.75022388, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12005615, + "step": 7840, + "time_per_iteration": 2.6673808097839355 + }, + { + "auxiliary_loss_clip": 0.01122737, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_clip": 1.04381239, + "balance_loss_mlp": 1.01424217, + "epoch": 0.47142642416954755, + "flos": 28825718771520.0, + "grad_norm": 1.7184991639907463, + "language_loss": 0.74988395, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77137959, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12579346, + "step": 7841, + "time_per_iteration": 3.9588282108306885 + }, + { + "auxiliary_loss_clip": 0.01124862, + "auxiliary_loss_mlp": 0.01031142, + "balance_loss_clip": 1.04453647, + "balance_loss_mlp": 1.01874411, + "epoch": 0.4714865474222155, + "flos": 27222962237280.0, + "grad_norm": 1.6971763463756837, + "language_loss": 0.70064831, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72220826, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12402344, + "step": 7842, + "time_per_iteration": 2.7307724952697754 + }, + { + "auxiliary_loss_clip": 0.0112725, + "auxiliary_loss_mlp": 0.01032244, + "balance_loss_clip": 1.04812741, + "balance_loss_mlp": 1.01950026, + "epoch": 0.4715466706748835, + "flos": 21746698460640.0, + "grad_norm": 2.1726055923752363, + "language_loss": 0.78834546, + "learning_rate": 2.280458665756177e-06, + "loss": 0.8099404, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12750244, + "step": 7843, + "time_per_iteration": 2.6213815212249756 + }, + { + "auxiliary_loss_clip": 0.01125125, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.04486918, + "balance_loss_mlp": 1.01992846, + "epoch": 0.4716067939275515, + "flos": 28869633152640.0, + "grad_norm": 1.6079283954283536, + "language_loss": 0.73868227, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76024938, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.11657715, + "step": 7844, + "time_per_iteration": 2.6925530433654785 + }, + { + "auxiliary_loss_clip": 0.01123423, + "auxiliary_loss_mlp": 0.01036747, + "balance_loss_clip": 1.04454291, + "balance_loss_mlp": 1.02414083, + "epoch": 0.47166691718021947, + "flos": 21878320052160.0, + "grad_norm": 1.8686690450983892, + "language_loss": 0.7862519, + "learning_rate": 2.279687417645088e-06, + "loss": 0.80785358, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12615967, + "step": 7845, + "time_per_iteration": 2.6148464679718018 + }, + { + "auxiliary_loss_clip": 0.0112066, + "auxiliary_loss_mlp": 0.01033988, + "balance_loss_clip": 1.04321408, + "balance_loss_mlp": 1.02198958, + "epoch": 0.47172704043288743, + "flos": 32475642897600.0, + "grad_norm": 1.3911631751678855, + "language_loss": 0.73044151, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75198793, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12011719, + "step": 7846, + "time_per_iteration": 2.7105884552001953 + }, + { + "auxiliary_loss_clip": 0.01117834, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.04125178, + "balance_loss_mlp": 1.02061033, + "epoch": 0.4717871636855554, + "flos": 34072402874400.0, + "grad_norm": 2.0694534580412616, + "language_loss": 0.74325657, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76475698, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.1159668, + "step": 7847, + "time_per_iteration": 2.7060298919677734 + }, + { + "auxiliary_loss_clip": 0.01123807, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_clip": 1.04460239, + "balance_loss_mlp": 1.02102697, + "epoch": 0.47184728693822336, + "flos": 17695143174240.0, + "grad_norm": 1.794199489380085, + "language_loss": 0.80308342, + "learning_rate": 2.278530465971703e-06, + "loss": 0.82464945, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.11773682, + "step": 7848, + "time_per_iteration": 2.790416717529297 + }, + { + "auxiliary_loss_clip": 0.01129266, + "auxiliary_loss_mlp": 0.01030569, + "balance_loss_clip": 1.04887414, + "balance_loss_mlp": 1.01861858, + "epoch": 0.47190741019089133, + "flos": 21788546460480.0, + "grad_norm": 2.4108496793086847, + "language_loss": 0.70489216, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.7264905, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.1194458, + "step": 7849, + "time_per_iteration": 2.578510046005249 + }, + { + "auxiliary_loss_clip": 0.01129059, + "auxiliary_loss_mlp": 0.01032912, + "balance_loss_clip": 1.04496193, + "balance_loss_mlp": 1.01935768, + "epoch": 0.4719675334435593, + "flos": 21835823775840.0, + "grad_norm": 2.1037682323779525, + "language_loss": 0.70030266, + "learning_rate": 2.277759112022224e-06, + "loss": 0.7219224, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.13562012, + "step": 7850, + "time_per_iteration": 2.617197275161743 + }, + { + "auxiliary_loss_clip": 0.01126923, + "auxiliary_loss_mlp": 0.01029102, + "balance_loss_clip": 1.04448652, + "balance_loss_mlp": 1.01645994, + "epoch": 0.47202765669622726, + "flos": 25263501855840.0, + "grad_norm": 2.2974627939795553, + "language_loss": 0.75072706, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.77228737, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.12646484, + "step": 7851, + "time_per_iteration": 2.6077914237976074 + }, + { + "auxiliary_loss_clip": 0.0112494, + "auxiliary_loss_mlp": 0.01035389, + "balance_loss_clip": 1.04317427, + "balance_loss_mlp": 1.02203763, + "epoch": 0.4720877799488952, + "flos": 19962693017280.0, + "grad_norm": 2.1229084871117863, + "language_loss": 0.76380092, + "learning_rate": 2.276987715942132e-06, + "loss": 0.78540421, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13354492, + "step": 7852, + "time_per_iteration": 2.644901990890503 + }, + { + "auxiliary_loss_clip": 0.01123425, + "auxiliary_loss_mlp": 0.01031138, + "balance_loss_clip": 1.04338169, + "balance_loss_mlp": 1.01831055, + "epoch": 0.4721479032015632, + "flos": 25219303853760.0, + "grad_norm": 1.9430622256937196, + "language_loss": 0.68973655, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.71128213, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12811279, + "step": 7853, + "time_per_iteration": 2.682204484939575 + }, + { + "auxiliary_loss_clip": 0.01037592, + "auxiliary_loss_mlp": 0.01004718, + "balance_loss_clip": 1.01221645, + "balance_loss_mlp": 1.00324106, + "epoch": 0.47220802645423116, + "flos": 82667903127840.0, + "grad_norm": 0.7141498901055608, + "language_loss": 0.5019998, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52242291, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 0.25366211, + "router_z_loss_mlp": 0.01476288, + "step": 7854, + "time_per_iteration": 3.409447431564331 + }, + { + "auxiliary_loss_clip": 0.01125237, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.04394448, + "balance_loss_mlp": 1.0183537, + "epoch": 0.4722681497068991, + "flos": 25528568316480.0, + "grad_norm": 1.9566821567457895, + "language_loss": 0.64038616, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.66195816, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.13592529, + "step": 7855, + "time_per_iteration": 2.6508278846740723 + }, + { + "auxiliary_loss_clip": 0.01124429, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.04330325, + "balance_loss_mlp": 1.0173378, + "epoch": 0.4723282729595671, + "flos": 34523863938720.0, + "grad_norm": 2.122209945704489, + "language_loss": 0.76026124, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.78181028, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13146973, + "step": 7856, + "time_per_iteration": 2.6545910835266113 + }, + { + "auxiliary_loss_clip": 0.01121029, + "auxiliary_loss_mlp": 0.01032064, + "balance_loss_clip": 1.04159546, + "balance_loss_mlp": 1.02000654, + "epoch": 0.4723883962122351, + "flos": 33098588206560.0, + "grad_norm": 2.5888544276043706, + "language_loss": 0.74758679, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.76911771, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12060547, + "step": 7857, + "time_per_iteration": 2.6851112842559814 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.01036465, + "balance_loss_clip": 1.04370189, + "balance_loss_mlp": 1.02509856, + "epoch": 0.47244851946490307, + "flos": 38482930975680.0, + "grad_norm": 1.4764814394057995, + "language_loss": 0.64912337, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.67069972, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.11364746, + "step": 7858, + "time_per_iteration": 2.7013351917266846 + }, + { + "auxiliary_loss_clip": 0.01120287, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.04152822, + "balance_loss_mlp": 1.02239382, + "epoch": 0.47250864271757104, + "flos": 25485828936480.0, + "grad_norm": 1.7354446994162842, + "language_loss": 0.70450449, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.72605789, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12652588, + "step": 7859, + "time_per_iteration": 2.702787399291992 + }, + { + "auxiliary_loss_clip": 0.01127451, + "auxiliary_loss_mlp": 0.01032008, + "balance_loss_clip": 1.04342866, + "balance_loss_mlp": 1.01958001, + "epoch": 0.472568765970239, + "flos": 25041053223360.0, + "grad_norm": 1.758695561246486, + "language_loss": 0.62253821, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.64413285, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.12426758, + "step": 7860, + "time_per_iteration": 2.6448307037353516 + }, + { + "auxiliary_loss_clip": 0.01127047, + "auxiliary_loss_mlp": 0.01038023, + "balance_loss_clip": 1.04482675, + "balance_loss_mlp": 1.02491546, + "epoch": 0.47262888922290697, + "flos": 43691089495680.0, + "grad_norm": 2.057404332733523, + "language_loss": 0.71816885, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.73981959, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.13104248, + "step": 7861, + "time_per_iteration": 2.790194272994995 + }, + { + "auxiliary_loss_clip": 0.01125191, + "auxiliary_loss_mlp": 0.01031437, + "balance_loss_clip": 1.04462314, + "balance_loss_mlp": 1.01900971, + "epoch": 0.47268901247557493, + "flos": 25218817646400.0, + "grad_norm": 2.31030953030571, + "language_loss": 0.85433543, + "learning_rate": 2.273130107677896e-06, + "loss": 0.87590164, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.12426758, + "step": 7862, + "time_per_iteration": 2.62908935546875 + }, + { + "auxiliary_loss_clip": 0.01123858, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.04208517, + "balance_loss_mlp": 1.01713598, + "epoch": 0.4727491357282429, + "flos": 23883274988640.0, + "grad_norm": 1.9837527091247473, + "language_loss": 0.84516078, + "learning_rate": 2.272744289645927e-06, + "loss": 0.86669707, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.12634277, + "step": 7863, + "time_per_iteration": 2.640533447265625 + }, + { + "auxiliary_loss_clip": 0.01122226, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.04299593, + "balance_loss_mlp": 1.02014208, + "epoch": 0.47280925898091086, + "flos": 22229067859200.0, + "grad_norm": 2.116489632426831, + "language_loss": 0.66053474, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68208241, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.1239624, + "step": 7864, + "time_per_iteration": 2.670058012008667 + }, + { + "auxiliary_loss_clip": 0.01121076, + "auxiliary_loss_mlp": 0.01026253, + "balance_loss_clip": 1.04194033, + "balance_loss_mlp": 1.01381981, + "epoch": 0.4728693822335788, + "flos": 21746009666880.0, + "grad_norm": 1.8416612326790278, + "language_loss": 0.65355074, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67502403, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12426758, + "step": 7865, + "time_per_iteration": 2.6219420433044434 + }, + { + "auxiliary_loss_clip": 0.01121153, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.04288602, + "balance_loss_mlp": 1.01942217, + "epoch": 0.4729295054862468, + "flos": 25130097504000.0, + "grad_norm": 1.910516262877096, + "language_loss": 0.74065858, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.76218164, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.11737061, + "step": 7866, + "time_per_iteration": 2.637725830078125 + }, + { + "auxiliary_loss_clip": 0.01123757, + "auxiliary_loss_mlp": 0.01030387, + "balance_loss_clip": 1.04206574, + "balance_loss_mlp": 1.01797175, + "epoch": 0.47298962873891476, + "flos": 28513942237440.0, + "grad_norm": 1.9771936882518484, + "language_loss": 0.83119154, + "learning_rate": 2.271200914239451e-06, + "loss": 0.8527329, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.12408447, + "step": 7867, + "time_per_iteration": 3.9305930137634277 + }, + { + "auxiliary_loss_clip": 0.01119329, + "auxiliary_loss_mlp": 0.01028893, + "balance_loss_clip": 1.04143143, + "balance_loss_mlp": 1.01700187, + "epoch": 0.4730497519915827, + "flos": 26908471045440.0, + "grad_norm": 1.7603632864167804, + "language_loss": 0.79452187, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.81600404, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.11889648, + "step": 7868, + "time_per_iteration": 4.214419364929199 + }, + { + "auxiliary_loss_clip": 0.01123662, + "auxiliary_loss_mlp": 0.01033053, + "balance_loss_clip": 1.04193544, + "balance_loss_mlp": 1.02006519, + "epoch": 0.4731098752442507, + "flos": 25709290500960.0, + "grad_norm": 2.7171856751187735, + "language_loss": 0.74328679, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.76485395, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.12994385, + "step": 7869, + "time_per_iteration": 2.6701507568359375 + }, + { + "auxiliary_loss_clip": 0.01125894, + "auxiliary_loss_mlp": 0.01039288, + "balance_loss_clip": 1.04524851, + "balance_loss_mlp": 1.02539992, + "epoch": 0.4731699984969187, + "flos": 27489770940960.0, + "grad_norm": 1.7275849544366213, + "language_loss": 0.73494053, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.75659239, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13897705, + "step": 7870, + "time_per_iteration": 2.6808085441589355 + }, + { + "auxiliary_loss_clip": 0.01126509, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.04332376, + "balance_loss_mlp": 1.01848769, + "epoch": 0.4732301217495867, + "flos": 30381238507680.0, + "grad_norm": 2.055585038678582, + "language_loss": 0.81081796, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.83239949, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13171387, + "step": 7871, + "time_per_iteration": 2.696300983428955 + }, + { + "auxiliary_loss_clip": 0.01123468, + "auxiliary_loss_mlp": 0.01032932, + "balance_loss_clip": 1.04382098, + "balance_loss_mlp": 1.0204215, + "epoch": 0.47329024500225464, + "flos": 27801709544160.0, + "grad_norm": 1.608885528963982, + "language_loss": 0.75759518, + "learning_rate": 2.269271463701879e-06, + "loss": 0.77915919, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12512207, + "step": 7872, + "time_per_iteration": 2.6285953521728516 + }, + { + "auxiliary_loss_clip": 0.01121167, + "auxiliary_loss_mlp": 0.01028304, + "balance_loss_clip": 1.04144657, + "balance_loss_mlp": 1.01607895, + "epoch": 0.4733503682549226, + "flos": 47217779107200.0, + "grad_norm": 2.01290101270694, + "language_loss": 0.68079418, + "learning_rate": 2.268885542903428e-06, + "loss": 0.70228887, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12231445, + "step": 7873, + "time_per_iteration": 2.7979378700256348 + }, + { + "auxiliary_loss_clip": 0.01123229, + "auxiliary_loss_mlp": 0.01032346, + "balance_loss_clip": 1.04457855, + "balance_loss_mlp": 1.02028227, + "epoch": 0.47341049150759057, + "flos": 28024320245760.0, + "grad_norm": 1.517778831118609, + "language_loss": 0.72387028, + "learning_rate": 2.26849961190881e-06, + "loss": 0.74542606, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.1206665, + "step": 7874, + "time_per_iteration": 2.6351771354675293 + }, + { + "auxiliary_loss_clip": 0.01123859, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.04372573, + "balance_loss_mlp": 1.0182991, + "epoch": 0.47347061476025853, + "flos": 17739381693600.0, + "grad_norm": 3.113565211979843, + "language_loss": 0.64854401, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67008746, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12182617, + "step": 7875, + "time_per_iteration": 4.100549221038818 + }, + { + "auxiliary_loss_clip": 0.01125374, + "auxiliary_loss_mlp": 0.01032418, + "balance_loss_clip": 1.04525614, + "balance_loss_mlp": 1.01948965, + "epoch": 0.4735307380129265, + "flos": 36928018998720.0, + "grad_norm": 2.424873821514013, + "language_loss": 0.81737936, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.83895725, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.1293335, + "step": 7876, + "time_per_iteration": 2.7154459953308105 + }, + { + "auxiliary_loss_clip": 0.01122436, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.04206538, + "balance_loss_mlp": 1.02128005, + "epoch": 0.47359086126559446, + "flos": 23660866873440.0, + "grad_norm": 2.0296559574189565, + "language_loss": 0.79483068, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81638992, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12213135, + "step": 7877, + "time_per_iteration": 2.6223866939544678 + }, + { + "auxiliary_loss_clip": 0.01122729, + "auxiliary_loss_mlp": 0.01032081, + "balance_loss_clip": 1.0433588, + "balance_loss_mlp": 1.01988029, + "epoch": 0.47365098451826243, + "flos": 26771096000160.0, + "grad_norm": 4.804408179253264, + "language_loss": 0.70783782, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.72938591, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12213135, + "step": 7878, + "time_per_iteration": 2.628096342086792 + }, + { + "auxiliary_loss_clip": 0.01122062, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.04456329, + "balance_loss_mlp": 1.02106512, + "epoch": 0.4737111077709304, + "flos": 31536909843840.0, + "grad_norm": 1.5537740152834276, + "language_loss": 0.74958932, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.77113593, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.11523438, + "step": 7879, + "time_per_iteration": 2.673055648803711 + }, + { + "auxiliary_loss_clip": 0.01042016, + "auxiliary_loss_mlp": 0.01005808, + "balance_loss_clip": 1.01718783, + "balance_loss_mlp": 1.00460386, + "epoch": 0.47377123102359836, + "flos": 82683050240160.0, + "grad_norm": 0.7264527933462763, + "language_loss": 0.6129868, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63346505, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.24829102, + "router_z_loss_mlp": 0.01202393, + "step": 7880, + "time_per_iteration": 4.503820896148682 + }, + { + "auxiliary_loss_clip": 0.01121569, + "auxiliary_loss_mlp": 0.0103095, + "balance_loss_clip": 1.0439477, + "balance_loss_mlp": 1.01779532, + "epoch": 0.4738313542762663, + "flos": 29670707540160.0, + "grad_norm": 1.6838677969826603, + "language_loss": 0.67651153, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.69803673, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13165283, + "step": 7881, + "time_per_iteration": 2.719087839126587 + }, + { + "auxiliary_loss_clip": 0.01121699, + "auxiliary_loss_mlp": 0.01026063, + "balance_loss_clip": 1.04423475, + "balance_loss_mlp": 1.01441669, + "epoch": 0.4738914775289343, + "flos": 25264595822400.0, + "grad_norm": 1.7229668629863444, + "language_loss": 0.77233768, + "learning_rate": 2.265411798646092e-06, + "loss": 0.79381526, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.11645508, + "step": 7882, + "time_per_iteration": 2.6330771446228027 + }, + { + "auxiliary_loss_clip": 0.01122073, + "auxiliary_loss_mlp": 0.01028598, + "balance_loss_clip": 1.04283738, + "balance_loss_mlp": 1.01622391, + "epoch": 0.4739516007816023, + "flos": 31050124061760.0, + "grad_norm": 1.4836563846209414, + "language_loss": 0.7624467, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78395343, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12371826, + "step": 7883, + "time_per_iteration": 2.6841602325439453 + }, + { + "auxiliary_loss_clip": 0.01121197, + "auxiliary_loss_mlp": 0.01027897, + "balance_loss_clip": 1.0433681, + "balance_loss_mlp": 1.01611912, + "epoch": 0.4740117240342703, + "flos": 24372694393920.0, + "grad_norm": 1.7721637783470436, + "language_loss": 0.72042739, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.74191833, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.11773682, + "step": 7884, + "time_per_iteration": 2.640665292739868 + }, + { + "auxiliary_loss_clip": 0.01128075, + "auxiliary_loss_mlp": 0.01029219, + "balance_loss_clip": 1.04418826, + "balance_loss_mlp": 1.01603401, + "epoch": 0.47407184728693824, + "flos": 19110127517280.0, + "grad_norm": 2.386814414617737, + "language_loss": 0.82034397, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.84191692, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.13189697, + "step": 7885, + "time_per_iteration": 2.6831767559051514 + }, + { + "auxiliary_loss_clip": 0.01125628, + "auxiliary_loss_mlp": 0.01032431, + "balance_loss_clip": 1.04672492, + "balance_loss_mlp": 1.02014077, + "epoch": 0.4741319705396062, + "flos": 22681865993760.0, + "grad_norm": 1.7565025647194852, + "language_loss": 0.73484147, + "learning_rate": 2.263867649999751e-06, + "loss": 0.75642204, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.1229248, + "step": 7886, + "time_per_iteration": 2.638015031814575 + }, + { + "auxiliary_loss_clip": 0.01126685, + "auxiliary_loss_mlp": 0.01034062, + "balance_loss_clip": 1.0438993, + "balance_loss_mlp": 1.02100289, + "epoch": 0.47419209379227417, + "flos": 16180661092320.0, + "grad_norm": 2.2124715566196307, + "language_loss": 0.74213976, + "learning_rate": 2.263481587786849e-06, + "loss": 0.76374727, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.1305542, + "step": 7887, + "time_per_iteration": 2.6201694011688232 + }, + { + "auxiliary_loss_clip": 0.01118343, + "auxiliary_loss_mlp": 0.01024776, + "balance_loss_clip": 1.04239976, + "balance_loss_mlp": 1.01326668, + "epoch": 0.47425221704494214, + "flos": 24458051602080.0, + "grad_norm": 1.9004071969665457, + "language_loss": 0.76961386, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.79104507, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.1151123, + "step": 7888, + "time_per_iteration": 2.6371331214904785 + }, + { + "auxiliary_loss_clip": 0.0112233, + "auxiliary_loss_mlp": 0.01028472, + "balance_loss_clip": 1.04315305, + "balance_loss_mlp": 1.0167892, + "epoch": 0.4743123402976101, + "flos": 33277689699840.0, + "grad_norm": 2.27830918018119, + "language_loss": 0.72622079, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.74772882, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.11694336, + "step": 7889, + "time_per_iteration": 2.6788740158081055 + }, + { + "auxiliary_loss_clip": 0.01040806, + "auxiliary_loss_mlp": 0.01004058, + "balance_loss_clip": 1.0157938, + "balance_loss_mlp": 1.00284934, + "epoch": 0.47437246355027807, + "flos": 67591827432000.0, + "grad_norm": 0.7138586951568259, + "language_loss": 0.56067449, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58112311, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.25012207, + "router_z_loss_mlp": 0.01207733, + "step": 7890, + "time_per_iteration": 3.3060414791107178 + }, + { + "auxiliary_loss_clip": 0.01124807, + "auxiliary_loss_mlp": 0.01031381, + "balance_loss_clip": 1.0442915, + "balance_loss_mlp": 1.01785123, + "epoch": 0.47443258680294603, + "flos": 29136806511840.0, + "grad_norm": 1.905897509664378, + "language_loss": 0.65217376, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.67373562, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13543701, + "step": 7891, + "time_per_iteration": 2.640040636062622 + }, + { + "auxiliary_loss_clip": 0.01125538, + "auxiliary_loss_mlp": 0.01033504, + "balance_loss_clip": 1.04291999, + "balance_loss_mlp": 1.02058125, + "epoch": 0.474492710055614, + "flos": 26817725039040.0, + "grad_norm": 2.398069576632483, + "language_loss": 0.70176184, + "learning_rate": 2.26155112714642e-06, + "loss": 0.72335231, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.12927246, + "step": 7892, + "time_per_iteration": 2.6542041301727295 + }, + { + "auxiliary_loss_clip": 0.01039515, + "auxiliary_loss_mlp": 0.01002719, + "balance_loss_clip": 1.01452708, + "balance_loss_mlp": 1.00154603, + "epoch": 0.47455283330828196, + "flos": 76331821258080.0, + "grad_norm": 0.8193493304399491, + "language_loss": 0.58559644, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60601878, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.24975586, + "router_z_loss_mlp": 0.01172638, + "step": 7893, + "time_per_iteration": 3.3426005840301514 + }, + { + "auxiliary_loss_clip": 0.01121644, + "auxiliary_loss_mlp": 0.01031048, + "balance_loss_clip": 1.04358959, + "balance_loss_mlp": 1.01952648, + "epoch": 0.47461295656094993, + "flos": 14756033636640.0, + "grad_norm": 2.0509863436587237, + "language_loss": 0.77632236, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.7978493, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.11529541, + "step": 7894, + "time_per_iteration": 2.6407299041748047 + }, + { + "auxiliary_loss_clip": 0.01121926, + "auxiliary_loss_mlp": 0.01029215, + "balance_loss_clip": 1.04269874, + "balance_loss_mlp": 1.01709163, + "epoch": 0.4746730798136179, + "flos": 25483600486080.0, + "grad_norm": 2.3539289063776465, + "language_loss": 0.74678016, + "learning_rate": 2.260392731628497e-06, + "loss": 0.76829153, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12133789, + "step": 7895, + "time_per_iteration": 2.671341896057129 + }, + { + "auxiliary_loss_clip": 0.01119638, + "auxiliary_loss_mlp": 0.010262, + "balance_loss_clip": 1.04204965, + "balance_loss_mlp": 1.01361752, + "epoch": 0.4747332030662859, + "flos": 24373221118560.0, + "grad_norm": 1.8528025690606218, + "language_loss": 0.82246482, + "learning_rate": 2.260006580021429e-06, + "loss": 0.84392321, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12573242, + "step": 7896, + "time_per_iteration": 2.623629570007324 + }, + { + "auxiliary_loss_clip": 0.011217, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.04327631, + "balance_loss_mlp": 1.01687455, + "epoch": 0.4747933263189539, + "flos": 19564586860320.0, + "grad_norm": 3.349916166269636, + "language_loss": 0.75718391, + "learning_rate": 2.259620418554886e-06, + "loss": 0.77869833, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12872314, + "step": 7897, + "time_per_iteration": 2.6401472091674805 + }, + { + "auxiliary_loss_clip": 0.01125695, + "auxiliary_loss_mlp": 0.01034556, + "balance_loss_clip": 1.04274356, + "balance_loss_mlp": 1.02218819, + "epoch": 0.47485344957162184, + "flos": 17027473138560.0, + "grad_norm": 2.3316117331649626, + "language_loss": 0.63378471, + "learning_rate": 2.25923424724351e-06, + "loss": 0.65538722, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.12353516, + "step": 7898, + "time_per_iteration": 2.605109214782715 + }, + { + "auxiliary_loss_clip": 0.01122474, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.04372835, + "balance_loss_mlp": 1.01992249, + "epoch": 0.4749135728242898, + "flos": 24951968425440.0, + "grad_norm": 2.886304670165764, + "language_loss": 0.7004528, + "learning_rate": 2.258848066101946e-06, + "loss": 0.72200251, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12573242, + "step": 7899, + "time_per_iteration": 2.640288829803467 + }, + { + "auxiliary_loss_clip": 0.01123789, + "auxiliary_loss_mlp": 0.01034431, + "balance_loss_clip": 1.04279089, + "balance_loss_mlp": 1.02224779, + "epoch": 0.4749736960769578, + "flos": 35325424533600.0, + "grad_norm": 2.5336286448659067, + "language_loss": 0.68093872, + "learning_rate": 2.258461875144837e-06, + "loss": 0.70252097, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.12194824, + "step": 7900, + "time_per_iteration": 2.677938461303711 + }, + { + "auxiliary_loss_clip": 0.01120218, + "auxiliary_loss_mlp": 0.01031271, + "balance_loss_clip": 1.04148531, + "balance_loss_mlp": 1.01856279, + "epoch": 0.47503381932962574, + "flos": 38972877105600.0, + "grad_norm": 2.2892041558485596, + "language_loss": 0.70721209, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.72872692, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12701416, + "step": 7901, + "time_per_iteration": 2.7418224811553955 + }, + { + "auxiliary_loss_clip": 0.01122566, + "auxiliary_loss_mlp": 0.01038998, + "balance_loss_clip": 1.04363179, + "balance_loss_mlp": 1.02683234, + "epoch": 0.4750939425822937, + "flos": 26999824811040.0, + "grad_norm": 2.041996021555763, + "language_loss": 0.73815823, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.75977391, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.1217041, + "step": 7902, + "time_per_iteration": 2.6246225833892822 + }, + { + "auxiliary_loss_clip": 0.01117642, + "auxiliary_loss_mlp": 0.01031176, + "balance_loss_clip": 1.04238939, + "balance_loss_mlp": 1.0197978, + "epoch": 0.47515406583496167, + "flos": 25441347313440.0, + "grad_norm": 1.9152016636169866, + "language_loss": 0.68444443, + "learning_rate": 2.257303243526688e-06, + "loss": 0.70593262, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.1137085, + "step": 7903, + "time_per_iteration": 2.652552366256714 + }, + { + "auxiliary_loss_clip": 0.01117852, + "auxiliary_loss_mlp": 0.01029528, + "balance_loss_clip": 1.04208386, + "balance_loss_mlp": 1.01828635, + "epoch": 0.47521418908762963, + "flos": 21384038573280.0, + "grad_norm": 1.725363508506475, + "language_loss": 0.72157073, + "learning_rate": 2.256917013453848e-06, + "loss": 0.74304456, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11254883, + "step": 7904, + "time_per_iteration": 2.5916833877563477 + }, + { + "auxiliary_loss_clip": 0.01118758, + "auxiliary_loss_mlp": 0.01027137, + "balance_loss_clip": 1.04231262, + "balance_loss_mlp": 1.01621103, + "epoch": 0.4752743123402976, + "flos": 25086547778400.0, + "grad_norm": 1.5688398388513782, + "language_loss": 0.86178976, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88324875, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.10925293, + "step": 7905, + "time_per_iteration": 2.6769144535064697 + }, + { + "auxiliary_loss_clip": 0.01115826, + "auxiliary_loss_mlp": 0.01026217, + "balance_loss_clip": 1.04108405, + "balance_loss_mlp": 1.01484418, + "epoch": 0.47533443559296557, + "flos": 32165608606560.0, + "grad_norm": 1.534068578697544, + "language_loss": 0.81684625, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.83826667, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11383057, + "step": 7906, + "time_per_iteration": 2.669137716293335 + }, + { + "auxiliary_loss_clip": 0.01039384, + "auxiliary_loss_mlp": 0.01001595, + "balance_loss_clip": 1.01455832, + "balance_loss_mlp": 1.00043011, + "epoch": 0.47539455884563353, + "flos": 80471650996800.0, + "grad_norm": 0.6720533115805157, + "language_loss": 0.58955967, + "learning_rate": 2.255758264840002e-06, + "loss": 0.60996944, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.24829102, + "router_z_loss_mlp": 0.0116272, + "step": 7907, + "time_per_iteration": 6.266853332519531 + }, + { + "auxiliary_loss_clip": 0.01120154, + "auxiliary_loss_mlp": 0.01028804, + "balance_loss_clip": 1.04365849, + "balance_loss_mlp": 1.01754439, + "epoch": 0.4754546820983015, + "flos": 21033736456320.0, + "grad_norm": 6.340620563130002, + "language_loss": 0.81330699, + "learning_rate": 2.255371995885765e-06, + "loss": 0.83479655, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11248779, + "step": 7908, + "time_per_iteration": 2.594787120819092 + }, + { + "auxiliary_loss_clip": 0.0112201, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.04451728, + "balance_loss_mlp": 1.01960456, + "epoch": 0.47551480535096946, + "flos": 24191769623040.0, + "grad_norm": 1.730249685046405, + "language_loss": 0.74243498, + "learning_rate": 2.254985717247797e-06, + "loss": 0.76397157, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.1204834, + "step": 7909, + "time_per_iteration": 2.664093494415283 + }, + { + "auxiliary_loss_clip": 0.01117169, + "auxiliary_loss_mlp": 0.01028975, + "balance_loss_clip": 1.04068565, + "balance_loss_mlp": 1.017483, + "epoch": 0.4755749286036375, + "flos": 27044590055040.0, + "grad_norm": 1.704256107611575, + "language_loss": 0.75694108, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77840245, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.1149292, + "step": 7910, + "time_per_iteration": 2.620952844619751 + }, + { + "auxiliary_loss_clip": 0.0111739, + "auxiliary_loss_mlp": 0.01024138, + "balance_loss_clip": 1.04105532, + "balance_loss_mlp": 1.01380837, + "epoch": 0.47563505185630545, + "flos": 26415769740480.0, + "grad_norm": 3.141358041252122, + "language_loss": 0.7918514, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.81326663, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.10321045, + "step": 7911, + "time_per_iteration": 2.664407968521118 + }, + { + "auxiliary_loss_clip": 0.01121107, + "auxiliary_loss_mlp": 0.01028798, + "balance_loss_clip": 1.04077888, + "balance_loss_mlp": 1.01625156, + "epoch": 0.4756951751089734, + "flos": 25171499813760.0, + "grad_norm": 1.8761359417588557, + "language_loss": 0.7539953, + "learning_rate": 2.253826823377983e-06, + "loss": 0.7754944, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12554932, + "step": 7912, + "time_per_iteration": 2.615899085998535 + }, + { + "auxiliary_loss_clip": 0.01117645, + "auxiliary_loss_mlp": 0.0103434, + "balance_loss_clip": 1.04092622, + "balance_loss_mlp": 1.02322388, + "epoch": 0.4757552983616414, + "flos": 31541123640960.0, + "grad_norm": 1.685183212162627, + "language_loss": 0.74235421, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76387405, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.11120605, + "step": 7913, + "time_per_iteration": 2.7111799716949463 + }, + { + "auxiliary_loss_clip": 0.011214, + "auxiliary_loss_mlp": 0.01027523, + "balance_loss_clip": 1.04370522, + "balance_loss_mlp": 1.01519644, + "epoch": 0.47581542161430934, + "flos": 22235591141280.0, + "grad_norm": 2.1060422613364653, + "language_loss": 0.72074187, + "learning_rate": 2.253054179314666e-06, + "loss": 0.74223113, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12322998, + "step": 7914, + "time_per_iteration": 2.620011806488037 + }, + { + "auxiliary_loss_clip": 0.01122183, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.04401994, + "balance_loss_mlp": 1.02390563, + "epoch": 0.4758755448669773, + "flos": 26331749602560.0, + "grad_norm": 2.6192273592729065, + "language_loss": 0.64760906, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.66917908, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.10919189, + "step": 7915, + "time_per_iteration": 4.181095123291016 + }, + { + "auxiliary_loss_clip": 0.01116602, + "auxiliary_loss_mlp": 0.01029194, + "balance_loss_clip": 1.04185021, + "balance_loss_mlp": 1.01771414, + "epoch": 0.47593566811964527, + "flos": 18584694600480.0, + "grad_norm": 1.984665776143751, + "language_loss": 0.76697624, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.78843415, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11480713, + "step": 7916, + "time_per_iteration": 2.673353910446167 + }, + { + "auxiliary_loss_clip": 0.0111771, + "auxiliary_loss_mlp": 0.01031586, + "balance_loss_clip": 1.04092515, + "balance_loss_mlp": 1.02039242, + "epoch": 0.47599579137231324, + "flos": 26287186944960.0, + "grad_norm": 2.1706213230491427, + "language_loss": 0.64032817, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66182113, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11193848, + "step": 7917, + "time_per_iteration": 2.639533758163452 + }, + { + "auxiliary_loss_clip": 0.01040483, + "auxiliary_loss_mlp": 0.01006312, + "balance_loss_clip": 1.01580977, + "balance_loss_mlp": 1.00519967, + "epoch": 0.4760559146249812, + "flos": 78770085517440.0, + "grad_norm": 0.8504521620657108, + "language_loss": 0.65893674, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67940468, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.24682617, + "router_z_loss_mlp": 0.01113129, + "step": 7918, + "time_per_iteration": 3.3078622817993164 + }, + { + "auxiliary_loss_clip": 0.01119233, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.04128194, + "balance_loss_mlp": 1.01948678, + "epoch": 0.47611603787764917, + "flos": 27130190366880.0, + "grad_norm": 1.6702250586387795, + "language_loss": 0.6861819, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.70768142, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.11236572, + "step": 7919, + "time_per_iteration": 2.625734806060791 + }, + { + "auxiliary_loss_clip": 0.01119171, + "auxiliary_loss_mlp": 0.01030514, + "balance_loss_clip": 1.04028189, + "balance_loss_mlp": 1.01916575, + "epoch": 0.47617616113031713, + "flos": 27796725918720.0, + "grad_norm": 1.673276625582038, + "language_loss": 0.74711955, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.76861638, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.11352539, + "step": 7920, + "time_per_iteration": 3.9708831310272217 + }, + { + "auxiliary_loss_clip": 0.01123516, + "auxiliary_loss_mlp": 0.01033126, + "balance_loss_clip": 1.04309976, + "balance_loss_mlp": 1.02101994, + "epoch": 0.4762362843829851, + "flos": 29448380459520.0, + "grad_norm": 1.559305337164971, + "language_loss": 0.77587134, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.79743773, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12097168, + "step": 7921, + "time_per_iteration": 2.6873745918273926 + }, + { + "auxiliary_loss_clip": 0.01120834, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.04130745, + "balance_loss_mlp": 1.02424812, + "epoch": 0.47629640763565306, + "flos": 27396512863200.0, + "grad_norm": 1.757246214350701, + "language_loss": 0.7814219, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80299735, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12457275, + "step": 7922, + "time_per_iteration": 2.652517080307007 + }, + { + "auxiliary_loss_clip": 0.01121566, + "auxiliary_loss_mlp": 0.01038496, + "balance_loss_clip": 1.04086113, + "balance_loss_mlp": 1.02602088, + "epoch": 0.4763565308883211, + "flos": 13642858576800.0, + "grad_norm": 1.8934228771363009, + "language_loss": 0.72624266, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.74784327, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12481689, + "step": 7923, + "time_per_iteration": 2.6455953121185303 + }, + { + "auxiliary_loss_clip": 0.01120503, + "auxiliary_loss_mlp": 0.01037472, + "balance_loss_clip": 1.04203701, + "balance_loss_mlp": 1.02584267, + "epoch": 0.47641665414098905, + "flos": 27310912551360.0, + "grad_norm": 2.3686034056188707, + "language_loss": 0.82339203, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.84497184, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.11621094, + "step": 7924, + "time_per_iteration": 2.6216378211975098 + }, + { + "auxiliary_loss_clip": 0.0112848, + "auxiliary_loss_mlp": 0.01037297, + "balance_loss_clip": 1.04503047, + "balance_loss_mlp": 1.02440488, + "epoch": 0.476476777393657, + "flos": 30561677071200.0, + "grad_norm": 1.7403890566135798, + "language_loss": 0.80243254, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82409036, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.12884521, + "step": 7925, + "time_per_iteration": 2.6975016593933105 + }, + { + "auxiliary_loss_clip": 0.0111891, + "auxiliary_loss_mlp": 0.01035512, + "balance_loss_clip": 1.03990555, + "balance_loss_mlp": 1.02407384, + "epoch": 0.476536900646325, + "flos": 33275461249440.0, + "grad_norm": 2.5669813030865627, + "language_loss": 0.72114658, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.7426908, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.11431885, + "step": 7926, + "time_per_iteration": 2.6753861904144287 + }, + { + "auxiliary_loss_clip": 0.01123275, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.04298902, + "balance_loss_mlp": 1.0205822, + "epoch": 0.47659702389899294, + "flos": 30873169984320.0, + "grad_norm": 2.9339700934477286, + "language_loss": 0.69202638, + "learning_rate": 2.248031062546432e-06, + "loss": 0.71358699, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.12194824, + "step": 7927, + "time_per_iteration": 2.7287437915802 + }, + { + "auxiliary_loss_clip": 0.01119094, + "auxiliary_loss_mlp": 0.0102879, + "balance_loss_clip": 1.04250264, + "balance_loss_mlp": 1.01807904, + "epoch": 0.4766571471516609, + "flos": 31717105303680.0, + "grad_norm": 1.649467102998199, + "language_loss": 0.677468, + "learning_rate": 2.247644602701045e-06, + "loss": 0.69894689, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.10705566, + "step": 7928, + "time_per_iteration": 2.7172091007232666 + }, + { + "auxiliary_loss_clip": 0.01121005, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.04250777, + "balance_loss_mlp": 1.02118158, + "epoch": 0.4767172704043289, + "flos": 19562358409920.0, + "grad_norm": 2.1330618714554697, + "language_loss": 0.78591478, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.80745256, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.1159668, + "step": 7929, + "time_per_iteration": 2.653120517730713 + }, + { + "auxiliary_loss_clip": 0.01120094, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.04358149, + "balance_loss_mlp": 1.02375698, + "epoch": 0.47677739365699684, + "flos": 47876778444960.0, + "grad_norm": 2.030087005241025, + "language_loss": 0.66662967, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.68817896, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11083984, + "step": 7930, + "time_per_iteration": 2.8205716609954834 + }, + { + "auxiliary_loss_clip": 0.01117832, + "auxiliary_loss_mlp": 0.01028736, + "balance_loss_clip": 1.04017997, + "balance_loss_mlp": 1.01779819, + "epoch": 0.4768375169096648, + "flos": 30160977808320.0, + "grad_norm": 2.5643348195726072, + "language_loss": 0.7987836, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.8202492, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.10943604, + "step": 7931, + "time_per_iteration": 2.696847438812256 + }, + { + "auxiliary_loss_clip": 0.01121659, + "auxiliary_loss_mlp": 0.01031232, + "balance_loss_clip": 1.04233527, + "balance_loss_mlp": 1.01892984, + "epoch": 0.47689764016233277, + "flos": 27489649389120.0, + "grad_norm": 1.8993982116475472, + "language_loss": 0.76448423, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78601313, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12298584, + "step": 7932, + "time_per_iteration": 2.6791529655456543 + }, + { + "auxiliary_loss_clip": 0.01117021, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.0413835, + "balance_loss_mlp": 1.01637578, + "epoch": 0.47695776341500074, + "flos": 18449912661120.0, + "grad_norm": 2.7314192339769483, + "language_loss": 0.80157, + "learning_rate": 2.245712162906593e-06, + "loss": 0.82302225, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.1182251, + "step": 7933, + "time_per_iteration": 2.5924389362335205 + }, + { + "auxiliary_loss_clip": 0.01125978, + "auxiliary_loss_mlp": 0.01034277, + "balance_loss_clip": 1.04389358, + "balance_loss_mlp": 1.02140832, + "epoch": 0.4770178866676687, + "flos": 17911271111040.0, + "grad_norm": 2.199361618812397, + "language_loss": 0.73962575, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.76122832, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.12872314, + "step": 7934, + "time_per_iteration": 2.6295528411865234 + }, + { + "auxiliary_loss_clip": 0.01122058, + "auxiliary_loss_mlp": 0.01031807, + "balance_loss_clip": 1.04156053, + "balance_loss_mlp": 1.01991606, + "epoch": 0.47707800992033667, + "flos": 27534657736800.0, + "grad_norm": 2.036532120830347, + "language_loss": 0.79899049, + "learning_rate": 2.244939121664211e-06, + "loss": 0.8205291, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.11901855, + "step": 7935, + "time_per_iteration": 2.6423940658569336 + }, + { + "auxiliary_loss_clip": 0.0112821, + "auxiliary_loss_mlp": 0.0103569, + "balance_loss_clip": 1.04401994, + "balance_loss_mlp": 1.02319109, + "epoch": 0.4771381331730047, + "flos": 37727432177760.0, + "grad_norm": 2.133164161395777, + "language_loss": 0.7129271, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.73456609, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.125, + "step": 7936, + "time_per_iteration": 2.763057231903076 + }, + { + "auxiliary_loss_clip": 0.01123488, + "auxiliary_loss_mlp": 0.01030916, + "balance_loss_clip": 1.04217625, + "balance_loss_mlp": 1.01895976, + "epoch": 0.47719825642567265, + "flos": 31407071012640.0, + "grad_norm": 2.176171003992405, + "language_loss": 0.67585534, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.69739938, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.11962891, + "step": 7937, + "time_per_iteration": 2.657741069793701 + }, + { + "auxiliary_loss_clip": 0.01043732, + "auxiliary_loss_mlp": 0.0099903, + "balance_loss_clip": 1.01892698, + "balance_loss_mlp": 0.99787247, + "epoch": 0.4772583796783406, + "flos": 80966054027520.0, + "grad_norm": 0.7437143475932331, + "language_loss": 0.56383502, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58426273, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.2479248, + "router_z_loss_mlp": 0.01155853, + "step": 7938, + "time_per_iteration": 3.4296865463256836 + }, + { + "auxiliary_loss_clip": 0.01123486, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.04449725, + "balance_loss_mlp": 1.01836276, + "epoch": 0.4773185029310086, + "flos": 26907174492480.0, + "grad_norm": 1.812021195970738, + "language_loss": 0.88996565, + "learning_rate": 2.243392927839317e-06, + "loss": 0.91150832, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.12420654, + "step": 7939, + "time_per_iteration": 2.690453052520752 + }, + { + "auxiliary_loss_clip": 0.01120657, + "auxiliary_loss_mlp": 0.01029012, + "balance_loss_clip": 1.04063201, + "balance_loss_mlp": 1.01771641, + "epoch": 0.47737862618367655, + "flos": 20410953216480.0, + "grad_norm": 2.1306255745821354, + "language_loss": 0.7690562, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.79055285, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.11297607, + "step": 7940, + "time_per_iteration": 2.614013910293579 + }, + { + "auxiliary_loss_clip": 0.01120004, + "auxiliary_loss_mlp": 0.01032859, + "balance_loss_clip": 1.04410481, + "balance_loss_mlp": 1.02212453, + "epoch": 0.4774387494363445, + "flos": 23927432473440.0, + "grad_norm": 1.723320685837692, + "language_loss": 0.85085624, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.87238491, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.1072998, + "step": 7941, + "time_per_iteration": 2.6581315994262695 + }, + { + "auxiliary_loss_clip": 0.01124124, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.04321408, + "balance_loss_mlp": 1.01933002, + "epoch": 0.4774988726890125, + "flos": 20320936521120.0, + "grad_norm": 2.0915943986716194, + "language_loss": 0.75989175, + "learning_rate": 2.24223318550976e-06, + "loss": 0.78145027, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.12402344, + "step": 7942, + "time_per_iteration": 2.599820137023926 + }, + { + "auxiliary_loss_clip": 0.01124086, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.04424775, + "balance_loss_mlp": 1.02078557, + "epoch": 0.47755899594168044, + "flos": 24996490565760.0, + "grad_norm": 1.8329400945945655, + "language_loss": 0.64446455, + "learning_rate": 2.241846586342682e-06, + "loss": 0.66602802, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.11474609, + "step": 7943, + "time_per_iteration": 2.647613525390625 + }, + { + "auxiliary_loss_clip": 0.01123595, + "auxiliary_loss_mlp": 0.01032693, + "balance_loss_clip": 1.04192472, + "balance_loss_mlp": 1.02002096, + "epoch": 0.4776191191943484, + "flos": 26420874917760.0, + "grad_norm": 1.6229302163514188, + "language_loss": 0.7358613, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.75742418, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.12683105, + "step": 7944, + "time_per_iteration": 2.6123104095458984 + }, + { + "auxiliary_loss_clip": 0.01123725, + "auxiliary_loss_mlp": 0.01035435, + "balance_loss_clip": 1.04408193, + "balance_loss_mlp": 1.02269781, + "epoch": 0.4776792424470164, + "flos": 22903342211520.0, + "grad_norm": 2.1132880475762668, + "language_loss": 0.68533832, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.70692992, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12744141, + "step": 7945, + "time_per_iteration": 2.6431570053100586 + }, + { + "auxiliary_loss_clip": 0.01118312, + "auxiliary_loss_mlp": 0.01031197, + "balance_loss_clip": 1.03920507, + "balance_loss_mlp": 1.01956224, + "epoch": 0.47773936569968434, + "flos": 36260024824800.0, + "grad_norm": 2.0506518155235347, + "language_loss": 0.75404966, + "learning_rate": 2.240686733875009e-06, + "loss": 0.77554482, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.11633301, + "step": 7946, + "time_per_iteration": 3.992309093475342 + }, + { + "auxiliary_loss_clip": 0.0112635, + "auxiliary_loss_mlp": 0.01038174, + "balance_loss_clip": 1.04514503, + "balance_loss_mlp": 1.02559733, + "epoch": 0.4777994889523523, + "flos": 30250913469120.0, + "grad_norm": 1.7729683722112857, + "language_loss": 0.79725373, + "learning_rate": 2.240300098112506e-06, + "loss": 0.81889898, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.12561035, + "step": 7947, + "time_per_iteration": 4.14374041557312 + }, + { + "auxiliary_loss_clip": 0.011177, + "auxiliary_loss_mlp": 0.01030798, + "balance_loss_clip": 1.0409621, + "balance_loss_mlp": 1.01894867, + "epoch": 0.47785961220502027, + "flos": 21790369738080.0, + "grad_norm": 1.9613721648953166, + "language_loss": 0.73735011, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.75883508, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11846924, + "step": 7948, + "time_per_iteration": 2.638075113296509 + }, + { + "auxiliary_loss_clip": 0.01123386, + "auxiliary_loss_mlp": 0.01029809, + "balance_loss_clip": 1.04362214, + "balance_loss_mlp": 1.01716697, + "epoch": 0.4779197354576883, + "flos": 24729236172000.0, + "grad_norm": 1.8822713676026228, + "language_loss": 0.77824003, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.79977202, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12646484, + "step": 7949, + "time_per_iteration": 2.631016731262207 + }, + { + "auxiliary_loss_clip": 0.01117547, + "auxiliary_loss_mlp": 0.01027967, + "balance_loss_clip": 1.04034197, + "balance_loss_mlp": 1.01679158, + "epoch": 0.47797985871035625, + "flos": 20812138686720.0, + "grad_norm": 4.228305472284634, + "language_loss": 0.74226135, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.76371652, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11169434, + "step": 7950, + "time_per_iteration": 2.677049398422241 + }, + { + "auxiliary_loss_clip": 0.01121971, + "auxiliary_loss_mlp": 0.01038086, + "balance_loss_clip": 1.04323673, + "balance_loss_mlp": 1.02503252, + "epoch": 0.4780399819630242, + "flos": 38264372002080.0, + "grad_norm": 2.0937154091133108, + "language_loss": 0.74276453, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.76436508, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.13049316, + "step": 7951, + "time_per_iteration": 2.743997573852539 + }, + { + "auxiliary_loss_clip": 0.01122982, + "auxiliary_loss_mlp": 0.01029462, + "balance_loss_clip": 1.04134512, + "balance_loss_mlp": 1.0168196, + "epoch": 0.4781001052156922, + "flos": 30382899716160.0, + "grad_norm": 2.8055020797563954, + "language_loss": 0.79905689, + "learning_rate": 2.238366782910174e-06, + "loss": 0.82058132, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.12646484, + "step": 7952, + "time_per_iteration": 2.7070868015289307 + }, + { + "auxiliary_loss_clip": 0.01124118, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.04330742, + "balance_loss_mlp": 1.02125382, + "epoch": 0.47816022846836015, + "flos": 22814824655520.0, + "grad_norm": 1.7284036138959384, + "language_loss": 0.7828486, + "learning_rate": 2.23798009269438e-06, + "loss": 0.80442584, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12359619, + "step": 7953, + "time_per_iteration": 2.665070056915283 + }, + { + "auxiliary_loss_clip": 0.01124755, + "auxiliary_loss_mlp": 0.01027971, + "balance_loss_clip": 1.04264796, + "balance_loss_mlp": 1.01569283, + "epoch": 0.4782203517210281, + "flos": 14615012036160.0, + "grad_norm": 3.252460663255542, + "language_loss": 0.84302646, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86455375, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.1227417, + "step": 7954, + "time_per_iteration": 4.050141334533691 + }, + { + "auxiliary_loss_clip": 0.01121892, + "auxiliary_loss_mlp": 0.01033331, + "balance_loss_clip": 1.04262996, + "balance_loss_mlp": 1.02116525, + "epoch": 0.4782804749736961, + "flos": 25396703621280.0, + "grad_norm": 1.7043789900329747, + "language_loss": 0.70518506, + "learning_rate": 2.237206685204768e-06, + "loss": 0.72673726, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12164307, + "step": 7955, + "time_per_iteration": 2.659318208694458 + }, + { + "auxiliary_loss_clip": 0.01123419, + "auxiliary_loss_mlp": 0.01035252, + "balance_loss_clip": 1.04312634, + "balance_loss_mlp": 1.02311063, + "epoch": 0.47834059822636404, + "flos": 29090380059360.0, + "grad_norm": 1.6978583657844144, + "language_loss": 0.81598991, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.83757657, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12158203, + "step": 7956, + "time_per_iteration": 2.6382076740264893 + }, + { + "auxiliary_loss_clip": 0.01122216, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.04322839, + "balance_loss_mlp": 1.01954913, + "epoch": 0.478400721479032, + "flos": 27618313219200.0, + "grad_norm": 2.274090311578076, + "language_loss": 0.84904581, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.8705914, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12786865, + "step": 7957, + "time_per_iteration": 2.6435203552246094 + }, + { + "auxiliary_loss_clip": 0.01121346, + "auxiliary_loss_mlp": 0.01030505, + "balance_loss_clip": 1.04295886, + "balance_loss_mlp": 1.0187335, + "epoch": 0.4784608447317, + "flos": 23616790423200.0, + "grad_norm": 1.7479529654676604, + "language_loss": 0.79441357, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.81593215, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.11779785, + "step": 7958, + "time_per_iteration": 2.685795545578003 + }, + { + "auxiliary_loss_clip": 0.01121898, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.04197967, + "balance_loss_mlp": 1.01800442, + "epoch": 0.47852096798436794, + "flos": 29311167483360.0, + "grad_norm": 1.7765744353566415, + "language_loss": 0.82943773, + "learning_rate": 2.235659762404047e-06, + "loss": 0.85096908, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.13232422, + "step": 7959, + "time_per_iteration": 4.045424938201904 + }, + { + "auxiliary_loss_clip": 0.01119569, + "auxiliary_loss_mlp": 0.01028206, + "balance_loss_clip": 1.04339051, + "balance_loss_mlp": 1.01705956, + "epoch": 0.4785810912370359, + "flos": 31318593973920.0, + "grad_norm": 2.274882083989553, + "language_loss": 0.72877812, + "learning_rate": 2.235273009326599e-06, + "loss": 0.75025594, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11138916, + "step": 7960, + "time_per_iteration": 2.710169792175293 + }, + { + "auxiliary_loss_clip": 0.0112063, + "auxiliary_loss_mlp": 0.01033061, + "balance_loss_clip": 1.04237056, + "balance_loss_mlp": 1.02152109, + "epoch": 0.47864121448970387, + "flos": 26151756729120.0, + "grad_norm": 1.8058838184380903, + "language_loss": 0.7650739, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.78661084, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.11547852, + "step": 7961, + "time_per_iteration": 2.678640365600586 + }, + { + "auxiliary_loss_clip": 0.01119481, + "auxiliary_loss_mlp": 0.01027979, + "balance_loss_clip": 1.04147553, + "balance_loss_mlp": 1.01589704, + "epoch": 0.47870133774237184, + "flos": 19698436902240.0, + "grad_norm": 1.99786477449954, + "language_loss": 0.77789986, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.79937446, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12072754, + "step": 7962, + "time_per_iteration": 2.7145566940307617 + }, + { + "auxiliary_loss_clip": 0.01124721, + "auxiliary_loss_mlp": 0.0103118, + "balance_loss_clip": 1.04480243, + "balance_loss_mlp": 1.01942039, + "epoch": 0.47876146099503986, + "flos": 32832184675680.0, + "grad_norm": 1.8228109728847852, + "language_loss": 0.65000856, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.67156756, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.11761475, + "step": 7963, + "time_per_iteration": 2.65396785736084 + }, + { + "auxiliary_loss_clip": 0.01122257, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.04242194, + "balance_loss_mlp": 1.01693225, + "epoch": 0.4788215842477078, + "flos": 55316149158240.0, + "grad_norm": 2.786452323971852, + "language_loss": 0.77592033, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.79743087, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.11859131, + "step": 7964, + "time_per_iteration": 2.8250231742858887 + }, + { + "auxiliary_loss_clip": 0.01124447, + "auxiliary_loss_mlp": 0.01030135, + "balance_loss_clip": 1.04257417, + "balance_loss_mlp": 1.0163666, + "epoch": 0.4788817075003758, + "flos": 27133917956640.0, + "grad_norm": 1.8358865992692117, + "language_loss": 0.76599652, + "learning_rate": 2.233339110409044e-06, + "loss": 0.78754234, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.13763428, + "step": 7965, + "time_per_iteration": 2.65539813041687 + }, + { + "auxiliary_loss_clip": 0.01121211, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.04205585, + "balance_loss_mlp": 1.02067554, + "epoch": 0.47894183075304375, + "flos": 20097920646720.0, + "grad_norm": 2.030670139962612, + "language_loss": 0.74837154, + "learning_rate": 2.232952304022137e-06, + "loss": 0.76990855, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.1182251, + "step": 7966, + "time_per_iteration": 2.601979970932007 + }, + { + "auxiliary_loss_clip": 0.01120671, + "auxiliary_loss_mlp": 0.01029502, + "balance_loss_clip": 1.04092431, + "balance_loss_mlp": 1.01668108, + "epoch": 0.4790019540057117, + "flos": 29629791437760.0, + "grad_norm": 1.7485449821567831, + "language_loss": 0.73256087, + "learning_rate": 2.232565488801655e-06, + "loss": 0.75406265, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12817383, + "step": 7967, + "time_per_iteration": 2.6636059284210205 + }, + { + "auxiliary_loss_clip": 0.0111648, + "auxiliary_loss_mlp": 0.01029027, + "balance_loss_clip": 1.04092503, + "balance_loss_mlp": 1.01708198, + "epoch": 0.4790620772583797, + "flos": 31318634491200.0, + "grad_norm": 2.343003855068783, + "language_loss": 0.78752589, + "learning_rate": 2.232178664762267e-06, + "loss": 0.80898094, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.1194458, + "step": 7968, + "time_per_iteration": 2.660170316696167 + }, + { + "auxiliary_loss_clip": 0.01040695, + "auxiliary_loss_mlp": 0.01001496, + "balance_loss_clip": 1.01591504, + "balance_loss_mlp": 1.00025129, + "epoch": 0.47912220051104765, + "flos": 84715151754240.0, + "grad_norm": 0.7847828204441131, + "language_loss": 0.62210202, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64252394, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.2479248, + "router_z_loss_mlp": 0.01244354, + "step": 7969, + "time_per_iteration": 3.4340884685516357 + }, + { + "auxiliary_loss_clip": 0.0111914, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.0426532, + "balance_loss_mlp": 1.01863503, + "epoch": 0.4791823237637156, + "flos": 29491362943200.0, + "grad_norm": 1.5679323781685564, + "language_loss": 0.77231956, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.79381704, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11962891, + "step": 7970, + "time_per_iteration": 2.651501178741455 + }, + { + "auxiliary_loss_clip": 0.01118428, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.03871965, + "balance_loss_mlp": 1.01926255, + "epoch": 0.4792424470163836, + "flos": 30202339600800.0, + "grad_norm": 2.6519064716083407, + "language_loss": 0.70456427, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72606361, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12237549, + "step": 7971, + "time_per_iteration": 2.7041213512420654 + }, + { + "auxiliary_loss_clip": 0.01119749, + "auxiliary_loss_mlp": 0.01026539, + "balance_loss_clip": 1.04035068, + "balance_loss_mlp": 1.01354492, + "epoch": 0.47930257026905154, + "flos": 28379565470880.0, + "grad_norm": 1.3860125350602446, + "language_loss": 0.80046743, + "learning_rate": 2.230631280709021e-06, + "loss": 0.82193029, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12982178, + "step": 7972, + "time_per_iteration": 2.6632649898529053 + }, + { + "auxiliary_loss_clip": 0.01122669, + "auxiliary_loss_mlp": 0.0102847, + "balance_loss_clip": 1.04241943, + "balance_loss_mlp": 1.01591718, + "epoch": 0.4793626935217195, + "flos": 17160512834880.0, + "grad_norm": 3.7737591962516124, + "language_loss": 0.6977222, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.71923357, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.12536621, + "step": 7973, + "time_per_iteration": 2.647465467453003 + }, + { + "auxiliary_loss_clip": 0.011209, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.04438758, + "balance_loss_mlp": 1.02251577, + "epoch": 0.4794228167743875, + "flos": 26599085030880.0, + "grad_norm": 1.8398834504127577, + "language_loss": 0.78756845, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.80911607, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11340332, + "step": 7974, + "time_per_iteration": 2.656338930130005 + }, + { + "auxiliary_loss_clip": 0.01040778, + "auxiliary_loss_mlp": 0.01003511, + "balance_loss_clip": 1.01595962, + "balance_loss_mlp": 1.00222254, + "epoch": 0.47948294002705544, + "flos": 81715515750720.0, + "grad_norm": 0.8938823407310493, + "language_loss": 0.53988993, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56033283, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.24829102, + "router_z_loss_mlp": 0.01288605, + "step": 7975, + "time_per_iteration": 3.3417625427246094 + }, + { + "auxiliary_loss_clip": 0.011267, + "auxiliary_loss_mlp": 0.01034706, + "balance_loss_clip": 1.04240966, + "balance_loss_mlp": 1.02124691, + "epoch": 0.47954306327972346, + "flos": 15157259624160.0, + "grad_norm": 2.0496891906667454, + "language_loss": 0.90322936, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.92484337, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 0.84179688, + "router_z_loss_mlp": 0.13464355, + "step": 7976, + "time_per_iteration": 2.676413059234619 + }, + { + "auxiliary_loss_clip": 0.01124926, + "auxiliary_loss_mlp": 0.01034331, + "balance_loss_clip": 1.04274285, + "balance_loss_mlp": 1.0211525, + "epoch": 0.4796031865323914, + "flos": 22405778832960.0, + "grad_norm": 2.7316134386571367, + "language_loss": 0.73526317, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.75685573, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.13165283, + "step": 7977, + "time_per_iteration": 2.59626841545105 + }, + { + "auxiliary_loss_clip": 0.01117286, + "auxiliary_loss_mlp": 0.0103001, + "balance_loss_clip": 1.04000866, + "balance_loss_mlp": 1.01833963, + "epoch": 0.4796633097850594, + "flos": 26643323550240.0, + "grad_norm": 1.7979979098616756, + "language_loss": 0.78484952, + "learning_rate": 2.228309942555734e-06, + "loss": 0.80632252, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.11676025, + "step": 7978, + "time_per_iteration": 2.689225435256958 + }, + { + "auxiliary_loss_clip": 0.01120515, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.04078293, + "balance_loss_mlp": 1.02137208, + "epoch": 0.47972343303772735, + "flos": 28598813238240.0, + "grad_norm": 1.7076344640000523, + "language_loss": 0.89171678, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91325974, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12414551, + "step": 7979, + "time_per_iteration": 2.6382896900177 + }, + { + "auxiliary_loss_clip": 0.01123818, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.04358339, + "balance_loss_mlp": 1.02005827, + "epoch": 0.4797835562903953, + "flos": 30383102302560.0, + "grad_norm": 1.4965403342165762, + "language_loss": 0.76942039, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79099447, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13531494, + "step": 7980, + "time_per_iteration": 2.7559738159179688 + }, + { + "auxiliary_loss_clip": 0.01124784, + "auxiliary_loss_mlp": 0.01036429, + "balance_loss_clip": 1.04142094, + "balance_loss_mlp": 1.02277935, + "epoch": 0.4798436795430633, + "flos": 42760621967040.0, + "grad_norm": 2.091935899928591, + "language_loss": 0.71461201, + "learning_rate": 2.227149156404295e-06, + "loss": 0.73622406, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13665771, + "step": 7981, + "time_per_iteration": 2.733092784881592 + }, + { + "auxiliary_loss_clip": 0.01118479, + "auxiliary_loss_mlp": 0.01029584, + "balance_loss_clip": 1.04200482, + "balance_loss_mlp": 1.01787174, + "epoch": 0.47990380279573125, + "flos": 25124627671200.0, + "grad_norm": 2.1863913069428835, + "language_loss": 0.70066905, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.72214961, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11712646, + "step": 7982, + "time_per_iteration": 2.66342830657959 + }, + { + "auxiliary_loss_clip": 0.01115011, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.04089487, + "balance_loss_mlp": 1.01816249, + "epoch": 0.4799639260483992, + "flos": 32164231019040.0, + "grad_norm": 1.8182519751663644, + "language_loss": 0.71029752, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73173821, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.10894775, + "step": 7983, + "time_per_iteration": 2.714547634124756 + }, + { + "auxiliary_loss_clip": 0.01042143, + "auxiliary_loss_mlp": 0.0100627, + "balance_loss_clip": 1.01696825, + "balance_loss_mlp": 1.00498724, + "epoch": 0.4800240493010672, + "flos": 86609385665280.0, + "grad_norm": 0.8151020658510145, + "language_loss": 0.59409475, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.61457884, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.25170898, + "router_z_loss_mlp": 0.01283264, + "step": 7984, + "time_per_iteration": 3.3191630840301514 + }, + { + "auxiliary_loss_clip": 0.01119035, + "auxiliary_loss_mlp": 0.01035189, + "balance_loss_clip": 1.04107237, + "balance_loss_mlp": 1.02283239, + "epoch": 0.48008417255373514, + "flos": 20849408233920.0, + "grad_norm": 1.590736217624913, + "language_loss": 0.66695952, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.68850178, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12353516, + "step": 7985, + "time_per_iteration": 2.641209602355957 + }, + { + "auxiliary_loss_clip": 0.01121282, + "auxiliary_loss_mlp": 0.01033811, + "balance_loss_clip": 1.0401175, + "balance_loss_mlp": 1.02131784, + "epoch": 0.4801442958064031, + "flos": 18807143232960.0, + "grad_norm": 2.125662674514755, + "language_loss": 0.69878608, + "learning_rate": 2.225214340743835e-06, + "loss": 0.72033697, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.12506104, + "step": 7986, + "time_per_iteration": 4.025593996047974 + }, + { + "auxiliary_loss_clip": 0.01123847, + "auxiliary_loss_mlp": 0.01040279, + "balance_loss_clip": 1.04051769, + "balance_loss_mlp": 1.02745819, + "epoch": 0.4802044190590711, + "flos": 13999562424000.0, + "grad_norm": 2.136028533428205, + "language_loss": 0.79117179, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.8128131, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.12811279, + "step": 7987, + "time_per_iteration": 4.035984754562378 + }, + { + "auxiliary_loss_clip": 0.01120183, + "auxiliary_loss_mlp": 0.01037642, + "balance_loss_clip": 1.04083955, + "balance_loss_mlp": 1.02569747, + "epoch": 0.48026454231173904, + "flos": 25565270621760.0, + "grad_norm": 2.4143200113678005, + "language_loss": 0.74841654, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.76999485, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.11950684, + "step": 7988, + "time_per_iteration": 2.6368796825408936 + }, + { + "auxiliary_loss_clip": 0.01123752, + "auxiliary_loss_mlp": 0.01031037, + "balance_loss_clip": 1.04436362, + "balance_loss_mlp": 1.01939034, + "epoch": 0.48032466556440706, + "flos": 24950874458880.0, + "grad_norm": 1.9699625228346533, + "language_loss": 0.78758395, + "learning_rate": 2.224053348748365e-06, + "loss": 0.80913186, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.11651611, + "step": 7989, + "time_per_iteration": 2.6036760807037354 + }, + { + "auxiliary_loss_clip": 0.01123232, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.04037142, + "balance_loss_mlp": 1.02330709, + "epoch": 0.480384788817075, + "flos": 45296317584000.0, + "grad_norm": 1.7143643645781994, + "language_loss": 0.73963416, + "learning_rate": 2.223666334404724e-06, + "loss": 0.76122725, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.12799072, + "step": 7990, + "time_per_iteration": 2.810222625732422 + }, + { + "auxiliary_loss_clip": 0.01041576, + "auxiliary_loss_mlp": 0.01005745, + "balance_loss_clip": 1.01627767, + "balance_loss_mlp": 1.0044812, + "epoch": 0.480444912069743, + "flos": 84869051499360.0, + "grad_norm": 0.7790295892423004, + "language_loss": 0.59116089, + "learning_rate": 2.223279311579633e-06, + "loss": 0.61163414, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.25366211, + "router_z_loss_mlp": 0.01264191, + "step": 7991, + "time_per_iteration": 3.3839197158813477 + }, + { + "auxiliary_loss_clip": 0.0111844, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.03975654, + "balance_loss_mlp": 1.01852417, + "epoch": 0.48050503532241096, + "flos": 36390147276960.0, + "grad_norm": 2.1669794251107275, + "language_loss": 0.66860104, + "learning_rate": 2.222892280287768e-06, + "loss": 0.69009149, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12097168, + "step": 7992, + "time_per_iteration": 2.701144218444824 + }, + { + "auxiliary_loss_clip": 0.01122156, + "auxiliary_loss_mlp": 0.01034929, + "balance_loss_clip": 1.04003716, + "balance_loss_mlp": 1.02240038, + "epoch": 0.4805651585750789, + "flos": 29222568892800.0, + "grad_norm": 1.8924604901924933, + "language_loss": 0.76327741, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78484821, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.12536621, + "step": 7993, + "time_per_iteration": 2.695266008377075 + }, + { + "auxiliary_loss_clip": 0.01120006, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.04211235, + "balance_loss_mlp": 1.02454925, + "epoch": 0.4806252818277469, + "flos": 31317459490080.0, + "grad_norm": 1.6440807945873157, + "language_loss": 0.7866075, + "learning_rate": 2.222118192362422e-06, + "loss": 0.80817294, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.11987305, + "step": 7994, + "time_per_iteration": 4.082874059677124 + }, + { + "auxiliary_loss_clip": 0.01124948, + "auxiliary_loss_mlp": 0.01028521, + "balance_loss_clip": 1.04397082, + "balance_loss_mlp": 1.01642132, + "epoch": 0.48068540508041485, + "flos": 16047499844160.0, + "grad_norm": 2.0016199509268144, + "language_loss": 0.79539269, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.81692737, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.12097168, + "step": 7995, + "time_per_iteration": 2.6350982189178467 + }, + { + "auxiliary_loss_clip": 0.0111805, + "auxiliary_loss_mlp": 0.01030244, + "balance_loss_clip": 1.03945827, + "balance_loss_mlp": 1.01814401, + "epoch": 0.4807455283330828, + "flos": 25840101746880.0, + "grad_norm": 1.6012194171010408, + "language_loss": 0.82733428, + "learning_rate": 2.2213440707461e-06, + "loss": 0.84881717, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12084961, + "step": 7996, + "time_per_iteration": 2.7164676189422607 + }, + { + "auxiliary_loss_clip": 0.0112013, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.04189038, + "balance_loss_mlp": 1.01774049, + "epoch": 0.4808056515857508, + "flos": 14978563303680.0, + "grad_norm": 1.7269867766654117, + "language_loss": 0.80575138, + "learning_rate": 2.220956997340516e-06, + "loss": 0.82725012, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12005615, + "step": 7997, + "time_per_iteration": 2.7001821994781494 + }, + { + "auxiliary_loss_clip": 0.01119938, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.04015386, + "balance_loss_mlp": 1.0237267, + "epoch": 0.48086577483841875, + "flos": 30293207159040.0, + "grad_norm": 2.4805871151841745, + "language_loss": 0.72674549, + "learning_rate": 2.220569915556221e-06, + "loss": 0.74830937, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.1272583, + "step": 7998, + "time_per_iteration": 2.7370705604553223 + }, + { + "auxiliary_loss_clip": 0.01123552, + "auxiliary_loss_mlp": 0.01031222, + "balance_loss_clip": 1.04338312, + "balance_loss_mlp": 1.01887155, + "epoch": 0.4809258980910867, + "flos": 29852685760320.0, + "grad_norm": 1.9111189047594244, + "language_loss": 0.7034753, + "learning_rate": 2.220182825407892e-06, + "loss": 0.72502303, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.12353516, + "step": 7999, + "time_per_iteration": 3.97324538230896 + }, + { + "auxiliary_loss_clip": 0.01123585, + "auxiliary_loss_mlp": 0.01034274, + "balance_loss_clip": 1.041646, + "balance_loss_mlp": 1.02156615, + "epoch": 0.4809860213437547, + "flos": 25887703200480.0, + "grad_norm": 1.6396699442162108, + "language_loss": 0.70982957, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73140812, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.12713623, + "step": 8000, + "time_per_iteration": 2.717874526977539 + }, + { + "auxiliary_loss_clip": 0.01124605, + "auxiliary_loss_mlp": 0.01036041, + "balance_loss_clip": 1.04466069, + "balance_loss_mlp": 1.02310705, + "epoch": 0.48104614459642264, + "flos": 45920154273120.0, + "grad_norm": 1.5325164382972987, + "language_loss": 0.74675548, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.76836193, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12939453, + "step": 8001, + "time_per_iteration": 2.7846577167510986 + }, + { + "auxiliary_loss_clip": 0.01124888, + "auxiliary_loss_mlp": 0.01038938, + "balance_loss_clip": 1.04301345, + "balance_loss_mlp": 1.02605772, + "epoch": 0.48110626784909066, + "flos": 22458971671200.0, + "grad_norm": 1.894672721313533, + "language_loss": 0.81076324, + "learning_rate": 2.219021504925493e-06, + "loss": 0.83240145, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 0.81835938, + "router_z_loss_mlp": 0.12866211, + "step": 8002, + "time_per_iteration": 2.6576366424560547 + }, + { + "auxiliary_loss_clip": 0.01127809, + "auxiliary_loss_mlp": 0.01031744, + "balance_loss_clip": 1.04472494, + "balance_loss_mlp": 1.01842284, + "epoch": 0.48116639110175863, + "flos": 34702843880160.0, + "grad_norm": 1.881336609863586, + "language_loss": 0.71272659, + "learning_rate": 2.218634381467819e-06, + "loss": 0.73432207, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.13330078, + "step": 8003, + "time_per_iteration": 2.731656551361084 + }, + { + "auxiliary_loss_clip": 0.01121015, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.04329133, + "balance_loss_mlp": 1.02205372, + "epoch": 0.4812265143544266, + "flos": 26509311439200.0, + "grad_norm": 1.9135757143310792, + "language_loss": 0.82427168, + "learning_rate": 2.218247249719507e-06, + "loss": 0.84582263, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12030029, + "step": 8004, + "time_per_iteration": 2.6471621990203857 + }, + { + "auxiliary_loss_clip": 0.01133526, + "auxiliary_loss_mlp": 0.01039091, + "balance_loss_clip": 1.04624045, + "balance_loss_mlp": 1.02487493, + "epoch": 0.48128663760709456, + "flos": 16136219986560.0, + "grad_norm": 2.2253147741116686, + "language_loss": 0.77012217, + "learning_rate": 2.217860109695239e-06, + "loss": 0.7918483, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 0.87207031, + "router_z_loss_mlp": 0.14221191, + "step": 8005, + "time_per_iteration": 2.613539218902588 + }, + { + "auxiliary_loss_clip": 0.01123129, + "auxiliary_loss_mlp": 0.01035302, + "balance_loss_clip": 1.04168057, + "balance_loss_mlp": 1.02257037, + "epoch": 0.4813467608597625, + "flos": 29582149466880.0, + "grad_norm": 2.978945771102254, + "language_loss": 0.70725685, + "learning_rate": 2.217472961409692e-06, + "loss": 0.72884119, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.12731934, + "step": 8006, + "time_per_iteration": 2.631917715072632 + }, + { + "auxiliary_loss_clip": 0.01123155, + "auxiliary_loss_mlp": 0.01033685, + "balance_loss_clip": 1.04209685, + "balance_loss_mlp": 1.02081013, + "epoch": 0.4814068841124305, + "flos": 33533113047840.0, + "grad_norm": 2.0136234025802766, + "language_loss": 0.70336598, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.72493434, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.12872314, + "step": 8007, + "time_per_iteration": 2.752061605453491 + }, + { + "auxiliary_loss_clip": 0.01123803, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.04180908, + "balance_loss_mlp": 1.01910377, + "epoch": 0.48146700736509845, + "flos": 23882829298560.0, + "grad_norm": 2.016267751576936, + "language_loss": 0.71350002, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.73505282, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.12365723, + "step": 8008, + "time_per_iteration": 2.652167558670044 + }, + { + "auxiliary_loss_clip": 0.01129467, + "auxiliary_loss_mlp": 0.01042327, + "balance_loss_clip": 1.04567456, + "balance_loss_mlp": 1.02883887, + "epoch": 0.4815271306177664, + "flos": 25170365329920.0, + "grad_norm": 4.053792130416449, + "language_loss": 0.60577011, + "learning_rate": 2.216311467132199e-06, + "loss": 0.62748814, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 0.83789062, + "router_z_loss_mlp": 0.13476562, + "step": 8009, + "time_per_iteration": 2.6668405532836914 + }, + { + "auxiliary_loss_clip": 0.01044515, + "auxiliary_loss_mlp": 0.01001881, + "balance_loss_clip": 1.01913834, + "balance_loss_mlp": 1.00054038, + "epoch": 0.4815872538704344, + "flos": 82597571480160.0, + "grad_norm": 0.8666667805679459, + "language_loss": 0.61275947, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.63322341, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.25415039, + "router_z_loss_mlp": 0.01340485, + "step": 8010, + "time_per_iteration": 3.2667434215545654 + }, + { + "auxiliary_loss_clip": 0.0112617, + "auxiliary_loss_mlp": 0.01044002, + "balance_loss_clip": 1.04453206, + "balance_loss_mlp": 1.03067446, + "epoch": 0.48164737712310235, + "flos": 27846069615360.0, + "grad_norm": 2.2164123750613625, + "language_loss": 0.73686349, + "learning_rate": 2.215537096576639e-06, + "loss": 0.75856525, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.13317871, + "step": 8011, + "time_per_iteration": 2.6680920124053955 + }, + { + "auxiliary_loss_clip": 0.0112001, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.04225814, + "balance_loss_mlp": 1.02524877, + "epoch": 0.4817075003757703, + "flos": 28960217089920.0, + "grad_norm": 2.2440932462691263, + "language_loss": 0.79356503, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.81513214, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11456299, + "step": 8012, + "time_per_iteration": 2.6852715015411377 + }, + { + "auxiliary_loss_clip": 0.01125503, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.04384899, + "balance_loss_mlp": 1.03086829, + "epoch": 0.4817676236284383, + "flos": 34389487172160.0, + "grad_norm": 2.219133546954301, + "language_loss": 0.73516047, + "learning_rate": 2.214762693328326e-06, + "loss": 0.75685811, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13391113, + "step": 8013, + "time_per_iteration": 2.7015340328216553 + }, + { + "auxiliary_loss_clip": 0.01123902, + "auxiliary_loss_mlp": 0.0103177, + "balance_loss_clip": 1.04474068, + "balance_loss_mlp": 1.0195334, + "epoch": 0.48182774688110624, + "flos": 20855161687680.0, + "grad_norm": 2.19430253090804, + "language_loss": 0.90654945, + "learning_rate": 2.214375479481094e-06, + "loss": 0.92810613, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12243652, + "step": 8014, + "time_per_iteration": 2.6408767700195312 + }, + { + "auxiliary_loss_clip": 0.01127855, + "auxiliary_loss_mlp": 0.01039113, + "balance_loss_clip": 1.04299498, + "balance_loss_mlp": 1.02542233, + "epoch": 0.4818878701337742, + "flos": 15334740426240.0, + "grad_norm": 2.5526426181394624, + "language_loss": 0.74543858, + "learning_rate": 2.213988257504722e-06, + "loss": 0.7671082, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.13708496, + "step": 8015, + "time_per_iteration": 2.627894878387451 + }, + { + "auxiliary_loss_clip": 0.01126649, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.04099631, + "balance_loss_mlp": 1.02472019, + "epoch": 0.48194799338644223, + "flos": 30027613973760.0, + "grad_norm": 2.151143354998968, + "language_loss": 0.8066349, + "learning_rate": 2.213601027413894e-06, + "loss": 0.82827985, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 0.85595703, + "router_z_loss_mlp": 0.13122559, + "step": 8016, + "time_per_iteration": 2.74267315864563 + }, + { + "auxiliary_loss_clip": 0.01121637, + "auxiliary_loss_mlp": 0.01030181, + "balance_loss_clip": 1.04438114, + "balance_loss_mlp": 1.01814723, + "epoch": 0.4820081166391102, + "flos": 25752678157440.0, + "grad_norm": 2.0419284725713203, + "language_loss": 0.77552986, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.79704809, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12030029, + "step": 8017, + "time_per_iteration": 2.660537004470825 + }, + { + "auxiliary_loss_clip": 0.01121852, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.04386473, + "balance_loss_mlp": 1.01870394, + "epoch": 0.48206823989177816, + "flos": 30828283188480.0, + "grad_norm": 1.9704120875949014, + "language_loss": 0.80279207, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.82432246, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12493896, + "step": 8018, + "time_per_iteration": 2.711904287338257 + }, + { + "auxiliary_loss_clip": 0.01126313, + "auxiliary_loss_mlp": 0.01032363, + "balance_loss_clip": 1.04466176, + "balance_loss_mlp": 1.02000093, + "epoch": 0.4821283631444461, + "flos": 30072217148640.0, + "grad_norm": 1.938309754708891, + "language_loss": 0.76242959, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.78401637, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.12359619, + "step": 8019, + "time_per_iteration": 2.7109382152557373 + }, + { + "auxiliary_loss_clip": 0.01123056, + "auxiliary_loss_mlp": 0.0103586, + "balance_loss_clip": 1.04114175, + "balance_loss_mlp": 1.02290154, + "epoch": 0.4821884863971141, + "flos": 29226580103520.0, + "grad_norm": 1.9577448511314637, + "language_loss": 0.78954875, + "learning_rate": 2.212052026199701e-06, + "loss": 0.81113791, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.12969971, + "step": 8020, + "time_per_iteration": 2.7605173587799072 + }, + { + "auxiliary_loss_clip": 0.01121578, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.04258108, + "balance_loss_mlp": 1.01914883, + "epoch": 0.48224860964978206, + "flos": 20939627515680.0, + "grad_norm": 1.9308902697707262, + "language_loss": 0.6959511, + "learning_rate": 2.211664755756855e-06, + "loss": 0.71748745, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12921143, + "step": 8021, + "time_per_iteration": 2.617729663848877 + }, + { + "auxiliary_loss_clip": 0.01128229, + "auxiliary_loss_mlp": 0.01034037, + "balance_loss_clip": 1.04345596, + "balance_loss_mlp": 1.0205189, + "epoch": 0.48230873290245, + "flos": 28157359942080.0, + "grad_norm": 2.2293987745923936, + "language_loss": 0.62945348, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.65107608, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 0.84765625, + "router_z_loss_mlp": 0.13519287, + "step": 8022, + "time_per_iteration": 2.6909122467041016 + }, + { + "auxiliary_loss_clip": 0.01121518, + "auxiliary_loss_mlp": 0.01029147, + "balance_loss_clip": 1.04264545, + "balance_loss_mlp": 1.01732099, + "epoch": 0.482368856155118, + "flos": 23615169732000.0, + "grad_norm": 3.7955449478329553, + "language_loss": 0.67040223, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.6919089, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.11834717, + "step": 8023, + "time_per_iteration": 2.6129825115203857 + }, + { + "auxiliary_loss_clip": 0.01122038, + "auxiliary_loss_mlp": 0.01031334, + "balance_loss_clip": 1.04178321, + "balance_loss_mlp": 1.01913273, + "epoch": 0.48242897940778595, + "flos": 24500831499360.0, + "grad_norm": 2.287494778657052, + "language_loss": 0.76713848, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.78867221, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.12207031, + "step": 8024, + "time_per_iteration": 2.7294864654541016 + }, + { + "auxiliary_loss_clip": 0.0112374, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.04194736, + "balance_loss_mlp": 1.02091885, + "epoch": 0.4824891026604539, + "flos": 28557816101280.0, + "grad_norm": 1.503568485332822, + "language_loss": 0.75118172, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.77276611, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13781738, + "step": 8025, + "time_per_iteration": 3.977752923965454 + }, + { + "auxiliary_loss_clip": 0.01121889, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.04222155, + "balance_loss_mlp": 1.02107787, + "epoch": 0.4825492259131219, + "flos": 24854537067840.0, + "grad_norm": 1.9988299010715598, + "language_loss": 0.70882535, + "learning_rate": 2.209728283441112e-06, + "loss": 0.73038006, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12512207, + "step": 8026, + "time_per_iteration": 4.061312675476074 + }, + { + "auxiliary_loss_clip": 0.01126055, + "auxiliary_loss_mlp": 0.01039109, + "balance_loss_clip": 1.04217362, + "balance_loss_mlp": 1.02495301, + "epoch": 0.48260934916578985, + "flos": 17471641092480.0, + "grad_norm": 2.4181091222237927, + "language_loss": 0.74566972, + "learning_rate": 2.209340965060465e-06, + "loss": 0.76732129, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 0.83837891, + "router_z_loss_mlp": 0.14141846, + "step": 8027, + "time_per_iteration": 2.65187406539917 + }, + { + "auxiliary_loss_clip": 0.01126671, + "auxiliary_loss_mlp": 0.01034462, + "balance_loss_clip": 1.04363263, + "balance_loss_mlp": 1.02173007, + "epoch": 0.4826694724184578, + "flos": 26992936873440.0, + "grad_norm": 1.7546201049654808, + "language_loss": 0.67700195, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.69861329, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 0.82958984, + "router_z_loss_mlp": 0.1272583, + "step": 8028, + "time_per_iteration": 2.7075283527374268 + }, + { + "auxiliary_loss_clip": 0.01124841, + "auxiliary_loss_mlp": 0.0103462, + "balance_loss_clip": 1.04410815, + "balance_loss_mlp": 1.02194202, + "epoch": 0.48272959567112583, + "flos": 19743850422720.0, + "grad_norm": 1.9956498756729548, + "language_loss": 0.72928101, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.75087559, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.12677002, + "step": 8029, + "time_per_iteration": 2.6213672161102295 + }, + { + "auxiliary_loss_clip": 0.01124258, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.04238212, + "balance_loss_mlp": 1.01516342, + "epoch": 0.4827897189237938, + "flos": 28285456530240.0, + "grad_norm": 2.2405332687150743, + "language_loss": 0.8433224, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.86485052, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13397217, + "step": 8030, + "time_per_iteration": 2.6671242713928223 + }, + { + "auxiliary_loss_clip": 0.01120603, + "auxiliary_loss_mlp": 0.0103148, + "balance_loss_clip": 1.04081392, + "balance_loss_mlp": 1.01870096, + "epoch": 0.48284984217646176, + "flos": 26420672331360.0, + "grad_norm": 1.9946693701343436, + "language_loss": 0.73926091, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.76078176, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12792969, + "step": 8031, + "time_per_iteration": 2.684267520904541 + }, + { + "auxiliary_loss_clip": 0.01127324, + "auxiliary_loss_mlp": 0.01042094, + "balance_loss_clip": 1.04252422, + "balance_loss_mlp": 1.02853954, + "epoch": 0.48290996542912973, + "flos": 38397452215680.0, + "grad_norm": 2.115908986488869, + "language_loss": 0.71924984, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.74094403, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 0.84912109, + "router_z_loss_mlp": 0.13555908, + "step": 8032, + "time_per_iteration": 2.762930393218994 + }, + { + "auxiliary_loss_clip": 0.01121477, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.04081929, + "balance_loss_mlp": 1.02038717, + "epoch": 0.4829700886817977, + "flos": 29848836618720.0, + "grad_norm": 1.7062902022049697, + "language_loss": 0.73899019, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.76053345, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.12463379, + "step": 8033, + "time_per_iteration": 2.6654152870178223 + }, + { + "auxiliary_loss_clip": 0.01128327, + "auxiliary_loss_mlp": 0.01035771, + "balance_loss_clip": 1.04359579, + "balance_loss_mlp": 1.02271199, + "epoch": 0.48303021193446566, + "flos": 31362265251360.0, + "grad_norm": 1.64186321949997, + "language_loss": 0.83476824, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85640919, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 0.84667969, + "router_z_loss_mlp": 0.1305542, + "step": 8034, + "time_per_iteration": 4.129377365112305 + }, + { + "auxiliary_loss_clip": 0.01118994, + "auxiliary_loss_mlp": 0.01031283, + "balance_loss_clip": 1.04136598, + "balance_loss_mlp": 1.01909351, + "epoch": 0.4830903351871336, + "flos": 24507881506080.0, + "grad_norm": 1.7149770057341036, + "language_loss": 0.79182124, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.81332403, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12194824, + "step": 8035, + "time_per_iteration": 2.6623568534851074 + }, + { + "auxiliary_loss_clip": 0.01121103, + "auxiliary_loss_mlp": 0.01038107, + "balance_loss_clip": 1.04005527, + "balance_loss_mlp": 1.02387357, + "epoch": 0.4831504584398016, + "flos": 48141601801920.0, + "grad_norm": 1.9534214868613258, + "language_loss": 0.69425392, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.71584606, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.14227295, + "step": 8036, + "time_per_iteration": 2.8000662326812744 + }, + { + "auxiliary_loss_clip": 0.01120135, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.04071331, + "balance_loss_mlp": 1.0207324, + "epoch": 0.48321058169246955, + "flos": 24412435495200.0, + "grad_norm": 2.3706247286899855, + "language_loss": 0.73305666, + "learning_rate": 2.205467347074847e-06, + "loss": 0.75458813, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12280273, + "step": 8037, + "time_per_iteration": 2.6440529823303223 + }, + { + "auxiliary_loss_clip": 0.01129475, + "auxiliary_loss_mlp": 0.01030823, + "balance_loss_clip": 1.04379892, + "balance_loss_mlp": 1.01705468, + "epoch": 0.4832707049451375, + "flos": 25308712789920.0, + "grad_norm": 4.4246641469662285, + "language_loss": 0.69015765, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71176058, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 0.85693359, + "router_z_loss_mlp": 0.13769531, + "step": 8038, + "time_per_iteration": 3.935145378112793 + }, + { + "auxiliary_loss_clip": 0.01121789, + "auxiliary_loss_mlp": 0.01032179, + "balance_loss_clip": 1.04148126, + "balance_loss_mlp": 1.01922143, + "epoch": 0.4833308281978055, + "flos": 40445268084000.0, + "grad_norm": 1.8785157578122755, + "language_loss": 0.78899276, + "learning_rate": 2.20469252951155e-06, + "loss": 0.81053245, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12969971, + "step": 8039, + "time_per_iteration": 2.7474091053009033 + }, + { + "auxiliary_loss_clip": 0.01123291, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.04278493, + "balance_loss_mlp": 1.01848531, + "epoch": 0.48339095145047345, + "flos": 23304770785440.0, + "grad_norm": 1.46099390852384, + "language_loss": 0.77536857, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.79691082, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.12438965, + "step": 8040, + "time_per_iteration": 2.65980863571167 + }, + { + "auxiliary_loss_clip": 0.01125722, + "auxiliary_loss_mlp": 0.0103527, + "balance_loss_clip": 1.04321849, + "balance_loss_mlp": 1.02162099, + "epoch": 0.4834510747031414, + "flos": 42048510825600.0, + "grad_norm": 1.6455560784333614, + "language_loss": 0.75268775, + "learning_rate": 2.203917680900409e-06, + "loss": 0.77429765, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.13647461, + "step": 8041, + "time_per_iteration": 2.7479794025421143 + }, + { + "auxiliary_loss_clip": 0.01123534, + "auxiliary_loss_mlp": 0.01031239, + "balance_loss_clip": 1.04501343, + "balance_loss_mlp": 1.01847184, + "epoch": 0.48351119795580944, + "flos": 33410688878880.0, + "grad_norm": 1.95908276501213, + "language_loss": 0.66906035, + "learning_rate": 2.203530244988624e-06, + "loss": 0.69060808, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12774658, + "step": 8042, + "time_per_iteration": 2.7605254650115967 + }, + { + "auxiliary_loss_clip": 0.01041486, + "auxiliary_loss_mlp": 0.01002205, + "balance_loss_clip": 1.01625466, + "balance_loss_mlp": 1.00080276, + "epoch": 0.4835713212084774, + "flos": 81929901444480.0, + "grad_norm": 0.6922748916011796, + "language_loss": 0.58911842, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60955536, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 0.25292969, + "router_z_loss_mlp": 0.01400757, + "step": 8043, + "time_per_iteration": 3.400545597076416 + }, + { + "auxiliary_loss_clip": 0.01123755, + "auxiliary_loss_mlp": 0.01030916, + "balance_loss_clip": 1.04126358, + "balance_loss_mlp": 1.01773691, + "epoch": 0.48363144446114537, + "flos": 21923774089920.0, + "grad_norm": 2.20922007887653, + "language_loss": 0.71922231, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.74076903, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13183594, + "step": 8044, + "time_per_iteration": 2.7214677333831787 + }, + { + "auxiliary_loss_clip": 0.01122322, + "auxiliary_loss_mlp": 0.0103109, + "balance_loss_clip": 1.04284668, + "balance_loss_mlp": 1.01754808, + "epoch": 0.48369156771381333, + "flos": 25128355260960.0, + "grad_norm": 1.592845312286485, + "language_loss": 0.75764948, + "learning_rate": 2.202367891004714e-06, + "loss": 0.77918363, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13549805, + "step": 8045, + "time_per_iteration": 2.6422250270843506 + }, + { + "auxiliary_loss_clip": 0.01124673, + "auxiliary_loss_mlp": 0.01034694, + "balance_loss_clip": 1.04231763, + "balance_loss_mlp": 1.02200437, + "epoch": 0.4837516909664813, + "flos": 27668264675040.0, + "grad_norm": 2.487920681137388, + "language_loss": 0.69520307, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71679676, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.12683105, + "step": 8046, + "time_per_iteration": 2.701460599899292 + }, + { + "auxiliary_loss_clip": 0.01123429, + "auxiliary_loss_mlp": 0.01033774, + "balance_loss_clip": 1.04186058, + "balance_loss_mlp": 1.02034461, + "epoch": 0.48381181421914926, + "flos": 31138398514080.0, + "grad_norm": 2.318403684277291, + "language_loss": 0.82099152, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.84256357, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13439941, + "step": 8047, + "time_per_iteration": 2.677863836288452 + }, + { + "auxiliary_loss_clip": 0.01122643, + "auxiliary_loss_mlp": 0.01032692, + "balance_loss_clip": 1.04270697, + "balance_loss_mlp": 1.02000833, + "epoch": 0.4838719374718172, + "flos": 29538235085760.0, + "grad_norm": 1.7338083856330442, + "language_loss": 0.8013519, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.82290524, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12664795, + "step": 8048, + "time_per_iteration": 2.687817096710205 + }, + { + "auxiliary_loss_clip": 0.01126138, + "auxiliary_loss_mlp": 0.01038822, + "balance_loss_clip": 1.04295969, + "balance_loss_mlp": 1.02561378, + "epoch": 0.4839320607244852, + "flos": 32609817077760.0, + "grad_norm": 1.8995111768374078, + "language_loss": 0.81480038, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83645004, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 0.83007812, + "router_z_loss_mlp": 0.13214111, + "step": 8049, + "time_per_iteration": 2.709686517715454 + }, + { + "auxiliary_loss_clip": 0.01122315, + "auxiliary_loss_mlp": 0.01031171, + "balance_loss_clip": 1.04505587, + "balance_loss_mlp": 1.01906538, + "epoch": 0.48399218397715316, + "flos": 24951765839040.0, + "grad_norm": 1.7614838254117262, + "language_loss": 0.73084724, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.75238216, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12103271, + "step": 8050, + "time_per_iteration": 2.612349510192871 + }, + { + "auxiliary_loss_clip": 0.01041053, + "auxiliary_loss_mlp": 0.01005361, + "balance_loss_clip": 1.01593435, + "balance_loss_mlp": 1.00403976, + "epoch": 0.4840523072298211, + "flos": 81974585653920.0, + "grad_norm": 0.6961180775917969, + "language_loss": 0.56330627, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58377039, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.25146484, + "router_z_loss_mlp": 0.01322174, + "step": 8051, + "time_per_iteration": 3.3295645713806152 + }, + { + "auxiliary_loss_clip": 0.01127196, + "auxiliary_loss_mlp": 0.0103533, + "balance_loss_clip": 1.0448463, + "balance_loss_mlp": 1.02200866, + "epoch": 0.4841124304824891, + "flos": 27347371752960.0, + "grad_norm": 4.213986889513842, + "language_loss": 0.75463474, + "learning_rate": 2.199655463811236e-06, + "loss": 0.77626008, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.13330078, + "step": 8052, + "time_per_iteration": 2.6218268871307373 + }, + { + "auxiliary_loss_clip": 0.01123523, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.04258704, + "balance_loss_mlp": 1.01940751, + "epoch": 0.48417255373515705, + "flos": 16892002405440.0, + "grad_norm": 2.306019671609677, + "language_loss": 0.66607594, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.68763059, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.12536621, + "step": 8053, + "time_per_iteration": 2.731210231781006 + }, + { + "auxiliary_loss_clip": 0.01121489, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.04209971, + "balance_loss_mlp": 1.02149177, + "epoch": 0.484232676987825, + "flos": 38619860330880.0, + "grad_norm": 2.0434701807617808, + "language_loss": 0.69185472, + "learning_rate": 2.198880416254091e-06, + "loss": 0.71340609, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12164307, + "step": 8054, + "time_per_iteration": 2.7508761882781982 + }, + { + "auxiliary_loss_clip": 0.01121443, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.04056787, + "balance_loss_mlp": 1.02021718, + "epoch": 0.48429280024049304, + "flos": 29401670386080.0, + "grad_norm": 2.0525626742988394, + "language_loss": 0.69733024, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.71887922, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13244629, + "step": 8055, + "time_per_iteration": 2.6776297092437744 + }, + { + "auxiliary_loss_clip": 0.01125151, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.04432225, + "balance_loss_mlp": 1.02096772, + "epoch": 0.484352923493161, + "flos": 21389994613440.0, + "grad_norm": 3.1266722722279954, + "language_loss": 0.6346401, + "learning_rate": 2.198105338530685e-06, + "loss": 0.65623158, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13043213, + "step": 8056, + "time_per_iteration": 2.6436829566955566 + }, + { + "auxiliary_loss_clip": 0.01121692, + "auxiliary_loss_mlp": 0.01030713, + "balance_loss_clip": 1.0403235, + "balance_loss_mlp": 1.01722443, + "epoch": 0.48441304674582897, + "flos": 35590288407840.0, + "grad_norm": 2.2678759153544954, + "language_loss": 0.67500091, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.69652498, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13476562, + "step": 8057, + "time_per_iteration": 2.738771915435791 + }, + { + "auxiliary_loss_clip": 0.01121484, + "auxiliary_loss_mlp": 0.01029278, + "balance_loss_clip": 1.04138219, + "balance_loss_mlp": 1.01661801, + "epoch": 0.48447316999849693, + "flos": 19384756056000.0, + "grad_norm": 2.00322602186338, + "language_loss": 0.81906378, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.84057146, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12664795, + "step": 8058, + "time_per_iteration": 2.6558852195739746 + }, + { + "auxiliary_loss_clip": 0.01126216, + "auxiliary_loss_mlp": 0.01032661, + "balance_loss_clip": 1.04312396, + "balance_loss_mlp": 1.01951241, + "epoch": 0.4845332932511649, + "flos": 29751243192000.0, + "grad_norm": 2.042274584111005, + "language_loss": 0.79901421, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.82060295, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13146973, + "step": 8059, + "time_per_iteration": 2.6452786922454834 + }, + { + "auxiliary_loss_clip": 0.01129912, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.0447191, + "balance_loss_mlp": 1.02409887, + "epoch": 0.48459341650383286, + "flos": 45291414993120.0, + "grad_norm": 2.6463289893310042, + "language_loss": 0.66482186, + "learning_rate": 2.196555093055352e-06, + "loss": 0.68649888, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 0.85107422, + "router_z_loss_mlp": 0.13690186, + "step": 8060, + "time_per_iteration": 2.7935945987701416 + }, + { + "auxiliary_loss_clip": 0.01127657, + "auxiliary_loss_mlp": 0.01040499, + "balance_loss_clip": 1.04528677, + "balance_loss_mlp": 1.02754664, + "epoch": 0.48465353975650083, + "flos": 28024239211200.0, + "grad_norm": 2.1204771919140106, + "language_loss": 0.67259824, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69427979, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.12963867, + "step": 8061, + "time_per_iteration": 2.6473186016082764 + }, + { + "auxiliary_loss_clip": 0.01127773, + "auxiliary_loss_mlp": 0.01042789, + "balance_loss_clip": 1.04504633, + "balance_loss_mlp": 1.02860272, + "epoch": 0.4847136630091688, + "flos": 21607175999520.0, + "grad_norm": 2.0188625561131315, + "language_loss": 0.82574725, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84745294, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.14196777, + "step": 8062, + "time_per_iteration": 2.6158223152160645 + }, + { + "auxiliary_loss_clip": 0.01124195, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.04368055, + "balance_loss_mlp": 1.02086926, + "epoch": 0.48477378626183676, + "flos": 26866785114720.0, + "grad_norm": 1.5483942788311986, + "language_loss": 0.74457073, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76614881, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.12719727, + "step": 8063, + "time_per_iteration": 2.625847101211548 + }, + { + "auxiliary_loss_clip": 0.01125132, + "auxiliary_loss_mlp": 0.01035898, + "balance_loss_clip": 1.04276299, + "balance_loss_mlp": 1.02282667, + "epoch": 0.4848339095145047, + "flos": 34121179329120.0, + "grad_norm": 1.7103271500315422, + "language_loss": 0.79087847, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.81248879, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.1307373, + "step": 8064, + "time_per_iteration": 4.235136270523071 + }, + { + "auxiliary_loss_clip": 0.01123097, + "auxiliary_loss_mlp": 0.01034803, + "balance_loss_clip": 1.0454973, + "balance_loss_mlp": 1.02293539, + "epoch": 0.4848940327671727, + "flos": 26464789298880.0, + "grad_norm": 2.059648942132191, + "language_loss": 0.79043972, + "learning_rate": 2.194617118620173e-06, + "loss": 0.81201875, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11865234, + "step": 8065, + "time_per_iteration": 4.1399500370025635 + }, + { + "auxiliary_loss_clip": 0.01117017, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.03984976, + "balance_loss_mlp": 1.02221465, + "epoch": 0.48495415601984065, + "flos": 25174660161600.0, + "grad_norm": 1.9754030973636085, + "language_loss": 0.7615962, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78310931, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12078857, + "step": 8066, + "time_per_iteration": 2.6331582069396973 + }, + { + "auxiliary_loss_clip": 0.01122928, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.04433787, + "balance_loss_mlp": 1.01856768, + "epoch": 0.4850142792725086, + "flos": 31272045969600.0, + "grad_norm": 1.422745383864029, + "language_loss": 0.72069496, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74223149, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12164307, + "step": 8067, + "time_per_iteration": 2.6691625118255615 + }, + { + "auxiliary_loss_clip": 0.01124458, + "auxiliary_loss_mlp": 0.01035562, + "balance_loss_clip": 1.04334581, + "balance_loss_mlp": 1.02194834, + "epoch": 0.4850744025251766, + "flos": 16804052091360.0, + "grad_norm": 2.1361376348195606, + "language_loss": 0.79174447, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.81334472, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13616943, + "step": 8068, + "time_per_iteration": 2.6185684204101562 + }, + { + "auxiliary_loss_clip": 0.01119837, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.04114962, + "balance_loss_mlp": 1.01969695, + "epoch": 0.4851345257778446, + "flos": 24723361166400.0, + "grad_norm": 1.5138625286779164, + "language_loss": 0.84316021, + "learning_rate": 2.193066606145638e-06, + "loss": 0.86467683, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12139893, + "step": 8069, + "time_per_iteration": 2.6380555629730225 + }, + { + "auxiliary_loss_clip": 0.01121682, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.04235113, + "balance_loss_mlp": 1.02119911, + "epoch": 0.48519464903051257, + "flos": 33055322101920.0, + "grad_norm": 3.147685696524702, + "language_loss": 0.77732718, + "learning_rate": 2.192678959687493e-06, + "loss": 0.79887915, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12322998, + "step": 8070, + "time_per_iteration": 2.6988766193389893 + }, + { + "auxiliary_loss_clip": 0.01123883, + "auxiliary_loss_mlp": 0.01031918, + "balance_loss_clip": 1.04421365, + "balance_loss_mlp": 1.01913881, + "epoch": 0.48525477228318054, + "flos": 20898549344160.0, + "grad_norm": 6.422620557525551, + "language_loss": 0.77987444, + "learning_rate": 2.192291305922943e-06, + "loss": 0.80143237, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12768555, + "step": 8071, + "time_per_iteration": 2.585251808166504 + }, + { + "auxiliary_loss_clip": 0.01121783, + "auxiliary_loss_mlp": 0.01033669, + "balance_loss_clip": 1.04045749, + "balance_loss_mlp": 1.02023399, + "epoch": 0.4853148955358485, + "flos": 34386569928000.0, + "grad_norm": 1.9480074855531606, + "language_loss": 0.71628439, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.73783898, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13427734, + "step": 8072, + "time_per_iteration": 2.678262233734131 + }, + { + "auxiliary_loss_clip": 0.0112633, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.04353619, + "balance_loss_mlp": 1.02262843, + "epoch": 0.48537501878851647, + "flos": 21345148334880.0, + "grad_norm": 2.045829158711305, + "language_loss": 0.87664628, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.89827001, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13421631, + "step": 8073, + "time_per_iteration": 4.0727386474609375 + }, + { + "auxiliary_loss_clip": 0.01119485, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.04244256, + "balance_loss_mlp": 1.01734102, + "epoch": 0.48543514204118443, + "flos": 34879919509440.0, + "grad_norm": 1.750759974036309, + "language_loss": 0.60674685, + "learning_rate": 2.19112830093786e-06, + "loss": 0.6282444, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12915039, + "step": 8074, + "time_per_iteration": 2.716190814971924 + }, + { + "auxiliary_loss_clip": 0.01123719, + "auxiliary_loss_mlp": 0.0103684, + "balance_loss_clip": 1.04137826, + "balance_loss_mlp": 1.02376306, + "epoch": 0.4854952652938524, + "flos": 25575399941760.0, + "grad_norm": 1.6136377726250404, + "language_loss": 0.73090887, + "learning_rate": 2.19074061809469e-06, + "loss": 0.75251448, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.1307373, + "step": 8075, + "time_per_iteration": 2.685725450515747 + }, + { + "auxiliary_loss_clip": 0.01119079, + "auxiliary_loss_mlp": 0.01035408, + "balance_loss_clip": 1.04246676, + "balance_loss_mlp": 1.02324295, + "epoch": 0.48555538854652036, + "flos": 81182985959520.0, + "grad_norm": 1.556803230399035, + "language_loss": 0.8148616, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.83640647, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.1217041, + "step": 8076, + "time_per_iteration": 3.0451388359069824 + }, + { + "auxiliary_loss_clip": 0.01125252, + "auxiliary_loss_mlp": 0.01032699, + "balance_loss_clip": 1.04523849, + "balance_loss_mlp": 1.01887643, + "epoch": 0.4856155117991883, + "flos": 19431304060320.0, + "grad_norm": 41.938659958721814, + "language_loss": 0.86332756, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.88490707, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13812256, + "step": 8077, + "time_per_iteration": 3.8683319091796875 + }, + { + "auxiliary_loss_clip": 0.01039695, + "auxiliary_loss_mlp": 0.01003988, + "balance_loss_clip": 1.01481557, + "balance_loss_mlp": 1.0026958, + "epoch": 0.4856756350518563, + "flos": 86693243734080.0, + "grad_norm": 0.9047556376593823, + "language_loss": 0.5849604, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60539722, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01292419, + "step": 8078, + "time_per_iteration": 3.2942254543304443 + }, + { + "auxiliary_loss_clip": 0.01128669, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.04601431, + "balance_loss_mlp": 1.01931715, + "epoch": 0.48573575830452426, + "flos": 36394036935840.0, + "grad_norm": 1.6596983209339964, + "language_loss": 0.72189879, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.74350739, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.12872314, + "step": 8079, + "time_per_iteration": 2.693301200866699 + }, + { + "auxiliary_loss_clip": 0.01126722, + "auxiliary_loss_mlp": 0.01030889, + "balance_loss_clip": 1.04544902, + "balance_loss_mlp": 1.01831198, + "epoch": 0.4857958815571922, + "flos": 21523520517120.0, + "grad_norm": 2.2957338579462983, + "language_loss": 0.79454505, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.81612122, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.12567139, + "step": 8080, + "time_per_iteration": 2.644231081008911 + }, + { + "auxiliary_loss_clip": 0.0112268, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.04206336, + "balance_loss_mlp": 1.01977587, + "epoch": 0.4858560048098602, + "flos": 25753002295680.0, + "grad_norm": 2.3281821059613175, + "language_loss": 0.83854926, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86010152, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.12774658, + "step": 8081, + "time_per_iteration": 2.6436755657196045 + }, + { + "auxiliary_loss_clip": 0.01122531, + "auxiliary_loss_mlp": 0.01034059, + "balance_loss_clip": 1.04191744, + "balance_loss_mlp": 1.01933646, + "epoch": 0.4859161280625282, + "flos": 26955383705280.0, + "grad_norm": 1.88816597900994, + "language_loss": 0.83258867, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.85415459, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.14703369, + "step": 8082, + "time_per_iteration": 2.702105760574341 + }, + { + "auxiliary_loss_clip": 0.01122443, + "auxiliary_loss_mlp": 0.01033339, + "balance_loss_clip": 1.04535556, + "balance_loss_mlp": 1.0213232, + "epoch": 0.4859762513151962, + "flos": 21345310404000.0, + "grad_norm": 3.0489864814690892, + "language_loss": 0.87478209, + "learning_rate": 2.187638896199746e-06, + "loss": 0.89633989, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12030029, + "step": 8083, + "time_per_iteration": 2.6250720024108887 + }, + { + "auxiliary_loss_clip": 0.01122266, + "auxiliary_loss_mlp": 0.01038786, + "balance_loss_clip": 1.04298544, + "balance_loss_mlp": 1.02651381, + "epoch": 0.48603637456786414, + "flos": 21968620368480.0, + "grad_norm": 1.6694747725881054, + "language_loss": 0.80464852, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.82625896, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12261963, + "step": 8084, + "time_per_iteration": 2.6547555923461914 + }, + { + "auxiliary_loss_clip": 0.01126119, + "auxiliary_loss_mlp": 0.01035034, + "balance_loss_clip": 1.04470408, + "balance_loss_mlp": 1.02204657, + "epoch": 0.4860964978205321, + "flos": 27444965179680.0, + "grad_norm": 1.7920425598907315, + "language_loss": 0.68367302, + "learning_rate": 2.186863394279098e-06, + "loss": 0.7052846, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.12982178, + "step": 8085, + "time_per_iteration": 2.617461919784546 + }, + { + "auxiliary_loss_clip": 0.01123979, + "auxiliary_loss_mlp": 0.01037993, + "balance_loss_clip": 1.04355717, + "balance_loss_mlp": 1.02496326, + "epoch": 0.48615662107320007, + "flos": 28518966380160.0, + "grad_norm": 1.4623917558032702, + "language_loss": 0.77861047, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.80023015, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13031006, + "step": 8086, + "time_per_iteration": 2.726377010345459 + }, + { + "auxiliary_loss_clip": 0.0112185, + "auxiliary_loss_mlp": 0.01030773, + "balance_loss_clip": 1.04232776, + "balance_loss_mlp": 1.01773787, + "epoch": 0.48621674432586803, + "flos": 41999329198080.0, + "grad_norm": 2.2263111525021233, + "language_loss": 0.70032632, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.72185254, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13031006, + "step": 8087, + "time_per_iteration": 2.725541830062866 + }, + { + "auxiliary_loss_clip": 0.01127769, + "auxiliary_loss_mlp": 0.01037584, + "balance_loss_clip": 1.0424217, + "balance_loss_mlp": 1.02405977, + "epoch": 0.486276867578536, + "flos": 40399003700640.0, + "grad_norm": 2.517461050887558, + "language_loss": 0.7265054, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.74815893, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 0.85400391, + "router_z_loss_mlp": 0.13531494, + "step": 8088, + "time_per_iteration": 2.7783477306365967 + }, + { + "auxiliary_loss_clip": 0.01121134, + "auxiliary_loss_mlp": 0.01033424, + "balance_loss_clip": 1.04178822, + "balance_loss_mlp": 1.02080607, + "epoch": 0.48633699083120396, + "flos": 26198628871680.0, + "grad_norm": 1.5478351828550194, + "language_loss": 0.75168681, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77323246, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12628174, + "step": 8089, + "time_per_iteration": 2.6793100833892822 + }, + { + "auxiliary_loss_clip": 0.01124321, + "auxiliary_loss_mlp": 0.01031126, + "balance_loss_clip": 1.04251575, + "balance_loss_mlp": 1.01806128, + "epoch": 0.48639711408387193, + "flos": 24506868574080.0, + "grad_norm": 2.448516021838412, + "language_loss": 0.83605182, + "learning_rate": 2.184924515731926e-06, + "loss": 0.85760629, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13067627, + "step": 8090, + "time_per_iteration": 2.684419631958008 + }, + { + "auxiliary_loss_clip": 0.01121837, + "auxiliary_loss_mlp": 0.01032548, + "balance_loss_clip": 1.04460561, + "balance_loss_mlp": 1.01990581, + "epoch": 0.4864572373365399, + "flos": 25354288379520.0, + "grad_norm": 1.544417643340217, + "language_loss": 0.76291084, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78445464, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12640381, + "step": 8091, + "time_per_iteration": 2.658414840698242 + }, + { + "auxiliary_loss_clip": 0.01122617, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.04148722, + "balance_loss_mlp": 1.0184257, + "epoch": 0.48651736058920786, + "flos": 31757130025920.0, + "grad_norm": 1.6197958079490902, + "language_loss": 0.80315101, + "learning_rate": 2.184148915123631e-06, + "loss": 0.82469118, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.1295166, + "step": 8092, + "time_per_iteration": 2.6442959308624268 + }, + { + "auxiliary_loss_clip": 0.01125655, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.04408789, + "balance_loss_mlp": 1.01828074, + "epoch": 0.4865774838418758, + "flos": 24996531083040.0, + "grad_norm": 2.4542763452300087, + "language_loss": 0.71738899, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.73895884, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.1305542, + "step": 8093, + "time_per_iteration": 2.631657123565674 + }, + { + "auxiliary_loss_clip": 0.01121419, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.04237151, + "balance_loss_mlp": 1.0166688, + "epoch": 0.4866376070945438, + "flos": 28733392591200.0, + "grad_norm": 1.941341049154655, + "language_loss": 0.67857766, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.70008177, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12316895, + "step": 8094, + "time_per_iteration": 2.668027400970459 + }, + { + "auxiliary_loss_clip": 0.01130348, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.04768014, + "balance_loss_mlp": 1.02148151, + "epoch": 0.4866977303472118, + "flos": 20365620730560.0, + "grad_norm": 2.3662388560114396, + "language_loss": 0.66666579, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.68831885, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13476562, + "step": 8095, + "time_per_iteration": 2.6653406620025635 + }, + { + "auxiliary_loss_clip": 0.01124222, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.04303122, + "balance_loss_mlp": 1.01801145, + "epoch": 0.4867578535998798, + "flos": 21836431535040.0, + "grad_norm": 2.0013078166696316, + "language_loss": 0.78528517, + "learning_rate": 2.182597630229345e-06, + "loss": 0.80684108, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.13342285, + "step": 8096, + "time_per_iteration": 2.611318349838257 + }, + { + "auxiliary_loss_clip": 0.01120208, + "auxiliary_loss_mlp": 0.01030544, + "balance_loss_clip": 1.04171586, + "balance_loss_mlp": 1.01809311, + "epoch": 0.48681797685254774, + "flos": 27622851154560.0, + "grad_norm": 2.1608371732135936, + "language_loss": 0.67627275, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.69778025, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12463379, + "step": 8097, + "time_per_iteration": 2.647852659225464 + }, + { + "auxiliary_loss_clip": 0.01120608, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.04188657, + "balance_loss_mlp": 1.01900351, + "epoch": 0.4868781001052157, + "flos": 25485302211840.0, + "grad_norm": 1.6235767283174127, + "language_loss": 0.71519613, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.73671198, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.11975098, + "step": 8098, + "time_per_iteration": 2.593787431716919 + }, + { + "auxiliary_loss_clip": 0.01128623, + "auxiliary_loss_mlp": 0.01034047, + "balance_loss_clip": 1.04384971, + "balance_loss_mlp": 1.02040374, + "epoch": 0.48693822335788367, + "flos": 51219545006880.0, + "grad_norm": 2.268040849277499, + "language_loss": 0.66030639, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.6819331, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 0.84814453, + "router_z_loss_mlp": 0.13653564, + "step": 8099, + "time_per_iteration": 2.8601436614990234 + }, + { + "auxiliary_loss_clip": 0.01120214, + "auxiliary_loss_mlp": 0.01036021, + "balance_loss_clip": 1.04120374, + "balance_loss_mlp": 1.02407002, + "epoch": 0.48699834661055164, + "flos": 29581987397760.0, + "grad_norm": 1.9428473597664837, + "language_loss": 0.67147899, + "learning_rate": 2.181046234549138e-06, + "loss": 0.69304132, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.1194458, + "step": 8100, + "time_per_iteration": 2.7523646354675293 + }, + { + "auxiliary_loss_clip": 0.01119677, + "auxiliary_loss_mlp": 0.01031294, + "balance_loss_clip": 1.04245329, + "balance_loss_mlp": 1.01924753, + "epoch": 0.4870584698632196, + "flos": 31632558441120.0, + "grad_norm": 1.6228461986911793, + "language_loss": 0.76633549, + "learning_rate": 2.180658368429088e-06, + "loss": 0.78784519, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12036133, + "step": 8101, + "time_per_iteration": 2.832082986831665 + }, + { + "auxiliary_loss_clip": 0.01037683, + "auxiliary_loss_mlp": 0.01002408, + "balance_loss_clip": 1.01256347, + "balance_loss_mlp": 1.00120795, + "epoch": 0.48711859311588757, + "flos": 85673407786560.0, + "grad_norm": 0.6779287984320769, + "language_loss": 0.52301037, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54341125, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.25097656, + "router_z_loss_mlp": 0.01199341, + "step": 8102, + "time_per_iteration": 3.408684015274048 + }, + { + "auxiliary_loss_clip": 0.01123297, + "auxiliary_loss_mlp": 0.01030441, + "balance_loss_clip": 1.0428226, + "balance_loss_mlp": 1.01801336, + "epoch": 0.48717871636855553, + "flos": 15060841198560.0, + "grad_norm": 2.205951711339963, + "language_loss": 0.73859394, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.76013136, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.12438965, + "step": 8103, + "time_per_iteration": 2.5877461433410645 + }, + { + "auxiliary_loss_clip": 0.01122954, + "auxiliary_loss_mlp": 0.01039052, + "balance_loss_clip": 1.04235923, + "balance_loss_mlp": 1.02582002, + "epoch": 0.4872388396212235, + "flos": 28647062968320.0, + "grad_norm": 1.681321268698338, + "language_loss": 0.63002038, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.65164042, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13232422, + "step": 8104, + "time_per_iteration": 4.038263559341431 + }, + { + "auxiliary_loss_clip": 0.01122622, + "auxiliary_loss_mlp": 0.01030087, + "balance_loss_clip": 1.0432688, + "balance_loss_mlp": 1.01740289, + "epoch": 0.48729896287389146, + "flos": 38349121451040.0, + "grad_norm": 2.0204058358617223, + "language_loss": 0.69148791, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71301496, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12695312, + "step": 8105, + "time_per_iteration": 4.248247146606445 + }, + { + "auxiliary_loss_clip": 0.01116959, + "auxiliary_loss_mlp": 0.01029226, + "balance_loss_clip": 1.03977251, + "balance_loss_mlp": 1.01734066, + "epoch": 0.4873590861265594, + "flos": 23254819329600.0, + "grad_norm": 1.8332642598189823, + "language_loss": 0.73833275, + "learning_rate": 2.178718935364259e-06, + "loss": 0.75979459, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11889648, + "step": 8106, + "time_per_iteration": 2.6138315200805664 + }, + { + "auxiliary_loss_clip": 0.01127109, + "auxiliary_loss_mlp": 0.0103418, + "balance_loss_clip": 1.04467964, + "balance_loss_mlp": 1.02113891, + "epoch": 0.4874192093792274, + "flos": 29710529676000.0, + "grad_norm": 1.6743492602129393, + "language_loss": 0.76588917, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.78750205, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13043213, + "step": 8107, + "time_per_iteration": 2.8073906898498535 + }, + { + "auxiliary_loss_clip": 0.01120758, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.04389536, + "balance_loss_mlp": 1.0179795, + "epoch": 0.4874793326318954, + "flos": 28202206220640.0, + "grad_norm": 1.762443359401592, + "language_loss": 0.75472808, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.77623165, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11627197, + "step": 8108, + "time_per_iteration": 2.6376359462738037 + }, + { + "auxiliary_loss_clip": 0.01118766, + "auxiliary_loss_mlp": 0.01030813, + "balance_loss_clip": 1.04163122, + "balance_loss_mlp": 1.02026916, + "epoch": 0.4875394558845634, + "flos": 23215604952960.0, + "grad_norm": 2.044401104053908, + "language_loss": 0.73321688, + "learning_rate": 2.177555194083212e-06, + "loss": 0.75471264, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.10546875, + "step": 8109, + "time_per_iteration": 2.641740560531616 + }, + { + "auxiliary_loss_clip": 0.01119649, + "auxiliary_loss_mlp": 0.0103053, + "balance_loss_clip": 1.04187012, + "balance_loss_mlp": 1.01839423, + "epoch": 0.48759957913723134, + "flos": 26153417937600.0, + "grad_norm": 2.3983823341078696, + "language_loss": 0.78621018, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80771196, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12133789, + "step": 8110, + "time_per_iteration": 2.6959798336029053 + }, + { + "auxiliary_loss_clip": 0.01121459, + "auxiliary_loss_mlp": 0.01037385, + "balance_loss_clip": 1.04235268, + "balance_loss_mlp": 1.0252018, + "epoch": 0.4876597023898993, + "flos": 21656762799840.0, + "grad_norm": 2.08316141706007, + "language_loss": 0.72121727, + "learning_rate": 2.176779332873444e-06, + "loss": 0.74280572, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12182617, + "step": 8111, + "time_per_iteration": 2.6694250106811523 + }, + { + "auxiliary_loss_clip": 0.01120315, + "auxiliary_loss_mlp": 0.010382, + "balance_loss_clip": 1.04260886, + "balance_loss_mlp": 1.02604055, + "epoch": 0.4877198256425673, + "flos": 20767089821760.0, + "grad_norm": 1.697181030848345, + "language_loss": 0.75715894, + "learning_rate": 2.17639139220597e-06, + "loss": 0.7787441, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12158203, + "step": 8112, + "time_per_iteration": 2.635575294494629 + }, + { + "auxiliary_loss_clip": 0.01125577, + "auxiliary_loss_mlp": 0.01037362, + "balance_loss_clip": 1.04240656, + "balance_loss_mlp": 1.02455342, + "epoch": 0.48777994889523524, + "flos": 27313100484480.0, + "grad_norm": 1.6838509108326536, + "language_loss": 0.74906838, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77069777, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.12799072, + "step": 8113, + "time_per_iteration": 4.029780626296997 + }, + { + "auxiliary_loss_clip": 0.01036343, + "auxiliary_loss_mlp": 0.01009646, + "balance_loss_clip": 1.01133895, + "balance_loss_mlp": 1.0084275, + "epoch": 0.4878400721479032, + "flos": 74726836273440.0, + "grad_norm": 0.767413328168948, + "language_loss": 0.48770547, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50816536, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.01219177, + "step": 8114, + "time_per_iteration": 3.141864776611328 + }, + { + "auxiliary_loss_clip": 0.0112359, + "auxiliary_loss_mlp": 0.01038288, + "balance_loss_clip": 1.04231155, + "balance_loss_mlp": 1.02535987, + "epoch": 0.48790019540057117, + "flos": 29939785211520.0, + "grad_norm": 1.36753852743703, + "language_loss": 0.76631594, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.78793472, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.12939453, + "step": 8115, + "time_per_iteration": 2.686762809753418 + }, + { + "auxiliary_loss_clip": 0.01127412, + "auxiliary_loss_mlp": 0.01035253, + "balance_loss_clip": 1.04448617, + "balance_loss_mlp": 1.02236032, + "epoch": 0.48796031865323913, + "flos": 26642026997280.0, + "grad_norm": 2.4403870378919383, + "language_loss": 0.72581482, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.74744147, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.12896729, + "step": 8116, + "time_per_iteration": 4.116488456726074 + }, + { + "auxiliary_loss_clip": 0.01119698, + "auxiliary_loss_mlp": 0.0103597, + "balance_loss_clip": 1.04162574, + "balance_loss_mlp": 1.02363133, + "epoch": 0.4880204419059071, + "flos": 22681987545600.0, + "grad_norm": 1.767668671655041, + "language_loss": 0.62999725, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.65155393, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12335205, + "step": 8117, + "time_per_iteration": 2.656083345413208 + }, + { + "auxiliary_loss_clip": 0.01116858, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_clip": 1.03828597, + "balance_loss_mlp": 1.02040315, + "epoch": 0.48808056515857506, + "flos": 23393774548800.0, + "grad_norm": 1.9478231860625477, + "language_loss": 0.79522085, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.8167206, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12731934, + "step": 8118, + "time_per_iteration": 2.6138720512390137 + }, + { + "auxiliary_loss_clip": 0.01120795, + "auxiliary_loss_mlp": 0.01035143, + "balance_loss_clip": 1.03991139, + "balance_loss_mlp": 1.02237582, + "epoch": 0.48814068841124303, + "flos": 24551026058880.0, + "grad_norm": 3.638288553401073, + "language_loss": 0.63240063, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65395999, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.12774658, + "step": 8119, + "time_per_iteration": 2.6515374183654785 + }, + { + "auxiliary_loss_clip": 0.01121868, + "auxiliary_loss_mlp": 0.01028773, + "balance_loss_clip": 1.0417707, + "balance_loss_mlp": 1.01712668, + "epoch": 0.488200811663911, + "flos": 28022902140960.0, + "grad_norm": 1.953872198876804, + "language_loss": 0.72078514, + "learning_rate": 2.173287627305878e-06, + "loss": 0.74229151, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.11651611, + "step": 8120, + "time_per_iteration": 2.644451856613159 + }, + { + "auxiliary_loss_clip": 0.01121752, + "auxiliary_loss_mlp": 0.01029769, + "balance_loss_clip": 1.04154921, + "balance_loss_mlp": 1.01717472, + "epoch": 0.48826093491657896, + "flos": 41379017512320.0, + "grad_norm": 2.240789590659503, + "language_loss": 0.63288122, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.65439641, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.12591553, + "step": 8121, + "time_per_iteration": 2.79545521736145 + }, + { + "auxiliary_loss_clip": 0.01125207, + "auxiliary_loss_mlp": 0.01034166, + "balance_loss_clip": 1.04295111, + "balance_loss_mlp": 1.02083838, + "epoch": 0.488321058169247, + "flos": 28151120280960.0, + "grad_norm": 2.453101893028602, + "language_loss": 0.82396305, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.8455568, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.13330078, + "step": 8122, + "time_per_iteration": 2.6900362968444824 + }, + { + "auxiliary_loss_clip": 0.01126536, + "auxiliary_loss_mlp": 0.0103381, + "balance_loss_clip": 1.04298759, + "balance_loss_mlp": 1.0202384, + "epoch": 0.48838118142191494, + "flos": 23571417420000.0, + "grad_norm": 2.7046162523631474, + "language_loss": 0.85540915, + "learning_rate": 2.172123606640866e-06, + "loss": 0.87701267, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 0.83398438, + "router_z_loss_mlp": 0.13586426, + "step": 8123, + "time_per_iteration": 2.6256802082061768 + }, + { + "auxiliary_loss_clip": 0.01121365, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.0399462, + "balance_loss_mlp": 1.01740551, + "epoch": 0.4884413046745829, + "flos": 31006533818880.0, + "grad_norm": 1.4872198013372273, + "language_loss": 0.85623515, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.87774503, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.12219238, + "step": 8124, + "time_per_iteration": 2.701307535171509 + }, + { + "auxiliary_loss_clip": 0.0112201, + "auxiliary_loss_mlp": 0.01034028, + "balance_loss_clip": 1.04074168, + "balance_loss_mlp": 1.02123046, + "epoch": 0.4885014279272509, + "flos": 25614371214720.0, + "grad_norm": 1.9783635175408947, + "language_loss": 0.79673737, + "learning_rate": 2.171347560204948e-06, + "loss": 0.81829774, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12792969, + "step": 8125, + "time_per_iteration": 2.622269630432129 + }, + { + "auxiliary_loss_clip": 0.01120869, + "auxiliary_loss_mlp": 0.01031536, + "balance_loss_clip": 1.04075146, + "balance_loss_mlp": 1.01948357, + "epoch": 0.48856155117991884, + "flos": 16803646918560.0, + "grad_norm": 3.148553994287554, + "language_loss": 0.72639704, + "learning_rate": 2.170959527233356e-06, + "loss": 0.74792105, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12054443, + "step": 8126, + "time_per_iteration": 2.633718490600586 + }, + { + "auxiliary_loss_clip": 0.01121899, + "auxiliary_loss_mlp": 0.01033864, + "balance_loss_clip": 1.04018819, + "balance_loss_mlp": 1.02139473, + "epoch": 0.4886216744325868, + "flos": 39154409635680.0, + "grad_norm": 1.7384567747846944, + "language_loss": 0.68590653, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.7074641, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.12463379, + "step": 8127, + "time_per_iteration": 2.705779790878296 + }, + { + "auxiliary_loss_clip": 0.01122038, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.03918958, + "balance_loss_mlp": 1.0206697, + "epoch": 0.48868179768525477, + "flos": 23928121267200.0, + "grad_norm": 1.8220255080340675, + "language_loss": 0.76064259, + "learning_rate": 2.170183441856481e-06, + "loss": 0.78219646, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.12689209, + "step": 8128, + "time_per_iteration": 2.731755495071411 + }, + { + "auxiliary_loss_clip": 0.01123394, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.04302108, + "balance_loss_mlp": 1.0208559, + "epoch": 0.48874192093792274, + "flos": 25974478513440.0, + "grad_norm": 1.9754678815061177, + "language_loss": 0.76171088, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78327608, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12268066, + "step": 8129, + "time_per_iteration": 2.6131954193115234 + }, + { + "auxiliary_loss_clip": 0.0112079, + "auxiliary_loss_mlp": 0.0102987, + "balance_loss_clip": 1.03990316, + "balance_loss_mlp": 1.01668513, + "epoch": 0.4888020441905907, + "flos": 17293957704000.0, + "grad_norm": 2.288777462219304, + "language_loss": 0.64543223, + "learning_rate": 2.169407330666114e-06, + "loss": 0.66693884, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13165283, + "step": 8130, + "time_per_iteration": 2.6448593139648438 + }, + { + "auxiliary_loss_clip": 0.01117685, + "auxiliary_loss_mlp": 0.01028245, + "balance_loss_clip": 1.03864813, + "balance_loss_mlp": 1.01592481, + "epoch": 0.48886216744325867, + "flos": 29404506595680.0, + "grad_norm": 1.7350041166722832, + "language_loss": 0.72250092, + "learning_rate": 2.169019265427658e-06, + "loss": 0.74396014, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12316895, + "step": 8131, + "time_per_iteration": 2.6474192142486572 + }, + { + "auxiliary_loss_clip": 0.01125543, + "auxiliary_loss_mlp": 0.01037755, + "balance_loss_clip": 1.04359937, + "balance_loss_mlp": 1.02466631, + "epoch": 0.48892229069592663, + "flos": 46895873253120.0, + "grad_norm": 1.6235380866301263, + "language_loss": 0.69806552, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.71969855, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13092041, + "step": 8132, + "time_per_iteration": 2.797010660171509 + }, + { + "auxiliary_loss_clip": 0.01120762, + "auxiliary_loss_mlp": 0.01029198, + "balance_loss_clip": 1.04186785, + "balance_loss_mlp": 1.01662707, + "epoch": 0.4889824139485946, + "flos": 29093783510880.0, + "grad_norm": 3.345416322003532, + "language_loss": 0.7050879, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.72658741, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12561035, + "step": 8133, + "time_per_iteration": 2.7124571800231934 + }, + { + "auxiliary_loss_clip": 0.01120959, + "auxiliary_loss_mlp": 0.01032033, + "balance_loss_clip": 1.04127765, + "balance_loss_mlp": 1.01948047, + "epoch": 0.48904253720126256, + "flos": 29804719651200.0, + "grad_norm": 1.667088769678125, + "language_loss": 0.70643604, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.72796595, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12548828, + "step": 8134, + "time_per_iteration": 2.766072988510132 + }, + { + "auxiliary_loss_clip": 0.01127525, + "auxiliary_loss_mlp": 0.01034423, + "balance_loss_clip": 1.04404354, + "balance_loss_mlp": 1.02082741, + "epoch": 0.4891026604539306, + "flos": 29493348289920.0, + "grad_norm": 1.9769653666328284, + "language_loss": 0.80415702, + "learning_rate": 2.167466940528718e-06, + "loss": 0.82577652, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13592529, + "step": 8135, + "time_per_iteration": 2.6973912715911865 + }, + { + "auxiliary_loss_clip": 0.01119404, + "auxiliary_loss_mlp": 0.01036212, + "balance_loss_clip": 1.0396502, + "balance_loss_mlp": 1.02410018, + "epoch": 0.48916278370659855, + "flos": 26198912492640.0, + "grad_norm": 1.8330406458353434, + "language_loss": 0.74100941, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.76256561, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12109375, + "step": 8136, + "time_per_iteration": 2.717364549636841 + }, + { + "auxiliary_loss_clip": 0.01121151, + "auxiliary_loss_mlp": 0.01034587, + "balance_loss_clip": 1.0422368, + "balance_loss_mlp": 1.02265441, + "epoch": 0.4892229069592665, + "flos": 27222516547200.0, + "grad_norm": 1.5421707135394895, + "language_loss": 0.73578447, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75734186, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.11932373, + "step": 8137, + "time_per_iteration": 2.646451473236084 + }, + { + "auxiliary_loss_clip": 0.01122693, + "auxiliary_loss_mlp": 0.01029072, + "balance_loss_clip": 1.04101539, + "balance_loss_mlp": 1.01594687, + "epoch": 0.4892830302119345, + "flos": 15601954302720.0, + "grad_norm": 2.288610610672578, + "language_loss": 0.75498605, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.77650368, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13128662, + "step": 8138, + "time_per_iteration": 2.588963508605957 + }, + { + "auxiliary_loss_clip": 0.01123899, + "auxiliary_loss_mlp": 0.0103314, + "balance_loss_clip": 1.04402637, + "balance_loss_mlp": 1.02064061, + "epoch": 0.48934315346460244, + "flos": 25397149311360.0, + "grad_norm": 1.693991597693805, + "language_loss": 0.73955297, + "learning_rate": 2.165914514023972e-06, + "loss": 0.76112342, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.12512207, + "step": 8139, + "time_per_iteration": 2.650881767272949 + }, + { + "auxiliary_loss_clip": 0.01123744, + "auxiliary_loss_mlp": 0.01033539, + "balance_loss_clip": 1.0431093, + "balance_loss_mlp": 1.02137959, + "epoch": 0.4894032767172704, + "flos": 24105683103840.0, + "grad_norm": 1.8052648896373786, + "language_loss": 0.62287438, + "learning_rate": 2.165526391632255e-06, + "loss": 0.64444721, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12164307, + "step": 8140, + "time_per_iteration": 2.6241230964660645 + }, + { + "auxiliary_loss_clip": 0.0112549, + "auxiliary_loss_mlp": 0.01039595, + "balance_loss_clip": 1.04287505, + "balance_loss_mlp": 1.02618408, + "epoch": 0.4894633999699384, + "flos": 21742808801760.0, + "grad_norm": 1.6894762609565999, + "language_loss": 0.82327074, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.84492159, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13397217, + "step": 8141, + "time_per_iteration": 2.616560459136963 + }, + { + "auxiliary_loss_clip": 0.01125013, + "auxiliary_loss_mlp": 0.01034833, + "balance_loss_clip": 1.04307914, + "balance_loss_mlp": 1.02179146, + "epoch": 0.48952352322260634, + "flos": 31143584725920.0, + "grad_norm": 1.6507123434838333, + "language_loss": 0.72166228, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.7432608, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.13049316, + "step": 8142, + "time_per_iteration": 2.713414192199707 + }, + { + "auxiliary_loss_clip": 0.01120667, + "auxiliary_loss_mlp": 0.0103586, + "balance_loss_clip": 1.04099751, + "balance_loss_mlp": 1.02345657, + "epoch": 0.4895836464752743, + "flos": 35454574571040.0, + "grad_norm": 2.65570384741254, + "language_loss": 0.66807055, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.68963581, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.12408447, + "step": 8143, + "time_per_iteration": 2.7201454639434814 + }, + { + "auxiliary_loss_clip": 0.0111921, + "auxiliary_loss_mlp": 0.01029653, + "balance_loss_clip": 1.04109013, + "balance_loss_mlp": 1.01729119, + "epoch": 0.48964376972794227, + "flos": 40935052144800.0, + "grad_norm": 1.5842112797653347, + "language_loss": 0.75183809, + "learning_rate": 2.163973839444793e-06, + "loss": 0.77332669, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12359619, + "step": 8144, + "time_per_iteration": 5.537027835845947 + }, + { + "auxiliary_loss_clip": 0.01121952, + "auxiliary_loss_mlp": 0.01029246, + "balance_loss_clip": 1.0413866, + "balance_loss_mlp": 1.01692033, + "epoch": 0.48970389298061023, + "flos": 26910496909440.0, + "grad_norm": 3.17232268779932, + "language_loss": 0.75881171, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.78032374, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12335205, + "step": 8145, + "time_per_iteration": 2.8744089603424072 + }, + { + "auxiliary_loss_clip": 0.01125909, + "auxiliary_loss_mlp": 0.01036462, + "balance_loss_clip": 1.04314244, + "balance_loss_mlp": 1.0232234, + "epoch": 0.4897640162332782, + "flos": 24506828056800.0, + "grad_norm": 3.0111553420504094, + "language_loss": 0.79914486, + "learning_rate": 2.163197525984761e-06, + "loss": 0.82076859, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 0.82666016, + "router_z_loss_mlp": 0.13238525, + "step": 8146, + "time_per_iteration": 2.6966938972473145 + }, + { + "auxiliary_loss_clip": 0.01116852, + "auxiliary_loss_mlp": 0.01031007, + "balance_loss_clip": 1.03884828, + "balance_loss_mlp": 1.01825762, + "epoch": 0.48982413948594616, + "flos": 29048572576800.0, + "grad_norm": 1.7553238048623958, + "language_loss": 0.74373579, + "learning_rate": 2.162809359964687e-06, + "loss": 0.76521444, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12750244, + "step": 8147, + "time_per_iteration": 2.724947452545166 + }, + { + "auxiliary_loss_clip": 0.01123754, + "auxiliary_loss_mlp": 0.01030524, + "balance_loss_clip": 1.04323673, + "balance_loss_mlp": 1.01805472, + "epoch": 0.4898842627386142, + "flos": 21523682586240.0, + "grad_norm": 2.178693159901525, + "language_loss": 0.82670295, + "learning_rate": 2.162421187770864e-06, + "loss": 0.84824574, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12469482, + "step": 8148, + "time_per_iteration": 2.644519090652466 + }, + { + "auxiliary_loss_clip": 0.01118157, + "auxiliary_loss_mlp": 0.0103059, + "balance_loss_clip": 1.04140425, + "balance_loss_mlp": 1.01933098, + "epoch": 0.48994438599128215, + "flos": 20276900588160.0, + "grad_norm": 1.879274029067469, + "language_loss": 0.73806024, + "learning_rate": 2.162033009418015e-06, + "loss": 0.75954765, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11260986, + "step": 8149, + "time_per_iteration": 2.6948869228363037 + }, + { + "auxiliary_loss_clip": 0.0112727, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.04362941, + "balance_loss_mlp": 1.01828182, + "epoch": 0.4900045092439501, + "flos": 32475926518560.0, + "grad_norm": 1.7277743021780023, + "language_loss": 0.76131546, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.782911, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 0.83740234, + "router_z_loss_mlp": 0.14019775, + "step": 8150, + "time_per_iteration": 2.6710495948791504 + }, + { + "auxiliary_loss_clip": 0.01126863, + "auxiliary_loss_mlp": 0.01034381, + "balance_loss_clip": 1.0444119, + "balance_loss_mlp": 1.02146518, + "epoch": 0.4900646324966181, + "flos": 24284500976160.0, + "grad_norm": 2.131352031627735, + "language_loss": 0.72801954, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.749632, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.12902832, + "step": 8151, + "time_per_iteration": 2.669227123260498 + }, + { + "auxiliary_loss_clip": 0.01040823, + "auxiliary_loss_mlp": 0.01000232, + "balance_loss_clip": 1.01547003, + "balance_loss_mlp": 0.99898446, + "epoch": 0.49012475574928605, + "flos": 72223467095520.0, + "grad_norm": 0.8265214764456227, + "language_loss": 0.54367363, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.56408417, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 0.25366211, + "router_z_loss_mlp": 0.01246643, + "step": 8152, + "time_per_iteration": 4.673141956329346 + }, + { + "auxiliary_loss_clip": 0.01123506, + "auxiliary_loss_mlp": 0.0103217, + "balance_loss_clip": 1.04096496, + "balance_loss_mlp": 1.0196116, + "epoch": 0.490184879001954, + "flos": 55227874705920.0, + "grad_norm": 2.589540775384925, + "language_loss": 0.6099546, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.63151145, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.12554932, + "step": 8153, + "time_per_iteration": 2.8319742679595947 + }, + { + "auxiliary_loss_clip": 0.01121998, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.04154563, + "balance_loss_mlp": 1.02342749, + "epoch": 0.490245002254622, + "flos": 34168740265440.0, + "grad_norm": 2.265177612133946, + "language_loss": 0.77192974, + "learning_rate": 2.160092025783549e-06, + "loss": 0.79351354, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12957764, + "step": 8154, + "time_per_iteration": 2.833822250366211 + }, + { + "auxiliary_loss_clip": 0.01040419, + "auxiliary_loss_mlp": 0.01001694, + "balance_loss_clip": 1.01499605, + "balance_loss_mlp": 1.00051939, + "epoch": 0.49030512550728994, + "flos": 71931098338560.0, + "grad_norm": 0.9731956059999323, + "language_loss": 0.67016411, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.6905852, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.25415039, + "router_z_loss_mlp": 0.01173401, + "step": 8155, + "time_per_iteration": 3.2995426654815674 + }, + { + "auxiliary_loss_clip": 0.01124537, + "auxiliary_loss_mlp": 0.01029192, + "balance_loss_clip": 1.04378021, + "balance_loss_mlp": 1.01753283, + "epoch": 0.4903652487599579, + "flos": 24150002657760.0, + "grad_norm": 2.0042747814711945, + "language_loss": 0.76718873, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.78872603, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.11657715, + "step": 8156, + "time_per_iteration": 4.0412046909332275 + }, + { + "auxiliary_loss_clip": 0.0112126, + "auxiliary_loss_mlp": 0.01031574, + "balance_loss_clip": 1.04113817, + "balance_loss_mlp": 1.01932538, + "epoch": 0.49042537201262587, + "flos": 26554238752320.0, + "grad_norm": 2.171715704680093, + "language_loss": 0.83982253, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.86135089, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12243652, + "step": 8157, + "time_per_iteration": 2.66715407371521 + }, + { + "auxiliary_loss_clip": 0.01123568, + "auxiliary_loss_mlp": 0.010295, + "balance_loss_clip": 1.04225445, + "balance_loss_mlp": 1.0165236, + "epoch": 0.49048549526529384, + "flos": 23126398603200.0, + "grad_norm": 2.284022700099282, + "language_loss": 0.79978442, + "learning_rate": 2.158539129514956e-06, + "loss": 0.82131505, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.12982178, + "step": 8158, + "time_per_iteration": 2.600224018096924 + }, + { + "auxiliary_loss_clip": 0.01125858, + "auxiliary_loss_mlp": 0.01028366, + "balance_loss_clip": 1.04344988, + "balance_loss_mlp": 1.01561069, + "epoch": 0.4905456185179618, + "flos": 32831738985600.0, + "grad_norm": 1.7556008810806352, + "language_loss": 0.69202411, + "learning_rate": 2.158150890381454e-06, + "loss": 0.7135663, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.12750244, + "step": 8159, + "time_per_iteration": 2.707991600036621 + }, + { + "auxiliary_loss_clip": 0.01122251, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.04302037, + "balance_loss_mlp": 1.02104068, + "epoch": 0.49060574177062977, + "flos": 24907162664160.0, + "grad_norm": 1.888248056989716, + "language_loss": 0.73271871, + "learning_rate": 2.157762645250854e-06, + "loss": 0.75427687, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12512207, + "step": 8160, + "time_per_iteration": 2.703439474105835 + }, + { + "auxiliary_loss_clip": 0.01123608, + "auxiliary_loss_mlp": 0.01037889, + "balance_loss_clip": 1.03995097, + "balance_loss_mlp": 1.02421534, + "epoch": 0.4906658650232978, + "flos": 21345594024960.0, + "grad_norm": 2.0329882088410454, + "language_loss": 0.72104543, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.74266034, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.13696289, + "step": 8161, + "time_per_iteration": 2.6775074005126953 + }, + { + "auxiliary_loss_clip": 0.01124673, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.04530346, + "balance_loss_mlp": 1.0234338, + "epoch": 0.49072598827596575, + "flos": 32475480828480.0, + "grad_norm": 1.731562115196688, + "language_loss": 0.6835531, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.70515978, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12567139, + "step": 8162, + "time_per_iteration": 2.651230812072754 + }, + { + "auxiliary_loss_clip": 0.01127781, + "auxiliary_loss_mlp": 0.01033917, + "balance_loss_clip": 1.043486, + "balance_loss_mlp": 1.0198797, + "epoch": 0.4907861115286337, + "flos": 24907446285120.0, + "grad_norm": 2.072407676286575, + "language_loss": 0.63450444, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.65612137, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 0.84277344, + "router_z_loss_mlp": 0.14031982, + "step": 8163, + "time_per_iteration": 2.6436915397644043 + }, + { + "auxiliary_loss_clip": 0.01120335, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.04242384, + "balance_loss_mlp": 1.01801407, + "epoch": 0.4908462347813017, + "flos": 17160715421280.0, + "grad_norm": 2.1485117951982624, + "language_loss": 0.76926053, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.79076856, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12445068, + "step": 8164, + "time_per_iteration": 2.643249273300171 + }, + { + "auxiliary_loss_clip": 0.01124031, + "auxiliary_loss_mlp": 0.01028343, + "balance_loss_clip": 1.04130602, + "balance_loss_mlp": 1.0152663, + "epoch": 0.49090635803396965, + "flos": 22860359727840.0, + "grad_norm": 1.910720978919781, + "language_loss": 0.7629385, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.78446221, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 0.82763672, + "router_z_loss_mlp": 0.1307373, + "step": 8165, + "time_per_iteration": 2.6508326530456543 + }, + { + "auxiliary_loss_clip": 0.0112132, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.0423708, + "balance_loss_mlp": 1.02192783, + "epoch": 0.4909664812866376, + "flos": 25085494329120.0, + "grad_norm": 1.9527366683039222, + "language_loss": 0.77341795, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.79497713, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12683105, + "step": 8166, + "time_per_iteration": 2.665776252746582 + }, + { + "auxiliary_loss_clip": 0.0103851, + "auxiliary_loss_mlp": 0.01001559, + "balance_loss_clip": 1.01335096, + "balance_loss_mlp": 1.0004493, + "epoch": 0.4910266045393056, + "flos": 66723669129600.0, + "grad_norm": 0.8162384936055755, + "language_loss": 0.54223603, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56263673, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 0.25146484, + "router_z_loss_mlp": 0.0111084, + "step": 8167, + "time_per_iteration": 3.315372943878174 + }, + { + "auxiliary_loss_clip": 0.01120448, + "auxiliary_loss_mlp": 0.01035342, + "balance_loss_clip": 1.04244685, + "balance_loss_mlp": 1.02283692, + "epoch": 0.49108672779197354, + "flos": 19824224005440.0, + "grad_norm": 2.493803730086036, + "language_loss": 0.85654008, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.87809801, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12512207, + "step": 8168, + "time_per_iteration": 2.6994340419769287 + }, + { + "auxiliary_loss_clip": 0.0112042, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.04310215, + "balance_loss_mlp": 1.01994371, + "epoch": 0.4911468510446415, + "flos": 24191242898400.0, + "grad_norm": 1.6742949627974364, + "language_loss": 0.73150343, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75303173, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12469482, + "step": 8169, + "time_per_iteration": 2.6315860748291016 + }, + { + "auxiliary_loss_clip": 0.01121505, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.04192805, + "balance_loss_mlp": 1.01645756, + "epoch": 0.4912069742973095, + "flos": 25884016128000.0, + "grad_norm": 1.7825092226991233, + "language_loss": 0.78077805, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.80227941, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.12164307, + "step": 8170, + "time_per_iteration": 2.7064309120178223 + }, + { + "auxiliary_loss_clip": 0.01122573, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.04253125, + "balance_loss_mlp": 1.0202086, + "epoch": 0.49126709754997744, + "flos": 23839117503840.0, + "grad_norm": 2.2349560951012077, + "language_loss": 0.75254405, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.77408814, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.11633301, + "step": 8171, + "time_per_iteration": 2.6266684532165527 + }, + { + "auxiliary_loss_clip": 0.01125429, + "auxiliary_loss_mlp": 0.01035169, + "balance_loss_clip": 1.04264748, + "balance_loss_mlp": 1.02284896, + "epoch": 0.4913272208026454, + "flos": 14934243749760.0, + "grad_norm": 3.103610438834348, + "language_loss": 0.8168022, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.83840817, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.12304688, + "step": 8172, + "time_per_iteration": 2.707773447036743 + }, + { + "auxiliary_loss_clip": 0.01039185, + "auxiliary_loss_mlp": 0.01002227, + "balance_loss_clip": 1.01409364, + "balance_loss_mlp": 1.00113606, + "epoch": 0.49138734405531337, + "flos": 79882166610720.0, + "grad_norm": 0.6955470776489, + "language_loss": 0.53288257, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55329669, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01091766, + "step": 8173, + "time_per_iteration": 3.2684831619262695 + }, + { + "auxiliary_loss_clip": 0.01123974, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.04245341, + "balance_loss_mlp": 1.01896954, + "epoch": 0.4914474673079814, + "flos": 22499482600800.0, + "grad_norm": 2.1394016241866765, + "language_loss": 0.63214004, + "learning_rate": 2.152326591972107e-06, + "loss": 0.65369928, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.12963867, + "step": 8174, + "time_per_iteration": 2.6623220443725586 + }, + { + "auxiliary_loss_clip": 0.0112368, + "auxiliary_loss_mlp": 0.01034753, + "balance_loss_clip": 1.04398739, + "balance_loss_mlp": 1.02169967, + "epoch": 0.49150759056064935, + "flos": 26460616019040.0, + "grad_norm": 1.8555199516179939, + "language_loss": 0.69371778, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.71530211, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13049316, + "step": 8175, + "time_per_iteration": 2.6865382194519043 + }, + { + "auxiliary_loss_clip": 0.01122001, + "auxiliary_loss_mlp": 0.01028235, + "balance_loss_clip": 1.0428797, + "balance_loss_mlp": 1.01574779, + "epoch": 0.4915677138133173, + "flos": 27311236689600.0, + "grad_norm": 2.5088615281606748, + "language_loss": 0.74474335, + "learning_rate": 2.151549919570068e-06, + "loss": 0.76624572, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12506104, + "step": 8176, + "time_per_iteration": 2.6788392066955566 + }, + { + "auxiliary_loss_clip": 0.01123234, + "auxiliary_loss_mlp": 0.01039097, + "balance_loss_clip": 1.04284084, + "balance_loss_mlp": 1.02656829, + "epoch": 0.4916278370659853, + "flos": 22455325116000.0, + "grad_norm": 2.8511211815411355, + "language_loss": 0.70107138, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.72269475, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12530518, + "step": 8177, + "time_per_iteration": 2.654906988143921 + }, + { + "auxiliary_loss_clip": 0.01040121, + "auxiliary_loss_mlp": 0.01001976, + "balance_loss_clip": 1.01507235, + "balance_loss_mlp": 1.00084925, + "epoch": 0.49168796031865325, + "flos": 81276128835840.0, + "grad_norm": 0.6838142553073067, + "language_loss": 0.46232581, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48274678, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.25024414, + "router_z_loss_mlp": 0.01127625, + "step": 8178, + "time_per_iteration": 3.2373886108398438 + }, + { + "auxiliary_loss_clip": 0.01128415, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.04520273, + "balance_loss_mlp": 1.01992095, + "epoch": 0.4917480835713212, + "flos": 25574670630720.0, + "grad_norm": 2.297977854236614, + "language_loss": 0.66182351, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.68343973, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13275146, + "step": 8179, + "time_per_iteration": 2.7122979164123535 + }, + { + "auxiliary_loss_clip": 0.01128581, + "auxiliary_loss_mlp": 0.01034612, + "balance_loss_clip": 1.044011, + "balance_loss_mlp": 1.02135634, + "epoch": 0.4918082068239892, + "flos": 19246287044160.0, + "grad_norm": 2.028459168672352, + "language_loss": 0.69665247, + "learning_rate": 2.149996505922343e-06, + "loss": 0.71828437, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 0.84570312, + "router_z_loss_mlp": 0.13269043, + "step": 8180, + "time_per_iteration": 2.620330572128296 + }, + { + "auxiliary_loss_clip": 0.0112117, + "auxiliary_loss_mlp": 0.01030502, + "balance_loss_clip": 1.0422349, + "balance_loss_mlp": 1.01719236, + "epoch": 0.49186833007665715, + "flos": 30023440693920.0, + "grad_norm": 1.9074095630906998, + "language_loss": 0.84142417, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.86294085, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13305664, + "step": 8181, + "time_per_iteration": 2.7203376293182373 + }, + { + "auxiliary_loss_clip": 0.01121043, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.04410982, + "balance_loss_mlp": 1.0223608, + "epoch": 0.4919284533293251, + "flos": 26955059567040.0, + "grad_norm": 1.9731307850247048, + "language_loss": 0.72740078, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.74894977, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.11505127, + "step": 8182, + "time_per_iteration": 2.677769899368286 + }, + { + "auxiliary_loss_clip": 0.01124091, + "auxiliary_loss_mlp": 0.010344, + "balance_loss_clip": 1.04357362, + "balance_loss_mlp": 1.02126908, + "epoch": 0.4919885765819931, + "flos": 28513901720160.0, + "grad_norm": 1.8926635835578254, + "language_loss": 0.72406429, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.74564916, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13128662, + "step": 8183, + "time_per_iteration": 4.129471302032471 + }, + { + "auxiliary_loss_clip": 0.01126739, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.04328597, + "balance_loss_mlp": 1.01793909, + "epoch": 0.49204869983466104, + "flos": 26064211587840.0, + "grad_norm": 2.1226970225133663, + "language_loss": 0.77219683, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.79377413, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.1307373, + "step": 8184, + "time_per_iteration": 4.111720323562622 + }, + { + "auxiliary_loss_clip": 0.01125226, + "auxiliary_loss_mlp": 0.01033996, + "balance_loss_clip": 1.04495275, + "balance_loss_mlp": 1.02161014, + "epoch": 0.492108823087329, + "flos": 25798334781600.0, + "grad_norm": 2.135932353390687, + "language_loss": 0.70367742, + "learning_rate": 2.148054610995789e-06, + "loss": 0.72526962, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.12384033, + "step": 8185, + "time_per_iteration": 2.6935007572174072 + }, + { + "auxiliary_loss_clip": 0.01126582, + "auxiliary_loss_mlp": 0.01038826, + "balance_loss_clip": 1.0447948, + "balance_loss_mlp": 1.02513433, + "epoch": 0.49216894633999697, + "flos": 30647885142240.0, + "grad_norm": 1.8388281684100654, + "language_loss": 0.75217855, + "learning_rate": 2.147666215108831e-06, + "loss": 0.77383268, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.13690186, + "step": 8186, + "time_per_iteration": 2.7260255813598633 + }, + { + "auxiliary_loss_clip": 0.01124556, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.04388225, + "balance_loss_mlp": 1.01792848, + "epoch": 0.49222906959266494, + "flos": 27623013223680.0, + "grad_norm": 2.030529349406, + "language_loss": 0.67754292, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.69909859, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13098145, + "step": 8187, + "time_per_iteration": 2.702889919281006 + }, + { + "auxiliary_loss_clip": 0.01122039, + "auxiliary_loss_mlp": 0.01031037, + "balance_loss_clip": 1.04305553, + "balance_loss_mlp": 1.01801312, + "epoch": 0.49228919284533296, + "flos": 24905136800160.0, + "grad_norm": 1.5990940868454697, + "language_loss": 0.66854167, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.69007242, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.13037109, + "step": 8188, + "time_per_iteration": 2.65375018119812 + }, + { + "auxiliary_loss_clip": 0.01125442, + "auxiliary_loss_mlp": 0.010277, + "balance_loss_clip": 1.04504132, + "balance_loss_mlp": 1.01589847, + "epoch": 0.4923493160980009, + "flos": 33095549410560.0, + "grad_norm": 1.8718965936118148, + "language_loss": 0.74266505, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.7641964, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.11804199, + "step": 8189, + "time_per_iteration": 2.7129838466644287 + }, + { + "auxiliary_loss_clip": 0.01121852, + "auxiliary_loss_mlp": 0.01028428, + "balance_loss_clip": 1.04310727, + "balance_loss_mlp": 1.01606035, + "epoch": 0.4924094393506689, + "flos": 43607069357760.0, + "grad_norm": 2.6762864954962104, + "language_loss": 0.64654613, + "learning_rate": 2.146112575713104e-06, + "loss": 0.66804898, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12365723, + "step": 8190, + "time_per_iteration": 2.81195068359375 + }, + { + "auxiliary_loss_clip": 0.01123937, + "auxiliary_loss_mlp": 0.01027496, + "balance_loss_clip": 1.0452292, + "balance_loss_mlp": 1.01511562, + "epoch": 0.49246956260333685, + "flos": 24907324733280.0, + "grad_norm": 2.4888876006785026, + "language_loss": 0.71889091, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.7404052, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12384033, + "step": 8191, + "time_per_iteration": 4.200497150421143 + }, + { + "auxiliary_loss_clip": 0.01121042, + "auxiliary_loss_mlp": 0.01033612, + "balance_loss_clip": 1.04239798, + "balance_loss_mlp": 1.02173257, + "epoch": 0.4925296858560048, + "flos": 47561720011200.0, + "grad_norm": 1.5160132814535567, + "language_loss": 0.71935493, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.74090147, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.11883545, + "step": 8192, + "time_per_iteration": 2.8954174518585205 + }, + { + "auxiliary_loss_clip": 0.01040894, + "auxiliary_loss_mlp": 0.01004635, + "balance_loss_clip": 1.01599967, + "balance_loss_mlp": 1.00340307, + "epoch": 0.4925898091086728, + "flos": 78432181688160.0, + "grad_norm": 0.7149716029659401, + "language_loss": 0.52120072, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54165602, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01231384, + "step": 8193, + "time_per_iteration": 3.378815174102783 + }, + { + "auxiliary_loss_clip": 0.01122907, + "auxiliary_loss_mlp": 0.01039765, + "balance_loss_clip": 1.04555404, + "balance_loss_mlp": 1.02762401, + "epoch": 0.49264993236134075, + "flos": 28109474867520.0, + "grad_norm": 1.4118245643565153, + "language_loss": 0.76956707, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.79119384, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.121521, + "step": 8194, + "time_per_iteration": 2.671598434448242 + }, + { + "auxiliary_loss_clip": 0.01120828, + "auxiliary_loss_mlp": 0.01027751, + "balance_loss_clip": 1.04227853, + "balance_loss_mlp": 1.01562107, + "epoch": 0.4927100556140087, + "flos": 30161464015680.0, + "grad_norm": 2.042307663792564, + "language_loss": 0.70216382, + "learning_rate": 2.144170401915341e-06, + "loss": 0.72364962, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12133789, + "step": 8195, + "time_per_iteration": 2.788339614868164 + }, + { + "auxiliary_loss_clip": 0.01123386, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.04337192, + "balance_loss_mlp": 1.01771641, + "epoch": 0.4927701788666767, + "flos": 28682468720640.0, + "grad_norm": 2.4629595622231313, + "language_loss": 0.80683923, + "learning_rate": 2.143781950696001e-06, + "loss": 0.82836998, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.11981201, + "step": 8196, + "time_per_iteration": 4.184859991073608 + }, + { + "auxiliary_loss_clip": 0.01123231, + "auxiliary_loss_mlp": 0.0103318, + "balance_loss_clip": 1.0421859, + "balance_loss_mlp": 1.02038336, + "epoch": 0.49283030211934464, + "flos": 27978136896960.0, + "grad_norm": 3.3098822998145705, + "language_loss": 0.70521629, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.72678041, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.12792969, + "step": 8197, + "time_per_iteration": 2.720933675765991 + }, + { + "auxiliary_loss_clip": 0.01121403, + "auxiliary_loss_mlp": 0.01032234, + "balance_loss_clip": 1.04385412, + "balance_loss_mlp": 1.0203253, + "epoch": 0.4928904253720126, + "flos": 20588231432160.0, + "grad_norm": 2.2063889200107605, + "language_loss": 0.84262013, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86415648, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.11907959, + "step": 8198, + "time_per_iteration": 2.6348533630371094 + }, + { + "auxiliary_loss_clip": 0.01126706, + "auxiliary_loss_mlp": 0.01032778, + "balance_loss_clip": 1.04565489, + "balance_loss_mlp": 1.01973701, + "epoch": 0.4929505486246806, + "flos": 18139756818240.0, + "grad_norm": 3.1714575955210367, + "language_loss": 0.75963467, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78122956, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13043213, + "step": 8199, + "time_per_iteration": 2.618867874145508 + }, + { + "auxiliary_loss_clip": 0.01125846, + "auxiliary_loss_mlp": 0.01035612, + "balance_loss_clip": 1.04383326, + "balance_loss_mlp": 1.02229011, + "epoch": 0.49301067187734854, + "flos": 29092689544320.0, + "grad_norm": 1.879094622720278, + "language_loss": 0.60211551, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62373012, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13330078, + "step": 8200, + "time_per_iteration": 2.7157368659973145 + }, + { + "auxiliary_loss_clip": 0.0111835, + "auxiliary_loss_mlp": 0.01032543, + "balance_loss_clip": 1.04180813, + "balance_loss_mlp": 1.02050877, + "epoch": 0.49307079513001656, + "flos": 27444114316800.0, + "grad_norm": 1.591609140673962, + "language_loss": 0.79319561, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.81470454, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12042236, + "step": 8201, + "time_per_iteration": 2.655369997024536 + }, + { + "auxiliary_loss_clip": 0.011276, + "auxiliary_loss_mlp": 0.01033015, + "balance_loss_clip": 1.04324424, + "balance_loss_mlp": 1.01899648, + "epoch": 0.4931309183826845, + "flos": 19430858370240.0, + "grad_norm": 2.090317284273583, + "language_loss": 0.67167163, + "learning_rate": 2.141451129398785e-06, + "loss": 0.69327784, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.14019775, + "step": 8202, + "time_per_iteration": 2.676021099090576 + }, + { + "auxiliary_loss_clip": 0.01122483, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.04289365, + "balance_loss_mlp": 1.01642895, + "epoch": 0.4931910416353525, + "flos": 33322981668480.0, + "grad_norm": 2.70438623580606, + "language_loss": 0.75069284, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77220505, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.12298584, + "step": 8203, + "time_per_iteration": 2.6818203926086426 + }, + { + "auxiliary_loss_clip": 0.01121583, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.04309416, + "balance_loss_mlp": 1.02130032, + "epoch": 0.49325116488802045, + "flos": 25388397578880.0, + "grad_norm": 2.1868699451494202, + "language_loss": 0.79964817, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.82120168, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12475586, + "step": 8204, + "time_per_iteration": 2.694396495819092 + }, + { + "auxiliary_loss_clip": 0.01120984, + "auxiliary_loss_mlp": 0.01034528, + "balance_loss_clip": 1.04400897, + "balance_loss_mlp": 1.02205312, + "epoch": 0.4933112881406884, + "flos": 24240343491360.0, + "grad_norm": 2.2153059060642866, + "language_loss": 0.65762591, + "learning_rate": 2.140285646139455e-06, + "loss": 0.67918104, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12481689, + "step": 8205, + "time_per_iteration": 2.651831865310669 + }, + { + "auxiliary_loss_clip": 0.0112846, + "auxiliary_loss_mlp": 0.01034167, + "balance_loss_clip": 1.04453886, + "balance_loss_mlp": 1.02027357, + "epoch": 0.4933714113933564, + "flos": 26635301128800.0, + "grad_norm": 2.3183204157041444, + "language_loss": 0.66290277, + "learning_rate": 2.139897141060744e-06, + "loss": 0.68452907, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 0.84033203, + "router_z_loss_mlp": 0.13885498, + "step": 8206, + "time_per_iteration": 2.711712598800659 + }, + { + "auxiliary_loss_clip": 0.01121724, + "auxiliary_loss_mlp": 0.01027027, + "balance_loss_clip": 1.04226136, + "balance_loss_mlp": 1.01534462, + "epoch": 0.49343153464602435, + "flos": 34032297117600.0, + "grad_norm": 1.8454824141762711, + "language_loss": 0.7695998, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.79108727, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.11682129, + "step": 8207, + "time_per_iteration": 2.711843490600586 + }, + { + "auxiliary_loss_clip": 0.0112296, + "auxiliary_loss_mlp": 0.01033784, + "balance_loss_clip": 1.04284656, + "balance_loss_mlp": 1.02023041, + "epoch": 0.4934916578986923, + "flos": 30116212564320.0, + "grad_norm": 3.596628550819157, + "language_loss": 0.60089052, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.62245798, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.13537598, + "step": 8208, + "time_per_iteration": 2.7138142585754395 + }, + { + "auxiliary_loss_clip": 0.01125458, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.04354429, + "balance_loss_mlp": 1.01861262, + "epoch": 0.4935517811513603, + "flos": 28559112654240.0, + "grad_norm": 5.026838900794831, + "language_loss": 0.78317726, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.80475074, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13269043, + "step": 8209, + "time_per_iteration": 2.6788835525512695 + }, + { + "auxiliary_loss_clip": 0.01119393, + "auxiliary_loss_mlp": 0.01029206, + "balance_loss_clip": 1.04080462, + "balance_loss_mlp": 1.01634932, + "epoch": 0.49361190440402825, + "flos": 26776849453920.0, + "grad_norm": 5.891293852225398, + "language_loss": 0.78766465, + "learning_rate": 2.138343067844089e-06, + "loss": 0.8091507, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12878418, + "step": 8210, + "time_per_iteration": 2.700612783432007 + }, + { + "auxiliary_loss_clip": 0.01125243, + "auxiliary_loss_mlp": 0.01033945, + "balance_loss_clip": 1.0425005, + "balance_loss_mlp": 1.02054024, + "epoch": 0.4936720276566962, + "flos": 30646831692960.0, + "grad_norm": 1.6440078724819067, + "language_loss": 0.81194389, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.83353579, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13397217, + "step": 8211, + "time_per_iteration": 2.6945669651031494 + }, + { + "auxiliary_loss_clip": 0.01124925, + "auxiliary_loss_mlp": 0.01034174, + "balance_loss_clip": 1.04444027, + "balance_loss_mlp": 1.02187729, + "epoch": 0.4937321509093642, + "flos": 32163704294400.0, + "grad_norm": 2.930090024192602, + "language_loss": 0.91000319, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93159413, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12304688, + "step": 8212, + "time_per_iteration": 2.70698881149292 + }, + { + "auxiliary_loss_clip": 0.01122309, + "auxiliary_loss_mlp": 0.01033621, + "balance_loss_clip": 1.04172683, + "balance_loss_mlp": 1.02155685, + "epoch": 0.49379227416203214, + "flos": 28018161619200.0, + "grad_norm": 1.810523641835161, + "language_loss": 0.64970165, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.67126101, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.1206665, + "step": 8213, + "time_per_iteration": 2.6680779457092285 + }, + { + "auxiliary_loss_clip": 0.01119774, + "auxiliary_loss_mlp": 0.01027221, + "balance_loss_clip": 1.04037702, + "balance_loss_mlp": 1.01387548, + "epoch": 0.49385239741470016, + "flos": 39644153179200.0, + "grad_norm": 2.6926575750146986, + "language_loss": 0.75840104, + "learning_rate": 2.136788910691711e-06, + "loss": 0.77987099, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.13348389, + "step": 8214, + "time_per_iteration": 2.7935454845428467 + }, + { + "auxiliary_loss_clip": 0.01125191, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.04603171, + "balance_loss_mlp": 1.01968503, + "epoch": 0.4939125206673681, + "flos": 27445897077120.0, + "grad_norm": 1.8199221854282535, + "language_loss": 0.84254229, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.86411583, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12493896, + "step": 8215, + "time_per_iteration": 2.687469244003296 + }, + { + "auxiliary_loss_clip": 0.01115039, + "auxiliary_loss_mlp": 0.01026912, + "balance_loss_clip": 1.04017615, + "balance_loss_mlp": 1.01565862, + "epoch": 0.4939726439200361, + "flos": 38042166473280.0, + "grad_norm": 1.6976650388404786, + "language_loss": 0.83492696, + "learning_rate": 2.136011800934292e-06, + "loss": 0.85634649, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11254883, + "step": 8216, + "time_per_iteration": 2.711972236633301 + }, + { + "auxiliary_loss_clip": 0.01118532, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.04060936, + "balance_loss_mlp": 1.01701808, + "epoch": 0.49403276717270406, + "flos": 27667292260320.0, + "grad_norm": 1.512149633902723, + "language_loss": 0.74668765, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.76816666, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12353516, + "step": 8217, + "time_per_iteration": 2.672971487045288 + }, + { + "auxiliary_loss_clip": 0.01118923, + "auxiliary_loss_mlp": 0.01033467, + "balance_loss_clip": 1.04339302, + "balance_loss_mlp": 1.02080071, + "epoch": 0.494092890425372, + "flos": 25308267099840.0, + "grad_norm": 2.2448696823601644, + "language_loss": 0.78748471, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.8090086, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12664795, + "step": 8218, + "time_per_iteration": 2.662299871444702 + }, + { + "auxiliary_loss_clip": 0.01116272, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.04060459, + "balance_loss_mlp": 1.02003336, + "epoch": 0.49415301367804, + "flos": 22413760737120.0, + "grad_norm": 2.102871642114795, + "language_loss": 0.76213706, + "learning_rate": 2.134846097653142e-06, + "loss": 0.78361881, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11871338, + "step": 8219, + "time_per_iteration": 2.7052316665649414 + }, + { + "auxiliary_loss_clip": 0.01121851, + "auxiliary_loss_mlp": 0.01028353, + "balance_loss_clip": 1.04268885, + "balance_loss_mlp": 1.01513267, + "epoch": 0.49421313693070795, + "flos": 21390602372640.0, + "grad_norm": 2.1625762508890114, + "language_loss": 0.62758219, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64908421, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13208008, + "step": 8220, + "time_per_iteration": 2.613351821899414 + }, + { + "auxiliary_loss_clip": 0.0112064, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.04182363, + "balance_loss_mlp": 1.01827884, + "epoch": 0.4942732601833759, + "flos": 25395204481920.0, + "grad_norm": 2.0816747023228905, + "language_loss": 0.72532171, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.74683416, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12335205, + "step": 8221, + "time_per_iteration": 2.651329278945923 + }, + { + "auxiliary_loss_clip": 0.01121475, + "auxiliary_loss_mlp": 0.01034662, + "balance_loss_clip": 1.0458585, + "balance_loss_mlp": 1.02309895, + "epoch": 0.4943333834360439, + "flos": 18362327002560.0, + "grad_norm": 1.6879264887148324, + "language_loss": 0.79497403, + "learning_rate": 2.133680348351595e-06, + "loss": 0.81653541, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11560059, + "step": 8222, + "time_per_iteration": 4.106324672698975 + }, + { + "auxiliary_loss_clip": 0.01122743, + "auxiliary_loss_mlp": 0.01036189, + "balance_loss_clip": 1.04471421, + "balance_loss_mlp": 1.02324867, + "epoch": 0.49439350668871185, + "flos": 19608825379680.0, + "grad_norm": 2.538074761946755, + "language_loss": 0.72326976, + "learning_rate": 2.133291755093088e-06, + "loss": 0.7448591, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.1293335, + "step": 8223, + "time_per_iteration": 4.146524667739868 + }, + { + "auxiliary_loss_clip": 0.01121955, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.04198456, + "balance_loss_mlp": 1.02298093, + "epoch": 0.4944536299413798, + "flos": 25478535826080.0, + "grad_norm": 4.186173941168467, + "language_loss": 0.75454384, + "learning_rate": 2.132903156780144e-06, + "loss": 0.77612078, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.12780762, + "step": 8224, + "time_per_iteration": 2.6298367977142334 + }, + { + "auxiliary_loss_clip": 0.01123884, + "auxiliary_loss_mlp": 0.01031489, + "balance_loss_clip": 1.04435658, + "balance_loss_mlp": 1.01891851, + "epoch": 0.4945137531940478, + "flos": 32469241167360.0, + "grad_norm": 2.3734813805521235, + "language_loss": 0.63776159, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.65931523, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12561035, + "step": 8225, + "time_per_iteration": 2.7011475563049316 + }, + { + "auxiliary_loss_clip": 0.01120576, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.04191494, + "balance_loss_mlp": 1.01668358, + "epoch": 0.49457387644671574, + "flos": 29270616036480.0, + "grad_norm": 3.4596696809576915, + "language_loss": 0.76395178, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.78544688, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12255859, + "step": 8226, + "time_per_iteration": 2.6666598320007324 + }, + { + "auxiliary_loss_clip": 0.0112218, + "auxiliary_loss_mlp": 0.01035473, + "balance_loss_clip": 1.04120994, + "balance_loss_mlp": 1.02213955, + "epoch": 0.49463399969938376, + "flos": 32917663435680.0, + "grad_norm": 2.0388465215711267, + "language_loss": 0.71077931, + "learning_rate": 2.131737331662051e-06, + "loss": 0.73235577, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13336182, + "step": 8227, + "time_per_iteration": 2.684192419052124 + }, + { + "auxiliary_loss_clip": 0.01125458, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.04427791, + "balance_loss_mlp": 1.0198276, + "epoch": 0.49469412295205173, + "flos": 36217974238560.0, + "grad_norm": 1.8905199615994561, + "language_loss": 0.71649051, + "learning_rate": 2.131348713278718e-06, + "loss": 0.73806626, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.1229248, + "step": 8228, + "time_per_iteration": 2.696122884750366 + }, + { + "auxiliary_loss_clip": 0.01119309, + "auxiliary_loss_mlp": 0.01028265, + "balance_loss_clip": 1.04227877, + "balance_loss_mlp": 1.01626039, + "epoch": 0.4947542462047197, + "flos": 29446111491840.0, + "grad_norm": 1.7251141626073505, + "language_loss": 0.83656961, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.85804534, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11999512, + "step": 8229, + "time_per_iteration": 2.7187209129333496 + }, + { + "auxiliary_loss_clip": 0.01122634, + "auxiliary_loss_mlp": 0.01033881, + "balance_loss_clip": 1.0420748, + "balance_loss_mlp": 1.02066684, + "epoch": 0.49481436945738766, + "flos": 24459186085920.0, + "grad_norm": 2.4538470471668066, + "language_loss": 0.74824828, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.76981348, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13214111, + "step": 8230, + "time_per_iteration": 4.134357213973999 + }, + { + "auxiliary_loss_clip": 0.01121542, + "auxiliary_loss_mlp": 0.0102932, + "balance_loss_clip": 1.04259968, + "balance_loss_mlp": 1.01747108, + "epoch": 0.4948744927100556, + "flos": 19119608560800.0, + "grad_norm": 5.027166673221755, + "language_loss": 0.7957437, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.81725234, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.11853027, + "step": 8231, + "time_per_iteration": 2.59774112701416 + }, + { + "auxiliary_loss_clip": 0.01042077, + "auxiliary_loss_mlp": 0.01004572, + "balance_loss_clip": 1.01728368, + "balance_loss_mlp": 1.00327826, + "epoch": 0.4949346159627236, + "flos": 84037473950400.0, + "grad_norm": 0.7498852254457734, + "language_loss": 0.60185051, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62231696, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 0.24816895, + "router_z_loss_mlp": 0.01294708, + "step": 8232, + "time_per_iteration": 3.4134232997894287 + }, + { + "auxiliary_loss_clip": 0.01126671, + "auxiliary_loss_mlp": 0.01033907, + "balance_loss_clip": 1.04419839, + "balance_loss_mlp": 1.0209074, + "epoch": 0.49499473921539155, + "flos": 30249495364320.0, + "grad_norm": 1.9071627919775123, + "language_loss": 0.69521356, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.71681935, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 0.82421875, + "router_z_loss_mlp": 0.13006592, + "step": 8233, + "time_per_iteration": 2.654406785964966 + }, + { + "auxiliary_loss_clip": 0.01120521, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.04242563, + "balance_loss_mlp": 1.01836729, + "epoch": 0.4950548624680595, + "flos": 39911164469280.0, + "grad_norm": 2.0516582287201417, + "language_loss": 0.66062152, + "learning_rate": 2.129016898898633e-06, + "loss": 0.68214214, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13171387, + "step": 8234, + "time_per_iteration": 2.7648634910583496 + }, + { + "auxiliary_loss_clip": 0.01040347, + "auxiliary_loss_mlp": 0.0100018, + "balance_loss_clip": 1.01553488, + "balance_loss_mlp": 0.99891615, + "epoch": 0.4951149857207275, + "flos": 61108571685600.0, + "grad_norm": 0.793952332792256, + "language_loss": 0.58011258, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60051787, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.24816895, + "router_z_loss_mlp": 0.01264191, + "step": 8235, + "time_per_iteration": 4.720935583114624 + }, + { + "auxiliary_loss_clip": 0.01122741, + "auxiliary_loss_mlp": 0.01034394, + "balance_loss_clip": 1.04342568, + "balance_loss_mlp": 1.02163219, + "epoch": 0.49517510897339545, + "flos": 27133472266560.0, + "grad_norm": 1.6980202802709843, + "language_loss": 0.77105176, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.7926231, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12768555, + "step": 8236, + "time_per_iteration": 2.6552369594573975 + }, + { + "auxiliary_loss_clip": 0.01120879, + "auxiliary_loss_mlp": 0.01031404, + "balance_loss_clip": 1.0432179, + "balance_loss_mlp": 1.01933396, + "epoch": 0.4952352322260634, + "flos": 30961930644000.0, + "grad_norm": 1.7580327220503513, + "language_loss": 0.72342646, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.74494928, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12084961, + "step": 8237, + "time_per_iteration": 2.712414503097534 + }, + { + "auxiliary_loss_clip": 0.01117131, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.04068744, + "balance_loss_mlp": 1.01936257, + "epoch": 0.4952953554787314, + "flos": 30027857077440.0, + "grad_norm": 2.663903306682269, + "language_loss": 0.75588447, + "learning_rate": 2.127462257935406e-06, + "loss": 0.77736712, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11779785, + "step": 8238, + "time_per_iteration": 2.6688625812530518 + }, + { + "auxiliary_loss_clip": 0.01123984, + "auxiliary_loss_mlp": 0.01033218, + "balance_loss_clip": 1.04285729, + "balance_loss_mlp": 1.0200454, + "epoch": 0.49535547873139935, + "flos": 21122983323360.0, + "grad_norm": 2.366534083405969, + "language_loss": 0.73437232, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.75594437, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.13183594, + "step": 8239, + "time_per_iteration": 2.6755025386810303 + }, + { + "auxiliary_loss_clip": 0.0112444, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.04312706, + "balance_loss_mlp": 1.01885331, + "epoch": 0.4954156019840673, + "flos": 25307416236960.0, + "grad_norm": 4.083146159779104, + "language_loss": 0.78687966, + "learning_rate": 2.126684908394552e-06, + "loss": 0.8084501, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13751221, + "step": 8240, + "time_per_iteration": 2.649625778198242 + }, + { + "auxiliary_loss_clip": 0.01120594, + "auxiliary_loss_mlp": 0.01035435, + "balance_loss_clip": 1.04530072, + "balance_loss_mlp": 1.02393126, + "epoch": 0.49547572523673533, + "flos": 15643478164320.0, + "grad_norm": 2.0489503155499924, + "language_loss": 0.85504097, + "learning_rate": 2.126296226410898e-06, + "loss": 0.87660122, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11505127, + "step": 8241, + "time_per_iteration": 2.6354589462280273 + }, + { + "auxiliary_loss_clip": 0.011201, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.04383194, + "balance_loss_mlp": 1.01971841, + "epoch": 0.4955358484894033, + "flos": 19031091004800.0, + "grad_norm": 3.2893488847638683, + "language_loss": 0.77165461, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.79316962, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11688232, + "step": 8242, + "time_per_iteration": 2.671198606491089 + }, + { + "auxiliary_loss_clip": 0.0112068, + "auxiliary_loss_mlp": 0.01029246, + "balance_loss_clip": 1.04305911, + "balance_loss_mlp": 1.01730144, + "epoch": 0.49559597174207126, + "flos": 32292732780000.0, + "grad_norm": 3.054194609706456, + "language_loss": 0.67080188, + "learning_rate": 2.125518848090833e-06, + "loss": 0.69230115, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.1194458, + "step": 8243, + "time_per_iteration": 2.679382562637329 + }, + { + "auxiliary_loss_clip": 0.01122848, + "auxiliary_loss_mlp": 0.01030231, + "balance_loss_clip": 1.04643297, + "balance_loss_mlp": 1.01869178, + "epoch": 0.4956560949947392, + "flos": 28245755946240.0, + "grad_norm": 1.8665922257137306, + "language_loss": 0.68493605, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70646679, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11547852, + "step": 8244, + "time_per_iteration": 2.675386428833008 + }, + { + "auxiliary_loss_clip": 0.0112338, + "auxiliary_loss_mlp": 0.01032891, + "balance_loss_clip": 1.04397082, + "balance_loss_mlp": 1.01968813, + "epoch": 0.4957162182474072, + "flos": 25348170270240.0, + "grad_norm": 2.4349191400354493, + "language_loss": 0.75232977, + "learning_rate": 2.12474145073202e-06, + "loss": 0.77389246, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13201904, + "step": 8245, + "time_per_iteration": 2.681485652923584 + }, + { + "auxiliary_loss_clip": 0.0112042, + "auxiliary_loss_mlp": 0.01030928, + "balance_loss_clip": 1.04384971, + "balance_loss_mlp": 1.01876903, + "epoch": 0.49577634150007516, + "flos": 22859346795840.0, + "grad_norm": 1.9134878133986069, + "language_loss": 0.81776631, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.83927989, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12158203, + "step": 8246, + "time_per_iteration": 2.6240646839141846 + }, + { + "auxiliary_loss_clip": 0.01125315, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.04434013, + "balance_loss_mlp": 1.02090919, + "epoch": 0.4958364647527431, + "flos": 31181867205120.0, + "grad_norm": 2.039400623174082, + "language_loss": 0.83452487, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.8561157, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.128479, + "step": 8247, + "time_per_iteration": 2.6722137928009033 + }, + { + "auxiliary_loss_clip": 0.01126233, + "auxiliary_loss_mlp": 0.01027943, + "balance_loss_clip": 1.04675722, + "balance_loss_mlp": 1.01627815, + "epoch": 0.4958965880054111, + "flos": 29804152409280.0, + "grad_norm": 2.0953935453320414, + "language_loss": 0.8359043, + "learning_rate": 2.123575319254087e-06, + "loss": 0.85744601, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.11663818, + "step": 8248, + "time_per_iteration": 2.7355916500091553 + }, + { + "auxiliary_loss_clip": 0.01125809, + "auxiliary_loss_mlp": 0.01028004, + "balance_loss_clip": 1.04438102, + "balance_loss_mlp": 1.01512361, + "epoch": 0.49595671125807905, + "flos": 30607131108960.0, + "grad_norm": 1.8859589587074996, + "language_loss": 0.73203743, + "learning_rate": 2.123186599369812e-06, + "loss": 0.75357556, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.12872314, + "step": 8249, + "time_per_iteration": 2.6675257682800293 + }, + { + "auxiliary_loss_clip": 0.01127953, + "auxiliary_loss_mlp": 0.0103959, + "balance_loss_clip": 1.04684806, + "balance_loss_mlp": 1.02703691, + "epoch": 0.496016834510747, + "flos": 20054735576640.0, + "grad_norm": 1.7286676349796035, + "language_loss": 0.75724769, + "learning_rate": 2.122797874814289e-06, + "loss": 0.77892315, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.12548828, + "step": 8250, + "time_per_iteration": 2.691561222076416 + }, + { + "auxiliary_loss_clip": 0.01122832, + "auxiliary_loss_mlp": 0.01033272, + "balance_loss_clip": 1.04351783, + "balance_loss_mlp": 1.02089787, + "epoch": 0.496076957763415, + "flos": 28599461514720.0, + "grad_norm": 2.712911375893751, + "language_loss": 0.70116276, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.72272378, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12384033, + "step": 8251, + "time_per_iteration": 2.647911548614502 + }, + { + "auxiliary_loss_clip": 0.01121292, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.04340005, + "balance_loss_mlp": 1.01887059, + "epoch": 0.49613708101608295, + "flos": 20632753572480.0, + "grad_norm": 2.226315887056916, + "language_loss": 0.80142224, + "learning_rate": 2.122020411748461e-06, + "loss": 0.82293975, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.11602783, + "step": 8252, + "time_per_iteration": 2.654747724533081 + }, + { + "auxiliary_loss_clip": 0.01122358, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.04353631, + "balance_loss_mlp": 1.01529503, + "epoch": 0.4961972042687509, + "flos": 20277103174560.0, + "grad_norm": 2.008214715870268, + "language_loss": 0.80668604, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.82819748, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.13476562, + "step": 8253, + "time_per_iteration": 2.6231207847595215 + }, + { + "auxiliary_loss_clip": 0.01120164, + "auxiliary_loss_mlp": 0.01028211, + "balance_loss_clip": 1.0427829, + "balance_loss_mlp": 1.01657069, + "epoch": 0.49625732752141893, + "flos": 35333082299520.0, + "grad_norm": 1.4951353667798937, + "language_loss": 0.67459536, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.69607913, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.11639404, + "step": 8254, + "time_per_iteration": 2.774003744125366 + }, + { + "auxiliary_loss_clip": 0.01122675, + "auxiliary_loss_mlp": 0.01034681, + "balance_loss_clip": 1.04236913, + "balance_loss_mlp": 1.02239084, + "epoch": 0.4963174507740869, + "flos": 28201922599680.0, + "grad_norm": 2.3816834724817437, + "language_loss": 0.74023473, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76180828, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12298584, + "step": 8255, + "time_per_iteration": 2.6745848655700684 + }, + { + "auxiliary_loss_clip": 0.01121646, + "auxiliary_loss_mlp": 0.01034688, + "balance_loss_clip": 1.04308653, + "balance_loss_mlp": 1.02195668, + "epoch": 0.49637757402675486, + "flos": 16982100135360.0, + "grad_norm": 2.2086862555971325, + "language_loss": 0.81189835, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.83346164, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.1270752, + "step": 8256, + "time_per_iteration": 2.589191198348999 + }, + { + "auxiliary_loss_clip": 0.01119678, + "auxiliary_loss_mlp": 0.01025751, + "balance_loss_clip": 1.04301906, + "balance_loss_mlp": 1.01412785, + "epoch": 0.49643769727942283, + "flos": 27221665684320.0, + "grad_norm": 1.8104999785259426, + "language_loss": 0.80868363, + "learning_rate": 2.120076673368901e-06, + "loss": 0.83013797, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11621094, + "step": 8257, + "time_per_iteration": 2.6790382862091064 + }, + { + "auxiliary_loss_clip": 0.01126808, + "auxiliary_loss_mlp": 0.01033026, + "balance_loss_clip": 1.04328907, + "balance_loss_mlp": 1.01988912, + "epoch": 0.4964978205320908, + "flos": 23794149673440.0, + "grad_norm": 2.1266211408372486, + "language_loss": 0.66061807, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68221647, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.13153076, + "step": 8258, + "time_per_iteration": 2.6297800540924072 + }, + { + "auxiliary_loss_clip": 0.01117913, + "auxiliary_loss_mlp": 0.01027575, + "balance_loss_clip": 1.04070067, + "balance_loss_mlp": 1.01633978, + "epoch": 0.49655794378475876, + "flos": 28597881340800.0, + "grad_norm": 1.5178746876866747, + "language_loss": 0.77733982, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.79879475, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.11242676, + "step": 8259, + "time_per_iteration": 2.6813971996307373 + }, + { + "auxiliary_loss_clip": 0.01123001, + "auxiliary_loss_mlp": 0.01028893, + "balance_loss_clip": 1.0448879, + "balance_loss_mlp": 1.01675117, + "epoch": 0.4966180670374267, + "flos": 32740101599040.0, + "grad_norm": 1.6080769229922132, + "language_loss": 0.78366911, + "learning_rate": 2.1189103755834e-06, + "loss": 0.80518812, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12133789, + "step": 8260, + "time_per_iteration": 2.7039077281951904 + }, + { + "auxiliary_loss_clip": 0.01123411, + "auxiliary_loss_mlp": 0.0103145, + "balance_loss_clip": 1.04279494, + "balance_loss_mlp": 1.01910591, + "epoch": 0.4966781902900947, + "flos": 26856696312000.0, + "grad_norm": 3.8231329669680383, + "language_loss": 0.7605257, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78207433, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12353516, + "step": 8261, + "time_per_iteration": 2.643261671066284 + }, + { + "auxiliary_loss_clip": 0.01119259, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.04293036, + "balance_loss_mlp": 1.02031946, + "epoch": 0.49673831354276266, + "flos": 31986020905920.0, + "grad_norm": 2.3873821867187304, + "language_loss": 0.89592284, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.91743493, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.11633301, + "step": 8262, + "time_per_iteration": 4.093869686126709 + }, + { + "auxiliary_loss_clip": 0.0111996, + "auxiliary_loss_mlp": 0.0102624, + "balance_loss_clip": 1.04399395, + "balance_loss_mlp": 1.01461697, + "epoch": 0.4967984367954306, + "flos": 28287522911520.0, + "grad_norm": 1.5935746889169098, + "language_loss": 0.73987222, + "learning_rate": 2.11774403721606e-06, + "loss": 0.76133418, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11639404, + "step": 8263, + "time_per_iteration": 4.088485479354858 + }, + { + "auxiliary_loss_clip": 0.01128613, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.04760611, + "balance_loss_mlp": 1.0190208, + "epoch": 0.4968585600480986, + "flos": 23527138383360.0, + "grad_norm": 2.232284027217906, + "language_loss": 0.69648194, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.71809721, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13903809, + "step": 8264, + "time_per_iteration": 2.6402740478515625 + }, + { + "auxiliary_loss_clip": 0.01121557, + "auxiliary_loss_mlp": 0.010284, + "balance_loss_clip": 1.04094398, + "balance_loss_mlp": 1.0160979, + "epoch": 0.49691868330076655, + "flos": 27489689906400.0, + "grad_norm": 3.3787852717123448, + "language_loss": 0.65052432, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.67202389, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12310791, + "step": 8265, + "time_per_iteration": 2.636439800262451 + }, + { + "auxiliary_loss_clip": 0.01041355, + "auxiliary_loss_mlp": 0.0100696, + "balance_loss_clip": 1.01679361, + "balance_loss_mlp": 1.00581455, + "epoch": 0.4969788065534345, + "flos": 81236671355520.0, + "grad_norm": 0.9534176557270622, + "language_loss": 0.5344885, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55497164, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 0.24572754, + "router_z_loss_mlp": 0.01146698, + "step": 8266, + "time_per_iteration": 3.3198602199554443 + }, + { + "auxiliary_loss_clip": 0.01117707, + "auxiliary_loss_mlp": 0.01027696, + "balance_loss_clip": 1.04211569, + "balance_loss_mlp": 1.01564431, + "epoch": 0.49703892980610254, + "flos": 29357837039520.0, + "grad_norm": 1.7773874922772295, + "language_loss": 0.79410219, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.81555617, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12060547, + "step": 8267, + "time_per_iteration": 2.71038556098938 + }, + { + "auxiliary_loss_clip": 0.01125465, + "auxiliary_loss_mlp": 0.01031952, + "balance_loss_clip": 1.04495597, + "balance_loss_mlp": 1.01902413, + "epoch": 0.4970990530587705, + "flos": 35541025745760.0, + "grad_norm": 3.3570278185121514, + "language_loss": 0.74631691, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.76789105, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12921143, + "step": 8268, + "time_per_iteration": 2.6752068996429443 + }, + { + "auxiliary_loss_clip": 0.01120513, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.042117, + "balance_loss_mlp": 1.01949811, + "epoch": 0.49715917631143847, + "flos": 56163042239040.0, + "grad_norm": 1.6067853457762222, + "language_loss": 0.67807436, + "learning_rate": 2.115411240328073e-06, + "loss": 0.69959939, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12487793, + "step": 8269, + "time_per_iteration": 4.291993141174316 + }, + { + "auxiliary_loss_clip": 0.01119361, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.0439595, + "balance_loss_mlp": 1.02074552, + "epoch": 0.49721929956410643, + "flos": 24637598785440.0, + "grad_norm": 1.5213027838572775, + "language_loss": 0.8538391, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.87536103, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12097168, + "step": 8270, + "time_per_iteration": 2.681519031524658 + }, + { + "auxiliary_loss_clip": 0.01123048, + "auxiliary_loss_mlp": 0.01029134, + "balance_loss_clip": 1.04402399, + "balance_loss_mlp": 1.01806521, + "epoch": 0.4972794228167744, + "flos": 26421725780640.0, + "grad_norm": 1.79504283166904, + "language_loss": 0.70910096, + "learning_rate": 2.114633606196899e-06, + "loss": 0.73062277, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.11065674, + "step": 8271, + "time_per_iteration": 2.6447317600250244 + }, + { + "auxiliary_loss_clip": 0.01123079, + "auxiliary_loss_mlp": 0.01028589, + "balance_loss_clip": 1.04353428, + "balance_loss_mlp": 1.01612592, + "epoch": 0.49733954606944236, + "flos": 29626469020800.0, + "grad_norm": 1.4085460330512076, + "language_loss": 0.78505838, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80657506, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12475586, + "step": 8272, + "time_per_iteration": 2.748265266418457 + }, + { + "auxiliary_loss_clip": 0.01123443, + "auxiliary_loss_mlp": 0.01034552, + "balance_loss_clip": 1.04502106, + "balance_loss_mlp": 1.02237487, + "epoch": 0.4973996693221103, + "flos": 46189069875360.0, + "grad_norm": 2.3423505286679847, + "language_loss": 0.65860534, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.68018526, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.1217041, + "step": 8273, + "time_per_iteration": 2.7705893516540527 + }, + { + "auxiliary_loss_clip": 0.01121023, + "auxiliary_loss_mlp": 0.01029414, + "balance_loss_clip": 1.04303813, + "balance_loss_mlp": 1.01729667, + "epoch": 0.4974597925747783, + "flos": 26065265037120.0, + "grad_norm": 1.6971515267806476, + "language_loss": 0.78135365, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.802858, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12115479, + "step": 8274, + "time_per_iteration": 2.708998680114746 + }, + { + "auxiliary_loss_clip": 0.01124657, + "auxiliary_loss_mlp": 0.01030801, + "balance_loss_clip": 1.04323649, + "balance_loss_mlp": 1.01789701, + "epoch": 0.49751991582744626, + "flos": 37507455099360.0, + "grad_norm": 2.375448584220877, + "language_loss": 0.76010466, + "learning_rate": 2.113078285889493e-06, + "loss": 0.78165925, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.12908936, + "step": 8275, + "time_per_iteration": 4.184135675430298 + }, + { + "auxiliary_loss_clip": 0.01125195, + "auxiliary_loss_mlp": 0.01032626, + "balance_loss_clip": 1.04336441, + "balance_loss_mlp": 1.01872647, + "epoch": 0.4975800390801142, + "flos": 17204994457920.0, + "grad_norm": 2.028268632582644, + "language_loss": 0.83471256, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.85629076, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13909912, + "step": 8276, + "time_per_iteration": 2.658026933670044 + }, + { + "auxiliary_loss_clip": 0.01115772, + "auxiliary_loss_mlp": 0.01026322, + "balance_loss_clip": 1.04122627, + "balance_loss_mlp": 1.01550388, + "epoch": 0.4976401623327822, + "flos": 29537991982080.0, + "grad_norm": 1.394102790170951, + "language_loss": 0.70246911, + "learning_rate": 2.112300599949172e-06, + "loss": 0.72389007, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.1081543, + "step": 8277, + "time_per_iteration": 2.676248550415039 + }, + { + "auxiliary_loss_clip": 0.01118433, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.04138684, + "balance_loss_mlp": 1.02269363, + "epoch": 0.49770028558545015, + "flos": 25791649430400.0, + "grad_norm": 1.770909986206506, + "language_loss": 0.8212347, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84277081, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12481689, + "step": 8278, + "time_per_iteration": 2.6421096324920654 + }, + { + "auxiliary_loss_clip": 0.0112157, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.04169297, + "balance_loss_mlp": 1.01827514, + "epoch": 0.4977604088381181, + "flos": 20455070184000.0, + "grad_norm": 3.1014751968437055, + "language_loss": 0.67403543, + "learning_rate": 2.111522896975052e-06, + "loss": 0.69555324, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.11938477, + "step": 8279, + "time_per_iteration": 2.658369541168213 + }, + { + "auxiliary_loss_clip": 0.01120373, + "auxiliary_loss_mlp": 0.01036274, + "balance_loss_clip": 1.04077101, + "balance_loss_mlp": 1.02298164, + "epoch": 0.49782053209078614, + "flos": 19161294491520.0, + "grad_norm": 2.966353579313976, + "language_loss": 0.70841527, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.72998178, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.13275146, + "step": 8280, + "time_per_iteration": 2.6046135425567627 + }, + { + "auxiliary_loss_clip": 0.01119507, + "auxiliary_loss_mlp": 0.01032836, + "balance_loss_clip": 1.0414958, + "balance_loss_mlp": 1.02085567, + "epoch": 0.4978806553434541, + "flos": 30204851672160.0, + "grad_norm": 1.5903171994552072, + "language_loss": 0.64331126, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.66483468, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.11975098, + "step": 8281, + "time_per_iteration": 2.766545534133911 + }, + { + "auxiliary_loss_clip": 0.01123825, + "auxiliary_loss_mlp": 0.01032471, + "balance_loss_clip": 1.04380465, + "balance_loss_mlp": 1.01950681, + "epoch": 0.49794077859612207, + "flos": 16003220807520.0, + "grad_norm": 2.0048219820634845, + "language_loss": 0.7311728, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.75273573, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12963867, + "step": 8282, + "time_per_iteration": 2.70041823387146 + }, + { + "auxiliary_loss_clip": 0.01116248, + "auxiliary_loss_mlp": 0.01030695, + "balance_loss_clip": 1.04113007, + "balance_loss_mlp": 1.01950693, + "epoch": 0.49800090184879003, + "flos": 33586913645280.0, + "grad_norm": 1.6672413162731212, + "language_loss": 0.73621118, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75768065, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11187744, + "step": 8283, + "time_per_iteration": 2.727247476577759 + }, + { + "auxiliary_loss_clip": 0.01119733, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.04294109, + "balance_loss_mlp": 1.02163184, + "epoch": 0.498061025101458, + "flos": 24150731968800.0, + "grad_norm": 1.5796356196307375, + "language_loss": 0.78848159, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.81001353, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11828613, + "step": 8284, + "time_per_iteration": 2.6497201919555664 + }, + { + "auxiliary_loss_clip": 0.01125443, + "auxiliary_loss_mlp": 0.01035058, + "balance_loss_clip": 1.04337204, + "balance_loss_mlp": 1.02209449, + "epoch": 0.49812114835412596, + "flos": 36477732935520.0, + "grad_norm": 2.39819120269059, + "language_loss": 0.7377038, + "learning_rate": 2.109189687029526e-06, + "loss": 0.75930882, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.12969971, + "step": 8285, + "time_per_iteration": 2.706568717956543 + }, + { + "auxiliary_loss_clip": 0.01124248, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.04633689, + "balance_loss_mlp": 1.01750159, + "epoch": 0.49818127160679393, + "flos": 28244418876000.0, + "grad_norm": 1.928735148647592, + "language_loss": 0.74229926, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.76384526, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12854004, + "step": 8286, + "time_per_iteration": 2.6280148029327393 + }, + { + "auxiliary_loss_clip": 0.01124278, + "auxiliary_loss_mlp": 0.01037234, + "balance_loss_clip": 1.04485989, + "balance_loss_mlp": 1.02539611, + "epoch": 0.4982413948594619, + "flos": 26420874917760.0, + "grad_norm": 1.9414156951790156, + "language_loss": 0.85364425, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.87525928, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.11834717, + "step": 8287, + "time_per_iteration": 2.686368227005005 + }, + { + "auxiliary_loss_clip": 0.01120859, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.04260397, + "balance_loss_mlp": 1.01683092, + "epoch": 0.49830151811212986, + "flos": 39641762659680.0, + "grad_norm": 1.682901010640218, + "language_loss": 0.72484374, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74634099, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12036133, + "step": 8288, + "time_per_iteration": 2.7655487060546875 + }, + { + "auxiliary_loss_clip": 0.01126005, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.04396057, + "balance_loss_mlp": 1.0169251, + "epoch": 0.4983616413647978, + "flos": 22137876162720.0, + "grad_norm": 3.232509365222375, + "language_loss": 0.80966723, + "learning_rate": 2.10763413072622e-06, + "loss": 0.83123004, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.13336182, + "step": 8289, + "time_per_iteration": 2.6767075061798096 + }, + { + "auxiliary_loss_clip": 0.01118929, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.04125094, + "balance_loss_mlp": 1.02205253, + "epoch": 0.4984217646174658, + "flos": 24060917859840.0, + "grad_norm": 2.154589882040819, + "language_loss": 0.73427296, + "learning_rate": 2.107245231409784e-06, + "loss": 0.75579983, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11700439, + "step": 8290, + "time_per_iteration": 2.671501636505127 + }, + { + "auxiliary_loss_clip": 0.01125829, + "auxiliary_loss_mlp": 0.01036894, + "balance_loss_clip": 1.04607809, + "balance_loss_mlp": 1.02324486, + "epoch": 0.49848188787013376, + "flos": 30427948581120.0, + "grad_norm": 1.647153465665118, + "language_loss": 0.84411061, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86573786, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13665771, + "step": 8291, + "time_per_iteration": 2.653353452682495 + }, + { + "auxiliary_loss_clip": 0.01128555, + "auxiliary_loss_mlp": 0.01033736, + "balance_loss_clip": 1.04550064, + "balance_loss_mlp": 1.02089131, + "epoch": 0.4985420111228017, + "flos": 27311236689600.0, + "grad_norm": 1.8554358480351807, + "language_loss": 0.66983408, + "learning_rate": 2.106467420591409e-06, + "loss": 0.69145697, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.128479, + "step": 8292, + "time_per_iteration": 2.66171932220459 + }, + { + "auxiliary_loss_clip": 0.01122574, + "auxiliary_loss_mlp": 0.01032224, + "balance_loss_clip": 1.04406548, + "balance_loss_mlp": 1.02067304, + "epoch": 0.4986021343754697, + "flos": 19787035492800.0, + "grad_norm": 1.7772766934250794, + "language_loss": 0.67022544, + "learning_rate": 2.106078509118965e-06, + "loss": 0.69177341, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.11547852, + "step": 8293, + "time_per_iteration": 2.606754779815674 + }, + { + "auxiliary_loss_clip": 0.01122626, + "auxiliary_loss_mlp": 0.01027259, + "balance_loss_clip": 1.0439446, + "balance_loss_mlp": 1.01570785, + "epoch": 0.4986622576281377, + "flos": 28557370411200.0, + "grad_norm": 1.9891461885100972, + "language_loss": 0.82447875, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.84597766, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.11553955, + "step": 8294, + "time_per_iteration": 2.785893440246582 + }, + { + "auxiliary_loss_clip": 0.01122444, + "auxiliary_loss_mlp": 0.01027417, + "balance_loss_clip": 1.04348934, + "balance_loss_mlp": 1.01492405, + "epoch": 0.49872238088080567, + "flos": 24372937497600.0, + "grad_norm": 1.9223857419439416, + "language_loss": 0.72596955, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.74746811, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12487793, + "step": 8295, + "time_per_iteration": 2.6277952194213867 + }, + { + "auxiliary_loss_clip": 0.01119493, + "auxiliary_loss_mlp": 0.01035851, + "balance_loss_clip": 1.04251575, + "balance_loss_mlp": 1.02422786, + "epoch": 0.49878250413347364, + "flos": 27934303550400.0, + "grad_norm": 1.9641320781455593, + "language_loss": 0.67676473, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.69831818, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.11633301, + "step": 8296, + "time_per_iteration": 2.668020248413086 + }, + { + "auxiliary_loss_clip": 0.011246, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.0442934, + "balance_loss_mlp": 1.02124596, + "epoch": 0.4988426273861416, + "flos": 39777598048320.0, + "grad_norm": 1.9392440913334457, + "language_loss": 0.64960897, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.67119521, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12780762, + "step": 8297, + "time_per_iteration": 2.730156183242798 + }, + { + "auxiliary_loss_clip": 0.01120535, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.04379237, + "balance_loss_mlp": 1.01998949, + "epoch": 0.49890275063880957, + "flos": 25530999353280.0, + "grad_norm": 2.0134044945577254, + "language_loss": 0.69414639, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.71566731, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.11566162, + "step": 8298, + "time_per_iteration": 2.7100300788879395 + }, + { + "auxiliary_loss_clip": 0.0111795, + "auxiliary_loss_mlp": 0.01034028, + "balance_loss_clip": 1.0414151, + "balance_loss_mlp": 1.02235794, + "epoch": 0.49896287389147753, + "flos": 22725456236640.0, + "grad_norm": 1.7775684595836276, + "language_loss": 0.84330523, + "learning_rate": 2.103744956327814e-06, + "loss": 0.86482501, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11669922, + "step": 8299, + "time_per_iteration": 2.61550235748291 + }, + { + "auxiliary_loss_clip": 0.01125201, + "auxiliary_loss_mlp": 0.01032455, + "balance_loss_clip": 1.04426229, + "balance_loss_mlp": 1.01973522, + "epoch": 0.4990229971441455, + "flos": 30293774400960.0, + "grad_norm": 2.370897005758887, + "language_loss": 0.69038188, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.71195841, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.12731934, + "step": 8300, + "time_per_iteration": 2.715524435043335 + }, + { + "auxiliary_loss_clip": 0.0104615, + "auxiliary_loss_mlp": 0.01000866, + "balance_loss_clip": 1.02159524, + "balance_loss_mlp": 0.99943286, + "epoch": 0.49908312039681346, + "flos": 87103829213280.0, + "grad_norm": 0.76015772510953, + "language_loss": 0.51107681, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.53154695, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 0.2454834, + "router_z_loss_mlp": 0.014328, + "step": 8301, + "time_per_iteration": 4.712237119674683 + }, + { + "auxiliary_loss_clip": 0.01117549, + "auxiliary_loss_mlp": 0.01033931, + "balance_loss_clip": 1.04239345, + "balance_loss_mlp": 1.02225471, + "epoch": 0.4991432436494814, + "flos": 24194889453600.0, + "grad_norm": 1.7672491867762274, + "language_loss": 0.84555465, + "learning_rate": 2.102578126623879e-06, + "loss": 0.86706948, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11682129, + "step": 8302, + "time_per_iteration": 4.2247209548950195 + }, + { + "auxiliary_loss_clip": 0.01120753, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.04499137, + "balance_loss_mlp": 1.01732743, + "epoch": 0.4992033669021494, + "flos": 18451371283200.0, + "grad_norm": 2.412450434859097, + "language_loss": 0.69552088, + "learning_rate": 2.102189175590024e-06, + "loss": 0.71701336, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11169434, + "step": 8303, + "time_per_iteration": 2.614687204360962 + }, + { + "auxiliary_loss_clip": 0.0112411, + "auxiliary_loss_mlp": 0.01028141, + "balance_loss_clip": 1.04465592, + "balance_loss_mlp": 1.01614285, + "epoch": 0.49926349015481736, + "flos": 38081178263520.0, + "grad_norm": 1.9642904661201117, + "language_loss": 0.72555822, + "learning_rate": 2.101800220681144e-06, + "loss": 0.74708074, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.11999512, + "step": 8304, + "time_per_iteration": 2.8745059967041016 + }, + { + "auxiliary_loss_clip": 0.01124269, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.04606557, + "balance_loss_mlp": 1.0211184, + "epoch": 0.4993236134074853, + "flos": 30383588509920.0, + "grad_norm": 1.9730603457938216, + "language_loss": 0.8041361, + "learning_rate": 2.10141126191199e-06, + "loss": 0.82570446, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.11437988, + "step": 8305, + "time_per_iteration": 2.70176362991333 + }, + { + "auxiliary_loss_clip": 0.01045995, + "auxiliary_loss_mlp": 0.01000678, + "balance_loss_clip": 1.02117813, + "balance_loss_mlp": 0.99940372, + "epoch": 0.4993837366601533, + "flos": 85928952686400.0, + "grad_norm": 0.7035857087164861, + "language_loss": 0.56889498, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.58936173, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.01274109, + "step": 8306, + "time_per_iteration": 3.3457157611846924 + }, + { + "auxiliary_loss_clip": 0.01125672, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.04767203, + "balance_loss_mlp": 1.02274728, + "epoch": 0.4994438599128213, + "flos": 19475542579680.0, + "grad_norm": 1.8300149929080225, + "language_loss": 0.82309216, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.84470427, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12792969, + "step": 8307, + "time_per_iteration": 2.6542890071868896 + }, + { + "auxiliary_loss_clip": 0.01122135, + "auxiliary_loss_mlp": 0.01036107, + "balance_loss_clip": 1.04439163, + "balance_loss_mlp": 1.02365005, + "epoch": 0.4995039831654893, + "flos": 34078723570080.0, + "grad_norm": 1.763550344933323, + "language_loss": 0.60604656, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.62762904, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12469482, + "step": 8308, + "time_per_iteration": 2.690558433532715 + }, + { + "auxiliary_loss_clip": 0.01119615, + "auxiliary_loss_mlp": 0.01028091, + "balance_loss_clip": 1.04418325, + "balance_loss_mlp": 1.01672411, + "epoch": 0.49956410641815724, + "flos": 29535196289760.0, + "grad_norm": 1.672882415312557, + "language_loss": 0.74581021, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.76728725, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11364746, + "step": 8309, + "time_per_iteration": 4.094190359115601 + }, + { + "auxiliary_loss_clip": 0.01122769, + "auxiliary_loss_mlp": 0.01034328, + "balance_loss_clip": 1.04447293, + "balance_loss_mlp": 1.02295494, + "epoch": 0.4996242296708252, + "flos": 19742675421600.0, + "grad_norm": 2.215416853288599, + "language_loss": 0.79914385, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.82071477, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.1137085, + "step": 8310, + "time_per_iteration": 2.6163485050201416 + }, + { + "auxiliary_loss_clip": 0.01125101, + "auxiliary_loss_mlp": 0.01038402, + "balance_loss_clip": 1.04504323, + "balance_loss_mlp": 1.02740526, + "epoch": 0.49968435292349317, + "flos": 20588798674080.0, + "grad_norm": 1.582248371391699, + "language_loss": 0.70602769, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.72766268, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.10992432, + "step": 8311, + "time_per_iteration": 2.6617486476898193 + }, + { + "auxiliary_loss_clip": 0.01123313, + "auxiliary_loss_mlp": 0.0103362, + "balance_loss_clip": 1.04635382, + "balance_loss_mlp": 1.02215242, + "epoch": 0.49974447617616113, + "flos": 18228801098880.0, + "grad_norm": 2.0206688291030126, + "language_loss": 0.76921022, + "learning_rate": 2.098688443679187e-06, + "loss": 0.79077959, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11474609, + "step": 8312, + "time_per_iteration": 2.6590325832366943 + }, + { + "auxiliary_loss_clip": 0.01124217, + "auxiliary_loss_mlp": 0.01034167, + "balance_loss_clip": 1.04635453, + "balance_loss_mlp": 1.02213848, + "epoch": 0.4998045994288291, + "flos": 32520691762560.0, + "grad_norm": 1.9500835864230701, + "language_loss": 0.84322345, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.86480725, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12036133, + "step": 8313, + "time_per_iteration": 2.6918656826019287 + }, + { + "auxiliary_loss_clip": 0.01121538, + "auxiliary_loss_mlp": 0.0102925, + "balance_loss_clip": 1.04382229, + "balance_loss_mlp": 1.01737678, + "epoch": 0.49986472268149706, + "flos": 25568390452320.0, + "grad_norm": 1.8305253816409375, + "language_loss": 0.80826795, + "learning_rate": 2.097910461710939e-06, + "loss": 0.82977581, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.11871338, + "step": 8314, + "time_per_iteration": 4.064682960510254 + }, + { + "auxiliary_loss_clip": 0.01124851, + "auxiliary_loss_mlp": 0.01041387, + "balance_loss_clip": 1.04635167, + "balance_loss_mlp": 1.02846479, + "epoch": 0.49992484593416503, + "flos": 27801101784960.0, + "grad_norm": 1.74442147748031, + "language_loss": 0.79742318, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.81908554, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12921143, + "step": 8315, + "time_per_iteration": 2.661604404449463 + }, + { + "auxiliary_loss_clip": 0.01122695, + "auxiliary_loss_mlp": 0.01031778, + "balance_loss_clip": 1.04606104, + "balance_loss_mlp": 1.02038741, + "epoch": 0.499984969186833, + "flos": 57091929593760.0, + "grad_norm": 1.7243575316273807, + "language_loss": 0.74528348, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.76682824, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.11401367, + "step": 8316, + "time_per_iteration": 3.018725633621216 + }, + { + "auxiliary_loss_clip": 0.01119725, + "auxiliary_loss_mlp": 0.01030532, + "balance_loss_clip": 1.04499102, + "balance_loss_mlp": 1.01966572, + "epoch": 0.500045092439501, + "flos": 31185027552960.0, + "grad_norm": 1.5645516425532842, + "language_loss": 0.81211686, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.83361942, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.10864258, + "step": 8317, + "time_per_iteration": 2.7450687885284424 + }, + { + "auxiliary_loss_clip": 0.01122665, + "auxiliary_loss_mlp": 0.01037194, + "balance_loss_clip": 1.04355896, + "balance_loss_mlp": 1.02451587, + "epoch": 0.5001052156921689, + "flos": 25263947545920.0, + "grad_norm": 2.8710122420023363, + "language_loss": 0.83086014, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.85245872, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12677002, + "step": 8318, + "time_per_iteration": 2.6953694820404053 + }, + { + "auxiliary_loss_clip": 0.01121784, + "auxiliary_loss_mlp": 0.01031769, + "balance_loss_clip": 1.0439539, + "balance_loss_mlp": 1.02000284, + "epoch": 0.500165338944837, + "flos": 25930726201440.0, + "grad_norm": 2.6978795160229594, + "language_loss": 0.81581855, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.83735406, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.11767578, + "step": 8319, + "time_per_iteration": 2.6926486492156982 + }, + { + "auxiliary_loss_clip": 0.01121598, + "auxiliary_loss_mlp": 0.0102586, + "balance_loss_clip": 1.04318666, + "balance_loss_mlp": 1.01460087, + "epoch": 0.5002254621975049, + "flos": 33988625840160.0, + "grad_norm": 1.6821368053597365, + "language_loss": 0.71804571, + "learning_rate": 2.095576427171635e-06, + "loss": 0.73952019, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.1126709, + "step": 8320, + "time_per_iteration": 2.6929550170898438 + }, + { + "auxiliary_loss_clip": 0.01128453, + "auxiliary_loss_mlp": 0.01041038, + "balance_loss_clip": 1.04365087, + "balance_loss_mlp": 1.02782965, + "epoch": 0.5002855854501729, + "flos": 18976844717280.0, + "grad_norm": 4.5687374424157765, + "language_loss": 0.7708739, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.7925688, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 0.84863281, + "router_z_loss_mlp": 0.13208008, + "step": 8321, + "time_per_iteration": 2.6638474464416504 + }, + { + "auxiliary_loss_clip": 0.011237, + "auxiliary_loss_mlp": 0.01041262, + "balance_loss_clip": 1.04516733, + "balance_loss_mlp": 1.02934158, + "epoch": 0.5003457087028408, + "flos": 19653914761920.0, + "grad_norm": 1.7724787365105268, + "language_loss": 0.8327294, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.85437906, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.11907959, + "step": 8322, + "time_per_iteration": 2.611323833465576 + }, + { + "auxiliary_loss_clip": 0.01125805, + "auxiliary_loss_mlp": 0.01037536, + "balance_loss_clip": 1.04632854, + "balance_loss_mlp": 1.02582932, + "epoch": 0.5004058319555088, + "flos": 27711611814240.0, + "grad_norm": 2.9657263124146347, + "language_loss": 0.73477519, + "learning_rate": 2.094409360775228e-06, + "loss": 0.75640857, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.11712646, + "step": 8323, + "time_per_iteration": 2.6750328540802 + }, + { + "auxiliary_loss_clip": 0.01123182, + "auxiliary_loss_mlp": 0.01034325, + "balance_loss_clip": 1.04448652, + "balance_loss_mlp": 1.0214628, + "epoch": 0.5004659552081767, + "flos": 36750659748480.0, + "grad_norm": 1.7132205387954214, + "language_loss": 0.6958034, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.7173785, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12878418, + "step": 8324, + "time_per_iteration": 2.704403877258301 + }, + { + "auxiliary_loss_clip": 0.0112133, + "auxiliary_loss_mlp": 0.01032727, + "balance_loss_clip": 1.04295301, + "balance_loss_mlp": 1.02029324, + "epoch": 0.5005260784608447, + "flos": 22725739857600.0, + "grad_norm": 2.6911438349410592, + "language_loss": 0.7222259, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.74376643, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12432861, + "step": 8325, + "time_per_iteration": 2.671090602874756 + }, + { + "auxiliary_loss_clip": 0.01123038, + "auxiliary_loss_mlp": 0.01036511, + "balance_loss_clip": 1.04365158, + "balance_loss_mlp": 1.02335656, + "epoch": 0.5005862017135126, + "flos": 30334852572480.0, + "grad_norm": 21.5391852493824, + "language_loss": 0.73497891, + "learning_rate": 2.093242262158709e-06, + "loss": 0.75657439, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13165283, + "step": 8326, + "time_per_iteration": 2.6797280311584473 + }, + { + "auxiliary_loss_clip": 0.01120867, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.04258275, + "balance_loss_mlp": 1.02222061, + "epoch": 0.5006463249661807, + "flos": 22859549382240.0, + "grad_norm": 1.6235217879563228, + "language_loss": 0.78139102, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.8029393, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.11730957, + "step": 8327, + "time_per_iteration": 2.7191216945648193 + }, + { + "auxiliary_loss_clip": 0.01125323, + "auxiliary_loss_mlp": 0.01038768, + "balance_loss_clip": 1.04587603, + "balance_loss_mlp": 1.02675796, + "epoch": 0.5007064482188487, + "flos": 15913487733120.0, + "grad_norm": 3.076211954623044, + "language_loss": 0.87758094, + "learning_rate": 2.092464178710997e-06, + "loss": 0.89922178, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12023926, + "step": 8328, + "time_per_iteration": 2.6793272495269775 + }, + { + "auxiliary_loss_clip": 0.01123832, + "auxiliary_loss_mlp": 0.0103855, + "balance_loss_clip": 1.04210186, + "balance_loss_mlp": 1.02644467, + "epoch": 0.5007665714715166, + "flos": 25976463860160.0, + "grad_norm": 2.635125876280289, + "language_loss": 0.74300617, + "learning_rate": 2.092075131720388e-06, + "loss": 0.76462996, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.12103271, + "step": 8329, + "time_per_iteration": 2.6519293785095215 + }, + { + "auxiliary_loss_clip": 0.01121273, + "auxiliary_loss_mlp": 0.01034499, + "balance_loss_clip": 1.04508221, + "balance_loss_mlp": 1.02344215, + "epoch": 0.5008266947241846, + "flos": 36307099553760.0, + "grad_norm": 2.126813647071611, + "language_loss": 0.79271865, + "learning_rate": 2.091686081238281e-06, + "loss": 0.8142764, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11053467, + "step": 8330, + "time_per_iteration": 2.7704217433929443 + }, + { + "auxiliary_loss_clip": 0.01046563, + "auxiliary_loss_mlp": 0.01007528, + "balance_loss_clip": 1.02157164, + "balance_loss_mlp": 1.00631356, + "epoch": 0.5008868179768525, + "flos": 77554819612800.0, + "grad_norm": 0.7369668013760797, + "language_loss": 0.56086951, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.58141041, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 0.01213074, + "step": 8331, + "time_per_iteration": 3.082960605621338 + }, + { + "auxiliary_loss_clip": 0.01119817, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.04406583, + "balance_loss_mlp": 1.02173829, + "epoch": 0.5009469412295205, + "flos": 33404813873280.0, + "grad_norm": 4.717806809171888, + "language_loss": 0.65291107, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.67443734, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11071777, + "step": 8332, + "time_per_iteration": 2.690059185028076 + }, + { + "auxiliary_loss_clip": 0.01119947, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.0440414, + "balance_loss_mlp": 1.01997745, + "epoch": 0.5010070644821885, + "flos": 33408419911200.0, + "grad_norm": 1.495020026025913, + "language_loss": 0.74801803, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.769526, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.10876465, + "step": 8333, + "time_per_iteration": 2.6779403686523438 + }, + { + "auxiliary_loss_clip": 0.01122522, + "auxiliary_loss_mlp": 0.01032214, + "balance_loss_clip": 1.04355538, + "balance_loss_mlp": 1.02073479, + "epoch": 0.5010671877348565, + "flos": 25213185744480.0, + "grad_norm": 2.1385342942569983, + "language_loss": 0.80571002, + "learning_rate": 2.090129844689929e-06, + "loss": 0.82725734, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.1149292, + "step": 8334, + "time_per_iteration": 2.6957950592041016 + }, + { + "auxiliary_loss_clip": 0.0104435, + "auxiliary_loss_mlp": 0.01002917, + "balance_loss_clip": 1.01944828, + "balance_loss_mlp": 1.00174236, + "epoch": 0.5011273109875244, + "flos": 72147955586400.0, + "grad_norm": 0.8982843924606755, + "language_loss": 0.62569964, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64617229, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.24890137, + "router_z_loss_mlp": 0.01174164, + "step": 8335, + "time_per_iteration": 3.2222561836242676 + }, + { + "auxiliary_loss_clip": 0.01119512, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.04276752, + "balance_loss_mlp": 1.01729488, + "epoch": 0.5011874342401924, + "flos": 30915949881600.0, + "grad_norm": 1.4552712540074055, + "language_loss": 0.79752511, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.81900692, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.11376953, + "step": 8336, + "time_per_iteration": 2.719966173171997 + }, + { + "auxiliary_loss_clip": 0.01119712, + "auxiliary_loss_mlp": 0.0102812, + "balance_loss_clip": 1.0413537, + "balance_loss_mlp": 1.01594925, + "epoch": 0.5012475574928603, + "flos": 24684592479840.0, + "grad_norm": 2.0314707714464837, + "language_loss": 0.80349064, + "learning_rate": 2.088962631340836e-06, + "loss": 0.82496893, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12176514, + "step": 8337, + "time_per_iteration": 2.7435731887817383 + }, + { + "auxiliary_loss_clip": 0.01124374, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.04235649, + "balance_loss_mlp": 1.01907396, + "epoch": 0.5013076807455283, + "flos": 27711166124160.0, + "grad_norm": 1.9285826297057493, + "language_loss": 0.79064173, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.81219876, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.12231445, + "step": 8338, + "time_per_iteration": 2.6940393447875977 + }, + { + "auxiliary_loss_clip": 0.01120052, + "auxiliary_loss_mlp": 0.01026878, + "balance_loss_clip": 1.04159474, + "balance_loss_mlp": 1.0145756, + "epoch": 0.5013678039981962, + "flos": 29584094296320.0, + "grad_norm": 2.0078703534588986, + "language_loss": 0.84913516, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87060452, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.1229248, + "step": 8339, + "time_per_iteration": 2.7093305587768555 + }, + { + "auxiliary_loss_clip": 0.0111973, + "auxiliary_loss_mlp": 0.01035509, + "balance_loss_clip": 1.04179358, + "balance_loss_mlp": 1.02346277, + "epoch": 0.5014279272508643, + "flos": 31941296179200.0, + "grad_norm": 1.4398160684991144, + "language_loss": 0.70797116, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.72952354, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.1204834, + "step": 8340, + "time_per_iteration": 2.7039239406585693 + }, + { + "auxiliary_loss_clip": 0.01125758, + "auxiliary_loss_mlp": 0.01031141, + "balance_loss_clip": 1.04498792, + "balance_loss_mlp": 1.01827788, + "epoch": 0.5014880505035323, + "flos": 26149852416960.0, + "grad_norm": 2.1137399474392833, + "language_loss": 0.77667588, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.79824483, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.12866211, + "step": 8341, + "time_per_iteration": 4.093735694885254 + }, + { + "auxiliary_loss_clip": 0.01124881, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.04365063, + "balance_loss_mlp": 1.02041078, + "epoch": 0.5015481737562002, + "flos": 19243856007360.0, + "grad_norm": 2.2966911249331874, + "language_loss": 0.89188075, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.9134599, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12634277, + "step": 8342, + "time_per_iteration": 4.04123592376709 + }, + { + "auxiliary_loss_clip": 0.01119662, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.04180717, + "balance_loss_mlp": 1.01956773, + "epoch": 0.5016082970088682, + "flos": 32739736943520.0, + "grad_norm": 2.1553334885184565, + "language_loss": 0.76552284, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.78704202, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12695312, + "step": 8343, + "time_per_iteration": 2.6789019107818604 + }, + { + "auxiliary_loss_clip": 0.01120058, + "auxiliary_loss_mlp": 0.01024934, + "balance_loss_clip": 1.04379523, + "balance_loss_mlp": 1.01334691, + "epoch": 0.5016684202615361, + "flos": 26198507319840.0, + "grad_norm": 2.4576937333497897, + "language_loss": 0.67294747, + "learning_rate": 2.086239016143293e-06, + "loss": 0.69439739, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.11590576, + "step": 8344, + "time_per_iteration": 2.6677329540252686 + }, + { + "auxiliary_loss_clip": 0.01122448, + "auxiliary_loss_mlp": 0.01034159, + "balance_loss_clip": 1.04363394, + "balance_loss_mlp": 1.02230406, + "epoch": 0.5017285435142042, + "flos": 32027139594720.0, + "grad_norm": 2.3992074326392943, + "language_loss": 0.7569558, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.7785219, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.11853027, + "step": 8345, + "time_per_iteration": 2.671773672103882 + }, + { + "auxiliary_loss_clip": 0.01121608, + "auxiliary_loss_mlp": 0.01031039, + "balance_loss_clip": 1.04409719, + "balance_loss_mlp": 1.01868868, + "epoch": 0.5017886667668721, + "flos": 25352789240160.0, + "grad_norm": 2.083182917411861, + "language_loss": 0.78382444, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.8053509, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12359619, + "step": 8346, + "time_per_iteration": 2.743790626525879 + }, + { + "auxiliary_loss_clip": 0.01121758, + "auxiliary_loss_mlp": 0.01032045, + "balance_loss_clip": 1.04259491, + "balance_loss_mlp": 1.0196178, + "epoch": 0.5018487900195401, + "flos": 24594899922720.0, + "grad_norm": 1.859711602998707, + "language_loss": 0.69115466, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.71269268, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12432861, + "step": 8347, + "time_per_iteration": 2.713510751724243 + }, + { + "auxiliary_loss_clip": 0.01122461, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.0420692, + "balance_loss_mlp": 1.01894534, + "epoch": 0.501908913272208, + "flos": 22147276171680.0, + "grad_norm": 2.3477505461001873, + "language_loss": 0.71366352, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.73519939, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.12200928, + "step": 8348, + "time_per_iteration": 4.065944671630859 + }, + { + "auxiliary_loss_clip": 0.01118773, + "auxiliary_loss_mlp": 0.01028634, + "balance_loss_clip": 1.04324996, + "balance_loss_mlp": 1.01725554, + "epoch": 0.501969036524876, + "flos": 28201841565120.0, + "grad_norm": 1.4754390428200768, + "language_loss": 0.73897588, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.76044995, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11376953, + "step": 8349, + "time_per_iteration": 2.722105026245117 + }, + { + "auxiliary_loss_clip": 0.01122425, + "auxiliary_loss_mlp": 0.0102946, + "balance_loss_clip": 1.04205859, + "balance_loss_mlp": 1.01643085, + "epoch": 0.5020291597775439, + "flos": 13864780484640.0, + "grad_norm": 2.2583530716627895, + "language_loss": 0.64328665, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.66480553, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13018799, + "step": 8350, + "time_per_iteration": 2.681840419769287 + }, + { + "auxiliary_loss_clip": 0.01041516, + "auxiliary_loss_mlp": 0.01000578, + "balance_loss_clip": 1.01662779, + "balance_loss_mlp": 0.99929094, + "epoch": 0.5020892830302119, + "flos": 78106507727040.0, + "grad_norm": 0.7768475013193106, + "language_loss": 0.59803104, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.61845201, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.24890137, + "router_z_loss_mlp": 0.01287842, + "step": 8351, + "time_per_iteration": 3.396073579788208 + }, + { + "auxiliary_loss_clip": 0.01122919, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.043015, + "balance_loss_mlp": 1.01983631, + "epoch": 0.5021494062828799, + "flos": 28959082606080.0, + "grad_norm": 1.7389573137080898, + "language_loss": 0.75324529, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.7748, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.1272583, + "step": 8352, + "time_per_iteration": 2.66190242767334 + }, + { + "auxiliary_loss_clip": 0.01123124, + "auxiliary_loss_mlp": 0.01029037, + "balance_loss_clip": 1.04457283, + "balance_loss_mlp": 1.01685381, + "epoch": 0.5022095295355479, + "flos": 26328346151040.0, + "grad_norm": 2.12492874007482, + "language_loss": 0.71803868, + "learning_rate": 2.082736990429464e-06, + "loss": 0.73956037, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12188721, + "step": 8353, + "time_per_iteration": 2.6317434310913086 + }, + { + "auxiliary_loss_clip": 0.01126468, + "auxiliary_loss_mlp": 0.01034823, + "balance_loss_clip": 1.04713607, + "balance_loss_mlp": 1.02107799, + "epoch": 0.5022696527882159, + "flos": 26109017349120.0, + "grad_norm": 1.7620963275793422, + "language_loss": 0.74152017, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.76313305, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13751221, + "step": 8354, + "time_per_iteration": 4.050204515457153 + }, + { + "auxiliary_loss_clip": 0.01121489, + "auxiliary_loss_mlp": 0.01034913, + "balance_loss_clip": 1.04430068, + "balance_loss_mlp": 1.02247989, + "epoch": 0.5023297760408838, + "flos": 33142178449440.0, + "grad_norm": 1.9512849464126063, + "language_loss": 0.72434741, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.74591148, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12438965, + "step": 8355, + "time_per_iteration": 2.6717169284820557 + }, + { + "auxiliary_loss_clip": 0.01124336, + "auxiliary_loss_mlp": 0.01034847, + "balance_loss_clip": 1.04321444, + "balance_loss_mlp": 1.02185369, + "epoch": 0.5023898992935518, + "flos": 31986547630560.0, + "grad_norm": 2.107917216451312, + "language_loss": 0.81276739, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83435917, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.13000488, + "step": 8356, + "time_per_iteration": 2.7202868461608887 + }, + { + "auxiliary_loss_clip": 0.0112572, + "auxiliary_loss_mlp": 0.01035684, + "balance_loss_clip": 1.04219472, + "balance_loss_mlp": 1.02202237, + "epoch": 0.5024500225462197, + "flos": 16399665756000.0, + "grad_norm": 3.0516071618535676, + "language_loss": 0.76444232, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.7860564, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 0.83496094, + "router_z_loss_mlp": 0.13665771, + "step": 8357, + "time_per_iteration": 2.68400239944458 + }, + { + "auxiliary_loss_clip": 0.01123098, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.04296958, + "balance_loss_mlp": 1.02208328, + "epoch": 0.5025101457988878, + "flos": 26331547016160.0, + "grad_norm": 2.0957810306480167, + "language_loss": 0.7594797, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78105992, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12835693, + "step": 8358, + "time_per_iteration": 2.6242570877075195 + }, + { + "auxiliary_loss_clip": 0.01122173, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.04256654, + "balance_loss_mlp": 1.02332282, + "epoch": 0.5025702690515557, + "flos": 30072298183200.0, + "grad_norm": 2.80055797300621, + "language_loss": 0.72079933, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.7423811, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12683105, + "step": 8359, + "time_per_iteration": 2.674839496612549 + }, + { + "auxiliary_loss_clip": 0.01120459, + "auxiliary_loss_mlp": 0.01038538, + "balance_loss_clip": 1.04246104, + "balance_loss_mlp": 1.02642035, + "epoch": 0.5026303923042237, + "flos": 26954613876960.0, + "grad_norm": 1.7905025119950262, + "language_loss": 0.76830333, + "learning_rate": 2.080013016407077e-06, + "loss": 0.78989333, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12121582, + "step": 8360, + "time_per_iteration": 2.634162664413452 + }, + { + "auxiliary_loss_clip": 0.01122601, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.04463291, + "balance_loss_mlp": 1.02304006, + "epoch": 0.5026905155568916, + "flos": 28915492363200.0, + "grad_norm": 1.7641692224693701, + "language_loss": 0.76770383, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.7892766, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.11633301, + "step": 8361, + "time_per_iteration": 2.681318998336792 + }, + { + "auxiliary_loss_clip": 0.01121872, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.04125404, + "balance_loss_mlp": 1.02128983, + "epoch": 0.5027506388095596, + "flos": 31496723052480.0, + "grad_norm": 1.6397216910035968, + "language_loss": 0.84867936, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.87024581, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.1348877, + "step": 8362, + "time_per_iteration": 2.6827170848846436 + }, + { + "auxiliary_loss_clip": 0.01120941, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.04055595, + "balance_loss_mlp": 1.01926255, + "epoch": 0.5028107620622275, + "flos": 33588574853760.0, + "grad_norm": 1.6952657790154853, + "language_loss": 0.78707981, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.80860424, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12231445, + "step": 8363, + "time_per_iteration": 2.765819787979126 + }, + { + "auxiliary_loss_clip": 0.0111903, + "auxiliary_loss_mlp": 0.0102828, + "balance_loss_clip": 1.04339278, + "balance_loss_mlp": 1.01559591, + "epoch": 0.5028708853148955, + "flos": 29937151588320.0, + "grad_norm": 2.8520878942709706, + "language_loss": 0.75721419, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.7786873, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12695312, + "step": 8364, + "time_per_iteration": 2.6954495906829834 + }, + { + "auxiliary_loss_clip": 0.01118683, + "auxiliary_loss_mlp": 0.01029645, + "balance_loss_clip": 1.04174316, + "balance_loss_mlp": 1.01804006, + "epoch": 0.5029310085675635, + "flos": 25396825173120.0, + "grad_norm": 1.6893208138850353, + "language_loss": 0.69366413, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.71514744, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11608887, + "step": 8365, + "time_per_iteration": 2.6880970001220703 + }, + { + "auxiliary_loss_clip": 0.0112566, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.04356408, + "balance_loss_mlp": 1.01755297, + "epoch": 0.5029911318202315, + "flos": 27262784373120.0, + "grad_norm": 1.5116011485080687, + "language_loss": 0.73185652, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.75342137, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13275146, + "step": 8366, + "time_per_iteration": 2.6446118354797363 + }, + { + "auxiliary_loss_clip": 0.01122816, + "auxiliary_loss_mlp": 0.01032566, + "balance_loss_clip": 1.0453701, + "balance_loss_mlp": 1.02035296, + "epoch": 0.5030512550728995, + "flos": 29715513301440.0, + "grad_norm": 1.9201438673620024, + "language_loss": 0.78230727, + "learning_rate": 2.077288893713735e-06, + "loss": 0.80386108, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12213135, + "step": 8367, + "time_per_iteration": 2.682757616043091 + }, + { + "auxiliary_loss_clip": 0.01119883, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.04176211, + "balance_loss_mlp": 1.01779616, + "epoch": 0.5031113783255674, + "flos": 22280599488960.0, + "grad_norm": 3.7040142027303817, + "language_loss": 0.69721735, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.71871233, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.1182251, + "step": 8368, + "time_per_iteration": 2.602989912033081 + }, + { + "auxiliary_loss_clip": 0.01042342, + "auxiliary_loss_mlp": 0.01007108, + "balance_loss_clip": 1.01747084, + "balance_loss_mlp": 1.00591183, + "epoch": 0.5031715015782354, + "flos": 69860714345280.0, + "grad_norm": 1.0980803558815564, + "language_loss": 0.63332206, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65381658, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.24890137, + "router_z_loss_mlp": 0.01195526, + "step": 8369, + "time_per_iteration": 3.2333545684814453 + }, + { + "auxiliary_loss_clip": 0.0112022, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.04257226, + "balance_loss_mlp": 1.01927388, + "epoch": 0.5032316248309033, + "flos": 33589425716640.0, + "grad_norm": 2.6036880967384044, + "language_loss": 0.60320067, + "learning_rate": 2.076121368302263e-06, + "loss": 0.62471044, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.11480713, + "step": 8370, + "time_per_iteration": 2.7288126945495605 + }, + { + "auxiliary_loss_clip": 0.01122642, + "auxiliary_loss_mlp": 0.01037057, + "balance_loss_clip": 1.04145336, + "balance_loss_mlp": 1.0233475, + "epoch": 0.5032917480835714, + "flos": 42092911414080.0, + "grad_norm": 2.4103043076716877, + "language_loss": 0.68566644, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.70726341, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.13696289, + "step": 8371, + "time_per_iteration": 2.760979652404785 + }, + { + "auxiliary_loss_clip": 0.01121227, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.04189193, + "balance_loss_mlp": 1.01959026, + "epoch": 0.5033518713362393, + "flos": 41069023738560.0, + "grad_norm": 1.957723036439061, + "language_loss": 0.67373228, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.69527149, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.13104248, + "step": 8372, + "time_per_iteration": 2.757408618927002 + }, + { + "auxiliary_loss_clip": 0.01120497, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.04069948, + "balance_loss_mlp": 1.0211432, + "epoch": 0.5034119945889073, + "flos": 34392404416320.0, + "grad_norm": 1.7262222925208648, + "language_loss": 0.67148328, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.69303632, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13665771, + "step": 8373, + "time_per_iteration": 2.7377967834472656 + }, + { + "auxiliary_loss_clip": 0.0111889, + "auxiliary_loss_mlp": 0.01030594, + "balance_loss_clip": 1.04095852, + "balance_loss_mlp": 1.01797628, + "epoch": 0.5034721178415752, + "flos": 26062185723840.0, + "grad_norm": 1.6497309602443726, + "language_loss": 0.74439263, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.7658875, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.1262207, + "step": 8374, + "time_per_iteration": 2.7300965785980225 + }, + { + "auxiliary_loss_clip": 0.01122905, + "auxiliary_loss_mlp": 0.01039993, + "balance_loss_clip": 1.04307187, + "balance_loss_mlp": 1.02689743, + "epoch": 0.5035322410942432, + "flos": 27667981054080.0, + "grad_norm": 1.7925942861719446, + "language_loss": 0.68571317, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70734215, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13085938, + "step": 8375, + "time_per_iteration": 2.6202902793884277 + }, + { + "auxiliary_loss_clip": 0.01125358, + "auxiliary_loss_mlp": 0.01036364, + "balance_loss_clip": 1.04344475, + "balance_loss_mlp": 1.0225538, + "epoch": 0.5035923643469111, + "flos": 24194970488160.0, + "grad_norm": 1.7803469940342735, + "language_loss": 0.79020017, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.81181741, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13806152, + "step": 8376, + "time_per_iteration": 2.6824872493743896 + }, + { + "auxiliary_loss_clip": 0.01124667, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.04217005, + "balance_loss_mlp": 1.01950169, + "epoch": 0.5036524875995791, + "flos": 37235136045600.0, + "grad_norm": 3.739882446069665, + "language_loss": 0.59612429, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.61769378, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 0.82519531, + "router_z_loss_mlp": 0.12786865, + "step": 8377, + "time_per_iteration": 2.6830644607543945 + }, + { + "auxiliary_loss_clip": 0.01120293, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.04104614, + "balance_loss_mlp": 1.01965642, + "epoch": 0.5037126108522471, + "flos": 17961627739680.0, + "grad_norm": 2.698711854902398, + "language_loss": 0.76052988, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.78205645, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.1270752, + "step": 8378, + "time_per_iteration": 2.676384925842285 + }, + { + "auxiliary_loss_clip": 0.01119009, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.04126811, + "balance_loss_mlp": 1.02131176, + "epoch": 0.5037727341049151, + "flos": 30868996704480.0, + "grad_norm": 1.771753242960945, + "language_loss": 0.74874067, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.77026522, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12133789, + "step": 8379, + "time_per_iteration": 2.6744256019592285 + }, + { + "auxiliary_loss_clip": 0.01119622, + "auxiliary_loss_mlp": 0.01031753, + "balance_loss_clip": 1.04345918, + "balance_loss_mlp": 1.02011776, + "epoch": 0.5038328573575831, + "flos": 34827942189600.0, + "grad_norm": 5.466350089076057, + "language_loss": 0.66153526, + "learning_rate": 2.072229431544548e-06, + "loss": 0.68304902, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11633301, + "step": 8380, + "time_per_iteration": 5.546520471572876 + }, + { + "auxiliary_loss_clip": 0.01120109, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.04296267, + "balance_loss_mlp": 1.01960051, + "epoch": 0.503892980610251, + "flos": 38620062917280.0, + "grad_norm": 2.099979529403909, + "language_loss": 0.63789654, + "learning_rate": 2.071840222561051e-06, + "loss": 0.65941191, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11846924, + "step": 8381, + "time_per_iteration": 2.7125792503356934 + }, + { + "auxiliary_loss_clip": 0.01118194, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.04048109, + "balance_loss_mlp": 1.02293348, + "epoch": 0.503953103862919, + "flos": 33054957446400.0, + "grad_norm": 1.7083083289836842, + "language_loss": 0.67658013, + "learning_rate": 2.071451010853365e-06, + "loss": 0.69810951, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11804199, + "step": 8382, + "time_per_iteration": 2.7232062816619873 + }, + { + "auxiliary_loss_clip": 0.0112839, + "auxiliary_loss_mlp": 0.01041991, + "balance_loss_clip": 1.04379654, + "balance_loss_mlp": 1.02831793, + "epoch": 0.5040132271155869, + "flos": 19074964868640.0, + "grad_norm": 1.9251629132951358, + "language_loss": 0.6216929, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.64339674, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.13665771, + "step": 8383, + "time_per_iteration": 2.5913898944854736 + }, + { + "auxiliary_loss_clip": 0.01118197, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.04106593, + "balance_loss_mlp": 1.01797748, + "epoch": 0.504073350368255, + "flos": 16582089666240.0, + "grad_norm": 1.9387415975870164, + "language_loss": 0.67179048, + "learning_rate": 2.070672579324465e-06, + "loss": 0.69327313, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12084961, + "step": 8384, + "time_per_iteration": 2.6123576164245605 + }, + { + "auxiliary_loss_clip": 0.01119784, + "auxiliary_loss_mlp": 0.01036802, + "balance_loss_clip": 1.04106855, + "balance_loss_mlp": 1.02500081, + "epoch": 0.5041334736209229, + "flos": 35457572849760.0, + "grad_norm": 3.3674386517039543, + "language_loss": 0.70874131, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.7303071, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.11798096, + "step": 8385, + "time_per_iteration": 2.71927809715271 + }, + { + "auxiliary_loss_clip": 0.01116826, + "auxiliary_loss_mlp": 0.01026648, + "balance_loss_clip": 1.04135847, + "balance_loss_mlp": 1.01464939, + "epoch": 0.5041935968735909, + "flos": 30027451904640.0, + "grad_norm": 2.186795468391051, + "language_loss": 0.83204281, + "learning_rate": 2.069894137075919e-06, + "loss": 0.8534776, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11999512, + "step": 8386, + "time_per_iteration": 2.673983097076416 + }, + { + "auxiliary_loss_clip": 0.01121126, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.04212332, + "balance_loss_mlp": 1.02076602, + "epoch": 0.5042537201262588, + "flos": 32075753980320.0, + "grad_norm": 1.5636919660276045, + "language_loss": 0.66468203, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.68623114, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13037109, + "step": 8387, + "time_per_iteration": 2.6690616607666016 + }, + { + "auxiliary_loss_clip": 0.01122438, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.04409814, + "balance_loss_mlp": 1.01930571, + "epoch": 0.5043138433789268, + "flos": 26865772182720.0, + "grad_norm": 6.990633000531097, + "language_loss": 0.80365109, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.82518578, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.11737061, + "step": 8388, + "time_per_iteration": 4.20738959312439 + }, + { + "auxiliary_loss_clip": 0.0111933, + "auxiliary_loss_mlp": 0.01028268, + "balance_loss_clip": 1.041641, + "balance_loss_mlp": 1.01662135, + "epoch": 0.5043739666315947, + "flos": 35102327624640.0, + "grad_norm": 2.8448245107505734, + "language_loss": 0.70102978, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.72250581, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.11645508, + "step": 8389, + "time_per_iteration": 2.695955514907837 + }, + { + "auxiliary_loss_clip": 0.01121319, + "auxiliary_loss_mlp": 0.01035928, + "balance_loss_clip": 1.0407033, + "balance_loss_mlp": 1.0242157, + "epoch": 0.5044340898842627, + "flos": 33677254478880.0, + "grad_norm": 2.0248365379380107, + "language_loss": 0.69731259, + "learning_rate": 2.068337220892191e-06, + "loss": 0.71888506, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.11730957, + "step": 8390, + "time_per_iteration": 2.6880393028259277 + }, + { + "auxiliary_loss_clip": 0.01038858, + "auxiliary_loss_mlp": 0.01002517, + "balance_loss_clip": 1.01414132, + "balance_loss_mlp": 1.00121701, + "epoch": 0.5044942131369307, + "flos": 82314515347200.0, + "grad_norm": 0.8357785263356736, + "language_loss": 0.53055555, + "learning_rate": 2.067947985330974e-06, + "loss": 0.55096936, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.24719238, + "router_z_loss_mlp": 0.01300812, + "step": 8391, + "time_per_iteration": 3.0762696266174316 + }, + { + "auxiliary_loss_clip": 0.01038734, + "auxiliary_loss_mlp": 0.01001527, + "balance_loss_clip": 1.01380086, + "balance_loss_mlp": 1.00030279, + "epoch": 0.5045543363895987, + "flos": 71542669461120.0, + "grad_norm": 0.8511341210183541, + "language_loss": 0.60646975, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.6268723, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.24938965, + "router_z_loss_mlp": 0.01222992, + "step": 8392, + "time_per_iteration": 3.0523006916046143 + }, + { + "auxiliary_loss_clip": 0.01116381, + "auxiliary_loss_mlp": 0.01032427, + "balance_loss_clip": 1.04055095, + "balance_loss_mlp": 1.02028537, + "epoch": 0.5046144596422667, + "flos": 27487339904160.0, + "grad_norm": 2.096638810402029, + "language_loss": 0.84496284, + "learning_rate": 2.067169506493517e-06, + "loss": 0.86645085, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12145996, + "step": 8393, + "time_per_iteration": 2.690981388092041 + }, + { + "auxiliary_loss_clip": 0.01119763, + "auxiliary_loss_mlp": 0.01027993, + "balance_loss_clip": 1.04144406, + "balance_loss_mlp": 1.01638794, + "epoch": 0.5046745828949346, + "flos": 33500219366880.0, + "grad_norm": 1.980733448193002, + "language_loss": 0.51307136, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.53454888, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.11602783, + "step": 8394, + "time_per_iteration": 4.020546913146973 + }, + { + "auxiliary_loss_clip": 0.01119674, + "auxiliary_loss_mlp": 0.01030829, + "balance_loss_clip": 1.04018056, + "balance_loss_mlp": 1.01735854, + "epoch": 0.5047347061476026, + "flos": 21078177562080.0, + "grad_norm": 1.909070161333667, + "language_loss": 0.74994481, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.7714498, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.1348877, + "step": 8395, + "time_per_iteration": 2.6197798252105713 + }, + { + "auxiliary_loss_clip": 0.01119442, + "auxiliary_loss_mlp": 0.0103464, + "balance_loss_clip": 1.0411638, + "balance_loss_mlp": 1.02258813, + "epoch": 0.5047948294002705, + "flos": 20315912378400.0, + "grad_norm": 4.95763059890729, + "language_loss": 0.67896259, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.70050335, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12036133, + "step": 8396, + "time_per_iteration": 2.638380527496338 + }, + { + "auxiliary_loss_clip": 0.01120541, + "auxiliary_loss_mlp": 0.01030196, + "balance_loss_clip": 1.0424937, + "balance_loss_mlp": 1.01837683, + "epoch": 0.5048549526529386, + "flos": 32782476323520.0, + "grad_norm": 1.8081091078854663, + "language_loss": 0.78745818, + "learning_rate": 2.065612518371792e-06, + "loss": 0.80896556, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.1182251, + "step": 8397, + "time_per_iteration": 2.644864082336426 + }, + { + "auxiliary_loss_clip": 0.01117636, + "auxiliary_loss_mlp": 0.01029168, + "balance_loss_clip": 1.0407418, + "balance_loss_mlp": 1.01763439, + "epoch": 0.5049150759056065, + "flos": 26642026997280.0, + "grad_norm": 1.5892668251148152, + "language_loss": 0.66344243, + "learning_rate": 2.065223265084376e-06, + "loss": 0.68491048, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11535645, + "step": 8398, + "time_per_iteration": 2.6425669193267822 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.01030813, + "balance_loss_clip": 1.04274023, + "balance_loss_mlp": 1.01849246, + "epoch": 0.5049751991582745, + "flos": 26461466881920.0, + "grad_norm": 2.393957327647368, + "language_loss": 0.71822333, + "learning_rate": 2.064834009323688e-06, + "loss": 0.73974329, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12310791, + "step": 8399, + "time_per_iteration": 2.6087028980255127 + }, + { + "auxiliary_loss_clip": 0.01121767, + "auxiliary_loss_mlp": 0.01041272, + "balance_loss_clip": 1.04204679, + "balance_loss_mlp": 1.02912498, + "epoch": 0.5050353224109424, + "flos": 26062955552160.0, + "grad_norm": 2.441707366633838, + "language_loss": 0.81508112, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.83671141, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.12127686, + "step": 8400, + "time_per_iteration": 2.649935483932495 + }, + { + "auxiliary_loss_clip": 0.01121005, + "auxiliary_loss_mlp": 0.01029681, + "balance_loss_clip": 1.04197919, + "balance_loss_mlp": 1.01686561, + "epoch": 0.5050954456636104, + "flos": 27845299787040.0, + "grad_norm": 2.293333791908763, + "language_loss": 0.78728044, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.80878735, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.1282959, + "step": 8401, + "time_per_iteration": 2.644531488418579 + }, + { + "auxiliary_loss_clip": 0.01122071, + "auxiliary_loss_mlp": 0.01030282, + "balance_loss_clip": 1.0412358, + "balance_loss_mlp": 1.01791418, + "epoch": 0.5051555689162783, + "flos": 37153506427200.0, + "grad_norm": 1.957350538574791, + "language_loss": 0.69806015, + "learning_rate": 2.063666227349593e-06, + "loss": 0.71958363, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.12365723, + "step": 8402, + "time_per_iteration": 2.716616630554199 + }, + { + "auxiliary_loss_clip": 0.01117793, + "auxiliary_loss_mlp": 0.01027298, + "balance_loss_clip": 1.03910577, + "balance_loss_mlp": 1.01550817, + "epoch": 0.5052156921689464, + "flos": 25975653514560.0, + "grad_norm": 2.4746975155991797, + "language_loss": 0.69267809, + "learning_rate": 2.063276961843422e-06, + "loss": 0.71412897, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.11791992, + "step": 8403, + "time_per_iteration": 2.689707040786743 + }, + { + "auxiliary_loss_clip": 0.0111722, + "auxiliary_loss_mlp": 0.01035725, + "balance_loss_clip": 1.04179478, + "balance_loss_mlp": 1.02366698, + "epoch": 0.5052758154216143, + "flos": 30604700072160.0, + "grad_norm": 1.437054118947304, + "language_loss": 0.85642993, + "learning_rate": 2.062887693937781e-06, + "loss": 0.87795943, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12054443, + "step": 8404, + "time_per_iteration": 2.6657376289367676 + }, + { + "auxiliary_loss_clip": 0.01121877, + "auxiliary_loss_mlp": 0.01034406, + "balance_loss_clip": 1.04355907, + "balance_loss_mlp": 1.02218699, + "epoch": 0.5053359386742823, + "flos": 25484653935360.0, + "grad_norm": 1.5678550580780233, + "language_loss": 0.7552259, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.77678877, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12219238, + "step": 8405, + "time_per_iteration": 2.62492299079895 + }, + { + "auxiliary_loss_clip": 0.01120488, + "auxiliary_loss_mlp": 0.01026997, + "balance_loss_clip": 1.04109478, + "balance_loss_mlp": 1.01415229, + "epoch": 0.5053960619269503, + "flos": 46055746558080.0, + "grad_norm": 1.6747688316942562, + "language_loss": 0.72670639, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.74818122, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12823486, + "step": 8406, + "time_per_iteration": 2.7814929485321045 + }, + { + "auxiliary_loss_clip": 0.01114342, + "auxiliary_loss_mlp": 0.01027635, + "balance_loss_clip": 1.03929305, + "balance_loss_mlp": 1.01591063, + "epoch": 0.5054561851796182, + "flos": 28691828212320.0, + "grad_norm": 2.1211805487132036, + "language_loss": 0.770257, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.79167676, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11724854, + "step": 8407, + "time_per_iteration": 2.6858198642730713 + }, + { + "auxiliary_loss_clip": 0.01120912, + "auxiliary_loss_mlp": 0.01028288, + "balance_loss_clip": 1.04061842, + "balance_loss_mlp": 1.01642084, + "epoch": 0.5055163084322862, + "flos": 37107282561120.0, + "grad_norm": 2.375233162922586, + "language_loss": 0.63257259, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.6540646, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.11865234, + "step": 8408, + "time_per_iteration": 2.7076661586761475 + }, + { + "auxiliary_loss_clip": 0.01120641, + "auxiliary_loss_mlp": 0.01031251, + "balance_loss_clip": 1.04274583, + "balance_loss_mlp": 1.01832235, + "epoch": 0.5055764316849541, + "flos": 24721456854240.0, + "grad_norm": 1.6807258478096052, + "language_loss": 0.63233227, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.65385121, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.1293335, + "step": 8409, + "time_per_iteration": 2.649606704711914 + }, + { + "auxiliary_loss_clip": 0.01117507, + "auxiliary_loss_mlp": 0.01026595, + "balance_loss_clip": 1.04168546, + "balance_loss_mlp": 1.01555634, + "epoch": 0.5056365549376222, + "flos": 31811862520800.0, + "grad_norm": 1.3531909688029362, + "language_loss": 0.70395625, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.72539723, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11035156, + "step": 8410, + "time_per_iteration": 2.7231643199920654 + }, + { + "auxiliary_loss_clip": 0.01120926, + "auxiliary_loss_mlp": 0.01037625, + "balance_loss_clip": 1.041731, + "balance_loss_mlp": 1.02458978, + "epoch": 0.5056966781902901, + "flos": 23524464242880.0, + "grad_norm": 1.7884341660778744, + "language_loss": 0.79107261, + "learning_rate": 2.060162752653113e-06, + "loss": 0.81265813, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13024902, + "step": 8411, + "time_per_iteration": 2.633934736251831 + }, + { + "auxiliary_loss_clip": 0.01120764, + "auxiliary_loss_mlp": 0.01037118, + "balance_loss_clip": 1.04119849, + "balance_loss_mlp": 1.0235579, + "epoch": 0.5057568014429581, + "flos": 26018919619200.0, + "grad_norm": 2.1039597638115, + "language_loss": 0.81477857, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.83635736, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13568115, + "step": 8412, + "time_per_iteration": 2.6428630352020264 + }, + { + "auxiliary_loss_clip": 0.01123183, + "auxiliary_loss_mlp": 0.01036845, + "balance_loss_clip": 1.04440379, + "balance_loss_mlp": 1.02455437, + "epoch": 0.505816924695626, + "flos": 21345472473120.0, + "grad_norm": 1.809719942399805, + "language_loss": 0.80819082, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.82979107, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12286377, + "step": 8413, + "time_per_iteration": 2.6740856170654297 + }, + { + "auxiliary_loss_clip": 0.01121247, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.04117179, + "balance_loss_mlp": 1.02118874, + "epoch": 0.505877047948294, + "flos": 25797645987840.0, + "grad_norm": 2.114540698543727, + "language_loss": 0.80395234, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.82550681, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13012695, + "step": 8414, + "time_per_iteration": 2.60847806930542 + }, + { + "auxiliary_loss_clip": 0.01118872, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.03869975, + "balance_loss_mlp": 1.0169934, + "epoch": 0.5059371712009619, + "flos": 44356166425440.0, + "grad_norm": 2.32465775637812, + "language_loss": 0.62003535, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64151746, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12335205, + "step": 8415, + "time_per_iteration": 2.7029244899749756 + }, + { + "auxiliary_loss_clip": 0.01118804, + "auxiliary_loss_mlp": 0.01028041, + "balance_loss_clip": 1.03978944, + "balance_loss_mlp": 1.01600051, + "epoch": 0.50599729445363, + "flos": 27666360362880.0, + "grad_norm": 2.2472250117124988, + "language_loss": 0.8203637, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.84183216, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12036133, + "step": 8416, + "time_per_iteration": 2.634428024291992 + }, + { + "auxiliary_loss_clip": 0.01120385, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.04370832, + "balance_loss_mlp": 1.02411127, + "epoch": 0.5060574177062979, + "flos": 27757106369280.0, + "grad_norm": 2.366630384092906, + "language_loss": 0.7914173, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81297934, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.11706543, + "step": 8417, + "time_per_iteration": 2.707581043243408 + }, + { + "auxiliary_loss_clip": 0.01115938, + "auxiliary_loss_mlp": 0.01032246, + "balance_loss_clip": 1.04039216, + "balance_loss_mlp": 1.02037239, + "epoch": 0.5061175409589659, + "flos": 26421320607840.0, + "grad_norm": 2.069140983147364, + "language_loss": 0.62609285, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.64757466, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11877441, + "step": 8418, + "time_per_iteration": 2.6383895874023438 + }, + { + "auxiliary_loss_clip": 0.01121579, + "auxiliary_loss_mlp": 0.01031175, + "balance_loss_clip": 1.04175472, + "balance_loss_mlp": 1.01919401, + "epoch": 0.5061776642116339, + "flos": 26376271742880.0, + "grad_norm": 3.070372219484016, + "language_loss": 0.76908785, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79061538, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.11981201, + "step": 8419, + "time_per_iteration": 2.60544753074646 + }, + { + "auxiliary_loss_clip": 0.01120832, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.04062831, + "balance_loss_mlp": 1.01821923, + "epoch": 0.5062377874643018, + "flos": 29805246375840.0, + "grad_norm": 1.8264271022507363, + "language_loss": 0.7700358, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.79155338, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12701416, + "step": 8420, + "time_per_iteration": 5.698000192642212 + }, + { + "auxiliary_loss_clip": 0.0112001, + "auxiliary_loss_mlp": 0.01034689, + "balance_loss_clip": 1.04036975, + "balance_loss_mlp": 1.02208889, + "epoch": 0.5062979107169698, + "flos": 27484868350080.0, + "grad_norm": 6.754043886287056, + "language_loss": 0.77474225, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79628927, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.12615967, + "step": 8421, + "time_per_iteration": 2.6348321437835693 + }, + { + "auxiliary_loss_clip": 0.01117986, + "auxiliary_loss_mlp": 0.01028052, + "balance_loss_clip": 1.04012895, + "balance_loss_mlp": 1.01634002, + "epoch": 0.5063580339696377, + "flos": 29983051316160.0, + "grad_norm": 1.5675675992865374, + "language_loss": 0.66537791, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.68683827, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.1171875, + "step": 8422, + "time_per_iteration": 2.692826986312866 + }, + { + "auxiliary_loss_clip": 0.01120487, + "auxiliary_loss_mlp": 0.01031963, + "balance_loss_clip": 1.04317462, + "balance_loss_mlp": 1.01965427, + "epoch": 0.5064181572223058, + "flos": 27572494525920.0, + "grad_norm": 3.01823989115901, + "language_loss": 0.81921834, + "learning_rate": 2.05549116746431e-06, + "loss": 0.84074283, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12304688, + "step": 8423, + "time_per_iteration": 2.69995379447937 + }, + { + "auxiliary_loss_clip": 0.01120846, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.04107594, + "balance_loss_mlp": 1.01805067, + "epoch": 0.5064782804749737, + "flos": 31719455305920.0, + "grad_norm": 2.230408387384884, + "language_loss": 0.74295294, + "learning_rate": 2.055101854669237e-06, + "loss": 0.7644766, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13470459, + "step": 8424, + "time_per_iteration": 2.64973521232605 + }, + { + "auxiliary_loss_clip": 0.01118354, + "auxiliary_loss_mlp": 0.01033522, + "balance_loss_clip": 1.04120886, + "balance_loss_mlp": 1.02097523, + "epoch": 0.5065384037276417, + "flos": 34840664615520.0, + "grad_norm": 1.5369189283203453, + "language_loss": 0.71973026, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.74124908, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12554932, + "step": 8425, + "time_per_iteration": 2.7186219692230225 + }, + { + "auxiliary_loss_clip": 0.01120114, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.04125333, + "balance_loss_mlp": 1.02086782, + "epoch": 0.5065985269803096, + "flos": 28023388348320.0, + "grad_norm": 2.191999191558343, + "language_loss": 0.79133868, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.8128655, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.11706543, + "step": 8426, + "time_per_iteration": 2.6265342235565186 + }, + { + "auxiliary_loss_clip": 0.0112346, + "auxiliary_loss_mlp": 0.0103209, + "balance_loss_clip": 1.04412687, + "balance_loss_mlp": 1.02004361, + "epoch": 0.5066586502329776, + "flos": 26367601044960.0, + "grad_norm": 2.1269780274711096, + "language_loss": 0.77104723, + "learning_rate": 2.053933903806265e-06, + "loss": 0.79260272, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12042236, + "step": 8427, + "time_per_iteration": 4.185874700546265 + }, + { + "auxiliary_loss_clip": 0.01116129, + "auxiliary_loss_mlp": 0.01026554, + "balance_loss_clip": 1.03930354, + "balance_loss_mlp": 1.01409662, + "epoch": 0.5067187734856455, + "flos": 24818726142720.0, + "grad_norm": 3.125348413260029, + "language_loss": 0.71725315, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.73867995, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12457275, + "step": 8428, + "time_per_iteration": 2.642937660217285 + }, + { + "auxiliary_loss_clip": 0.01116627, + "auxiliary_loss_mlp": 0.01030146, + "balance_loss_clip": 1.03854895, + "balance_loss_mlp": 1.01847541, + "epoch": 0.5067788967383136, + "flos": 35192506389120.0, + "grad_norm": 1.8532479926561223, + "language_loss": 0.82722729, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.84869504, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.11676025, + "step": 8429, + "time_per_iteration": 2.693115234375 + }, + { + "auxiliary_loss_clip": 0.01123684, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.04161465, + "balance_loss_mlp": 1.02298617, + "epoch": 0.5068390199909815, + "flos": 39596875863840.0, + "grad_norm": 3.230586379448412, + "language_loss": 0.73378575, + "learning_rate": 2.052765934536682e-06, + "loss": 0.75538993, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 0.13745117, + "step": 8430, + "time_per_iteration": 2.723269462585449 + }, + { + "auxiliary_loss_clip": 0.01120295, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.04109561, + "balance_loss_mlp": 1.01896846, + "epoch": 0.5068991432436495, + "flos": 28243892151360.0, + "grad_norm": 1.7739255108775023, + "language_loss": 0.76814687, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.7896601, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.1204834, + "step": 8431, + "time_per_iteration": 2.6652376651763916 + }, + { + "auxiliary_loss_clip": 0.0111796, + "auxiliary_loss_mlp": 0.0102817, + "balance_loss_clip": 1.04097521, + "balance_loss_mlp": 1.01609445, + "epoch": 0.5069592664963174, + "flos": 24326186906880.0, + "grad_norm": 1.602301827778702, + "language_loss": 0.72101551, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74247682, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12091064, + "step": 8432, + "time_per_iteration": 2.602858304977417 + }, + { + "auxiliary_loss_clip": 0.01034965, + "auxiliary_loss_mlp": 0.01000251, + "balance_loss_clip": 1.01056218, + "balance_loss_mlp": 0.99902511, + "epoch": 0.5070193897489854, + "flos": 80282339148960.0, + "grad_norm": 0.7535440517286588, + "language_loss": 0.63619226, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65654445, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.01224518, + "step": 8433, + "time_per_iteration": 3.2970988750457764 + }, + { + "auxiliary_loss_clip": 0.01120093, + "auxiliary_loss_mlp": 0.01039721, + "balance_loss_clip": 1.04094923, + "balance_loss_mlp": 1.02704906, + "epoch": 0.5070795130016534, + "flos": 21079068942240.0, + "grad_norm": 1.837751933678216, + "language_loss": 0.77355063, + "learning_rate": 2.051208614233681e-06, + "loss": 0.79514873, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12677002, + "step": 8434, + "time_per_iteration": 3.969268560409546 + }, + { + "auxiliary_loss_clip": 0.01122362, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.0417099, + "balance_loss_mlp": 1.02027345, + "epoch": 0.5071396362543213, + "flos": 25709533604640.0, + "grad_norm": 2.1013512781830963, + "language_loss": 0.70746303, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.72901404, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.12457275, + "step": 8435, + "time_per_iteration": 2.654637336730957 + }, + { + "auxiliary_loss_clip": 0.01122128, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.04254532, + "balance_loss_mlp": 1.01889324, + "epoch": 0.5071997595069894, + "flos": 28241906804640.0, + "grad_norm": 2.2635097473844668, + "language_loss": 0.72185642, + "learning_rate": 2.050429942372112e-06, + "loss": 0.74340296, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.13641357, + "step": 8436, + "time_per_iteration": 2.652561902999878 + }, + { + "auxiliary_loss_clip": 0.01120619, + "auxiliary_loss_mlp": 0.01028781, + "balance_loss_clip": 1.04233527, + "balance_loss_mlp": 1.0154655, + "epoch": 0.5072598827596573, + "flos": 27756336540960.0, + "grad_norm": 1.671219002902268, + "language_loss": 0.83973324, + "learning_rate": 2.050040603565483e-06, + "loss": 0.86122727, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13323975, + "step": 8437, + "time_per_iteration": 2.61273455619812 + }, + { + "auxiliary_loss_clip": 0.01116154, + "auxiliary_loss_mlp": 0.01028575, + "balance_loss_clip": 1.03934622, + "balance_loss_mlp": 1.01633835, + "epoch": 0.5073200060123253, + "flos": 27535468082400.0, + "grad_norm": 1.502875380758534, + "language_loss": 0.80719835, + "learning_rate": 2.049651262861309e-06, + "loss": 0.82864565, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12237549, + "step": 8438, + "time_per_iteration": 2.68064022064209 + }, + { + "auxiliary_loss_clip": 0.01121997, + "auxiliary_loss_mlp": 0.01035246, + "balance_loss_clip": 1.04148769, + "balance_loss_mlp": 1.02142334, + "epoch": 0.5073801292649932, + "flos": 31489875632160.0, + "grad_norm": 2.2648591554918744, + "language_loss": 0.79756939, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.81914181, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13818359, + "step": 8439, + "time_per_iteration": 2.653233289718628 + }, + { + "auxiliary_loss_clip": 0.01118349, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.04094172, + "balance_loss_mlp": 1.02212811, + "epoch": 0.5074402525176612, + "flos": 30961728057600.0, + "grad_norm": 1.7110744101684063, + "language_loss": 0.70552409, + "learning_rate": 2.048872575819383e-06, + "loss": 0.72704601, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.11706543, + "step": 8440, + "time_per_iteration": 2.688138246536255 + }, + { + "auxiliary_loss_clip": 0.01120295, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.04047751, + "balance_loss_mlp": 1.02223897, + "epoch": 0.5075003757703291, + "flos": 31804569410400.0, + "grad_norm": 3.2974337593758705, + "language_loss": 0.70988107, + "learning_rate": 2.048483229511158e-06, + "loss": 0.73142731, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12091064, + "step": 8441, + "time_per_iteration": 2.6659185886383057 + }, + { + "auxiliary_loss_clip": 0.0112223, + "auxiliary_loss_mlp": 0.01037748, + "balance_loss_clip": 1.04165411, + "balance_loss_mlp": 1.02489734, + "epoch": 0.5075604990229972, + "flos": 26643728723040.0, + "grad_norm": 1.6579783174480827, + "language_loss": 0.63436323, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.655963, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.12854004, + "step": 8442, + "time_per_iteration": 2.639533519744873 + }, + { + "auxiliary_loss_clip": 0.01115389, + "auxiliary_loss_mlp": 0.01026551, + "balance_loss_clip": 1.03947186, + "balance_loss_mlp": 1.01513076, + "epoch": 0.5076206222756651, + "flos": 39021288904800.0, + "grad_norm": 1.5780519345432529, + "language_loss": 0.71183467, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73325408, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11413574, + "step": 8443, + "time_per_iteration": 2.7054693698883057 + }, + { + "auxiliary_loss_clip": 0.01121081, + "auxiliary_loss_mlp": 0.01037022, + "balance_loss_clip": 1.04094136, + "balance_loss_mlp": 1.02426624, + "epoch": 0.5076807455283331, + "flos": 45031575261600.0, + "grad_norm": 1.44599713377811, + "language_loss": 0.61966383, + "learning_rate": 2.047315179614607e-06, + "loss": 0.64124483, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12756348, + "step": 8444, + "time_per_iteration": 2.786837339401245 + }, + { + "auxiliary_loss_clip": 0.01118537, + "auxiliary_loss_mlp": 0.01030628, + "balance_loss_clip": 1.04071236, + "balance_loss_mlp": 1.0187068, + "epoch": 0.507740868781001, + "flos": 36438842697120.0, + "grad_norm": 1.7714913895821829, + "language_loss": 0.63652134, + "learning_rate": 2.046925826041012e-06, + "loss": 0.65801299, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.1192627, + "step": 8445, + "time_per_iteration": 2.6922125816345215 + }, + { + "auxiliary_loss_clip": 0.01037284, + "auxiliary_loss_mlp": 0.01006649, + "balance_loss_clip": 1.01254916, + "balance_loss_mlp": 1.00553191, + "epoch": 0.507800992033669, + "flos": 75554686232640.0, + "grad_norm": 0.8272377267439393, + "language_loss": 0.6193285, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.63976783, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.01117706, + "step": 8446, + "time_per_iteration": 3.296165704727173 + }, + { + "auxiliary_loss_clip": 0.01116185, + "auxiliary_loss_mlp": 0.01027366, + "balance_loss_clip": 1.03905129, + "balance_loss_mlp": 1.01554608, + "epoch": 0.507861115286337, + "flos": 25259125989600.0, + "grad_norm": 1.733780365168888, + "language_loss": 0.8054691, + "learning_rate": 2.04614711357029e-06, + "loss": 0.82690459, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.11828613, + "step": 8447, + "time_per_iteration": 2.6323935985565186 + }, + { + "auxiliary_loss_clip": 0.01117915, + "auxiliary_loss_mlp": 0.01029653, + "balance_loss_clip": 1.04264259, + "balance_loss_mlp": 1.01780915, + "epoch": 0.507921238539005, + "flos": 37640413761120.0, + "grad_norm": 1.4338466530199006, + "language_loss": 0.70408291, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.72555858, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.1184082, + "step": 8448, + "time_per_iteration": 2.6994457244873047 + }, + { + "auxiliary_loss_clip": 0.01117425, + "auxiliary_loss_mlp": 0.01027159, + "balance_loss_clip": 1.04133642, + "balance_loss_mlp": 1.01604855, + "epoch": 0.507981361791673, + "flos": 43562547217440.0, + "grad_norm": 1.6338478260574125, + "language_loss": 0.71817243, + "learning_rate": 2.045368394099955e-06, + "loss": 0.7396183, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11108398, + "step": 8449, + "time_per_iteration": 2.7624123096466064 + }, + { + "auxiliary_loss_clip": 0.01114806, + "auxiliary_loss_mlp": 0.01029519, + "balance_loss_clip": 1.03861821, + "balance_loss_mlp": 1.0179975, + "epoch": 0.5080414850443409, + "flos": 33143029312320.0, + "grad_norm": 1.4091119237061107, + "language_loss": 0.72546273, + "learning_rate": 2.044979031776844e-06, + "loss": 0.74690604, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11529541, + "step": 8450, + "time_per_iteration": 2.7025675773620605 + }, + { + "auxiliary_loss_clip": 0.0112129, + "auxiliary_loss_mlp": 0.01032672, + "balance_loss_clip": 1.04225731, + "balance_loss_mlp": 1.02030361, + "epoch": 0.5081016082970089, + "flos": 33050257441920.0, + "grad_norm": 2.1353785148209288, + "language_loss": 0.77173066, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.79327029, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.1237793, + "step": 8451, + "time_per_iteration": 2.674140214920044 + }, + { + "auxiliary_loss_clip": 0.01119399, + "auxiliary_loss_mlp": 0.01034046, + "balance_loss_clip": 1.04065883, + "balance_loss_mlp": 1.02218473, + "epoch": 0.5081617315496768, + "flos": 27889781410080.0, + "grad_norm": 1.9958912342091488, + "language_loss": 0.85006666, + "learning_rate": 2.044200302028559e-06, + "loss": 0.8716011, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.11859131, + "step": 8452, + "time_per_iteration": 2.6777172088623047 + }, + { + "auxiliary_loss_clip": 0.01123639, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.04209757, + "balance_loss_mlp": 1.02212954, + "epoch": 0.5082218548023448, + "flos": 19867571144640.0, + "grad_norm": 3.6168483559066624, + "language_loss": 0.7776382, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.799227, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13110352, + "step": 8453, + "time_per_iteration": 2.6130564212799072 + }, + { + "auxiliary_loss_clip": 0.01115745, + "auxiliary_loss_mlp": 0.01030397, + "balance_loss_clip": 1.04074693, + "balance_loss_mlp": 1.01885724, + "epoch": 0.5082819780550127, + "flos": 29847499548480.0, + "grad_norm": 1.748313116748458, + "language_loss": 0.76252407, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.7839855, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11541748, + "step": 8454, + "time_per_iteration": 2.683845043182373 + }, + { + "auxiliary_loss_clip": 0.01119182, + "auxiliary_loss_mlp": 0.01034025, + "balance_loss_clip": 1.04117656, + "balance_loss_mlp": 1.02149606, + "epoch": 0.5083421013076808, + "flos": 28557005755680.0, + "grad_norm": 1.7449749442074654, + "language_loss": 0.89296824, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91450036, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12536621, + "step": 8455, + "time_per_iteration": 2.7217178344726562 + }, + { + "auxiliary_loss_clip": 0.01125265, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.04248095, + "balance_loss_mlp": 1.02445602, + "epoch": 0.5084022245603487, + "flos": 29130283229760.0, + "grad_norm": 1.841957399828363, + "language_loss": 0.62341601, + "learning_rate": 2.042642822537149e-06, + "loss": 0.64504671, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13348389, + "step": 8456, + "time_per_iteration": 2.6955902576446533 + }, + { + "auxiliary_loss_clip": 0.01038335, + "auxiliary_loss_mlp": 0.01001643, + "balance_loss_clip": 1.01359844, + "balance_loss_mlp": 1.00037181, + "epoch": 0.5084623478130167, + "flos": 76718744645760.0, + "grad_norm": 0.822685057959304, + "language_loss": 0.62462795, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.6450277, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.24743652, + "router_z_loss_mlp": 0.01271057, + "step": 8457, + "time_per_iteration": 3.131058692932129 + }, + { + "auxiliary_loss_clip": 0.01120641, + "auxiliary_loss_mlp": 0.01033501, + "balance_loss_clip": 1.04152453, + "balance_loss_mlp": 1.02025688, + "epoch": 0.5085224710656846, + "flos": 27266917135680.0, + "grad_norm": 1.5734038486225599, + "language_loss": 0.67001224, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.69155365, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13244629, + "step": 8458, + "time_per_iteration": 2.6497983932495117 + }, + { + "auxiliary_loss_clip": 0.01121855, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.04114318, + "balance_loss_mlp": 1.01760864, + "epoch": 0.5085825943183526, + "flos": 31806635791680.0, + "grad_norm": 1.9223349645344585, + "language_loss": 0.77495527, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.79648244, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13262939, + "step": 8459, + "time_per_iteration": 5.38043475151062 + }, + { + "auxiliary_loss_clip": 0.01127318, + "auxiliary_loss_mlp": 0.01036762, + "balance_loss_clip": 1.0459801, + "balance_loss_mlp": 1.02376842, + "epoch": 0.5086427175710206, + "flos": 21256549744320.0, + "grad_norm": 2.4470991954555776, + "language_loss": 0.80550277, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.82714361, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12994385, + "step": 8460, + "time_per_iteration": 2.5902116298675537 + }, + { + "auxiliary_loss_clip": 0.01122175, + "auxiliary_loss_mlp": 0.01036134, + "balance_loss_clip": 1.04262602, + "balance_loss_mlp": 1.02417743, + "epoch": 0.5087028408236886, + "flos": 25174862748000.0, + "grad_norm": 1.6044284896142578, + "language_loss": 0.68763602, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.70921916, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.11962891, + "step": 8461, + "time_per_iteration": 2.645824670791626 + }, + { + "auxiliary_loss_clip": 0.01116717, + "auxiliary_loss_mlp": 0.01026974, + "balance_loss_clip": 1.04105425, + "balance_loss_mlp": 1.01470709, + "epoch": 0.5087629640763566, + "flos": 31229954866080.0, + "grad_norm": 2.0999611910292795, + "language_loss": 0.75957108, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.78100801, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12280273, + "step": 8462, + "time_per_iteration": 2.649214744567871 + }, + { + "auxiliary_loss_clip": 0.01121171, + "auxiliary_loss_mlp": 0.0103044, + "balance_loss_clip": 1.04296541, + "balance_loss_mlp": 1.0182333, + "epoch": 0.5088230873290245, + "flos": 16181268851520.0, + "grad_norm": 2.6117202579683707, + "language_loss": 0.8136214, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.83513755, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12200928, + "step": 8463, + "time_per_iteration": 2.5930001735687256 + }, + { + "auxiliary_loss_clip": 0.01120828, + "auxiliary_loss_mlp": 0.01036024, + "balance_loss_clip": 1.04391074, + "balance_loss_mlp": 1.024014, + "epoch": 0.5088832105816925, + "flos": 24457403325600.0, + "grad_norm": 1.975716355074363, + "language_loss": 0.75992405, + "learning_rate": 2.039527786882341e-06, + "loss": 0.78149259, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12005615, + "step": 8464, + "time_per_iteration": 2.6091208457946777 + }, + { + "auxiliary_loss_clip": 0.01036422, + "auxiliary_loss_mlp": 0.01001302, + "balance_loss_clip": 1.01165366, + "balance_loss_mlp": 1.00001192, + "epoch": 0.5089433338343604, + "flos": 82267325416800.0, + "grad_norm": 0.6840739789942712, + "language_loss": 0.59313881, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61351603, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.2479248, + "router_z_loss_mlp": 0.01290131, + "step": 8465, + "time_per_iteration": 3.4747989177703857 + }, + { + "auxiliary_loss_clip": 0.01118981, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.04137659, + "balance_loss_mlp": 1.02144027, + "epoch": 0.5090034570870284, + "flos": 27711409227840.0, + "grad_norm": 2.226836044996464, + "language_loss": 0.79741025, + "learning_rate": 2.038749012684354e-06, + "loss": 0.81893742, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.1229248, + "step": 8466, + "time_per_iteration": 2.638803482055664 + }, + { + "auxiliary_loss_clip": 0.01117312, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.04044533, + "balance_loss_mlp": 1.01910424, + "epoch": 0.5090635803396963, + "flos": 24948281352960.0, + "grad_norm": 2.1113214559731777, + "language_loss": 0.78540444, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.80688828, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11968994, + "step": 8467, + "time_per_iteration": 4.102391004562378 + }, + { + "auxiliary_loss_clip": 0.01116126, + "auxiliary_loss_mlp": 0.01035394, + "balance_loss_clip": 1.04155254, + "balance_loss_mlp": 1.02368784, + "epoch": 0.5091237035923644, + "flos": 29003078021760.0, + "grad_norm": 1.8343071733631626, + "language_loss": 0.74749321, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.7690084, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11706543, + "step": 8468, + "time_per_iteration": 2.6799638271331787 + }, + { + "auxiliary_loss_clip": 0.01118408, + "auxiliary_loss_mlp": 0.01027404, + "balance_loss_clip": 1.0409534, + "balance_loss_mlp": 1.01539969, + "epoch": 0.5091838268450323, + "flos": 22364011867680.0, + "grad_norm": 2.0021588026251673, + "language_loss": 0.7767756, + "learning_rate": 2.03758084040404e-06, + "loss": 0.79823375, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12011719, + "step": 8469, + "time_per_iteration": 2.630906343460083 + }, + { + "auxiliary_loss_clip": 0.01122654, + "auxiliary_loss_mlp": 0.01030826, + "balance_loss_clip": 1.04498303, + "balance_loss_mlp": 1.01816654, + "epoch": 0.5092439500977003, + "flos": 35456965090560.0, + "grad_norm": 1.7447740752849383, + "language_loss": 0.69826835, + "learning_rate": 2.037191446774109e-06, + "loss": 0.71980309, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12677002, + "step": 8470, + "time_per_iteration": 2.7136447429656982 + }, + { + "auxiliary_loss_clip": 0.01122592, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.04184985, + "balance_loss_mlp": 1.02062476, + "epoch": 0.5093040733503682, + "flos": 16537891664160.0, + "grad_norm": 3.794351067744799, + "language_loss": 0.7389859, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.76054573, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.12774658, + "step": 8471, + "time_per_iteration": 2.6060807704925537 + }, + { + "auxiliary_loss_clip": 0.01033897, + "auxiliary_loss_mlp": 0.01003293, + "balance_loss_clip": 1.00917602, + "balance_loss_mlp": 1.00209165, + "epoch": 0.5093641966030362, + "flos": 84079808157600.0, + "grad_norm": 0.7507254303135138, + "language_loss": 0.58080703, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60117894, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.24743652, + "router_z_loss_mlp": 0.01199341, + "step": 8472, + "time_per_iteration": 3.2631423473358154 + }, + { + "auxiliary_loss_clip": 0.0112028, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.0422653, + "balance_loss_mlp": 1.02168989, + "epoch": 0.5094243198557042, + "flos": 26333086672800.0, + "grad_norm": 1.9889308313403788, + "language_loss": 0.69146132, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71299458, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.11358643, + "step": 8473, + "time_per_iteration": 4.074007034301758 + }, + { + "auxiliary_loss_clip": 0.01118646, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.04083955, + "balance_loss_mlp": 1.02083707, + "epoch": 0.5094844431083722, + "flos": 35192101216320.0, + "grad_norm": 3.147430337386805, + "language_loss": 0.85859197, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.88010651, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.11962891, + "step": 8474, + "time_per_iteration": 2.6796183586120605 + }, + { + "auxiliary_loss_clip": 0.01119825, + "auxiliary_loss_mlp": 0.01030775, + "balance_loss_clip": 1.04047775, + "balance_loss_mlp": 1.01866305, + "epoch": 0.5095445663610402, + "flos": 18273444791040.0, + "grad_norm": 3.8613623349385917, + "language_loss": 0.65008742, + "learning_rate": 2.035244457765222e-06, + "loss": 0.67159343, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12109375, + "step": 8475, + "time_per_iteration": 2.6413373947143555 + }, + { + "auxiliary_loss_clip": 0.01125259, + "auxiliary_loss_mlp": 0.0103846, + "balance_loss_clip": 1.04247808, + "balance_loss_mlp": 1.02528775, + "epoch": 0.5096046896137081, + "flos": 25352262515520.0, + "grad_norm": 3.682028776876995, + "language_loss": 0.82110578, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.84274298, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.13165283, + "step": 8476, + "time_per_iteration": 2.659980058670044 + }, + { + "auxiliary_loss_clip": 0.01119661, + "auxiliary_loss_mlp": 0.0103575, + "balance_loss_clip": 1.03840792, + "balance_loss_mlp": 1.02124214, + "epoch": 0.5096648128663761, + "flos": 28290683259360.0, + "grad_norm": 6.880353704299147, + "language_loss": 0.81056833, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.83212245, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 0.81201172, + "router_z_loss_mlp": 0.1451416, + "step": 8477, + "time_per_iteration": 2.6512253284454346 + }, + { + "auxiliary_loss_clip": 0.01121049, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.04049706, + "balance_loss_mlp": 1.01514888, + "epoch": 0.509724936119044, + "flos": 27222151891680.0, + "grad_norm": 2.118259623375824, + "language_loss": 0.61393893, + "learning_rate": 2.034076248204082e-06, + "loss": 0.63543695, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13623047, + "step": 8478, + "time_per_iteration": 2.6580312252044678 + }, + { + "auxiliary_loss_clip": 0.01118139, + "auxiliary_loss_mlp": 0.01039026, + "balance_loss_clip": 1.04027796, + "balance_loss_mlp": 1.0269804, + "epoch": 0.509785059371712, + "flos": 32075916049440.0, + "grad_norm": 1.6472567671855907, + "language_loss": 0.65597898, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.67755067, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12042236, + "step": 8479, + "time_per_iteration": 2.697370767593384 + }, + { + "auxiliary_loss_clip": 0.01118297, + "auxiliary_loss_mlp": 0.01028631, + "balance_loss_clip": 1.04075873, + "balance_loss_mlp": 1.01683545, + "epoch": 0.50984518262438, + "flos": 28021767657120.0, + "grad_norm": 1.6572595540099113, + "language_loss": 0.69758737, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71905667, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.11804199, + "step": 8480, + "time_per_iteration": 2.646235704421997 + }, + { + "auxiliary_loss_clip": 0.01121562, + "auxiliary_loss_mlp": 0.01029386, + "balance_loss_clip": 1.04031372, + "balance_loss_mlp": 1.01683354, + "epoch": 0.509905305877048, + "flos": 31986061423200.0, + "grad_norm": 1.830656256033913, + "language_loss": 0.7905587, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.8120681, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.12548828, + "step": 8481, + "time_per_iteration": 2.6603903770446777 + }, + { + "auxiliary_loss_clip": 0.01115622, + "auxiliary_loss_mlp": 0.0103456, + "balance_loss_clip": 1.0398078, + "balance_loss_mlp": 1.02167368, + "epoch": 0.5099654291297159, + "flos": 24818888211840.0, + "grad_norm": 1.5519439666455335, + "language_loss": 0.83522964, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85673147, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12902832, + "step": 8482, + "time_per_iteration": 2.643596649169922 + }, + { + "auxiliary_loss_clip": 0.01122342, + "auxiliary_loss_mlp": 0.01038256, + "balance_loss_clip": 1.040833, + "balance_loss_mlp": 1.02477312, + "epoch": 0.5100255523823839, + "flos": 35453804742720.0, + "grad_norm": 1.676969150654312, + "language_loss": 0.85476053, + "learning_rate": 2.032129206622238e-06, + "loss": 0.8763665, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.1348877, + "step": 8483, + "time_per_iteration": 2.6825037002563477 + }, + { + "auxiliary_loss_clip": 0.01119385, + "auxiliary_loss_mlp": 0.01031733, + "balance_loss_clip": 1.03901398, + "balance_loss_mlp": 1.01966858, + "epoch": 0.5100856756350518, + "flos": 27400848212160.0, + "grad_norm": 2.1687171390833657, + "language_loss": 0.82995778, + "learning_rate": 2.031739794591775e-06, + "loss": 0.85146898, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.1206665, + "step": 8484, + "time_per_iteration": 2.625457763671875 + }, + { + "auxiliary_loss_clip": 0.01120776, + "auxiliary_loss_mlp": 0.01028298, + "balance_loss_clip": 1.04133964, + "balance_loss_mlp": 1.01549518, + "epoch": 0.5101457988877198, + "flos": 23392518513120.0, + "grad_norm": 2.123789872555536, + "language_loss": 0.81373143, + "learning_rate": 2.031350381357736e-06, + "loss": 0.83522224, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12799072, + "step": 8485, + "time_per_iteration": 2.5976903438568115 + }, + { + "auxiliary_loss_clip": 0.01115778, + "auxiliary_loss_mlp": 0.01030545, + "balance_loss_clip": 1.03969061, + "balance_loss_mlp": 1.01839209, + "epoch": 0.5102059221403878, + "flos": 18139554231840.0, + "grad_norm": 2.0629031680887255, + "language_loss": 0.73483086, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.75629407, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12158203, + "step": 8486, + "time_per_iteration": 2.625331163406372 + }, + { + "auxiliary_loss_clip": 0.01122506, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.04121089, + "balance_loss_mlp": 1.01708531, + "epoch": 0.5102660453930558, + "flos": 28017837480960.0, + "grad_norm": 1.6360782578856852, + "language_loss": 0.70314384, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72466624, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.12658691, + "step": 8487, + "time_per_iteration": 2.6056270599365234 + }, + { + "auxiliary_loss_clip": 0.01120459, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.04198456, + "balance_loss_mlp": 1.01925969, + "epoch": 0.5103261686457238, + "flos": 28246971464640.0, + "grad_norm": 2.2653662996221393, + "language_loss": 0.72908163, + "learning_rate": 2.030182134581827e-06, + "loss": 0.7506128, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.1340332, + "step": 8488, + "time_per_iteration": 2.6385250091552734 + }, + { + "auxiliary_loss_clip": 0.01122985, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.04195619, + "balance_loss_mlp": 1.01964879, + "epoch": 0.5103862918983917, + "flos": 17471884196160.0, + "grad_norm": 2.1946443732591883, + "language_loss": 0.69613034, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.71768415, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.12731934, + "step": 8489, + "time_per_iteration": 2.604332208633423 + }, + { + "auxiliary_loss_clip": 0.01118389, + "auxiliary_loss_mlp": 0.01034017, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.02169108, + "epoch": 0.5104464151510597, + "flos": 31540840020000.0, + "grad_norm": 1.803656732342438, + "language_loss": 0.73125815, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.75278223, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12316895, + "step": 8490, + "time_per_iteration": 2.6652233600616455 + }, + { + "auxiliary_loss_clip": 0.01117401, + "auxiliary_loss_mlp": 0.01028698, + "balance_loss_clip": 1.04051769, + "balance_loss_mlp": 1.01720047, + "epoch": 0.5105065384037276, + "flos": 26420996469600.0, + "grad_norm": 1.568676370609296, + "language_loss": 0.80973864, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.83119959, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.11499023, + "step": 8491, + "time_per_iteration": 2.632981538772583 + }, + { + "auxiliary_loss_clip": 0.01115363, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.03962827, + "balance_loss_mlp": 1.01780081, + "epoch": 0.5105666616563956, + "flos": 27444438455040.0, + "grad_norm": 2.414103093076211, + "language_loss": 0.79092848, + "learning_rate": 2.028624456259728e-06, + "loss": 0.81238109, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12103271, + "step": 8492, + "time_per_iteration": 2.622133255004883 + }, + { + "auxiliary_loss_clip": 0.01125242, + "auxiliary_loss_mlp": 0.01039807, + "balance_loss_clip": 1.04309571, + "balance_loss_mlp": 1.02661061, + "epoch": 0.5106267849090635, + "flos": 27400686143040.0, + "grad_norm": 1.895078013552319, + "language_loss": 0.77837968, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.80003011, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.13208008, + "step": 8493, + "time_per_iteration": 2.6248857975006104 + }, + { + "auxiliary_loss_clip": 0.01120776, + "auxiliary_loss_mlp": 0.01027613, + "balance_loss_clip": 1.04173493, + "balance_loss_mlp": 1.01414824, + "epoch": 0.5106869081617316, + "flos": 28732703797440.0, + "grad_norm": 2.620256331295782, + "language_loss": 0.83735514, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.85883904, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13458252, + "step": 8494, + "time_per_iteration": 2.7702343463897705 + }, + { + "auxiliary_loss_clip": 0.01125088, + "auxiliary_loss_mlp": 0.01035342, + "balance_loss_clip": 1.04414225, + "balance_loss_mlp": 1.02312875, + "epoch": 0.5107470314143995, + "flos": 32693715663840.0, + "grad_norm": 2.348035663077756, + "language_loss": 0.78920245, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81080675, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.12213135, + "step": 8495, + "time_per_iteration": 2.6811633110046387 + }, + { + "auxiliary_loss_clip": 0.01120607, + "auxiliary_loss_mlp": 0.01035325, + "balance_loss_clip": 1.04121852, + "balance_loss_mlp": 1.02206337, + "epoch": 0.5108071546670675, + "flos": 31407962392800.0, + "grad_norm": 2.369963371849336, + "language_loss": 0.78121686, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.80277622, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13244629, + "step": 8496, + "time_per_iteration": 2.681283950805664 + }, + { + "auxiliary_loss_clip": 0.01117692, + "auxiliary_loss_mlp": 0.01027622, + "balance_loss_clip": 1.04094672, + "balance_loss_mlp": 1.01577902, + "epoch": 0.5108672779197354, + "flos": 22814622069120.0, + "grad_norm": 2.3007729774168655, + "language_loss": 0.79030454, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.81175768, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.11846924, + "step": 8497, + "time_per_iteration": 2.639975070953369 + }, + { + "auxiliary_loss_clip": 0.0111829, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.04026484, + "balance_loss_mlp": 1.01846874, + "epoch": 0.5109274011724034, + "flos": 32565375972000.0, + "grad_norm": 3.8958311646791293, + "language_loss": 0.81599563, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.8374871, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12384033, + "step": 8498, + "time_per_iteration": 2.6836116313934326 + }, + { + "auxiliary_loss_clip": 0.01118291, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.04153407, + "balance_loss_mlp": 1.01963067, + "epoch": 0.5109875244250714, + "flos": 27801263854080.0, + "grad_norm": 27.366016696793046, + "language_loss": 0.70726472, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.72877496, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.13085938, + "step": 8499, + "time_per_iteration": 4.079880714416504 + }, + { + "auxiliary_loss_clip": 0.01119667, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.04077125, + "balance_loss_mlp": 1.01907253, + "epoch": 0.5110476476777394, + "flos": 43426468725120.0, + "grad_norm": 1.6637246755430037, + "language_loss": 0.72561377, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.74713033, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12927246, + "step": 8500, + "time_per_iteration": 2.7815353870391846 + }, + { + "auxiliary_loss_clip": 0.01125036, + "auxiliary_loss_mlp": 0.01032251, + "balance_loss_clip": 1.04150879, + "balance_loss_mlp": 1.01786804, + "epoch": 0.5111077709304074, + "flos": 23526409072320.0, + "grad_norm": 3.531662236179588, + "language_loss": 0.62831533, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.64988816, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 0.83447266, + "router_z_loss_mlp": 0.14373779, + "step": 8501, + "time_per_iteration": 2.6309666633605957 + }, + { + "auxiliary_loss_clip": 0.01120751, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.03919888, + "balance_loss_mlp": 1.02134109, + "epoch": 0.5111678941830753, + "flos": 25219344371040.0, + "grad_norm": 2.009222928084499, + "language_loss": 0.87172818, + "learning_rate": 2.024730186540907e-06, + "loss": 0.8932811, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13201904, + "step": 8502, + "time_per_iteration": 2.788881540298462 + }, + { + "auxiliary_loss_clip": 0.0111834, + "auxiliary_loss_mlp": 0.01035273, + "balance_loss_clip": 1.03924119, + "balance_loss_mlp": 1.0230124, + "epoch": 0.5112280174357433, + "flos": 32076847946880.0, + "grad_norm": 1.5686499491119692, + "language_loss": 0.82390362, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.84543979, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12261963, + "step": 8503, + "time_per_iteration": 2.6701807975769043 + }, + { + "auxiliary_loss_clip": 0.01041714, + "auxiliary_loss_mlp": 0.0100068, + "balance_loss_clip": 1.01669192, + "balance_loss_mlp": 0.99948359, + "epoch": 0.5112881406884112, + "flos": 72571459727520.0, + "grad_norm": 0.8563067082034909, + "language_loss": 0.63861603, + "learning_rate": 2.023951320871339e-06, + "loss": 0.65903997, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 0.24987793, + "router_z_loss_mlp": 0.01194763, + "step": 8504, + "time_per_iteration": 3.2980339527130127 + }, + { + "auxiliary_loss_clip": 0.01118813, + "auxiliary_loss_mlp": 0.01029807, + "balance_loss_clip": 1.04093993, + "balance_loss_mlp": 1.01617515, + "epoch": 0.5113482639410792, + "flos": 32297797440000.0, + "grad_norm": 2.410692125573568, + "language_loss": 0.83617717, + "learning_rate": 2.023561886666816e-06, + "loss": 0.85766333, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.1362915, + "step": 8505, + "time_per_iteration": 2.6772847175598145 + }, + { + "auxiliary_loss_clip": 0.01119549, + "auxiliary_loss_mlp": 0.01027601, + "balance_loss_clip": 1.0418849, + "balance_loss_mlp": 1.01496458, + "epoch": 0.5114083871937471, + "flos": 36479920868640.0, + "grad_norm": 2.650309466818654, + "language_loss": 0.75512969, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.77660125, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12634277, + "step": 8506, + "time_per_iteration": 4.2122156620025635 + }, + { + "auxiliary_loss_clip": 0.01121504, + "auxiliary_loss_mlp": 0.0103347, + "balance_loss_clip": 1.04211307, + "balance_loss_mlp": 1.01995122, + "epoch": 0.5114685104464152, + "flos": 29669127366240.0, + "grad_norm": 2.4691086938629674, + "language_loss": 0.58129859, + "learning_rate": 2.022783015592131e-06, + "loss": 0.60284829, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.13525391, + "step": 8507, + "time_per_iteration": 2.6767477989196777 + }, + { + "auxiliary_loss_clip": 0.0112398, + "auxiliary_loss_mlp": 0.0104211, + "balance_loss_clip": 1.04392803, + "balance_loss_mlp": 1.02903831, + "epoch": 0.5115286336990831, + "flos": 20766684648960.0, + "grad_norm": 2.001024735711025, + "language_loss": 0.85581863, + "learning_rate": 2.022393578751503e-06, + "loss": 0.87747955, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13079834, + "step": 8508, + "time_per_iteration": 2.5954155921936035 + }, + { + "auxiliary_loss_clip": 0.01123214, + "auxiliary_loss_mlp": 0.01036648, + "balance_loss_clip": 1.04218221, + "balance_loss_mlp": 1.02301049, + "epoch": 0.5115887569517511, + "flos": 28869471083520.0, + "grad_norm": 2.0285196142172017, + "language_loss": 0.72282553, + "learning_rate": 2.022004141061709e-06, + "loss": 0.74442416, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.1362915, + "step": 8509, + "time_per_iteration": 2.708237409591675 + }, + { + "auxiliary_loss_clip": 0.01117635, + "auxiliary_loss_mlp": 0.01031963, + "balance_loss_clip": 1.04120624, + "balance_loss_mlp": 1.02007723, + "epoch": 0.511648880204419, + "flos": 19653671658240.0, + "grad_norm": 1.9833565954843235, + "language_loss": 0.76130807, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.78280407, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11889648, + "step": 8510, + "time_per_iteration": 2.5911009311676025 + }, + { + "auxiliary_loss_clip": 0.01119889, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.04264975, + "balance_loss_mlp": 1.01901603, + "epoch": 0.511709003457087, + "flos": 39822160705920.0, + "grad_norm": 1.6996943298397442, + "language_loss": 0.70823896, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.72974813, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12011719, + "step": 8511, + "time_per_iteration": 2.7969729900360107 + }, + { + "auxiliary_loss_clip": 0.01119645, + "auxiliary_loss_mlp": 0.01029219, + "balance_loss_clip": 1.04290032, + "balance_loss_mlp": 1.01725006, + "epoch": 0.511769126709755, + "flos": 26554238752320.0, + "grad_norm": 1.9025062781408775, + "language_loss": 0.66491216, + "learning_rate": 2.020835823045001e-06, + "loss": 0.68640083, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.11975098, + "step": 8512, + "time_per_iteration": 4.00014591217041 + }, + { + "auxiliary_loss_clip": 0.01121741, + "auxiliary_loss_mlp": 0.01037578, + "balance_loss_clip": 1.04074812, + "balance_loss_mlp": 1.0238812, + "epoch": 0.511829249962423, + "flos": 29181774342240.0, + "grad_norm": 2.4167487825097367, + "language_loss": 0.66453171, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.68612492, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13690186, + "step": 8513, + "time_per_iteration": 2.664081573486328 + }, + { + "auxiliary_loss_clip": 0.01118981, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.04142261, + "balance_loss_mlp": 1.01929271, + "epoch": 0.511889373215091, + "flos": 28952478289440.0, + "grad_norm": 2.200658582660975, + "language_loss": 0.68825364, + "learning_rate": 2.0200569403921e-06, + "loss": 0.70976931, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13287354, + "step": 8514, + "time_per_iteration": 2.676555633544922 + }, + { + "auxiliary_loss_clip": 0.01117047, + "auxiliary_loss_mlp": 0.01032832, + "balance_loss_clip": 1.03974819, + "balance_loss_mlp": 1.02120888, + "epoch": 0.5119494964677589, + "flos": 34302225651840.0, + "grad_norm": 1.6192265510843589, + "language_loss": 0.65803099, + "learning_rate": 2.019667497917424e-06, + "loss": 0.67952979, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.11608887, + "step": 8515, + "time_per_iteration": 2.6738436222076416 + }, + { + "auxiliary_loss_clip": 0.01117935, + "auxiliary_loss_mlp": 0.0102764, + "balance_loss_clip": 1.03958631, + "balance_loss_mlp": 1.01583791, + "epoch": 0.5120096197204269, + "flos": 30472673307840.0, + "grad_norm": 5.600873744083003, + "language_loss": 0.74807119, + "learning_rate": 2.019278054696955e-06, + "loss": 0.76952696, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.11798096, + "step": 8516, + "time_per_iteration": 2.7020440101623535 + }, + { + "auxiliary_loss_clip": 0.01121866, + "auxiliary_loss_mlp": 0.01035968, + "balance_loss_clip": 1.0424242, + "balance_loss_mlp": 1.02357602, + "epoch": 0.5120697429730948, + "flos": 21924381849120.0, + "grad_norm": 2.18766275780688, + "language_loss": 0.77690208, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.79848039, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12402344, + "step": 8517, + "time_per_iteration": 2.58162522315979 + }, + { + "auxiliary_loss_clip": 0.01123622, + "auxiliary_loss_mlp": 0.01033803, + "balance_loss_clip": 1.04200613, + "balance_loss_mlp": 1.02071381, + "epoch": 0.5121298662257628, + "flos": 28421778126240.0, + "grad_norm": 2.1237658340877177, + "language_loss": 0.73795056, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.75952482, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13092041, + "step": 8518, + "time_per_iteration": 2.6546523571014404 + }, + { + "auxiliary_loss_clip": 0.01119599, + "auxiliary_loss_mlp": 0.01036409, + "balance_loss_clip": 1.04047775, + "balance_loss_mlp": 1.02360582, + "epoch": 0.5121899894784308, + "flos": 21122740219680.0, + "grad_norm": 2.073200348031618, + "language_loss": 0.78012604, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80168611, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12805176, + "step": 8519, + "time_per_iteration": 2.60404372215271 + }, + { + "auxiliary_loss_clip": 0.01122216, + "auxiliary_loss_mlp": 0.01038046, + "balance_loss_clip": 1.04317141, + "balance_loss_mlp": 1.02481961, + "epoch": 0.5122501127310988, + "flos": 30420655470720.0, + "grad_norm": 1.9203536645013728, + "language_loss": 0.79209739, + "learning_rate": 2.017720274652497e-06, + "loss": 0.81369996, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13226318, + "step": 8520, + "time_per_iteration": 2.741638660430908 + }, + { + "auxiliary_loss_clip": 0.01124482, + "auxiliary_loss_mlp": 0.0103824, + "balance_loss_clip": 1.04090381, + "balance_loss_mlp": 1.02409613, + "epoch": 0.5123102359837667, + "flos": 22503777432480.0, + "grad_norm": 1.9054968245410508, + "language_loss": 0.81111306, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.83274031, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.14147949, + "step": 8521, + "time_per_iteration": 2.791378974914551 + }, + { + "auxiliary_loss_clip": 0.0111719, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.03779387, + "balance_loss_mlp": 1.01801956, + "epoch": 0.5123703592364347, + "flos": 32561648382240.0, + "grad_norm": 2.0768905769217474, + "language_loss": 0.68398768, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70547307, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13323975, + "step": 8522, + "time_per_iteration": 2.646075487136841 + }, + { + "auxiliary_loss_clip": 0.01127975, + "auxiliary_loss_mlp": 0.01042727, + "balance_loss_clip": 1.04376698, + "balance_loss_mlp": 1.02588844, + "epoch": 0.5124304824891026, + "flos": 35147254937760.0, + "grad_norm": 2.776020708137361, + "language_loss": 0.61509788, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.63680482, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.16815186, + "step": 8523, + "time_per_iteration": 2.7200913429260254 + }, + { + "auxiliary_loss_clip": 0.01123251, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.04351079, + "balance_loss_mlp": 1.02366996, + "epoch": 0.5124906057417706, + "flos": 26553955131360.0, + "grad_norm": 2.2413968068040524, + "language_loss": 0.78396451, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.80555683, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12310791, + "step": 8524, + "time_per_iteration": 2.6594467163085938 + }, + { + "auxiliary_loss_clip": 0.01122058, + "auxiliary_loss_mlp": 0.010307, + "balance_loss_clip": 1.04355001, + "balance_loss_mlp": 1.01871336, + "epoch": 0.5125507289944387, + "flos": 23037881047200.0, + "grad_norm": 2.1430796296719667, + "language_loss": 0.75280857, + "learning_rate": 2.015773034588706e-06, + "loss": 0.77433622, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.11981201, + "step": 8525, + "time_per_iteration": 2.6778295040130615 + }, + { + "auxiliary_loss_clip": 0.01125183, + "auxiliary_loss_mlp": 0.0103933, + "balance_loss_clip": 1.04340839, + "balance_loss_mlp": 1.02566314, + "epoch": 0.5126108522471066, + "flos": 43474191730560.0, + "grad_norm": 2.0295591305634657, + "language_loss": 0.74482214, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76646733, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13677979, + "step": 8526, + "time_per_iteration": 2.74206280708313 + }, + { + "auxiliary_loss_clip": 0.0112355, + "auxiliary_loss_mlp": 0.01041404, + "balance_loss_clip": 1.04333711, + "balance_loss_mlp": 1.02884519, + "epoch": 0.5126709754997746, + "flos": 24637193612640.0, + "grad_norm": 5.540914922368966, + "language_loss": 0.6532293, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.67487884, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12561035, + "step": 8527, + "time_per_iteration": 2.6435163021087646 + }, + { + "auxiliary_loss_clip": 0.01122471, + "auxiliary_loss_mlp": 0.01039562, + "balance_loss_clip": 1.04605758, + "balance_loss_mlp": 1.0276227, + "epoch": 0.5127310987524425, + "flos": 22681582372800.0, + "grad_norm": 1.5679602937578425, + "language_loss": 0.74240595, + "learning_rate": 2.014604683254908e-06, + "loss": 0.76402628, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.1192627, + "step": 8528, + "time_per_iteration": 2.620326042175293 + }, + { + "auxiliary_loss_clip": 0.01121437, + "auxiliary_loss_mlp": 0.01034512, + "balance_loss_clip": 1.04180408, + "balance_loss_mlp": 1.02175093, + "epoch": 0.5127912220051105, + "flos": 27399470624640.0, + "grad_norm": 1.7272419784885742, + "language_loss": 0.82977796, + "learning_rate": 2.014215231682995e-06, + "loss": 0.85133749, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12768555, + "step": 8529, + "time_per_iteration": 2.648815393447876 + }, + { + "auxiliary_loss_clip": 0.01119803, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.04210997, + "balance_loss_mlp": 1.01947236, + "epoch": 0.5128513452577784, + "flos": 23298531124320.0, + "grad_norm": 1.9784543143975464, + "language_loss": 0.73736149, + "learning_rate": 2.01382577957204e-06, + "loss": 0.75888354, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.1293335, + "step": 8530, + "time_per_iteration": 2.5861141681671143 + }, + { + "auxiliary_loss_clip": 0.01040217, + "auxiliary_loss_mlp": 0.01006781, + "balance_loss_clip": 1.01442516, + "balance_loss_mlp": 1.00532436, + "epoch": 0.5129114685104464, + "flos": 82841372719200.0, + "grad_norm": 0.7954100478020188, + "language_loss": 0.60814571, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.62861568, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 0.01454926, + "step": 8531, + "time_per_iteration": 3.3854963779449463 + }, + { + "auxiliary_loss_clip": 0.0112498, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.04473138, + "balance_loss_mlp": 1.01873767, + "epoch": 0.5129715917631144, + "flos": 24952049460000.0, + "grad_norm": 1.8891908792688417, + "language_loss": 0.77129388, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.79285562, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.12445068, + "step": 8532, + "time_per_iteration": 2.6197240352630615 + }, + { + "auxiliary_loss_clip": 0.01123261, + "auxiliary_loss_mlp": 0.01035261, + "balance_loss_clip": 1.04367483, + "balance_loss_mlp": 1.02219522, + "epoch": 0.5130317150157824, + "flos": 42849868834080.0, + "grad_norm": 2.2769301864274083, + "language_loss": 0.67298239, + "learning_rate": 2.012657420152597e-06, + "loss": 0.69456762, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.1307373, + "step": 8533, + "time_per_iteration": 2.760045289993286 + }, + { + "auxiliary_loss_clip": 0.01126408, + "auxiliary_loss_mlp": 0.01037933, + "balance_loss_clip": 1.04435611, + "balance_loss_mlp": 1.02408719, + "epoch": 0.5130918382684503, + "flos": 24149759554080.0, + "grad_norm": 1.9544770522018737, + "language_loss": 0.82103682, + "learning_rate": 2.01226796603315e-06, + "loss": 0.84268022, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 0.82128906, + "router_z_loss_mlp": 0.13861084, + "step": 8534, + "time_per_iteration": 2.6603267192840576 + }, + { + "auxiliary_loss_clip": 0.01124834, + "auxiliary_loss_mlp": 0.01037079, + "balance_loss_clip": 1.04413486, + "balance_loss_mlp": 1.02341723, + "epoch": 0.5131519615211183, + "flos": 32119911465120.0, + "grad_norm": 1.5495715919811535, + "language_loss": 0.63925564, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.66087484, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13665771, + "step": 8535, + "time_per_iteration": 2.6923434734344482 + }, + { + "auxiliary_loss_clip": 0.0112622, + "auxiliary_loss_mlp": 0.01028665, + "balance_loss_clip": 1.04686165, + "balance_loss_mlp": 1.01600504, + "epoch": 0.5132120847737862, + "flos": 23393490927840.0, + "grad_norm": 1.914862640313424, + "language_loss": 0.69600099, + "learning_rate": 2.011489056413418e-06, + "loss": 0.71754986, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12670898, + "step": 8536, + "time_per_iteration": 2.711172342300415 + }, + { + "auxiliary_loss_clip": 0.01127191, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.04444122, + "balance_loss_mlp": 1.01831412, + "epoch": 0.5132722080264542, + "flos": 24685119204480.0, + "grad_norm": 2.221501181957029, + "language_loss": 0.70878494, + "learning_rate": 2.011099600942669e-06, + "loss": 0.73037589, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13604736, + "step": 8537, + "time_per_iteration": 2.633493185043335 + }, + { + "auxiliary_loss_clip": 0.01125781, + "auxiliary_loss_mlp": 0.01029679, + "balance_loss_clip": 1.0436039, + "balance_loss_mlp": 1.01681018, + "epoch": 0.5133323312791223, + "flos": 20095975817280.0, + "grad_norm": 1.9235498935589528, + "language_loss": 0.80167353, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.82322812, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.12854004, + "step": 8538, + "time_per_iteration": 3.9526376724243164 + }, + { + "auxiliary_loss_clip": 0.01122644, + "auxiliary_loss_mlp": 0.010295, + "balance_loss_clip": 1.04228842, + "balance_loss_mlp": 1.01639843, + "epoch": 0.5133924545317902, + "flos": 31808702172960.0, + "grad_norm": 1.9597440752883304, + "language_loss": 0.7852664, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80678785, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13092041, + "step": 8539, + "time_per_iteration": 4.161704063415527 + }, + { + "auxiliary_loss_clip": 0.01123874, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.04277849, + "balance_loss_mlp": 1.02082515, + "epoch": 0.5134525777844582, + "flos": 35546495578560.0, + "grad_norm": 1.7193494744047761, + "language_loss": 0.76037264, + "learning_rate": 2.009931232064105e-06, + "loss": 0.78195113, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13171387, + "step": 8540, + "time_per_iteration": 2.7173521518707275 + }, + { + "auxiliary_loss_clip": 0.01127954, + "auxiliary_loss_mlp": 0.01034149, + "balance_loss_clip": 1.04515076, + "balance_loss_mlp": 1.02007008, + "epoch": 0.5135127010371261, + "flos": 21298316709600.0, + "grad_norm": 2.0609370957388853, + "language_loss": 0.74619806, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.76781905, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.14086914, + "step": 8541, + "time_per_iteration": 2.65338134765625 + }, + { + "auxiliary_loss_clip": 0.01122535, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.0421536, + "balance_loss_mlp": 1.02342296, + "epoch": 0.5135728242897941, + "flos": 26777376178560.0, + "grad_norm": 2.168498354339772, + "language_loss": 0.70405209, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.72564185, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13006592, + "step": 8542, + "time_per_iteration": 2.6426234245300293 + }, + { + "auxiliary_loss_clip": 0.01125713, + "auxiliary_loss_mlp": 0.01028787, + "balance_loss_clip": 1.04496455, + "balance_loss_mlp": 1.01566792, + "epoch": 0.513632947542462, + "flos": 27668021571360.0, + "grad_norm": 2.2133098734364696, + "language_loss": 0.78836697, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.80991197, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13116455, + "step": 8543, + "time_per_iteration": 2.66111159324646 + }, + { + "auxiliary_loss_clip": 0.01124556, + "auxiliary_loss_mlp": 0.01035048, + "balance_loss_clip": 1.04475081, + "balance_loss_mlp": 1.02182198, + "epoch": 0.51369307079513, + "flos": 35944682770080.0, + "grad_norm": 2.032720732473468, + "language_loss": 0.67892802, + "learning_rate": 2.008373401689299e-06, + "loss": 0.70052397, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13232422, + "step": 8544, + "time_per_iteration": 2.7470085620880127 + }, + { + "auxiliary_loss_clip": 0.01124858, + "auxiliary_loss_mlp": 0.0103809, + "balance_loss_clip": 1.04336286, + "balance_loss_mlp": 1.02600789, + "epoch": 0.513753194047798, + "flos": 23170718157120.0, + "grad_norm": 2.1765885724595324, + "language_loss": 0.71766698, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.73929644, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.12078857, + "step": 8545, + "time_per_iteration": 4.180584192276001 + }, + { + "auxiliary_loss_clip": 0.01125582, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.04323006, + "balance_loss_mlp": 1.02582121, + "epoch": 0.513813317300466, + "flos": 21746050184160.0, + "grad_norm": 5.714172411218803, + "language_loss": 0.82135826, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.84301293, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.14056396, + "step": 8546, + "time_per_iteration": 2.6650376319885254 + }, + { + "auxiliary_loss_clip": 0.01123864, + "auxiliary_loss_mlp": 0.01033445, + "balance_loss_clip": 1.04291415, + "balance_loss_mlp": 1.02003431, + "epoch": 0.5138734405531339, + "flos": 29359741351680.0, + "grad_norm": 1.6409581571163856, + "language_loss": 0.7302947, + "learning_rate": 2.007205025522544e-06, + "loss": 0.75186777, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13409424, + "step": 8547, + "time_per_iteration": 2.63034987449646 + }, + { + "auxiliary_loss_clip": 0.01122458, + "auxiliary_loss_mlp": 0.0104466, + "balance_loss_clip": 1.04167116, + "balance_loss_mlp": 1.03136837, + "epoch": 0.5139335638058019, + "flos": 31844958788160.0, + "grad_norm": 1.8037375303766965, + "language_loss": 0.73567665, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.75734782, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13293457, + "step": 8548, + "time_per_iteration": 2.7326221466064453 + }, + { + "auxiliary_loss_clip": 0.01122197, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.04164243, + "balance_loss_mlp": 1.01796031, + "epoch": 0.5139936870584698, + "flos": 23082484222080.0, + "grad_norm": 2.0313824908535616, + "language_loss": 0.8212347, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.84276998, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13378906, + "step": 8549, + "time_per_iteration": 2.6196341514587402 + }, + { + "auxiliary_loss_clip": 0.01122546, + "auxiliary_loss_mlp": 0.01032934, + "balance_loss_clip": 1.0439775, + "balance_loss_mlp": 1.02059579, + "epoch": 0.5140538103111378, + "flos": 19698558454080.0, + "grad_norm": 2.0820281031451593, + "language_loss": 0.72513062, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.74668539, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12335205, + "step": 8550, + "time_per_iteration": 2.6344118118286133 + }, + { + "auxiliary_loss_clip": 0.01124922, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.04229963, + "balance_loss_mlp": 1.02103138, + "epoch": 0.5141139335638057, + "flos": 27357136417440.0, + "grad_norm": 1.5422449149015327, + "language_loss": 0.75094658, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.77254164, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 0.82568359, + "router_z_loss_mlp": 0.13555908, + "step": 8551, + "time_per_iteration": 2.6272599697113037 + }, + { + "auxiliary_loss_clip": 0.0112054, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.04364491, + "balance_loss_mlp": 1.0167557, + "epoch": 0.5141740568164738, + "flos": 33055281584640.0, + "grad_norm": 1.8544818995400736, + "language_loss": 0.69352865, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.7150315, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12982178, + "step": 8552, + "time_per_iteration": 3.968156576156616 + }, + { + "auxiliary_loss_clip": 0.01121515, + "auxiliary_loss_mlp": 0.01033148, + "balance_loss_clip": 1.04071164, + "balance_loss_mlp": 1.01962352, + "epoch": 0.5142341800691418, + "flos": 30472713825120.0, + "grad_norm": 1.8248265355177673, + "language_loss": 0.74552894, + "learning_rate": 2.004868266210965e-06, + "loss": 0.76707554, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13525391, + "step": 8553, + "time_per_iteration": 2.666532278060913 + }, + { + "auxiliary_loss_clip": 0.01123043, + "auxiliary_loss_mlp": 0.01037552, + "balance_loss_clip": 1.04350328, + "balance_loss_mlp": 1.02468324, + "epoch": 0.5142943033218097, + "flos": 25263663924960.0, + "grad_norm": 1.7509026907531118, + "language_loss": 0.6805535, + "learning_rate": 2.004478805593435e-06, + "loss": 0.7021594, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12866211, + "step": 8554, + "time_per_iteration": 2.647739887237549 + }, + { + "auxiliary_loss_clip": 0.01126309, + "auxiliary_loss_mlp": 0.01035935, + "balance_loss_clip": 1.04258025, + "balance_loss_mlp": 1.0207243, + "epoch": 0.5143544265744777, + "flos": 27974287755360.0, + "grad_norm": 1.9144995005096717, + "language_loss": 0.73560125, + "learning_rate": 2.004089344806068e-06, + "loss": 0.75722373, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 0.83691406, + "router_z_loss_mlp": 0.15222168, + "step": 8555, + "time_per_iteration": 2.6241729259490967 + }, + { + "auxiliary_loss_clip": 0.01124174, + "auxiliary_loss_mlp": 0.01032546, + "balance_loss_clip": 1.0441947, + "balance_loss_mlp": 1.01946831, + "epoch": 0.5144145498271456, + "flos": 19427252332320.0, + "grad_norm": 3.8891860775979397, + "language_loss": 0.74594754, + "learning_rate": 2.003699883863633e-06, + "loss": 0.76751471, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.1307373, + "step": 8556, + "time_per_iteration": 2.6250038146972656 + }, + { + "auxiliary_loss_clip": 0.01119454, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.04172266, + "balance_loss_mlp": 1.02190793, + "epoch": 0.5144746730798136, + "flos": 24015139683840.0, + "grad_norm": 1.9999558452345692, + "language_loss": 0.86346149, + "learning_rate": 2.003310422780898e-06, + "loss": 0.8849985, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12335205, + "step": 8557, + "time_per_iteration": 2.729219675064087 + }, + { + "auxiliary_loss_clip": 0.01117778, + "auxiliary_loss_mlp": 0.01035104, + "balance_loss_clip": 1.04087639, + "balance_loss_mlp": 1.02301586, + "epoch": 0.5145347963324816, + "flos": 29181045031200.0, + "grad_norm": 1.5301669480683362, + "language_loss": 0.88969314, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.91122198, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12097168, + "step": 8558, + "time_per_iteration": 2.8078134059906006 + }, + { + "auxiliary_loss_clip": 0.01120159, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.04243529, + "balance_loss_mlp": 1.01757693, + "epoch": 0.5145949195851496, + "flos": 22280680523520.0, + "grad_norm": 2.1857124703992783, + "language_loss": 0.65461147, + "learning_rate": 2.002531500253602e-06, + "loss": 0.67611659, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12768555, + "step": 8559, + "time_per_iteration": 2.636850595474243 + }, + { + "auxiliary_loss_clip": 0.01123285, + "auxiliary_loss_mlp": 0.01033238, + "balance_loss_clip": 1.04448271, + "balance_loss_mlp": 1.02046442, + "epoch": 0.5146550428378175, + "flos": 31986101940480.0, + "grad_norm": 1.9446646373819687, + "language_loss": 0.63727903, + "learning_rate": 2.002142038838577e-06, + "loss": 0.65884423, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12774658, + "step": 8560, + "time_per_iteration": 2.7319626808166504 + }, + { + "auxiliary_loss_clip": 0.01120436, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_clip": 1.04205263, + "balance_loss_mlp": 1.02021909, + "epoch": 0.5147151660904855, + "flos": 27667251743040.0, + "grad_norm": 1.649274379063675, + "language_loss": 0.69918215, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.72071636, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12780762, + "step": 8561, + "time_per_iteration": 2.6533634662628174 + }, + { + "auxiliary_loss_clip": 0.01122457, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.04235697, + "balance_loss_mlp": 1.01969433, + "epoch": 0.5147752893431534, + "flos": 30471741410400.0, + "grad_norm": 1.541257384250002, + "language_loss": 0.66173971, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.68327868, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.11737061, + "step": 8562, + "time_per_iteration": 2.6717352867126465 + }, + { + "auxiliary_loss_clip": 0.01125114, + "auxiliary_loss_mlp": 0.0103529, + "balance_loss_clip": 1.0440259, + "balance_loss_mlp": 1.02242148, + "epoch": 0.5148354125958214, + "flos": 27753500331360.0, + "grad_norm": 1.6574473154760827, + "language_loss": 0.78093779, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.80254179, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.12866211, + "step": 8563, + "time_per_iteration": 2.650407314300537 + }, + { + "auxiliary_loss_clip": 0.01125874, + "auxiliary_loss_mlp": 0.01037458, + "balance_loss_clip": 1.04180551, + "balance_loss_mlp": 1.02291441, + "epoch": 0.5148955358484893, + "flos": 28149378037920.0, + "grad_norm": 5.703628517988508, + "language_loss": 0.83096802, + "learning_rate": 2.0005841925139e-06, + "loss": 0.85260129, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 0.84082031, + "router_z_loss_mlp": 0.14550781, + "step": 8564, + "time_per_iteration": 2.6471941471099854 + }, + { + "auxiliary_loss_clip": 0.01126139, + "auxiliary_loss_mlp": 0.01036462, + "balance_loss_clip": 1.04263258, + "balance_loss_mlp": 1.02260971, + "epoch": 0.5149556591011574, + "flos": 24818969246400.0, + "grad_norm": 1.9916110658748938, + "language_loss": 0.7318114, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.7534374, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 0.83544922, + "router_z_loss_mlp": 0.13861084, + "step": 8565, + "time_per_iteration": 2.6126444339752197 + }, + { + "auxiliary_loss_clip": 0.01126541, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.04340076, + "balance_loss_mlp": 1.01832533, + "epoch": 0.5150157823538254, + "flos": 27623337361920.0, + "grad_norm": 2.0346329291007694, + "language_loss": 0.68268067, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.7042743, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.14483643, + "step": 8566, + "time_per_iteration": 2.669917583465576 + }, + { + "auxiliary_loss_clip": 0.01122179, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.03913164, + "balance_loss_mlp": 1.01861453, + "epoch": 0.5150759056064933, + "flos": 31808175448320.0, + "grad_norm": 1.6582724415627446, + "language_loss": 0.78004909, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.80158818, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 0.83154297, + "router_z_loss_mlp": 0.13128662, + "step": 8567, + "time_per_iteration": 2.657226800918579 + }, + { + "auxiliary_loss_clip": 0.01125223, + "auxiliary_loss_mlp": 0.01038032, + "balance_loss_clip": 1.04327738, + "balance_loss_mlp": 1.02351284, + "epoch": 0.5151360288591613, + "flos": 31668369366240.0, + "grad_norm": 18.326337627805067, + "language_loss": 0.79322809, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.8148607, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.1451416, + "step": 8568, + "time_per_iteration": 2.668548107147217 + }, + { + "auxiliary_loss_clip": 0.01120184, + "auxiliary_loss_mlp": 0.01032332, + "balance_loss_clip": 1.04119992, + "balance_loss_mlp": 1.0195353, + "epoch": 0.5151961521118292, + "flos": 22583138083200.0, + "grad_norm": 2.9073344519446263, + "language_loss": 0.90469742, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.92622262, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12786865, + "step": 8569, + "time_per_iteration": 2.5950162410736084 + }, + { + "auxiliary_loss_clip": 0.01124882, + "auxiliary_loss_mlp": 0.01032884, + "balance_loss_clip": 1.04335237, + "balance_loss_mlp": 1.01968133, + "epoch": 0.5152562753644973, + "flos": 27129987780480.0, + "grad_norm": 1.9392113374683944, + "language_loss": 0.7653302, + "learning_rate": 1.998247422657674e-06, + "loss": 0.78690791, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.13201904, + "step": 8570, + "time_per_iteration": 2.779860019683838 + }, + { + "auxiliary_loss_clip": 0.01123929, + "auxiliary_loss_mlp": 0.01042842, + "balance_loss_clip": 1.04152465, + "balance_loss_mlp": 1.02835834, + "epoch": 0.5153163986171652, + "flos": 46901667224160.0, + "grad_norm": 1.5623621931678535, + "language_loss": 0.73773724, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.75940496, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.14489746, + "step": 8571, + "time_per_iteration": 2.8301339149475098 + }, + { + "auxiliary_loss_clip": 0.01038367, + "auxiliary_loss_mlp": 0.01007632, + "balance_loss_clip": 1.01291442, + "balance_loss_mlp": 1.00647819, + "epoch": 0.5153765218698332, + "flos": 81002553746400.0, + "grad_norm": 0.7799091640890728, + "language_loss": 0.52936542, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.54982543, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 0.25439453, + "router_z_loss_mlp": 0.01153564, + "step": 8572, + "time_per_iteration": 3.3594119548797607 + }, + { + "auxiliary_loss_clip": 0.01121899, + "auxiliary_loss_mlp": 0.01038489, + "balance_loss_clip": 1.04459834, + "balance_loss_mlp": 1.02649641, + "epoch": 0.5154366451225011, + "flos": 29315178694080.0, + "grad_norm": 2.2003546632229605, + "language_loss": 0.76287067, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.78447449, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12005615, + "step": 8573, + "time_per_iteration": 2.6732826232910156 + }, + { + "auxiliary_loss_clip": 0.0112071, + "auxiliary_loss_mlp": 0.01030922, + "balance_loss_clip": 1.04153323, + "balance_loss_mlp": 1.01740372, + "epoch": 0.5154967683751691, + "flos": 28637703476640.0, + "grad_norm": 2.0450614145237767, + "language_loss": 0.77197373, + "learning_rate": 1.996689577219102e-06, + "loss": 0.79349005, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13513184, + "step": 8574, + "time_per_iteration": 2.6457314491271973 + }, + { + "auxiliary_loss_clip": 0.01121897, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.0426383, + "balance_loss_mlp": 1.02138197, + "epoch": 0.515556891627837, + "flos": 29047924300320.0, + "grad_norm": 1.6890496442333351, + "language_loss": 0.85173142, + "learning_rate": 1.996300116136367e-06, + "loss": 0.8732869, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.1227417, + "step": 8575, + "time_per_iteration": 2.688384532928467 + }, + { + "auxiliary_loss_clip": 0.01123196, + "auxiliary_loss_mlp": 0.01034945, + "balance_loss_clip": 1.04164815, + "balance_loss_mlp": 1.02141488, + "epoch": 0.515617014880505, + "flos": 24194767901760.0, + "grad_norm": 1.8011619071538572, + "language_loss": 0.76951712, + "learning_rate": 1.995910655193932e-06, + "loss": 0.79109854, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.13543701, + "step": 8576, + "time_per_iteration": 2.6949305534362793 + }, + { + "auxiliary_loss_clip": 0.011268, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.04230189, + "balance_loss_mlp": 1.01656461, + "epoch": 0.515677138133173, + "flos": 17382434742720.0, + "grad_norm": 2.766557360838749, + "language_loss": 0.75484443, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.7764194, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 0.84521484, + "router_z_loss_mlp": 0.14123535, + "step": 8577, + "time_per_iteration": 2.624326705932617 + }, + { + "auxiliary_loss_clip": 0.01124384, + "auxiliary_loss_mlp": 0.01037921, + "balance_loss_clip": 1.04230809, + "balance_loss_mlp": 1.02410436, + "epoch": 0.515737261385841, + "flos": 34520055314400.0, + "grad_norm": 1.7662229653185426, + "language_loss": 0.81128687, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.83290994, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.13793945, + "step": 8578, + "time_per_iteration": 4.185594797134399 + }, + { + "auxiliary_loss_clip": 0.01118578, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.03981948, + "balance_loss_mlp": 1.01853323, + "epoch": 0.515797384638509, + "flos": 34034322981600.0, + "grad_norm": 1.9078918680596413, + "language_loss": 0.76067066, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.7821669, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12506104, + "step": 8579, + "time_per_iteration": 2.67812180519104 + }, + { + "auxiliary_loss_clip": 0.01123622, + "auxiliary_loss_mlp": 0.01033945, + "balance_loss_clip": 1.04260087, + "balance_loss_mlp": 1.02148747, + "epoch": 0.5158575078911769, + "flos": 28113648147360.0, + "grad_norm": 1.8642877011618224, + "language_loss": 0.79040289, + "learning_rate": 1.994352813122559e-06, + "loss": 0.81197852, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.12463379, + "step": 8580, + "time_per_iteration": 2.710045337677002 + }, + { + "auxiliary_loss_clip": 0.01128627, + "auxiliary_loss_mlp": 0.01042182, + "balance_loss_clip": 1.04551637, + "balance_loss_mlp": 1.02845526, + "epoch": 0.5159176311438449, + "flos": 15424351948800.0, + "grad_norm": 2.7889935863578685, + "language_loss": 0.73481679, + "learning_rate": 1.99396335310315e-06, + "loss": 0.75652486, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.1373291, + "step": 8581, + "time_per_iteration": 2.591604232788086 + }, + { + "auxiliary_loss_clip": 0.01122178, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.04265714, + "balance_loss_mlp": 1.01915348, + "epoch": 0.5159777543965128, + "flos": 18983854206720.0, + "grad_norm": 2.970334636694401, + "language_loss": 0.74212348, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.76366252, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12567139, + "step": 8582, + "time_per_iteration": 2.6881985664367676 + }, + { + "auxiliary_loss_clip": 0.01122924, + "auxiliary_loss_mlp": 0.01033933, + "balance_loss_clip": 1.0427866, + "balance_loss_mlp": 1.02133262, + "epoch": 0.5160378776491809, + "flos": 28335326951520.0, + "grad_norm": 6.690366200504157, + "language_loss": 0.66027939, + "learning_rate": 1.99318443376583e-06, + "loss": 0.68184799, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.12597656, + "step": 8583, + "time_per_iteration": 2.641139268875122 + }, + { + "auxiliary_loss_clip": 0.01125939, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.04486513, + "balance_loss_mlp": 1.02061379, + "epoch": 0.5160980009018488, + "flos": 26776971005760.0, + "grad_norm": 1.4528068636548293, + "language_loss": 0.76149225, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.7830925, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13476562, + "step": 8584, + "time_per_iteration": 2.694959878921509 + }, + { + "auxiliary_loss_clip": 0.01128258, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.04428101, + "balance_loss_mlp": 1.03261375, + "epoch": 0.5161581241545168, + "flos": 27801304371360.0, + "grad_norm": 2.07594029361811, + "language_loss": 0.78828359, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.81002331, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 0.83984375, + "router_z_loss_mlp": 0.13085938, + "step": 8585, + "time_per_iteration": 4.218624830245972 + }, + { + "auxiliary_loss_clip": 0.01120242, + "auxiliary_loss_mlp": 0.01036174, + "balance_loss_clip": 1.04266882, + "balance_loss_mlp": 1.02379441, + "epoch": 0.5162182474071847, + "flos": 24008535367200.0, + "grad_norm": 2.2854000825276066, + "language_loss": 0.81072521, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.83228946, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.1237793, + "step": 8586, + "time_per_iteration": 2.604100465774536 + }, + { + "auxiliary_loss_clip": 0.01122869, + "auxiliary_loss_mlp": 0.01035853, + "balance_loss_clip": 1.04183733, + "balance_loss_mlp": 1.0229249, + "epoch": 0.5162783706598527, + "flos": 24460482638880.0, + "grad_norm": 2.3002460398772717, + "language_loss": 0.71625924, + "learning_rate": 1.991626598310701e-06, + "loss": 0.73784649, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.12921143, + "step": 8587, + "time_per_iteration": 2.634331703186035 + }, + { + "auxiliary_loss_clip": 0.01039817, + "auxiliary_loss_mlp": 0.01002267, + "balance_loss_clip": 1.0141269, + "balance_loss_mlp": 1.00085187, + "epoch": 0.5163384939125206, + "flos": 85365156255840.0, + "grad_norm": 0.7361288221417883, + "language_loss": 0.57789886, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.59831965, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 0.25683594, + "router_z_loss_mlp": 0.0141449, + "step": 8588, + "time_per_iteration": 3.264880895614624 + }, + { + "auxiliary_loss_clip": 0.01124611, + "auxiliary_loss_mlp": 0.01036334, + "balance_loss_clip": 1.04358482, + "balance_loss_mlp": 1.0227021, + "epoch": 0.5163986171651886, + "flos": 21252376464480.0, + "grad_norm": 1.8849005929220757, + "language_loss": 0.75026464, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77187407, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.1362915, + "step": 8589, + "time_per_iteration": 2.676694631576538 + }, + { + "auxiliary_loss_clip": 0.01125828, + "auxiliary_loss_mlp": 0.01033643, + "balance_loss_clip": 1.04521704, + "balance_loss_mlp": 1.02107835, + "epoch": 0.5164587404178566, + "flos": 26018190308160.0, + "grad_norm": 1.579470164159104, + "language_loss": 0.67415476, + "learning_rate": 1.990458225001627e-06, + "loss": 0.6957494, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.12573242, + "step": 8590, + "time_per_iteration": 2.643483877182007 + }, + { + "auxiliary_loss_clip": 0.01039339, + "auxiliary_loss_mlp": 0.01000847, + "balance_loss_clip": 1.01381814, + "balance_loss_mlp": 0.99944949, + "epoch": 0.5165188636705246, + "flos": 83044170470880.0, + "grad_norm": 0.7831825924699357, + "language_loss": 0.55896771, + "learning_rate": 1.990068767935895e-06, + "loss": 0.5793696, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 0.25512695, + "router_z_loss_mlp": 0.01397705, + "step": 8591, + "time_per_iteration": 4.44576358795166 + }, + { + "auxiliary_loss_clip": 0.01117085, + "auxiliary_loss_mlp": 0.01027595, + "balance_loss_clip": 1.04228306, + "balance_loss_mlp": 1.01568627, + "epoch": 0.5165789869231926, + "flos": 23654789281440.0, + "grad_norm": 1.5802369451290417, + "language_loss": 0.8173449, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.83879173, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11907959, + "step": 8592, + "time_per_iteration": 2.658177137374878 + }, + { + "auxiliary_loss_clip": 0.01123231, + "auxiliary_loss_mlp": 0.01029806, + "balance_loss_clip": 1.04601049, + "balance_loss_mlp": 1.01755118, + "epoch": 0.5166391101758605, + "flos": 25575643045440.0, + "grad_norm": 2.3663397319989294, + "language_loss": 0.83356154, + "learning_rate": 1.989289854948979e-06, + "loss": 0.85509193, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12249756, + "step": 8593, + "time_per_iteration": 2.649172306060791 + }, + { + "auxiliary_loss_clip": 0.01125416, + "auxiliary_loss_mlp": 0.01033175, + "balance_loss_clip": 1.04561627, + "balance_loss_mlp": 1.02111685, + "epoch": 0.5166992334285285, + "flos": 35950598292960.0, + "grad_norm": 1.7060800177535649, + "language_loss": 0.68841493, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.71000087, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.1206665, + "step": 8594, + "time_per_iteration": 2.784428358078003 + }, + { + "auxiliary_loss_clip": 0.01122407, + "auxiliary_loss_mlp": 0.01030123, + "balance_loss_clip": 1.04385543, + "balance_loss_mlp": 1.0173552, + "epoch": 0.5167593566811964, + "flos": 24775257451680.0, + "grad_norm": 1.6403999870468315, + "language_loss": 0.77113092, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79265618, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12768555, + "step": 8595, + "time_per_iteration": 2.678898811340332 + }, + { + "auxiliary_loss_clip": 0.01121959, + "auxiliary_loss_mlp": 0.01034287, + "balance_loss_clip": 1.04318929, + "balance_loss_mlp": 1.02140093, + "epoch": 0.5168194799338645, + "flos": 17828344939680.0, + "grad_norm": 1.5813554262063845, + "language_loss": 0.65107274, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.6726352, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12884521, + "step": 8596, + "time_per_iteration": 2.652101755142212 + }, + { + "auxiliary_loss_clip": 0.01123181, + "auxiliary_loss_mlp": 0.01037596, + "balance_loss_clip": 1.04412198, + "balance_loss_mlp": 1.02356482, + "epoch": 0.5168796031865324, + "flos": 30514440273120.0, + "grad_norm": 1.726569640657528, + "language_loss": 0.75699925, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.77860695, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.14044189, + "step": 8597, + "time_per_iteration": 2.6681437492370605 + }, + { + "auxiliary_loss_clip": 0.01121178, + "auxiliary_loss_mlp": 0.01027426, + "balance_loss_clip": 1.0419724, + "balance_loss_mlp": 1.01499271, + "epoch": 0.5169397264392004, + "flos": 32872979226240.0, + "grad_norm": 2.4476711606666006, + "language_loss": 0.81189865, + "learning_rate": 1.987342579847403e-06, + "loss": 0.83338469, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12432861, + "step": 8598, + "time_per_iteration": 2.669600248336792 + }, + { + "auxiliary_loss_clip": 0.01123825, + "auxiliary_loss_mlp": 0.01042171, + "balance_loss_clip": 1.04397964, + "balance_loss_mlp": 1.02952886, + "epoch": 0.5169998496918683, + "flos": 31003292436480.0, + "grad_norm": 2.330247607889249, + "language_loss": 0.75348502, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.77514505, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12652588, + "step": 8599, + "time_per_iteration": 2.729900360107422 + }, + { + "auxiliary_loss_clip": 0.01121005, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.04291487, + "balance_loss_mlp": 1.01987135, + "epoch": 0.5170599729445363, + "flos": 30115888426080.0, + "grad_norm": 3.7264868199436805, + "language_loss": 0.72014868, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.7416786, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12109375, + "step": 8600, + "time_per_iteration": 2.6798946857452393 + }, + { + "auxiliary_loss_clip": 0.01123554, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.04532838, + "balance_loss_mlp": 1.02180219, + "epoch": 0.5171200961972042, + "flos": 25617653114400.0, + "grad_norm": 2.1739152930605496, + "language_loss": 0.74459112, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.76617694, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.13226318, + "step": 8601, + "time_per_iteration": 2.672278642654419 + }, + { + "auxiliary_loss_clip": 0.01122788, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.04229414, + "balance_loss_mlp": 1.02461529, + "epoch": 0.5171802194498722, + "flos": 27754594297920.0, + "grad_norm": 2.0102919309731964, + "language_loss": 0.83497101, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.85658109, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13592529, + "step": 8602, + "time_per_iteration": 2.6379427909851074 + }, + { + "auxiliary_loss_clip": 0.01123462, + "auxiliary_loss_mlp": 0.01031846, + "balance_loss_clip": 1.04382253, + "balance_loss_mlp": 1.0185895, + "epoch": 0.5172403427025402, + "flos": 34382518200000.0, + "grad_norm": 2.0764240583165683, + "language_loss": 0.74751711, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.76907015, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13262939, + "step": 8603, + "time_per_iteration": 2.702169895172119 + }, + { + "auxiliary_loss_clip": 0.01126596, + "auxiliary_loss_mlp": 0.01036586, + "balance_loss_clip": 1.04584277, + "balance_loss_mlp": 1.02389646, + "epoch": 0.5173004659552082, + "flos": 24816376140480.0, + "grad_norm": 2.1936565029957737, + "language_loss": 0.7270689, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.74870074, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.12701416, + "step": 8604, + "time_per_iteration": 2.670694351196289 + }, + { + "auxiliary_loss_clip": 0.01128934, + "auxiliary_loss_mlp": 0.0103752, + "balance_loss_clip": 1.04361773, + "balance_loss_mlp": 1.02333379, + "epoch": 0.5173605892078762, + "flos": 23260896921600.0, + "grad_norm": 3.0798068535378027, + "language_loss": 0.85431337, + "learning_rate": 1.984616415277469e-06, + "loss": 0.87597787, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 0.85302734, + "router_z_loss_mlp": 0.14178467, + "step": 8605, + "time_per_iteration": 2.667188882827759 + }, + { + "auxiliary_loss_clip": 0.01119915, + "auxiliary_loss_mlp": 0.0102618, + "balance_loss_clip": 1.04117644, + "balance_loss_mlp": 1.01401496, + "epoch": 0.5174207124605441, + "flos": 34160879913120.0, + "grad_norm": 2.297355656568255, + "language_loss": 0.64954817, + "learning_rate": 1.984226965411294e-06, + "loss": 0.67100906, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.1217041, + "step": 8606, + "time_per_iteration": 2.6818246841430664 + }, + { + "auxiliary_loss_clip": 0.01121835, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.04345679, + "balance_loss_mlp": 1.01830912, + "epoch": 0.5174808357132121, + "flos": 23790097945440.0, + "grad_norm": 1.5759185692203883, + "language_loss": 0.77824223, + "learning_rate": 1.983837516143234e-06, + "loss": 0.79977107, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12750244, + "step": 8607, + "time_per_iteration": 2.6728570461273193 + }, + { + "auxiliary_loss_clip": 0.01126875, + "auxiliary_loss_mlp": 0.01036537, + "balance_loss_clip": 1.04518867, + "balance_loss_mlp": 1.02322173, + "epoch": 0.51754095896588, + "flos": 27801425923200.0, + "grad_norm": 1.9563452881725447, + "language_loss": 0.72066045, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74229455, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13305664, + "step": 8608, + "time_per_iteration": 2.656097412109375 + }, + { + "auxiliary_loss_clip": 0.01130811, + "auxiliary_loss_mlp": 0.01034877, + "balance_loss_clip": 1.04617906, + "balance_loss_mlp": 1.02131724, + "epoch": 0.5176010822185481, + "flos": 27661700875680.0, + "grad_norm": 2.042233703023683, + "language_loss": 0.86309576, + "learning_rate": 1.983058619460531e-06, + "loss": 0.88475269, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 0.84619141, + "router_z_loss_mlp": 0.13568115, + "step": 8609, + "time_per_iteration": 2.6397266387939453 + }, + { + "auxiliary_loss_clip": 0.01121441, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.04228735, + "balance_loss_mlp": 1.01971078, + "epoch": 0.517661205471216, + "flos": 29225567171520.0, + "grad_norm": 1.8847745177508761, + "language_loss": 0.73495722, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.75648582, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.11700439, + "step": 8610, + "time_per_iteration": 2.7353053092956543 + }, + { + "auxiliary_loss_clip": 0.01129222, + "auxiliary_loss_mlp": 0.01033994, + "balance_loss_clip": 1.04580653, + "balance_loss_mlp": 1.02001643, + "epoch": 0.517721328723884, + "flos": 19029997038240.0, + "grad_norm": 2.449830733229932, + "language_loss": 0.67194235, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.69357449, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.13977051, + "step": 8611, + "time_per_iteration": 2.6409637928009033 + }, + { + "auxiliary_loss_clip": 0.01122419, + "auxiliary_loss_mlp": 0.01034505, + "balance_loss_clip": 1.0427773, + "balance_loss_mlp": 1.02143383, + "epoch": 0.5177814519765519, + "flos": 25575643045440.0, + "grad_norm": 2.1370796109912447, + "language_loss": 0.77006876, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.79163802, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.13067627, + "step": 8612, + "time_per_iteration": 2.67960786819458 + }, + { + "auxiliary_loss_clip": 0.01125129, + "auxiliary_loss_mlp": 0.01034114, + "balance_loss_clip": 1.04434443, + "balance_loss_mlp": 1.02103674, + "epoch": 0.5178415752292199, + "flos": 21924179262720.0, + "grad_norm": 2.90214911253181, + "language_loss": 0.81895393, + "learning_rate": 1.981500833922294e-06, + "loss": 0.84054637, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13079834, + "step": 8613, + "time_per_iteration": 2.5934627056121826 + }, + { + "auxiliary_loss_clip": 0.01127415, + "auxiliary_loss_mlp": 0.01035401, + "balance_loss_clip": 1.04682517, + "balance_loss_mlp": 1.02203178, + "epoch": 0.5179016984818878, + "flos": 21745645011360.0, + "grad_norm": 2.3737011480434265, + "language_loss": 0.66351199, + "learning_rate": 1.981111389254541e-06, + "loss": 0.68514013, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13372803, + "step": 8614, + "time_per_iteration": 2.61682391166687 + }, + { + "auxiliary_loss_clip": 0.01129094, + "auxiliary_loss_mlp": 0.01033373, + "balance_loss_clip": 1.04679537, + "balance_loss_mlp": 1.01982486, + "epoch": 0.5179618217345558, + "flos": 21744470010240.0, + "grad_norm": 3.2146810071319347, + "language_loss": 0.8661828, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.88780749, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 0.82373047, + "router_z_loss_mlp": 0.13555908, + "step": 8615, + "time_per_iteration": 2.58734130859375 + }, + { + "auxiliary_loss_clip": 0.01124254, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_clip": 1.04529655, + "balance_loss_mlp": 1.02770519, + "epoch": 0.5180219449872238, + "flos": 27482275244160.0, + "grad_norm": 1.6628633318429946, + "language_loss": 0.80663812, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.82828152, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12365723, + "step": 8616, + "time_per_iteration": 2.676778793334961 + }, + { + "auxiliary_loss_clip": 0.01133087, + "auxiliary_loss_mlp": 0.01041986, + "balance_loss_clip": 1.05008435, + "balance_loss_mlp": 1.02809215, + "epoch": 0.5180820682398918, + "flos": 29181895894080.0, + "grad_norm": 1.9027326353496454, + "language_loss": 0.75222147, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77397227, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.13891602, + "step": 8617, + "time_per_iteration": 4.128292560577393 + }, + { + "auxiliary_loss_clip": 0.01123972, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.04345703, + "balance_loss_mlp": 1.0277319, + "epoch": 0.5181421914925598, + "flos": 20718921126240.0, + "grad_norm": 1.575208570559521, + "language_loss": 0.6986239, + "learning_rate": 1.979553617893785e-06, + "loss": 0.72027552, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13458252, + "step": 8618, + "time_per_iteration": 3.9265570640563965 + }, + { + "auxiliary_loss_clip": 0.01040004, + "auxiliary_loss_mlp": 0.01005283, + "balance_loss_clip": 1.01451325, + "balance_loss_mlp": 1.00386381, + "epoch": 0.5182023147452277, + "flos": 80606068280640.0, + "grad_norm": 0.9525620007874218, + "language_loss": 0.67266726, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69312012, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 0.25439453, + "router_z_loss_mlp": 0.01419067, + "step": 8619, + "time_per_iteration": 3.2059881687164307 + }, + { + "auxiliary_loss_clip": 0.01121896, + "auxiliary_loss_mlp": 0.01033342, + "balance_loss_clip": 1.04478145, + "balance_loss_mlp": 1.02087831, + "epoch": 0.5182624379978957, + "flos": 22192446588480.0, + "grad_norm": 2.0490237318725386, + "language_loss": 0.79499692, + "learning_rate": 1.97877473680631e-06, + "loss": 0.8165493, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12475586, + "step": 8620, + "time_per_iteration": 2.667869806289673 + }, + { + "auxiliary_loss_clip": 0.01123369, + "auxiliary_loss_mlp": 0.01039773, + "balance_loss_clip": 1.04533553, + "balance_loss_mlp": 1.02728558, + "epoch": 0.5183225612505636, + "flos": 17114977762560.0, + "grad_norm": 2.4454785299707313, + "language_loss": 0.8168366, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.83846802, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12487793, + "step": 8621, + "time_per_iteration": 2.614100933074951 + }, + { + "auxiliary_loss_clip": 0.01124668, + "auxiliary_loss_mlp": 0.01036618, + "balance_loss_clip": 1.04448438, + "balance_loss_mlp": 1.02439892, + "epoch": 0.5183826845032317, + "flos": 28869916773600.0, + "grad_norm": 2.5304573889557, + "language_loss": 0.65695846, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.67857134, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12237549, + "step": 8622, + "time_per_iteration": 2.6815502643585205 + }, + { + "auxiliary_loss_clip": 0.01128225, + "auxiliary_loss_mlp": 0.01043294, + "balance_loss_clip": 1.04608476, + "balance_loss_mlp": 1.02981186, + "epoch": 0.5184428077558996, + "flos": 19386903471840.0, + "grad_norm": 1.768665601807307, + "language_loss": 0.60696578, + "learning_rate": 1.977606421248497e-06, + "loss": 0.62868094, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.1350708, + "step": 8623, + "time_per_iteration": 2.6476001739501953 + }, + { + "auxiliary_loss_clip": 0.01123495, + "auxiliary_loss_mlp": 0.01035141, + "balance_loss_clip": 1.04369569, + "balance_loss_mlp": 1.02253437, + "epoch": 0.5185029310085676, + "flos": 25662053702880.0, + "grad_norm": 2.282878583001528, + "language_loss": 0.75641727, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.77800369, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.1260376, + "step": 8624, + "time_per_iteration": 4.105668067932129 + }, + { + "auxiliary_loss_clip": 0.01123621, + "auxiliary_loss_mlp": 0.01033663, + "balance_loss_clip": 1.04352236, + "balance_loss_mlp": 1.02143252, + "epoch": 0.5185630542612355, + "flos": 32387814135360.0, + "grad_norm": 2.355319257330352, + "language_loss": 0.70943129, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73100412, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12225342, + "step": 8625, + "time_per_iteration": 2.684419870376587 + }, + { + "auxiliary_loss_clip": 0.0112393, + "auxiliary_loss_mlp": 0.01033603, + "balance_loss_clip": 1.0437845, + "balance_loss_mlp": 1.02189112, + "epoch": 0.5186231775139035, + "flos": 25219587474720.0, + "grad_norm": 1.9485682538541707, + "language_loss": 0.68130684, + "learning_rate": 1.976438113333184e-06, + "loss": 0.70288217, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.1171875, + "step": 8626, + "time_per_iteration": 2.6928675174713135 + }, + { + "auxiliary_loss_clip": 0.01124295, + "auxiliary_loss_mlp": 0.01033609, + "balance_loss_clip": 1.04618549, + "balance_loss_mlp": 1.02088892, + "epoch": 0.5186833007665714, + "flos": 25484653935360.0, + "grad_norm": 2.8015471075791245, + "language_loss": 0.69898921, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.72056824, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.1272583, + "step": 8627, + "time_per_iteration": 2.6734867095947266 + }, + { + "auxiliary_loss_clip": 0.01126804, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.0447669, + "balance_loss_mlp": 1.02895355, + "epoch": 0.5187434240192395, + "flos": 25486234109280.0, + "grad_norm": 1.772854434633797, + "language_loss": 0.73042196, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.75210381, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.12420654, + "step": 8628, + "time_per_iteration": 2.7321863174438477 + }, + { + "auxiliary_loss_clip": 0.01123673, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.04556417, + "balance_loss_mlp": 1.02389133, + "epoch": 0.5188035472719074, + "flos": 24233536588320.0, + "grad_norm": 1.7911377800341062, + "language_loss": 0.77587628, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.79747629, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12426758, + "step": 8629, + "time_per_iteration": 2.678157091140747 + }, + { + "auxiliary_loss_clip": 0.01127914, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.0471679, + "balance_loss_mlp": 1.01851952, + "epoch": 0.5188636705245754, + "flos": 25793391673440.0, + "grad_norm": 2.1045711802649363, + "language_loss": 0.74882209, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.77042162, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13531494, + "step": 8630, + "time_per_iteration": 2.652523994445801 + }, + { + "auxiliary_loss_clip": 0.01125154, + "auxiliary_loss_mlp": 0.01040277, + "balance_loss_clip": 1.04386067, + "balance_loss_mlp": 1.0264492, + "epoch": 0.5189237937772434, + "flos": 27356731244640.0, + "grad_norm": 2.6882704632614565, + "language_loss": 0.80595833, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82761264, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13830566, + "step": 8631, + "time_per_iteration": 4.07621693611145 + }, + { + "auxiliary_loss_clip": 0.01125298, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.0444839, + "balance_loss_mlp": 1.02127171, + "epoch": 0.5189839170299113, + "flos": 31050529234560.0, + "grad_norm": 2.03033137657492, + "language_loss": 0.74531674, + "learning_rate": 1.974101522024942e-06, + "loss": 0.76691264, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13006592, + "step": 8632, + "time_per_iteration": 2.7262396812438965 + }, + { + "auxiliary_loss_clip": 0.0112015, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.04321361, + "balance_loss_mlp": 1.01918352, + "epoch": 0.5190440402825793, + "flos": 22680650475360.0, + "grad_norm": 2.108330537099221, + "language_loss": 0.78499585, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.80651367, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12451172, + "step": 8633, + "time_per_iteration": 2.6329398155212402 + }, + { + "auxiliary_loss_clip": 0.01125137, + "auxiliary_loss_mlp": 0.01032763, + "balance_loss_clip": 1.04462743, + "balance_loss_mlp": 1.02047229, + "epoch": 0.5191041635352472, + "flos": 26733016107360.0, + "grad_norm": 1.9777207794325118, + "language_loss": 0.802791, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.82437003, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12298584, + "step": 8634, + "time_per_iteration": 2.697030782699585 + }, + { + "auxiliary_loss_clip": 0.01121459, + "auxiliary_loss_mlp": 0.01037888, + "balance_loss_clip": 1.04397082, + "balance_loss_mlp": 1.02515066, + "epoch": 0.5191642867879153, + "flos": 33590114510400.0, + "grad_norm": 1.8473579723708236, + "language_loss": 0.6900847, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.71167815, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12738037, + "step": 8635, + "time_per_iteration": 2.703908920288086 + }, + { + "auxiliary_loss_clip": 0.01128409, + "auxiliary_loss_mlp": 0.01035026, + "balance_loss_clip": 1.04641008, + "balance_loss_mlp": 1.02238417, + "epoch": 0.5192244100405832, + "flos": 19164090183840.0, + "grad_norm": 1.7614334007744095, + "language_loss": 0.77644831, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.79808271, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.12652588, + "step": 8636, + "time_per_iteration": 2.678352117538452 + }, + { + "auxiliary_loss_clip": 0.01125686, + "auxiliary_loss_mlp": 0.01030734, + "balance_loss_clip": 1.04499245, + "balance_loss_mlp": 1.0178659, + "epoch": 0.5192845332932512, + "flos": 14711430461760.0, + "grad_norm": 3.0130784266483306, + "language_loss": 0.71643096, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.73799521, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.12872314, + "step": 8637, + "time_per_iteration": 2.6483325958251953 + }, + { + "auxiliary_loss_clip": 0.01121396, + "auxiliary_loss_mlp": 0.01032033, + "balance_loss_clip": 1.04399967, + "balance_loss_mlp": 1.01950979, + "epoch": 0.5193446565459191, + "flos": 23126560672320.0, + "grad_norm": 2.368046578786869, + "language_loss": 0.76303089, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.78456515, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12512207, + "step": 8638, + "time_per_iteration": 2.6807374954223633 + }, + { + "auxiliary_loss_clip": 0.01121822, + "auxiliary_loss_mlp": 0.01033606, + "balance_loss_clip": 1.04322457, + "balance_loss_mlp": 1.02135158, + "epoch": 0.5194047797985871, + "flos": 24861627591840.0, + "grad_norm": 2.123254128541379, + "language_loss": 0.74529648, + "learning_rate": 1.971375543740272e-06, + "loss": 0.76685071, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12255859, + "step": 8639, + "time_per_iteration": 2.67616605758667 + }, + { + "auxiliary_loss_clip": 0.01124017, + "auxiliary_loss_mlp": 0.01030055, + "balance_loss_clip": 1.04526687, + "balance_loss_mlp": 1.01673889, + "epoch": 0.519464903051255, + "flos": 29715837439680.0, + "grad_norm": 2.5646963687031414, + "language_loss": 0.77456528, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.79610598, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13311768, + "step": 8640, + "time_per_iteration": 2.738682508468628 + }, + { + "auxiliary_loss_clip": 0.01124046, + "auxiliary_loss_mlp": 0.01034759, + "balance_loss_clip": 1.04701257, + "balance_loss_mlp": 1.02262986, + "epoch": 0.519525026303923, + "flos": 17157960246240.0, + "grad_norm": 2.700161593129817, + "language_loss": 0.65866178, + "learning_rate": 1.97059670234927e-06, + "loss": 0.68024981, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12133789, + "step": 8641, + "time_per_iteration": 2.6817119121551514 + }, + { + "auxiliary_loss_clip": 0.01122328, + "auxiliary_loss_mlp": 0.01036042, + "balance_loss_clip": 1.04524136, + "balance_loss_mlp": 1.02428234, + "epoch": 0.519585149556591, + "flos": 35186631383520.0, + "grad_norm": 1.9607692671804837, + "language_loss": 0.76401365, + "learning_rate": 1.97020728331885e-06, + "loss": 0.78559732, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.11755371, + "step": 8642, + "time_per_iteration": 2.740476608276367 + }, + { + "auxiliary_loss_clip": 0.01121924, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.0443567, + "balance_loss_mlp": 1.02073407, + "epoch": 0.519645272809259, + "flos": 30960674608320.0, + "grad_norm": 1.7197688896951269, + "language_loss": 0.83289587, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.85444432, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12182617, + "step": 8643, + "time_per_iteration": 2.705376625061035 + }, + { + "auxiliary_loss_clip": 0.0112443, + "auxiliary_loss_mlp": 0.01038489, + "balance_loss_clip": 1.04384053, + "balance_loss_mlp": 1.02551877, + "epoch": 0.519705396061927, + "flos": 30960512539200.0, + "grad_norm": 1.6568948073628766, + "language_loss": 0.69915366, + "learning_rate": 1.969428448662004e-06, + "loss": 0.72078282, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12957764, + "step": 8644, + "time_per_iteration": 2.7258212566375732 + }, + { + "auxiliary_loss_clip": 0.01121958, + "auxiliary_loss_mlp": 0.01033561, + "balance_loss_clip": 1.04287601, + "balance_loss_mlp": 1.02124035, + "epoch": 0.5197655193145949, + "flos": 34747284985920.0, + "grad_norm": 1.6121623406102035, + "language_loss": 0.80293518, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.82449037, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12322998, + "step": 8645, + "time_per_iteration": 2.7533366680145264 + }, + { + "auxiliary_loss_clip": 0.01120723, + "auxiliary_loss_mlp": 0.0102949, + "balance_loss_clip": 1.04190135, + "balance_loss_mlp": 1.01703823, + "epoch": 0.5198256425672629, + "flos": 24415312222080.0, + "grad_norm": 2.2445546335265223, + "language_loss": 0.77817523, + "learning_rate": 1.968649618642264e-06, + "loss": 0.79967737, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12469482, + "step": 8646, + "time_per_iteration": 2.675459861755371 + }, + { + "auxiliary_loss_clip": 0.01124994, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.04574823, + "balance_loss_mlp": 1.01974332, + "epoch": 0.5198857658199308, + "flos": 24195132557280.0, + "grad_norm": 1.782876470281827, + "language_loss": 0.66139936, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.68296778, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12109375, + "step": 8647, + "time_per_iteration": 2.664642572402954 + }, + { + "auxiliary_loss_clip": 0.01125782, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.0440681, + "balance_loss_mlp": 1.01836324, + "epoch": 0.5199458890725989, + "flos": 29848796101440.0, + "grad_norm": 1.7091350812010964, + "language_loss": 0.71656764, + "learning_rate": 1.967870793377763e-06, + "loss": 0.73814732, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13824463, + "step": 8648, + "time_per_iteration": 2.683274269104004 + }, + { + "auxiliary_loss_clip": 0.01124149, + "auxiliary_loss_mlp": 0.01031876, + "balance_loss_clip": 1.04409933, + "balance_loss_mlp": 1.01842928, + "epoch": 0.5200060123252668, + "flos": 28558707481440.0, + "grad_norm": 1.9718974183830098, + "language_loss": 0.64063239, + "learning_rate": 1.967481382565642e-06, + "loss": 0.6621927, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13464355, + "step": 8649, + "time_per_iteration": 2.7344765663146973 + }, + { + "auxiliary_loss_clip": 0.01126448, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.04316914, + "balance_loss_mlp": 1.02287555, + "epoch": 0.5200661355779348, + "flos": 20988890177760.0, + "grad_norm": 1.7716680471802113, + "language_loss": 0.70433629, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.72597128, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.14172363, + "step": 8650, + "time_per_iteration": 2.6665256023406982 + }, + { + "auxiliary_loss_clip": 0.01121385, + "auxiliary_loss_mlp": 0.01028393, + "balance_loss_clip": 1.04290926, + "balance_loss_mlp": 1.01570272, + "epoch": 0.5201262588306027, + "flos": 22591970850240.0, + "grad_norm": 1.6692710869974814, + "language_loss": 0.77756071, + "learning_rate": 1.966702564655496e-06, + "loss": 0.79905856, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12677002, + "step": 8651, + "time_per_iteration": 2.764863967895508 + }, + { + "auxiliary_loss_clip": 0.01126826, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.04658675, + "balance_loss_mlp": 1.02430534, + "epoch": 0.5201863820832707, + "flos": 22719743300160.0, + "grad_norm": 1.9839575460468981, + "language_loss": 0.78612489, + "learning_rate": 1.966313157587003e-06, + "loss": 0.80776471, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.12841797, + "step": 8652, + "time_per_iteration": 2.6274256706237793 + }, + { + "auxiliary_loss_clip": 0.01124166, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.04467702, + "balance_loss_mlp": 1.01750803, + "epoch": 0.5202465053359386, + "flos": 27890470203840.0, + "grad_norm": 1.8294234487150887, + "language_loss": 0.70235646, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.72391093, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13769531, + "step": 8653, + "time_per_iteration": 2.690314769744873 + }, + { + "auxiliary_loss_clip": 0.01126261, + "auxiliary_loss_mlp": 0.01041172, + "balance_loss_clip": 1.04462874, + "balance_loss_mlp": 1.02811289, + "epoch": 0.5203066285886067, + "flos": 26821979353440.0, + "grad_norm": 2.026688201670099, + "language_loss": 0.78676796, + "learning_rate": 1.965534347297008e-06, + "loss": 0.80844235, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13037109, + "step": 8654, + "time_per_iteration": 2.626192569732666 + }, + { + "auxiliary_loss_clip": 0.01127621, + "auxiliary_loss_mlp": 0.01036259, + "balance_loss_clip": 1.0447613, + "balance_loss_mlp": 1.02329481, + "epoch": 0.5203667518412746, + "flos": 24688401104160.0, + "grad_norm": 1.85686770355754, + "language_loss": 0.84247589, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.8641147, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.12963867, + "step": 8655, + "time_per_iteration": 2.6844723224639893 + }, + { + "auxiliary_loss_clip": 0.01122058, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.04508114, + "balance_loss_mlp": 1.02138484, + "epoch": 0.5204268750939426, + "flos": 19163482424640.0, + "grad_norm": 3.609457801915303, + "language_loss": 0.6614179, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.68297535, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12316895, + "step": 8656, + "time_per_iteration": 2.6570982933044434 + }, + { + "auxiliary_loss_clip": 0.01124436, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.0456214, + "balance_loss_mlp": 1.01962829, + "epoch": 0.5204869983466105, + "flos": 33494830568640.0, + "grad_norm": 1.8095206066830107, + "language_loss": 0.73190415, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.75346941, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12457275, + "step": 8657, + "time_per_iteration": 4.135780334472656 + }, + { + "auxiliary_loss_clip": 0.01123194, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.04475307, + "balance_loss_mlp": 1.01925373, + "epoch": 0.5205471215992785, + "flos": 25130543194080.0, + "grad_norm": 2.0548034028188544, + "language_loss": 0.71472543, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.73628104, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.13098145, + "step": 8658, + "time_per_iteration": 4.037830591201782 + }, + { + "auxiliary_loss_clip": 0.01120847, + "auxiliary_loss_mlp": 0.01032208, + "balance_loss_clip": 1.04203975, + "balance_loss_mlp": 1.01969099, + "epoch": 0.5206072448519465, + "flos": 27000554122080.0, + "grad_norm": 2.0019899581417984, + "language_loss": 0.83197731, + "learning_rate": 1.963587344701897e-06, + "loss": 0.85350788, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12524414, + "step": 8659, + "time_per_iteration": 2.6389713287353516 + }, + { + "auxiliary_loss_clip": 0.01128796, + "auxiliary_loss_mlp": 0.01039628, + "balance_loss_clip": 1.04423046, + "balance_loss_mlp": 1.02540636, + "epoch": 0.5206673681046144, + "flos": 22366848077280.0, + "grad_norm": 2.3247358267747322, + "language_loss": 0.75655138, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.77823567, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 0.84472656, + "router_z_loss_mlp": 0.14208984, + "step": 8660, + "time_per_iteration": 2.594219923019409 + }, + { + "auxiliary_loss_clip": 0.01120827, + "auxiliary_loss_mlp": 0.01033385, + "balance_loss_clip": 1.04342985, + "balance_loss_mlp": 1.02160692, + "epoch": 0.5207274913572825, + "flos": 24684916618080.0, + "grad_norm": 2.729411522442914, + "language_loss": 0.77285838, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.79440057, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.11773682, + "step": 8661, + "time_per_iteration": 2.7140860557556152 + }, + { + "auxiliary_loss_clip": 0.01123687, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.04244137, + "balance_loss_mlp": 1.0191741, + "epoch": 0.5207876146099504, + "flos": 26999581707360.0, + "grad_norm": 1.972445642612774, + "language_loss": 0.70368963, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.72524059, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12231445, + "step": 8662, + "time_per_iteration": 2.6531689167022705 + }, + { + "auxiliary_loss_clip": 0.01119534, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.04263413, + "balance_loss_mlp": 1.01872253, + "epoch": 0.5208477378626184, + "flos": 29137292719200.0, + "grad_norm": 1.6427060349444804, + "language_loss": 0.69559515, + "learning_rate": 1.962029767391098e-06, + "loss": 0.71710831, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.13067627, + "step": 8663, + "time_per_iteration": 2.674156665802002 + }, + { + "auxiliary_loss_clip": 0.01122559, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.04427361, + "balance_loss_mlp": 1.01825333, + "epoch": 0.5209078611152863, + "flos": 25577142184800.0, + "grad_norm": 1.898408648810418, + "language_loss": 0.76712257, + "learning_rate": 1.961640376626072e-06, + "loss": 0.78865886, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12817383, + "step": 8664, + "time_per_iteration": 4.100133419036865 + }, + { + "auxiliary_loss_clip": 0.0112274, + "auxiliary_loss_mlp": 0.01036778, + "balance_loss_clip": 1.04339945, + "balance_loss_mlp": 1.02417111, + "epoch": 0.5209679843679543, + "flos": 25218817646400.0, + "grad_norm": 2.606634423717609, + "language_loss": 0.76115191, + "learning_rate": 1.961250987315646e-06, + "loss": 0.78274703, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12591553, + "step": 8665, + "time_per_iteration": 2.5939927101135254 + }, + { + "auxiliary_loss_clip": 0.01121808, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.0446924, + "balance_loss_mlp": 1.01878726, + "epoch": 0.5210281076206222, + "flos": 24681594201120.0, + "grad_norm": 2.689541914815724, + "language_loss": 0.7249161, + "learning_rate": 1.960861599474586e-06, + "loss": 0.74644107, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.11907959, + "step": 8666, + "time_per_iteration": 2.6348698139190674 + }, + { + "auxiliary_loss_clip": 0.0112848, + "auxiliary_loss_mlp": 0.01038049, + "balance_loss_clip": 1.04338229, + "balance_loss_mlp": 1.02350557, + "epoch": 0.5210882308732903, + "flos": 19608339172320.0, + "grad_norm": 2.167148830685864, + "language_loss": 0.67943561, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.70110089, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 0.85058594, + "router_z_loss_mlp": 0.14532471, + "step": 8667, + "time_per_iteration": 2.6005895137786865 + }, + { + "auxiliary_loss_clip": 0.01119647, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.04270697, + "balance_loss_mlp": 1.02005315, + "epoch": 0.5211483541259582, + "flos": 30292356296160.0, + "grad_norm": 1.4976449699780294, + "language_loss": 0.80889118, + "learning_rate": 1.960082828259629e-06, + "loss": 0.83041292, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12475586, + "step": 8668, + "time_per_iteration": 2.683210611343384 + }, + { + "auxiliary_loss_clip": 0.01121268, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.04336476, + "balance_loss_mlp": 1.01851833, + "epoch": 0.5212084773786262, + "flos": 24856765518240.0, + "grad_norm": 2.0319243335455837, + "language_loss": 0.63541079, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.65693396, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12536621, + "step": 8669, + "time_per_iteration": 2.6066157817840576 + }, + { + "auxiliary_loss_clip": 0.01123483, + "auxiliary_loss_mlp": 0.01032592, + "balance_loss_clip": 1.04559255, + "balance_loss_mlp": 1.02011073, + "epoch": 0.5212686006312941, + "flos": 28242636115680.0, + "grad_norm": 1.5650163241285797, + "language_loss": 0.6668703, + "learning_rate": 1.959304063099325e-06, + "loss": 0.68843102, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12469482, + "step": 8670, + "time_per_iteration": 4.021914720535278 + }, + { + "auxiliary_loss_clip": 0.01117544, + "auxiliary_loss_mlp": 0.01026833, + "balance_loss_clip": 1.04236197, + "balance_loss_mlp": 1.01492977, + "epoch": 0.5213287238839621, + "flos": 33891234999840.0, + "grad_norm": 2.169503514382362, + "language_loss": 0.76063108, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.78207481, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11901855, + "step": 8671, + "time_per_iteration": 2.6637089252471924 + }, + { + "auxiliary_loss_clip": 0.01129055, + "auxiliary_loss_mlp": 0.01037668, + "balance_loss_clip": 1.04804707, + "balance_loss_mlp": 1.02435279, + "epoch": 0.5213888471366301, + "flos": 24327969667200.0, + "grad_norm": 3.6031583791823767, + "language_loss": 0.78487569, + "learning_rate": 1.958525304111796e-06, + "loss": 0.80654287, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13336182, + "step": 8672, + "time_per_iteration": 2.6446447372436523 + }, + { + "auxiliary_loss_clip": 0.01117199, + "auxiliary_loss_mlp": 0.01030091, + "balance_loss_clip": 1.04048562, + "balance_loss_mlp": 1.01884985, + "epoch": 0.521448970389298, + "flos": 20722081474080.0, + "grad_norm": 2.184170153227921, + "language_loss": 0.72086787, + "learning_rate": 1.958135926969736e-06, + "loss": 0.7423408, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.11236572, + "step": 8673, + "time_per_iteration": 2.640073776245117 + }, + { + "auxiliary_loss_clip": 0.01121515, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.04328036, + "balance_loss_mlp": 1.02077889, + "epoch": 0.5215090936419661, + "flos": 23171123329920.0, + "grad_norm": 1.7380336999363917, + "language_loss": 0.75082254, + "learning_rate": 1.957746551415166e-06, + "loss": 0.77236968, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12432861, + "step": 8674, + "time_per_iteration": 2.7311458587646484 + }, + { + "auxiliary_loss_clip": 0.01121033, + "auxiliary_loss_mlp": 0.01034225, + "balance_loss_clip": 1.04101765, + "balance_loss_mlp": 1.02083826, + "epoch": 0.521569216894634, + "flos": 19697910177600.0, + "grad_norm": 2.1619313021894158, + "language_loss": 0.85987639, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88142896, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.1338501, + "step": 8675, + "time_per_iteration": 2.636502504348755 + }, + { + "auxiliary_loss_clip": 0.01039951, + "auxiliary_loss_mlp": 0.01002774, + "balance_loss_clip": 1.01507187, + "balance_loss_mlp": 1.00141025, + "epoch": 0.521629340147302, + "flos": 70259104123200.0, + "grad_norm": 0.87439815811195, + "language_loss": 0.63094974, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65137696, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 0.24853516, + "router_z_loss_mlp": 0.01364899, + "step": 8676, + "time_per_iteration": 3.23673152923584 + }, + { + "auxiliary_loss_clip": 0.01119605, + "auxiliary_loss_mlp": 0.01029553, + "balance_loss_clip": 1.0426836, + "balance_loss_mlp": 1.01747096, + "epoch": 0.5216894633999699, + "flos": 32698132047360.0, + "grad_norm": 2.3996010111451933, + "language_loss": 0.68559194, + "learning_rate": 1.956578434424046e-06, + "loss": 0.70708352, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12078857, + "step": 8677, + "time_per_iteration": 2.671069383621216 + }, + { + "auxiliary_loss_clip": 0.01121114, + "auxiliary_loss_mlp": 0.01029667, + "balance_loss_clip": 1.04350591, + "balance_loss_mlp": 1.01684022, + "epoch": 0.5217495866526379, + "flos": 32163461190720.0, + "grad_norm": 1.5839975028601254, + "language_loss": 0.64793658, + "learning_rate": 1.956189065367086e-06, + "loss": 0.66944444, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.1282959, + "step": 8678, + "time_per_iteration": 2.659726858139038 + }, + { + "auxiliary_loss_clip": 0.0112336, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.04210019, + "balance_loss_mlp": 1.01978898, + "epoch": 0.5218097099053058, + "flos": 28777712145120.0, + "grad_norm": 2.3441425900834347, + "language_loss": 0.6798774, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.70144159, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.1328125, + "step": 8679, + "time_per_iteration": 2.6422245502471924 + }, + { + "auxiliary_loss_clip": 0.01123796, + "auxiliary_loss_mlp": 0.01035765, + "balance_loss_clip": 1.04538083, + "balance_loss_mlp": 1.02324176, + "epoch": 0.5218698331579739, + "flos": 22057867235520.0, + "grad_norm": 1.9242822148210108, + "language_loss": 0.66572404, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.68731964, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12518311, + "step": 8680, + "time_per_iteration": 2.613352060317993 + }, + { + "auxiliary_loss_clip": 0.01123011, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.04412174, + "balance_loss_mlp": 1.02334118, + "epoch": 0.5219299564106418, + "flos": 23526773727840.0, + "grad_norm": 2.1158834720974564, + "language_loss": 0.83291209, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85450256, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12683105, + "step": 8681, + "time_per_iteration": 2.6084041595458984 + }, + { + "auxiliary_loss_clip": 0.01118502, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.04132187, + "balance_loss_mlp": 1.01984453, + "epoch": 0.5219900796633098, + "flos": 32520489176160.0, + "grad_norm": 3.362227155358691, + "language_loss": 0.7752139, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.79672152, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12408447, + "step": 8682, + "time_per_iteration": 2.657968521118164 + }, + { + "auxiliary_loss_clip": 0.01122739, + "auxiliary_loss_mlp": 0.01036631, + "balance_loss_clip": 1.04586291, + "balance_loss_mlp": 1.02500844, + "epoch": 0.5220502029159777, + "flos": 41869530884160.0, + "grad_norm": 1.7456551562803535, + "language_loss": 0.69197184, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71356553, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11621094, + "step": 8683, + "time_per_iteration": 2.770047187805176 + }, + { + "auxiliary_loss_clip": 0.0112113, + "auxiliary_loss_mlp": 0.01038643, + "balance_loss_clip": 1.04174948, + "balance_loss_mlp": 1.02532125, + "epoch": 0.5221103261686457, + "flos": 27036162460800.0, + "grad_norm": 3.292360781755029, + "language_loss": 0.76155037, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78314805, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13330078, + "step": 8684, + "time_per_iteration": 2.661522388458252 + }, + { + "auxiliary_loss_clip": 0.01116745, + "auxiliary_loss_mlp": 0.01028161, + "balance_loss_clip": 1.04116392, + "balance_loss_mlp": 1.0161984, + "epoch": 0.5221704494213137, + "flos": 23438013068160.0, + "grad_norm": 3.4177403571012612, + "language_loss": 0.75558889, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.77703792, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11962891, + "step": 8685, + "time_per_iteration": 2.598078727722168 + }, + { + "auxiliary_loss_clip": 0.01124775, + "auxiliary_loss_mlp": 0.01038796, + "balance_loss_clip": 1.04553628, + "balance_loss_mlp": 1.02652919, + "epoch": 0.5222305726739817, + "flos": 23615777491200.0, + "grad_norm": 2.676822587507147, + "language_loss": 0.81089079, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.83252645, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12261963, + "step": 8686, + "time_per_iteration": 2.639288902282715 + }, + { + "auxiliary_loss_clip": 0.01116564, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.04154968, + "balance_loss_mlp": 1.02049947, + "epoch": 0.5222906959266497, + "flos": 33940578696480.0, + "grad_norm": 1.801349158561327, + "language_loss": 0.69784486, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.71933353, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11798096, + "step": 8687, + "time_per_iteration": 2.6926801204681396 + }, + { + "auxiliary_loss_clip": 0.0111753, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.04120171, + "balance_loss_mlp": 1.01752567, + "epoch": 0.5223508191793176, + "flos": 15511897090080.0, + "grad_norm": 2.243138702392405, + "language_loss": 0.82686889, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.84833938, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11987305, + "step": 8688, + "time_per_iteration": 2.683203935623169 + }, + { + "auxiliary_loss_clip": 0.01120113, + "auxiliary_loss_mlp": 0.01028927, + "balance_loss_clip": 1.04275632, + "balance_loss_mlp": 1.01617789, + "epoch": 0.5224109424319856, + "flos": 19074154523040.0, + "grad_norm": 3.993161166458221, + "language_loss": 0.73471367, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.75620407, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12750244, + "step": 8689, + "time_per_iteration": 2.6300697326660156 + }, + { + "auxiliary_loss_clip": 0.01119485, + "auxiliary_loss_mlp": 0.01029342, + "balance_loss_clip": 1.04389477, + "balance_loss_mlp": 1.01713502, + "epoch": 0.5224710656846535, + "flos": 19208328703200.0, + "grad_norm": 2.642928388621966, + "language_loss": 0.82893038, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85041869, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12207031, + "step": 8690, + "time_per_iteration": 2.626612663269043 + }, + { + "auxiliary_loss_clip": 0.01121453, + "auxiliary_loss_mlp": 0.01032616, + "balance_loss_clip": 1.04262626, + "balance_loss_mlp": 1.01940131, + "epoch": 0.5225311889373215, + "flos": 31763491238880.0, + "grad_norm": 3.440315100135704, + "language_loss": 0.79125285, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.81279355, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13220215, + "step": 8691, + "time_per_iteration": 2.6400787830352783 + }, + { + "auxiliary_loss_clip": 0.01125132, + "auxiliary_loss_mlp": 0.01035176, + "balance_loss_clip": 1.04436338, + "balance_loss_mlp": 1.02171779, + "epoch": 0.5225913121899894, + "flos": 22414044358080.0, + "grad_norm": 2.588856439673243, + "language_loss": 0.76265824, + "learning_rate": 1.950738079725646e-06, + "loss": 0.78426135, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13458252, + "step": 8692, + "time_per_iteration": 2.619959592819214 + }, + { + "auxiliary_loss_clip": 0.0111943, + "auxiliary_loss_mlp": 0.01032395, + "balance_loss_clip": 1.04399157, + "balance_loss_mlp": 1.02079022, + "epoch": 0.5226514354426575, + "flos": 35720694480960.0, + "grad_norm": 1.9315618580489202, + "language_loss": 0.72578984, + "learning_rate": 1.950348737138691e-06, + "loss": 0.74730808, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.1159668, + "step": 8693, + "time_per_iteration": 2.6650099754333496 + }, + { + "auxiliary_loss_clip": 0.0112638, + "auxiliary_loss_mlp": 0.01035794, + "balance_loss_clip": 1.04461646, + "balance_loss_mlp": 1.02228796, + "epoch": 0.5227115586953254, + "flos": 27886134854880.0, + "grad_norm": 2.579986119665971, + "language_loss": 0.81883061, + "learning_rate": 1.949959396434517e-06, + "loss": 0.84045231, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13519287, + "step": 8694, + "time_per_iteration": 2.6882286071777344 + }, + { + "auxiliary_loss_clip": 0.0104168, + "auxiliary_loss_mlp": 0.00998175, + "balance_loss_clip": 1.01654351, + "balance_loss_mlp": 0.99680161, + "epoch": 0.5227716819479934, + "flos": 70130480810400.0, + "grad_norm": 0.7758947314537432, + "language_loss": 0.55644542, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57684398, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.25073242, + "router_z_loss_mlp": 0.01373291, + "step": 8695, + "time_per_iteration": 3.3384923934936523 + }, + { + "auxiliary_loss_clip": 0.01123324, + "auxiliary_loss_mlp": 0.01035597, + "balance_loss_clip": 1.04482651, + "balance_loss_mlp": 1.02336645, + "epoch": 0.5228318052006613, + "flos": 16848371645280.0, + "grad_norm": 2.110800581139008, + "language_loss": 0.73567474, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75726402, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12231445, + "step": 8696, + "time_per_iteration": 4.030356168746948 + }, + { + "auxiliary_loss_clip": 0.01123567, + "auxiliary_loss_mlp": 0.01036788, + "balance_loss_clip": 1.04522491, + "balance_loss_mlp": 1.02437258, + "epoch": 0.5228919284533293, + "flos": 19030118590080.0, + "grad_norm": 1.5774378129068292, + "language_loss": 0.71317476, + "learning_rate": 1.948791385766319e-06, + "loss": 0.73477829, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12420654, + "step": 8697, + "time_per_iteration": 4.051598310470581 + }, + { + "auxiliary_loss_clip": 0.01120196, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.04491425, + "balance_loss_mlp": 1.02002096, + "epoch": 0.5229520517059973, + "flos": 27444357420480.0, + "grad_norm": 1.7624619738392695, + "language_loss": 0.80257845, + "learning_rate": 1.948402052740906e-06, + "loss": 0.82410419, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12359619, + "step": 8698, + "time_per_iteration": 2.672611713409424 + }, + { + "auxiliary_loss_clip": 0.01121603, + "auxiliary_loss_mlp": 0.01033864, + "balance_loss_clip": 1.04393303, + "balance_loss_mlp": 1.02134144, + "epoch": 0.5230121749586653, + "flos": 26955424222560.0, + "grad_norm": 1.8540602292405175, + "language_loss": 0.74261171, + "learning_rate": 1.948012721672093e-06, + "loss": 0.76416636, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12524414, + "step": 8699, + "time_per_iteration": 2.595268964767456 + }, + { + "auxiliary_loss_clip": 0.01126884, + "auxiliary_loss_mlp": 0.01032937, + "balance_loss_clip": 1.04518008, + "balance_loss_mlp": 1.02011037, + "epoch": 0.5230722982113333, + "flos": 26999703259200.0, + "grad_norm": 1.8539649499598763, + "language_loss": 0.73274368, + "learning_rate": 1.947623392574642e-06, + "loss": 0.75434184, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.12835693, + "step": 8700, + "time_per_iteration": 2.640683174133301 + }, + { + "auxiliary_loss_clip": 0.01126324, + "auxiliary_loss_mlp": 0.01033746, + "balance_loss_clip": 1.04604077, + "balance_loss_mlp": 1.02035856, + "epoch": 0.5231324214640012, + "flos": 30516709240800.0, + "grad_norm": 2.2657819288365446, + "language_loss": 0.66460979, + "learning_rate": 1.947234065463318e-06, + "loss": 0.68621045, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.1340332, + "step": 8701, + "time_per_iteration": 2.669372320175171 + }, + { + "auxiliary_loss_clip": 0.01121544, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.04380941, + "balance_loss_mlp": 1.02377057, + "epoch": 0.5231925447166692, + "flos": 31408894290240.0, + "grad_norm": 1.8709014400277757, + "language_loss": 0.6679703, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.68954575, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12243652, + "step": 8702, + "time_per_iteration": 2.7116763591766357 + }, + { + "auxiliary_loss_clip": 0.01121639, + "auxiliary_loss_mlp": 0.0103599, + "balance_loss_clip": 1.04472375, + "balance_loss_mlp": 1.02310967, + "epoch": 0.5232526679693371, + "flos": 26153782593120.0, + "grad_norm": 2.0948761731876755, + "language_loss": 0.76777017, + "learning_rate": 1.946455417258101e-06, + "loss": 0.78934646, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12890625, + "step": 8703, + "time_per_iteration": 4.115009069442749 + }, + { + "auxiliary_loss_clip": 0.01128893, + "auxiliary_loss_mlp": 0.01038796, + "balance_loss_clip": 1.04482067, + "balance_loss_mlp": 1.02398992, + "epoch": 0.5233127912220051, + "flos": 43072276949280.0, + "grad_norm": 5.158678028375282, + "language_loss": 0.76802021, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.78969705, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.14807129, + "step": 8704, + "time_per_iteration": 2.769023895263672 + }, + { + "auxiliary_loss_clip": 0.01122014, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.04606819, + "balance_loss_mlp": 1.0259068, + "epoch": 0.523372914474673, + "flos": 20806182646560.0, + "grad_norm": 2.0616413174838115, + "language_loss": 0.77679443, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.79839182, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11810303, + "step": 8705, + "time_per_iteration": 2.6097192764282227 + }, + { + "auxiliary_loss_clip": 0.01126308, + "auxiliary_loss_mlp": 0.01033484, + "balance_loss_clip": 1.0455637, + "balance_loss_mlp": 1.02018619, + "epoch": 0.5234330377273411, + "flos": 22459174257600.0, + "grad_norm": 1.922369630857445, + "language_loss": 0.69307613, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.71467406, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.13287354, + "step": 8706, + "time_per_iteration": 2.593316078186035 + }, + { + "auxiliary_loss_clip": 0.01042481, + "auxiliary_loss_mlp": 0.01003055, + "balance_loss_clip": 1.01738441, + "balance_loss_mlp": 1.00175786, + "epoch": 0.523493160980009, + "flos": 80349469931520.0, + "grad_norm": 0.6800633049149246, + "language_loss": 0.52444261, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.54489791, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.25024414, + "router_z_loss_mlp": 0.0129776, + "step": 8707, + "time_per_iteration": 3.339855670928955 + }, + { + "auxiliary_loss_clip": 0.01122512, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.04372001, + "balance_loss_mlp": 1.02887011, + "epoch": 0.523553284232677, + "flos": 26688412932480.0, + "grad_norm": 2.2458815749100998, + "language_loss": 0.74910581, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.77074677, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12719727, + "step": 8708, + "time_per_iteration": 2.6169393062591553 + }, + { + "auxiliary_loss_clip": 0.01124161, + "auxiliary_loss_mlp": 0.01026471, + "balance_loss_clip": 1.04632783, + "balance_loss_mlp": 1.01379895, + "epoch": 0.5236134074853449, + "flos": 25439199897600.0, + "grad_norm": 1.5838071785498786, + "language_loss": 0.77414274, + "learning_rate": 1.944119521844849e-06, + "loss": 0.79564905, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12677002, + "step": 8709, + "time_per_iteration": 3.945485830307007 + }, + { + "auxiliary_loss_clip": 0.01128063, + "auxiliary_loss_mlp": 0.01038017, + "balance_loss_clip": 1.04357529, + "balance_loss_mlp": 1.02346134, + "epoch": 0.5236735307380129, + "flos": 31140424378080.0, + "grad_norm": 2.0888590647340286, + "language_loss": 0.84036267, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.86202353, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 0.84423828, + "router_z_loss_mlp": 0.14556885, + "step": 8710, + "time_per_iteration": 2.6976206302642822 + }, + { + "auxiliary_loss_clip": 0.01119951, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.04317474, + "balance_loss_mlp": 1.02076864, + "epoch": 0.523733653990681, + "flos": 28776699213120.0, + "grad_norm": 1.9598081056423315, + "language_loss": 0.6948508, + "learning_rate": 1.943340906834908e-06, + "loss": 0.71638215, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12426758, + "step": 8711, + "time_per_iteration": 2.628978729248047 + }, + { + "auxiliary_loss_clip": 0.01119944, + "auxiliary_loss_mlp": 0.01036084, + "balance_loss_clip": 1.04132915, + "balance_loss_mlp": 1.02326274, + "epoch": 0.5237937772433489, + "flos": 25753853158560.0, + "grad_norm": 1.8386181530958892, + "language_loss": 0.8312915, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.85285175, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12811279, + "step": 8712, + "time_per_iteration": 2.664607286453247 + }, + { + "auxiliary_loss_clip": 0.01123808, + "auxiliary_loss_mlp": 0.01042675, + "balance_loss_clip": 1.04300797, + "balance_loss_mlp": 1.02893007, + "epoch": 0.5238539004960169, + "flos": 23394584894400.0, + "grad_norm": 1.6779107748516229, + "language_loss": 0.69673145, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.71839619, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13757324, + "step": 8713, + "time_per_iteration": 2.6598079204559326 + }, + { + "auxiliary_loss_clip": 0.01125103, + "auxiliary_loss_mlp": 0.01034566, + "balance_loss_clip": 1.04189026, + "balance_loss_mlp": 1.02071357, + "epoch": 0.5239140237486848, + "flos": 21827639285280.0, + "grad_norm": 4.716713587294388, + "language_loss": 0.77474666, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.79634333, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.13867188, + "step": 8714, + "time_per_iteration": 2.6782681941986084 + }, + { + "auxiliary_loss_clip": 0.01126547, + "auxiliary_loss_mlp": 0.0103434, + "balance_loss_clip": 1.04617441, + "balance_loss_mlp": 1.02095306, + "epoch": 0.5239741470013528, + "flos": 21878603673120.0, + "grad_norm": 2.2110039825833407, + "language_loss": 0.76167953, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.78328842, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.13391113, + "step": 8715, + "time_per_iteration": 2.6785812377929688 + }, + { + "auxiliary_loss_clip": 0.0112061, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.0434618, + "balance_loss_mlp": 1.02294016, + "epoch": 0.5240342702540207, + "flos": 37818948012480.0, + "grad_norm": 1.765107926277529, + "language_loss": 0.71272111, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.73428398, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.1272583, + "step": 8716, + "time_per_iteration": 2.7734429836273193 + }, + { + "auxiliary_loss_clip": 0.01122735, + "auxiliary_loss_mlp": 0.01038449, + "balance_loss_clip": 1.0439328, + "balance_loss_mlp": 1.02639675, + "epoch": 0.5240943935066887, + "flos": 30511968719040.0, + "grad_norm": 2.5833496356605106, + "language_loss": 0.8665297, + "learning_rate": 1.941005113841926e-06, + "loss": 0.88814157, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.1204834, + "step": 8717, + "time_per_iteration": 2.63860821723938 + }, + { + "auxiliary_loss_clip": 0.01123142, + "auxiliary_loss_mlp": 0.01032455, + "balance_loss_clip": 1.04422128, + "balance_loss_mlp": 1.02010465, + "epoch": 0.5241545167593566, + "flos": 28869146945280.0, + "grad_norm": 2.1852800708048123, + "language_loss": 0.61290985, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.63446581, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12353516, + "step": 8718, + "time_per_iteration": 2.7067954540252686 + }, + { + "auxiliary_loss_clip": 0.01126021, + "auxiliary_loss_mlp": 0.01036288, + "balance_loss_clip": 1.04537296, + "balance_loss_mlp": 1.02352035, + "epoch": 0.5242146400120247, + "flos": 28553764373280.0, + "grad_norm": 2.3967825237445393, + "language_loss": 0.72254068, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74416375, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12768555, + "step": 8719, + "time_per_iteration": 2.6609580516815186 + }, + { + "auxiliary_loss_clip": 0.01120086, + "auxiliary_loss_mlp": 0.01030651, + "balance_loss_clip": 1.04360509, + "balance_loss_mlp": 1.01897991, + "epoch": 0.5242747632646926, + "flos": 21657208489920.0, + "grad_norm": 1.8619068914984356, + "language_loss": 0.7276355, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.74914294, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11682129, + "step": 8720, + "time_per_iteration": 2.6607723236083984 + }, + { + "auxiliary_loss_clip": 0.01123989, + "auxiliary_loss_mlp": 0.01040806, + "balance_loss_clip": 1.04529071, + "balance_loss_mlp": 1.02796769, + "epoch": 0.5243348865173606, + "flos": 39776787702720.0, + "grad_norm": 1.5850042463871448, + "language_loss": 0.69983149, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72147942, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.128479, + "step": 8721, + "time_per_iteration": 2.7254955768585205 + }, + { + "auxiliary_loss_clip": 0.01122216, + "auxiliary_loss_mlp": 0.01033187, + "balance_loss_clip": 1.04387915, + "balance_loss_mlp": 1.02080727, + "epoch": 0.5243950097700285, + "flos": 31096104824160.0, + "grad_norm": 1.9213313565919765, + "language_loss": 0.86430168, + "learning_rate": 1.939058681065813e-06, + "loss": 0.88585579, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12390137, + "step": 8722, + "time_per_iteration": 2.6252009868621826 + }, + { + "auxiliary_loss_clip": 0.01121317, + "auxiliary_loss_mlp": 0.01034488, + "balance_loss_clip": 1.04415131, + "balance_loss_mlp": 1.0211184, + "epoch": 0.5244551330226965, + "flos": 18630067603680.0, + "grad_norm": 1.7172510046799228, + "language_loss": 0.79975575, + "learning_rate": 1.938669401384247e-06, + "loss": 0.82131386, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.13372803, + "step": 8723, + "time_per_iteration": 2.6274044513702393 + }, + { + "auxiliary_loss_clip": 0.01127015, + "auxiliary_loss_mlp": 0.01039904, + "balance_loss_clip": 1.04714441, + "balance_loss_mlp": 1.02646291, + "epoch": 0.5245152562753645, + "flos": 27133836922080.0, + "grad_norm": 1.8529694164669928, + "language_loss": 0.74563479, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.76730394, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.13433838, + "step": 8724, + "time_per_iteration": 2.623687267303467 + }, + { + "auxiliary_loss_clip": 0.01127837, + "auxiliary_loss_mlp": 0.01036368, + "balance_loss_clip": 1.04459167, + "balance_loss_mlp": 1.02224731, + "epoch": 0.5245753795280325, + "flos": 35906683911840.0, + "grad_norm": 3.0524257710003084, + "language_loss": 0.70333844, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.72498047, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 0.14117432, + "step": 8725, + "time_per_iteration": 2.7068095207214355 + }, + { + "auxiliary_loss_clip": 0.01042409, + "auxiliary_loss_mlp": 0.00999774, + "balance_loss_clip": 1.01735282, + "balance_loss_mlp": 0.998362, + "epoch": 0.5246355027807005, + "flos": 71789104323360.0, + "grad_norm": 0.7553596042989261, + "language_loss": 0.5565865, + "learning_rate": 1.937501576352568e-06, + "loss": 0.57700837, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01412201, + "step": 8726, + "time_per_iteration": 3.2692439556121826 + }, + { + "auxiliary_loss_clip": 0.01043019, + "auxiliary_loss_mlp": 0.01000602, + "balance_loss_clip": 1.0178901, + "balance_loss_mlp": 0.99921113, + "epoch": 0.5246956260333684, + "flos": 78735125455200.0, + "grad_norm": 0.7927954351803828, + "language_loss": 0.58378321, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60421938, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.25097656, + "router_z_loss_mlp": 0.01390839, + "step": 8727, + "time_per_iteration": 3.2380547523498535 + }, + { + "auxiliary_loss_clip": 0.01126213, + "auxiliary_loss_mlp": 0.01037539, + "balance_loss_clip": 1.04475439, + "balance_loss_mlp": 1.02335334, + "epoch": 0.5247557492860364, + "flos": 29936422277280.0, + "grad_norm": 5.53268088460099, + "language_loss": 0.70737195, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.72900945, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.14196777, + "step": 8728, + "time_per_iteration": 2.6886744499206543 + }, + { + "auxiliary_loss_clip": 0.01121212, + "auxiliary_loss_mlp": 0.01029108, + "balance_loss_clip": 1.04285073, + "balance_loss_mlp": 1.01733601, + "epoch": 0.5248158725387043, + "flos": 22947216075360.0, + "grad_norm": 1.4363589878719798, + "language_loss": 0.69453114, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.71603435, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.11779785, + "step": 8729, + "time_per_iteration": 2.6223583221435547 + }, + { + "auxiliary_loss_clip": 0.01125845, + "auxiliary_loss_mlp": 0.01032371, + "balance_loss_clip": 1.04547024, + "balance_loss_mlp": 1.01955581, + "epoch": 0.5248759957913723, + "flos": 25570740454560.0, + "grad_norm": 2.010507387455593, + "language_loss": 0.83941644, + "learning_rate": 1.935944509558464e-06, + "loss": 0.86099863, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12817383, + "step": 8730, + "time_per_iteration": 2.654233455657959 + }, + { + "auxiliary_loss_clip": 0.01123714, + "auxiliary_loss_mlp": 0.01031596, + "balance_loss_clip": 1.04487693, + "balance_loss_mlp": 1.01873326, + "epoch": 0.5249361190440403, + "flos": 22770018894240.0, + "grad_norm": 3.033865028993372, + "language_loss": 0.79578656, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.81733966, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12860107, + "step": 8731, + "time_per_iteration": 2.6010231971740723 + }, + { + "auxiliary_loss_clip": 0.01118905, + "auxiliary_loss_mlp": 0.01034338, + "balance_loss_clip": 1.04314566, + "balance_loss_mlp": 1.02137387, + "epoch": 0.5249962422967083, + "flos": 30338620679520.0, + "grad_norm": 1.7192520356197942, + "language_loss": 0.83264136, + "learning_rate": 1.935165990676312e-06, + "loss": 0.85417378, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12963867, + "step": 8732, + "time_per_iteration": 2.737894296646118 + }, + { + "auxiliary_loss_clip": 0.01123447, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.04497385, + "balance_loss_mlp": 1.01961184, + "epoch": 0.5250563655493762, + "flos": 18623625356160.0, + "grad_norm": 1.5918890617554349, + "language_loss": 0.77620691, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.79775923, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12176514, + "step": 8733, + "time_per_iteration": 2.6130096912384033 + }, + { + "auxiliary_loss_clip": 0.01127727, + "auxiliary_loss_mlp": 0.01035341, + "balance_loss_clip": 1.04671049, + "balance_loss_mlp": 1.02184629, + "epoch": 0.5251164888020442, + "flos": 22726266582240.0, + "grad_norm": 2.351097158675609, + "language_loss": 0.81795716, + "learning_rate": 1.934387481628208e-06, + "loss": 0.83958781, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13500977, + "step": 8734, + "time_per_iteration": 2.7195630073547363 + }, + { + "auxiliary_loss_clip": 0.01120023, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.04410326, + "balance_loss_mlp": 1.01641166, + "epoch": 0.5251766120547121, + "flos": 36254311888320.0, + "grad_norm": 3.552854045039133, + "language_loss": 0.76734406, + "learning_rate": 1.933998230828826e-06, + "loss": 0.78883338, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12493896, + "step": 8735, + "time_per_iteration": 2.687230348587036 + }, + { + "auxiliary_loss_clip": 0.01125193, + "auxiliary_loss_mlp": 0.01034668, + "balance_loss_clip": 1.04578662, + "balance_loss_mlp": 1.02300978, + "epoch": 0.5252367353073801, + "flos": 28602135655200.0, + "grad_norm": 2.5317134681618243, + "language_loss": 0.80322707, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.82482576, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.11663818, + "step": 8736, + "time_per_iteration": 4.0979979038238525 + }, + { + "auxiliary_loss_clip": 0.0112543, + "auxiliary_loss_mlp": 0.01030756, + "balance_loss_clip": 1.04624689, + "balance_loss_mlp": 1.01795924, + "epoch": 0.5252968585600482, + "flos": 37596053689920.0, + "grad_norm": 2.0314354536468016, + "language_loss": 0.69758457, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.71914643, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12811279, + "step": 8737, + "time_per_iteration": 4.054887533187866 + }, + { + "auxiliary_loss_clip": 0.01121554, + "auxiliary_loss_mlp": 0.01033995, + "balance_loss_clip": 1.04232454, + "balance_loss_mlp": 1.0215075, + "epoch": 0.5253569818127161, + "flos": 25170932571840.0, + "grad_norm": 1.5289341699930061, + "language_loss": 0.77480114, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79635662, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.125, + "step": 8738, + "time_per_iteration": 2.673466444015503 + }, + { + "auxiliary_loss_clip": 0.01040304, + "auxiliary_loss_mlp": 0.01001526, + "balance_loss_clip": 1.01541662, + "balance_loss_mlp": 1.000144, + "epoch": 0.5254171050653841, + "flos": 77395693138560.0, + "grad_norm": 0.9017586088447815, + "language_loss": 0.54522783, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56564605, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.2487793, + "router_z_loss_mlp": 0.01383972, + "step": 8739, + "time_per_iteration": 3.247492551803589 + }, + { + "auxiliary_loss_clip": 0.01123297, + "auxiliary_loss_mlp": 0.01034867, + "balance_loss_clip": 1.04518604, + "balance_loss_mlp": 1.02280879, + "epoch": 0.525477228318052, + "flos": 42307094521440.0, + "grad_norm": 1.7828774266034382, + "language_loss": 0.84725821, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.86883986, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.1206665, + "step": 8740, + "time_per_iteration": 2.7967746257781982 + }, + { + "auxiliary_loss_clip": 0.01122443, + "auxiliary_loss_mlp": 0.01039986, + "balance_loss_clip": 1.04452372, + "balance_loss_mlp": 1.02693915, + "epoch": 0.52553735157072, + "flos": 21879089880480.0, + "grad_norm": 3.0372510040990313, + "language_loss": 0.69434947, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.71597379, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.1305542, + "step": 8741, + "time_per_iteration": 2.672276735305786 + }, + { + "auxiliary_loss_clip": 0.01128463, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.04757547, + "balance_loss_mlp": 1.02246261, + "epoch": 0.5255974748233879, + "flos": 12129065288640.0, + "grad_norm": 2.4732855168142187, + "language_loss": 0.6615361, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68317872, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13342285, + "step": 8742, + "time_per_iteration": 4.178268671035767 + }, + { + "auxiliary_loss_clip": 0.01127542, + "auxiliary_loss_mlp": 0.01036101, + "balance_loss_clip": 1.04559171, + "balance_loss_mlp": 1.02246928, + "epoch": 0.5256575980760559, + "flos": 20583045220320.0, + "grad_norm": 2.2995750779766317, + "language_loss": 0.63089794, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.65253437, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13641357, + "step": 8743, + "time_per_iteration": 2.67863392829895 + }, + { + "auxiliary_loss_clip": 0.01040646, + "auxiliary_loss_mlp": 0.01003141, + "balance_loss_clip": 1.01587915, + "balance_loss_mlp": 1.0017451, + "epoch": 0.5257177213287239, + "flos": 76122135568800.0, + "grad_norm": 0.7712560112060586, + "language_loss": 0.54123706, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56167495, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.01395416, + "step": 8744, + "time_per_iteration": 3.335181713104248 + }, + { + "auxiliary_loss_clip": 0.01131135, + "auxiliary_loss_mlp": 0.01036282, + "balance_loss_clip": 1.04779553, + "balance_loss_mlp": 1.02217925, + "epoch": 0.5257778445813919, + "flos": 25352059929120.0, + "grad_norm": 2.3108807998772245, + "language_loss": 0.75514162, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.77681577, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 0.83300781, + "router_z_loss_mlp": 0.14099121, + "step": 8745, + "time_per_iteration": 2.6575944423675537 + }, + { + "auxiliary_loss_clip": 0.01125769, + "auxiliary_loss_mlp": 0.01036602, + "balance_loss_clip": 1.04671252, + "balance_loss_mlp": 1.02446067, + "epoch": 0.5258379678340598, + "flos": 20765914820640.0, + "grad_norm": 2.3188619463647524, + "language_loss": 0.80833113, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.82995486, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12139893, + "step": 8746, + "time_per_iteration": 2.6018917560577393 + }, + { + "auxiliary_loss_clip": 0.0112377, + "auxiliary_loss_mlp": 0.01037086, + "balance_loss_clip": 1.04511452, + "balance_loss_mlp": 1.02365732, + "epoch": 0.5258980910867278, + "flos": 25708196534400.0, + "grad_norm": 2.1432015412885357, + "language_loss": 0.75266486, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.7742734, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13433838, + "step": 8747, + "time_per_iteration": 2.724156379699707 + }, + { + "auxiliary_loss_clip": 0.01120213, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_clip": 1.04474258, + "balance_loss_mlp": 1.0218389, + "epoch": 0.5259582143393957, + "flos": 21968863472160.0, + "grad_norm": 1.8101963043273233, + "language_loss": 0.82644886, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.84799588, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12652588, + "step": 8748, + "time_per_iteration": 2.603400945663452 + }, + { + "auxiliary_loss_clip": 0.01123906, + "auxiliary_loss_mlp": 0.01038643, + "balance_loss_clip": 1.04351854, + "balance_loss_mlp": 1.02472496, + "epoch": 0.5260183375920637, + "flos": 27801061267680.0, + "grad_norm": 1.8590636876295101, + "language_loss": 0.80205965, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.82368505, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.13922119, + "step": 8749, + "time_per_iteration": 4.0286853313446045 + }, + { + "auxiliary_loss_clip": 0.01125548, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.04705501, + "balance_loss_mlp": 1.02267778, + "epoch": 0.5260784608447318, + "flos": 33009908581440.0, + "grad_norm": 7.233987727072277, + "language_loss": 0.72634119, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.74795461, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.13104248, + "step": 8750, + "time_per_iteration": 2.695706367492676 + }, + { + "auxiliary_loss_clip": 0.01122729, + "auxiliary_loss_mlp": 0.01036514, + "balance_loss_clip": 1.04377842, + "balance_loss_mlp": 1.0244503, + "epoch": 0.5261385840973997, + "flos": 25213266779040.0, + "grad_norm": 2.135620237556267, + "language_loss": 0.76214182, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.78373432, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.1206665, + "step": 8751, + "time_per_iteration": 2.627321720123291 + }, + { + "auxiliary_loss_clip": 0.01122106, + "auxiliary_loss_mlp": 0.01036751, + "balance_loss_clip": 1.04568481, + "balance_loss_mlp": 1.02456212, + "epoch": 0.5261987073500677, + "flos": 28824665322240.0, + "grad_norm": 1.9130637335348173, + "language_loss": 0.76030087, + "learning_rate": 1.927381362210902e-06, + "loss": 0.78188944, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.1217041, + "step": 8752, + "time_per_iteration": 2.6815099716186523 + }, + { + "auxiliary_loss_clip": 0.0112666, + "auxiliary_loss_mlp": 0.01033127, + "balance_loss_clip": 1.04587483, + "balance_loss_mlp": 1.0196147, + "epoch": 0.5262588306027356, + "flos": 33722343861120.0, + "grad_norm": 2.2222464016673285, + "language_loss": 0.6799922, + "learning_rate": 1.926992158720058e-06, + "loss": 0.70159006, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 0.80908203, + "router_z_loss_mlp": 0.13519287, + "step": 8753, + "time_per_iteration": 2.6743009090423584 + }, + { + "auxiliary_loss_clip": 0.01123417, + "auxiliary_loss_mlp": 0.0103805, + "balance_loss_clip": 1.04601443, + "balance_loss_mlp": 1.025491, + "epoch": 0.5263189538554036, + "flos": 26551564611840.0, + "grad_norm": 1.7023296693342176, + "language_loss": 0.84050918, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.86212385, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12554932, + "step": 8754, + "time_per_iteration": 2.670989513397217 + }, + { + "auxiliary_loss_clip": 0.0112547, + "auxiliary_loss_mlp": 0.0103591, + "balance_loss_clip": 1.04557109, + "balance_loss_mlp": 1.02318478, + "epoch": 0.5263790771080715, + "flos": 17420149980000.0, + "grad_norm": 3.836638500219489, + "language_loss": 0.87248647, + "learning_rate": 1.926213760058522e-06, + "loss": 0.89410031, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.1272583, + "step": 8755, + "time_per_iteration": 2.5739824771881104 + }, + { + "auxiliary_loss_clip": 0.01041713, + "auxiliary_loss_mlp": 0.01013792, + "balance_loss_clip": 1.0164777, + "balance_loss_mlp": 1.01240599, + "epoch": 0.5264392003607395, + "flos": 80296601231520.0, + "grad_norm": 0.7223873160504714, + "language_loss": 0.58816195, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.60871702, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.25219727, + "router_z_loss_mlp": 0.01386261, + "step": 8756, + "time_per_iteration": 3.3379175662994385 + }, + { + "auxiliary_loss_clip": 0.01126959, + "auxiliary_loss_mlp": 0.01034374, + "balance_loss_clip": 1.04501295, + "balance_loss_mlp": 1.02161288, + "epoch": 0.5264993236134075, + "flos": 25663836463200.0, + "grad_norm": 3.03239946509112, + "language_loss": 0.70907456, + "learning_rate": 1.925435372588913e-06, + "loss": 0.73068786, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.12768555, + "step": 8757, + "time_per_iteration": 2.657482624053955 + }, + { + "auxiliary_loss_clip": 0.01124046, + "auxiliary_loss_mlp": 0.01038494, + "balance_loss_clip": 1.044626, + "balance_loss_mlp": 1.02597094, + "epoch": 0.5265594468660755, + "flos": 20277224726400.0, + "grad_norm": 1.8008735843616954, + "language_loss": 0.87830275, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.89992821, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12530518, + "step": 8758, + "time_per_iteration": 2.6162447929382324 + }, + { + "auxiliary_loss_clip": 0.01127472, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.04570782, + "balance_loss_mlp": 1.0235672, + "epoch": 0.5266195701187434, + "flos": 29448745115040.0, + "grad_norm": 1.924614746142258, + "language_loss": 0.75791335, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.77956122, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.13751221, + "step": 8759, + "time_per_iteration": 2.6861164569854736 + }, + { + "auxiliary_loss_clip": 0.01119938, + "auxiliary_loss_mlp": 0.01027786, + "balance_loss_clip": 1.0438931, + "balance_loss_mlp": 1.01506686, + "epoch": 0.5266796933714114, + "flos": 19333345978080.0, + "grad_norm": 1.8153077214274445, + "language_loss": 0.70810539, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.72958261, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12731934, + "step": 8760, + "time_per_iteration": 2.613313674926758 + }, + { + "auxiliary_loss_clip": 0.01127816, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.04501843, + "balance_loss_mlp": 1.02373719, + "epoch": 0.5267398166240793, + "flos": 25565756829120.0, + "grad_norm": 3.1514277033957745, + "language_loss": 0.75842392, + "learning_rate": 1.923878631697736e-06, + "loss": 0.78007054, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 0.828125, + "router_z_loss_mlp": 0.13116455, + "step": 8761, + "time_per_iteration": 2.654320240020752 + }, + { + "auxiliary_loss_clip": 0.01123307, + "auxiliary_loss_mlp": 0.010282, + "balance_loss_clip": 1.04470849, + "balance_loss_mlp": 1.01636863, + "epoch": 0.5267999398767473, + "flos": 25619841047520.0, + "grad_norm": 2.0281169292734034, + "language_loss": 0.70440996, + "learning_rate": 1.923489453654373e-06, + "loss": 0.72592503, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.11834717, + "step": 8762, + "time_per_iteration": 2.656885862350464 + }, + { + "auxiliary_loss_clip": 0.01041862, + "auxiliary_loss_mlp": 0.01001552, + "balance_loss_clip": 1.01697803, + "balance_loss_mlp": 1.00016963, + "epoch": 0.5268600631294152, + "flos": 81570726043200.0, + "grad_norm": 0.9234682749524283, + "language_loss": 0.65433609, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67477024, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.2487793, + "router_z_loss_mlp": 0.01383972, + "step": 8763, + "time_per_iteration": 3.1940207481384277 + }, + { + "auxiliary_loss_clip": 0.01122427, + "auxiliary_loss_mlp": 0.01028051, + "balance_loss_clip": 1.04371667, + "balance_loss_mlp": 1.01503325, + "epoch": 0.5269201863820833, + "flos": 20945137865760.0, + "grad_norm": 1.6449620851866233, + "language_loss": 0.70874292, + "learning_rate": 1.922711106286265e-06, + "loss": 0.73024768, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13012695, + "step": 8764, + "time_per_iteration": 2.6067442893981934 + }, + { + "auxiliary_loss_clip": 0.01124475, + "auxiliary_loss_mlp": 0.01030199, + "balance_loss_clip": 1.04490924, + "balance_loss_mlp": 1.01694918, + "epoch": 0.5269803096347513, + "flos": 25041620465280.0, + "grad_norm": 6.691087148453161, + "language_loss": 0.74365288, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.7651996, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.13269043, + "step": 8765, + "time_per_iteration": 2.633258104324341 + }, + { + "auxiliary_loss_clip": 0.01124882, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.04174185, + "balance_loss_mlp": 1.02087748, + "epoch": 0.5270404328874192, + "flos": 33228183934080.0, + "grad_norm": 1.5134886024794467, + "language_loss": 0.85407615, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.8756727, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 0.83056641, + "router_z_loss_mlp": 0.13897705, + "step": 8766, + "time_per_iteration": 2.6951541900634766 + }, + { + "auxiliary_loss_clip": 0.01127138, + "auxiliary_loss_mlp": 0.01038873, + "balance_loss_clip": 1.04638362, + "balance_loss_mlp": 1.02488387, + "epoch": 0.5271005561400872, + "flos": 28199815701120.0, + "grad_norm": 3.4085701500814194, + "language_loss": 0.79390764, + "learning_rate": 1.921543607252017e-06, + "loss": 0.81556773, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13983154, + "step": 8767, + "time_per_iteration": 2.669090747833252 + }, + { + "auxiliary_loss_clip": 0.01129345, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.04791164, + "balance_loss_mlp": 1.021245, + "epoch": 0.5271606793927551, + "flos": 27533766356640.0, + "grad_norm": 2.054597391561109, + "language_loss": 0.7320286, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.75367689, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.14227295, + "step": 8768, + "time_per_iteration": 2.6335690021514893 + }, + { + "auxiliary_loss_clip": 0.0112495, + "auxiliary_loss_mlp": 0.01038198, + "balance_loss_clip": 1.04723454, + "balance_loss_mlp": 1.02604437, + "epoch": 0.5272208026454231, + "flos": 22896616343040.0, + "grad_norm": 1.8249299182838277, + "language_loss": 0.73967957, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.76131105, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12158203, + "step": 8769, + "time_per_iteration": 2.6196982860565186 + }, + { + "auxiliary_loss_clip": 0.01124022, + "auxiliary_loss_mlp": 0.01034882, + "balance_loss_clip": 1.04589891, + "balance_loss_mlp": 1.02163768, + "epoch": 0.5272809258980911, + "flos": 24906960077760.0, + "grad_norm": 4.254165426580445, + "language_loss": 0.7328406, + "learning_rate": 1.920376134993436e-06, + "loss": 0.75442964, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.13238525, + "step": 8770, + "time_per_iteration": 2.680544137954712 + }, + { + "auxiliary_loss_clip": 0.01128412, + "auxiliary_loss_mlp": 0.0103025, + "balance_loss_clip": 1.04837954, + "balance_loss_mlp": 1.01751816, + "epoch": 0.5273410491507591, + "flos": 34479503867520.0, + "grad_norm": 2.117753235816306, + "language_loss": 0.68544519, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.70703179, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.12731934, + "step": 8771, + "time_per_iteration": 2.6752841472625732 + }, + { + "auxiliary_loss_clip": 0.01125026, + "auxiliary_loss_mlp": 0.01037483, + "balance_loss_clip": 1.04749537, + "balance_loss_mlp": 1.02469206, + "epoch": 0.527401172403427, + "flos": 27399997349280.0, + "grad_norm": 7.983684164792596, + "language_loss": 0.76340806, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.78503323, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12805176, + "step": 8772, + "time_per_iteration": 2.638530731201172 + }, + { + "auxiliary_loss_clip": 0.01130141, + "auxiliary_loss_mlp": 0.01035821, + "balance_loss_clip": 1.04795349, + "balance_loss_mlp": 1.02237391, + "epoch": 0.527461295656095, + "flos": 25662337323840.0, + "grad_norm": 2.6289741485144225, + "language_loss": 0.66025311, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.68191272, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.13464355, + "step": 8773, + "time_per_iteration": 2.6297664642333984 + }, + { + "auxiliary_loss_clip": 0.01126344, + "auxiliary_loss_mlp": 0.01039133, + "balance_loss_clip": 1.04585791, + "balance_loss_mlp": 1.02665174, + "epoch": 0.5275214189087629, + "flos": 32118695946720.0, + "grad_norm": 6.146671388910612, + "language_loss": 0.86046875, + "learning_rate": 1.91881954765502e-06, + "loss": 0.88212347, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12481689, + "step": 8774, + "time_per_iteration": 2.7394168376922607 + }, + { + "auxiliary_loss_clip": 0.01123472, + "auxiliary_loss_mlp": 0.01028798, + "balance_loss_clip": 1.04520464, + "balance_loss_mlp": 1.01688313, + "epoch": 0.5275815421614309, + "flos": 24461414536320.0, + "grad_norm": 1.6625707130131422, + "language_loss": 0.80062008, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.82214272, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.11914062, + "step": 8775, + "time_per_iteration": 4.056457281112671 + }, + { + "auxiliary_loss_clip": 0.0112242, + "auxiliary_loss_mlp": 0.01038834, + "balance_loss_clip": 1.0449487, + "balance_loss_mlp": 1.02575684, + "epoch": 0.5276416654140988, + "flos": 26151837763680.0, + "grad_norm": 29.461137201751438, + "language_loss": 0.8336044, + "learning_rate": 1.918041272397012e-06, + "loss": 0.85521692, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.13061523, + "step": 8776, + "time_per_iteration": 4.008655071258545 + }, + { + "auxiliary_loss_clip": 0.01124315, + "auxiliary_loss_mlp": 0.01027301, + "balance_loss_clip": 1.04482806, + "balance_loss_mlp": 1.01441431, + "epoch": 0.5277017886667669, + "flos": 20944611141120.0, + "grad_norm": 1.8368386760898228, + "language_loss": 0.68335664, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.70487279, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12902832, + "step": 8777, + "time_per_iteration": 2.657212495803833 + }, + { + "auxiliary_loss_clip": 0.0112554, + "auxiliary_loss_mlp": 0.01036212, + "balance_loss_clip": 1.04820585, + "balance_loss_mlp": 1.02345037, + "epoch": 0.5277619119194349, + "flos": 24950914976160.0, + "grad_norm": 1.5617330677743448, + "language_loss": 0.82263696, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84425449, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12762451, + "step": 8778, + "time_per_iteration": 2.6650211811065674 + }, + { + "auxiliary_loss_clip": 0.01131647, + "auxiliary_loss_mlp": 0.01038662, + "balance_loss_clip": 1.05108118, + "balance_loss_mlp": 1.02541792, + "epoch": 0.5278220351721028, + "flos": 29359619799840.0, + "grad_norm": 8.479879285585794, + "language_loss": 0.79878926, + "learning_rate": 1.916873882856013e-06, + "loss": 0.82049233, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13238525, + "step": 8779, + "time_per_iteration": 2.813309907913208 + }, + { + "auxiliary_loss_clip": 0.01121357, + "auxiliary_loss_mlp": 0.01031717, + "balance_loss_clip": 1.04450929, + "balance_loss_mlp": 1.01903296, + "epoch": 0.5278821584247708, + "flos": 30069786111840.0, + "grad_norm": 2.8904502354704964, + "language_loss": 0.76314092, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.78467166, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12683105, + "step": 8780, + "time_per_iteration": 2.6678950786590576 + }, + { + "auxiliary_loss_clip": 0.01130929, + "auxiliary_loss_mlp": 0.01032274, + "balance_loss_clip": 1.04829824, + "balance_loss_mlp": 1.01840973, + "epoch": 0.5279422816774387, + "flos": 43206289060320.0, + "grad_norm": 1.5809151257018446, + "language_loss": 0.69378912, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71542114, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.1385498, + "step": 8781, + "time_per_iteration": 2.733853816986084 + }, + { + "auxiliary_loss_clip": 0.01120228, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_clip": 1.04382348, + "balance_loss_mlp": 1.02369952, + "epoch": 0.5280024049301068, + "flos": 28024522832160.0, + "grad_norm": 1.6744721311771125, + "language_loss": 0.72222739, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.7437833, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11663818, + "step": 8782, + "time_per_iteration": 4.058955430984497 + }, + { + "auxiliary_loss_clip": 0.01123299, + "auxiliary_loss_mlp": 0.01029836, + "balance_loss_clip": 1.04619575, + "balance_loss_mlp": 1.017349, + "epoch": 0.5280625281827747, + "flos": 26243637219360.0, + "grad_norm": 1.7889946176906133, + "language_loss": 0.68842137, + "learning_rate": 1.915317407666982e-06, + "loss": 0.70995271, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12481689, + "step": 8783, + "time_per_iteration": 2.624356746673584 + }, + { + "auxiliary_loss_clip": 0.01134699, + "auxiliary_loss_mlp": 0.01042179, + "balance_loss_clip": 1.05047727, + "balance_loss_mlp": 1.02733731, + "epoch": 0.5281226514354427, + "flos": 38081421367200.0, + "grad_norm": 1.8404308304553245, + "language_loss": 0.69064444, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71241319, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 0.84130859, + "router_z_loss_mlp": 0.14831543, + "step": 8784, + "time_per_iteration": 2.7421090602874756 + }, + { + "auxiliary_loss_clip": 0.01127247, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.04406977, + "balance_loss_mlp": 1.02045608, + "epoch": 0.5281827746881106, + "flos": 30600769896000.0, + "grad_norm": 2.0579608273326646, + "language_loss": 0.74757355, + "learning_rate": 1.91453918928048e-06, + "loss": 0.76919186, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.14123535, + "step": 8785, + "time_per_iteration": 2.6312880516052246 + }, + { + "auxiliary_loss_clip": 0.01129713, + "auxiliary_loss_mlp": 0.0103282, + "balance_loss_clip": 1.04895067, + "balance_loss_mlp": 1.0193789, + "epoch": 0.5282428979407786, + "flos": 25174295506080.0, + "grad_norm": 1.6425901109859296, + "language_loss": 0.83444762, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.8560729, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.13439941, + "step": 8786, + "time_per_iteration": 2.6497390270233154 + }, + { + "auxiliary_loss_clip": 0.01119457, + "auxiliary_loss_mlp": 0.01026336, + "balance_loss_clip": 1.04396844, + "balance_loss_mlp": 1.01468372, + "epoch": 0.5283030211934465, + "flos": 27356285554560.0, + "grad_norm": 2.4115896654018147, + "language_loss": 0.82722449, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.84868246, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11657715, + "step": 8787, + "time_per_iteration": 2.6410937309265137 + }, + { + "auxiliary_loss_clip": 0.01121177, + "auxiliary_loss_mlp": 0.01026678, + "balance_loss_clip": 1.04469633, + "balance_loss_mlp": 1.01507282, + "epoch": 0.5283631444461145, + "flos": 28816561866240.0, + "grad_norm": 2.146848978616126, + "language_loss": 0.83423907, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.8557176, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.1161499, + "step": 8788, + "time_per_iteration": 2.7209179401397705 + }, + { + "auxiliary_loss_clip": 0.01125451, + "auxiliary_loss_mlp": 0.01037815, + "balance_loss_clip": 1.04786777, + "balance_loss_mlp": 1.02319944, + "epoch": 0.5284232676987825, + "flos": 39867290605440.0, + "grad_norm": 1.615044924001856, + "language_loss": 0.75070864, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.77234137, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.1463623, + "step": 8789, + "time_per_iteration": 4.082506895065308 + }, + { + "auxiliary_loss_clip": 0.0112613, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.04658747, + "balance_loss_mlp": 1.01899958, + "epoch": 0.5284833909514505, + "flos": 32654177148960.0, + "grad_norm": 1.5255032530477934, + "language_loss": 0.69884771, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.72042203, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12304688, + "step": 8790, + "time_per_iteration": 2.6895689964294434 + }, + { + "auxiliary_loss_clip": 0.0112155, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.0452981, + "balance_loss_mlp": 1.01586783, + "epoch": 0.5285435142041185, + "flos": 26955221636160.0, + "grad_norm": 1.6707138387177276, + "language_loss": 0.79080963, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.8123017, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11798096, + "step": 8791, + "time_per_iteration": 2.6401073932647705 + }, + { + "auxiliary_loss_clip": 0.01124829, + "auxiliary_loss_mlp": 0.01029815, + "balance_loss_clip": 1.04792941, + "balance_loss_mlp": 1.01739359, + "epoch": 0.5286036374567864, + "flos": 24862194833760.0, + "grad_norm": 2.2242843015790057, + "language_loss": 0.66195989, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68350637, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12420654, + "step": 8792, + "time_per_iteration": 2.612215280532837 + }, + { + "auxiliary_loss_clip": 0.0112007, + "auxiliary_loss_mlp": 0.01034126, + "balance_loss_clip": 1.04316974, + "balance_loss_mlp": 1.02263975, + "epoch": 0.5286637607094544, + "flos": 29714905542240.0, + "grad_norm": 2.01517027993581, + "language_loss": 0.79351741, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.81505936, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.1149292, + "step": 8793, + "time_per_iteration": 2.643902540206909 + }, + { + "auxiliary_loss_clip": 0.01124667, + "auxiliary_loss_mlp": 0.01040036, + "balance_loss_clip": 1.04656196, + "balance_loss_mlp": 1.02738178, + "epoch": 0.5287238839621223, + "flos": 21074368937760.0, + "grad_norm": 1.8872061735399828, + "language_loss": 0.8480823, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.86972934, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12658691, + "step": 8794, + "time_per_iteration": 2.5987112522125244 + }, + { + "auxiliary_loss_clip": 0.01128879, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.04652643, + "balance_loss_mlp": 1.02170205, + "epoch": 0.5287840072147904, + "flos": 21434881409280.0, + "grad_norm": 2.208940576370507, + "language_loss": 0.67259824, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.69423473, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.1307373, + "step": 8795, + "time_per_iteration": 2.599296808242798 + }, + { + "auxiliary_loss_clip": 0.01124765, + "auxiliary_loss_mlp": 0.0103369, + "balance_loss_clip": 1.04520655, + "balance_loss_mlp": 1.02126801, + "epoch": 0.5288441304674583, + "flos": 22637100749760.0, + "grad_norm": 2.4704850851049804, + "language_loss": 0.80837953, + "learning_rate": 1.910259223028374e-06, + "loss": 0.8299641, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12414551, + "step": 8796, + "time_per_iteration": 2.653315544128418 + }, + { + "auxiliary_loss_clip": 0.0112641, + "auxiliary_loss_mlp": 0.01036364, + "balance_loss_clip": 1.04718804, + "balance_loss_mlp": 1.02303624, + "epoch": 0.5289042537201263, + "flos": 25397838105120.0, + "grad_norm": 1.7314394075624193, + "language_loss": 0.68465137, + "learning_rate": 1.909870155310071e-06, + "loss": 0.70627916, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13336182, + "step": 8797, + "time_per_iteration": 2.6096408367156982 + }, + { + "auxiliary_loss_clip": 0.01118887, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.04431939, + "balance_loss_mlp": 1.02156925, + "epoch": 0.5289643769727942, + "flos": 19200468350880.0, + "grad_norm": 1.5533832750595227, + "language_loss": 0.82317269, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84469855, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12139893, + "step": 8798, + "time_per_iteration": 2.61912202835083 + }, + { + "auxiliary_loss_clip": 0.01126916, + "auxiliary_loss_mlp": 0.0103657, + "balance_loss_clip": 1.04595006, + "balance_loss_mlp": 1.02338004, + "epoch": 0.5290245002254622, + "flos": 23839320090240.0, + "grad_norm": 2.1120049042311595, + "language_loss": 0.70471376, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.72634864, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13208008, + "step": 8799, + "time_per_iteration": 2.601903200149536 + }, + { + "auxiliary_loss_clip": 0.01120168, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.04604292, + "balance_loss_mlp": 1.02154875, + "epoch": 0.5290846234781301, + "flos": 19297291949280.0, + "grad_norm": 1.9247065554777694, + "language_loss": 0.69145918, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71299326, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11688232, + "step": 8800, + "time_per_iteration": 2.692396640777588 + }, + { + "auxiliary_loss_clip": 0.01041843, + "auxiliary_loss_mlp": 0.00999736, + "balance_loss_clip": 1.01682186, + "balance_loss_mlp": 0.99821794, + "epoch": 0.5291447467307981, + "flos": 85485878699040.0, + "grad_norm": 1.0111193416998612, + "language_loss": 0.57010031, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.59051609, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.24975586, + "router_z_loss_mlp": 0.01516724, + "step": 8801, + "time_per_iteration": 3.185608148574829 + }, + { + "auxiliary_loss_clip": 0.01125476, + "auxiliary_loss_mlp": 0.0103494, + "balance_loss_clip": 1.04593861, + "balance_loss_mlp": 1.02291226, + "epoch": 0.529204869983466, + "flos": 34610801320800.0, + "grad_norm": 1.5995962104161785, + "language_loss": 0.63835025, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.65995437, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12023926, + "step": 8802, + "time_per_iteration": 2.7741124629974365 + }, + { + "auxiliary_loss_clip": 0.01121898, + "auxiliary_loss_mlp": 0.01029602, + "balance_loss_clip": 1.04528427, + "balance_loss_mlp": 1.01691759, + "epoch": 0.5292649932361341, + "flos": 41194243599840.0, + "grad_norm": 1.7602156036210344, + "language_loss": 0.69222116, + "learning_rate": 1.907535821289003e-06, + "loss": 0.71373618, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12677002, + "step": 8803, + "time_per_iteration": 2.7721736431121826 + }, + { + "auxiliary_loss_clip": 0.01117542, + "auxiliary_loss_mlp": 0.01032899, + "balance_loss_clip": 1.04310739, + "balance_loss_mlp": 1.02046585, + "epoch": 0.5293251164888021, + "flos": 24950388251520.0, + "grad_norm": 1.6918459713003902, + "language_loss": 0.76150334, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.78300774, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12438965, + "step": 8804, + "time_per_iteration": 2.678872585296631 + }, + { + "auxiliary_loss_clip": 0.01041727, + "auxiliary_loss_mlp": 0.0100053, + "balance_loss_clip": 1.01664782, + "balance_loss_mlp": 0.99910063, + "epoch": 0.52938523974147, + "flos": 81197416461600.0, + "grad_norm": 0.7567229036012183, + "language_loss": 0.52982223, + "learning_rate": 1.906757737841291e-06, + "loss": 0.55024481, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.25097656, + "router_z_loss_mlp": 0.01428986, + "step": 8805, + "time_per_iteration": 3.34487247467041 + }, + { + "auxiliary_loss_clip": 0.01041558, + "auxiliary_loss_mlp": 0.01000733, + "balance_loss_clip": 1.01644611, + "balance_loss_mlp": 0.99925971, + "epoch": 0.529445362994138, + "flos": 81938693694240.0, + "grad_norm": 0.7385894086238057, + "language_loss": 0.63749647, + "learning_rate": 1.906368701413693e-06, + "loss": 0.65791935, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.2512207, + "router_z_loss_mlp": 0.01470184, + "step": 8806, + "time_per_iteration": 3.251426935195923 + }, + { + "auxiliary_loss_clip": 0.01125266, + "auxiliary_loss_mlp": 0.01033019, + "balance_loss_clip": 1.04362071, + "balance_loss_mlp": 1.02078247, + "epoch": 0.5295054862468059, + "flos": 21657492110880.0, + "grad_norm": 1.5915535112462877, + "language_loss": 0.72035569, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.74193847, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.12243652, + "step": 8807, + "time_per_iteration": 2.653526544570923 + }, + { + "auxiliary_loss_clip": 0.01120047, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.04444981, + "balance_loss_mlp": 1.01774669, + "epoch": 0.529565609499474, + "flos": 13906709519040.0, + "grad_norm": 2.10446831776073, + "language_loss": 0.68671691, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.70820808, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11322021, + "step": 8808, + "time_per_iteration": 2.5698485374450684 + }, + { + "auxiliary_loss_clip": 0.01119895, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.04369938, + "balance_loss_mlp": 1.02184653, + "epoch": 0.5296257327521419, + "flos": 20983258275840.0, + "grad_norm": 2.0155916420491566, + "language_loss": 0.86761266, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.88914645, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11639404, + "step": 8809, + "time_per_iteration": 2.6134696006774902 + }, + { + "auxiliary_loss_clip": 0.01124322, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.04312861, + "balance_loss_mlp": 1.02333069, + "epoch": 0.5296858560048099, + "flos": 48766127284800.0, + "grad_norm": 2.0605048368356877, + "language_loss": 0.6416117, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.66322124, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13305664, + "step": 8810, + "time_per_iteration": 2.7648847103118896 + }, + { + "auxiliary_loss_clip": 0.01120572, + "auxiliary_loss_mlp": 0.01035774, + "balance_loss_clip": 1.04457867, + "balance_loss_mlp": 1.02388883, + "epoch": 0.5297459792574778, + "flos": 25577020632960.0, + "grad_norm": 1.908172546305645, + "language_loss": 0.67836565, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.69992912, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11889648, + "step": 8811, + "time_per_iteration": 2.695089817047119 + }, + { + "auxiliary_loss_clip": 0.01040494, + "auxiliary_loss_mlp": 0.01000606, + "balance_loss_clip": 1.01576495, + "balance_loss_mlp": 0.99919891, + "epoch": 0.5298061025101458, + "flos": 81172944024480.0, + "grad_norm": 0.6614125589645946, + "language_loss": 0.5330292, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.55344021, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.24755859, + "router_z_loss_mlp": 0.01407623, + "step": 8812, + "time_per_iteration": 3.359755754470825 + }, + { + "auxiliary_loss_clip": 0.01040952, + "auxiliary_loss_mlp": 0.01002142, + "balance_loss_clip": 1.01609182, + "balance_loss_mlp": 1.00068772, + "epoch": 0.5298662257628137, + "flos": 82562530383360.0, + "grad_norm": 0.7394465825659444, + "language_loss": 0.56343973, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58387065, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.24865723, + "router_z_loss_mlp": 0.014534, + "step": 8813, + "time_per_iteration": 3.268442392349243 + }, + { + "auxiliary_loss_clip": 0.0111542, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.04330444, + "balance_loss_mlp": 1.01993763, + "epoch": 0.5299263490154817, + "flos": 23972481338400.0, + "grad_norm": 1.677361069325346, + "language_loss": 0.81722271, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.83869755, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12127686, + "step": 8814, + "time_per_iteration": 2.6520729064941406 + }, + { + "auxiliary_loss_clip": 0.01127107, + "auxiliary_loss_mlp": 0.0102743, + "balance_loss_clip": 1.04707742, + "balance_loss_mlp": 1.01555049, + "epoch": 0.5299864722681497, + "flos": 26911590876000.0, + "grad_norm": 1.5399615813946017, + "language_loss": 0.84876215, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.87030756, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.11877441, + "step": 8815, + "time_per_iteration": 4.127521514892578 + }, + { + "auxiliary_loss_clip": 0.01118579, + "auxiliary_loss_mlp": 0.01031803, + "balance_loss_clip": 1.04481888, + "balance_loss_mlp": 1.0208534, + "epoch": 0.5300465955208177, + "flos": 26556062029920.0, + "grad_norm": 2.0540165549597957, + "language_loss": 0.67089516, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.69239902, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.10961914, + "step": 8816, + "time_per_iteration": 3.9918272495269775 + }, + { + "auxiliary_loss_clip": 0.01120563, + "auxiliary_loss_mlp": 0.01033089, + "balance_loss_clip": 1.04389977, + "balance_loss_mlp": 1.02115023, + "epoch": 0.5301067187734857, + "flos": 52466124418560.0, + "grad_norm": 1.593017783436717, + "language_loss": 0.72155195, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.74308848, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11938477, + "step": 8817, + "time_per_iteration": 2.9225258827209473 + }, + { + "auxiliary_loss_clip": 0.01120345, + "auxiliary_loss_mlp": 0.01028862, + "balance_loss_clip": 1.04217911, + "balance_loss_mlp": 1.01672101, + "epoch": 0.5301668420261536, + "flos": 25080753807360.0, + "grad_norm": 1.6788754027339905, + "language_loss": 0.65414131, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67563343, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12145996, + "step": 8818, + "time_per_iteration": 2.7083702087402344 + }, + { + "auxiliary_loss_clip": 0.01122295, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.04407477, + "balance_loss_mlp": 1.01654696, + "epoch": 0.5302269652788216, + "flos": 21337652638080.0, + "grad_norm": 2.0949372952237697, + "language_loss": 0.75247335, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.77399063, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12878418, + "step": 8819, + "time_per_iteration": 2.616964340209961 + }, + { + "auxiliary_loss_clip": 0.01125744, + "auxiliary_loss_mlp": 0.01035549, + "balance_loss_clip": 1.04546499, + "balance_loss_mlp": 1.02329993, + "epoch": 0.5302870885314895, + "flos": 17783052971040.0, + "grad_norm": 2.367235852607995, + "language_loss": 0.8207773, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.84239024, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12243652, + "step": 8820, + "time_per_iteration": 2.6456589698791504 + }, + { + "auxiliary_loss_clip": 0.0112029, + "auxiliary_loss_mlp": 0.01034641, + "balance_loss_clip": 1.04290867, + "balance_loss_mlp": 1.02337003, + "epoch": 0.5303472117841576, + "flos": 28599583066560.0, + "grad_norm": 1.8471166868681284, + "language_loss": 0.72703791, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.74858725, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.1126709, + "step": 8821, + "time_per_iteration": 4.1007585525512695 + }, + { + "auxiliary_loss_clip": 0.01119069, + "auxiliary_loss_mlp": 0.01027435, + "balance_loss_clip": 1.04392374, + "balance_loss_mlp": 1.01590204, + "epoch": 0.5304073350368255, + "flos": 27709910088480.0, + "grad_norm": 1.5337793394894674, + "language_loss": 0.74085206, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.76231712, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11547852, + "step": 8822, + "time_per_iteration": 2.735236883163452 + }, + { + "auxiliary_loss_clip": 0.01120917, + "auxiliary_loss_mlp": 0.01037105, + "balance_loss_clip": 1.0435183, + "balance_loss_mlp": 1.02393866, + "epoch": 0.5304674582894935, + "flos": 34079007191040.0, + "grad_norm": 6.79828071657848, + "language_loss": 0.67901939, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.70059961, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.13165283, + "step": 8823, + "time_per_iteration": 2.730215311050415 + }, + { + "auxiliary_loss_clip": 0.0112414, + "auxiliary_loss_mlp": 0.01035175, + "balance_loss_clip": 1.04409945, + "balance_loss_mlp": 1.02231181, + "epoch": 0.5305275815421614, + "flos": 25930199476800.0, + "grad_norm": 2.139496231621309, + "language_loss": 0.69696385, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.718557, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12872314, + "step": 8824, + "time_per_iteration": 2.6779730319976807 + }, + { + "auxiliary_loss_clip": 0.01118224, + "auxiliary_loss_mlp": 0.01029861, + "balance_loss_clip": 1.04395926, + "balance_loss_mlp": 1.01812434, + "epoch": 0.5305877047948294, + "flos": 21479079411360.0, + "grad_norm": 2.020809739950326, + "language_loss": 0.76388907, + "learning_rate": 1.898977700702689e-06, + "loss": 0.78536987, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1171875, + "step": 8825, + "time_per_iteration": 2.698199510574341 + }, + { + "auxiliary_loss_clip": 0.01120084, + "auxiliary_loss_mlp": 0.01036275, + "balance_loss_clip": 1.04381621, + "balance_loss_mlp": 1.02400255, + "epoch": 0.5306478280474973, + "flos": 18541104357600.0, + "grad_norm": 2.3665466272103095, + "language_loss": 0.85883832, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.88040185, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12280273, + "step": 8826, + "time_per_iteration": 2.628894567489624 + }, + { + "auxiliary_loss_clip": 0.01118254, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.04335904, + "balance_loss_mlp": 1.01937902, + "epoch": 0.5307079513001653, + "flos": 18718949815200.0, + "grad_norm": 1.9763313862670266, + "language_loss": 0.64390808, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.66540802, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12365723, + "step": 8827, + "time_per_iteration": 2.6833078861236572 + }, + { + "auxiliary_loss_clip": 0.01124105, + "auxiliary_loss_mlp": 0.0103807, + "balance_loss_clip": 1.04544783, + "balance_loss_mlp": 1.025195, + "epoch": 0.5307680745528333, + "flos": 53134726351680.0, + "grad_norm": 1.6639118209308967, + "language_loss": 0.59962511, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.62124687, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12866211, + "step": 8828, + "time_per_iteration": 4.3663084506988525 + }, + { + "auxiliary_loss_clip": 0.01123703, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.04510438, + "balance_loss_mlp": 1.01803994, + "epoch": 0.5308281978055013, + "flos": 24463561952160.0, + "grad_norm": 2.0013455910781164, + "language_loss": 0.81259656, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.8341403, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12634277, + "step": 8829, + "time_per_iteration": 2.646245002746582 + }, + { + "auxiliary_loss_clip": 0.01120863, + "auxiliary_loss_mlp": 0.01031531, + "balance_loss_clip": 1.04566288, + "balance_loss_mlp": 1.01946688, + "epoch": 0.5308883210581693, + "flos": 25263015648480.0, + "grad_norm": 1.4272475157067999, + "language_loss": 0.78207743, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.80360132, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.1206665, + "step": 8830, + "time_per_iteration": 2.666400671005249 + }, + { + "auxiliary_loss_clip": 0.01120099, + "auxiliary_loss_mlp": 0.01031807, + "balance_loss_clip": 1.04368484, + "balance_loss_mlp": 1.02002358, + "epoch": 0.5309484443108372, + "flos": 17516163232800.0, + "grad_norm": 2.459288861242002, + "language_loss": 0.8055371, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.82705617, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11791992, + "step": 8831, + "time_per_iteration": 2.596729040145874 + }, + { + "auxiliary_loss_clip": 0.01119936, + "auxiliary_loss_mlp": 0.01029647, + "balance_loss_clip": 1.04418468, + "balance_loss_mlp": 1.01752329, + "epoch": 0.5310085675635052, + "flos": 24416851878720.0, + "grad_norm": 2.0935175308073184, + "language_loss": 0.73321402, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75470984, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12127686, + "step": 8832, + "time_per_iteration": 2.6945016384124756 + }, + { + "auxiliary_loss_clip": 0.01123922, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.04489803, + "balance_loss_mlp": 1.0224396, + "epoch": 0.5310686908161731, + "flos": 26999865328320.0, + "grad_norm": 2.3247789944321484, + "language_loss": 0.75890541, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.78049147, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12249756, + "step": 8833, + "time_per_iteration": 2.662853240966797 + }, + { + "auxiliary_loss_clip": 0.0112092, + "auxiliary_loss_mlp": 0.01031257, + "balance_loss_clip": 1.0425384, + "balance_loss_mlp": 1.01881742, + "epoch": 0.5311288140688412, + "flos": 30161099360160.0, + "grad_norm": 1.899769148175115, + "language_loss": 0.73921865, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.76074046, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12451172, + "step": 8834, + "time_per_iteration": 2.8283722400665283 + }, + { + "auxiliary_loss_clip": 0.01125818, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.04456115, + "balance_loss_mlp": 1.02134871, + "epoch": 0.5311889373215091, + "flos": 29404304009280.0, + "grad_norm": 2.1895618740217087, + "language_loss": 0.77552509, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.79712975, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13305664, + "step": 8835, + "time_per_iteration": 2.680934190750122 + }, + { + "auxiliary_loss_clip": 0.01120941, + "auxiliary_loss_mlp": 0.01035736, + "balance_loss_clip": 1.04318798, + "balance_loss_mlp": 1.02266502, + "epoch": 0.5312490605741771, + "flos": 26865367009920.0, + "grad_norm": 1.6490688878432445, + "language_loss": 0.72221112, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.74377793, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.1307373, + "step": 8836, + "time_per_iteration": 2.6897079944610596 + }, + { + "auxiliary_loss_clip": 0.01121751, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.04254663, + "balance_loss_mlp": 1.02425241, + "epoch": 0.531309183826845, + "flos": 23660299631520.0, + "grad_norm": 1.7866791663655488, + "language_loss": 0.8072871, + "learning_rate": 1.894310406375987e-06, + "loss": 0.82887608, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12896729, + "step": 8837, + "time_per_iteration": 2.6214752197265625 + }, + { + "auxiliary_loss_clip": 0.01120737, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.04521, + "balance_loss_mlp": 1.01874983, + "epoch": 0.531369307079513, + "flos": 24636221197920.0, + "grad_norm": 1.8240818614578218, + "language_loss": 0.85929847, + "learning_rate": 1.893921490881035e-06, + "loss": 0.88081861, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12512207, + "step": 8838, + "time_per_iteration": 2.6715011596679688 + }, + { + "auxiliary_loss_clip": 0.01117561, + "auxiliary_loss_mlp": 0.01032393, + "balance_loss_clip": 1.04215097, + "balance_loss_mlp": 1.02085376, + "epoch": 0.5314294303321809, + "flos": 23037435357120.0, + "grad_norm": 9.87236853791632, + "language_loss": 0.73223305, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.75373256, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11529541, + "step": 8839, + "time_per_iteration": 2.610805034637451 + }, + { + "auxiliary_loss_clip": 0.01119231, + "auxiliary_loss_mlp": 0.01036779, + "balance_loss_clip": 1.04099941, + "balance_loss_mlp": 1.0246737, + "epoch": 0.531489553584849, + "flos": 28113607630080.0, + "grad_norm": 1.5971156514308902, + "language_loss": 0.77308303, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.79464316, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12109375, + "step": 8840, + "time_per_iteration": 2.675407886505127 + }, + { + "auxiliary_loss_clip": 0.01121987, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.04234862, + "balance_loss_mlp": 1.01977563, + "epoch": 0.5315496768375169, + "flos": 24148463001120.0, + "grad_norm": 3.055851358354609, + "language_loss": 0.77194208, + "learning_rate": 1.892754768590216e-06, + "loss": 0.79348701, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.12731934, + "step": 8841, + "time_per_iteration": 2.640491247177124 + }, + { + "auxiliary_loss_clip": 0.01042618, + "auxiliary_loss_mlp": 0.0102074, + "balance_loss_clip": 1.01805544, + "balance_loss_mlp": 1.01953208, + "epoch": 0.5316098000901849, + "flos": 86662619020800.0, + "grad_norm": 0.6942653017472734, + "language_loss": 0.56803465, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.58866823, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.24572754, + "router_z_loss_mlp": 0.0120697, + "step": 8842, + "time_per_iteration": 3.376030683517456 + }, + { + "auxiliary_loss_clip": 0.01122496, + "auxiliary_loss_mlp": 0.01035074, + "balance_loss_clip": 1.04298818, + "balance_loss_mlp": 1.02210426, + "epoch": 0.5316699233428529, + "flos": 20054370921120.0, + "grad_norm": 1.8742300773705647, + "language_loss": 0.73798728, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.75956297, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12963867, + "step": 8843, + "time_per_iteration": 2.640697717666626 + }, + { + "auxiliary_loss_clip": 0.01042596, + "auxiliary_loss_mlp": 0.01013537, + "balance_loss_clip": 1.01796126, + "balance_loss_mlp": 1.01236534, + "epoch": 0.5317300465955208, + "flos": 82265785760160.0, + "grad_norm": 0.8648471667864456, + "language_loss": 0.60879695, + "learning_rate": 1.891588082900145e-06, + "loss": 0.62935823, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.24645996, + "router_z_loss_mlp": 0.01171112, + "step": 8844, + "time_per_iteration": 3.2920761108398438 + }, + { + "auxiliary_loss_clip": 0.01042776, + "auxiliary_loss_mlp": 0.01005944, + "balance_loss_clip": 1.01807094, + "balance_loss_mlp": 1.00473523, + "epoch": 0.5317901698481888, + "flos": 72612983589120.0, + "grad_norm": 0.8389456991508308, + "language_loss": 0.62252116, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64300835, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.24719238, + "router_z_loss_mlp": 0.01208496, + "step": 8845, + "time_per_iteration": 3.2952044010162354 + }, + { + "auxiliary_loss_clip": 0.01119993, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.04256785, + "balance_loss_mlp": 1.01951885, + "epoch": 0.5318502931008567, + "flos": 23341959298080.0, + "grad_norm": 2.8651130287911775, + "language_loss": 0.75072742, + "learning_rate": 1.890810312970474e-06, + "loss": 0.7722584, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13592529, + "step": 8846, + "time_per_iteration": 2.8450324535369873 + }, + { + "auxiliary_loss_clip": 0.01120241, + "auxiliary_loss_mlp": 0.01031164, + "balance_loss_clip": 1.04270267, + "balance_loss_mlp": 1.02012527, + "epoch": 0.5319104163535248, + "flos": 30116172047040.0, + "grad_norm": 5.355413139221963, + "language_loss": 0.75262898, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.77414304, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.11035156, + "step": 8847, + "time_per_iteration": 2.709362745285034 + }, + { + "auxiliary_loss_clip": 0.01117668, + "auxiliary_loss_mlp": 0.01029055, + "balance_loss_clip": 1.04114354, + "balance_loss_mlp": 1.01740229, + "epoch": 0.5319705396061927, + "flos": 23654627212320.0, + "grad_norm": 2.549833605464298, + "language_loss": 0.87691164, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.89837885, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11657715, + "step": 8848, + "time_per_iteration": 2.6652865409851074 + }, + { + "auxiliary_loss_clip": 0.01122691, + "auxiliary_loss_mlp": 0.01037607, + "balance_loss_clip": 1.04420638, + "balance_loss_mlp": 1.0240531, + "epoch": 0.5320306628588607, + "flos": 22279910695200.0, + "grad_norm": 2.511700807413778, + "language_loss": 0.7442472, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.76585013, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13537598, + "step": 8849, + "time_per_iteration": 2.6493563652038574 + }, + { + "auxiliary_loss_clip": 0.01121757, + "auxiliary_loss_mlp": 0.0102861, + "balance_loss_clip": 1.04215598, + "balance_loss_mlp": 1.01601005, + "epoch": 0.5320907861115286, + "flos": 28958474846880.0, + "grad_norm": 3.233445794211373, + "language_loss": 0.79693252, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.81843615, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.1260376, + "step": 8850, + "time_per_iteration": 2.6960997581481934 + }, + { + "auxiliary_loss_clip": 0.01117947, + "auxiliary_loss_mlp": 0.0102769, + "balance_loss_clip": 1.04133677, + "balance_loss_mlp": 1.01541138, + "epoch": 0.5321509093641966, + "flos": 42092263137600.0, + "grad_norm": 1.7765817036947686, + "language_loss": 0.55041766, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57187402, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.1227417, + "step": 8851, + "time_per_iteration": 2.7564189434051514 + }, + { + "auxiliary_loss_clip": 0.01122176, + "auxiliary_loss_mlp": 0.0103078, + "balance_loss_clip": 1.04366612, + "balance_loss_mlp": 1.01903248, + "epoch": 0.5322110326168645, + "flos": 24417338086080.0, + "grad_norm": 1.6690326767716486, + "language_loss": 0.68790078, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.70943034, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.11743164, + "step": 8852, + "time_per_iteration": 2.666646718978882 + }, + { + "auxiliary_loss_clip": 0.01043807, + "auxiliary_loss_mlp": 0.01000167, + "balance_loss_clip": 1.01885533, + "balance_loss_mlp": 0.99886537, + "epoch": 0.5322711558695326, + "flos": 78863181526080.0, + "grad_norm": 0.8039880022612078, + "language_loss": 0.62804997, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.64848971, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.24951172, + "router_z_loss_mlp": 0.01302338, + "step": 8853, + "time_per_iteration": 3.246706962585449 + }, + { + "auxiliary_loss_clip": 0.01121025, + "auxiliary_loss_mlp": 0.01026124, + "balance_loss_clip": 1.04227209, + "balance_loss_mlp": 1.01369011, + "epoch": 0.5323312791222005, + "flos": 18228598512480.0, + "grad_norm": 2.37472601991335, + "language_loss": 0.80001485, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.82148635, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12426758, + "step": 8854, + "time_per_iteration": 4.025306940078735 + }, + { + "auxiliary_loss_clip": 0.01115877, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.04189467, + "balance_loss_mlp": 1.01770723, + "epoch": 0.5323914023748685, + "flos": 28602743414400.0, + "grad_norm": 2.0789169102570257, + "language_loss": 0.73535472, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.75679904, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.10852051, + "step": 8855, + "time_per_iteration": 4.023873329162598 + }, + { + "auxiliary_loss_clip": 0.0111779, + "auxiliary_loss_mlp": 0.0102944, + "balance_loss_clip": 1.04173243, + "balance_loss_mlp": 1.01824057, + "epoch": 0.5324515256275365, + "flos": 32074903117440.0, + "grad_norm": 2.0602780180372755, + "language_loss": 0.65045309, + "learning_rate": 1.886921714110507e-06, + "loss": 0.67192543, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11199951, + "step": 8856, + "time_per_iteration": 2.7053873538970947 + }, + { + "auxiliary_loss_clip": 0.01125068, + "auxiliary_loss_mlp": 0.01034323, + "balance_loss_clip": 1.04563046, + "balance_loss_mlp": 1.02160382, + "epoch": 0.5325116488802044, + "flos": 32788351329120.0, + "grad_norm": 11.697597042406906, + "language_loss": 0.7771278, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.79872173, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.1272583, + "step": 8857, + "time_per_iteration": 2.6934940814971924 + }, + { + "auxiliary_loss_clip": 0.01118866, + "auxiliary_loss_mlp": 0.01030912, + "balance_loss_clip": 1.04184365, + "balance_loss_mlp": 1.01818025, + "epoch": 0.5325717721328724, + "flos": 31585038022080.0, + "grad_norm": 1.8310686843369528, + "language_loss": 0.71467984, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.73617762, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12738037, + "step": 8858, + "time_per_iteration": 2.7356116771698 + }, + { + "auxiliary_loss_clip": 0.01122831, + "auxiliary_loss_mlp": 0.01036458, + "balance_loss_clip": 1.04442596, + "balance_loss_mlp": 1.02305937, + "epoch": 0.5326318953855403, + "flos": 26599611755520.0, + "grad_norm": 1.8128936711753918, + "language_loss": 0.69682229, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.71841514, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.13409424, + "step": 8859, + "time_per_iteration": 2.6665403842926025 + }, + { + "auxiliary_loss_clip": 0.01115746, + "auxiliary_loss_mlp": 0.01024242, + "balance_loss_clip": 1.04269814, + "balance_loss_mlp": 1.01328087, + "epoch": 0.5326920186382084, + "flos": 25530229524960.0, + "grad_norm": 1.5161421733460547, + "language_loss": 0.69442147, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.71582139, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.10961914, + "step": 8860, + "time_per_iteration": 2.6450445652008057 + }, + { + "auxiliary_loss_clip": 0.0111813, + "auxiliary_loss_mlp": 0.01031997, + "balance_loss_clip": 1.04247785, + "balance_loss_mlp": 1.02026045, + "epoch": 0.5327521418908763, + "flos": 26153215351200.0, + "grad_norm": 1.9914397158577373, + "language_loss": 0.77930903, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80081034, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11737061, + "step": 8861, + "time_per_iteration": 4.148904323577881 + }, + { + "auxiliary_loss_clip": 0.01119239, + "auxiliary_loss_mlp": 0.01035352, + "balance_loss_clip": 1.04201818, + "balance_loss_mlp": 1.02232242, + "epoch": 0.5328122651435443, + "flos": 26552172371040.0, + "grad_norm": 2.0547152517261407, + "language_loss": 0.86130363, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.88284951, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.13018799, + "step": 8862, + "time_per_iteration": 2.682239294052124 + }, + { + "auxiliary_loss_clip": 0.01120359, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.04174078, + "balance_loss_mlp": 1.02167296, + "epoch": 0.5328723883962122, + "flos": 22325202663840.0, + "grad_norm": 1.9705970645891604, + "language_loss": 0.61577725, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.63733196, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.13452148, + "step": 8863, + "time_per_iteration": 2.662252902984619 + }, + { + "auxiliary_loss_clip": 0.01119494, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.04508984, + "balance_loss_mlp": 1.01993668, + "epoch": 0.5329325116488802, + "flos": 30962822024160.0, + "grad_norm": 5.236866144537255, + "language_loss": 0.7316063, + "learning_rate": 1.883811143046377e-06, + "loss": 0.7531122, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11157227, + "step": 8864, + "time_per_iteration": 2.787052631378174 + }, + { + "auxiliary_loss_clip": 0.01117382, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.04149318, + "balance_loss_mlp": 1.02139831, + "epoch": 0.5329926349015481, + "flos": 31228010036640.0, + "grad_norm": 1.7007968435561345, + "language_loss": 0.64131105, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66281563, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11676025, + "step": 8865, + "time_per_iteration": 2.7239699363708496 + }, + { + "auxiliary_loss_clip": 0.01119428, + "auxiliary_loss_mlp": 0.01030391, + "balance_loss_clip": 1.04169631, + "balance_loss_mlp": 1.01807642, + "epoch": 0.5330527581542162, + "flos": 27930454408800.0, + "grad_norm": 1.8477813503421696, + "language_loss": 0.78627682, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.80777502, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12316895, + "step": 8866, + "time_per_iteration": 2.6231637001037598 + }, + { + "auxiliary_loss_clip": 0.01117947, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.04208016, + "balance_loss_mlp": 1.01872492, + "epoch": 0.5331128814068841, + "flos": 19556199783360.0, + "grad_norm": 1.9178071127596426, + "language_loss": 0.73364371, + "learning_rate": 1.882644751189108e-06, + "loss": 0.75512773, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.1171875, + "step": 8867, + "time_per_iteration": 4.078510284423828 + }, + { + "auxiliary_loss_clip": 0.01120561, + "auxiliary_loss_mlp": 0.01029676, + "balance_loss_clip": 1.04269886, + "balance_loss_mlp": 1.01739728, + "epoch": 0.5331730046595521, + "flos": 48010223314080.0, + "grad_norm": 1.8117535926008108, + "language_loss": 0.72045553, + "learning_rate": 1.88225596278394e-06, + "loss": 0.7419579, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.1227417, + "step": 8868, + "time_per_iteration": 2.8178441524505615 + }, + { + "auxiliary_loss_clip": 0.01116288, + "auxiliary_loss_mlp": 0.01030564, + "balance_loss_clip": 1.04003119, + "balance_loss_mlp": 1.01881599, + "epoch": 0.5332331279122201, + "flos": 29314854555840.0, + "grad_norm": 2.082130521783741, + "language_loss": 0.78548789, + "learning_rate": 1.881867178843637e-06, + "loss": 0.80695647, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.11743164, + "step": 8869, + "time_per_iteration": 2.6613211631774902 + }, + { + "auxiliary_loss_clip": 0.01123793, + "auxiliary_loss_mlp": 0.01031188, + "balance_loss_clip": 1.04322529, + "balance_loss_mlp": 1.01880836, + "epoch": 0.533293251164888, + "flos": 20900899346400.0, + "grad_norm": 5.083222668815957, + "language_loss": 0.76139355, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.78294337, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12390137, + "step": 8870, + "time_per_iteration": 2.677762985229492 + }, + { + "auxiliary_loss_clip": 0.01124775, + "auxiliary_loss_mlp": 0.01040658, + "balance_loss_clip": 1.04468441, + "balance_loss_mlp": 1.02734852, + "epoch": 0.533353374417556, + "flos": 26998811879040.0, + "grad_norm": 3.161343051232687, + "language_loss": 0.75640965, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.77806395, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13317871, + "step": 8871, + "time_per_iteration": 2.6627886295318604 + }, + { + "auxiliary_loss_clip": 0.01120145, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.0425396, + "balance_loss_mlp": 1.02014971, + "epoch": 0.533413497670224, + "flos": 18316265205600.0, + "grad_norm": 1.9520496070275524, + "language_loss": 0.72266364, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.74418807, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12133789, + "step": 8872, + "time_per_iteration": 2.6549394130706787 + }, + { + "auxiliary_loss_clip": 0.01120215, + "auxiliary_loss_mlp": 0.01038163, + "balance_loss_clip": 1.04433465, + "balance_loss_mlp": 1.02597952, + "epoch": 0.533473620922892, + "flos": 23927999715360.0, + "grad_norm": 1.7815847166730465, + "language_loss": 0.64669681, + "learning_rate": 1.880312088025936e-06, + "loss": 0.6682806, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12188721, + "step": 8873, + "time_per_iteration": 2.618518352508545 + }, + { + "auxiliary_loss_clip": 0.01118043, + "auxiliary_loss_mlp": 0.01037558, + "balance_loss_clip": 1.04157865, + "balance_loss_mlp": 1.02581048, + "epoch": 0.5335337441755599, + "flos": 17427240504000.0, + "grad_norm": 2.41715828371678, + "language_loss": 0.79526806, + "learning_rate": 1.879923326631099e-06, + "loss": 0.81682408, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11749268, + "step": 8874, + "time_per_iteration": 2.6037471294403076 + }, + { + "auxiliary_loss_clip": 0.01118959, + "auxiliary_loss_mlp": 0.01028249, + "balance_loss_clip": 1.04231489, + "balance_loss_mlp": 1.01635194, + "epoch": 0.5335938674282279, + "flos": 25398202760640.0, + "grad_norm": 1.7083210909240771, + "language_loss": 0.69888771, + "learning_rate": 1.879534569789582e-06, + "loss": 0.7203598, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11895752, + "step": 8875, + "time_per_iteration": 2.652940034866333 + }, + { + "auxiliary_loss_clip": 0.01038929, + "auxiliary_loss_mlp": 0.0099967, + "balance_loss_clip": 1.01438928, + "balance_loss_mlp": 0.99842244, + "epoch": 0.5336539906808958, + "flos": 87117280950240.0, + "grad_norm": 0.7280243720584466, + "language_loss": 0.59629583, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61668181, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.2454834, + "router_z_loss_mlp": 0.01246643, + "step": 8876, + "time_per_iteration": 3.417184352874756 + }, + { + "auxiliary_loss_clip": 0.01116713, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.04067993, + "balance_loss_mlp": 1.02184248, + "epoch": 0.5337141139335638, + "flos": 24591050781120.0, + "grad_norm": 2.37756911302028, + "language_loss": 0.74656785, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.76806712, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11376953, + "step": 8877, + "time_per_iteration": 2.670590400695801 + }, + { + "auxiliary_loss_clip": 0.0103786, + "auxiliary_loss_mlp": 0.01001374, + "balance_loss_clip": 1.01313293, + "balance_loss_mlp": 1.00008619, + "epoch": 0.5337742371862317, + "flos": 82643025517920.0, + "grad_norm": 0.7567972080815852, + "language_loss": 0.57201195, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59240425, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.24731445, + "router_z_loss_mlp": 0.01288605, + "step": 8878, + "time_per_iteration": 3.1670336723327637 + }, + { + "auxiliary_loss_clip": 0.01122025, + "auxiliary_loss_mlp": 0.0103465, + "balance_loss_clip": 1.04280102, + "balance_loss_mlp": 1.02178764, + "epoch": 0.5338343604388998, + "flos": 30516304068000.0, + "grad_norm": 1.807486964552381, + "language_loss": 0.72421294, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.74577975, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12854004, + "step": 8879, + "time_per_iteration": 2.885253667831421 + }, + { + "auxiliary_loss_clip": 0.01120709, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_clip": 1.04309988, + "balance_loss_mlp": 1.01742053, + "epoch": 0.5338944836915677, + "flos": 21477013030080.0, + "grad_norm": 6.445326278612051, + "language_loss": 0.83864045, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.86014503, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12322998, + "step": 8880, + "time_per_iteration": 2.593017816543579 + }, + { + "auxiliary_loss_clip": 0.01115135, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.0408268, + "balance_loss_mlp": 1.01883817, + "epoch": 0.5339546069442357, + "flos": 26508055403520.0, + "grad_norm": 1.5025137452923278, + "language_loss": 0.79567516, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.81713247, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11755371, + "step": 8881, + "time_per_iteration": 2.6350297927856445 + }, + { + "auxiliary_loss_clip": 0.01038268, + "auxiliary_loss_mlp": 0.01003203, + "balance_loss_clip": 1.01385558, + "balance_loss_mlp": 1.00202823, + "epoch": 0.5340147301969036, + "flos": 85078777705920.0, + "grad_norm": 0.8007488228278984, + "language_loss": 0.59165454, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61206931, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.01173401, + "step": 8882, + "time_per_iteration": 3.185950994491577 + }, + { + "auxiliary_loss_clip": 0.01036982, + "auxiliary_loss_mlp": 0.01002406, + "balance_loss_clip": 1.01229465, + "balance_loss_mlp": 1.00113416, + "epoch": 0.5340748534495716, + "flos": 77949319731840.0, + "grad_norm": 0.8685654490909489, + "language_loss": 0.63619906, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65659297, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.24694824, + "router_z_loss_mlp": 0.0127182, + "step": 8883, + "time_per_iteration": 3.044339895248413 + }, + { + "auxiliary_loss_clip": 0.01120648, + "auxiliary_loss_mlp": 0.01031203, + "balance_loss_clip": 1.04114032, + "balance_loss_mlp": 1.01823854, + "epoch": 0.5341349767022396, + "flos": 35013485930400.0, + "grad_norm": 2.204581357519641, + "language_loss": 0.81891823, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.8404367, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12969971, + "step": 8884, + "time_per_iteration": 2.7084102630615234 + }, + { + "auxiliary_loss_clip": 0.01115105, + "auxiliary_loss_mlp": 0.01032168, + "balance_loss_clip": 1.04245734, + "balance_loss_mlp": 1.02059877, + "epoch": 0.5341950999549075, + "flos": 19876363394400.0, + "grad_norm": 4.016257457009369, + "language_loss": 0.71934485, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74081761, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11578369, + "step": 8885, + "time_per_iteration": 2.6400387287139893 + }, + { + "auxiliary_loss_clip": 0.01120294, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.03972757, + "balance_loss_mlp": 1.01679111, + "epoch": 0.5342552232075756, + "flos": 17516325301920.0, + "grad_norm": 3.1143259651491877, + "language_loss": 0.78679758, + "learning_rate": 1.87525854926798e-06, + "loss": 0.80829847, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13012695, + "step": 8886, + "time_per_iteration": 2.6601381301879883 + }, + { + "auxiliary_loss_clip": 0.01119279, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.04134989, + "balance_loss_mlp": 1.02015579, + "epoch": 0.5343153464602435, + "flos": 36970555792320.0, + "grad_norm": 2.4024316688959058, + "language_loss": 0.74741316, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.76894712, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.13946533, + "step": 8887, + "time_per_iteration": 2.7266898155212402 + }, + { + "auxiliary_loss_clip": 0.01117479, + "auxiliary_loss_mlp": 0.01027626, + "balance_loss_clip": 1.04170561, + "balance_loss_mlp": 1.01550829, + "epoch": 0.5343754697129115, + "flos": 19030240141920.0, + "grad_norm": 2.247797871165468, + "language_loss": 0.69310755, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.7145586, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12115479, + "step": 8888, + "time_per_iteration": 2.628904342651367 + }, + { + "auxiliary_loss_clip": 0.01126841, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.04367054, + "balance_loss_mlp": 1.02088392, + "epoch": 0.5344355929655794, + "flos": 20632875124320.0, + "grad_norm": 2.2529758625813816, + "language_loss": 0.77308565, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.79468989, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.12695312, + "step": 8889, + "time_per_iteration": 2.6055092811584473 + }, + { + "auxiliary_loss_clip": 0.01120085, + "auxiliary_loss_mlp": 0.01037118, + "balance_loss_clip": 1.04326081, + "balance_loss_mlp": 1.02482176, + "epoch": 0.5344957162182474, + "flos": 20497080252960.0, + "grad_norm": 1.9282964348926008, + "language_loss": 0.69001526, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71158731, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12298584, + "step": 8890, + "time_per_iteration": 2.6546449661254883 + }, + { + "auxiliary_loss_clip": 0.01122867, + "auxiliary_loss_mlp": 0.01044537, + "balance_loss_clip": 1.04182553, + "balance_loss_mlp": 1.03088152, + "epoch": 0.5345558394709153, + "flos": 15503550530400.0, + "grad_norm": 2.381450026240971, + "language_loss": 0.77137232, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.79304641, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13665771, + "step": 8891, + "time_per_iteration": 2.6654744148254395 + }, + { + "auxiliary_loss_clip": 0.01115653, + "auxiliary_loss_mlp": 0.01030623, + "balance_loss_clip": 1.0407896, + "balance_loss_mlp": 1.01943469, + "epoch": 0.5346159627235834, + "flos": 27399875797440.0, + "grad_norm": 1.5481587632871265, + "language_loss": 0.73835182, + "learning_rate": 1.872926414425699e-06, + "loss": 0.75981462, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11193848, + "step": 8892, + "time_per_iteration": 2.694305419921875 + }, + { + "auxiliary_loss_clip": 0.01116743, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.03960121, + "balance_loss_mlp": 1.02212381, + "epoch": 0.5346760859762513, + "flos": 27352193309280.0, + "grad_norm": 4.33531759082341, + "language_loss": 0.87658775, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.89809442, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.11798096, + "step": 8893, + "time_per_iteration": 4.148799419403076 + }, + { + "auxiliary_loss_clip": 0.01115972, + "auxiliary_loss_mlp": 0.01031947, + "balance_loss_clip": 1.04091716, + "balance_loss_mlp": 1.02040195, + "epoch": 0.5347362092289193, + "flos": 27840842886240.0, + "grad_norm": 1.6513831296503017, + "language_loss": 0.72657615, + "learning_rate": 1.872149074536869e-06, + "loss": 0.7480554, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11535645, + "step": 8894, + "time_per_iteration": 2.6492762565612793 + }, + { + "auxiliary_loss_clip": 0.01116791, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.04071188, + "balance_loss_mlp": 1.01925755, + "epoch": 0.5347963324815872, + "flos": 28332855397440.0, + "grad_norm": 1.9358888960743048, + "language_loss": 0.74926597, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.77075046, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12408447, + "step": 8895, + "time_per_iteration": 3.9052181243896484 + }, + { + "auxiliary_loss_clip": 0.01116, + "auxiliary_loss_mlp": 0.01032803, + "balance_loss_clip": 1.03960276, + "balance_loss_mlp": 1.02085268, + "epoch": 0.5348564557342552, + "flos": 27578369531520.0, + "grad_norm": 1.742348927730417, + "language_loss": 0.76793855, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.78942662, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.11962891, + "step": 8896, + "time_per_iteration": 2.6700170040130615 + }, + { + "auxiliary_loss_clip": 0.01115342, + "auxiliary_loss_mlp": 0.01025745, + "balance_loss_clip": 1.04018354, + "balance_loss_mlp": 1.01353788, + "epoch": 0.5349165789869232, + "flos": 21966513469920.0, + "grad_norm": 1.8123312825897138, + "language_loss": 0.78927231, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.81068319, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12213135, + "step": 8897, + "time_per_iteration": 2.662662982940674 + }, + { + "auxiliary_loss_clip": 0.01118563, + "auxiliary_loss_mlp": 0.01032414, + "balance_loss_clip": 1.04150748, + "balance_loss_mlp": 1.02051723, + "epoch": 0.5349767022395912, + "flos": 20938614583680.0, + "grad_norm": 2.108840048363675, + "language_loss": 0.7570346, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.77854437, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11895752, + "step": 8898, + "time_per_iteration": 2.6071529388427734 + }, + { + "auxiliary_loss_clip": 0.01037186, + "auxiliary_loss_mlp": 0.010085, + "balance_loss_clip": 1.01225924, + "balance_loss_mlp": 1.00713468, + "epoch": 0.5350368254922592, + "flos": 86624093437920.0, + "grad_norm": 0.8506026083796244, + "language_loss": 0.57972848, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60018533, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 0.01364899, + "step": 8899, + "time_per_iteration": 3.417412519454956 + }, + { + "auxiliary_loss_clip": 0.01114376, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.04008055, + "balance_loss_mlp": 1.01961708, + "epoch": 0.5350969487449271, + "flos": 33455778261120.0, + "grad_norm": 1.6654704871153025, + "language_loss": 0.6968233, + "learning_rate": 1.869817171696868e-06, + "loss": 0.71828485, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.1217041, + "step": 8900, + "time_per_iteration": 2.691265344619751 + }, + { + "auxiliary_loss_clip": 0.01121018, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.04238749, + "balance_loss_mlp": 1.01818454, + "epoch": 0.5351570719975951, + "flos": 23569634659680.0, + "grad_norm": 1.7529448479470873, + "language_loss": 0.71395671, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.73547137, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12255859, + "step": 8901, + "time_per_iteration": 4.061612606048584 + }, + { + "auxiliary_loss_clip": 0.011187, + "auxiliary_loss_mlp": 0.01031882, + "balance_loss_clip": 1.04097724, + "balance_loss_mlp": 1.02009809, + "epoch": 0.535217195250263, + "flos": 24194808419040.0, + "grad_norm": 2.6639704196427734, + "language_loss": 0.77272707, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.7942329, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11791992, + "step": 8902, + "time_per_iteration": 2.65476655960083 + }, + { + "auxiliary_loss_clip": 0.01113765, + "auxiliary_loss_mlp": 0.01032523, + "balance_loss_clip": 1.0406996, + "balance_loss_mlp": 1.0217526, + "epoch": 0.535277318502931, + "flos": 27000513604800.0, + "grad_norm": 1.5100438498815134, + "language_loss": 0.69941312, + "learning_rate": 1.868651286721281e-06, + "loss": 0.72087598, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.10778809, + "step": 8903, + "time_per_iteration": 2.6924221515655518 + }, + { + "auxiliary_loss_clip": 0.01120503, + "auxiliary_loss_mlp": 0.01034447, + "balance_loss_clip": 1.04105079, + "balance_loss_mlp": 1.02226353, + "epoch": 0.5353374417555989, + "flos": 30561393450240.0, + "grad_norm": 1.6235547668243375, + "language_loss": 0.72259325, + "learning_rate": 1.86826266833795e-06, + "loss": 0.74414277, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12188721, + "step": 8904, + "time_per_iteration": 2.6938722133636475 + }, + { + "auxiliary_loss_clip": 0.01122095, + "auxiliary_loss_mlp": 0.01038913, + "balance_loss_clip": 1.04405391, + "balance_loss_mlp": 1.02637815, + "epoch": 0.535397565008267, + "flos": 23658233250240.0, + "grad_norm": 2.381486614973189, + "language_loss": 0.73468232, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.75629234, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12530518, + "step": 8905, + "time_per_iteration": 2.7734031677246094 + }, + { + "auxiliary_loss_clip": 0.0111214, + "auxiliary_loss_mlp": 0.0103577, + "balance_loss_clip": 1.03893161, + "balance_loss_mlp": 1.02507734, + "epoch": 0.5354576882609349, + "flos": 26199196113600.0, + "grad_norm": 1.5707568088110273, + "language_loss": 0.83483523, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.8563143, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.10693359, + "step": 8906, + "time_per_iteration": 2.614299774169922 + }, + { + "auxiliary_loss_clip": 0.01121639, + "auxiliary_loss_mlp": 0.01034249, + "balance_loss_clip": 1.0423559, + "balance_loss_mlp": 1.02189338, + "epoch": 0.5355178115136029, + "flos": 25352748722880.0, + "grad_norm": 2.083478442563393, + "language_loss": 0.7420032, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.76356208, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12359619, + "step": 8907, + "time_per_iteration": 4.114420413970947 + }, + { + "auxiliary_loss_clip": 0.01119759, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.04275084, + "balance_loss_mlp": 1.02223468, + "epoch": 0.5355779347662708, + "flos": 28692030798720.0, + "grad_norm": 2.1726923892405208, + "language_loss": 0.76525331, + "learning_rate": 1.866708244906912e-06, + "loss": 0.78679591, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12261963, + "step": 8908, + "time_per_iteration": 2.6706125736236572 + }, + { + "auxiliary_loss_clip": 0.0112129, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.04220486, + "balance_loss_mlp": 1.02079678, + "epoch": 0.5356380580189388, + "flos": 24773960898720.0, + "grad_norm": 2.757149498770456, + "language_loss": 0.74156308, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.76311409, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13012695, + "step": 8909, + "time_per_iteration": 2.596095323562622 + }, + { + "auxiliary_loss_clip": 0.01119379, + "auxiliary_loss_mlp": 0.01033183, + "balance_loss_clip": 1.04327655, + "balance_loss_mlp": 1.02156568, + "epoch": 0.5356981812716068, + "flos": 26065508140800.0, + "grad_norm": 1.965409239193301, + "language_loss": 0.84045744, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.86198306, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.1161499, + "step": 8910, + "time_per_iteration": 2.6872189044952393 + }, + { + "auxiliary_loss_clip": 0.01119145, + "auxiliary_loss_mlp": 0.01031024, + "balance_loss_clip": 1.04086435, + "balance_loss_mlp": 1.01875186, + "epoch": 0.5357583045242748, + "flos": 28200666564000.0, + "grad_norm": 1.8942375843693096, + "language_loss": 0.81540102, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.83690268, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12280273, + "step": 8911, + "time_per_iteration": 2.6409897804260254 + }, + { + "auxiliary_loss_clip": 0.01117507, + "auxiliary_loss_mlp": 0.01031662, + "balance_loss_clip": 1.0421474, + "balance_loss_mlp": 1.02034891, + "epoch": 0.5358184277769428, + "flos": 25797281332320.0, + "grad_norm": 1.7858089397082937, + "language_loss": 0.6885637, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.71005541, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11315918, + "step": 8912, + "time_per_iteration": 2.6613385677337646 + }, + { + "auxiliary_loss_clip": 0.01119088, + "auxiliary_loss_mlp": 0.01032064, + "balance_loss_clip": 1.04237223, + "balance_loss_mlp": 1.01986253, + "epoch": 0.5358785510296107, + "flos": 19868138386560.0, + "grad_norm": 3.6717279416909196, + "language_loss": 0.71520722, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.73671877, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12200928, + "step": 8913, + "time_per_iteration": 2.5771031379699707 + }, + { + "auxiliary_loss_clip": 0.01124411, + "auxiliary_loss_mlp": 0.01033613, + "balance_loss_clip": 1.04357827, + "balance_loss_mlp": 1.02094734, + "epoch": 0.5359386742822787, + "flos": 20715477157440.0, + "grad_norm": 1.8173804848564132, + "language_loss": 0.72614837, + "learning_rate": 1.864376761688156e-06, + "loss": 0.74772859, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.12677002, + "step": 8914, + "time_per_iteration": 2.660259246826172 + }, + { + "auxiliary_loss_clip": 0.0112764, + "auxiliary_loss_mlp": 0.01037978, + "balance_loss_clip": 1.04586554, + "balance_loss_mlp": 1.02437592, + "epoch": 0.5359987975349466, + "flos": 25396055344800.0, + "grad_norm": 2.1015461866061043, + "language_loss": 0.70664483, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.72830093, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13586426, + "step": 8915, + "time_per_iteration": 2.7139360904693604 + }, + { + "auxiliary_loss_clip": 0.01119421, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.04244423, + "balance_loss_mlp": 1.02169895, + "epoch": 0.5360589207876146, + "flos": 27088544953440.0, + "grad_norm": 1.7219335094042882, + "language_loss": 0.75262898, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.7741642, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12402344, + "step": 8916, + "time_per_iteration": 2.7702996730804443 + }, + { + "auxiliary_loss_clip": 0.01120514, + "auxiliary_loss_mlp": 0.01033229, + "balance_loss_clip": 1.04114652, + "balance_loss_mlp": 1.02068257, + "epoch": 0.5361190440402825, + "flos": 38308286383200.0, + "grad_norm": 2.1535727433712197, + "language_loss": 0.72308004, + "learning_rate": 1.863211089308289e-06, + "loss": 0.74461752, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12542725, + "step": 8917, + "time_per_iteration": 2.7777042388916016 + }, + { + "auxiliary_loss_clip": 0.01119917, + "auxiliary_loss_mlp": 0.01037341, + "balance_loss_clip": 1.04284167, + "balance_loss_mlp": 1.0242939, + "epoch": 0.5361791672929506, + "flos": 19608217620480.0, + "grad_norm": 4.689213252914888, + "language_loss": 0.71493292, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.73650551, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13043213, + "step": 8918, + "time_per_iteration": 2.6176822185516357 + }, + { + "auxiliary_loss_clip": 0.01120504, + "auxiliary_loss_mlp": 0.01034265, + "balance_loss_clip": 1.04351544, + "balance_loss_mlp": 1.02175426, + "epoch": 0.5362392905456185, + "flos": 25308105030720.0, + "grad_norm": 1.5726099534255835, + "language_loss": 0.75027597, + "learning_rate": 1.862434000299067e-06, + "loss": 0.77182364, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12512207, + "step": 8919, + "time_per_iteration": 2.6947174072265625 + }, + { + "auxiliary_loss_clip": 0.01119423, + "auxiliary_loss_mlp": 0.01034283, + "balance_loss_clip": 1.04039454, + "balance_loss_mlp": 1.02199244, + "epoch": 0.5362994137982865, + "flos": 21158834765760.0, + "grad_norm": 2.2357146109069075, + "language_loss": 0.71359479, + "learning_rate": 1.862045463611864e-06, + "loss": 0.73513192, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12286377, + "step": 8920, + "time_per_iteration": 2.5746731758117676 + }, + { + "auxiliary_loss_clip": 0.0111753, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.03997993, + "balance_loss_mlp": 1.02003121, + "epoch": 0.5363595370509544, + "flos": 52243392165120.0, + "grad_norm": 1.9821779418756074, + "language_loss": 0.68862176, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.71012163, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12438965, + "step": 8921, + "time_per_iteration": 2.8423519134521484 + }, + { + "auxiliary_loss_clip": 0.01122724, + "auxiliary_loss_mlp": 0.01035069, + "balance_loss_clip": 1.04430771, + "balance_loss_mlp": 1.02261162, + "epoch": 0.5364196603036224, + "flos": 23393531445120.0, + "grad_norm": 2.8446409342651044, + "language_loss": 0.82184625, + "learning_rate": 1.86126840594594e-06, + "loss": 0.84342414, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12463379, + "step": 8922, + "time_per_iteration": 2.6160337924957275 + }, + { + "auxiliary_loss_clip": 0.01120687, + "auxiliary_loss_mlp": 0.01025921, + "balance_loss_clip": 1.04137397, + "balance_loss_mlp": 1.01416087, + "epoch": 0.5364797835562904, + "flos": 21879332984160.0, + "grad_norm": 2.1643774740996276, + "language_loss": 0.77248132, + "learning_rate": 1.860879884996686e-06, + "loss": 0.79394746, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.11767578, + "step": 8923, + "time_per_iteration": 2.687845468521118 + }, + { + "auxiliary_loss_clip": 0.01122733, + "auxiliary_loss_mlp": 0.01029441, + "balance_loss_clip": 1.04303205, + "balance_loss_mlp": 1.01702523, + "epoch": 0.5365399068089584, + "flos": 36884955480480.0, + "grad_norm": 1.9745475960497723, + "language_loss": 0.70593208, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.72745383, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12432861, + "step": 8924, + "time_per_iteration": 2.7069365978240967 + }, + { + "auxiliary_loss_clip": 0.01126374, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.04615235, + "balance_loss_mlp": 1.02299953, + "epoch": 0.5366000300616264, + "flos": 30373985914560.0, + "grad_norm": 2.0100770643716768, + "language_loss": 0.8719579, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.89358467, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13317871, + "step": 8925, + "time_per_iteration": 2.75337815284729 + }, + { + "auxiliary_loss_clip": 0.01120034, + "auxiliary_loss_mlp": 0.01029146, + "balance_loss_clip": 1.04057062, + "balance_loss_mlp": 1.01692748, + "epoch": 0.5366601533142943, + "flos": 36395130902400.0, + "grad_norm": 1.9093591366596578, + "language_loss": 0.78117108, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.80266291, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12207031, + "step": 8926, + "time_per_iteration": 2.6988911628723145 + }, + { + "auxiliary_loss_clip": 0.01119363, + "auxiliary_loss_mlp": 0.01030585, + "balance_loss_clip": 1.04446721, + "balance_loss_mlp": 1.01933753, + "epoch": 0.5367202765669623, + "flos": 33189131626560.0, + "grad_norm": 1.7873763801779174, + "language_loss": 0.6682716, + "learning_rate": 1.85932585410148e-06, + "loss": 0.68977106, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11248779, + "step": 8927, + "time_per_iteration": 2.6876156330108643 + }, + { + "auxiliary_loss_clip": 0.01120633, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.04087639, + "balance_loss_mlp": 1.01761317, + "epoch": 0.5367803998196302, + "flos": 24684592479840.0, + "grad_norm": 3.907629218900608, + "language_loss": 0.73802501, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.75953418, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12677002, + "step": 8928, + "time_per_iteration": 2.661364793777466 + }, + { + "auxiliary_loss_clip": 0.01120323, + "auxiliary_loss_mlp": 0.01026589, + "balance_loss_clip": 1.04224706, + "balance_loss_mlp": 1.01483488, + "epoch": 0.5368405230722982, + "flos": 39236565978720.0, + "grad_norm": 4.324668083561059, + "language_loss": 0.63318938, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.6546585, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.11761475, + "step": 8929, + "time_per_iteration": 2.744170904159546 + }, + { + "auxiliary_loss_clip": 0.01121483, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.04244041, + "balance_loss_mlp": 1.01812434, + "epoch": 0.5369006463249661, + "flos": 32028395630400.0, + "grad_norm": 1.7420960536767789, + "language_loss": 0.66278732, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68430841, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.12512207, + "step": 8930, + "time_per_iteration": 2.720311164855957 + }, + { + "auxiliary_loss_clip": 0.01117577, + "auxiliary_loss_mlp": 0.01029485, + "balance_loss_clip": 1.04150534, + "balance_loss_mlp": 1.0173018, + "epoch": 0.5369607695776342, + "flos": 31983711420960.0, + "grad_norm": 1.4027599883250688, + "language_loss": 0.67291319, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.69438374, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12182617, + "step": 8931, + "time_per_iteration": 2.6788504123687744 + }, + { + "auxiliary_loss_clip": 0.01123331, + "auxiliary_loss_mlp": 0.01030202, + "balance_loss_clip": 1.04639554, + "balance_loss_mlp": 1.01730311, + "epoch": 0.5370208928303021, + "flos": 30516425619840.0, + "grad_norm": 7.908190798349318, + "language_loss": 0.75446385, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.77599919, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12908936, + "step": 8932, + "time_per_iteration": 2.7030818462371826 + }, + { + "auxiliary_loss_clip": 0.011215, + "auxiliary_loss_mlp": 0.01030102, + "balance_loss_clip": 1.04504466, + "balance_loss_mlp": 1.01731038, + "epoch": 0.5370810160829701, + "flos": 38794018716000.0, + "grad_norm": 1.864010858724266, + "language_loss": 0.65827793, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.67979395, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12792969, + "step": 8933, + "time_per_iteration": 4.182009220123291 + }, + { + "auxiliary_loss_clip": 0.01117844, + "auxiliary_loss_mlp": 0.01035252, + "balance_loss_clip": 1.04338074, + "balance_loss_mlp": 1.02355742, + "epoch": 0.537141139335638, + "flos": 29092973165280.0, + "grad_norm": 1.5706736487039592, + "language_loss": 0.82980353, + "learning_rate": 1.856606505975565e-06, + "loss": 0.85133445, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11688232, + "step": 8934, + "time_per_iteration": 4.010028600692749 + }, + { + "auxiliary_loss_clip": 0.0111674, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.04199302, + "balance_loss_mlp": 1.01674449, + "epoch": 0.537201262588306, + "flos": 22584920843520.0, + "grad_norm": 4.5388272420252616, + "language_loss": 0.79818606, + "learning_rate": 1.856218049303999e-06, + "loss": 0.81964684, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12597656, + "step": 8935, + "time_per_iteration": 2.647066354751587 + }, + { + "auxiliary_loss_clip": 0.01121018, + "auxiliary_loss_mlp": 0.01038934, + "balance_loss_clip": 1.04290748, + "balance_loss_mlp": 1.02626777, + "epoch": 0.537261385840974, + "flos": 31314663797760.0, + "grad_norm": 1.8447355733346662, + "language_loss": 0.83786237, + "learning_rate": 1.855829598084659e-06, + "loss": 0.85946184, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12683105, + "step": 8936, + "time_per_iteration": 2.70265531539917 + }, + { + "auxiliary_loss_clip": 0.01120839, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.04504561, + "balance_loss_mlp": 1.02114224, + "epoch": 0.537321509093642, + "flos": 49706440512480.0, + "grad_norm": 1.4794098163202565, + "language_loss": 0.7279582, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.74949372, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11584473, + "step": 8937, + "time_per_iteration": 2.848924398422241 + }, + { + "auxiliary_loss_clip": 0.01122786, + "auxiliary_loss_mlp": 0.01028308, + "balance_loss_clip": 1.04186273, + "balance_loss_mlp": 1.01571357, + "epoch": 0.53738163234631, + "flos": 21034141629120.0, + "grad_norm": 2.158893809654096, + "language_loss": 0.81568682, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.83719778, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.12591553, + "step": 8938, + "time_per_iteration": 2.5770087242126465 + }, + { + "auxiliary_loss_clip": 0.01125223, + "auxiliary_loss_mlp": 0.01034965, + "balance_loss_clip": 1.04308391, + "balance_loss_mlp": 1.02311587, + "epoch": 0.5374417555989779, + "flos": 15644936786400.0, + "grad_norm": 2.5114817210272777, + "language_loss": 0.80990028, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.8315022, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 0.82080078, + "router_z_loss_mlp": 0.11859131, + "step": 8939, + "time_per_iteration": 2.6029253005981445 + }, + { + "auxiliary_loss_clip": 0.010407, + "auxiliary_loss_mlp": 0.01000572, + "balance_loss_clip": 1.01581252, + "balance_loss_mlp": 0.99929684, + "epoch": 0.5375018788516459, + "flos": 82066425626880.0, + "grad_norm": 0.7075927247952136, + "language_loss": 0.52462691, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54503965, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.2487793, + "router_z_loss_mlp": 0.01274872, + "step": 8940, + "time_per_iteration": 4.690160512924194 + }, + { + "auxiliary_loss_clip": 0.01119473, + "auxiliary_loss_mlp": 0.01030229, + "balance_loss_clip": 1.04300344, + "balance_loss_mlp": 1.01820636, + "epoch": 0.5375620021043138, + "flos": 22102713514080.0, + "grad_norm": 1.8255210679358949, + "language_loss": 0.71508235, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.73657936, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12011719, + "step": 8941, + "time_per_iteration": 2.6572322845458984 + }, + { + "auxiliary_loss_clip": 0.01115454, + "auxiliary_loss_mlp": 0.01028527, + "balance_loss_clip": 1.04070318, + "balance_loss_mlp": 1.01642132, + "epoch": 0.5376221253569818, + "flos": 28247052499200.0, + "grad_norm": 1.828300767705028, + "language_loss": 0.79456806, + "learning_rate": 1.853499006090237e-06, + "loss": 0.81600791, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12109375, + "step": 8942, + "time_per_iteration": 2.701169967651367 + }, + { + "auxiliary_loss_clip": 0.01123561, + "auxiliary_loss_mlp": 0.01033017, + "balance_loss_clip": 1.04351783, + "balance_loss_mlp": 1.02012467, + "epoch": 0.5376822486096497, + "flos": 36572895325440.0, + "grad_norm": 2.8614251269968523, + "language_loss": 0.70076859, + "learning_rate": 1.853110593448911e-06, + "loss": 0.72233427, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12902832, + "step": 8943, + "time_per_iteration": 2.725435972213745 + }, + { + "auxiliary_loss_clip": 0.01040242, + "auxiliary_loss_mlp": 0.01001837, + "balance_loss_clip": 1.01524711, + "balance_loss_mlp": 1.0005275, + "epoch": 0.5377423718623178, + "flos": 66095537608800.0, + "grad_norm": 0.8722998406631696, + "language_loss": 0.59636617, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61678702, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.24975586, + "router_z_loss_mlp": 0.01309967, + "step": 8944, + "time_per_iteration": 3.223193883895874 + }, + { + "auxiliary_loss_clip": 0.01126664, + "auxiliary_loss_mlp": 0.01034144, + "balance_loss_clip": 1.04436922, + "balance_loss_mlp": 1.02063811, + "epoch": 0.5378024951149857, + "flos": 28378633573440.0, + "grad_norm": 2.085560581850922, + "language_loss": 0.77721131, + "learning_rate": 1.852333784891169e-06, + "loss": 0.79881942, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.13519287, + "step": 8945, + "time_per_iteration": 2.7236557006835938 + }, + { + "auxiliary_loss_clip": 0.01119337, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.04018402, + "balance_loss_mlp": 1.01790559, + "epoch": 0.5378626183676537, + "flos": 29314530417600.0, + "grad_norm": 1.889307488356948, + "language_loss": 0.68568003, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.70717442, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12194824, + "step": 8946, + "time_per_iteration": 4.099342346191406 + }, + { + "auxiliary_loss_clip": 0.01117766, + "auxiliary_loss_mlp": 0.01039909, + "balance_loss_clip": 1.04223943, + "balance_loss_mlp": 1.02767777, + "epoch": 0.5379227416203216, + "flos": 33143393967840.0, + "grad_norm": 1.7704425565510944, + "language_loss": 0.76958764, + "learning_rate": 1.851556998731498e-06, + "loss": 0.79116434, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12231445, + "step": 8947, + "time_per_iteration": 2.715891122817993 + }, + { + "auxiliary_loss_clip": 0.01119101, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.04199374, + "balance_loss_mlp": 1.0200963, + "epoch": 0.5379828648729896, + "flos": 30116820323520.0, + "grad_norm": 1.9109155869827659, + "language_loss": 0.60288358, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62439638, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12097168, + "step": 8948, + "time_per_iteration": 2.7178568840026855 + }, + { + "auxiliary_loss_clip": 0.01124347, + "auxiliary_loss_mlp": 0.01032171, + "balance_loss_clip": 1.04636395, + "balance_loss_mlp": 1.02076244, + "epoch": 0.5380429881256577, + "flos": 27483571797120.0, + "grad_norm": 1.6879965196477826, + "language_loss": 0.79524672, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.81681192, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.11401367, + "step": 8949, + "time_per_iteration": 2.654867649078369 + }, + { + "auxiliary_loss_clip": 0.01118783, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.04352677, + "balance_loss_mlp": 1.02073979, + "epoch": 0.5381031113783256, + "flos": 32920985852640.0, + "grad_norm": 1.8979047222258543, + "language_loss": 0.77886367, + "learning_rate": 1.850391861746111e-06, + "loss": 0.80038404, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12524414, + "step": 8950, + "time_per_iteration": 2.7431838512420654 + }, + { + "auxiliary_loss_clip": 0.01118935, + "auxiliary_loss_mlp": 0.01030422, + "balance_loss_clip": 1.0438385, + "balance_loss_mlp": 1.01920438, + "epoch": 0.5381632346309936, + "flos": 30205297362240.0, + "grad_norm": 1.5660319385672035, + "language_loss": 0.72698385, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.74847746, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11224365, + "step": 8951, + "time_per_iteration": 2.6894216537475586 + }, + { + "auxiliary_loss_clip": 0.01119693, + "auxiliary_loss_mlp": 0.0102459, + "balance_loss_clip": 1.04155731, + "balance_loss_mlp": 1.0120188, + "epoch": 0.5382233578836615, + "flos": 18986001622560.0, + "grad_norm": 1.8355895455246474, + "language_loss": 0.75160545, + "learning_rate": 1.849615132097085e-06, + "loss": 0.77304828, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12579346, + "step": 8952, + "time_per_iteration": 2.6570165157318115 + }, + { + "auxiliary_loss_clip": 0.01120926, + "auxiliary_loss_mlp": 0.01032423, + "balance_loss_clip": 1.04425597, + "balance_loss_mlp": 1.0193224, + "epoch": 0.5382834811363295, + "flos": 30608306110080.0, + "grad_norm": 2.100896140295175, + "language_loss": 0.79525727, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.81679082, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.13110352, + "step": 8953, + "time_per_iteration": 2.6717262268066406 + }, + { + "auxiliary_loss_clip": 0.01115635, + "auxiliary_loss_mlp": 0.01030439, + "balance_loss_clip": 1.04168439, + "balance_loss_mlp": 1.01808894, + "epoch": 0.5383436043889974, + "flos": 16225547888160.0, + "grad_norm": 2.039806196451518, + "language_loss": 0.8061583, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.82761908, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12347412, + "step": 8954, + "time_per_iteration": 2.6530938148498535 + }, + { + "auxiliary_loss_clip": 0.01121412, + "auxiliary_loss_mlp": 0.01030448, + "balance_loss_clip": 1.04381752, + "balance_loss_mlp": 1.01853359, + "epoch": 0.5384037276416654, + "flos": 28112878319040.0, + "grad_norm": 1.9907126347549167, + "language_loss": 0.76316541, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.784684, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.11920166, + "step": 8955, + "time_per_iteration": 2.629992961883545 + }, + { + "auxiliary_loss_clip": 0.01119665, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.04357505, + "balance_loss_mlp": 1.02485085, + "epoch": 0.5384638508943334, + "flos": 25174173954240.0, + "grad_norm": 1.6612617082181063, + "language_loss": 0.78224432, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.80381477, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12536621, + "step": 8956, + "time_per_iteration": 2.66597318649292 + }, + { + "auxiliary_loss_clip": 0.01038308, + "auxiliary_loss_mlp": 0.01003004, + "balance_loss_clip": 1.01322913, + "balance_loss_mlp": 1.00172377, + "epoch": 0.5385239741470014, + "flos": 81434607033600.0, + "grad_norm": 0.8542928331510653, + "language_loss": 0.63376522, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65417838, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01280212, + "step": 8957, + "time_per_iteration": 3.2023847103118896 + }, + { + "auxiliary_loss_clip": 0.01038618, + "auxiliary_loss_mlp": 0.0100187, + "balance_loss_clip": 1.01337087, + "balance_loss_mlp": 1.00057197, + "epoch": 0.5385840973996693, + "flos": 78965677543680.0, + "grad_norm": 0.707380214548226, + "language_loss": 0.51587582, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.53628075, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.25268555, + "router_z_loss_mlp": 0.01298523, + "step": 8958, + "time_per_iteration": 3.248389959335327 + }, + { + "auxiliary_loss_clip": 0.01126358, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.04705071, + "balance_loss_mlp": 1.01654065, + "epoch": 0.5386442206523373, + "flos": 31898556799200.0, + "grad_norm": 3.3810472494674224, + "language_loss": 0.7698555, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.79141706, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13262939, + "step": 8959, + "time_per_iteration": 2.7110276222229004 + }, + { + "auxiliary_loss_clip": 0.01119676, + "auxiliary_loss_mlp": 0.0103155, + "balance_loss_clip": 1.04097247, + "balance_loss_mlp": 1.01918817, + "epoch": 0.5387043439050052, + "flos": 22271483100960.0, + "grad_norm": 2.2073566507505324, + "language_loss": 0.83556688, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.85707915, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12359619, + "step": 8960, + "time_per_iteration": 2.578732490539551 + }, + { + "auxiliary_loss_clip": 0.0112185, + "auxiliary_loss_mlp": 0.01028438, + "balance_loss_clip": 1.04442835, + "balance_loss_mlp": 1.01649284, + "epoch": 0.5387644671576732, + "flos": 36348501863520.0, + "grad_norm": 1.513605484981142, + "language_loss": 0.78799856, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.80950147, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.11938477, + "step": 8961, + "time_per_iteration": 2.729616165161133 + }, + { + "auxiliary_loss_clip": 0.01119508, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.04270196, + "balance_loss_mlp": 1.02058756, + "epoch": 0.5388245904103413, + "flos": 27302241853440.0, + "grad_norm": 2.066964720769899, + "language_loss": 0.83954751, + "learning_rate": 1.845731828364681e-06, + "loss": 0.86106825, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11981201, + "step": 8962, + "time_per_iteration": 2.640390396118164 + }, + { + "auxiliary_loss_clip": 0.01036703, + "auxiliary_loss_mlp": 0.01001347, + "balance_loss_clip": 1.01175046, + "balance_loss_mlp": 1.00001001, + "epoch": 0.5388847136630092, + "flos": 85180503895200.0, + "grad_norm": 0.7331051846759562, + "language_loss": 0.54079735, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56117785, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.24938965, + "router_z_loss_mlp": 0.01337433, + "step": 8963, + "time_per_iteration": 3.1402270793914795 + }, + { + "auxiliary_loss_clip": 0.01037159, + "auxiliary_loss_mlp": 0.01001247, + "balance_loss_clip": 1.01216626, + "balance_loss_mlp": 0.99986959, + "epoch": 0.5389448369156772, + "flos": 85196872876320.0, + "grad_norm": 0.8289359490286187, + "language_loss": 0.63358963, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.6539737, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.24951172, + "router_z_loss_mlp": 0.01378632, + "step": 8964, + "time_per_iteration": 3.321274995803833 + }, + { + "auxiliary_loss_clip": 0.01122747, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.04263449, + "balance_loss_mlp": 1.01848269, + "epoch": 0.5390049601683451, + "flos": 38708661507840.0, + "grad_norm": 1.6104402052028652, + "language_loss": 0.70310992, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.72464144, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.11920166, + "step": 8965, + "time_per_iteration": 2.72814679145813 + }, + { + "auxiliary_loss_clip": 0.01123671, + "auxiliary_loss_mlp": 0.01030831, + "balance_loss_clip": 1.04400039, + "balance_loss_mlp": 1.01823628, + "epoch": 0.5390650834210131, + "flos": 22101943685760.0, + "grad_norm": 2.1736240579755037, + "language_loss": 0.8142066, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.83575153, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.12597656, + "step": 8966, + "time_per_iteration": 2.678880214691162 + }, + { + "auxiliary_loss_clip": 0.01120085, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.04470396, + "balance_loss_mlp": 1.01876533, + "epoch": 0.539125206673681, + "flos": 21252011808960.0, + "grad_norm": 2.153795928185031, + "language_loss": 0.72318476, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.74469852, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12518311, + "step": 8967, + "time_per_iteration": 2.611334800720215 + }, + { + "auxiliary_loss_clip": 0.01116056, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.03957343, + "balance_loss_mlp": 1.01970172, + "epoch": 0.539185329926349, + "flos": 27087410469600.0, + "grad_norm": 1.6139365107852108, + "language_loss": 0.82057452, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.8420465, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11437988, + "step": 8968, + "time_per_iteration": 2.6752705574035645 + }, + { + "auxiliary_loss_clip": 0.01120292, + "auxiliary_loss_mlp": 0.01034051, + "balance_loss_clip": 1.04251587, + "balance_loss_mlp": 1.02149832, + "epoch": 0.539245453179017, + "flos": 26154309317760.0, + "grad_norm": 5.301604742417668, + "language_loss": 0.7370038, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.75854731, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12554932, + "step": 8969, + "time_per_iteration": 2.6321256160736084 + }, + { + "auxiliary_loss_clip": 0.01121719, + "auxiliary_loss_mlp": 0.01034001, + "balance_loss_clip": 1.04110992, + "balance_loss_mlp": 1.02145469, + "epoch": 0.539305576431685, + "flos": 25301419679520.0, + "grad_norm": 1.867021777635601, + "language_loss": 0.8196348, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.84119201, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.12561035, + "step": 8970, + "time_per_iteration": 2.6847167015075684 + }, + { + "auxiliary_loss_clip": 0.01117921, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.04248857, + "balance_loss_mlp": 1.02029252, + "epoch": 0.5393656996843529, + "flos": 37729944249120.0, + "grad_norm": 1.4700267242179013, + "language_loss": 0.75461656, + "learning_rate": 1.842237354749146e-06, + "loss": 0.77611482, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11608887, + "step": 8971, + "time_per_iteration": 2.8027236461639404 + }, + { + "auxiliary_loss_clip": 0.010355, + "auxiliary_loss_mlp": 0.01002798, + "balance_loss_clip": 1.01062489, + "balance_loss_mlp": 1.00140822, + "epoch": 0.5394258229370209, + "flos": 61397901646560.0, + "grad_norm": 0.8882335526600565, + "language_loss": 0.60452604, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62490904, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.24865723, + "router_z_loss_mlp": 0.01390839, + "step": 8972, + "time_per_iteration": 4.691479682922363 + }, + { + "auxiliary_loss_clip": 0.01118968, + "auxiliary_loss_mlp": 0.01038969, + "balance_loss_clip": 1.04179454, + "balance_loss_mlp": 1.02607071, + "epoch": 0.5394859461896888, + "flos": 31007951923680.0, + "grad_norm": 1.5120784246138461, + "language_loss": 0.78307599, + "learning_rate": 1.841460870485045e-06, + "loss": 0.80465537, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12908936, + "step": 8973, + "time_per_iteration": 2.7814998626708984 + }, + { + "auxiliary_loss_clip": 0.01125129, + "auxiliary_loss_mlp": 0.01032522, + "balance_loss_clip": 1.04287243, + "balance_loss_mlp": 1.01912296, + "epoch": 0.5395460694423568, + "flos": 31089743611200.0, + "grad_norm": 2.179494898353724, + "language_loss": 0.73591882, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.75749534, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 0.82275391, + "router_z_loss_mlp": 0.13397217, + "step": 8974, + "time_per_iteration": 4.104269981384277 + }, + { + "auxiliary_loss_clip": 0.01034477, + "auxiliary_loss_mlp": 0.01002312, + "balance_loss_clip": 1.00963116, + "balance_loss_mlp": 1.00092518, + "epoch": 0.5396061926950249, + "flos": 64974218575680.0, + "grad_norm": 0.7288080581769069, + "language_loss": 0.51096946, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53133732, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.24865723, + "router_z_loss_mlp": 0.01387024, + "step": 8975, + "time_per_iteration": 3.2047176361083984 + }, + { + "auxiliary_loss_clip": 0.01118906, + "auxiliary_loss_mlp": 0.01038237, + "balance_loss_clip": 1.04309464, + "balance_loss_mlp": 1.02571368, + "epoch": 0.5396663159476928, + "flos": 32610384319680.0, + "grad_norm": 1.6078830756522233, + "language_loss": 0.72114861, + "learning_rate": 1.840296189214344e-06, + "loss": 0.74272007, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12530518, + "step": 8976, + "time_per_iteration": 2.7216436862945557 + }, + { + "auxiliary_loss_clip": 0.01119667, + "auxiliary_loss_mlp": 0.01032921, + "balance_loss_clip": 1.04171216, + "balance_loss_mlp": 1.02095866, + "epoch": 0.5397264392003608, + "flos": 28374460293600.0, + "grad_norm": 1.8208719205670916, + "language_loss": 0.69704294, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.71856886, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.11968994, + "step": 8977, + "time_per_iteration": 2.674180269241333 + }, + { + "auxiliary_loss_clip": 0.01120535, + "auxiliary_loss_mlp": 0.01034861, + "balance_loss_clip": 1.04286957, + "balance_loss_mlp": 1.02239776, + "epoch": 0.5397865624530287, + "flos": 22322609557920.0, + "grad_norm": 1.9120697519994896, + "language_loss": 0.72663736, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.74819136, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12457275, + "step": 8978, + "time_per_iteration": 2.6716814041137695 + }, + { + "auxiliary_loss_clip": 0.01124849, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.04300046, + "balance_loss_mlp": 1.01883173, + "epoch": 0.5398466857056967, + "flos": 18666283701600.0, + "grad_norm": 2.3990110726611897, + "language_loss": 0.74409401, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76566565, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.13494873, + "step": 8979, + "time_per_iteration": 4.113218784332275 + }, + { + "auxiliary_loss_clip": 0.01126065, + "auxiliary_loss_mlp": 0.01048044, + "balance_loss_clip": 1.04491568, + "balance_loss_mlp": 1.03509164, + "epoch": 0.5399068089583646, + "flos": 21745928632320.0, + "grad_norm": 2.9233162808710804, + "language_loss": 0.77316999, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.79491115, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.1295166, + "step": 8980, + "time_per_iteration": 2.6052494049072266 + }, + { + "auxiliary_loss_clip": 0.01118935, + "auxiliary_loss_mlp": 0.0103337, + "balance_loss_clip": 1.04054677, + "balance_loss_mlp": 1.02128851, + "epoch": 0.5399669322110326, + "flos": 33411580259040.0, + "grad_norm": 2.048812202612029, + "language_loss": 0.82156783, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.84309089, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12084961, + "step": 8981, + "time_per_iteration": 2.75144362449646 + }, + { + "auxiliary_loss_clip": 0.01121946, + "auxiliary_loss_mlp": 0.01032834, + "balance_loss_clip": 1.04173851, + "balance_loss_mlp": 1.01962543, + "epoch": 0.5400270554637006, + "flos": 24952008942720.0, + "grad_norm": 2.016622803336766, + "language_loss": 0.67259991, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.69414771, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13226318, + "step": 8982, + "time_per_iteration": 2.6224422454833984 + }, + { + "auxiliary_loss_clip": 0.01119485, + "auxiliary_loss_mlp": 0.01037577, + "balance_loss_clip": 1.0430665, + "balance_loss_mlp": 1.0266335, + "epoch": 0.5400871787163686, + "flos": 26465721196320.0, + "grad_norm": 6.496491468416762, + "language_loss": 0.82669175, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.84826231, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.10955811, + "step": 8983, + "time_per_iteration": 2.6439168453216553 + }, + { + "auxiliary_loss_clip": 0.01117098, + "auxiliary_loss_mlp": 0.0103694, + "balance_loss_clip": 1.03988636, + "balance_loss_mlp": 1.02383292, + "epoch": 0.5401473019690365, + "flos": 23433880305600.0, + "grad_norm": 3.4510175566890657, + "language_loss": 0.7126497, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.73419011, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.13098145, + "step": 8984, + "time_per_iteration": 2.5696558952331543 + }, + { + "auxiliary_loss_clip": 0.01123968, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.04312468, + "balance_loss_mlp": 1.01811504, + "epoch": 0.5402074252217045, + "flos": 25169919639840.0, + "grad_norm": 2.9489251896626656, + "language_loss": 0.79790151, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.8194555, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13311768, + "step": 8985, + "time_per_iteration": 2.619758367538452 + }, + { + "auxiliary_loss_clip": 0.01112681, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.03993988, + "balance_loss_mlp": 1.01484025, + "epoch": 0.5402675484743724, + "flos": 30472956928800.0, + "grad_norm": 1.4823968490044557, + "language_loss": 0.79025245, + "learning_rate": 1.83641431418363e-06, + "loss": 0.8116551, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12750244, + "step": 8986, + "time_per_iteration": 4.154483795166016 + }, + { + "auxiliary_loss_clip": 0.01118241, + "auxiliary_loss_mlp": 0.01027023, + "balance_loss_clip": 1.0405426, + "balance_loss_mlp": 1.01513159, + "epoch": 0.5403276717270404, + "flos": 23744279252160.0, + "grad_norm": 1.7468353428497634, + "language_loss": 0.76896554, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.79041815, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.11889648, + "step": 8987, + "time_per_iteration": 2.6009669303894043 + }, + { + "auxiliary_loss_clip": 0.01119168, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.04159176, + "balance_loss_mlp": 1.01912355, + "epoch": 0.5403877949797083, + "flos": 22502967086880.0, + "grad_norm": 3.297084564783826, + "language_loss": 0.70570457, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.72721273, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12518311, + "step": 8988, + "time_per_iteration": 2.6572229862213135 + }, + { + "auxiliary_loss_clip": 0.01120602, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.04160142, + "balance_loss_mlp": 1.02431893, + "epoch": 0.5404479182323764, + "flos": 34523620835040.0, + "grad_norm": 4.159770759207133, + "language_loss": 0.67103893, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.69262183, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.1338501, + "step": 8989, + "time_per_iteration": 2.6685256958007812 + }, + { + "auxiliary_loss_clip": 0.01120741, + "auxiliary_loss_mlp": 0.01037062, + "balance_loss_clip": 1.04202199, + "balance_loss_mlp": 1.02449775, + "epoch": 0.5405080414850444, + "flos": 28513091374560.0, + "grad_norm": 1.5939282668004302, + "language_loss": 0.77735829, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.79893637, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12561035, + "step": 8990, + "time_per_iteration": 2.669802665710449 + }, + { + "auxiliary_loss_clip": 0.0111573, + "auxiliary_loss_mlp": 0.01027201, + "balance_loss_clip": 1.03910422, + "balance_loss_mlp": 1.01551306, + "epoch": 0.5405681647377123, + "flos": 25754055744960.0, + "grad_norm": 3.2543983890589576, + "language_loss": 0.69164819, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71307755, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11688232, + "step": 8991, + "time_per_iteration": 2.6791720390319824 + }, + { + "auxiliary_loss_clip": 0.01117803, + "auxiliary_loss_mlp": 0.01026623, + "balance_loss_clip": 1.03980076, + "balance_loss_mlp": 1.01399875, + "epoch": 0.5406282879903803, + "flos": 24952089977280.0, + "grad_norm": 2.32960980557159, + "language_loss": 0.76075596, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.78220022, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12615967, + "step": 8992, + "time_per_iteration": 2.768552303314209 + }, + { + "auxiliary_loss_clip": 0.01119835, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.0397501, + "balance_loss_mlp": 1.01750588, + "epoch": 0.5406884112430482, + "flos": 17337993636960.0, + "grad_norm": 2.807796657004866, + "language_loss": 0.76428264, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.78577811, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12219238, + "step": 8993, + "time_per_iteration": 2.665722370147705 + }, + { + "auxiliary_loss_clip": 0.01117759, + "auxiliary_loss_mlp": 0.01030254, + "balance_loss_clip": 1.04170883, + "balance_loss_mlp": 1.0184164, + "epoch": 0.5407485344957162, + "flos": 29133524612160.0, + "grad_norm": 1.6545249967080642, + "language_loss": 0.70497668, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.72645676, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.1184082, + "step": 8994, + "time_per_iteration": 2.653956890106201 + }, + { + "auxiliary_loss_clip": 0.01120833, + "auxiliary_loss_mlp": 0.01029115, + "balance_loss_clip": 1.04191065, + "balance_loss_mlp": 1.01565623, + "epoch": 0.5408086577483842, + "flos": 28246039567200.0, + "grad_norm": 2.044166521590003, + "language_loss": 0.75506437, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.77656388, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13439941, + "step": 8995, + "time_per_iteration": 2.696781635284424 + }, + { + "auxiliary_loss_clip": 0.01114924, + "auxiliary_loss_mlp": 0.01028423, + "balance_loss_clip": 1.03993607, + "balance_loss_mlp": 1.01687729, + "epoch": 0.5408687810010522, + "flos": 22904274108960.0, + "grad_norm": 2.2323702848385154, + "language_loss": 0.73160899, + "learning_rate": 1.832533059471282e-06, + "loss": 0.75304246, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11541748, + "step": 8996, + "time_per_iteration": 2.6157655715942383 + }, + { + "auxiliary_loss_clip": 0.01116542, + "auxiliary_loss_mlp": 0.01037508, + "balance_loss_clip": 1.04210603, + "balance_loss_mlp": 1.02571261, + "epoch": 0.5409289042537201, + "flos": 16937780581440.0, + "grad_norm": 1.922941088863565, + "language_loss": 0.73680425, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.75834471, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11791992, + "step": 8997, + "time_per_iteration": 2.650804281234741 + }, + { + "auxiliary_loss_clip": 0.01119446, + "auxiliary_loss_mlp": 0.0103131, + "balance_loss_clip": 1.04183745, + "balance_loss_mlp": 1.01912713, + "epoch": 0.5409890275063881, + "flos": 17650175343840.0, + "grad_norm": 2.4152774414208737, + "language_loss": 0.72009736, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.74160492, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12176514, + "step": 8998, + "time_per_iteration": 2.620471239089966 + }, + { + "auxiliary_loss_clip": 0.01117377, + "auxiliary_loss_mlp": 0.01037082, + "balance_loss_clip": 1.04061818, + "balance_loss_mlp": 1.02483344, + "epoch": 0.541049150759056, + "flos": 59764797669600.0, + "grad_norm": 1.5477355173230487, + "language_loss": 0.70371759, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.72526217, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12268066, + "step": 8999, + "time_per_iteration": 2.916511058807373 + }, + { + "auxiliary_loss_clip": 0.01116839, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.04118991, + "balance_loss_mlp": 1.01805949, + "epoch": 0.541109274011724, + "flos": 22143791685600.0, + "grad_norm": 4.950236414729861, + "language_loss": 0.80373549, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.82520884, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12420654, + "step": 9000, + "time_per_iteration": 2.6196794509887695 + }, + { + "auxiliary_loss_clip": 0.01116521, + "auxiliary_loss_mlp": 0.0102848, + "balance_loss_clip": 1.0401988, + "balance_loss_mlp": 1.01635635, + "epoch": 0.541169397264392, + "flos": 25041579948000.0, + "grad_norm": 1.6916973137936675, + "language_loss": 0.72959554, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.75104558, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12133789, + "step": 9001, + "time_per_iteration": 2.655939817428589 + }, + { + "auxiliary_loss_clip": 0.01123032, + "auxiliary_loss_mlp": 0.01035152, + "balance_loss_clip": 1.04276228, + "balance_loss_mlp": 1.02193189, + "epoch": 0.54122952051706, + "flos": 24458051602080.0, + "grad_norm": 2.1618035294506, + "language_loss": 0.84845722, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.87003911, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13226318, + "step": 9002, + "time_per_iteration": 2.600560426712036 + }, + { + "auxiliary_loss_clip": 0.01116144, + "auxiliary_loss_mlp": 0.01030116, + "balance_loss_clip": 1.04175425, + "balance_loss_mlp": 1.01911867, + "epoch": 0.541289643769728, + "flos": 23259681403200.0, + "grad_norm": 2.2347620134782935, + "language_loss": 0.78026789, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.80173051, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11004639, + "step": 9003, + "time_per_iteration": 2.6620092391967773 + }, + { + "auxiliary_loss_clip": 0.01116508, + "auxiliary_loss_mlp": 0.01029981, + "balance_loss_clip": 1.04047322, + "balance_loss_mlp": 1.01733899, + "epoch": 0.5413497670223959, + "flos": 27311074620480.0, + "grad_norm": 2.1407122481198693, + "language_loss": 0.69826341, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.71972823, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12646484, + "step": 9004, + "time_per_iteration": 2.683443069458008 + }, + { + "auxiliary_loss_clip": 0.01036865, + "auxiliary_loss_mlp": 0.0100054, + "balance_loss_clip": 1.0124774, + "balance_loss_mlp": 0.99911344, + "epoch": 0.5414098902750639, + "flos": 85453309156320.0, + "grad_norm": 0.9352008033712208, + "language_loss": 0.59092355, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61129761, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.24389648, + "router_z_loss_mlp": 0.01425934, + "step": 9005, + "time_per_iteration": 3.3722219467163086 + }, + { + "auxiliary_loss_clip": 0.01121986, + "auxiliary_loss_mlp": 0.01030585, + "balance_loss_clip": 1.04345846, + "balance_loss_mlp": 1.01908755, + "epoch": 0.5414700135277318, + "flos": 26599206582720.0, + "grad_norm": 1.9223403367267242, + "language_loss": 0.78277004, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.80429566, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.11499023, + "step": 9006, + "time_per_iteration": 2.664451837539673 + }, + { + "auxiliary_loss_clip": 0.01116294, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.04115915, + "balance_loss_mlp": 1.02033567, + "epoch": 0.5415301367803999, + "flos": 20630970812160.0, + "grad_norm": 1.846423371106376, + "language_loss": 0.83129704, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.85277539, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11199951, + "step": 9007, + "time_per_iteration": 2.6878044605255127 + }, + { + "auxiliary_loss_clip": 0.01119514, + "auxiliary_loss_mlp": 0.01029614, + "balance_loss_clip": 1.04190242, + "balance_loss_mlp": 1.01688886, + "epoch": 0.5415902600330678, + "flos": 31364534219040.0, + "grad_norm": 3.031535601031052, + "language_loss": 0.66811979, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.68961108, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.1272583, + "step": 9008, + "time_per_iteration": 2.7970242500305176 + }, + { + "auxiliary_loss_clip": 0.01124709, + "auxiliary_loss_mlp": 0.01030306, + "balance_loss_clip": 1.04342723, + "balance_loss_mlp": 1.01709795, + "epoch": 0.5416503832857358, + "flos": 23437729447200.0, + "grad_norm": 2.000137052454507, + "language_loss": 0.74000072, + "learning_rate": 1.827488379924234e-06, + "loss": 0.76155084, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.13201904, + "step": 9009, + "time_per_iteration": 2.6183743476867676 + }, + { + "auxiliary_loss_clip": 0.01121212, + "auxiliary_loss_mlp": 0.01035227, + "balance_loss_clip": 1.04198992, + "balance_loss_mlp": 1.02259099, + "epoch": 0.5417105065384037, + "flos": 15512707435680.0, + "grad_norm": 2.199337769596237, + "language_loss": 0.87403488, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.89559925, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12646484, + "step": 9010, + "time_per_iteration": 2.6450421810150146 + }, + { + "auxiliary_loss_clip": 0.0111812, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.04104221, + "balance_loss_mlp": 1.02077794, + "epoch": 0.5417706297910717, + "flos": 37016739141120.0, + "grad_norm": 2.7025843802561997, + "language_loss": 0.655559, + "learning_rate": 1.826712372694122e-06, + "loss": 0.67706698, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11901855, + "step": 9011, + "time_per_iteration": 2.6837546825408936 + }, + { + "auxiliary_loss_clip": 0.01122269, + "auxiliary_loss_mlp": 0.01037989, + "balance_loss_clip": 1.04382992, + "balance_loss_mlp": 1.02583599, + "epoch": 0.5418307530437396, + "flos": 35412280881120.0, + "grad_norm": 2.4854697738189246, + "language_loss": 0.79621363, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.8178162, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12158203, + "step": 9012, + "time_per_iteration": 4.122662305831909 + }, + { + "auxiliary_loss_clip": 0.01117966, + "auxiliary_loss_mlp": 0.01031458, + "balance_loss_clip": 1.0404923, + "balance_loss_mlp": 1.01900089, + "epoch": 0.5418908762964076, + "flos": 20588717639520.0, + "grad_norm": 3.8475638401429553, + "language_loss": 0.74070722, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.76220143, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12469482, + "step": 9013, + "time_per_iteration": 3.994046211242676 + }, + { + "auxiliary_loss_clip": 0.01119321, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.03932381, + "balance_loss_mlp": 1.01635277, + "epoch": 0.5419509995490756, + "flos": 23122630496160.0, + "grad_norm": 2.56656935423611, + "language_loss": 0.72529674, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.74678004, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12658691, + "step": 9014, + "time_per_iteration": 2.6468346118927 + }, + { + "auxiliary_loss_clip": 0.01117719, + "auxiliary_loss_mlp": 0.01032065, + "balance_loss_clip": 1.04047, + "balance_loss_mlp": 1.01999545, + "epoch": 0.5420111228017436, + "flos": 22058231891040.0, + "grad_norm": 1.828944094247948, + "language_loss": 0.80459708, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.82609493, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12072754, + "step": 9015, + "time_per_iteration": 2.767357110977173 + }, + { + "auxiliary_loss_clip": 0.01122474, + "auxiliary_loss_mlp": 0.01034923, + "balance_loss_clip": 1.04187918, + "balance_loss_mlp": 1.02248979, + "epoch": 0.5420712460544116, + "flos": 23259154678560.0, + "grad_norm": 2.531079548193992, + "language_loss": 0.81299484, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.83456886, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12426758, + "step": 9016, + "time_per_iteration": 2.7823829650878906 + }, + { + "auxiliary_loss_clip": 0.01116804, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.04000711, + "balance_loss_mlp": 1.01837134, + "epoch": 0.5421313693070795, + "flos": 22191717277440.0, + "grad_norm": 1.7049862032540772, + "language_loss": 0.81305367, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.8345322, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12677002, + "step": 9017, + "time_per_iteration": 2.6585748195648193 + }, + { + "auxiliary_loss_clip": 0.01115491, + "auxiliary_loss_mlp": 0.01032535, + "balance_loss_clip": 1.04046202, + "balance_loss_mlp": 1.02006578, + "epoch": 0.5421914925597475, + "flos": 15869289731040.0, + "grad_norm": 1.6470540401917297, + "language_loss": 0.77609229, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.79757261, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12475586, + "step": 9018, + "time_per_iteration": 2.6803879737854004 + }, + { + "auxiliary_loss_clip": 0.01118884, + "auxiliary_loss_mlp": 0.01036166, + "balance_loss_clip": 1.03819966, + "balance_loss_mlp": 1.02325535, + "epoch": 0.5422516158124154, + "flos": 57056402289600.0, + "grad_norm": 1.6077514404133788, + "language_loss": 0.66667402, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.68822443, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12896729, + "step": 9019, + "time_per_iteration": 4.3010053634643555 + }, + { + "auxiliary_loss_clip": 0.01114741, + "auxiliary_loss_mlp": 0.01025377, + "balance_loss_clip": 1.03977764, + "balance_loss_mlp": 1.01433814, + "epoch": 0.5423117390650835, + "flos": 38752859509920.0, + "grad_norm": 1.9546555314427592, + "language_loss": 0.69893324, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.72033453, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11035156, + "step": 9020, + "time_per_iteration": 2.7206308841705322 + }, + { + "auxiliary_loss_clip": 0.01113926, + "auxiliary_loss_mlp": 0.01037915, + "balance_loss_clip": 1.0396682, + "balance_loss_mlp": 1.02552319, + "epoch": 0.5423718623177514, + "flos": 33192089388000.0, + "grad_norm": 1.6050295107283092, + "language_loss": 0.80559468, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.82711303, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12384033, + "step": 9021, + "time_per_iteration": 2.7268691062927246 + }, + { + "auxiliary_loss_clip": 0.0111882, + "auxiliary_loss_mlp": 0.01038451, + "balance_loss_clip": 1.04228926, + "balance_loss_mlp": 1.02568424, + "epoch": 0.5424319855704194, + "flos": 28731974486400.0, + "grad_norm": 1.5975988498475737, + "language_loss": 0.78682053, + "learning_rate": 1.822444805916788e-06, + "loss": 0.80839324, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12762451, + "step": 9022, + "time_per_iteration": 2.659637451171875 + }, + { + "auxiliary_loss_clip": 0.01117174, + "auxiliary_loss_mlp": 0.01038276, + "balance_loss_clip": 1.04039967, + "balance_loss_mlp": 1.02594984, + "epoch": 0.5424921088230873, + "flos": 32476817898720.0, + "grad_norm": 2.877597736254472, + "language_loss": 0.82437938, + "learning_rate": 1.822056885403915e-06, + "loss": 0.84593391, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12329102, + "step": 9023, + "time_per_iteration": 2.702094078063965 + }, + { + "auxiliary_loss_clip": 0.0111823, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.04128313, + "balance_loss_mlp": 1.01561558, + "epoch": 0.5425522320757553, + "flos": 28780872492960.0, + "grad_norm": 2.8365930426586266, + "language_loss": 0.71358788, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.73504329, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.11688232, + "step": 9024, + "time_per_iteration": 2.645352363586426 + }, + { + "auxiliary_loss_clip": 0.01118634, + "auxiliary_loss_mlp": 0.01033604, + "balance_loss_clip": 1.03987706, + "balance_loss_mlp": 1.02165937, + "epoch": 0.5426123553284232, + "flos": 37329447572640.0, + "grad_norm": 1.7994431331044376, + "language_loss": 0.65159434, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67311674, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.11938477, + "step": 9025, + "time_per_iteration": 2.7311911582946777 + }, + { + "auxiliary_loss_clip": 0.01121784, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.04226398, + "balance_loss_mlp": 1.02207518, + "epoch": 0.5426724785810912, + "flos": 15245939249280.0, + "grad_norm": 3.371185031545014, + "language_loss": 0.73674685, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.75830656, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12115479, + "step": 9026, + "time_per_iteration": 3.9456865787506104 + }, + { + "auxiliary_loss_clip": 0.01118736, + "auxiliary_loss_mlp": 0.01037493, + "balance_loss_clip": 1.03988957, + "balance_loss_mlp": 1.02373075, + "epoch": 0.5427326018337592, + "flos": 31804771996800.0, + "grad_norm": 1.679639994875594, + "language_loss": 0.78544617, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.80700856, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13763428, + "step": 9027, + "time_per_iteration": 2.694791555404663 + }, + { + "auxiliary_loss_clip": 0.0103464, + "auxiliary_loss_mlp": 0.01003103, + "balance_loss_clip": 1.01020598, + "balance_loss_mlp": 1.00168777, + "epoch": 0.5427927250864272, + "flos": 80514592963200.0, + "grad_norm": 0.7477776030708569, + "language_loss": 0.56529355, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58567101, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 0.01415253, + "step": 9028, + "time_per_iteration": 3.284959077835083 + }, + { + "auxiliary_loss_clip": 0.01121196, + "auxiliary_loss_mlp": 0.01031016, + "balance_loss_clip": 1.04224789, + "balance_loss_mlp": 1.01755762, + "epoch": 0.5428528483390952, + "flos": 24373869395040.0, + "grad_norm": 2.0543017470741214, + "language_loss": 0.78175902, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.80328119, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13470459, + "step": 9029, + "time_per_iteration": 2.636422872543335 + }, + { + "auxiliary_loss_clip": 0.01117559, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.0410372, + "balance_loss_mlp": 1.01453471, + "epoch": 0.5429129715917631, + "flos": 26640203719680.0, + "grad_norm": 1.6676540459211262, + "language_loss": 0.83297843, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85443026, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.1307373, + "step": 9030, + "time_per_iteration": 2.723844528198242 + }, + { + "auxiliary_loss_clip": 0.01118583, + "auxiliary_loss_mlp": 0.01034934, + "balance_loss_clip": 1.04252553, + "balance_loss_mlp": 1.02326298, + "epoch": 0.5429730948444311, + "flos": 33900999664320.0, + "grad_norm": 3.2220583148558295, + "language_loss": 0.75078118, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.77231634, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11669922, + "step": 9031, + "time_per_iteration": 2.6666626930236816 + }, + { + "auxiliary_loss_clip": 0.01114742, + "auxiliary_loss_mlp": 0.01031127, + "balance_loss_clip": 1.04038906, + "balance_loss_mlp": 1.01987982, + "epoch": 0.543033218097099, + "flos": 32653650424320.0, + "grad_norm": 1.970820172464602, + "language_loss": 0.85489333, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87635207, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11248779, + "step": 9032, + "time_per_iteration": 2.6604068279266357 + }, + { + "auxiliary_loss_clip": 0.01121362, + "auxiliary_loss_mlp": 0.01033827, + "balance_loss_clip": 1.04095197, + "balance_loss_mlp": 1.02085066, + "epoch": 0.5430933413497671, + "flos": 27668467261440.0, + "grad_norm": 2.927731038112954, + "language_loss": 0.74119139, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.76274323, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12982178, + "step": 9033, + "time_per_iteration": 2.6094534397125244 + }, + { + "auxiliary_loss_clip": 0.01117665, + "auxiliary_loss_mlp": 0.01038525, + "balance_loss_clip": 1.04079068, + "balance_loss_mlp": 1.02554274, + "epoch": 0.543153464602435, + "flos": 30026479489920.0, + "grad_norm": 1.7091845266621568, + "language_loss": 0.75142193, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.77298385, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12994385, + "step": 9034, + "time_per_iteration": 2.6605453491210938 + }, + { + "auxiliary_loss_clip": 0.01118168, + "auxiliary_loss_mlp": 0.01027498, + "balance_loss_clip": 1.04214644, + "balance_loss_mlp": 1.01559496, + "epoch": 0.543213587855103, + "flos": 23215118745600.0, + "grad_norm": 1.7893413788778327, + "language_loss": 0.84410429, + "learning_rate": 1.817402369770655e-06, + "loss": 0.86556101, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11914062, + "step": 9035, + "time_per_iteration": 2.609703302383423 + }, + { + "auxiliary_loss_clip": 0.01033975, + "auxiliary_loss_mlp": 0.01002105, + "balance_loss_clip": 1.00930214, + "balance_loss_mlp": 1.00075006, + "epoch": 0.5432737111077709, + "flos": 80151163247520.0, + "grad_norm": 0.7193793364511633, + "language_loss": 0.5591706, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.57953143, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.24658203, + "router_z_loss_mlp": 0.01356506, + "step": 9036, + "time_per_iteration": 3.253244638442993 + }, + { + "auxiliary_loss_clip": 0.01121254, + "auxiliary_loss_mlp": 0.01031886, + "balance_loss_clip": 1.04086161, + "balance_loss_mlp": 1.01922011, + "epoch": 0.5433338343604389, + "flos": 26956315602720.0, + "grad_norm": 1.7728667906003446, + "language_loss": 0.74777216, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.76930356, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12658691, + "step": 9037, + "time_per_iteration": 2.6859498023986816 + }, + { + "auxiliary_loss_clip": 0.01117162, + "auxiliary_loss_mlp": 0.01036208, + "balance_loss_clip": 1.03907084, + "balance_loss_mlp": 1.0231781, + "epoch": 0.5433939576131068, + "flos": 42309201420000.0, + "grad_norm": 1.7373240689382314, + "language_loss": 0.66432607, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.6858598, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.13031006, + "step": 9038, + "time_per_iteration": 2.7618885040283203 + }, + { + "auxiliary_loss_clip": 0.01116572, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.04000974, + "balance_loss_mlp": 1.01997948, + "epoch": 0.5434540808657748, + "flos": 24774122967840.0, + "grad_norm": 1.8142711809130585, + "language_loss": 0.78006834, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.8015554, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12158203, + "step": 9039, + "time_per_iteration": 2.6967577934265137 + }, + { + "auxiliary_loss_clip": 0.01118589, + "auxiliary_loss_mlp": 0.01034079, + "balance_loss_clip": 1.04057717, + "balance_loss_mlp": 1.02160931, + "epoch": 0.5435142041184428, + "flos": 28202408807040.0, + "grad_norm": 1.8210114152644754, + "language_loss": 0.7626968, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.78422344, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12481689, + "step": 9040, + "time_per_iteration": 2.647366523742676 + }, + { + "auxiliary_loss_clip": 0.01033124, + "auxiliary_loss_mlp": 0.01000164, + "balance_loss_clip": 1.0085659, + "balance_loss_mlp": 0.9987269, + "epoch": 0.5435743273711108, + "flos": 78108371521920.0, + "grad_norm": 0.6662488854027587, + "language_loss": 0.5242914, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54462421, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.24560547, + "router_z_loss_mlp": 0.01436615, + "step": 9041, + "time_per_iteration": 3.2405805587768555 + }, + { + "auxiliary_loss_clip": 0.01117925, + "auxiliary_loss_mlp": 0.01034511, + "balance_loss_clip": 1.03988171, + "balance_loss_mlp": 1.0222863, + "epoch": 0.5436344506237788, + "flos": 30650478248160.0, + "grad_norm": 1.7782195187732381, + "language_loss": 0.76435709, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.7858814, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12225342, + "step": 9042, + "time_per_iteration": 2.693497657775879 + }, + { + "auxiliary_loss_clip": 0.01114303, + "auxiliary_loss_mlp": 0.01030737, + "balance_loss_clip": 1.03880453, + "balance_loss_mlp": 1.01934016, + "epoch": 0.5436945738764467, + "flos": 23882910333120.0, + "grad_norm": 2.5777209184667504, + "language_loss": 0.67163336, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69308376, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11401367, + "step": 9043, + "time_per_iteration": 2.652465343475342 + }, + { + "auxiliary_loss_clip": 0.01113602, + "auxiliary_loss_mlp": 0.01033302, + "balance_loss_clip": 1.03869677, + "balance_loss_mlp": 1.02110076, + "epoch": 0.5437546971291147, + "flos": 25798132195200.0, + "grad_norm": 1.9309595245700832, + "language_loss": 0.84232509, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.86379409, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12194824, + "step": 9044, + "time_per_iteration": 2.6377370357513428 + }, + { + "auxiliary_loss_clip": 0.01123188, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.04161143, + "balance_loss_mlp": 1.01625025, + "epoch": 0.5438148203817826, + "flos": 30650235144480.0, + "grad_norm": 8.585690951496446, + "language_loss": 0.62107944, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.64260149, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.12756348, + "step": 9045, + "time_per_iteration": 2.679757595062256 + }, + { + "auxiliary_loss_clip": 0.01121619, + "auxiliary_loss_mlp": 0.01031648, + "balance_loss_clip": 1.0431124, + "balance_loss_mlp": 1.01950026, + "epoch": 0.5438749436344507, + "flos": 28068396696000.0, + "grad_norm": 1.5418857018025693, + "language_loss": 0.69828737, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.71982002, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12145996, + "step": 9046, + "time_per_iteration": 2.6824398040771484 + }, + { + "auxiliary_loss_clip": 0.01117327, + "auxiliary_loss_mlp": 0.01029944, + "balance_loss_clip": 1.04081893, + "balance_loss_mlp": 1.01797581, + "epoch": 0.5439350668871186, + "flos": 18896876307360.0, + "grad_norm": 1.5134886492732749, + "language_loss": 0.77276802, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.79424071, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11962891, + "step": 9047, + "time_per_iteration": 2.6474239826202393 + }, + { + "auxiliary_loss_clip": 0.01119488, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.0415132, + "balance_loss_mlp": 1.02439427, + "epoch": 0.5439951901397866, + "flos": 21034060594560.0, + "grad_norm": 2.1074411845230885, + "language_loss": 0.72585094, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.74741238, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.1227417, + "step": 9048, + "time_per_iteration": 2.6379599571228027 + }, + { + "auxiliary_loss_clip": 0.01120427, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.04373515, + "balance_loss_mlp": 1.01653183, + "epoch": 0.5440553133924545, + "flos": 22770059411520.0, + "grad_norm": 2.3014031359944873, + "language_loss": 0.93151653, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.95301777, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.13165283, + "step": 9049, + "time_per_iteration": 2.6054062843322754 + }, + { + "auxiliary_loss_clip": 0.01114754, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.03854418, + "balance_loss_mlp": 1.0216409, + "epoch": 0.5441154366451225, + "flos": 33096197687040.0, + "grad_norm": 1.886674643092412, + "language_loss": 0.73718566, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.75866532, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11566162, + "step": 9050, + "time_per_iteration": 2.694415330886841 + }, + { + "auxiliary_loss_clip": 0.01119867, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.04183233, + "balance_loss_mlp": 1.02006674, + "epoch": 0.5441755598977904, + "flos": 31718239787520.0, + "grad_norm": 1.980617969908303, + "language_loss": 0.67460859, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.69613087, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12304688, + "step": 9051, + "time_per_iteration": 4.2314088344573975 + }, + { + "auxiliary_loss_clip": 0.01119305, + "auxiliary_loss_mlp": 0.01034474, + "balance_loss_clip": 1.04091549, + "balance_loss_mlp": 1.02196288, + "epoch": 0.5442356831504584, + "flos": 39511032448320.0, + "grad_norm": 1.8470944477703006, + "language_loss": 0.67058766, + "learning_rate": 1.810810185460011e-06, + "loss": 0.69212556, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12493896, + "step": 9052, + "time_per_iteration": 2.7417469024658203 + }, + { + "auxiliary_loss_clip": 0.01120823, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.04153705, + "balance_loss_mlp": 1.01929092, + "epoch": 0.5442958064031264, + "flos": 29486136214080.0, + "grad_norm": 1.9238962303301337, + "language_loss": 0.92651957, + "learning_rate": 1.810422473773436e-06, + "loss": 0.9480505, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12988281, + "step": 9053, + "time_per_iteration": 4.063756704330444 + }, + { + "auxiliary_loss_clip": 0.01120982, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.04107571, + "balance_loss_mlp": 1.02525353, + "epoch": 0.5443559296557944, + "flos": 22897102550400.0, + "grad_norm": 3.116598621384694, + "language_loss": 0.83773196, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.8593176, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12335205, + "step": 9054, + "time_per_iteration": 2.594867706298828 + }, + { + "auxiliary_loss_clip": 0.01121147, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.04257548, + "balance_loss_mlp": 1.01949024, + "epoch": 0.5444160529084624, + "flos": 27616935631680.0, + "grad_norm": 2.4401760519500457, + "language_loss": 0.68257654, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.7041105, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12756348, + "step": 9055, + "time_per_iteration": 2.643035888671875 + }, + { + "auxiliary_loss_clip": 0.01034488, + "auxiliary_loss_mlp": 0.01003056, + "balance_loss_clip": 1.00963533, + "balance_loss_mlp": 1.00169218, + "epoch": 0.5444761761611303, + "flos": 85013760172320.0, + "grad_norm": 0.7488426356651192, + "language_loss": 0.5764395, + "learning_rate": 1.80925938190531e-06, + "loss": 0.59681493, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.24841309, + "router_z_loss_mlp": 0.01364899, + "step": 9056, + "time_per_iteration": 3.2439777851104736 + }, + { + "auxiliary_loss_clip": 0.01122375, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.04166603, + "balance_loss_mlp": 1.02135301, + "epoch": 0.5445362994137983, + "flos": 17422499982240.0, + "grad_norm": 2.1247153599624102, + "language_loss": 0.69811338, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.71968526, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13470459, + "step": 9057, + "time_per_iteration": 2.634478807449341 + }, + { + "auxiliary_loss_clip": 0.01117057, + "auxiliary_loss_mlp": 0.01034861, + "balance_loss_clip": 1.04044092, + "balance_loss_mlp": 1.02186728, + "epoch": 0.5445964226664662, + "flos": 35369663052960.0, + "grad_norm": 2.6426138664418177, + "language_loss": 0.75153494, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.77305412, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13006592, + "step": 9058, + "time_per_iteration": 4.117671251296997 + }, + { + "auxiliary_loss_clip": 0.01034777, + "auxiliary_loss_mlp": 0.01004221, + "balance_loss_clip": 1.00997543, + "balance_loss_mlp": 1.00292516, + "epoch": 0.5446565459191343, + "flos": 83729830178880.0, + "grad_norm": 0.7951103605990162, + "language_loss": 0.62630063, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64669061, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 0.01296234, + "step": 9059, + "time_per_iteration": 3.3326475620269775 + }, + { + "auxiliary_loss_clip": 0.01118642, + "auxiliary_loss_mlp": 0.01033983, + "balance_loss_clip": 1.04171193, + "balance_loss_mlp": 1.0215857, + "epoch": 0.5447166691718022, + "flos": 19787035492800.0, + "grad_norm": 1.906270977938517, + "language_loss": 0.79287881, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81440508, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12402344, + "step": 9060, + "time_per_iteration": 2.631708860397339 + }, + { + "auxiliary_loss_clip": 0.01121269, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.04182601, + "balance_loss_mlp": 1.01971364, + "epoch": 0.5447767924244702, + "flos": 31541407261920.0, + "grad_norm": 2.080991740240898, + "language_loss": 0.79557025, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.81710744, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12738037, + "step": 9061, + "time_per_iteration": 2.7918920516967773 + }, + { + "auxiliary_loss_clip": 0.01118123, + "auxiliary_loss_mlp": 0.01024536, + "balance_loss_clip": 1.04174101, + "balance_loss_mlp": 1.0125134, + "epoch": 0.5448369156771381, + "flos": 24011168990400.0, + "grad_norm": 2.6476607602909032, + "language_loss": 0.87118924, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.8926158, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12023926, + "step": 9062, + "time_per_iteration": 2.612182378768921 + }, + { + "auxiliary_loss_clip": 0.01123095, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.04169095, + "balance_loss_mlp": 1.01966023, + "epoch": 0.5448970389298061, + "flos": 23527543556160.0, + "grad_norm": 2.4638533186794285, + "language_loss": 0.82481754, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.84638035, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.13537598, + "step": 9063, + "time_per_iteration": 2.652055501937866 + }, + { + "auxiliary_loss_clip": 0.01119368, + "auxiliary_loss_mlp": 0.01030112, + "balance_loss_clip": 1.04009509, + "balance_loss_mlp": 1.01752973, + "epoch": 0.544957162182474, + "flos": 25614533283840.0, + "grad_norm": 8.204251620870535, + "language_loss": 0.6390872, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.66058201, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12585449, + "step": 9064, + "time_per_iteration": 2.6399319171905518 + }, + { + "auxiliary_loss_clip": 0.01121662, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.04198897, + "balance_loss_mlp": 1.02011788, + "epoch": 0.545017285435142, + "flos": 30962457368640.0, + "grad_norm": 1.6688281741407605, + "language_loss": 0.80118382, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.82273114, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12963867, + "step": 9065, + "time_per_iteration": 4.141050577163696 + }, + { + "auxiliary_loss_clip": 0.01116223, + "auxiliary_loss_mlp": 0.01029423, + "balance_loss_clip": 1.04030871, + "balance_loss_mlp": 1.018116, + "epoch": 0.54507740868781, + "flos": 23348198959200.0, + "grad_norm": 2.3504818987570637, + "language_loss": 0.77870524, + "learning_rate": 1.805382881379827e-06, + "loss": 0.80016172, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11315918, + "step": 9066, + "time_per_iteration": 2.7001566886901855 + }, + { + "auxiliary_loss_clip": 0.01121224, + "auxiliary_loss_mlp": 0.01029795, + "balance_loss_clip": 1.04030001, + "balance_loss_mlp": 1.01710546, + "epoch": 0.545137531940478, + "flos": 32030502528960.0, + "grad_norm": 2.2835125316652305, + "language_loss": 0.76292735, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.78443754, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.12689209, + "step": 9067, + "time_per_iteration": 2.765087842941284 + }, + { + "auxiliary_loss_clip": 0.01125681, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.0417285, + "balance_loss_mlp": 1.02047443, + "epoch": 0.545197655193146, + "flos": 45827179816320.0, + "grad_norm": 2.577502191277401, + "language_loss": 0.62975025, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.65136129, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 0.83935547, + "router_z_loss_mlp": 0.14941406, + "step": 9068, + "time_per_iteration": 2.7743210792541504 + }, + { + "auxiliary_loss_clip": 0.01118349, + "auxiliary_loss_mlp": 0.01038268, + "balance_loss_clip": 1.04200554, + "balance_loss_mlp": 1.02631128, + "epoch": 0.5452577784458139, + "flos": 31763774859840.0, + "grad_norm": 1.6296113390179872, + "language_loss": 0.72096443, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.74253058, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.1194458, + "step": 9069, + "time_per_iteration": 2.664471387863159 + }, + { + "auxiliary_loss_clip": 0.01119896, + "auxiliary_loss_mlp": 0.01026379, + "balance_loss_clip": 1.0441221, + "balance_loss_mlp": 1.01516175, + "epoch": 0.5453179016984819, + "flos": 21522993792480.0, + "grad_norm": 1.9089132237531572, + "language_loss": 0.73820382, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.75966656, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11212158, + "step": 9070, + "time_per_iteration": 2.6750130653381348 + }, + { + "auxiliary_loss_clip": 0.011184, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.04077351, + "balance_loss_mlp": 1.02035058, + "epoch": 0.5453780249511498, + "flos": 28329695049600.0, + "grad_norm": 2.186746363751902, + "language_loss": 0.60574973, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.6272645, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12731934, + "step": 9071, + "time_per_iteration": 2.6472244262695312 + }, + { + "auxiliary_loss_clip": 0.01033998, + "auxiliary_loss_mlp": 0.01002204, + "balance_loss_clip": 1.00945783, + "balance_loss_mlp": 1.00087225, + "epoch": 0.5454381482038179, + "flos": 83822602049280.0, + "grad_norm": 0.703715609020086, + "language_loss": 0.57154608, + "learning_rate": 1.80305733435899e-06, + "loss": 0.5919081, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.24536133, + "router_z_loss_mlp": 0.01333618, + "step": 9072, + "time_per_iteration": 3.366832971572876 + }, + { + "auxiliary_loss_clip": 0.01115648, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.03993011, + "balance_loss_mlp": 1.0208652, + "epoch": 0.5454982714564858, + "flos": 16180134367680.0, + "grad_norm": 1.6263150032716007, + "language_loss": 0.69475126, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.71623576, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11950684, + "step": 9073, + "time_per_iteration": 2.680119514465332 + }, + { + "auxiliary_loss_clip": 0.01115989, + "auxiliary_loss_mlp": 0.01033051, + "balance_loss_clip": 1.04032826, + "balance_loss_mlp": 1.02151752, + "epoch": 0.5455583947091538, + "flos": 26644214930400.0, + "grad_norm": 2.6260727283389182, + "language_loss": 0.71253645, + "learning_rate": 1.802282211606627e-06, + "loss": 0.73402685, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11535645, + "step": 9074, + "time_per_iteration": 2.6447439193725586 + }, + { + "auxiliary_loss_clip": 0.01116613, + "auxiliary_loss_mlp": 0.01035374, + "balance_loss_clip": 1.03907812, + "balance_loss_mlp": 1.02311349, + "epoch": 0.5456185179618217, + "flos": 21741066558720.0, + "grad_norm": 2.095217451565906, + "language_loss": 0.68499565, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.70651555, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12255859, + "step": 9075, + "time_per_iteration": 2.620143413543701 + }, + { + "auxiliary_loss_clip": 0.01117516, + "auxiliary_loss_mlp": 0.01031048, + "balance_loss_clip": 1.04204345, + "balance_loss_mlp": 1.02002692, + "epoch": 0.5456786412144897, + "flos": 25708925845440.0, + "grad_norm": 2.210889553843951, + "language_loss": 0.80352092, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.8250066, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11029053, + "step": 9076, + "time_per_iteration": 2.607546806335449 + }, + { + "auxiliary_loss_clip": 0.0111927, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.04115605, + "balance_loss_mlp": 1.02065754, + "epoch": 0.5457387644671576, + "flos": 28424735887680.0, + "grad_norm": 1.688982021291041, + "language_loss": 0.80246741, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82398474, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.11816406, + "step": 9077, + "time_per_iteration": 2.664874792098999 + }, + { + "auxiliary_loss_clip": 0.01117583, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.03895795, + "balance_loss_mlp": 1.01929891, + "epoch": 0.5457988877198257, + "flos": 26377163123040.0, + "grad_norm": 3.82902672582344, + "language_loss": 0.67716801, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.69865745, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12060547, + "step": 9078, + "time_per_iteration": 2.6979727745056152 + }, + { + "auxiliary_loss_clip": 0.01122051, + "auxiliary_loss_mlp": 0.01036428, + "balance_loss_clip": 1.04228616, + "balance_loss_mlp": 1.02398825, + "epoch": 0.5458590109724936, + "flos": 28995865945920.0, + "grad_norm": 2.0368132485091035, + "language_loss": 0.80818748, + "learning_rate": 1.800344536188764e-06, + "loss": 0.82977223, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12432861, + "step": 9079, + "time_per_iteration": 2.6775248050689697 + }, + { + "auxiliary_loss_clip": 0.01125908, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.04300547, + "balance_loss_mlp": 1.01997542, + "epoch": 0.5459191342251616, + "flos": 29802977408160.0, + "grad_norm": 1.728735728934033, + "language_loss": 0.75639546, + "learning_rate": 1.799957023759277e-06, + "loss": 0.77799302, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 0.82910156, + "router_z_loss_mlp": 0.13885498, + "step": 9080, + "time_per_iteration": 2.6652166843414307 + }, + { + "auxiliary_loss_clip": 0.01120709, + "auxiliary_loss_mlp": 0.01030657, + "balance_loss_clip": 1.04144692, + "balance_loss_mlp": 1.01772833, + "epoch": 0.5459792574778296, + "flos": 28825070495040.0, + "grad_norm": 2.431967724963442, + "language_loss": 0.83340377, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.85491741, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12927246, + "step": 9081, + "time_per_iteration": 2.691657304763794 + }, + { + "auxiliary_loss_clip": 0.0112436, + "auxiliary_loss_mlp": 0.01031153, + "balance_loss_clip": 1.04302621, + "balance_loss_mlp": 1.01842117, + "epoch": 0.5460393807304975, + "flos": 23349090339360.0, + "grad_norm": 1.619055977109652, + "language_loss": 0.69943422, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.72098941, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12731934, + "step": 9082, + "time_per_iteration": 2.6884090900421143 + }, + { + "auxiliary_loss_clip": 0.01115199, + "auxiliary_loss_mlp": 0.01028472, + "balance_loss_clip": 1.0398097, + "balance_loss_mlp": 1.01650894, + "epoch": 0.5460995039831655, + "flos": 43829031782880.0, + "grad_norm": 2.4838006711173697, + "language_loss": 0.66816509, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.68960178, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11962891, + "step": 9083, + "time_per_iteration": 2.758554458618164 + }, + { + "auxiliary_loss_clip": 0.01117036, + "auxiliary_loss_mlp": 0.01028878, + "balance_loss_clip": 1.0407027, + "balance_loss_mlp": 1.01683211, + "epoch": 0.5461596272358334, + "flos": 32652272836800.0, + "grad_norm": 2.0874775865561066, + "language_loss": 0.78983665, + "learning_rate": 1.798407050044766e-06, + "loss": 0.81129575, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12036133, + "step": 9084, + "time_per_iteration": 2.6586880683898926 + }, + { + "auxiliary_loss_clip": 0.0112207, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.04204845, + "balance_loss_mlp": 1.01957476, + "epoch": 0.5462197504885015, + "flos": 25485869453760.0, + "grad_norm": 3.1212824011502125, + "language_loss": 0.74888355, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77042484, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12475586, + "step": 9085, + "time_per_iteration": 2.6835198402404785 + }, + { + "auxiliary_loss_clip": 0.01120322, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.04109073, + "balance_loss_mlp": 1.01848412, + "epoch": 0.5462798737411694, + "flos": 31488335975520.0, + "grad_norm": 1.8988950850906683, + "language_loss": 0.74603283, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.76754659, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12579346, + "step": 9086, + "time_per_iteration": 2.682469606399536 + }, + { + "auxiliary_loss_clip": 0.01119541, + "auxiliary_loss_mlp": 0.01029327, + "balance_loss_clip": 1.04054189, + "balance_loss_mlp": 1.0167383, + "epoch": 0.5463399969938374, + "flos": 31451187980160.0, + "grad_norm": 1.635642944637423, + "language_loss": 0.77023226, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.79172093, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12573242, + "step": 9087, + "time_per_iteration": 2.7203192710876465 + }, + { + "auxiliary_loss_clip": 0.01123553, + "auxiliary_loss_mlp": 0.01039338, + "balance_loss_clip": 1.04312062, + "balance_loss_mlp": 1.02537882, + "epoch": 0.5464001202465053, + "flos": 22992508044000.0, + "grad_norm": 1.7412359282443808, + "language_loss": 0.77686125, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.79849017, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13946533, + "step": 9088, + "time_per_iteration": 2.6504082679748535 + }, + { + "auxiliary_loss_clip": 0.01036228, + "auxiliary_loss_mlp": 0.01002237, + "balance_loss_clip": 1.01149035, + "balance_loss_mlp": 1.00084579, + "epoch": 0.5464602434991733, + "flos": 84254493267360.0, + "grad_norm": 0.7292940066110187, + "language_loss": 0.57692814, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.59731275, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.24743652, + "router_z_loss_mlp": 0.01390839, + "step": 9089, + "time_per_iteration": 3.278278350830078 + }, + { + "auxiliary_loss_clip": 0.01119742, + "auxiliary_loss_mlp": 0.01030861, + "balance_loss_clip": 1.04004645, + "balance_loss_mlp": 1.01873112, + "epoch": 0.5465203667518412, + "flos": 33629653025280.0, + "grad_norm": 1.8377999259158924, + "language_loss": 0.76534098, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.786847, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12139893, + "step": 9090, + "time_per_iteration": 2.75003981590271 + }, + { + "auxiliary_loss_clip": 0.01121954, + "auxiliary_loss_mlp": 0.01033837, + "balance_loss_clip": 1.04026115, + "balance_loss_mlp": 1.02030647, + "epoch": 0.5465804900045093, + "flos": 25881220435680.0, + "grad_norm": 1.9844213177375278, + "language_loss": 0.73989379, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.76145172, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.13543701, + "step": 9091, + "time_per_iteration": 4.116035461425781 + }, + { + "auxiliary_loss_clip": 0.01121486, + "auxiliary_loss_mlp": 0.01035843, + "balance_loss_clip": 1.04218888, + "balance_loss_mlp": 1.02298021, + "epoch": 0.5466406132571772, + "flos": 27441480693600.0, + "grad_norm": 2.544277000432198, + "language_loss": 0.77517962, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.79675293, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12860107, + "step": 9092, + "time_per_iteration": 4.012652397155762 + }, + { + "auxiliary_loss_clip": 0.01123442, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.04282355, + "balance_loss_mlp": 1.01853871, + "epoch": 0.5467007365098452, + "flos": 21567880588320.0, + "grad_norm": 2.2543241776234435, + "language_loss": 0.75110626, + "learning_rate": 1.794920057818476e-06, + "loss": 0.77265275, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12670898, + "step": 9093, + "time_per_iteration": 2.675962448120117 + }, + { + "auxiliary_loss_clip": 0.0112137, + "auxiliary_loss_mlp": 0.01037509, + "balance_loss_clip": 1.04015326, + "balance_loss_mlp": 1.0236268, + "epoch": 0.5467608597625132, + "flos": 19156027245120.0, + "grad_norm": 2.1960906305979058, + "language_loss": 0.68994296, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.71153176, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13891602, + "step": 9094, + "time_per_iteration": 2.595477342605591 + }, + { + "auxiliary_loss_clip": 0.0112129, + "auxiliary_loss_mlp": 0.01034653, + "balance_loss_clip": 1.04322219, + "balance_loss_mlp": 1.02307808, + "epoch": 0.5468209830151811, + "flos": 29667142019520.0, + "grad_norm": 3.107097318119796, + "language_loss": 0.67359269, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.69515216, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.11566162, + "step": 9095, + "time_per_iteration": 2.6523704528808594 + }, + { + "auxiliary_loss_clip": 0.01120401, + "auxiliary_loss_mlp": 0.01037763, + "balance_loss_clip": 1.04309046, + "balance_loss_mlp": 1.02556145, + "epoch": 0.5468811062678491, + "flos": 35589680648640.0, + "grad_norm": 2.529447852076833, + "language_loss": 0.66487652, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.68645817, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12207031, + "step": 9096, + "time_per_iteration": 2.7363719940185547 + }, + { + "auxiliary_loss_clip": 0.01034268, + "auxiliary_loss_mlp": 0.0100647, + "balance_loss_clip": 1.00978923, + "balance_loss_mlp": 1.00521481, + "epoch": 0.546941229520517, + "flos": 82812889071360.0, + "grad_norm": 0.7420680911589883, + "language_loss": 0.57588905, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59629643, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 0.24475098, + "router_z_loss_mlp": 0.01254272, + "step": 9097, + "time_per_iteration": 3.3431453704833984 + }, + { + "auxiliary_loss_clip": 0.01035202, + "auxiliary_loss_mlp": 0.01006162, + "balance_loss_clip": 1.01066279, + "balance_loss_mlp": 1.00491762, + "epoch": 0.5470013527731851, + "flos": 71102310131520.0, + "grad_norm": 0.940188811438361, + "language_loss": 0.64721859, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.66763222, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.24560547, + "router_z_loss_mlp": 0.01243591, + "step": 9098, + "time_per_iteration": 4.590398073196411 + }, + { + "auxiliary_loss_clip": 0.01121217, + "auxiliary_loss_mlp": 0.0103957, + "balance_loss_clip": 1.04139757, + "balance_loss_mlp": 1.02642131, + "epoch": 0.547061476025853, + "flos": 28023266796480.0, + "grad_norm": 1.8466193842206486, + "language_loss": 0.73043585, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75204372, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13146973, + "step": 9099, + "time_per_iteration": 2.7023539543151855 + }, + { + "auxiliary_loss_clip": 0.01117753, + "auxiliary_loss_mlp": 0.01035063, + "balance_loss_clip": 1.04036856, + "balance_loss_mlp": 1.02401829, + "epoch": 0.547121599278521, + "flos": 36568316872800.0, + "grad_norm": 2.6172690819570037, + "language_loss": 0.72250211, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.74403024, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.1104126, + "step": 9100, + "time_per_iteration": 2.8355283737182617 + }, + { + "auxiliary_loss_clip": 0.011174, + "auxiliary_loss_mlp": 0.01028343, + "balance_loss_clip": 1.0400579, + "balance_loss_mlp": 1.01592779, + "epoch": 0.5471817225311889, + "flos": 44582140061280.0, + "grad_norm": 1.7142379252535862, + "language_loss": 0.6766876, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.69814503, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12414551, + "step": 9101, + "time_per_iteration": 2.7773730754852295 + }, + { + "auxiliary_loss_clip": 0.01120616, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.041857, + "balance_loss_mlp": 1.02246916, + "epoch": 0.5472418457838569, + "flos": 31451431083840.0, + "grad_norm": 1.889517688286125, + "language_loss": 0.78370947, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.80525964, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.11938477, + "step": 9102, + "time_per_iteration": 2.660428285598755 + }, + { + "auxiliary_loss_clip": 0.01121398, + "auxiliary_loss_mlp": 0.01039427, + "balance_loss_clip": 1.04412103, + "balance_loss_mlp": 1.02694011, + "epoch": 0.5473019690365248, + "flos": 34029541942560.0, + "grad_norm": 1.599710749977288, + "language_loss": 0.72020602, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74181426, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12481689, + "step": 9103, + "time_per_iteration": 2.7223422527313232 + }, + { + "auxiliary_loss_clip": 0.01116464, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.03949666, + "balance_loss_mlp": 1.02252936, + "epoch": 0.5473620922891929, + "flos": 70324486312320.0, + "grad_norm": 1.3880079131790137, + "language_loss": 0.65496051, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.67647171, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12121582, + "step": 9104, + "time_per_iteration": 2.9942922592163086 + }, + { + "auxiliary_loss_clip": 0.01123222, + "auxiliary_loss_mlp": 0.01032814, + "balance_loss_clip": 1.04239821, + "balance_loss_mlp": 1.01996279, + "epoch": 0.5474222155418608, + "flos": 23615777491200.0, + "grad_norm": 2.6638911399274527, + "language_loss": 0.81400478, + "learning_rate": 1.790271716558888e-06, + "loss": 0.83556509, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.12860107, + "step": 9105, + "time_per_iteration": 3.9129178524017334 + }, + { + "auxiliary_loss_clip": 0.01117163, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.04024291, + "balance_loss_mlp": 1.02061844, + "epoch": 0.5474823387945288, + "flos": 30652220491200.0, + "grad_norm": 3.1914572934058794, + "language_loss": 0.80399787, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.82549447, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11889648, + "step": 9106, + "time_per_iteration": 2.867093801498413 + }, + { + "auxiliary_loss_clip": 0.0111912, + "auxiliary_loss_mlp": 0.01035821, + "balance_loss_clip": 1.04207265, + "balance_loss_mlp": 1.02421641, + "epoch": 0.5475424620471967, + "flos": 21968539333920.0, + "grad_norm": 1.8848556876272176, + "language_loss": 0.69241208, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.71396148, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.1161499, + "step": 9107, + "time_per_iteration": 2.630894660949707 + }, + { + "auxiliary_loss_clip": 0.011225, + "auxiliary_loss_mlp": 0.01032611, + "balance_loss_clip": 1.04210591, + "balance_loss_mlp": 1.02015924, + "epoch": 0.5476025852998647, + "flos": 27222030339840.0, + "grad_norm": 1.8122684209612228, + "language_loss": 0.63222808, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65377915, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12457275, + "step": 9108, + "time_per_iteration": 2.6406009197235107 + }, + { + "auxiliary_loss_clip": 0.01118556, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.04083419, + "balance_loss_mlp": 1.01944745, + "epoch": 0.5476627085525327, + "flos": 24551026058880.0, + "grad_norm": 2.233625641823938, + "language_loss": 0.75110894, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77259958, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.11053467, + "step": 9109, + "time_per_iteration": 2.6229207515716553 + }, + { + "auxiliary_loss_clip": 0.01118518, + "auxiliary_loss_mlp": 0.01034443, + "balance_loss_clip": 1.04220605, + "balance_loss_mlp": 1.02162826, + "epoch": 0.5477228318052006, + "flos": 21612686349600.0, + "grad_norm": 1.8744277424299227, + "language_loss": 0.77560127, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.79713094, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12823486, + "step": 9110, + "time_per_iteration": 2.6856961250305176 + }, + { + "auxiliary_loss_clip": 0.01115964, + "auxiliary_loss_mlp": 0.01031254, + "balance_loss_clip": 1.03968143, + "balance_loss_mlp": 1.01978016, + "epoch": 0.5477829550578687, + "flos": 31541245192800.0, + "grad_norm": 1.559265002134974, + "language_loss": 0.71375793, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73523009, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.11474609, + "step": 9111, + "time_per_iteration": 2.6964995861053467 + }, + { + "auxiliary_loss_clip": 0.01118197, + "auxiliary_loss_mlp": 0.0104026, + "balance_loss_clip": 1.04046512, + "balance_loss_mlp": 1.02822554, + "epoch": 0.5478430783105366, + "flos": 28113202457280.0, + "grad_norm": 2.3033668604677793, + "language_loss": 0.71066654, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73225105, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.1204834, + "step": 9112, + "time_per_iteration": 2.6518871784210205 + }, + { + "auxiliary_loss_clip": 0.01120359, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.0411073, + "balance_loss_mlp": 1.0223825, + "epoch": 0.5479032015632046, + "flos": 19609716759840.0, + "grad_norm": 2.091161383934921, + "language_loss": 0.88241041, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.90395808, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12042236, + "step": 9113, + "time_per_iteration": 2.605179786682129 + }, + { + "auxiliary_loss_clip": 0.01122754, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.04322958, + "balance_loss_mlp": 1.01609838, + "epoch": 0.5479633248158725, + "flos": 29625294019680.0, + "grad_norm": 2.7054271991283123, + "language_loss": 0.72775656, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.7492708, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12573242, + "step": 9114, + "time_per_iteration": 2.656548500061035 + }, + { + "auxiliary_loss_clip": 0.01117473, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.04048467, + "balance_loss_mlp": 1.02229834, + "epoch": 0.5480234480685405, + "flos": 32162893948800.0, + "grad_norm": 2.0419906346645, + "language_loss": 0.7230407, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.74455166, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11334229, + "step": 9115, + "time_per_iteration": 2.665827989578247 + }, + { + "auxiliary_loss_clip": 0.0112173, + "auxiliary_loss_mlp": 0.01036776, + "balance_loss_clip": 1.04096985, + "balance_loss_mlp": 1.02386546, + "epoch": 0.5480835713212084, + "flos": 26911671910560.0, + "grad_norm": 1.6912838743772012, + "language_loss": 0.71982157, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.74140662, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12902832, + "step": 9116, + "time_per_iteration": 2.623106002807617 + }, + { + "auxiliary_loss_clip": 0.01120045, + "auxiliary_loss_mlp": 0.01037342, + "balance_loss_clip": 1.0422585, + "balance_loss_mlp": 1.02512896, + "epoch": 0.5481436945738765, + "flos": 30872764811520.0, + "grad_norm": 3.8339737899537356, + "language_loss": 0.76198632, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.78356022, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12219238, + "step": 9117, + "time_per_iteration": 2.6834073066711426 + }, + { + "auxiliary_loss_clip": 0.01113911, + "auxiliary_loss_mlp": 0.01030613, + "balance_loss_clip": 1.0388484, + "balance_loss_mlp": 1.01909137, + "epoch": 0.5482038178265444, + "flos": 40976657040960.0, + "grad_norm": 1.7532361836938002, + "language_loss": 0.62716448, + "learning_rate": 1.785237306671674e-06, + "loss": 0.6486097, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11529541, + "step": 9118, + "time_per_iteration": 2.722496509552002 + }, + { + "auxiliary_loss_clip": 0.01125229, + "auxiliary_loss_mlp": 0.01034476, + "balance_loss_clip": 1.04520237, + "balance_loss_mlp": 1.02145231, + "epoch": 0.5482639410792124, + "flos": 23215645470240.0, + "grad_norm": 1.9658343174538233, + "language_loss": 0.78647542, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.80807245, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13018799, + "step": 9119, + "time_per_iteration": 2.666454315185547 + }, + { + "auxiliary_loss_clip": 0.01118924, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.04296446, + "balance_loss_mlp": 1.02029872, + "epoch": 0.5483240643318803, + "flos": 31006128646080.0, + "grad_norm": 2.270677826881043, + "language_loss": 0.82359946, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.84509921, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.10754395, + "step": 9120, + "time_per_iteration": 2.6468582153320312 + }, + { + "auxiliary_loss_clip": 0.01123438, + "auxiliary_loss_mlp": 0.01036649, + "balance_loss_clip": 1.04358625, + "balance_loss_mlp": 1.02364874, + "epoch": 0.5483841875845483, + "flos": 26194334040000.0, + "grad_norm": 2.3601523078674083, + "language_loss": 0.80893844, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.83053929, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.13000488, + "step": 9121, + "time_per_iteration": 2.703946590423584 + }, + { + "auxiliary_loss_clip": 0.01121417, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.04034019, + "balance_loss_mlp": 1.02177405, + "epoch": 0.5484443108372163, + "flos": 30197599079040.0, + "grad_norm": 1.668698924110544, + "language_loss": 0.61005676, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63161862, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.13000488, + "step": 9122, + "time_per_iteration": 2.64856219291687 + }, + { + "auxiliary_loss_clip": 0.01118386, + "auxiliary_loss_mlp": 0.0103827, + "balance_loss_clip": 1.0419507, + "balance_loss_mlp": 1.02765393, + "epoch": 0.5485044340898843, + "flos": 30963348748800.0, + "grad_norm": 1.6106452038544208, + "language_loss": 0.71441758, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.73598415, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.10614014, + "step": 9123, + "time_per_iteration": 2.685558795928955 + }, + { + "auxiliary_loss_clip": 0.01120094, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.04168177, + "balance_loss_mlp": 1.0218308, + "epoch": 0.5485645573425523, + "flos": 15424149362400.0, + "grad_norm": 2.095258130351961, + "language_loss": 0.83110577, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.85264021, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.11529541, + "step": 9124, + "time_per_iteration": 2.607701539993286 + }, + { + "auxiliary_loss_clip": 0.01120375, + "auxiliary_loss_mlp": 0.01034792, + "balance_loss_clip": 1.04350138, + "balance_loss_mlp": 1.02257919, + "epoch": 0.5486246805952202, + "flos": 34568224009920.0, + "grad_norm": 1.7655986817250153, + "language_loss": 0.80026919, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.82182086, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12200928, + "step": 9125, + "time_per_iteration": 2.703587532043457 + }, + { + "auxiliary_loss_clip": 0.01123126, + "auxiliary_loss_mlp": 0.01031528, + "balance_loss_clip": 1.04301381, + "balance_loss_mlp": 1.01863003, + "epoch": 0.5486848038478882, + "flos": 20499146634240.0, + "grad_norm": 2.095788751437725, + "language_loss": 0.74916983, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.77071637, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12902832, + "step": 9126, + "time_per_iteration": 2.6142122745513916 + }, + { + "auxiliary_loss_clip": 0.01123142, + "auxiliary_loss_mlp": 0.01031671, + "balance_loss_clip": 1.04210293, + "balance_loss_mlp": 1.01809883, + "epoch": 0.5487449271005561, + "flos": 21031589040480.0, + "grad_norm": 2.8447442512369223, + "language_loss": 0.66240215, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.68395019, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.13568115, + "step": 9127, + "time_per_iteration": 2.6184241771698 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.0104036, + "balance_loss_clip": 1.04255652, + "balance_loss_mlp": 1.02741396, + "epoch": 0.5488050503532241, + "flos": 21159726145920.0, + "grad_norm": 1.8849440880271464, + "language_loss": 0.83142328, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85304272, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12939453, + "step": 9128, + "time_per_iteration": 2.6325788497924805 + }, + { + "auxiliary_loss_clip": 0.01118495, + "auxiliary_loss_mlp": 0.01031142, + "balance_loss_clip": 1.04131174, + "balance_loss_mlp": 1.01871443, + "epoch": 0.548865173605892, + "flos": 21034911457440.0, + "grad_norm": 2.081688334855193, + "language_loss": 0.73917425, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.7606706, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12432861, + "step": 9129, + "time_per_iteration": 2.6427628993988037 + }, + { + "auxiliary_loss_clip": 0.01126022, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.04474235, + "balance_loss_mlp": 1.01699567, + "epoch": 0.5489252968585601, + "flos": 21300504642720.0, + "grad_norm": 2.7918110055222316, + "language_loss": 0.63242745, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.65399146, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.1338501, + "step": 9130, + "time_per_iteration": 4.065761566162109 + }, + { + "auxiliary_loss_clip": 0.01122875, + "auxiliary_loss_mlp": 0.01037153, + "balance_loss_clip": 1.04218447, + "balance_loss_mlp": 1.02441514, + "epoch": 0.548985420111228, + "flos": 32120235603360.0, + "grad_norm": 2.1753453784963677, + "language_loss": 0.62943023, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.65103048, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12738037, + "step": 9131, + "time_per_iteration": 4.069982051849365 + }, + { + "auxiliary_loss_clip": 0.01120876, + "auxiliary_loss_mlp": 0.01034086, + "balance_loss_clip": 1.04095936, + "balance_loss_mlp": 1.02043033, + "epoch": 0.549045543363896, + "flos": 22810327237440.0, + "grad_norm": 2.1265527739824406, + "language_loss": 0.74795789, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.76950753, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13665771, + "step": 9132, + "time_per_iteration": 2.625753164291382 + }, + { + "auxiliary_loss_clip": 0.01118912, + "auxiliary_loss_mlp": 0.01029677, + "balance_loss_clip": 1.03967988, + "balance_loss_mlp": 1.01787567, + "epoch": 0.5491056666165639, + "flos": 30160532118240.0, + "grad_norm": 1.6014329682990494, + "language_loss": 0.81247354, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.83395946, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.11804199, + "step": 9133, + "time_per_iteration": 2.710414409637451 + }, + { + "auxiliary_loss_clip": 0.01119003, + "auxiliary_loss_mlp": 0.01034787, + "balance_loss_clip": 1.04108071, + "balance_loss_mlp": 1.02250242, + "epoch": 0.5491657898692319, + "flos": 26328548737440.0, + "grad_norm": 2.0103012005385956, + "language_loss": 0.701487, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.72302496, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12280273, + "step": 9134, + "time_per_iteration": 2.614583730697632 + }, + { + "auxiliary_loss_clip": 0.01120354, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.04039979, + "balance_loss_mlp": 1.01964808, + "epoch": 0.5492259131219, + "flos": 61593689908800.0, + "grad_norm": 2.071906002656613, + "language_loss": 0.61141527, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63293588, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.12054443, + "step": 9135, + "time_per_iteration": 2.891557216644287 + }, + { + "auxiliary_loss_clip": 0.01121375, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.04130697, + "balance_loss_mlp": 1.02077889, + "epoch": 0.5492860363745679, + "flos": 30650883420960.0, + "grad_norm": 2.713735259025598, + "language_loss": 0.72422016, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74577343, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.13183594, + "step": 9136, + "time_per_iteration": 2.6434667110443115 + }, + { + "auxiliary_loss_clip": 0.01125062, + "auxiliary_loss_mlp": 0.01036361, + "balance_loss_clip": 1.04143023, + "balance_loss_mlp": 1.02246761, + "epoch": 0.5493461596272359, + "flos": 27618313219200.0, + "grad_norm": 2.3346411763687183, + "language_loss": 0.67872572, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.70033991, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 0.8359375, + "router_z_loss_mlp": 0.13916016, + "step": 9137, + "time_per_iteration": 4.141258478164673 + }, + { + "auxiliary_loss_clip": 0.01037162, + "auxiliary_loss_mlp": 0.00998547, + "balance_loss_clip": 1.01263762, + "balance_loss_mlp": 0.99731648, + "epoch": 0.5494062828799038, + "flos": 80719862268960.0, + "grad_norm": 1.2741959982891295, + "language_loss": 0.65228665, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67264366, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.24523926, + "router_z_loss_mlp": 0.01229095, + "step": 9138, + "time_per_iteration": 3.3701181411743164 + }, + { + "auxiliary_loss_clip": 0.01121512, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.04181433, + "balance_loss_mlp": 1.01807809, + "epoch": 0.5494664061325718, + "flos": 25753529020320.0, + "grad_norm": 2.297368892965968, + "language_loss": 0.74954253, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.77106369, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12512207, + "step": 9139, + "time_per_iteration": 2.666168212890625 + }, + { + "auxiliary_loss_clip": 0.01121118, + "auxiliary_loss_mlp": 0.0102708, + "balance_loss_clip": 1.04133022, + "balance_loss_mlp": 1.01463413, + "epoch": 0.5495265293852397, + "flos": 17561333649600.0, + "grad_norm": 1.8219842432219124, + "language_loss": 0.71764219, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.73912418, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12469482, + "step": 9140, + "time_per_iteration": 2.6492698192596436 + }, + { + "auxiliary_loss_clip": 0.01118416, + "auxiliary_loss_mlp": 0.0103789, + "balance_loss_clip": 1.03948295, + "balance_loss_mlp": 1.02481246, + "epoch": 0.5495866526379077, + "flos": 31176194785920.0, + "grad_norm": 1.9361383230302969, + "language_loss": 0.76410925, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.78567231, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.13079834, + "step": 9141, + "time_per_iteration": 2.6912786960601807 + }, + { + "auxiliary_loss_clip": 0.01117347, + "auxiliary_loss_mlp": 0.01031835, + "balance_loss_clip": 1.04190314, + "balance_loss_mlp": 1.01974678, + "epoch": 0.5496467758905756, + "flos": 26012072198880.0, + "grad_norm": 2.780383020584442, + "language_loss": 0.75020111, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.77169287, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12078857, + "step": 9142, + "time_per_iteration": 2.6912686824798584 + }, + { + "auxiliary_loss_clip": 0.01122308, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.04194474, + "balance_loss_mlp": 1.01996422, + "epoch": 0.5497068991432437, + "flos": 27129298986720.0, + "grad_norm": 1.9699537006884997, + "language_loss": 0.76649451, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.78805637, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13922119, + "step": 9143, + "time_per_iteration": 2.6703264713287354 + }, + { + "auxiliary_loss_clip": 0.0111778, + "auxiliary_loss_mlp": 0.01029303, + "balance_loss_clip": 1.03960359, + "balance_loss_mlp": 1.01710153, + "epoch": 0.5497670223959116, + "flos": 22548015951840.0, + "grad_norm": 3.826242104769336, + "language_loss": 0.79520667, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.81667751, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12219238, + "step": 9144, + "time_per_iteration": 3.8728418350219727 + }, + { + "auxiliary_loss_clip": 0.01122614, + "auxiliary_loss_mlp": 0.0102734, + "balance_loss_clip": 1.04331219, + "balance_loss_mlp": 1.0143342, + "epoch": 0.5498271456485796, + "flos": 35636795894880.0, + "grad_norm": 2.10015214628477, + "language_loss": 0.71132547, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.73282504, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12994385, + "step": 9145, + "time_per_iteration": 2.705960273742676 + }, + { + "auxiliary_loss_clip": 0.01120296, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.04202271, + "balance_loss_mlp": 1.01812077, + "epoch": 0.5498872689012475, + "flos": 42493813263360.0, + "grad_norm": 1.8453303961104326, + "language_loss": 0.70642471, + "learning_rate": 1.774398678985076e-06, + "loss": 0.72792321, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.11431885, + "step": 9146, + "time_per_iteration": 2.7522294521331787 + }, + { + "auxiliary_loss_clip": 0.01114918, + "auxiliary_loss_mlp": 0.01027157, + "balance_loss_clip": 1.03958714, + "balance_loss_mlp": 1.01552188, + "epoch": 0.5499473921539155, + "flos": 31630370508000.0, + "grad_norm": 2.0556161496535545, + "language_loss": 0.64149189, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.66291267, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11627197, + "step": 9147, + "time_per_iteration": 2.751788377761841 + }, + { + "auxiliary_loss_clip": 0.01121133, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.04373646, + "balance_loss_mlp": 1.01824415, + "epoch": 0.5500075154065835, + "flos": 27178480614240.0, + "grad_norm": 2.0290257833875383, + "language_loss": 0.8070026, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.82851809, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12182617, + "step": 9148, + "time_per_iteration": 2.7893548011779785 + }, + { + "auxiliary_loss_clip": 0.01121762, + "auxiliary_loss_mlp": 0.01032999, + "balance_loss_clip": 1.04315686, + "balance_loss_mlp": 1.02073824, + "epoch": 0.5500676386592515, + "flos": 34212006370080.0, + "grad_norm": 2.26483403511309, + "language_loss": 0.79382074, + "learning_rate": 1.773237789559453e-06, + "loss": 0.81536835, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12261963, + "step": 9149, + "time_per_iteration": 2.689758062362671 + }, + { + "auxiliary_loss_clip": 0.01118236, + "auxiliary_loss_mlp": 0.01026082, + "balance_loss_clip": 1.04055429, + "balance_loss_mlp": 1.01370823, + "epoch": 0.5501277619119195, + "flos": 29181166583040.0, + "grad_norm": 2.9226990475623538, + "language_loss": 0.72170967, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.74315286, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12390137, + "step": 9150, + "time_per_iteration": 2.6738781929016113 + }, + { + "auxiliary_loss_clip": 0.01120421, + "auxiliary_loss_mlp": 0.01027875, + "balance_loss_clip": 1.03943777, + "balance_loss_mlp": 1.01480365, + "epoch": 0.5501878851645874, + "flos": 25531201939680.0, + "grad_norm": 6.45726626445923, + "language_loss": 0.75083113, + "learning_rate": 1.772463906245477e-06, + "loss": 0.77231413, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.1307373, + "step": 9151, + "time_per_iteration": 2.636942148208618 + }, + { + "auxiliary_loss_clip": 0.01119066, + "auxiliary_loss_mlp": 0.01027399, + "balance_loss_clip": 1.04111946, + "balance_loss_mlp": 1.01576424, + "epoch": 0.5502480084172554, + "flos": 25216062471360.0, + "grad_norm": 2.0604689099140936, + "language_loss": 0.76208991, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.78355455, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.11645508, + "step": 9152, + "time_per_iteration": 2.7136270999908447 + }, + { + "auxiliary_loss_clip": 0.01117017, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.04066563, + "balance_loss_mlp": 1.01844692, + "epoch": 0.5503081316699233, + "flos": 32253477886080.0, + "grad_norm": 1.7379002852713352, + "language_loss": 0.82036734, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.84184009, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11816406, + "step": 9153, + "time_per_iteration": 2.806062936782837 + }, + { + "auxiliary_loss_clip": 0.01118319, + "auxiliary_loss_mlp": 0.01030224, + "balance_loss_clip": 1.04155493, + "balance_loss_mlp": 1.01782608, + "epoch": 0.5503682549225913, + "flos": 37374415403040.0, + "grad_norm": 1.7512476954208922, + "language_loss": 0.73841071, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.75989616, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12402344, + "step": 9154, + "time_per_iteration": 2.754237651824951 + }, + { + "auxiliary_loss_clip": 0.01124591, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.04272175, + "balance_loss_mlp": 1.01983356, + "epoch": 0.5504283781752592, + "flos": 27534495667680.0, + "grad_norm": 2.3083670636081774, + "language_loss": 0.72488046, + "learning_rate": 1.770916243273199e-06, + "loss": 0.74645901, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13439941, + "step": 9155, + "time_per_iteration": 2.640336513519287 + }, + { + "auxiliary_loss_clip": 0.01039224, + "auxiliary_loss_mlp": 0.0100209, + "balance_loss_clip": 1.01473069, + "balance_loss_mlp": 1.00080609, + "epoch": 0.5504885014279273, + "flos": 82854575002080.0, + "grad_norm": 0.7442620567594982, + "language_loss": 0.55331212, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57372528, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.24487305, + "router_z_loss_mlp": 0.01284027, + "step": 9156, + "time_per_iteration": 3.4174883365631104 + }, + { + "auxiliary_loss_clip": 0.01117095, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.04038644, + "balance_loss_mlp": 1.01803088, + "epoch": 0.5505486246805952, + "flos": 27394243895520.0, + "grad_norm": 1.664630578146293, + "language_loss": 0.82511461, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.84658694, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12109375, + "step": 9157, + "time_per_iteration": 2.6483232975006104 + }, + { + "auxiliary_loss_clip": 0.01124203, + "auxiliary_loss_mlp": 0.01031547, + "balance_loss_clip": 1.0430243, + "balance_loss_mlp": 1.01799917, + "epoch": 0.5506087479332632, + "flos": 32832306227520.0, + "grad_norm": 2.641199235216882, + "language_loss": 0.75662404, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.77818155, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13549805, + "step": 9158, + "time_per_iteration": 2.659905195236206 + }, + { + "auxiliary_loss_clip": 0.01115707, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.04088306, + "balance_loss_mlp": 1.01870251, + "epoch": 0.5506688711859311, + "flos": 27979676553600.0, + "grad_norm": 2.068145559152878, + "language_loss": 0.69483244, + "learning_rate": 1.769368719290979e-06, + "loss": 0.71629357, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.1171875, + "step": 9159, + "time_per_iteration": 2.677650213241577 + }, + { + "auxiliary_loss_clip": 0.01120347, + "auxiliary_loss_mlp": 0.01032442, + "balance_loss_clip": 1.04101586, + "balance_loss_mlp": 1.02016902, + "epoch": 0.5507289944385991, + "flos": 36126498921120.0, + "grad_norm": 1.6337801846168423, + "language_loss": 0.68155688, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.70308477, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12268066, + "step": 9160, + "time_per_iteration": 2.723961114883423 + }, + { + "auxiliary_loss_clip": 0.01116886, + "auxiliary_loss_mlp": 0.01030185, + "balance_loss_clip": 1.04178858, + "balance_loss_mlp": 1.01832414, + "epoch": 0.5507891176912671, + "flos": 18711292049280.0, + "grad_norm": 1.9737283892601138, + "language_loss": 0.71610671, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.73757744, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11865234, + "step": 9161, + "time_per_iteration": 2.6312291622161865 + }, + { + "auxiliary_loss_clip": 0.01120403, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.04317451, + "balance_loss_mlp": 1.02591372, + "epoch": 0.5508492409439351, + "flos": 32431363860960.0, + "grad_norm": 1.6281083769922777, + "language_loss": 0.69465482, + "learning_rate": 1.768208168081359e-06, + "loss": 0.71623838, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.1204834, + "step": 9162, + "time_per_iteration": 2.6735658645629883 + }, + { + "auxiliary_loss_clip": 0.01120662, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.04340041, + "balance_loss_mlp": 1.02344394, + "epoch": 0.5509093641966031, + "flos": 31046842162080.0, + "grad_norm": 4.6513643843363255, + "language_loss": 0.85664737, + "learning_rate": 1.767821335237733e-06, + "loss": 0.87821066, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12225342, + "step": 9163, + "time_per_iteration": 2.6875033378601074 + }, + { + "auxiliary_loss_clip": 0.01118727, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.04233789, + "balance_loss_mlp": 1.01888311, + "epoch": 0.550969487449271, + "flos": 22815756552960.0, + "grad_norm": 7.66274363739438, + "language_loss": 0.80774206, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.82923365, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11553955, + "step": 9164, + "time_per_iteration": 2.619856119155884 + }, + { + "auxiliary_loss_clip": 0.01124431, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.04550958, + "balance_loss_mlp": 1.01269722, + "epoch": 0.551029610701939, + "flos": 27709585950240.0, + "grad_norm": 2.6232202334414136, + "language_loss": 0.7328583, + "learning_rate": 1.767047695977863e-06, + "loss": 0.7543571, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.12744141, + "step": 9165, + "time_per_iteration": 2.7037336826324463 + }, + { + "auxiliary_loss_clip": 0.01116352, + "auxiliary_loss_mlp": 0.01025085, + "balance_loss_clip": 1.04070699, + "balance_loss_mlp": 1.01387322, + "epoch": 0.5510897339546069, + "flos": 15155031173760.0, + "grad_norm": 2.086833164101435, + "language_loss": 0.78830218, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.80971658, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11206055, + "step": 9166, + "time_per_iteration": 2.6003944873809814 + }, + { + "auxiliary_loss_clip": 0.01121505, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.04235387, + "balance_loss_mlp": 1.02063465, + "epoch": 0.5511498572072749, + "flos": 22903747384320.0, + "grad_norm": 4.946823025455763, + "language_loss": 0.76293999, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78448176, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12036133, + "step": 9167, + "time_per_iteration": 2.6542587280273438 + }, + { + "auxiliary_loss_clip": 0.01117159, + "auxiliary_loss_mlp": 0.01028327, + "balance_loss_clip": 1.04110241, + "balance_loss_mlp": 1.01554203, + "epoch": 0.5512099804599428, + "flos": 23883315505920.0, + "grad_norm": 3.9141238692987277, + "language_loss": 0.80020499, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.82165986, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12792969, + "step": 9168, + "time_per_iteration": 2.647252321243286 + }, + { + "auxiliary_loss_clip": 0.01122639, + "auxiliary_loss_mlp": 0.01035346, + "balance_loss_clip": 1.04368591, + "balance_loss_mlp": 1.0233593, + "epoch": 0.5512701037126109, + "flos": 32025762007200.0, + "grad_norm": 3.0534527759345726, + "language_loss": 0.68578786, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.70736772, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.11993408, + "step": 9169, + "time_per_iteration": 2.727958917617798 + }, + { + "auxiliary_loss_clip": 0.01114697, + "auxiliary_loss_mlp": 0.01024848, + "balance_loss_clip": 1.04037213, + "balance_loss_mlp": 1.01385653, + "epoch": 0.5513302269652788, + "flos": 26777862385920.0, + "grad_norm": 2.1413312067711563, + "language_loss": 0.85427999, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.87567538, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.10998535, + "step": 9170, + "time_per_iteration": 4.088847637176514 + }, + { + "auxiliary_loss_clip": 0.01036236, + "auxiliary_loss_mlp": 0.01005644, + "balance_loss_clip": 1.01181614, + "balance_loss_mlp": 1.00452518, + "epoch": 0.5513903502179468, + "flos": 83262688927200.0, + "grad_norm": 0.7958600070622165, + "language_loss": 0.59799957, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.6184184, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.2442627, + "router_z_loss_mlp": 0.01119995, + "step": 9171, + "time_per_iteration": 4.70795750617981 + }, + { + "auxiliary_loss_clip": 0.01118898, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.04228508, + "balance_loss_mlp": 1.02267396, + "epoch": 0.5514504734706147, + "flos": 22859387313120.0, + "grad_norm": 1.5225418684738556, + "language_loss": 0.70298743, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.72452247, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.1194458, + "step": 9172, + "time_per_iteration": 2.8336679935455322 + }, + { + "auxiliary_loss_clip": 0.01115402, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.03879011, + "balance_loss_mlp": 1.01871753, + "epoch": 0.5515105967232827, + "flos": 27175401300960.0, + "grad_norm": 1.8240957517282228, + "language_loss": 0.75710517, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.77856147, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.1151123, + "step": 9173, + "time_per_iteration": 2.670452833175659 + }, + { + "auxiliary_loss_clip": 0.01116737, + "auxiliary_loss_mlp": 0.01033069, + "balance_loss_clip": 1.04173851, + "balance_loss_mlp": 1.02086163, + "epoch": 0.5515707199759508, + "flos": 27525743935200.0, + "grad_norm": 1.803999435158375, + "language_loss": 0.74770921, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.76920736, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12213135, + "step": 9174, + "time_per_iteration": 2.6359610557556152 + }, + { + "auxiliary_loss_clip": 0.01118686, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.04141593, + "balance_loss_mlp": 1.02084637, + "epoch": 0.5516308432286187, + "flos": 34521594971040.0, + "grad_norm": 2.129527577982872, + "language_loss": 0.72675192, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.74826479, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.11755371, + "step": 9175, + "time_per_iteration": 2.7280094623565674 + }, + { + "auxiliary_loss_clip": 0.01120674, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.04218388, + "balance_loss_mlp": 1.02177131, + "epoch": 0.5516909664812867, + "flos": 22896859446720.0, + "grad_norm": 6.2845087264065675, + "language_loss": 0.69665265, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.71819258, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.11560059, + "step": 9176, + "time_per_iteration": 4.091492176055908 + }, + { + "auxiliary_loss_clip": 0.01117094, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.04098463, + "balance_loss_mlp": 1.01713443, + "epoch": 0.5517510897339546, + "flos": 33849873207360.0, + "grad_norm": 2.2224049862385313, + "language_loss": 0.71129608, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.73275608, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11773682, + "step": 9177, + "time_per_iteration": 2.701401710510254 + }, + { + "auxiliary_loss_clip": 0.01120288, + "auxiliary_loss_mlp": 0.01026396, + "balance_loss_clip": 1.0430603, + "balance_loss_mlp": 1.01537514, + "epoch": 0.5518112129866226, + "flos": 22457107876320.0, + "grad_norm": 3.1847384828923824, + "language_loss": 0.80285645, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.8243233, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.11016846, + "step": 9178, + "time_per_iteration": 2.637730836868286 + }, + { + "auxiliary_loss_clip": 0.01122377, + "auxiliary_loss_mlp": 0.01041143, + "balance_loss_clip": 1.04250908, + "balance_loss_mlp": 1.0282805, + "epoch": 0.5518713362392905, + "flos": 30606239728800.0, + "grad_norm": 1.5760075714301933, + "language_loss": 0.75123525, + "learning_rate": 1.761633217089826e-06, + "loss": 0.77287042, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12866211, + "step": 9179, + "time_per_iteration": 2.6434826850891113 + }, + { + "auxiliary_loss_clip": 0.01120128, + "auxiliary_loss_mlp": 0.0103692, + "balance_loss_clip": 1.04306352, + "balance_loss_mlp": 1.02527928, + "epoch": 0.5519314594919585, + "flos": 44586151272000.0, + "grad_norm": 1.7766889266288046, + "language_loss": 0.69995028, + "learning_rate": 1.761246535912924e-06, + "loss": 0.72152072, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11639404, + "step": 9180, + "time_per_iteration": 2.778639793395996 + }, + { + "auxiliary_loss_clip": 0.01120489, + "auxiliary_loss_mlp": 0.0103711, + "balance_loss_clip": 1.04303539, + "balance_loss_mlp": 1.02468228, + "epoch": 0.5519915827446265, + "flos": 24951441700800.0, + "grad_norm": 2.98761898486204, + "language_loss": 0.67241776, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69399381, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12420654, + "step": 9181, + "time_per_iteration": 2.6409714221954346 + }, + { + "auxiliary_loss_clip": 0.0112086, + "auxiliary_loss_mlp": 0.01027873, + "balance_loss_clip": 1.04094172, + "balance_loss_mlp": 1.0153265, + "epoch": 0.5520517059972945, + "flos": 29002146124320.0, + "grad_norm": 4.616358590367533, + "language_loss": 0.78804481, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.80953217, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.12554932, + "step": 9182, + "time_per_iteration": 2.6977968215942383 + }, + { + "auxiliary_loss_clip": 0.01119937, + "auxiliary_loss_mlp": 0.01031534, + "balance_loss_clip": 1.0421114, + "balance_loss_mlp": 1.01969635, + "epoch": 0.5521118292499624, + "flos": 27084614777280.0, + "grad_norm": 2.439063418145847, + "language_loss": 0.83536422, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.856879, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.11853027, + "step": 9183, + "time_per_iteration": 3.961904525756836 + }, + { + "auxiliary_loss_clip": 0.01119033, + "auxiliary_loss_mlp": 0.01028811, + "balance_loss_clip": 1.04275751, + "balance_loss_mlp": 1.01699769, + "epoch": 0.5521719525026304, + "flos": 28776901799520.0, + "grad_norm": 1.368509491651036, + "language_loss": 0.67413127, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.69560969, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.11810303, + "step": 9184, + "time_per_iteration": 2.6549534797668457 + }, + { + "auxiliary_loss_clip": 0.01119061, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.0421629, + "balance_loss_mlp": 1.01588571, + "epoch": 0.5522320757552983, + "flos": 31893856794720.0, + "grad_norm": 1.5432738082141078, + "language_loss": 0.76581979, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78729153, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12225342, + "step": 9185, + "time_per_iteration": 2.7049143314361572 + }, + { + "auxiliary_loss_clip": 0.01121079, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.04341483, + "balance_loss_mlp": 1.01984501, + "epoch": 0.5522921990079663, + "flos": 30110661696960.0, + "grad_norm": 1.6619026221778188, + "language_loss": 0.73762691, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.75915962, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12341309, + "step": 9186, + "time_per_iteration": 2.643418073654175 + }, + { + "auxiliary_loss_clip": 0.01123196, + "auxiliary_loss_mlp": 0.01031265, + "balance_loss_clip": 1.04362142, + "balance_loss_mlp": 1.01991594, + "epoch": 0.5523523222606344, + "flos": 27757430507520.0, + "grad_norm": 2.7152778767447043, + "language_loss": 0.66748041, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.68902504, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.11340332, + "step": 9187, + "time_per_iteration": 2.672525405883789 + }, + { + "auxiliary_loss_clip": 0.01120879, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.04336786, + "balance_loss_mlp": 1.01691806, + "epoch": 0.5524124455133023, + "flos": 24106007242080.0, + "grad_norm": 1.7193689638872902, + "language_loss": 0.77640843, + "learning_rate": 1.758153413657318e-06, + "loss": 0.79790688, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.1204834, + "step": 9188, + "time_per_iteration": 2.613088607788086 + }, + { + "auxiliary_loss_clip": 0.01118768, + "auxiliary_loss_mlp": 0.01029646, + "balance_loss_clip": 1.04232562, + "balance_loss_mlp": 1.01739717, + "epoch": 0.5524725687659703, + "flos": 28424573818560.0, + "grad_norm": 2.389275367550298, + "language_loss": 0.81141853, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.83290267, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12255859, + "step": 9189, + "time_per_iteration": 2.8696095943450928 + }, + { + "auxiliary_loss_clip": 0.01120167, + "auxiliary_loss_mlp": 0.01028067, + "balance_loss_clip": 1.04430485, + "balance_loss_mlp": 1.01563406, + "epoch": 0.5525326920186382, + "flos": 30337324126560.0, + "grad_norm": 1.4134974716712088, + "language_loss": 0.76570535, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.78718764, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12432861, + "step": 9190, + "time_per_iteration": 2.7533514499664307 + }, + { + "auxiliary_loss_clip": 0.01124916, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.04351413, + "balance_loss_mlp": 1.02179766, + "epoch": 0.5525928152713062, + "flos": 16759651502880.0, + "grad_norm": 3.3680922854627102, + "language_loss": 0.78788084, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.80948275, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 0.81445312, + "router_z_loss_mlp": 0.13464355, + "step": 9191, + "time_per_iteration": 2.6212406158447266 + }, + { + "auxiliary_loss_clip": 0.01118551, + "auxiliary_loss_mlp": 0.01028034, + "balance_loss_clip": 1.04235291, + "balance_loss_mlp": 1.01633406, + "epoch": 0.5526529385239741, + "flos": 15950068486560.0, + "grad_norm": 2.1895374195616046, + "language_loss": 0.69267547, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.71414137, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11700439, + "step": 9192, + "time_per_iteration": 2.6604185104370117 + }, + { + "auxiliary_loss_clip": 0.01117634, + "auxiliary_loss_mlp": 0.01027727, + "balance_loss_clip": 1.04279339, + "balance_loss_mlp": 1.01712298, + "epoch": 0.5527130617766421, + "flos": 28245918015360.0, + "grad_norm": 1.4908737105392023, + "language_loss": 0.77382576, + "learning_rate": 1.756220509823588e-06, + "loss": 0.79527938, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.10601807, + "step": 9193, + "time_per_iteration": 2.6507749557495117 + }, + { + "auxiliary_loss_clip": 0.01117999, + "auxiliary_loss_mlp": 0.01032739, + "balance_loss_clip": 1.04140258, + "balance_loss_mlp": 1.0205797, + "epoch": 0.55277318502931, + "flos": 25972938856800.0, + "grad_norm": 1.5344038384664778, + "language_loss": 0.78509456, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80660194, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.1217041, + "step": 9194, + "time_per_iteration": 2.679744243621826 + }, + { + "auxiliary_loss_clip": 0.01125118, + "auxiliary_loss_mlp": 0.01034437, + "balance_loss_clip": 1.04335999, + "balance_loss_mlp": 1.02231371, + "epoch": 0.5528333082819781, + "flos": 46765224076320.0, + "grad_norm": 1.8882460667910501, + "language_loss": 0.69472623, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.71632171, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 0.81738281, + "router_z_loss_mlp": 0.12121582, + "step": 9195, + "time_per_iteration": 2.7718520164489746 + }, + { + "auxiliary_loss_clip": 0.01125721, + "auxiliary_loss_mlp": 0.01029168, + "balance_loss_clip": 1.04405904, + "balance_loss_mlp": 1.01669896, + "epoch": 0.552893431534646, + "flos": 16537567525920.0, + "grad_norm": 2.1958002960759777, + "language_loss": 0.73899394, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76054287, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.12469482, + "step": 9196, + "time_per_iteration": 2.732689619064331 + }, + { + "auxiliary_loss_clip": 0.01120262, + "auxiliary_loss_mlp": 0.01031732, + "balance_loss_clip": 1.04362941, + "balance_loss_mlp": 1.02023387, + "epoch": 0.552953554787314, + "flos": 26771096000160.0, + "grad_norm": 1.6531537247621237, + "language_loss": 0.76787984, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.78939974, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11499023, + "step": 9197, + "time_per_iteration": 2.640768527984619 + }, + { + "auxiliary_loss_clip": 0.01115868, + "auxiliary_loss_mlp": 0.01027867, + "balance_loss_clip": 1.04006851, + "balance_loss_mlp": 1.01650667, + "epoch": 0.5530136780399819, + "flos": 53272911742560.0, + "grad_norm": 1.920485769270325, + "language_loss": 0.76263285, + "learning_rate": 1.754287837093407e-06, + "loss": 0.78407025, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11358643, + "step": 9198, + "time_per_iteration": 2.924133062362671 + }, + { + "auxiliary_loss_clip": 0.01116443, + "auxiliary_loss_mlp": 0.01025634, + "balance_loss_clip": 1.0396409, + "balance_loss_mlp": 1.0147624, + "epoch": 0.5530738012926499, + "flos": 30560947760160.0, + "grad_norm": 1.7385734297667659, + "language_loss": 0.78938115, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81080186, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.10876465, + "step": 9199, + "time_per_iteration": 2.749084949493408 + }, + { + "auxiliary_loss_clip": 0.01119087, + "auxiliary_loss_mlp": 0.01032426, + "balance_loss_clip": 1.04202104, + "balance_loss_mlp": 1.02127969, + "epoch": 0.553133924545318, + "flos": 20098933578720.0, + "grad_norm": 2.059870912846492, + "language_loss": 0.64189363, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.6634087, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.1114502, + "step": 9200, + "time_per_iteration": 2.6169755458831787 + }, + { + "auxiliary_loss_clip": 0.01124313, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.04408932, + "balance_loss_mlp": 1.01764226, + "epoch": 0.5531940477979859, + "flos": 30025020867840.0, + "grad_norm": 1.838046304436287, + "language_loss": 0.66265082, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.68419695, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.12664795, + "step": 9201, + "time_per_iteration": 2.6588785648345947 + }, + { + "auxiliary_loss_clip": 0.01120733, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.04443216, + "balance_loss_mlp": 1.01945972, + "epoch": 0.5532541710506539, + "flos": 27039646946880.0, + "grad_norm": 1.965074271465707, + "language_loss": 0.60463065, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.62615442, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12200928, + "step": 9202, + "time_per_iteration": 2.6966757774353027 + }, + { + "auxiliary_loss_clip": 0.01118918, + "auxiliary_loss_mlp": 0.01027478, + "balance_loss_clip": 1.0433296, + "balance_loss_mlp": 1.01604557, + "epoch": 0.5533142943033218, + "flos": 26109057866400.0, + "grad_norm": 1.7606590899622123, + "language_loss": 0.64307499, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.66453892, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11444092, + "step": 9203, + "time_per_iteration": 2.647801637649536 + }, + { + "auxiliary_loss_clip": 0.01118966, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.04168129, + "balance_loss_mlp": 1.01565528, + "epoch": 0.5533744175559898, + "flos": 28736593456320.0, + "grad_norm": 6.927925557967279, + "language_loss": 0.63651633, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.6579771, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.11450195, + "step": 9204, + "time_per_iteration": 2.651726245880127 + }, + { + "auxiliary_loss_clip": 0.01115249, + "auxiliary_loss_mlp": 0.01028337, + "balance_loss_clip": 1.03996062, + "balance_loss_mlp": 1.01747108, + "epoch": 0.5534345408086577, + "flos": 29360106007200.0, + "grad_norm": 1.6645829469931737, + "language_loss": 0.77736056, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79879642, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.10870361, + "step": 9205, + "time_per_iteration": 2.708015203475952 + }, + { + "auxiliary_loss_clip": 0.01116629, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.04204178, + "balance_loss_mlp": 1.02286696, + "epoch": 0.5534946640613257, + "flos": 41208910855200.0, + "grad_norm": 1.4029712580628622, + "language_loss": 0.72559106, + "learning_rate": 1.751196045993537e-06, + "loss": 0.74710274, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11657715, + "step": 9206, + "time_per_iteration": 2.7687771320343018 + }, + { + "auxiliary_loss_clip": 0.01118897, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.0419023, + "balance_loss_mlp": 1.02013874, + "epoch": 0.5535547873139937, + "flos": 18496825320960.0, + "grad_norm": 2.2500355696379026, + "language_loss": 0.75600511, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.7775051, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.10961914, + "step": 9207, + "time_per_iteration": 2.6410491466522217 + }, + { + "auxiliary_loss_clip": 0.01124122, + "auxiliary_loss_mlp": 0.01031678, + "balance_loss_clip": 1.0433569, + "balance_loss_mlp": 1.01885653, + "epoch": 0.5536149105666617, + "flos": 20719974575520.0, + "grad_norm": 2.349647672560156, + "language_loss": 0.61915177, + "learning_rate": 1.750423192272189e-06, + "loss": 0.64070976, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 0.80761719, + "router_z_loss_mlp": 0.1282959, + "step": 9208, + "time_per_iteration": 2.629082441329956 + }, + { + "auxiliary_loss_clip": 0.01119023, + "auxiliary_loss_mlp": 0.01031142, + "balance_loss_clip": 1.04130399, + "balance_loss_mlp": 1.01990664, + "epoch": 0.5536750338193296, + "flos": 22146425308800.0, + "grad_norm": 2.155726462500802, + "language_loss": 0.64251006, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.66401172, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.11236572, + "step": 9209, + "time_per_iteration": 2.665419340133667 + }, + { + "auxiliary_loss_clip": 0.01118568, + "auxiliary_loss_mlp": 0.01034101, + "balance_loss_clip": 1.04145277, + "balance_loss_mlp": 1.02120233, + "epoch": 0.5537351570719976, + "flos": 27757025334720.0, + "grad_norm": 2.080680440521203, + "language_loss": 0.82858312, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.85010982, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12896729, + "step": 9210, + "time_per_iteration": 5.418195962905884 + }, + { + "auxiliary_loss_clip": 0.011155, + "auxiliary_loss_mlp": 0.01025411, + "balance_loss_clip": 1.03999698, + "balance_loss_mlp": 1.01435447, + "epoch": 0.5537952803246655, + "flos": 32160422394720.0, + "grad_norm": 1.86947168111048, + "language_loss": 0.72573555, + "learning_rate": 1.74926398270663e-06, + "loss": 0.74714464, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.1105957, + "step": 9211, + "time_per_iteration": 2.6709060668945312 + }, + { + "auxiliary_loss_clip": 0.01120092, + "auxiliary_loss_mlp": 0.01035419, + "balance_loss_clip": 1.04178822, + "balance_loss_mlp": 1.02235377, + "epoch": 0.5538554035773335, + "flos": 22009414919040.0, + "grad_norm": 1.9215193581413297, + "language_loss": 0.66755998, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.68911511, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.13067627, + "step": 9212, + "time_per_iteration": 2.655189275741577 + }, + { + "auxiliary_loss_clip": 0.01120161, + "auxiliary_loss_mlp": 0.0103135, + "balance_loss_clip": 1.04070759, + "balance_loss_mlp": 1.01802278, + "epoch": 0.5539155268300014, + "flos": 38664463505760.0, + "grad_norm": 2.1803041894743505, + "language_loss": 0.52013487, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.54164994, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13323975, + "step": 9213, + "time_per_iteration": 2.709710121154785 + }, + { + "auxiliary_loss_clip": 0.01122919, + "auxiliary_loss_mlp": 0.01028156, + "balance_loss_clip": 1.04411578, + "balance_loss_mlp": 1.01593685, + "epoch": 0.5539756500826695, + "flos": 18539159528160.0, + "grad_norm": 2.2177043661904166, + "language_loss": 0.85589445, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.87740517, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12225342, + "step": 9214, + "time_per_iteration": 2.6527247428894043 + }, + { + "auxiliary_loss_clip": 0.01119097, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.04347563, + "balance_loss_mlp": 1.02236605, + "epoch": 0.5540357733353375, + "flos": 32157140495040.0, + "grad_norm": 6.660222462253259, + "language_loss": 0.70019048, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.7217164, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11138916, + "step": 9215, + "time_per_iteration": 2.6680476665496826 + }, + { + "auxiliary_loss_clip": 0.01120619, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.04261494, + "balance_loss_mlp": 1.01661015, + "epoch": 0.5540958965880054, + "flos": 26019324792000.0, + "grad_norm": 1.7724808009411404, + "language_loss": 0.73179495, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.75328875, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12158203, + "step": 9216, + "time_per_iteration": 4.107451438903809 + }, + { + "auxiliary_loss_clip": 0.01117015, + "auxiliary_loss_mlp": 0.01029436, + "balance_loss_clip": 1.04305434, + "balance_loss_mlp": 1.01745582, + "epoch": 0.5541560198406734, + "flos": 31319039664000.0, + "grad_norm": 1.9922488871278021, + "language_loss": 0.71642464, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.73788917, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11987305, + "step": 9217, + "time_per_iteration": 2.704643726348877 + }, + { + "auxiliary_loss_clip": 0.01112802, + "auxiliary_loss_mlp": 0.01027282, + "balance_loss_clip": 1.03818583, + "balance_loss_mlp": 1.01579571, + "epoch": 0.5542161430933413, + "flos": 26771339103840.0, + "grad_norm": 2.4652352197747174, + "language_loss": 0.78562129, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.80702215, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11480713, + "step": 9218, + "time_per_iteration": 2.646374225616455 + }, + { + "auxiliary_loss_clip": 0.01118481, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.03890586, + "balance_loss_mlp": 1.01970243, + "epoch": 0.5542762663460093, + "flos": 23881208607360.0, + "grad_norm": 1.596516522392045, + "language_loss": 0.72034967, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74186039, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12872314, + "step": 9219, + "time_per_iteration": 2.6546859741210938 + }, + { + "auxiliary_loss_clip": 0.01121787, + "auxiliary_loss_mlp": 0.01035548, + "balance_loss_clip": 1.04397368, + "balance_loss_mlp": 1.02367473, + "epoch": 0.5543363895986773, + "flos": 23792812603200.0, + "grad_norm": 1.5470560745604531, + "language_loss": 0.71220624, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.73377961, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.11865234, + "step": 9220, + "time_per_iteration": 2.6527369022369385 + }, + { + "auxiliary_loss_clip": 0.01115523, + "auxiliary_loss_mlp": 0.0102489, + "balance_loss_clip": 1.04056048, + "balance_loss_mlp": 1.01411986, + "epoch": 0.5543965128513453, + "flos": 27620379600480.0, + "grad_norm": 1.6847222058552251, + "language_loss": 0.79443407, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.81583822, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.10772705, + "step": 9221, + "time_per_iteration": 2.6635866165161133 + }, + { + "auxiliary_loss_clip": 0.01116455, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.04121947, + "balance_loss_mlp": 1.02042246, + "epoch": 0.5544566361040132, + "flos": 31713823404000.0, + "grad_norm": 1.95272715986979, + "language_loss": 0.83721495, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.85869801, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11437988, + "step": 9222, + "time_per_iteration": 4.0362749099731445 + }, + { + "auxiliary_loss_clip": 0.01123008, + "auxiliary_loss_mlp": 0.01033779, + "balance_loss_clip": 1.0430038, + "balance_loss_mlp": 1.02117217, + "epoch": 0.5545167593566812, + "flos": 34480435764960.0, + "grad_norm": 2.20435982076203, + "language_loss": 0.75436389, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.77593184, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12609863, + "step": 9223, + "time_per_iteration": 2.6790173053741455 + }, + { + "auxiliary_loss_clip": 0.01117796, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.0415616, + "balance_loss_mlp": 1.02168131, + "epoch": 0.5545768826093491, + "flos": 34747649641440.0, + "grad_norm": 1.8669642881581676, + "language_loss": 0.82345819, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.8449707, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11767578, + "step": 9224, + "time_per_iteration": 2.722407102584839 + }, + { + "auxiliary_loss_clip": 0.01121366, + "auxiliary_loss_mlp": 0.01041725, + "balance_loss_clip": 1.04307246, + "balance_loss_mlp": 1.02959001, + "epoch": 0.5546370058620171, + "flos": 22546840950720.0, + "grad_norm": 2.01579108970509, + "language_loss": 0.5728538, + "learning_rate": 1.743855475904141e-06, + "loss": 0.59448469, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12145996, + "step": 9225, + "time_per_iteration": 2.622437000274658 + }, + { + "auxiliary_loss_clip": 0.01117819, + "auxiliary_loss_mlp": 0.01035034, + "balance_loss_clip": 1.04006505, + "balance_loss_mlp": 1.02300584, + "epoch": 0.554697129114685, + "flos": 27979960174560.0, + "grad_norm": 1.8675909367692107, + "language_loss": 0.674537, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.69606555, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12011719, + "step": 9226, + "time_per_iteration": 2.6670782566070557 + }, + { + "auxiliary_loss_clip": 0.01118433, + "auxiliary_loss_mlp": 0.01032176, + "balance_loss_clip": 1.04083276, + "balance_loss_mlp": 1.02071381, + "epoch": 0.5547572523673531, + "flos": 26597180718720.0, + "grad_norm": 1.5099415312315119, + "language_loss": 0.74353468, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.76504076, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.11456299, + "step": 9227, + "time_per_iteration": 2.627610921859741 + }, + { + "auxiliary_loss_clip": 0.01121606, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.04425228, + "balance_loss_mlp": 1.02103186, + "epoch": 0.5548173756200211, + "flos": 27263189545920.0, + "grad_norm": 1.6531567523819366, + "language_loss": 0.7359277, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.7574693, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.11523438, + "step": 9228, + "time_per_iteration": 2.700321912765503 + }, + { + "auxiliary_loss_clip": 0.01119353, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.04162192, + "balance_loss_mlp": 1.01862788, + "epoch": 0.554877498872689, + "flos": 21791018014560.0, + "grad_norm": 2.2037212111393436, + "language_loss": 0.76006126, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.78155428, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11309814, + "step": 9229, + "time_per_iteration": 2.646714448928833 + }, + { + "auxiliary_loss_clip": 0.01119729, + "auxiliary_loss_mlp": 0.01038994, + "balance_loss_clip": 1.04303324, + "balance_loss_mlp": 1.02717423, + "epoch": 0.554937622125357, + "flos": 21033979560000.0, + "grad_norm": 4.060958777676529, + "language_loss": 0.69093323, + "learning_rate": 1.741924325613172e-06, + "loss": 0.71252048, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11834717, + "step": 9230, + "time_per_iteration": 2.6105735301971436 + }, + { + "auxiliary_loss_clip": 0.01118207, + "auxiliary_loss_mlp": 0.01032081, + "balance_loss_clip": 1.04008389, + "balance_loss_mlp": 1.02018392, + "epoch": 0.5549977453780249, + "flos": 30955123740960.0, + "grad_norm": 2.9590084538532166, + "language_loss": 0.67833388, + "learning_rate": 1.741538124855163e-06, + "loss": 0.69983673, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.11895752, + "step": 9231, + "time_per_iteration": 2.702636957168579 + }, + { + "auxiliary_loss_clip": 0.0112165, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.04198492, + "balance_loss_mlp": 1.02086961, + "epoch": 0.555057868630693, + "flos": 30602228518080.0, + "grad_norm": 1.9206487547667344, + "language_loss": 0.78467888, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80622888, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12487793, + "step": 9232, + "time_per_iteration": 2.66510272026062 + }, + { + "auxiliary_loss_clip": 0.01115302, + "auxiliary_loss_mlp": 0.01028179, + "balance_loss_clip": 1.04084551, + "balance_loss_mlp": 1.01768899, + "epoch": 0.5551179918833609, + "flos": 31852819140480.0, + "grad_norm": 1.7658189559039, + "language_loss": 0.82587695, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.84731174, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.10498047, + "step": 9233, + "time_per_iteration": 2.7355830669403076 + }, + { + "auxiliary_loss_clip": 0.011222, + "auxiliary_loss_mlp": 0.01032534, + "balance_loss_clip": 1.04250669, + "balance_loss_mlp": 1.02091694, + "epoch": 0.5551781151360289, + "flos": 23653776349440.0, + "grad_norm": 2.092512346386486, + "language_loss": 0.74914742, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.77069473, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.11633301, + "step": 9234, + "time_per_iteration": 2.6223032474517822 + }, + { + "auxiliary_loss_clip": 0.01115198, + "auxiliary_loss_mlp": 0.01025212, + "balance_loss_clip": 1.04030824, + "balance_loss_mlp": 1.01419735, + "epoch": 0.5552382383886968, + "flos": 26510121784800.0, + "grad_norm": 2.2152454148153637, + "language_loss": 0.64690393, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.66830808, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11022949, + "step": 9235, + "time_per_iteration": 2.6657252311706543 + }, + { + "auxiliary_loss_clip": 0.0111772, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_clip": 1.04048944, + "balance_loss_mlp": 1.01928616, + "epoch": 0.5552983616413648, + "flos": 17516933061120.0, + "grad_norm": 2.0058025240752553, + "language_loss": 0.67454338, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.69603068, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.11724854, + "step": 9236, + "time_per_iteration": 2.6527974605560303 + }, + { + "auxiliary_loss_clip": 0.01114314, + "auxiliary_loss_mlp": 0.01023047, + "balance_loss_clip": 1.04055285, + "balance_loss_mlp": 1.01166868, + "epoch": 0.5553584848940327, + "flos": 31089824645760.0, + "grad_norm": 1.7792401304151346, + "language_loss": 0.86025846, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.88163209, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11364746, + "step": 9237, + "time_per_iteration": 2.716261386871338 + }, + { + "auxiliary_loss_clip": 0.01114528, + "auxiliary_loss_mlp": 0.01036055, + "balance_loss_clip": 1.03965676, + "balance_loss_mlp": 1.02448547, + "epoch": 0.5554186081467007, + "flos": 27044468503200.0, + "grad_norm": 1.636235050256443, + "language_loss": 0.72943753, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.7509433, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11572266, + "step": 9238, + "time_per_iteration": 2.631391763687134 + }, + { + "auxiliary_loss_clip": 0.01118881, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.04100442, + "balance_loss_mlp": 1.01803303, + "epoch": 0.5554787313993687, + "flos": 60703166067840.0, + "grad_norm": 1.7219091944430622, + "language_loss": 0.78815168, + "learning_rate": 1.73844887285358e-06, + "loss": 0.80963516, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.11437988, + "step": 9239, + "time_per_iteration": 2.90486741065979 + }, + { + "auxiliary_loss_clip": 0.01118783, + "auxiliary_loss_mlp": 0.01027793, + "balance_loss_clip": 1.04154253, + "balance_loss_mlp": 1.01602054, + "epoch": 0.5555388546520367, + "flos": 26999946362880.0, + "grad_norm": 1.678993133459156, + "language_loss": 0.79955357, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.82101929, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.11767578, + "step": 9240, + "time_per_iteration": 2.6749861240386963 + }, + { + "auxiliary_loss_clip": 0.01118001, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.04223084, + "balance_loss_mlp": 1.01800179, + "epoch": 0.5555989779047047, + "flos": 30117387565440.0, + "grad_norm": 1.6654281459223577, + "language_loss": 0.65021336, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67168677, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11340332, + "step": 9241, + "time_per_iteration": 2.7256009578704834 + }, + { + "auxiliary_loss_clip": 0.01119786, + "auxiliary_loss_mlp": 0.01029192, + "balance_loss_clip": 1.04221654, + "balance_loss_mlp": 1.0179143, + "epoch": 0.5556591011573726, + "flos": 19652901829920.0, + "grad_norm": 3.113305878793096, + "language_loss": 0.72600514, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.74749488, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.112854, + "step": 9242, + "time_per_iteration": 2.6382791996002197 + }, + { + "auxiliary_loss_clip": 0.01117714, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_clip": 1.04064906, + "balance_loss_mlp": 1.01937914, + "epoch": 0.5557192244100406, + "flos": 15781055796000.0, + "grad_norm": 1.822977030024073, + "language_loss": 0.63651776, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.65801203, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12335205, + "step": 9243, + "time_per_iteration": 2.684415578842163 + }, + { + "auxiliary_loss_clip": 0.01120595, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.04418349, + "balance_loss_mlp": 1.01972389, + "epoch": 0.5557793476627085, + "flos": 28201193288640.0, + "grad_norm": 2.827165438876474, + "language_loss": 0.75461024, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77612811, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.11468506, + "step": 9244, + "time_per_iteration": 2.6604180335998535 + }, + { + "auxiliary_loss_clip": 0.011121, + "auxiliary_loss_mlp": 0.01028322, + "balance_loss_clip": 1.03985977, + "balance_loss_mlp": 1.01771212, + "epoch": 0.5558394709153766, + "flos": 26147137759200.0, + "grad_norm": 2.4733246156080555, + "language_loss": 0.74917161, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.77057576, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.1060791, + "step": 9245, + "time_per_iteration": 2.6165854930877686 + }, + { + "auxiliary_loss_clip": 0.01122119, + "auxiliary_loss_mlp": 0.01029455, + "balance_loss_clip": 1.04272962, + "balance_loss_mlp": 1.01740885, + "epoch": 0.5558995941680445, + "flos": 30601337137920.0, + "grad_norm": 2.1246903147731993, + "language_loss": 0.79445595, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.81597161, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12060547, + "step": 9246, + "time_per_iteration": 2.7927889823913574 + }, + { + "auxiliary_loss_clip": 0.01118794, + "auxiliary_loss_mlp": 0.01030977, + "balance_loss_clip": 1.04243815, + "balance_loss_mlp": 1.01949072, + "epoch": 0.5559597174207125, + "flos": 24417581189760.0, + "grad_norm": 2.092784721510894, + "language_loss": 0.73727649, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.75877416, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.1149292, + "step": 9247, + "time_per_iteration": 2.6395277976989746 + }, + { + "auxiliary_loss_clip": 0.01119552, + "auxiliary_loss_mlp": 0.01029824, + "balance_loss_clip": 1.04194438, + "balance_loss_mlp": 1.01773655, + "epoch": 0.5560198406733804, + "flos": 20542736877120.0, + "grad_norm": 3.3294310332196746, + "language_loss": 0.76407146, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.7855652, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12084961, + "step": 9248, + "time_per_iteration": 2.6293787956237793 + }, + { + "auxiliary_loss_clip": 0.01041293, + "auxiliary_loss_mlp": 0.00999812, + "balance_loss_clip": 1.01670027, + "balance_loss_mlp": 0.99854779, + "epoch": 0.5560799639260484, + "flos": 86264391312000.0, + "grad_norm": 0.8431293599680506, + "language_loss": 0.59397995, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61439097, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.24584961, + "router_z_loss_mlp": 0.01264191, + "step": 9249, + "time_per_iteration": 4.769556999206543 + }, + { + "auxiliary_loss_clip": 0.01115315, + "auxiliary_loss_mlp": 0.01029081, + "balance_loss_clip": 1.03851485, + "balance_loss_mlp": 1.01722598, + "epoch": 0.5561400871787163, + "flos": 28246161119040.0, + "grad_norm": 2.047476047641571, + "language_loss": 0.80097091, + "learning_rate": 1.734202189316832e-06, + "loss": 0.82241488, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11853027, + "step": 9250, + "time_per_iteration": 4.139876842498779 + }, + { + "auxiliary_loss_clip": 0.01120225, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.04139948, + "balance_loss_mlp": 1.01920009, + "epoch": 0.5562002104313843, + "flos": 21434354684640.0, + "grad_norm": 2.148856922845178, + "language_loss": 0.69349957, + "learning_rate": 1.733816187358836e-06, + "loss": 0.71501386, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12017822, + "step": 9251, + "time_per_iteration": 2.6023809909820557 + }, + { + "auxiliary_loss_clip": 0.01118112, + "auxiliary_loss_mlp": 0.01033848, + "balance_loss_clip": 1.04185462, + "balance_loss_mlp": 1.02246988, + "epoch": 0.5562603336840523, + "flos": 30560866725600.0, + "grad_norm": 1.6178734473250016, + "language_loss": 0.75745225, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.77897191, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.11376953, + "step": 9252, + "time_per_iteration": 2.6946120262145996 + }, + { + "auxiliary_loss_clip": 0.01121401, + "auxiliary_loss_mlp": 0.0103294, + "balance_loss_clip": 1.042382, + "balance_loss_mlp": 1.02135277, + "epoch": 0.5563204569367203, + "flos": 35458545264480.0, + "grad_norm": 2.3050847581587943, + "language_loss": 0.73035854, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.75190192, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.11584473, + "step": 9253, + "time_per_iteration": 2.7286593914031982 + }, + { + "auxiliary_loss_clip": 0.0111865, + "auxiliary_loss_mlp": 0.0102858, + "balance_loss_clip": 1.04285538, + "balance_loss_mlp": 1.0174818, + "epoch": 0.5563805801893883, + "flos": 26954938015200.0, + "grad_norm": 1.9050181371657087, + "language_loss": 0.82609701, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.84756935, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11083984, + "step": 9254, + "time_per_iteration": 2.672079563140869 + }, + { + "auxiliary_loss_clip": 0.0104074, + "auxiliary_loss_mlp": 0.00999977, + "balance_loss_clip": 1.01619601, + "balance_loss_mlp": 0.9987247, + "epoch": 0.5564407034420562, + "flos": 71548625501280.0, + "grad_norm": 0.8741960767841573, + "language_loss": 0.64801347, + "learning_rate": 1.732272280610387e-06, + "loss": 0.66842067, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.2454834, + "router_z_loss_mlp": 0.01252747, + "step": 9255, + "time_per_iteration": 4.536627292633057 + }, + { + "auxiliary_loss_clip": 0.01121852, + "auxiliary_loss_mlp": 0.01033632, + "balance_loss_clip": 1.04754174, + "balance_loss_mlp": 1.02223599, + "epoch": 0.5565008266947242, + "flos": 28201882082400.0, + "grad_norm": 2.0970469771054927, + "language_loss": 0.69685626, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.71841109, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11395264, + "step": 9256, + "time_per_iteration": 2.6786375045776367 + }, + { + "auxiliary_loss_clip": 0.01115088, + "auxiliary_loss_mlp": 0.01030233, + "balance_loss_clip": 1.04117167, + "balance_loss_mlp": 1.01977801, + "epoch": 0.5565609499473921, + "flos": 26330209945920.0, + "grad_norm": 1.63927975443866, + "language_loss": 0.75995994, + "learning_rate": 1.73150038809119e-06, + "loss": 0.7814132, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.10455322, + "step": 9257, + "time_per_iteration": 2.66941237449646 + }, + { + "auxiliary_loss_clip": 0.01119093, + "auxiliary_loss_mlp": 0.01031742, + "balance_loss_clip": 1.04101408, + "balance_loss_mlp": 1.02032757, + "epoch": 0.5566210732000602, + "flos": 22414449530880.0, + "grad_norm": 3.1640192953049966, + "language_loss": 0.61154038, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.63304871, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.11419678, + "step": 9258, + "time_per_iteration": 2.622631549835205 + }, + { + "auxiliary_loss_clip": 0.01119337, + "auxiliary_loss_mlp": 0.01035816, + "balance_loss_clip": 1.04215443, + "balance_loss_mlp": 1.02345407, + "epoch": 0.5566811964527281, + "flos": 31363602321600.0, + "grad_norm": 1.879219335622239, + "language_loss": 0.7939539, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.8155055, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12341309, + "step": 9259, + "time_per_iteration": 2.745882987976074 + }, + { + "auxiliary_loss_clip": 0.01118739, + "auxiliary_loss_mlp": 0.01031638, + "balance_loss_clip": 1.04119468, + "balance_loss_mlp": 1.01928759, + "epoch": 0.5567413197053961, + "flos": 32877517161600.0, + "grad_norm": 2.360169505759237, + "language_loss": 0.81448776, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.8359915, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12365723, + "step": 9260, + "time_per_iteration": 2.681051015853882 + }, + { + "auxiliary_loss_clip": 0.01118721, + "auxiliary_loss_mlp": 0.01038143, + "balance_loss_clip": 1.04211187, + "balance_loss_mlp": 1.02582312, + "epoch": 0.556801442958064, + "flos": 25441914555360.0, + "grad_norm": 1.5352687366533007, + "language_loss": 0.68914175, + "learning_rate": 1.729956725348256e-06, + "loss": 0.71071035, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12329102, + "step": 9261, + "time_per_iteration": 2.7055561542510986 + }, + { + "auxiliary_loss_clip": 0.01040594, + "auxiliary_loss_mlp": 0.01003843, + "balance_loss_clip": 1.01565659, + "balance_loss_mlp": 1.00253129, + "epoch": 0.556861566210732, + "flos": 86023264213440.0, + "grad_norm": 0.7283565570192532, + "language_loss": 0.61085045, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63129485, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.24926758, + "router_z_loss_mlp": 0.01311493, + "step": 9262, + "time_per_iteration": 4.586662292480469 + }, + { + "auxiliary_loss_clip": 0.01119321, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.04122651, + "balance_loss_mlp": 1.02393126, + "epoch": 0.5569216894633999, + "flos": 30917003330880.0, + "grad_norm": 1.6548119265781358, + "language_loss": 0.64600551, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.66755295, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.11486816, + "step": 9263, + "time_per_iteration": 2.6562790870666504 + }, + { + "auxiliary_loss_clip": 0.01117931, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.04132771, + "balance_loss_mlp": 1.02264512, + "epoch": 0.556981812716068, + "flos": 27623904603840.0, + "grad_norm": 2.5046637813864763, + "language_loss": 0.73168302, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75320911, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12036133, + "step": 9264, + "time_per_iteration": 2.6639978885650635 + }, + { + "auxiliary_loss_clip": 0.01120236, + "auxiliary_loss_mlp": 0.01030962, + "balance_loss_clip": 1.04349041, + "balance_loss_mlp": 1.01926184, + "epoch": 0.5570419359687359, + "flos": 13465215705600.0, + "grad_norm": 1.820216552574653, + "language_loss": 0.76314938, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.78466135, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11694336, + "step": 9265, + "time_per_iteration": 2.5971791744232178 + }, + { + "auxiliary_loss_clip": 0.01117073, + "auxiliary_loss_mlp": 0.01030864, + "balance_loss_clip": 1.0435667, + "balance_loss_mlp": 1.0197835, + "epoch": 0.5571020592214039, + "flos": 27848743755840.0, + "grad_norm": 1.7227077512600255, + "language_loss": 0.70914239, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.7306217, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11090088, + "step": 9266, + "time_per_iteration": 2.7122015953063965 + }, + { + "auxiliary_loss_clip": 0.01119307, + "auxiliary_loss_mlp": 0.01031744, + "balance_loss_clip": 1.04245305, + "balance_loss_mlp": 1.01967406, + "epoch": 0.5571621824740719, + "flos": 27976880861280.0, + "grad_norm": 1.888692688278665, + "language_loss": 0.68376887, + "learning_rate": 1.727641538728533e-06, + "loss": 0.70527941, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12078857, + "step": 9267, + "time_per_iteration": 2.6155807971954346 + }, + { + "auxiliary_loss_clip": 0.01113532, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.04067063, + "balance_loss_mlp": 1.02665424, + "epoch": 0.5572223057267398, + "flos": 28024158176640.0, + "grad_norm": 6.64590934679668, + "language_loss": 0.74488389, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.76639628, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11065674, + "step": 9268, + "time_per_iteration": 2.6947999000549316 + }, + { + "auxiliary_loss_clip": 0.01117484, + "auxiliary_loss_mlp": 0.01029464, + "balance_loss_clip": 1.04141963, + "balance_loss_mlp": 1.01816845, + "epoch": 0.5572824289794078, + "flos": 25575480976320.0, + "grad_norm": 1.8017393580993293, + "language_loss": 0.74668407, + "learning_rate": 1.726869892322104e-06, + "loss": 0.76815355, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11297607, + "step": 9269, + "time_per_iteration": 2.6321513652801514 + }, + { + "auxiliary_loss_clip": 0.01117082, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.03999305, + "balance_loss_mlp": 1.02205467, + "epoch": 0.5573425522320757, + "flos": 30557463274080.0, + "grad_norm": 15.286851465260085, + "language_loss": 0.82742584, + "learning_rate": 1.726484084647256e-06, + "loss": 0.84893399, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11676025, + "step": 9270, + "time_per_iteration": 2.6828131675720215 + }, + { + "auxiliary_loss_clip": 0.01119949, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.0417459, + "balance_loss_mlp": 1.02273917, + "epoch": 0.5574026754847438, + "flos": 28869754704480.0, + "grad_norm": 2.1938133545379985, + "language_loss": 0.79371911, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.81526816, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12225342, + "step": 9271, + "time_per_iteration": 2.670513391494751 + }, + { + "auxiliary_loss_clip": 0.01119742, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.04198241, + "balance_loss_mlp": 1.01933694, + "epoch": 0.5574627987374117, + "flos": 30240419493600.0, + "grad_norm": 1.956731434491746, + "language_loss": 0.90309179, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92460114, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.11865234, + "step": 9272, + "time_per_iteration": 2.6891469955444336 + }, + { + "auxiliary_loss_clip": 0.01118165, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.04308367, + "balance_loss_mlp": 1.02277565, + "epoch": 0.5575229219900797, + "flos": 26643161481120.0, + "grad_norm": 2.0968882523437444, + "language_loss": 0.8387903, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.86031371, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11407471, + "step": 9273, + "time_per_iteration": 2.6464178562164307 + }, + { + "auxiliary_loss_clip": 0.01120826, + "auxiliary_loss_mlp": 0.01038257, + "balance_loss_clip": 1.04374611, + "balance_loss_mlp": 1.02528131, + "epoch": 0.5575830452427476, + "flos": 33940983869280.0, + "grad_norm": 2.2344607754854278, + "language_loss": 0.743527, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76511788, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12963867, + "step": 9274, + "time_per_iteration": 2.686661720275879 + }, + { + "auxiliary_loss_clip": 0.01127496, + "auxiliary_loss_mlp": 0.01036259, + "balance_loss_clip": 1.04478788, + "balance_loss_mlp": 1.02288413, + "epoch": 0.5576431684954156, + "flos": 21735839829600.0, + "grad_norm": 3.3880018483771894, + "language_loss": 0.77977455, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.80141217, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 0.82617188, + "router_z_loss_mlp": 0.13360596, + "step": 9275, + "time_per_iteration": 2.6419825553894043 + }, + { + "auxiliary_loss_clip": 0.01121112, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.04423618, + "balance_loss_mlp": 1.01966453, + "epoch": 0.5577032917480835, + "flos": 18896714238240.0, + "grad_norm": 2.358644257675651, + "language_loss": 0.74753785, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.76906502, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.1194458, + "step": 9276, + "time_per_iteration": 2.5912222862243652 + }, + { + "auxiliary_loss_clip": 0.01117039, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.04078937, + "balance_loss_mlp": 1.02363396, + "epoch": 0.5577634150007516, + "flos": 26331749602560.0, + "grad_norm": 1.7921778190461857, + "language_loss": 0.75524807, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77677107, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11627197, + "step": 9277, + "time_per_iteration": 2.638671875 + }, + { + "auxiliary_loss_clip": 0.01115478, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.04087818, + "balance_loss_mlp": 1.02359807, + "epoch": 0.5578235382534195, + "flos": 25795012364640.0, + "grad_norm": 1.539346499789946, + "language_loss": 0.71565235, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.73715389, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11077881, + "step": 9278, + "time_per_iteration": 2.601229190826416 + }, + { + "auxiliary_loss_clip": 0.01121387, + "auxiliary_loss_mlp": 0.01031547, + "balance_loss_clip": 1.04232383, + "balance_loss_mlp": 1.01894057, + "epoch": 0.5578836615060875, + "flos": 32342157511200.0, + "grad_norm": 1.6399009840958827, + "language_loss": 0.75523067, + "learning_rate": 1.723012284057868e-06, + "loss": 0.77675998, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.1260376, + "step": 9279, + "time_per_iteration": 2.694460391998291 + }, + { + "auxiliary_loss_clip": 0.01116848, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.03971148, + "balance_loss_mlp": 1.01928735, + "epoch": 0.5579437847587555, + "flos": 24591942161280.0, + "grad_norm": 2.8867223744407533, + "language_loss": 0.6759035, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.69738626, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12145996, + "step": 9280, + "time_per_iteration": 2.6435389518737793 + }, + { + "auxiliary_loss_clip": 0.01121554, + "auxiliary_loss_mlp": 0.01038867, + "balance_loss_clip": 1.04217994, + "balance_loss_mlp": 1.02679157, + "epoch": 0.5580039080114234, + "flos": 31850631207360.0, + "grad_norm": 1.5695747015744397, + "language_loss": 0.73419923, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.75580347, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12072754, + "step": 9281, + "time_per_iteration": 2.7216956615448 + }, + { + "auxiliary_loss_clip": 0.01118683, + "auxiliary_loss_mlp": 0.01034678, + "balance_loss_clip": 1.04306281, + "balance_loss_mlp": 1.02314472, + "epoch": 0.5580640312640914, + "flos": 16803727953120.0, + "grad_norm": 2.764628562981113, + "language_loss": 0.75225788, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.77379143, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11535645, + "step": 9282, + "time_per_iteration": 2.6335978507995605 + }, + { + "auxiliary_loss_clip": 0.01117793, + "auxiliary_loss_mlp": 0.01029579, + "balance_loss_clip": 1.04194283, + "balance_loss_mlp": 1.01742578, + "epoch": 0.5581241545167593, + "flos": 21568042657440.0, + "grad_norm": 1.7929212413830562, + "language_loss": 0.66516256, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68663627, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.121521, + "step": 9283, + "time_per_iteration": 2.8153440952301025 + }, + { + "auxiliary_loss_clip": 0.01118467, + "auxiliary_loss_mlp": 0.01028975, + "balance_loss_clip": 1.04212785, + "balance_loss_mlp": 1.01820993, + "epoch": 0.5581842777694274, + "flos": 23878615501440.0, + "grad_norm": 2.451390261467081, + "language_loss": 0.82914066, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.85061502, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.10766602, + "step": 9284, + "time_per_iteration": 2.6064774990081787 + }, + { + "auxiliary_loss_clip": 0.01119618, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.04237437, + "balance_loss_mlp": 1.01917434, + "epoch": 0.5582444010220953, + "flos": 25130219055840.0, + "grad_norm": 7.851812382278598, + "language_loss": 0.85564864, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87715304, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.11657715, + "step": 9285, + "time_per_iteration": 2.695040225982666 + }, + { + "auxiliary_loss_clip": 0.01120271, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.04259515, + "balance_loss_mlp": 1.02245605, + "epoch": 0.5583045242747633, + "flos": 23349049822080.0, + "grad_norm": 2.4729969049320166, + "language_loss": 0.73421121, + "learning_rate": 1.720312582354912e-06, + "loss": 0.75575459, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.1161499, + "step": 9286, + "time_per_iteration": 2.655811309814453 + }, + { + "auxiliary_loss_clip": 0.01120477, + "auxiliary_loss_mlp": 0.01030564, + "balance_loss_clip": 1.04285967, + "balance_loss_mlp": 1.01931679, + "epoch": 0.5583646475274312, + "flos": 33500584022400.0, + "grad_norm": 1.7516825879634117, + "language_loss": 0.74115849, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.76266891, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.11260986, + "step": 9287, + "time_per_iteration": 2.6980526447296143 + }, + { + "auxiliary_loss_clip": 0.01123834, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.04372311, + "balance_loss_mlp": 1.02063072, + "epoch": 0.5584247707800992, + "flos": 28863150387840.0, + "grad_norm": 1.760693880160035, + "language_loss": 0.75055838, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77212685, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.12390137, + "step": 9288, + "time_per_iteration": 4.106663942337036 + }, + { + "auxiliary_loss_clip": 0.01123553, + "auxiliary_loss_mlp": 0.01036898, + "balance_loss_clip": 1.0459795, + "balance_loss_mlp": 1.02442861, + "epoch": 0.5584848940327671, + "flos": 16714886258880.0, + "grad_norm": 2.069947654016103, + "language_loss": 0.77501827, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.79662275, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12463379, + "step": 9289, + "time_per_iteration": 3.902299165725708 + }, + { + "auxiliary_loss_clip": 0.01125363, + "auxiliary_loss_mlp": 0.01032911, + "balance_loss_clip": 1.04380679, + "balance_loss_mlp": 1.02043629, + "epoch": 0.5585450172854352, + "flos": 32965710579360.0, + "grad_norm": 1.7569045309742968, + "language_loss": 0.61588275, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63746554, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.12475586, + "step": 9290, + "time_per_iteration": 2.6518356800079346 + }, + { + "auxiliary_loss_clip": 0.01121528, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.04224396, + "balance_loss_mlp": 1.01798749, + "epoch": 0.5586051405381031, + "flos": 28291331535840.0, + "grad_norm": 9.186236175401678, + "language_loss": 0.68592536, + "learning_rate": 1.7183845418764e-06, + "loss": 0.70744371, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12335205, + "step": 9291, + "time_per_iteration": 2.7182207107543945 + }, + { + "auxiliary_loss_clip": 0.01120738, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.04202485, + "balance_loss_mlp": 1.02320087, + "epoch": 0.5586652637907711, + "flos": 25350520272480.0, + "grad_norm": 1.9150287171181768, + "language_loss": 0.84287775, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.86443955, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12249756, + "step": 9292, + "time_per_iteration": 2.619746208190918 + }, + { + "auxiliary_loss_clip": 0.01116918, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.04219866, + "balance_loss_mlp": 1.02509177, + "epoch": 0.5587253870434391, + "flos": 34434576554400.0, + "grad_norm": 2.102337991462547, + "language_loss": 0.73879528, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.76032895, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11358643, + "step": 9293, + "time_per_iteration": 2.715266227722168 + }, + { + "auxiliary_loss_clip": 0.01117308, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.04268193, + "balance_loss_mlp": 1.02127481, + "epoch": 0.558785510296107, + "flos": 32476939450560.0, + "grad_norm": 2.514142479707133, + "language_loss": 0.72396624, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.74546212, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11016846, + "step": 9294, + "time_per_iteration": 2.6535346508026123 + }, + { + "auxiliary_loss_clip": 0.01121087, + "auxiliary_loss_mlp": 0.01032224, + "balance_loss_clip": 1.04377484, + "balance_loss_mlp": 1.02024961, + "epoch": 0.558845633548775, + "flos": 24595305095520.0, + "grad_norm": 3.192949672885425, + "language_loss": 0.68383026, + "learning_rate": 1.716842301625806e-06, + "loss": 0.70536339, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.11968994, + "step": 9295, + "time_per_iteration": 4.077066898345947 + }, + { + "auxiliary_loss_clip": 0.01120247, + "auxiliary_loss_mlp": 0.01032252, + "balance_loss_clip": 1.04352403, + "balance_loss_mlp": 1.02040267, + "epoch": 0.5589057568014429, + "flos": 29713325368320.0, + "grad_norm": 1.5007555313070033, + "language_loss": 0.80629367, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.82781869, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11853027, + "step": 9296, + "time_per_iteration": 2.6783058643341064 + }, + { + "auxiliary_loss_clip": 0.01118951, + "auxiliary_loss_mlp": 0.01030657, + "balance_loss_clip": 1.04291773, + "balance_loss_mlp": 1.01867008, + "epoch": 0.558965880054111, + "flos": 25752637640160.0, + "grad_norm": 2.1157735142585197, + "language_loss": 0.65541792, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.67691404, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11975098, + "step": 9297, + "time_per_iteration": 2.623725652694702 + }, + { + "auxiliary_loss_clip": 0.01120674, + "auxiliary_loss_mlp": 0.01035207, + "balance_loss_clip": 1.04226327, + "balance_loss_mlp": 1.0230422, + "epoch": 0.5590260033067789, + "flos": 22496119666560.0, + "grad_norm": 2.4479823927380346, + "language_loss": 0.75091785, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77247667, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12164307, + "step": 9298, + "time_per_iteration": 2.6518208980560303 + }, + { + "auxiliary_loss_clip": 0.01037573, + "auxiliary_loss_mlp": 0.01006735, + "balance_loss_clip": 1.01311958, + "balance_loss_mlp": 1.00551629, + "epoch": 0.5590861265594469, + "flos": 86119358500800.0, + "grad_norm": 0.8817186959884439, + "language_loss": 0.52395713, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54440022, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 0.24438477, + "router_z_loss_mlp": 0.01217651, + "step": 9299, + "time_per_iteration": 3.3220129013061523 + }, + { + "auxiliary_loss_clip": 0.01116511, + "auxiliary_loss_mlp": 0.01028508, + "balance_loss_clip": 1.04203868, + "balance_loss_mlp": 1.01726091, + "epoch": 0.5591462498121148, + "flos": 37417559955840.0, + "grad_norm": 2.145902248872391, + "language_loss": 0.69184291, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.71329308, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11248779, + "step": 9300, + "time_per_iteration": 2.784351110458374 + }, + { + "auxiliary_loss_clip": 0.01119963, + "auxiliary_loss_mlp": 0.01037693, + "balance_loss_clip": 1.04221725, + "balance_loss_mlp": 1.02528906, + "epoch": 0.5592063730647828, + "flos": 22146992550720.0, + "grad_norm": 6.084638282911397, + "language_loss": 0.82021379, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.84179032, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12402344, + "step": 9301, + "time_per_iteration": 2.6262223720550537 + }, + { + "auxiliary_loss_clip": 0.01116789, + "auxiliary_loss_mlp": 0.01026539, + "balance_loss_clip": 1.04049993, + "balance_loss_mlp": 1.01489246, + "epoch": 0.5592664963174507, + "flos": 29358849971520.0, + "grad_norm": 1.6694264906003273, + "language_loss": 0.67966956, + "learning_rate": 1.714143795138756e-06, + "loss": 0.70110285, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11645508, + "step": 9302, + "time_per_iteration": 3.9416158199310303 + }, + { + "auxiliary_loss_clip": 0.01120667, + "auxiliary_loss_mlp": 0.010232, + "balance_loss_clip": 1.04118752, + "balance_loss_mlp": 1.01099849, + "epoch": 0.5593266195701188, + "flos": 24194038590720.0, + "grad_norm": 1.8523726702113483, + "language_loss": 0.71107721, + "learning_rate": 1.713758337453878e-06, + "loss": 0.73251587, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.12207031, + "step": 9303, + "time_per_iteration": 2.7309560775756836 + }, + { + "auxiliary_loss_clip": 0.01117311, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.04426479, + "balance_loss_mlp": 1.01823378, + "epoch": 0.5593867428227867, + "flos": 30872440673280.0, + "grad_norm": 1.723249836649721, + "language_loss": 0.73097444, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.75243592, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.10614014, + "step": 9304, + "time_per_iteration": 2.692185401916504 + }, + { + "auxiliary_loss_clip": 0.01117158, + "auxiliary_loss_mlp": 0.01030412, + "balance_loss_clip": 1.04131079, + "balance_loss_mlp": 1.01927233, + "epoch": 0.5594468660754547, + "flos": 15780285967680.0, + "grad_norm": 2.7711250177916207, + "language_loss": 0.7797547, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.80123043, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11138916, + "step": 9305, + "time_per_iteration": 2.6812124252319336 + }, + { + "auxiliary_loss_clip": 0.01114616, + "auxiliary_loss_mlp": 0.01025842, + "balance_loss_clip": 1.04180896, + "balance_loss_mlp": 1.01458311, + "epoch": 0.5595069893281227, + "flos": 23260208127840.0, + "grad_norm": 3.1692354086048464, + "language_loss": 0.6904248, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.71182942, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.1126709, + "step": 9306, + "time_per_iteration": 2.608743667602539 + }, + { + "auxiliary_loss_clip": 0.01036995, + "auxiliary_loss_mlp": 0.01003802, + "balance_loss_clip": 1.01267385, + "balance_loss_mlp": 1.00256538, + "epoch": 0.5595671125807906, + "flos": 85749446020320.0, + "grad_norm": 0.9190257513174507, + "language_loss": 0.60246992, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62287784, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 0.24328613, + "router_z_loss_mlp": 0.01235962, + "step": 9307, + "time_per_iteration": 3.370030641555786 + }, + { + "auxiliary_loss_clip": 0.01115863, + "auxiliary_loss_mlp": 0.01035154, + "balance_loss_clip": 1.04054832, + "balance_loss_mlp": 1.02390027, + "epoch": 0.5596272358334586, + "flos": 25216305575040.0, + "grad_norm": 1.6057000969885677, + "language_loss": 0.73951769, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76102787, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11254883, + "step": 9308, + "time_per_iteration": 2.6540205478668213 + }, + { + "auxiliary_loss_clip": 0.01119272, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.04079401, + "balance_loss_mlp": 1.02049565, + "epoch": 0.5596873590861265, + "flos": 30555883100160.0, + "grad_norm": 2.0071237382001255, + "language_loss": 0.69779646, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.71932113, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12695312, + "step": 9309, + "time_per_iteration": 2.6510913372039795 + }, + { + "auxiliary_loss_clip": 0.0112085, + "auxiliary_loss_mlp": 0.01032202, + "balance_loss_clip": 1.043136, + "balance_loss_mlp": 1.01890397, + "epoch": 0.5597474823387946, + "flos": 31674852131040.0, + "grad_norm": 3.027381142042252, + "language_loss": 0.74825191, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.76978242, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13299561, + "step": 9310, + "time_per_iteration": 2.712522029876709 + }, + { + "auxiliary_loss_clip": 0.01124012, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.04487896, + "balance_loss_mlp": 1.0178293, + "epoch": 0.5598076055914625, + "flos": 31942147042080.0, + "grad_norm": 5.516180893399851, + "language_loss": 0.69767445, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.71921873, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12597656, + "step": 9311, + "time_per_iteration": 2.667785406112671 + }, + { + "auxiliary_loss_clip": 0.01115925, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.04011416, + "balance_loss_mlp": 1.01936698, + "epoch": 0.5598677288441305, + "flos": 14221727435520.0, + "grad_norm": 2.1762354897456753, + "language_loss": 0.71993905, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74140704, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11505127, + "step": 9312, + "time_per_iteration": 2.646146774291992 + }, + { + "auxiliary_loss_clip": 0.01117987, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.04272652, + "balance_loss_mlp": 1.02087498, + "epoch": 0.5599278520967984, + "flos": 28024036624800.0, + "grad_norm": 2.196522897973057, + "language_loss": 0.89304894, + "learning_rate": 1.709904360003822e-06, + "loss": 0.91455793, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12036133, + "step": 9313, + "time_per_iteration": 2.6393320560455322 + }, + { + "auxiliary_loss_clip": 0.01119416, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.04417098, + "balance_loss_mlp": 1.02288353, + "epoch": 0.5599879753494664, + "flos": 25887257510400.0, + "grad_norm": 1.5014502199470605, + "language_loss": 0.78125644, + "learning_rate": 1.709519022520204e-06, + "loss": 0.8027963, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11688232, + "step": 9314, + "time_per_iteration": 2.7102015018463135 + }, + { + "auxiliary_loss_clip": 0.01115549, + "auxiliary_loss_mlp": 0.01026867, + "balance_loss_clip": 1.03940892, + "balance_loss_mlp": 1.01468396, + "epoch": 0.5600480986021343, + "flos": 38929408414560.0, + "grad_norm": 2.386139108335647, + "language_loss": 0.70147681, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72290093, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12182617, + "step": 9315, + "time_per_iteration": 2.761181116104126 + }, + { + "auxiliary_loss_clip": 0.01119619, + "auxiliary_loss_mlp": 0.01036282, + "balance_loss_clip": 1.04065537, + "balance_loss_mlp": 1.02423584, + "epoch": 0.5601082218548024, + "flos": 34747082399520.0, + "grad_norm": 2.3323003046380566, + "language_loss": 0.66742384, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.68898284, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.12036133, + "step": 9316, + "time_per_iteration": 2.6952459812164307 + }, + { + "auxiliary_loss_clip": 0.01118401, + "auxiliary_loss_mlp": 0.01029727, + "balance_loss_clip": 1.04252005, + "balance_loss_mlp": 1.01715004, + "epoch": 0.5601683451074703, + "flos": 29403696250080.0, + "grad_norm": 2.082575366628379, + "language_loss": 0.86237592, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.88385725, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12567139, + "step": 9317, + "time_per_iteration": 2.622527599334717 + }, + { + "auxiliary_loss_clip": 0.01120786, + "auxiliary_loss_mlp": 0.01036312, + "balance_loss_clip": 1.0412395, + "balance_loss_mlp": 1.02297235, + "epoch": 0.5602284683601383, + "flos": 32160624981120.0, + "grad_norm": 1.8392739505435316, + "language_loss": 0.77255607, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.79412711, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.13354492, + "step": 9318, + "time_per_iteration": 2.6768710613250732 + }, + { + "auxiliary_loss_clip": 0.01115157, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.03906775, + "balance_loss_mlp": 1.02577996, + "epoch": 0.5602885916128063, + "flos": 29891049274080.0, + "grad_norm": 1.5503163528222446, + "language_loss": 0.76366299, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.78517985, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.10742188, + "step": 9319, + "time_per_iteration": 2.673450469970703 + }, + { + "auxiliary_loss_clip": 0.01115731, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.04153693, + "balance_loss_mlp": 1.02137256, + "epoch": 0.5603487148654742, + "flos": 33366896049600.0, + "grad_norm": 1.5333528600055422, + "language_loss": 0.85285646, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87434191, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11437988, + "step": 9320, + "time_per_iteration": 2.6879360675811768 + }, + { + "auxiliary_loss_clip": 0.01040249, + "auxiliary_loss_mlp": 0.01003557, + "balance_loss_clip": 1.01561785, + "balance_loss_mlp": 1.00224495, + "epoch": 0.5604088381181422, + "flos": 65997214871040.0, + "grad_norm": 0.7577649305577313, + "language_loss": 0.52570689, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54614496, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 0.24621582, + "router_z_loss_mlp": 0.01312256, + "step": 9321, + "time_per_iteration": 3.0574872493743896 + }, + { + "auxiliary_loss_clip": 0.01117146, + "auxiliary_loss_mlp": 0.01031248, + "balance_loss_clip": 1.04290962, + "balance_loss_mlp": 1.02033401, + "epoch": 0.5604689613708101, + "flos": 27133026576480.0, + "grad_norm": 1.9625271855815092, + "language_loss": 0.74302638, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.76451033, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.10919189, + "step": 9322, + "time_per_iteration": 2.650423288345337 + }, + { + "auxiliary_loss_clip": 0.01116834, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.04061842, + "balance_loss_mlp": 1.01962662, + "epoch": 0.5605290846234782, + "flos": 43071588155520.0, + "grad_norm": 1.6821793908009282, + "language_loss": 0.73743004, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.7589187, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.1239624, + "step": 9323, + "time_per_iteration": 2.801394462585449 + }, + { + "auxiliary_loss_clip": 0.0112157, + "auxiliary_loss_mlp": 0.0102861, + "balance_loss_clip": 1.0433805, + "balance_loss_mlp": 1.01621819, + "epoch": 0.5605892078761461, + "flos": 24725589616800.0, + "grad_norm": 1.6925111640006434, + "language_loss": 0.62012684, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.64162868, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12402344, + "step": 9324, + "time_per_iteration": 2.6422982215881348 + }, + { + "auxiliary_loss_clip": 0.01115759, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.03919029, + "balance_loss_mlp": 1.02001631, + "epoch": 0.5606493311288141, + "flos": 21119660906400.0, + "grad_norm": 2.0784624766190776, + "language_loss": 0.87838274, + "learning_rate": 1.705281040409226e-06, + "loss": 0.89986455, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12402344, + "step": 9325, + "time_per_iteration": 2.6609790325164795 + }, + { + "auxiliary_loss_clip": 0.01119333, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.04128695, + "balance_loss_mlp": 1.01733971, + "epoch": 0.560709454381482, + "flos": 26421077504160.0, + "grad_norm": 1.8952623358120027, + "language_loss": 0.74159241, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.76308304, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12390137, + "step": 9326, + "time_per_iteration": 2.6456379890441895 + }, + { + "auxiliary_loss_clip": 0.01122976, + "auxiliary_loss_mlp": 0.0102648, + "balance_loss_clip": 1.04303539, + "balance_loss_mlp": 1.01409459, + "epoch": 0.56076957763415, + "flos": 24774122967840.0, + "grad_norm": 2.4176796792465702, + "language_loss": 0.78256941, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.80406392, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12390137, + "step": 9327, + "time_per_iteration": 2.678156614303589 + }, + { + "auxiliary_loss_clip": 0.01120774, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.04470491, + "balance_loss_mlp": 1.02010119, + "epoch": 0.5608297008868179, + "flos": 30561393450240.0, + "grad_norm": 1.6269326349268691, + "language_loss": 0.7824741, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.8040064, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12359619, + "step": 9328, + "time_per_iteration": 4.28455114364624 + }, + { + "auxiliary_loss_clip": 0.01116157, + "auxiliary_loss_mlp": 0.01029409, + "balance_loss_clip": 1.04042125, + "balance_loss_mlp": 1.01753628, + "epoch": 0.560889824139486, + "flos": 24241315906080.0, + "grad_norm": 1.764919250796932, + "language_loss": 0.73545182, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.7569074, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11877441, + "step": 9329, + "time_per_iteration": 4.1630024909973145 + }, + { + "auxiliary_loss_clip": 0.01120544, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.04053497, + "balance_loss_mlp": 1.02021229, + "epoch": 0.5609499473921539, + "flos": 27978704138880.0, + "grad_norm": 2.4336028361323714, + "language_loss": 0.83604538, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.8575803, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12744141, + "step": 9330, + "time_per_iteration": 2.664823532104492 + }, + { + "auxiliary_loss_clip": 0.01039743, + "auxiliary_loss_mlp": 0.01001015, + "balance_loss_clip": 1.01523948, + "balance_loss_mlp": 0.99963397, + "epoch": 0.5610100706448219, + "flos": 64713001256640.0, + "grad_norm": 0.7106427959986208, + "language_loss": 0.57806015, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.59846777, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 0.24487305, + "router_z_loss_mlp": 0.01382446, + "step": 9331, + "time_per_iteration": 3.26591157913208 + }, + { + "auxiliary_loss_clip": 0.01119128, + "auxiliary_loss_mlp": 0.01031273, + "balance_loss_clip": 1.04117203, + "balance_loss_mlp": 1.01917362, + "epoch": 0.5610701938974898, + "flos": 26643161481120.0, + "grad_norm": 3.397553809780886, + "language_loss": 0.81569874, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.83720273, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12103271, + "step": 9332, + "time_per_iteration": 2.622007369995117 + }, + { + "auxiliary_loss_clip": 0.01124404, + "auxiliary_loss_mlp": 0.01035793, + "balance_loss_clip": 1.0426867, + "balance_loss_mlp": 1.02245975, + "epoch": 0.5611303171501578, + "flos": 21301031367360.0, + "grad_norm": 1.9497364002291075, + "language_loss": 0.81529254, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.83689451, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.13330078, + "step": 9333, + "time_per_iteration": 2.683596611022949 + }, + { + "auxiliary_loss_clip": 0.01119136, + "auxiliary_loss_mlp": 0.01026508, + "balance_loss_clip": 1.04138184, + "balance_loss_mlp": 1.01502848, + "epoch": 0.5611904404028258, + "flos": 27622851154560.0, + "grad_norm": 1.7159540540534552, + "language_loss": 0.73053455, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.75199103, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.11480713, + "step": 9334, + "time_per_iteration": 2.6078293323516846 + }, + { + "auxiliary_loss_clip": 0.01120014, + "auxiliary_loss_mlp": 0.01032842, + "balance_loss_clip": 1.04408038, + "balance_loss_mlp": 1.02077842, + "epoch": 0.5612505636554938, + "flos": 17468602296480.0, + "grad_norm": 1.8780514673154718, + "language_loss": 0.71119332, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.73272192, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.1206665, + "step": 9335, + "time_per_iteration": 4.032320261001587 + }, + { + "auxiliary_loss_clip": 0.01121254, + "auxiliary_loss_mlp": 0.01028062, + "balance_loss_clip": 1.04299724, + "balance_loss_mlp": 1.01594448, + "epoch": 0.5613106869081618, + "flos": 20143131580800.0, + "grad_norm": 2.0686803050866795, + "language_loss": 0.76610279, + "learning_rate": 1.701044410566205e-06, + "loss": 0.78759599, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12121582, + "step": 9336, + "time_per_iteration": 2.7961530685424805 + }, + { + "auxiliary_loss_clip": 0.01118462, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.04195523, + "balance_loss_mlp": 1.02114439, + "epoch": 0.5613708101608297, + "flos": 29356783590240.0, + "grad_norm": 3.302371534640032, + "language_loss": 0.64623886, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.66775274, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11779785, + "step": 9337, + "time_per_iteration": 2.690175771713257 + }, + { + "auxiliary_loss_clip": 0.01039491, + "auxiliary_loss_mlp": 0.01002657, + "balance_loss_clip": 1.01502943, + "balance_loss_mlp": 1.00137162, + "epoch": 0.5614309334134977, + "flos": 79199829319680.0, + "grad_norm": 0.8822146226271177, + "language_loss": 0.62572181, + "learning_rate": 1.700274261035102e-06, + "loss": 0.64614332, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 0.24438477, + "router_z_loss_mlp": 0.01285553, + "step": 9338, + "time_per_iteration": 3.2234866619110107 + }, + { + "auxiliary_loss_clip": 0.01122755, + "auxiliary_loss_mlp": 0.01035121, + "balance_loss_clip": 1.04402924, + "balance_loss_mlp": 1.02334905, + "epoch": 0.5614910566661656, + "flos": 40170436958880.0, + "grad_norm": 2.1617472539731373, + "language_loss": 0.65767121, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.67924994, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.11785889, + "step": 9339, + "time_per_iteration": 2.7044243812561035 + }, + { + "auxiliary_loss_clip": 0.01118554, + "auxiliary_loss_mlp": 0.01030307, + "balance_loss_clip": 1.04248631, + "balance_loss_mlp": 1.0182308, + "epoch": 0.5615511799188336, + "flos": 22681947028320.0, + "grad_norm": 1.9584985501062508, + "language_loss": 0.69806874, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.71955729, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12084961, + "step": 9340, + "time_per_iteration": 2.632101535797119 + }, + { + "auxiliary_loss_clip": 0.0111903, + "auxiliary_loss_mlp": 0.01028822, + "balance_loss_clip": 1.0447197, + "balance_loss_mlp": 1.01726484, + "epoch": 0.5616113031715015, + "flos": 27845218752480.0, + "grad_norm": 1.782384140263674, + "language_loss": 0.7709316, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.79241014, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11560059, + "step": 9341, + "time_per_iteration": 3.9551351070404053 + }, + { + "auxiliary_loss_clip": 0.01120097, + "auxiliary_loss_mlp": 0.01033581, + "balance_loss_clip": 1.04204881, + "balance_loss_mlp": 1.02056921, + "epoch": 0.5616714264241696, + "flos": 27266430928320.0, + "grad_norm": 1.7424102812736462, + "language_loss": 0.79763812, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81917495, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.13006592, + "step": 9342, + "time_per_iteration": 2.6216533184051514 + }, + { + "auxiliary_loss_clip": 0.01123544, + "auxiliary_loss_mlp": 0.01033114, + "balance_loss_clip": 1.04314208, + "balance_loss_mlp": 1.02072859, + "epoch": 0.5617315496768375, + "flos": 22949079870240.0, + "grad_norm": 2.1823731229138486, + "language_loss": 0.76218426, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.78375083, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.1239624, + "step": 9343, + "time_per_iteration": 2.6958837509155273 + }, + { + "auxiliary_loss_clip": 0.01122212, + "auxiliary_loss_mlp": 0.01038357, + "balance_loss_clip": 1.04490733, + "balance_loss_mlp": 1.0255841, + "epoch": 0.5617916729295055, + "flos": 22414084875360.0, + "grad_norm": 2.4631301396829315, + "language_loss": 0.69168711, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.71329284, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12768555, + "step": 9344, + "time_per_iteration": 2.639720916748047 + }, + { + "auxiliary_loss_clip": 0.01122024, + "auxiliary_loss_mlp": 0.01035885, + "balance_loss_clip": 1.04395652, + "balance_loss_mlp": 1.02324271, + "epoch": 0.5618517961821734, + "flos": 34386164755200.0, + "grad_norm": 2.1319338553976226, + "language_loss": 0.66296864, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.68454778, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12634277, + "step": 9345, + "time_per_iteration": 2.7271337509155273 + }, + { + "auxiliary_loss_clip": 0.0112178, + "auxiliary_loss_mlp": 0.01028547, + "balance_loss_clip": 1.04412031, + "balance_loss_mlp": 1.01689982, + "epoch": 0.5619119194348414, + "flos": 18896714238240.0, + "grad_norm": 2.056182886207992, + "language_loss": 0.87363803, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.89514136, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11657715, + "step": 9346, + "time_per_iteration": 2.5844717025756836 + }, + { + "auxiliary_loss_clip": 0.01120853, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.04299116, + "balance_loss_mlp": 1.01955676, + "epoch": 0.5619720426875094, + "flos": 35543821438080.0, + "grad_norm": 2.28893633855626, + "language_loss": 0.59540361, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.61693597, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12823486, + "step": 9347, + "time_per_iteration": 2.711731195449829 + }, + { + "auxiliary_loss_clip": 0.01122, + "auxiliary_loss_mlp": 0.01034711, + "balance_loss_clip": 1.04364717, + "balance_loss_mlp": 1.02119219, + "epoch": 0.5620321659401774, + "flos": 21968053126560.0, + "grad_norm": 2.64399179300086, + "language_loss": 0.69283712, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.71440423, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.13525391, + "step": 9348, + "time_per_iteration": 2.635606288909912 + }, + { + "auxiliary_loss_clip": 0.01123241, + "auxiliary_loss_mlp": 0.01030304, + "balance_loss_clip": 1.04133523, + "balance_loss_mlp": 1.01654172, + "epoch": 0.5620922891928454, + "flos": 25129489744800.0, + "grad_norm": 1.8792173740017084, + "language_loss": 0.78678918, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.80832464, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.13763428, + "step": 9349, + "time_per_iteration": 2.6753132343292236 + }, + { + "auxiliary_loss_clip": 0.01122329, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.04367363, + "balance_loss_mlp": 1.01777148, + "epoch": 0.5621524124455133, + "flos": 32075835014880.0, + "grad_norm": 3.6755267410855783, + "language_loss": 0.67209154, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.69362199, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12945557, + "step": 9350, + "time_per_iteration": 2.6900784969329834 + }, + { + "auxiliary_loss_clip": 0.0112231, + "auxiliary_loss_mlp": 0.01033429, + "balance_loss_clip": 1.04312682, + "balance_loss_mlp": 1.02092409, + "epoch": 0.5622125356981813, + "flos": 15557351127840.0, + "grad_norm": 2.2726441035058964, + "language_loss": 0.78614706, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.80770451, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12493896, + "step": 9351, + "time_per_iteration": 2.6150240898132324 + }, + { + "auxiliary_loss_clip": 0.01124177, + "auxiliary_loss_mlp": 0.01035493, + "balance_loss_clip": 1.0431143, + "balance_loss_mlp": 1.02319026, + "epoch": 0.5622726589508492, + "flos": 29047559644800.0, + "grad_norm": 1.6660656523647879, + "language_loss": 0.58554053, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.60713726, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.12310791, + "step": 9352, + "time_per_iteration": 2.6617581844329834 + }, + { + "auxiliary_loss_clip": 0.01115814, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.04227257, + "balance_loss_mlp": 1.0178026, + "epoch": 0.5623327822035172, + "flos": 30161464015680.0, + "grad_norm": 2.317439937422393, + "language_loss": 0.71676505, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.73821467, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11346436, + "step": 9353, + "time_per_iteration": 2.65989351272583 + }, + { + "auxiliary_loss_clip": 0.01121465, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.04227996, + "balance_loss_mlp": 1.01689327, + "epoch": 0.5623929054561851, + "flos": 17106590685600.0, + "grad_norm": 3.793499884696172, + "language_loss": 0.75656939, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.77807319, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12017822, + "step": 9354, + "time_per_iteration": 2.6209805011749268 + }, + { + "auxiliary_loss_clip": 0.01123375, + "auxiliary_loss_mlp": 0.01037005, + "balance_loss_clip": 1.04264283, + "balance_loss_mlp": 1.02441025, + "epoch": 0.5624530287088532, + "flos": 25263542373120.0, + "grad_norm": 2.1886092435786724, + "language_loss": 0.72715259, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.74875641, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.12591553, + "step": 9355, + "time_per_iteration": 2.633049964904785 + }, + { + "auxiliary_loss_clip": 0.01120372, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.04184616, + "balance_loss_mlp": 1.01924419, + "epoch": 0.5625131519615211, + "flos": 26198871975360.0, + "grad_norm": 6.007689896194723, + "language_loss": 0.73450398, + "learning_rate": 1.693344975084274e-06, + "loss": 0.7560221, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12194824, + "step": 9356, + "time_per_iteration": 2.662644147872925 + }, + { + "auxiliary_loss_clip": 0.01120109, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.04319572, + "balance_loss_mlp": 1.0198251, + "epoch": 0.5625732752141891, + "flos": 22815148793760.0, + "grad_norm": 3.8891294238040333, + "language_loss": 0.83153903, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.85306334, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.125, + "step": 9357, + "time_per_iteration": 2.607576608657837 + }, + { + "auxiliary_loss_clip": 0.01121111, + "auxiliary_loss_mlp": 0.01034141, + "balance_loss_clip": 1.04302883, + "balance_loss_mlp": 1.02243447, + "epoch": 0.562633398466857, + "flos": 19787521700160.0, + "grad_norm": 2.299545755719312, + "language_loss": 0.72138834, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.74294084, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.11712646, + "step": 9358, + "time_per_iteration": 2.6760218143463135 + }, + { + "auxiliary_loss_clip": 0.01119965, + "auxiliary_loss_mlp": 0.01042232, + "balance_loss_clip": 1.04314613, + "balance_loss_mlp": 1.03013277, + "epoch": 0.562693521719525, + "flos": 27444843627840.0, + "grad_norm": 1.6834495494346582, + "language_loss": 0.77617431, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.79779637, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12103271, + "step": 9359, + "time_per_iteration": 2.6927480697631836 + }, + { + "auxiliary_loss_clip": 0.01121079, + "auxiliary_loss_mlp": 0.0103284, + "balance_loss_clip": 1.04344714, + "balance_loss_mlp": 1.02125263, + "epoch": 0.562753644972193, + "flos": 30910115393280.0, + "grad_norm": 1.9537271597442478, + "language_loss": 0.70466882, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.72620803, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.11578369, + "step": 9360, + "time_per_iteration": 2.721405029296875 + }, + { + "auxiliary_loss_clip": 0.01040108, + "auxiliary_loss_mlp": 0.01004604, + "balance_loss_clip": 1.01537561, + "balance_loss_mlp": 1.00341892, + "epoch": 0.562813768224861, + "flos": 82233979695360.0, + "grad_norm": 0.7762489326756035, + "language_loss": 0.55594254, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57638967, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.24768066, + "router_z_loss_mlp": 0.01184845, + "step": 9361, + "time_per_iteration": 3.1724839210510254 + }, + { + "auxiliary_loss_clip": 0.01119909, + "auxiliary_loss_mlp": 0.01036358, + "balance_loss_clip": 1.04448271, + "balance_loss_mlp": 1.02434182, + "epoch": 0.562873891477529, + "flos": 28468933889760.0, + "grad_norm": 1.818776061180626, + "language_loss": 0.81802082, + "learning_rate": 1.691036046141018e-06, + "loss": 0.83958352, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12023926, + "step": 9362, + "time_per_iteration": 2.670578956604004 + }, + { + "auxiliary_loss_clip": 0.01118361, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.04236782, + "balance_loss_mlp": 1.0222683, + "epoch": 0.5629340147301969, + "flos": 46945946260800.0, + "grad_norm": 4.63631886085493, + "language_loss": 0.74485022, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.76637292, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11645508, + "step": 9363, + "time_per_iteration": 2.7946512699127197 + }, + { + "auxiliary_loss_clip": 0.01121765, + "auxiliary_loss_mlp": 0.01031124, + "balance_loss_clip": 1.0426333, + "balance_loss_mlp": 1.01882803, + "epoch": 0.5629941379828649, + "flos": 35681804242560.0, + "grad_norm": 1.677508587331368, + "language_loss": 0.82773477, + "learning_rate": 1.690266496731839e-06, + "loss": 0.84926367, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12310791, + "step": 9364, + "time_per_iteration": 2.678011655807495 + }, + { + "auxiliary_loss_clip": 0.01119011, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.04407895, + "balance_loss_mlp": 1.02241409, + "epoch": 0.5630542612355328, + "flos": 23697893316960.0, + "grad_norm": 2.3268028299580807, + "language_loss": 0.65200198, + "learning_rate": 1.689881739637642e-06, + "loss": 0.67352599, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.10961914, + "step": 9365, + "time_per_iteration": 2.644413471221924 + }, + { + "auxiliary_loss_clip": 0.01125293, + "auxiliary_loss_mlp": 0.01033169, + "balance_loss_clip": 1.04338169, + "balance_loss_mlp": 1.02052045, + "epoch": 0.5631143844882008, + "flos": 27170134054560.0, + "grad_norm": 2.985761326725547, + "language_loss": 0.81401098, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.83559561, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.12652588, + "step": 9366, + "time_per_iteration": 2.614345073699951 + }, + { + "auxiliary_loss_clip": 0.01120318, + "auxiliary_loss_mlp": 0.0102765, + "balance_loss_clip": 1.04492855, + "balance_loss_mlp": 1.01634932, + "epoch": 0.5631745077408687, + "flos": 28022578002720.0, + "grad_norm": 1.8236579383551867, + "language_loss": 0.73452544, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.75600511, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11303711, + "step": 9367, + "time_per_iteration": 4.143434524536133 + }, + { + "auxiliary_loss_clip": 0.01040044, + "auxiliary_loss_mlp": 0.01005278, + "balance_loss_clip": 1.01492429, + "balance_loss_mlp": 1.00409389, + "epoch": 0.5632346309935368, + "flos": 79410453256800.0, + "grad_norm": 0.6265399964917331, + "language_loss": 0.53431046, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55476373, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.25109863, + "router_z_loss_mlp": 0.01182556, + "step": 9368, + "time_per_iteration": 4.710587739944458 + }, + { + "auxiliary_loss_clip": 0.01120612, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.04461884, + "balance_loss_mlp": 1.02002883, + "epoch": 0.5632947542462047, + "flos": 28068356178720.0, + "grad_norm": 2.4963816542857336, + "language_loss": 0.69457614, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71610081, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.1184082, + "step": 9369, + "time_per_iteration": 2.6412622928619385 + }, + { + "auxiliary_loss_clip": 0.01118795, + "auxiliary_loss_mlp": 0.01031378, + "balance_loss_clip": 1.04064345, + "balance_loss_mlp": 1.01923013, + "epoch": 0.5633548774988727, + "flos": 37195638048000.0, + "grad_norm": 1.987295404977583, + "language_loss": 0.75767493, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.77917659, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.121521, + "step": 9370, + "time_per_iteration": 2.739513874053955 + }, + { + "auxiliary_loss_clip": 0.01123337, + "auxiliary_loss_mlp": 0.01035849, + "balance_loss_clip": 1.04391444, + "balance_loss_mlp": 1.02276623, + "epoch": 0.5634150007515406, + "flos": 22591930332960.0, + "grad_norm": 2.7778312410240993, + "language_loss": 0.76123023, + "learning_rate": 1.687573444537108e-06, + "loss": 0.78282213, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13092041, + "step": 9371, + "time_per_iteration": 2.6787190437316895 + }, + { + "auxiliary_loss_clip": 0.01117585, + "auxiliary_loss_mlp": 0.01035773, + "balance_loss_clip": 1.04218614, + "balance_loss_mlp": 1.02431738, + "epoch": 0.5634751240042086, + "flos": 23482292104800.0, + "grad_norm": 2.1042586486550854, + "language_loss": 0.75865805, + "learning_rate": 1.687188770067285e-06, + "loss": 0.78019166, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11450195, + "step": 9372, + "time_per_iteration": 2.6598293781280518 + }, + { + "auxiliary_loss_clip": 0.01118378, + "auxiliary_loss_mlp": 0.01028657, + "balance_loss_clip": 1.04322708, + "balance_loss_mlp": 1.0166471, + "epoch": 0.5635352472568766, + "flos": 14666948838720.0, + "grad_norm": 2.905875786527223, + "language_loss": 0.71248674, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.73395705, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12005615, + "step": 9373, + "time_per_iteration": 2.661154270172119 + }, + { + "auxiliary_loss_clip": 0.01122547, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.04500818, + "balance_loss_mlp": 1.01742649, + "epoch": 0.5635953705095446, + "flos": 26687926725120.0, + "grad_norm": 2.0137090240545503, + "language_loss": 0.82922333, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.85075396, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.13098145, + "step": 9374, + "time_per_iteration": 2.6847267150878906 + }, + { + "auxiliary_loss_clip": 0.01117689, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.0421586, + "balance_loss_mlp": 1.01571202, + "epoch": 0.5636554937622126, + "flos": 33099844242240.0, + "grad_norm": 1.776546197012531, + "language_loss": 0.66338891, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.68484241, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11950684, + "step": 9375, + "time_per_iteration": 4.2048256397247314 + }, + { + "auxiliary_loss_clip": 0.01120973, + "auxiliary_loss_mlp": 0.01034096, + "balance_loss_clip": 1.04277873, + "balance_loss_mlp": 1.02247882, + "epoch": 0.5637156170148805, + "flos": 15772830788160.0, + "grad_norm": 2.2054565987352617, + "language_loss": 0.81548297, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.83703369, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.11627197, + "step": 9376, + "time_per_iteration": 2.623889684677124 + }, + { + "auxiliary_loss_clip": 0.01122526, + "auxiliary_loss_mlp": 0.01032002, + "balance_loss_clip": 1.0429697, + "balance_loss_mlp": 1.0195744, + "epoch": 0.5637757402675485, + "flos": 55583322517440.0, + "grad_norm": 1.766089434337296, + "language_loss": 0.69443214, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.71597743, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12420654, + "step": 9377, + "time_per_iteration": 2.8699071407318115 + }, + { + "auxiliary_loss_clip": 0.01118011, + "auxiliary_loss_mlp": 0.01029123, + "balance_loss_clip": 1.04453397, + "balance_loss_mlp": 1.01793528, + "epoch": 0.5638358635202164, + "flos": 25485261694560.0, + "grad_norm": 27.534862286353107, + "language_loss": 0.74444044, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.76591176, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11187744, + "step": 9378, + "time_per_iteration": 2.6952908039093018 + }, + { + "auxiliary_loss_clip": 0.01124205, + "auxiliary_loss_mlp": 0.01032862, + "balance_loss_clip": 1.04226899, + "balance_loss_mlp": 1.0199573, + "epoch": 0.5638959867728844, + "flos": 22947702282720.0, + "grad_norm": 2.9433957736011203, + "language_loss": 0.82178009, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.84335077, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.12896729, + "step": 9379, + "time_per_iteration": 2.6225686073303223 + }, + { + "auxiliary_loss_clip": 0.01119602, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.04108775, + "balance_loss_mlp": 1.02007711, + "epoch": 0.5639561100255523, + "flos": 33544295817120.0, + "grad_norm": 2.6539441197698457, + "language_loss": 0.71166098, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.7331773, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.11956787, + "step": 9380, + "time_per_iteration": 4.0220866203308105 + }, + { + "auxiliary_loss_clip": 0.01122537, + "auxiliary_loss_mlp": 0.01035057, + "balance_loss_clip": 1.04364038, + "balance_loss_mlp": 1.02254045, + "epoch": 0.5640162332782204, + "flos": 22056935338080.0, + "grad_norm": 2.5755253373511047, + "language_loss": 0.74458623, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.76616216, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.12524414, + "step": 9381, + "time_per_iteration": 2.606084108352661 + }, + { + "auxiliary_loss_clip": 0.01122925, + "auxiliary_loss_mlp": 0.01033643, + "balance_loss_clip": 1.04388809, + "balance_loss_mlp": 1.02205551, + "epoch": 0.5640763565308883, + "flos": 25483924624320.0, + "grad_norm": 3.5161905801319167, + "language_loss": 0.72523868, + "learning_rate": 1.683342680176499e-06, + "loss": 0.74680436, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.11590576, + "step": 9382, + "time_per_iteration": 2.756955862045288 + }, + { + "auxiliary_loss_clip": 0.01039668, + "auxiliary_loss_mlp": 0.0100201, + "balance_loss_clip": 1.0146246, + "balance_loss_mlp": 1.0007869, + "epoch": 0.5641364797835563, + "flos": 78639638927040.0, + "grad_norm": 0.766973281761216, + "language_loss": 0.54370236, + "learning_rate": 1.682958136989022e-06, + "loss": 0.5641191, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 0.25048828, + "router_z_loss_mlp": 0.01222229, + "step": 9383, + "time_per_iteration": 3.384363889694214 + }, + { + "auxiliary_loss_clip": 0.01123429, + "auxiliary_loss_mlp": 0.01027002, + "balance_loss_clip": 1.04273069, + "balance_loss_mlp": 1.01462793, + "epoch": 0.5641966030362242, + "flos": 22769978376960.0, + "grad_norm": 4.213857813435202, + "language_loss": 0.70628005, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.72778434, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.1237793, + "step": 9384, + "time_per_iteration": 2.6286141872406006 + }, + { + "auxiliary_loss_clip": 0.01120791, + "auxiliary_loss_mlp": 0.0102805, + "balance_loss_clip": 1.04247141, + "balance_loss_mlp": 1.01567578, + "epoch": 0.5642567262888922, + "flos": 27445289317920.0, + "grad_norm": 2.414001088919258, + "language_loss": 0.76002824, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78151661, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12371826, + "step": 9385, + "time_per_iteration": 2.727710247039795 + }, + { + "auxiliary_loss_clip": 0.01117995, + "auxiliary_loss_mlp": 0.0103201, + "balance_loss_clip": 1.04197669, + "balance_loss_mlp": 1.02020264, + "epoch": 0.5643168495415603, + "flos": 15867061280640.0, + "grad_norm": 2.1958166552807996, + "language_loss": 0.82900578, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.85050583, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11804199, + "step": 9386, + "time_per_iteration": 2.6225807666778564 + }, + { + "auxiliary_loss_clip": 0.01126686, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.04578042, + "balance_loss_mlp": 1.02090394, + "epoch": 0.5643769727942282, + "flos": 22681298751840.0, + "grad_norm": 2.2529441120572624, + "language_loss": 0.69867736, + "learning_rate": 1.681420084607516e-06, + "loss": 0.72028005, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.12677002, + "step": 9387, + "time_per_iteration": 2.688485860824585 + }, + { + "auxiliary_loss_clip": 0.01122971, + "auxiliary_loss_mlp": 0.0102986, + "balance_loss_clip": 1.04315472, + "balance_loss_mlp": 1.01774812, + "epoch": 0.5644370960468962, + "flos": 41251123510560.0, + "grad_norm": 1.590549571829783, + "language_loss": 0.74712163, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.76864994, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12091064, + "step": 9388, + "time_per_iteration": 2.794935464859009 + }, + { + "auxiliary_loss_clip": 0.01116057, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.04135776, + "balance_loss_mlp": 1.02152026, + "epoch": 0.5644972192995641, + "flos": 25886528199360.0, + "grad_norm": 2.0284863065819274, + "language_loss": 0.82343793, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.84491861, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.10498047, + "step": 9389, + "time_per_iteration": 2.6632673740386963 + }, + { + "auxiliary_loss_clip": 0.01123104, + "auxiliary_loss_mlp": 0.01032647, + "balance_loss_clip": 1.04303086, + "balance_loss_mlp": 1.01941466, + "epoch": 0.5645573425522321, + "flos": 22680650475360.0, + "grad_norm": 2.693466272211732, + "language_loss": 0.63893533, + "learning_rate": 1.680266672116467e-06, + "loss": 0.66049284, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.13238525, + "step": 9390, + "time_per_iteration": 2.641143798828125 + }, + { + "auxiliary_loss_clip": 0.01120305, + "auxiliary_loss_mlp": 0.01025325, + "balance_loss_clip": 1.04486847, + "balance_loss_mlp": 1.01459074, + "epoch": 0.5646174658049, + "flos": 22102308341280.0, + "grad_norm": 2.0186502377837416, + "language_loss": 0.92251182, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.94396812, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.10742188, + "step": 9391, + "time_per_iteration": 2.6252589225769043 + }, + { + "auxiliary_loss_clip": 0.01126822, + "auxiliary_loss_mlp": 0.01033339, + "balance_loss_clip": 1.04463983, + "balance_loss_mlp": 1.0204643, + "epoch": 0.564677589057568, + "flos": 34568831769120.0, + "grad_norm": 2.0962824915447658, + "language_loss": 0.60873079, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.63033241, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.12884521, + "step": 9392, + "time_per_iteration": 2.704186201095581 + }, + { + "auxiliary_loss_clip": 0.01118342, + "auxiliary_loss_mlp": 0.01026143, + "balance_loss_clip": 1.04112673, + "balance_loss_mlp": 1.01304173, + "epoch": 0.564737712310236, + "flos": 27044346951360.0, + "grad_norm": 2.24854536668992, + "language_loss": 0.8088361, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.83028102, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13110352, + "step": 9393, + "time_per_iteration": 2.6316988468170166 + }, + { + "auxiliary_loss_clip": 0.01121806, + "auxiliary_loss_mlp": 0.01027994, + "balance_loss_clip": 1.04455638, + "balance_loss_mlp": 1.01601338, + "epoch": 0.564797835562904, + "flos": 25574305975200.0, + "grad_norm": 1.7975269973321917, + "language_loss": 0.87329376, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.89479172, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.11981201, + "step": 9394, + "time_per_iteration": 2.658965826034546 + }, + { + "auxiliary_loss_clip": 0.01121805, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.0462687, + "balance_loss_mlp": 1.01771903, + "epoch": 0.5648579588155719, + "flos": 21256063536960.0, + "grad_norm": 2.3775120166211945, + "language_loss": 0.84892803, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.87044185, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11865234, + "step": 9395, + "time_per_iteration": 2.6630187034606934 + }, + { + "auxiliary_loss_clip": 0.01037871, + "auxiliary_loss_mlp": 0.00999062, + "balance_loss_clip": 1.01285636, + "balance_loss_mlp": 0.99780917, + "epoch": 0.5649180820682399, + "flos": 85327967743200.0, + "grad_norm": 0.7974415607197128, + "language_loss": 0.58313346, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60350281, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.24975586, + "router_z_loss_mlp": 0.01252747, + "step": 9396, + "time_per_iteration": 3.260263681411743 + }, + { + "auxiliary_loss_clip": 0.01122377, + "auxiliary_loss_mlp": 0.01030587, + "balance_loss_clip": 1.04252648, + "balance_loss_mlp": 1.01863611, + "epoch": 0.5649782053209078, + "flos": 30472511238720.0, + "grad_norm": 5.139236814669637, + "language_loss": 0.69894338, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.72047299, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.11956787, + "step": 9397, + "time_per_iteration": 2.653573751449585 + }, + { + "auxiliary_loss_clip": 0.01121951, + "auxiliary_loss_mlp": 0.01033787, + "balance_loss_clip": 1.04260325, + "balance_loss_mlp": 1.02187228, + "epoch": 0.5650383285735758, + "flos": 26510851095840.0, + "grad_norm": 2.766976483194587, + "language_loss": 0.67044586, + "learning_rate": 1.67719144001275e-06, + "loss": 0.69200325, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.11920166, + "step": 9398, + "time_per_iteration": 2.6740360260009766 + }, + { + "auxiliary_loss_clip": 0.01037825, + "auxiliary_loss_mlp": 0.00999737, + "balance_loss_clip": 1.012833, + "balance_loss_mlp": 0.99847829, + "epoch": 0.5650984518262439, + "flos": 80417323674720.0, + "grad_norm": 0.764928156406885, + "language_loss": 0.58123446, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60161012, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.24987793, + "router_z_loss_mlp": 0.0125885, + "step": 9399, + "time_per_iteration": 3.182814359664917 + }, + { + "auxiliary_loss_clip": 0.01121218, + "auxiliary_loss_mlp": 0.01035813, + "balance_loss_clip": 1.04103494, + "balance_loss_mlp": 1.02221119, + "epoch": 0.5651585750789118, + "flos": 25664039049600.0, + "grad_norm": 1.874765864475941, + "language_loss": 0.732189, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.75375926, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13598633, + "step": 9400, + "time_per_iteration": 2.645841121673584 + }, + { + "auxiliary_loss_clip": 0.01124603, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.04382586, + "balance_loss_mlp": 1.01852751, + "epoch": 0.5652186983315798, + "flos": 22636898163360.0, + "grad_norm": 1.8658816962626588, + "language_loss": 0.60564995, + "learning_rate": 1.676038429548412e-06, + "loss": 0.62721479, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13354492, + "step": 9401, + "time_per_iteration": 2.782132148742676 + }, + { + "auxiliary_loss_clip": 0.01117608, + "auxiliary_loss_mlp": 0.01028182, + "balance_loss_clip": 1.04022074, + "balance_loss_mlp": 1.01667786, + "epoch": 0.5652788215842477, + "flos": 22547529744480.0, + "grad_norm": 2.0371971519911845, + "language_loss": 0.81415737, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.83561528, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.1151123, + "step": 9402, + "time_per_iteration": 2.669079303741455 + }, + { + "auxiliary_loss_clip": 0.01118416, + "auxiliary_loss_mlp": 0.01030795, + "balance_loss_clip": 1.04206371, + "balance_loss_mlp": 1.01946402, + "epoch": 0.5653389448369157, + "flos": 36660562018560.0, + "grad_norm": 1.4135460771007637, + "language_loss": 0.77654147, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.7980336, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11322021, + "step": 9403, + "time_per_iteration": 2.7275924682617188 + }, + { + "auxiliary_loss_clip": 0.01119871, + "auxiliary_loss_mlp": 0.01027154, + "balance_loss_clip": 1.04219711, + "balance_loss_mlp": 1.01467896, + "epoch": 0.5653990680895836, + "flos": 20410669595520.0, + "grad_norm": 2.1313792554223743, + "language_loss": 0.69526219, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.71673244, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12475586, + "step": 9404, + "time_per_iteration": 2.664202928543091 + }, + { + "auxiliary_loss_clip": 0.01115639, + "auxiliary_loss_mlp": 0.01029548, + "balance_loss_clip": 1.04030442, + "balance_loss_mlp": 1.01832438, + "epoch": 0.5654591913422516, + "flos": 17739138589920.0, + "grad_norm": 2.937472724040124, + "language_loss": 0.670645, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.69209683, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11224365, + "step": 9405, + "time_per_iteration": 2.6664066314697266 + }, + { + "auxiliary_loss_clip": 0.01116159, + "auxiliary_loss_mlp": 0.01032118, + "balance_loss_clip": 1.04349709, + "balance_loss_mlp": 1.02062011, + "epoch": 0.5655193145949196, + "flos": 31982131247040.0, + "grad_norm": 3.5873370454222684, + "language_loss": 0.74271744, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76420021, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1149292, + "step": 9406, + "time_per_iteration": 2.6895735263824463 + }, + { + "auxiliary_loss_clip": 0.01120637, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.04143596, + "balance_loss_mlp": 1.02091789, + "epoch": 0.5655794378475876, + "flos": 30562163278560.0, + "grad_norm": 3.912809698146503, + "language_loss": 0.79500222, + "learning_rate": 1.673732740698882e-06, + "loss": 0.81655204, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13433838, + "step": 9407, + "time_per_iteration": 4.190093755722046 + }, + { + "auxiliary_loss_clip": 0.01117716, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.04340029, + "balance_loss_mlp": 1.01898694, + "epoch": 0.5656395611002555, + "flos": 37866833087040.0, + "grad_norm": 1.4817212632866534, + "language_loss": 0.71153271, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.733015, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11541748, + "step": 9408, + "time_per_iteration": 3.991617441177368 + }, + { + "auxiliary_loss_clip": 0.01118718, + "auxiliary_loss_mlp": 0.01036572, + "balance_loss_clip": 1.04229629, + "balance_loss_mlp": 1.02410269, + "epoch": 0.5656996843529235, + "flos": 24684106272480.0, + "grad_norm": 2.0525917663049285, + "language_loss": 0.81747174, + "learning_rate": 1.672964276570308e-06, + "loss": 0.8390246, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12475586, + "step": 9409, + "time_per_iteration": 2.654500722885132 + }, + { + "auxiliary_loss_clip": 0.01118484, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.04149723, + "balance_loss_mlp": 1.01978385, + "epoch": 0.5657598076055914, + "flos": 25619678978400.0, + "grad_norm": 2.6083280538992115, + "language_loss": 0.78280854, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.80430716, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.11602783, + "step": 9410, + "time_per_iteration": 2.6591832637786865 + }, + { + "auxiliary_loss_clip": 0.01119482, + "auxiliary_loss_mlp": 0.01036601, + "balance_loss_clip": 1.04159236, + "balance_loss_mlp": 1.02466226, + "epoch": 0.5658199308582594, + "flos": 14087796359040.0, + "grad_norm": 2.145903417713964, + "language_loss": 0.83144808, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.85300893, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.11938477, + "step": 9411, + "time_per_iteration": 2.632230281829834 + }, + { + "auxiliary_loss_clip": 0.01121606, + "auxiliary_loss_mlp": 0.01032787, + "balance_loss_clip": 1.04061925, + "balance_loss_mlp": 1.0194658, + "epoch": 0.5658800541109275, + "flos": 17292499081920.0, + "grad_norm": 2.984322597918883, + "language_loss": 0.67532194, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.6968658, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13311768, + "step": 9412, + "time_per_iteration": 2.65863037109375 + }, + { + "auxiliary_loss_clip": 0.01114513, + "auxiliary_loss_mlp": 0.01027957, + "balance_loss_clip": 1.04111648, + "balance_loss_mlp": 1.0176034, + "epoch": 0.5659401773635954, + "flos": 33318484250400.0, + "grad_norm": 4.069210007944732, + "language_loss": 0.58443582, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.60586053, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.10357666, + "step": 9413, + "time_per_iteration": 2.754889965057373 + }, + { + "auxiliary_loss_clip": 0.01115462, + "auxiliary_loss_mlp": 0.0102918, + "balance_loss_clip": 1.04025269, + "balance_loss_mlp": 1.01790285, + "epoch": 0.5660003006162634, + "flos": 20411398906560.0, + "grad_norm": 1.5391695607026543, + "language_loss": 0.69134313, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.71278954, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11279297, + "step": 9414, + "time_per_iteration": 4.046090841293335 + }, + { + "auxiliary_loss_clip": 0.01113778, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.03934765, + "balance_loss_mlp": 1.01877069, + "epoch": 0.5660604238689313, + "flos": 26421604228800.0, + "grad_norm": 1.7651591890269442, + "language_loss": 0.78402334, + "learning_rate": 1.670659182280247e-06, + "loss": 0.80545855, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.10974121, + "step": 9415, + "time_per_iteration": 2.729480266571045 + }, + { + "auxiliary_loss_clip": 0.01037408, + "auxiliary_loss_mlp": 0.01007113, + "balance_loss_clip": 1.01245189, + "balance_loss_mlp": 1.00594139, + "epoch": 0.5661205471215993, + "flos": 83979905245920.0, + "grad_norm": 0.6823845898249975, + "language_loss": 0.49181214, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51225734, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.24926758, + "router_z_loss_mlp": 0.01170349, + "step": 9416, + "time_per_iteration": 3.3742570877075195 + }, + { + "auxiliary_loss_clip": 0.01119527, + "auxiliary_loss_mlp": 0.01033015, + "balance_loss_clip": 1.0421176, + "balance_loss_mlp": 1.02083826, + "epoch": 0.5661806703742672, + "flos": 34924927857120.0, + "grad_norm": 2.851354415723724, + "language_loss": 0.63121784, + "learning_rate": 1.6698909172706e-06, + "loss": 0.65274328, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12182617, + "step": 9417, + "time_per_iteration": 2.679943323135376 + }, + { + "auxiliary_loss_clip": 0.01119723, + "auxiliary_loss_mlp": 0.01028631, + "balance_loss_clip": 1.04146862, + "balance_loss_mlp": 1.01573217, + "epoch": 0.5662407936269352, + "flos": 26109827694720.0, + "grad_norm": 2.179069918550614, + "language_loss": 0.68774748, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.70923108, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12908936, + "step": 9418, + "time_per_iteration": 2.747295379638672 + }, + { + "auxiliary_loss_clip": 0.01117428, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.03989553, + "balance_loss_mlp": 1.01657784, + "epoch": 0.5663009168796032, + "flos": 31311341380800.0, + "grad_norm": 3.6521309953864662, + "language_loss": 0.64834195, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.66981626, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13409424, + "step": 9419, + "time_per_iteration": 2.68696928024292 + }, + { + "auxiliary_loss_clip": 0.01037442, + "auxiliary_loss_mlp": 0.01004299, + "balance_loss_clip": 1.012537, + "balance_loss_mlp": 1.00311613, + "epoch": 0.5663610401322712, + "flos": 82887799171680.0, + "grad_norm": 0.7362189308232571, + "language_loss": 0.59724021, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.6176576, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.24914551, + "router_z_loss_mlp": 0.01181793, + "step": 9420, + "time_per_iteration": 4.679775714874268 + }, + { + "auxiliary_loss_clip": 0.01114337, + "auxiliary_loss_mlp": 0.0102666, + "balance_loss_clip": 1.03999579, + "balance_loss_mlp": 1.01573455, + "epoch": 0.5664211633849391, + "flos": 30028586388480.0, + "grad_norm": 1.7444424806089562, + "language_loss": 0.74136901, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.762779, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.10925293, + "step": 9421, + "time_per_iteration": 2.724785327911377 + }, + { + "auxiliary_loss_clip": 0.01121795, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.042696, + "balance_loss_mlp": 1.02389765, + "epoch": 0.5664812866376071, + "flos": 14213867083200.0, + "grad_norm": 2.6033583798114774, + "language_loss": 0.72798824, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.74956572, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12054443, + "step": 9422, + "time_per_iteration": 2.592120885848999 + }, + { + "auxiliary_loss_clip": 0.01116911, + "auxiliary_loss_mlp": 0.01034953, + "balance_loss_clip": 1.04292524, + "balance_loss_mlp": 1.02409887, + "epoch": 0.566541409890275, + "flos": 30071001630240.0, + "grad_norm": 3.4402591215956746, + "language_loss": 0.82139432, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.84291297, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.10852051, + "step": 9423, + "time_per_iteration": 2.6760191917419434 + }, + { + "auxiliary_loss_clip": 0.01115038, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.04024732, + "balance_loss_mlp": 1.02142239, + "epoch": 0.566601533142943, + "flos": 27174955610880.0, + "grad_norm": 1.640006944558526, + "language_loss": 0.80821073, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.82969451, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11907959, + "step": 9424, + "time_per_iteration": 2.628509044647217 + }, + { + "auxiliary_loss_clip": 0.01122431, + "auxiliary_loss_mlp": 0.01029396, + "balance_loss_clip": 1.04237628, + "balance_loss_mlp": 1.01658738, + "epoch": 0.5666616563956111, + "flos": 36572166014400.0, + "grad_norm": 2.507700474587532, + "language_loss": 0.78615332, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.80767161, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12817383, + "step": 9425, + "time_per_iteration": 2.72796368598938 + }, + { + "auxiliary_loss_clip": 0.01119909, + "auxiliary_loss_mlp": 0.01033143, + "balance_loss_clip": 1.04373109, + "balance_loss_mlp": 1.02104306, + "epoch": 0.566721779648279, + "flos": 21697111660320.0, + "grad_norm": 1.9262855071742115, + "language_loss": 0.59023321, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.61176372, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12097168, + "step": 9426, + "time_per_iteration": 2.6099326610565186 + }, + { + "auxiliary_loss_clip": 0.01119932, + "auxiliary_loss_mlp": 0.01031411, + "balance_loss_clip": 1.04185176, + "balance_loss_mlp": 1.02004409, + "epoch": 0.566781902900947, + "flos": 25665051981600.0, + "grad_norm": 1.8700752950480066, + "language_loss": 0.81671238, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.83822584, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.11364746, + "step": 9427, + "time_per_iteration": 2.7004401683807373 + }, + { + "auxiliary_loss_clip": 0.01116413, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.04238105, + "balance_loss_mlp": 1.02127862, + "epoch": 0.5668420261536149, + "flos": 28246161119040.0, + "grad_norm": 2.826381512196398, + "language_loss": 0.86579669, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.88729441, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12078857, + "step": 9428, + "time_per_iteration": 2.694972515106201 + }, + { + "auxiliary_loss_clip": 0.01125633, + "auxiliary_loss_mlp": 0.01031689, + "balance_loss_clip": 1.04603505, + "balance_loss_mlp": 1.01946998, + "epoch": 0.5669021494062829, + "flos": 27578653152480.0, + "grad_norm": 5.191690954062009, + "language_loss": 0.73439914, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.75597239, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12225342, + "step": 9429, + "time_per_iteration": 2.713336944580078 + }, + { + "auxiliary_loss_clip": 0.01120564, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.04124045, + "balance_loss_mlp": 1.01773393, + "epoch": 0.5669622726589508, + "flos": 21208624152480.0, + "grad_norm": 1.9247007981745514, + "language_loss": 0.74872935, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.7702409, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12860107, + "step": 9430, + "time_per_iteration": 2.6581757068634033 + }, + { + "auxiliary_loss_clip": 0.01117499, + "auxiliary_loss_mlp": 0.01029294, + "balance_loss_clip": 1.04048216, + "balance_loss_mlp": 1.0176537, + "epoch": 0.5670223959116188, + "flos": 22895481859200.0, + "grad_norm": 35.7362353505095, + "language_loss": 0.72935289, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.75082088, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11639404, + "step": 9431, + "time_per_iteration": 2.648170232772827 + }, + { + "auxiliary_loss_clip": 0.01110207, + "auxiliary_loss_mlp": 0.01028933, + "balance_loss_clip": 1.03986001, + "balance_loss_mlp": 1.01860988, + "epoch": 0.5670825191642868, + "flos": 16537243387680.0, + "grad_norm": 1.8093682377421745, + "language_loss": 0.73562503, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.75701648, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10321045, + "step": 9432, + "time_per_iteration": 2.6608853340148926 + }, + { + "auxiliary_loss_clip": 0.01118453, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.0415988, + "balance_loss_mlp": 1.01996458, + "epoch": 0.5671426424169548, + "flos": 26910496909440.0, + "grad_norm": 1.6258202485895052, + "language_loss": 0.78069568, + "learning_rate": 1.663746609539197e-06, + "loss": 0.8021915, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.1116333, + "step": 9433, + "time_per_iteration": 2.6437599658966064 + }, + { + "auxiliary_loss_clip": 0.01120816, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.04163766, + "balance_loss_mlp": 1.02082872, + "epoch": 0.5672027656696227, + "flos": 26019648930240.0, + "grad_norm": 2.102563301888593, + "language_loss": 0.63523984, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65679502, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13885498, + "step": 9434, + "time_per_iteration": 2.6672189235687256 + }, + { + "auxiliary_loss_clip": 0.01116095, + "auxiliary_loss_mlp": 0.01025067, + "balance_loss_clip": 1.04141617, + "balance_loss_mlp": 1.01368237, + "epoch": 0.5672628889222907, + "flos": 28692071316000.0, + "grad_norm": 2.9296861587485927, + "language_loss": 0.66592735, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.68733895, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.1138916, + "step": 9435, + "time_per_iteration": 2.642688035964966 + }, + { + "auxiliary_loss_clip": 0.01115425, + "auxiliary_loss_mlp": 0.01033297, + "balance_loss_clip": 1.0401547, + "balance_loss_mlp": 1.02110147, + "epoch": 0.5673230121749586, + "flos": 33095063203200.0, + "grad_norm": 1.582286404931202, + "language_loss": 0.71538079, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.73686802, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12188721, + "step": 9436, + "time_per_iteration": 2.6688904762268066 + }, + { + "auxiliary_loss_clip": 0.01120354, + "auxiliary_loss_mlp": 0.01031552, + "balance_loss_clip": 1.04157472, + "balance_loss_mlp": 1.01960158, + "epoch": 0.5673831354276266, + "flos": 38000034852480.0, + "grad_norm": 1.6686696127236693, + "language_loss": 0.74115378, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.7626729, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.11962891, + "step": 9437, + "time_per_iteration": 2.721090793609619 + }, + { + "auxiliary_loss_clip": 0.0112389, + "auxiliary_loss_mlp": 0.01032637, + "balance_loss_clip": 1.04622447, + "balance_loss_mlp": 1.01994753, + "epoch": 0.5674432586802945, + "flos": 33767392726080.0, + "grad_norm": 2.9872688088374018, + "language_loss": 0.61021328, + "learning_rate": 1.661827179985277e-06, + "loss": 0.6317786, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12683105, + "step": 9438, + "time_per_iteration": 2.712372064590454 + }, + { + "auxiliary_loss_clip": 0.01117411, + "auxiliary_loss_mlp": 0.01029987, + "balance_loss_clip": 1.04004407, + "balance_loss_mlp": 1.01820314, + "epoch": 0.5675033819329626, + "flos": 32476291174080.0, + "grad_norm": 1.6808437712886068, + "language_loss": 0.7501477, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77162164, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.11791992, + "step": 9439, + "time_per_iteration": 2.662601947784424 + }, + { + "auxiliary_loss_clip": 0.01118069, + "auxiliary_loss_mlp": 0.01029102, + "balance_loss_clip": 1.04280996, + "balance_loss_mlp": 1.01663876, + "epoch": 0.5675635051856306, + "flos": 23303798370720.0, + "grad_norm": 2.1409905126782878, + "language_loss": 0.83786857, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.85934031, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12463379, + "step": 9440, + "time_per_iteration": 2.6495447158813477 + }, + { + "auxiliary_loss_clip": 0.01120616, + "auxiliary_loss_mlp": 0.01034716, + "balance_loss_clip": 1.0405755, + "balance_loss_mlp": 1.02163315, + "epoch": 0.5676236284382985, + "flos": 21434233132800.0, + "grad_norm": 2.1230477334169233, + "language_loss": 0.75434196, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.7758953, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13092041, + "step": 9441, + "time_per_iteration": 2.620405673980713 + }, + { + "auxiliary_loss_clip": 0.01117459, + "auxiliary_loss_mlp": 0.01033436, + "balance_loss_clip": 1.04135966, + "balance_loss_mlp": 1.02149081, + "epoch": 0.5676837516909665, + "flos": 19467479640960.0, + "grad_norm": 1.9114179184685645, + "language_loss": 0.83285868, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.85436755, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.1194458, + "step": 9442, + "time_per_iteration": 2.6149704456329346 + }, + { + "auxiliary_loss_clip": 0.01115254, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.04421556, + "balance_loss_mlp": 1.01922727, + "epoch": 0.5677438749436344, + "flos": 22324554387360.0, + "grad_norm": 1.9240833911857538, + "language_loss": 0.7464487, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.76790762, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11407471, + "step": 9443, + "time_per_iteration": 2.593928337097168 + }, + { + "auxiliary_loss_clip": 0.01119375, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.04239464, + "balance_loss_mlp": 1.02026331, + "epoch": 0.5678039981963025, + "flos": 21879616605120.0, + "grad_norm": 5.469240801022552, + "language_loss": 0.77691424, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.79843026, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.11962891, + "step": 9444, + "time_per_iteration": 2.625927686691284 + }, + { + "auxiliary_loss_clip": 0.01120197, + "auxiliary_loss_mlp": 0.01035037, + "balance_loss_clip": 1.04181802, + "balance_loss_mlp": 1.02299094, + "epoch": 0.5678641214489704, + "flos": 23569877763360.0, + "grad_norm": 1.8257810160198258, + "language_loss": 0.81351572, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.83506805, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12054443, + "step": 9445, + "time_per_iteration": 2.6123569011688232 + }, + { + "auxiliary_loss_clip": 0.01115889, + "auxiliary_loss_mlp": 0.01024286, + "balance_loss_clip": 1.03913665, + "balance_loss_mlp": 1.01222253, + "epoch": 0.5679242447016384, + "flos": 33861987874080.0, + "grad_norm": 1.3595682236598368, + "language_loss": 0.70710027, + "learning_rate": 1.658756760280259e-06, + "loss": 0.72850204, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.1206665, + "step": 9446, + "time_per_iteration": 4.241382598876953 + }, + { + "auxiliary_loss_clip": 0.01121938, + "auxiliary_loss_mlp": 0.01028378, + "balance_loss_clip": 1.04247248, + "balance_loss_mlp": 1.01627278, + "epoch": 0.5679843679543063, + "flos": 29003321125440.0, + "grad_norm": 2.277173808992219, + "language_loss": 0.73355126, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.75505441, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12115479, + "step": 9447, + "time_per_iteration": 4.0672900676727295 + }, + { + "auxiliary_loss_clip": 0.0112098, + "auxiliary_loss_mlp": 0.01028008, + "balance_loss_clip": 1.04238999, + "balance_loss_mlp": 1.01574779, + "epoch": 0.5680444912069743, + "flos": 31227766932960.0, + "grad_norm": 1.7507614375423828, + "language_loss": 0.75379992, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77528983, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12255859, + "step": 9448, + "time_per_iteration": 2.696061134338379 + }, + { + "auxiliary_loss_clip": 0.01125059, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.04587471, + "balance_loss_mlp": 1.01924527, + "epoch": 0.5681046144596422, + "flos": 28913912189280.0, + "grad_norm": 2.5540037086788825, + "language_loss": 0.76392889, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78549391, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12207031, + "step": 9449, + "time_per_iteration": 2.651505470275879 + }, + { + "auxiliary_loss_clip": 0.0111816, + "auxiliary_loss_mlp": 0.0103301, + "balance_loss_clip": 1.04082847, + "balance_loss_mlp": 1.02058828, + "epoch": 0.5681647377123102, + "flos": 34167200608800.0, + "grad_norm": 8.124645163001105, + "language_loss": 0.74852073, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.77003247, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12438965, + "step": 9450, + "time_per_iteration": 2.7256550788879395 + }, + { + "auxiliary_loss_clip": 0.01121479, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.04351223, + "balance_loss_mlp": 1.02231944, + "epoch": 0.5682248609649782, + "flos": 27756984817440.0, + "grad_norm": 1.8080562544498178, + "language_loss": 0.66861773, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.69016767, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.11181641, + "step": 9451, + "time_per_iteration": 2.6738104820251465 + }, + { + "auxiliary_loss_clip": 0.01124969, + "auxiliary_loss_mlp": 0.01032373, + "balance_loss_clip": 1.04110289, + "balance_loss_mlp": 1.01861024, + "epoch": 0.5682849842176462, + "flos": 25976382825600.0, + "grad_norm": 3.174989876878635, + "language_loss": 0.71929371, + "learning_rate": 1.656454488573026e-06, + "loss": 0.74086714, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 0.83886719, + "router_z_loss_mlp": 0.13757324, + "step": 9452, + "time_per_iteration": 2.7412798404693604 + }, + { + "auxiliary_loss_clip": 0.01115203, + "auxiliary_loss_mlp": 0.01028369, + "balance_loss_clip": 1.04033113, + "balance_loss_mlp": 1.01682949, + "epoch": 0.5683451074703142, + "flos": 25797200297760.0, + "grad_norm": 3.7871592794237614, + "language_loss": 0.70156258, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72299832, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11541748, + "step": 9453, + "time_per_iteration": 4.067154407501221 + }, + { + "auxiliary_loss_clip": 0.01119769, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.04184628, + "balance_loss_mlp": 1.02007699, + "epoch": 0.5684052307229821, + "flos": 27265053340800.0, + "grad_norm": 2.3671404814954067, + "language_loss": 0.69820899, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.71972257, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.11499023, + "step": 9454, + "time_per_iteration": 2.6615681648254395 + }, + { + "auxiliary_loss_clip": 0.01112845, + "auxiliary_loss_mlp": 0.01028158, + "balance_loss_clip": 1.03833759, + "balance_loss_mlp": 1.01723874, + "epoch": 0.5684653539756501, + "flos": 26598841927200.0, + "grad_norm": 4.619262008491141, + "language_loss": 0.60290408, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.62431407, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.10913086, + "step": 9455, + "time_per_iteration": 2.674494504928589 + }, + { + "auxiliary_loss_clip": 0.01125076, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.04495692, + "balance_loss_mlp": 1.01693726, + "epoch": 0.568525477228318, + "flos": 28064669106240.0, + "grad_norm": 2.518301702177166, + "language_loss": 0.73438382, + "learning_rate": 1.6549199011198e-06, + "loss": 0.75592375, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.11975098, + "step": 9456, + "time_per_iteration": 2.626828193664551 + }, + { + "auxiliary_loss_clip": 0.01118826, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.04214716, + "balance_loss_mlp": 1.01934958, + "epoch": 0.568585600480986, + "flos": 26103669068160.0, + "grad_norm": 1.6781342326397668, + "language_loss": 0.76969755, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.79118931, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11010742, + "step": 9457, + "time_per_iteration": 2.701617956161499 + }, + { + "auxiliary_loss_clip": 0.01118548, + "auxiliary_loss_mlp": 0.01031356, + "balance_loss_clip": 1.04007125, + "balance_loss_mlp": 1.01836824, + "epoch": 0.568645723733654, + "flos": 36615796774560.0, + "grad_norm": 9.70592615542248, + "language_loss": 0.66207671, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.68357575, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12976074, + "step": 9458, + "time_per_iteration": 2.700605630874634 + }, + { + "auxiliary_loss_clip": 0.01119859, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.04106963, + "balance_loss_mlp": 1.01642752, + "epoch": 0.568705846986322, + "flos": 24907527319680.0, + "grad_norm": 2.2407586289327552, + "language_loss": 0.67930388, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.70079315, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12634277, + "step": 9459, + "time_per_iteration": 3.888411045074463 + }, + { + "auxiliary_loss_clip": 0.01122324, + "auxiliary_loss_mlp": 0.01029249, + "balance_loss_clip": 1.04315042, + "balance_loss_mlp": 1.01707828, + "epoch": 0.5687659702389899, + "flos": 21300545160000.0, + "grad_norm": 2.532898783611353, + "language_loss": 0.7675817, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.78909743, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.1217041, + "step": 9460, + "time_per_iteration": 2.701080799102783 + }, + { + "auxiliary_loss_clip": 0.01119131, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_clip": 1.04106283, + "balance_loss_mlp": 1.01897907, + "epoch": 0.5688260934916579, + "flos": 31001347607040.0, + "grad_norm": 2.0604241772323144, + "language_loss": 0.71517086, + "learning_rate": 1.65300196133547e-06, + "loss": 0.73667717, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12518311, + "step": 9461, + "time_per_iteration": 2.693993091583252 + }, + { + "auxiliary_loss_clip": 0.01118203, + "auxiliary_loss_mlp": 0.0102915, + "balance_loss_clip": 1.0409013, + "balance_loss_mlp": 1.01691961, + "epoch": 0.5688862167443258, + "flos": 26366993285760.0, + "grad_norm": 2.2046522176993784, + "language_loss": 0.73089987, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.7523734, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12225342, + "step": 9462, + "time_per_iteration": 2.658769130706787 + }, + { + "auxiliary_loss_clip": 0.01113517, + "auxiliary_loss_mlp": 0.01023321, + "balance_loss_clip": 1.04009688, + "balance_loss_mlp": 1.01286006, + "epoch": 0.5689463399969938, + "flos": 27355839864480.0, + "grad_norm": 2.4160674339823665, + "language_loss": 0.72660768, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.74797606, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.10467529, + "step": 9463, + "time_per_iteration": 2.6513092517852783 + }, + { + "auxiliary_loss_clip": 0.01115637, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.03887904, + "balance_loss_mlp": 1.01898909, + "epoch": 0.5690064632496618, + "flos": 22325567319360.0, + "grad_norm": 2.9243822485585964, + "language_loss": 0.74114388, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.76261133, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12121582, + "step": 9464, + "time_per_iteration": 2.6717529296875 + }, + { + "auxiliary_loss_clip": 0.01118448, + "auxiliary_loss_mlp": 0.01033864, + "balance_loss_clip": 1.04133844, + "balance_loss_mlp": 1.02225888, + "epoch": 0.5690665865023298, + "flos": 26331182360640.0, + "grad_norm": 1.9537600128968036, + "language_loss": 0.84115285, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.86267596, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.11608887, + "step": 9465, + "time_per_iteration": 2.789022207260132 + }, + { + "auxiliary_loss_clip": 0.01111426, + "auxiliary_loss_mlp": 0.01027024, + "balance_loss_clip": 1.03755391, + "balance_loss_mlp": 1.0156635, + "epoch": 0.5691267097549978, + "flos": 29799776543040.0, + "grad_norm": 1.6445077677751632, + "language_loss": 0.72311759, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74450207, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11358643, + "step": 9466, + "time_per_iteration": 2.7041213512420654 + }, + { + "auxiliary_loss_clip": 0.0103706, + "auxiliary_loss_mlp": 0.01006419, + "balance_loss_clip": 1.01276422, + "balance_loss_mlp": 1.00502849, + "epoch": 0.5691868330076657, + "flos": 75233272936320.0, + "grad_norm": 0.7139525083058649, + "language_loss": 0.55352634, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57396114, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.24291992, + "router_z_loss_mlp": 0.01390076, + "step": 9467, + "time_per_iteration": 3.3303186893463135 + }, + { + "auxiliary_loss_clip": 0.01120063, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.0407418, + "balance_loss_mlp": 1.01852477, + "epoch": 0.5692469562603337, + "flos": 26021026517760.0, + "grad_norm": 2.141201293603086, + "language_loss": 0.63873208, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.66024917, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.13122559, + "step": 9468, + "time_per_iteration": 2.6243176460266113 + }, + { + "auxiliary_loss_clip": 0.01116891, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.04118443, + "balance_loss_mlp": 1.01566291, + "epoch": 0.5693070795130016, + "flos": 28513739651040.0, + "grad_norm": 2.6464590282121865, + "language_loss": 0.78912574, + "learning_rate": 1.64993394266317e-06, + "loss": 0.8105706, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.1194458, + "step": 9469, + "time_per_iteration": 2.6909945011138916 + }, + { + "auxiliary_loss_clip": 0.01121485, + "auxiliary_loss_mlp": 0.01039962, + "balance_loss_clip": 1.04117763, + "balance_loss_mlp": 1.0274334, + "epoch": 0.5693672027656697, + "flos": 22814297930880.0, + "grad_norm": 2.760977755251231, + "language_loss": 0.68971568, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.71133018, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.12530518, + "step": 9470, + "time_per_iteration": 2.609102725982666 + }, + { + "auxiliary_loss_clip": 0.01118072, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.0413878, + "balance_loss_mlp": 1.02169883, + "epoch": 0.5694273260183376, + "flos": 24952089977280.0, + "grad_norm": 1.9396963325146173, + "language_loss": 0.74598444, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.76750171, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11938477, + "step": 9471, + "time_per_iteration": 2.6416714191436768 + }, + { + "auxiliary_loss_clip": 0.01116361, + "auxiliary_loss_mlp": 0.0102931, + "balance_loss_clip": 1.04141712, + "balance_loss_mlp": 1.01754975, + "epoch": 0.5694874492710056, + "flos": 21479200963200.0, + "grad_norm": 3.89609941668545, + "language_loss": 0.57122999, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.59268677, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11743164, + "step": 9472, + "time_per_iteration": 2.600365161895752 + }, + { + "auxiliary_loss_clip": 0.01115057, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.04165411, + "balance_loss_mlp": 1.01874602, + "epoch": 0.5695475725236735, + "flos": 16314551651520.0, + "grad_norm": 1.8699707662222926, + "language_loss": 0.74125016, + "learning_rate": 1.648400251450638e-06, + "loss": 0.76270211, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11395264, + "step": 9473, + "time_per_iteration": 2.704279899597168 + }, + { + "auxiliary_loss_clip": 0.01036387, + "auxiliary_loss_mlp": 0.01002255, + "balance_loss_clip": 1.01214516, + "balance_loss_mlp": 1.00097823, + "epoch": 0.5696076957763415, + "flos": 83186529141600.0, + "grad_norm": 0.6657222262753648, + "language_loss": 0.57617867, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59656513, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.24243164, + "router_z_loss_mlp": 0.01276398, + "step": 9474, + "time_per_iteration": 3.3521878719329834 + }, + { + "auxiliary_loss_clip": 0.01116724, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.04240155, + "balance_loss_mlp": 1.02121854, + "epoch": 0.5696678190290094, + "flos": 41290621508160.0, + "grad_norm": 1.8148777136404906, + "language_loss": 0.53877735, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.56028092, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12408447, + "step": 9475, + "time_per_iteration": 2.8275673389434814 + }, + { + "auxiliary_loss_clip": 0.01119899, + "auxiliary_loss_mlp": 0.01035359, + "balance_loss_clip": 1.04254699, + "balance_loss_mlp": 1.02315819, + "epoch": 0.5697279422816774, + "flos": 32161070671200.0, + "grad_norm": 1.798557878766222, + "language_loss": 0.79503715, + "learning_rate": 1.647250122983675e-06, + "loss": 0.81658971, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12194824, + "step": 9476, + "time_per_iteration": 2.7324492931365967 + }, + { + "auxiliary_loss_clip": 0.01123333, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.04529476, + "balance_loss_mlp": 1.0215857, + "epoch": 0.5697880655343454, + "flos": 27979960174560.0, + "grad_norm": 2.059807782515903, + "language_loss": 0.66147166, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.68303967, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.11883545, + "step": 9477, + "time_per_iteration": 2.7278780937194824 + }, + { + "auxiliary_loss_clip": 0.01117731, + "auxiliary_loss_mlp": 0.01030949, + "balance_loss_clip": 1.0406338, + "balance_loss_mlp": 1.01826525, + "epoch": 0.5698481887870134, + "flos": 32654622839040.0, + "grad_norm": 2.053546953921474, + "language_loss": 0.70900077, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.73048759, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12677002, + "step": 9478, + "time_per_iteration": 2.7150514125823975 + }, + { + "auxiliary_loss_clip": 0.01111812, + "auxiliary_loss_mlp": 0.01028482, + "balance_loss_clip": 1.04080427, + "balance_loss_mlp": 1.01778889, + "epoch": 0.5699083120396814, + "flos": 19208369220480.0, + "grad_norm": 1.74364633100196, + "language_loss": 0.69598472, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71738768, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10705566, + "step": 9479, + "time_per_iteration": 2.6436381340026855 + }, + { + "auxiliary_loss_clip": 0.01111573, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.03944612, + "balance_loss_mlp": 1.02082288, + "epoch": 0.5699684352923493, + "flos": 23839279572960.0, + "grad_norm": 1.4067967714549836, + "language_loss": 0.71267509, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.73411006, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11102295, + "step": 9480, + "time_per_iteration": 2.704136848449707 + }, + { + "auxiliary_loss_clip": 0.01115932, + "auxiliary_loss_mlp": 0.01031127, + "balance_loss_clip": 1.04027891, + "balance_loss_mlp": 1.01905727, + "epoch": 0.5700285585450173, + "flos": 19832003323200.0, + "grad_norm": 2.9917023556475626, + "language_loss": 0.72095907, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.74242961, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.1206665, + "step": 9481, + "time_per_iteration": 2.6345579624176025 + }, + { + "auxiliary_loss_clip": 0.01119982, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.04363489, + "balance_loss_mlp": 1.02006137, + "epoch": 0.5700886817976852, + "flos": 24239249524800.0, + "grad_norm": 10.445087422962475, + "language_loss": 0.78433573, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.80585408, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.11785889, + "step": 9482, + "time_per_iteration": 2.6742148399353027 + }, + { + "auxiliary_loss_clip": 0.0111611, + "auxiliary_loss_mlp": 0.01026764, + "balance_loss_clip": 1.04074693, + "balance_loss_mlp": 1.01545048, + "epoch": 0.5701488050503533, + "flos": 29091960233280.0, + "grad_norm": 1.5998538528880502, + "language_loss": 0.77816403, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.79959273, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11297607, + "step": 9483, + "time_per_iteration": 2.65081787109375 + }, + { + "auxiliary_loss_clip": 0.01116022, + "auxiliary_loss_mlp": 0.01039062, + "balance_loss_clip": 1.04016209, + "balance_loss_mlp": 1.02770734, + "epoch": 0.5702089283030212, + "flos": 28869795221760.0, + "grad_norm": 1.6935164339839988, + "language_loss": 0.81085968, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.83241051, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11358643, + "step": 9484, + "time_per_iteration": 2.7141778469085693 + }, + { + "auxiliary_loss_clip": 0.01116721, + "auxiliary_loss_mlp": 0.01029814, + "balance_loss_clip": 1.04041624, + "balance_loss_mlp": 1.01778531, + "epoch": 0.5702690515556892, + "flos": 34033472118720.0, + "grad_norm": 2.2419061909915667, + "language_loss": 0.60558283, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.62704819, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12036133, + "step": 9485, + "time_per_iteration": 2.784961462020874 + }, + { + "auxiliary_loss_clip": 0.01116966, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.03990543, + "balance_loss_mlp": 1.01827753, + "epoch": 0.5703291748083571, + "flos": 29314368348480.0, + "grad_norm": 2.203569280109433, + "language_loss": 0.65328574, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.67475629, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11810303, + "step": 9486, + "time_per_iteration": 4.1860504150390625 + }, + { + "auxiliary_loss_clip": 0.01036023, + "auxiliary_loss_mlp": 0.00999788, + "balance_loss_clip": 1.01186752, + "balance_loss_mlp": 0.99848557, + "epoch": 0.5703892980610251, + "flos": 69577948183680.0, + "grad_norm": 0.6633319330657862, + "language_loss": 0.47980011, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50015819, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.24157715, + "router_z_loss_mlp": 0.01303101, + "step": 9487, + "time_per_iteration": 4.9805285930633545 + }, + { + "auxiliary_loss_clip": 0.01117894, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.04214764, + "balance_loss_mlp": 1.01888442, + "epoch": 0.570449421313693, + "flos": 29713608989280.0, + "grad_norm": 1.6963048795754918, + "language_loss": 0.8562032, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.8776896, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11877441, + "step": 9488, + "time_per_iteration": 2.8999381065368652 + }, + { + "auxiliary_loss_clip": 0.01119227, + "auxiliary_loss_mlp": 0.0102824, + "balance_loss_clip": 1.04001725, + "balance_loss_mlp": 1.01625919, + "epoch": 0.570509544566361, + "flos": 30296043368640.0, + "grad_norm": 1.491605110679431, + "language_loss": 0.79053319, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81200778, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.11987305, + "step": 9489, + "time_per_iteration": 2.699207067489624 + }, + { + "auxiliary_loss_clip": 0.01118004, + "auxiliary_loss_mlp": 0.01028885, + "balance_loss_clip": 1.04217863, + "balance_loss_mlp": 1.01804256, + "epoch": 0.570569667819029, + "flos": 26109098383680.0, + "grad_norm": 1.9137504832037107, + "language_loss": 0.70127362, + "learning_rate": 1.641884454927604e-06, + "loss": 0.72274256, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.10845947, + "step": 9490, + "time_per_iteration": 2.7406744956970215 + }, + { + "auxiliary_loss_clip": 0.01115692, + "auxiliary_loss_mlp": 0.01032582, + "balance_loss_clip": 1.04097247, + "balance_loss_mlp": 1.02101886, + "epoch": 0.570629791071697, + "flos": 28329776084160.0, + "grad_norm": 1.954274834171221, + "language_loss": 0.76156628, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78304905, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11560059, + "step": 9491, + "time_per_iteration": 2.6694743633270264 + }, + { + "auxiliary_loss_clip": 0.01035446, + "auxiliary_loss_mlp": 0.01003152, + "balance_loss_clip": 1.01119292, + "balance_loss_mlp": 1.00192213, + "epoch": 0.570689914324365, + "flos": 79660933496640.0, + "grad_norm": 0.8006882541604087, + "language_loss": 0.57418501, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59457093, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.24255371, + "router_z_loss_mlp": 0.01229095, + "step": 9492, + "time_per_iteration": 3.2195143699645996 + }, + { + "auxiliary_loss_clip": 0.01117629, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.0421561, + "balance_loss_mlp": 1.01953173, + "epoch": 0.5707500375770329, + "flos": 25797443401440.0, + "grad_norm": 3.891071383670592, + "language_loss": 0.72025734, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.74174929, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12023926, + "step": 9493, + "time_per_iteration": 4.102849721908569 + }, + { + "auxiliary_loss_clip": 0.01119745, + "auxiliary_loss_mlp": 0.0102669, + "balance_loss_clip": 1.04116964, + "balance_loss_mlp": 1.01513314, + "epoch": 0.5708101608297009, + "flos": 25395852758400.0, + "grad_norm": 1.6902549031566485, + "language_loss": 0.77712309, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.79858744, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.11566162, + "step": 9494, + "time_per_iteration": 2.6425678730010986 + }, + { + "auxiliary_loss_clip": 0.0112281, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.04211843, + "balance_loss_mlp": 1.01964402, + "epoch": 0.5708702840823688, + "flos": 31496642017920.0, + "grad_norm": 2.3248750865303047, + "language_loss": 0.80778563, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.82933569, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.12542725, + "step": 9495, + "time_per_iteration": 2.709749460220337 + }, + { + "auxiliary_loss_clip": 0.0112541, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.04389668, + "balance_loss_mlp": 1.02434468, + "epoch": 0.5709304073350369, + "flos": 28861043489280.0, + "grad_norm": 3.333237762020226, + "language_loss": 0.66618574, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.68782175, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 0.81591797, + "router_z_loss_mlp": 0.13861084, + "step": 9496, + "time_per_iteration": 2.6359329223632812 + }, + { + "auxiliary_loss_clip": 0.0112197, + "auxiliary_loss_mlp": 0.0103413, + "balance_loss_clip": 1.04256582, + "balance_loss_mlp": 1.02167881, + "epoch": 0.5709905305877048, + "flos": 19654076831040.0, + "grad_norm": 2.5436100810866527, + "language_loss": 0.69375813, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.7153191, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12463379, + "step": 9497, + "time_per_iteration": 2.7002780437469482 + }, + { + "auxiliary_loss_clip": 0.011202, + "auxiliary_loss_mlp": 0.01031201, + "balance_loss_clip": 1.04116678, + "balance_loss_mlp": 1.01840377, + "epoch": 0.5710506538403728, + "flos": 30201610289760.0, + "grad_norm": 2.6051653444885403, + "language_loss": 0.81380844, + "learning_rate": 1.638819551358182e-06, + "loss": 0.83532238, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12799072, + "step": 9498, + "time_per_iteration": 3.9189858436584473 + }, + { + "auxiliary_loss_clip": 0.01120844, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.04226589, + "balance_loss_mlp": 1.0179925, + "epoch": 0.5711107770930407, + "flos": 26822789699040.0, + "grad_norm": 2.1366969029945917, + "language_loss": 0.66635942, + "learning_rate": 1.638436499891469e-06, + "loss": 0.68788075, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.13287354, + "step": 9499, + "time_per_iteration": 2.636384963989258 + }, + { + "auxiliary_loss_clip": 0.011196, + "auxiliary_loss_mlp": 0.0103344, + "balance_loss_clip": 1.04353082, + "balance_loss_mlp": 1.02169752, + "epoch": 0.5711709003457087, + "flos": 23884085334240.0, + "grad_norm": 1.7451233065833331, + "language_loss": 0.71779943, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.73932981, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11737061, + "step": 9500, + "time_per_iteration": 2.653826951980591 + }, + { + "auxiliary_loss_clip": 0.011198, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.0405407, + "balance_loss_mlp": 1.01998305, + "epoch": 0.5712310235983766, + "flos": 29581298604000.0, + "grad_norm": 2.1301888729727443, + "language_loss": 0.76652056, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.78804314, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12475586, + "step": 9501, + "time_per_iteration": 2.6439568996429443 + }, + { + "auxiliary_loss_clip": 0.01120001, + "auxiliary_loss_mlp": 0.01034209, + "balance_loss_clip": 1.04113007, + "balance_loss_mlp": 1.02274108, + "epoch": 0.5712911468510447, + "flos": 25619557426560.0, + "grad_norm": 2.667359152321044, + "language_loss": 0.74734694, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.76888907, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.11474609, + "step": 9502, + "time_per_iteration": 2.671764612197876 + }, + { + "auxiliary_loss_clip": 0.01117306, + "auxiliary_loss_mlp": 0.01028749, + "balance_loss_clip": 1.04173291, + "balance_loss_mlp": 1.01712632, + "epoch": 0.5713512701037126, + "flos": 23082362670240.0, + "grad_norm": 3.7987525725780116, + "language_loss": 0.82152259, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84298313, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11639404, + "step": 9503, + "time_per_iteration": 2.656479597091675 + }, + { + "auxiliary_loss_clip": 0.01116769, + "auxiliary_loss_mlp": 0.01029689, + "balance_loss_clip": 1.04118598, + "balance_loss_mlp": 1.01851892, + "epoch": 0.5714113933563806, + "flos": 21248324736480.0, + "grad_norm": 2.1774998324973893, + "language_loss": 0.85967314, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.88113773, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11175537, + "step": 9504, + "time_per_iteration": 2.6919138431549072 + }, + { + "auxiliary_loss_clip": 0.01116933, + "auxiliary_loss_mlp": 0.01026618, + "balance_loss_clip": 1.04230809, + "balance_loss_mlp": 1.01475644, + "epoch": 0.5714715166090486, + "flos": 24639948787680.0, + "grad_norm": 1.9222480604232872, + "language_loss": 0.75475973, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.77619529, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11865234, + "step": 9505, + "time_per_iteration": 2.666808843612671 + }, + { + "auxiliary_loss_clip": 0.01118155, + "auxiliary_loss_mlp": 0.01030037, + "balance_loss_clip": 1.04193521, + "balance_loss_mlp": 1.01908207, + "epoch": 0.5715316398617165, + "flos": 22636614542400.0, + "grad_norm": 1.679082945978817, + "language_loss": 0.81475663, + "learning_rate": 1.635755524332509e-06, + "loss": 0.8362385, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.10949707, + "step": 9506, + "time_per_iteration": 2.630849599838257 + }, + { + "auxiliary_loss_clip": 0.01118135, + "auxiliary_loss_mlp": 0.01026879, + "balance_loss_clip": 1.04205883, + "balance_loss_mlp": 1.01534534, + "epoch": 0.5715917631143845, + "flos": 22547003019840.0, + "grad_norm": 1.799707759263742, + "language_loss": 0.77602041, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.79747057, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11529541, + "step": 9507, + "time_per_iteration": 2.6443960666656494 + }, + { + "auxiliary_loss_clip": 0.01120885, + "auxiliary_loss_mlp": 0.0103822, + "balance_loss_clip": 1.04264081, + "balance_loss_mlp": 1.02547634, + "epoch": 0.5716518863670524, + "flos": 29309951964960.0, + "grad_norm": 1.8162823682000628, + "language_loss": 0.68505377, + "learning_rate": 1.63498965540751e-06, + "loss": 0.70664489, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12744141, + "step": 9508, + "time_per_iteration": 2.6880574226379395 + }, + { + "auxiliary_loss_clip": 0.01120643, + "auxiliary_loss_mlp": 0.01028655, + "balance_loss_clip": 1.04170418, + "balance_loss_mlp": 1.01622176, + "epoch": 0.5717120096197205, + "flos": 21743376043680.0, + "grad_norm": 2.3682562308732393, + "language_loss": 0.7977941, + "learning_rate": 1.634606741699593e-06, + "loss": 0.81928706, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12445068, + "step": 9509, + "time_per_iteration": 2.6584208011627197 + }, + { + "auxiliary_loss_clip": 0.01116879, + "auxiliary_loss_mlp": 0.01036005, + "balance_loss_clip": 1.04181695, + "balance_loss_mlp": 1.02355409, + "epoch": 0.5717721328723884, + "flos": 26681606029440.0, + "grad_norm": 2.040434985083235, + "language_loss": 0.72103643, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.74256527, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12451172, + "step": 9510, + "time_per_iteration": 2.6187193393707275 + }, + { + "auxiliary_loss_clip": 0.01118935, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.04235554, + "balance_loss_mlp": 1.01966774, + "epoch": 0.5718322561250564, + "flos": 34700372326080.0, + "grad_norm": 1.4591656620161582, + "language_loss": 0.69777352, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71927297, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.11358643, + "step": 9511, + "time_per_iteration": 2.78212571144104 + }, + { + "auxiliary_loss_clip": 0.01118086, + "auxiliary_loss_mlp": 0.01035196, + "balance_loss_clip": 1.04192376, + "balance_loss_mlp": 1.02360892, + "epoch": 0.5718923793777243, + "flos": 16535541661920.0, + "grad_norm": 2.723205677697021, + "language_loss": 0.61587769, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.63741052, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11584473, + "step": 9512, + "time_per_iteration": 2.608992338180542 + }, + { + "auxiliary_loss_clip": 0.01117511, + "auxiliary_loss_mlp": 0.01028334, + "balance_loss_clip": 1.04225206, + "balance_loss_mlp": 1.01715839, + "epoch": 0.5719525026303923, + "flos": 21746252770560.0, + "grad_norm": 4.510538230148088, + "language_loss": 0.76388097, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.78533942, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11175537, + "step": 9513, + "time_per_iteration": 2.667597770690918 + }, + { + "auxiliary_loss_clip": 0.01038051, + "auxiliary_loss_mlp": 0.01003003, + "balance_loss_clip": 1.01390767, + "balance_loss_mlp": 1.00162077, + "epoch": 0.5720126258830602, + "flos": 74797040018880.0, + "grad_norm": 0.890014029438249, + "language_loss": 0.6684922, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.6889028, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.24145508, + "router_z_loss_mlp": 0.01382446, + "step": 9514, + "time_per_iteration": 3.2324206829071045 + }, + { + "auxiliary_loss_clip": 0.01122055, + "auxiliary_loss_mlp": 0.01041595, + "balance_loss_clip": 1.04348135, + "balance_loss_mlp": 1.02908969, + "epoch": 0.5720727491357283, + "flos": 29270494484640.0, + "grad_norm": 2.291799536698828, + "language_loss": 0.81073564, + "learning_rate": 1.63230955093099e-06, + "loss": 0.83237213, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12512207, + "step": 9515, + "time_per_iteration": 2.6898343563079834 + }, + { + "auxiliary_loss_clip": 0.01114401, + "auxiliary_loss_mlp": 0.01028145, + "balance_loss_clip": 1.04007268, + "balance_loss_mlp": 1.01647401, + "epoch": 0.5721328723883962, + "flos": 28559436792480.0, + "grad_norm": 2.0037945571673736, + "language_loss": 0.85899836, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88042378, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11669922, + "step": 9516, + "time_per_iteration": 2.710301399230957 + }, + { + "auxiliary_loss_clip": 0.01117646, + "auxiliary_loss_mlp": 0.01026147, + "balance_loss_clip": 1.04207385, + "balance_loss_mlp": 1.01371932, + "epoch": 0.5721929956410642, + "flos": 22945838487840.0, + "grad_norm": 4.652035120675765, + "language_loss": 0.87859952, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.90003741, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12426758, + "step": 9517, + "time_per_iteration": 2.6812965869903564 + }, + { + "auxiliary_loss_clip": 0.01115811, + "auxiliary_loss_mlp": 0.01028936, + "balance_loss_clip": 1.04015851, + "balance_loss_mlp": 1.01674736, + "epoch": 0.5722531188937322, + "flos": 33186092830560.0, + "grad_norm": 2.9956935273596965, + "language_loss": 0.85332286, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.87477028, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12213135, + "step": 9518, + "time_per_iteration": 2.75742244720459 + }, + { + "auxiliary_loss_clip": 0.01114721, + "auxiliary_loss_mlp": 0.01030618, + "balance_loss_clip": 1.04083467, + "balance_loss_mlp": 1.01898921, + "epoch": 0.5723132421464001, + "flos": 18541306944000.0, + "grad_norm": 1.727923280266352, + "language_loss": 0.78748238, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.80893576, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11639404, + "step": 9519, + "time_per_iteration": 2.6507837772369385 + }, + { + "auxiliary_loss_clip": 0.01115603, + "auxiliary_loss_mlp": 0.01027741, + "balance_loss_clip": 1.04068065, + "balance_loss_mlp": 1.01613653, + "epoch": 0.5723733653990681, + "flos": 33677457065280.0, + "grad_norm": 1.993275796110992, + "language_loss": 0.83027798, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.85171145, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11608887, + "step": 9520, + "time_per_iteration": 2.8422205448150635 + }, + { + "auxiliary_loss_clip": 0.01119949, + "auxiliary_loss_mlp": 0.01041408, + "balance_loss_clip": 1.04230428, + "balance_loss_mlp": 1.02919531, + "epoch": 0.572433488651736, + "flos": 22235429072160.0, + "grad_norm": 3.810533683671871, + "language_loss": 0.72339487, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74500841, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12213135, + "step": 9521, + "time_per_iteration": 2.669360399246216 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.04003882, + "balance_loss_mlp": 1.02103162, + "epoch": 0.5724936119044041, + "flos": 38130967650240.0, + "grad_norm": 1.94028643135802, + "language_loss": 0.78137392, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.80286002, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11755371, + "step": 9522, + "time_per_iteration": 2.8567922115325928 + }, + { + "auxiliary_loss_clip": 0.01112548, + "auxiliary_loss_mlp": 0.01030031, + "balance_loss_clip": 1.03987837, + "balance_loss_mlp": 1.01957667, + "epoch": 0.572553735157072, + "flos": 23431003578720.0, + "grad_norm": 1.809283638766983, + "language_loss": 0.72058082, + "learning_rate": 1.629247411248102e-06, + "loss": 0.74200654, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10455322, + "step": 9523, + "time_per_iteration": 2.614680051803589 + }, + { + "auxiliary_loss_clip": 0.01114293, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.03980923, + "balance_loss_mlp": 1.01610208, + "epoch": 0.57261385840974, + "flos": 25886528199360.0, + "grad_norm": 1.7526102111530308, + "language_loss": 0.69979531, + "learning_rate": 1.628864706900738e-06, + "loss": 0.72120941, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11016846, + "step": 9524, + "time_per_iteration": 2.659316062927246 + }, + { + "auxiliary_loss_clip": 0.01116251, + "auxiliary_loss_mlp": 0.01029029, + "balance_loss_clip": 1.04147601, + "balance_loss_mlp": 1.01784682, + "epoch": 0.5726739816624079, + "flos": 41469034207680.0, + "grad_norm": 1.4932561001877906, + "language_loss": 0.65330285, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.67475563, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11187744, + "step": 9525, + "time_per_iteration": 2.7259905338287354 + }, + { + "auxiliary_loss_clip": 0.01112741, + "auxiliary_loss_mlp": 0.01028145, + "balance_loss_clip": 1.0385251, + "balance_loss_mlp": 1.01704669, + "epoch": 0.5727341049150759, + "flos": 29622214706400.0, + "grad_norm": 1.8130037588808285, + "language_loss": 0.72745311, + "learning_rate": 1.628099340440984e-06, + "loss": 0.74886191, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11108398, + "step": 9526, + "time_per_iteration": 4.128347158432007 + }, + { + "auxiliary_loss_clip": 0.01113024, + "auxiliary_loss_mlp": 0.01029714, + "balance_loss_clip": 1.04006195, + "balance_loss_mlp": 1.01853824, + "epoch": 0.5727942281677438, + "flos": 34655201909280.0, + "grad_norm": 1.7982032354474269, + "language_loss": 0.80141294, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.82284033, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11181641, + "step": 9527, + "time_per_iteration": 3.978701591491699 + }, + { + "auxiliary_loss_clip": 0.01115679, + "auxiliary_loss_mlp": 0.01034515, + "balance_loss_clip": 1.04120803, + "balance_loss_mlp": 1.02241564, + "epoch": 0.5728543514204119, + "flos": 23838712331040.0, + "grad_norm": 1.7851767043768734, + "language_loss": 0.72302878, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.7445308, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12097168, + "step": 9528, + "time_per_iteration": 2.684967041015625 + }, + { + "auxiliary_loss_clip": 0.01116528, + "auxiliary_loss_mlp": 0.01034787, + "balance_loss_clip": 1.04094827, + "balance_loss_mlp": 1.02312207, + "epoch": 0.5729144746730798, + "flos": 26242664804640.0, + "grad_norm": 2.0059733936358284, + "language_loss": 0.85401076, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.87552387, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11669922, + "step": 9529, + "time_per_iteration": 2.6390881538391113 + }, + { + "auxiliary_loss_clip": 0.0103741, + "auxiliary_loss_mlp": 0.01001061, + "balance_loss_clip": 1.01360703, + "balance_loss_mlp": 0.99966145, + "epoch": 0.5729745979257478, + "flos": 71603317478880.0, + "grad_norm": 0.7666925303855263, + "language_loss": 0.56121165, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58159637, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.01399994, + "step": 9530, + "time_per_iteration": 3.130807876586914 + }, + { + "auxiliary_loss_clip": 0.01119249, + "auxiliary_loss_mlp": 0.01027934, + "balance_loss_clip": 1.04194784, + "balance_loss_mlp": 1.01684165, + "epoch": 0.5730347211784158, + "flos": 22637100749760.0, + "grad_norm": 1.738920449718561, + "language_loss": 0.66477203, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.68624395, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.11090088, + "step": 9531, + "time_per_iteration": 2.6116981506347656 + }, + { + "auxiliary_loss_clip": 0.01116693, + "auxiliary_loss_mlp": 0.01028366, + "balance_loss_clip": 1.0402354, + "balance_loss_mlp": 1.01638532, + "epoch": 0.5730948444310837, + "flos": 46407709883520.0, + "grad_norm": 2.091760061872688, + "language_loss": 0.75607085, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.77752149, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11981201, + "step": 9532, + "time_per_iteration": 4.268834590911865 + }, + { + "auxiliary_loss_clip": 0.01115115, + "auxiliary_loss_mlp": 0.01027644, + "balance_loss_clip": 1.04026401, + "balance_loss_mlp": 1.01546109, + "epoch": 0.5731549676837517, + "flos": 30782586047040.0, + "grad_norm": 1.3707416275271913, + "language_loss": 0.78708333, + "learning_rate": 1.625421002822686e-06, + "loss": 0.8085109, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12182617, + "step": 9533, + "time_per_iteration": 2.695892095565796 + }, + { + "auxiliary_loss_clip": 0.0111453, + "auxiliary_loss_mlp": 0.01027866, + "balance_loss_clip": 1.04151571, + "balance_loss_mlp": 1.01652908, + "epoch": 0.5732150909364196, + "flos": 28514104306560.0, + "grad_norm": 2.906312483762627, + "language_loss": 0.85524726, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.87667125, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11340332, + "step": 9534, + "time_per_iteration": 2.6488499641418457 + }, + { + "auxiliary_loss_clip": 0.0111729, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.04160738, + "balance_loss_mlp": 1.01704144, + "epoch": 0.5732752141890877, + "flos": 28161330635520.0, + "grad_norm": 17.55439348946112, + "language_loss": 0.75174451, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.77321506, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12738037, + "step": 9535, + "time_per_iteration": 2.774700164794922 + }, + { + "auxiliary_loss_clip": 0.0112192, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.04336596, + "balance_loss_mlp": 1.02111411, + "epoch": 0.5733353374417556, + "flos": 29715027094080.0, + "grad_norm": 1.7263588953757347, + "language_loss": 0.7080797, + "learning_rate": 1.624273356614346e-06, + "loss": 0.7296313, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12127686, + "step": 9536, + "time_per_iteration": 2.6388325691223145 + }, + { + "auxiliary_loss_clip": 0.01113524, + "auxiliary_loss_mlp": 0.01031013, + "balance_loss_clip": 1.03936207, + "balance_loss_mlp": 1.01977146, + "epoch": 0.5733954606944236, + "flos": 33188564384640.0, + "grad_norm": 3.0580295374233546, + "language_loss": 0.69542193, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.71686733, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11236572, + "step": 9537, + "time_per_iteration": 2.688418388366699 + }, + { + "auxiliary_loss_clip": 0.01116015, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.04074478, + "balance_loss_mlp": 1.01925731, + "epoch": 0.5734555839470915, + "flos": 35100099174240.0, + "grad_norm": 2.137958413185381, + "language_loss": 0.62909341, + "learning_rate": 1.623508330355902e-06, + "loss": 0.65056211, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11602783, + "step": 9538, + "time_per_iteration": 4.113224983215332 + }, + { + "auxiliary_loss_clip": 0.01115392, + "auxiliary_loss_mlp": 0.01033844, + "balance_loss_clip": 1.04069376, + "balance_loss_mlp": 1.02120745, + "epoch": 0.5735157071997595, + "flos": 28024279728480.0, + "grad_norm": 2.088855007946043, + "language_loss": 0.83501613, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85650843, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12634277, + "step": 9539, + "time_per_iteration": 2.6334476470947266 + }, + { + "auxiliary_loss_clip": 0.01117718, + "auxiliary_loss_mlp": 0.01039404, + "balance_loss_clip": 1.04071271, + "balance_loss_mlp": 1.02663064, + "epoch": 0.5735758304524274, + "flos": 23170839708960.0, + "grad_norm": 2.0560652295719786, + "language_loss": 0.73159271, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75316399, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12762451, + "step": 9540, + "time_per_iteration": 2.644989252090454 + }, + { + "auxiliary_loss_clip": 0.01112573, + "auxiliary_loss_mlp": 0.01027989, + "balance_loss_clip": 1.03855836, + "balance_loss_mlp": 1.01718283, + "epoch": 0.5736359537050955, + "flos": 34652244147840.0, + "grad_norm": 1.8234147478610765, + "language_loss": 0.80325484, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.82466048, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.10809326, + "step": 9541, + "time_per_iteration": 2.6712875366210938 + }, + { + "auxiliary_loss_clip": 0.01120596, + "auxiliary_loss_mlp": 0.0103016, + "balance_loss_clip": 1.04238153, + "balance_loss_mlp": 1.01789308, + "epoch": 0.5736960769577634, + "flos": 19068482103840.0, + "grad_norm": 3.3116631910616463, + "language_loss": 0.64731979, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.6688273, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.1227417, + "step": 9542, + "time_per_iteration": 2.675766944885254 + }, + { + "auxiliary_loss_clip": 0.01115504, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.0397203, + "balance_loss_mlp": 1.01806617, + "epoch": 0.5737562002104314, + "flos": 21968336747520.0, + "grad_norm": 2.6815361325319746, + "language_loss": 0.83421427, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.8556667, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11669922, + "step": 9543, + "time_per_iteration": 2.6223304271698 + }, + { + "auxiliary_loss_clip": 0.01119748, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.0412761, + "balance_loss_mlp": 1.01595724, + "epoch": 0.5738163234630994, + "flos": 25256532883680.0, + "grad_norm": 21.79051401337925, + "language_loss": 0.73205382, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.75354236, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.13153076, + "step": 9544, + "time_per_iteration": 2.665700674057007 + }, + { + "auxiliary_loss_clip": 0.01117924, + "auxiliary_loss_mlp": 0.01026613, + "balance_loss_clip": 1.04032362, + "balance_loss_mlp": 1.01453137, + "epoch": 0.5738764467157673, + "flos": 28246971464640.0, + "grad_norm": 2.513237781947869, + "language_loss": 0.76226485, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78371018, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12091064, + "step": 9545, + "time_per_iteration": 2.6285934448242188 + }, + { + "auxiliary_loss_clip": 0.01118444, + "auxiliary_loss_mlp": 0.01034828, + "balance_loss_clip": 1.04220891, + "balance_loss_mlp": 1.02294862, + "epoch": 0.5739365699684353, + "flos": 35988394564800.0, + "grad_norm": 4.1136061064528535, + "language_loss": 0.56511885, + "learning_rate": 1.620448797546459e-06, + "loss": 0.58665156, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11877441, + "step": 9546, + "time_per_iteration": 2.6907947063446045 + }, + { + "auxiliary_loss_clip": 0.01117899, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.04052353, + "balance_loss_mlp": 1.02084064, + "epoch": 0.5739966932211032, + "flos": 17115585521760.0, + "grad_norm": 5.081193627145516, + "language_loss": 0.76251417, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.78402233, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12084961, + "step": 9547, + "time_per_iteration": 2.5878286361694336 + }, + { + "auxiliary_loss_clip": 0.01117956, + "auxiliary_loss_mlp": 0.01028387, + "balance_loss_clip": 1.04096699, + "balance_loss_mlp": 1.01602459, + "epoch": 0.5740568164737713, + "flos": 23259438299520.0, + "grad_norm": 2.1060708770847216, + "language_loss": 0.74366069, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76512408, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12353516, + "step": 9548, + "time_per_iteration": 2.64589524269104 + }, + { + "auxiliary_loss_clip": 0.01116562, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.03931713, + "balance_loss_mlp": 1.01958692, + "epoch": 0.5741169397264392, + "flos": 27000473087520.0, + "grad_norm": 2.3314419149033947, + "language_loss": 0.69857836, + "learning_rate": 1.619301709822355e-06, + "loss": 0.72006357, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.1237793, + "step": 9549, + "time_per_iteration": 2.6145124435424805 + }, + { + "auxiliary_loss_clip": 0.01118181, + "auxiliary_loss_mlp": 0.01029099, + "balance_loss_clip": 1.04314017, + "balance_loss_mlp": 1.01736283, + "epoch": 0.5741770629791072, + "flos": 30428272719360.0, + "grad_norm": 4.676102294891359, + "language_loss": 0.79257679, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.8140496, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11737061, + "step": 9550, + "time_per_iteration": 2.682204008102417 + }, + { + "auxiliary_loss_clip": 0.01119553, + "auxiliary_loss_mlp": 0.01030544, + "balance_loss_clip": 1.04257202, + "balance_loss_mlp": 1.0181638, + "epoch": 0.5742371862317751, + "flos": 22941948828960.0, + "grad_norm": 3.888244501763813, + "language_loss": 0.67575902, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.69726002, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12390137, + "step": 9551, + "time_per_iteration": 2.6393752098083496 + }, + { + "auxiliary_loss_clip": 0.01117123, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.03968692, + "balance_loss_mlp": 1.02296102, + "epoch": 0.5742973094844431, + "flos": 29847377996640.0, + "grad_norm": 1.9384167421974934, + "language_loss": 0.72055542, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.74207497, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.11859131, + "step": 9552, + "time_per_iteration": 2.643977165222168 + }, + { + "auxiliary_loss_clip": 0.01119728, + "auxiliary_loss_mlp": 0.01035887, + "balance_loss_clip": 1.04334855, + "balance_loss_mlp": 1.02374578, + "epoch": 0.574357432737111, + "flos": 26420834400480.0, + "grad_norm": 2.0946716817059663, + "language_loss": 0.80090904, + "learning_rate": 1.617772461696843e-06, + "loss": 0.82246518, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.121521, + "step": 9553, + "time_per_iteration": 2.6458687782287598 + }, + { + "auxiliary_loss_clip": 0.01118021, + "auxiliary_loss_mlp": 0.01029809, + "balance_loss_clip": 1.03968596, + "balance_loss_mlp": 1.01813865, + "epoch": 0.5744175559897791, + "flos": 20187613203840.0, + "grad_norm": 14.18460425816751, + "language_loss": 0.83528835, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.8567667, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.11669922, + "step": 9554, + "time_per_iteration": 2.5779614448547363 + }, + { + "auxiliary_loss_clip": 0.01124157, + "auxiliary_loss_mlp": 0.01034361, + "balance_loss_clip": 1.04415309, + "balance_loss_mlp": 1.02135515, + "epoch": 0.574477679242447, + "flos": 29538356637600.0, + "grad_norm": 1.674694724055139, + "language_loss": 0.71020985, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.73179495, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12994385, + "step": 9555, + "time_per_iteration": 2.7137672901153564 + }, + { + "auxiliary_loss_clip": 0.01116939, + "auxiliary_loss_mlp": 0.01027039, + "balance_loss_clip": 1.04083323, + "balance_loss_mlp": 1.01445603, + "epoch": 0.574537802495115, + "flos": 18184238441280.0, + "grad_norm": 2.836046031149152, + "language_loss": 0.73098195, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.75242174, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12579346, + "step": 9556, + "time_per_iteration": 2.602092742919922 + }, + { + "auxiliary_loss_clip": 0.0111887, + "auxiliary_loss_mlp": 0.01031859, + "balance_loss_clip": 1.04255748, + "balance_loss_mlp": 1.01918745, + "epoch": 0.5745979257477829, + "flos": 30426165820800.0, + "grad_norm": 2.3186626538421358, + "language_loss": 0.74241412, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.76392138, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12670898, + "step": 9557, + "time_per_iteration": 2.7965009212493896 + }, + { + "auxiliary_loss_clip": 0.01119277, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.04229975, + "balance_loss_mlp": 1.01993537, + "epoch": 0.5746580490004509, + "flos": 21031062315840.0, + "grad_norm": 1.6136141384983813, + "language_loss": 0.67882013, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.70033002, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11761475, + "step": 9558, + "time_per_iteration": 2.5969200134277344 + }, + { + "auxiliary_loss_clip": 0.01125745, + "auxiliary_loss_mlp": 0.01038829, + "balance_loss_clip": 1.04364395, + "balance_loss_mlp": 1.02476811, + "epoch": 0.5747181722531189, + "flos": 16091333190720.0, + "grad_norm": 2.435889125155377, + "language_loss": 0.70896482, + "learning_rate": 1.615479024621659e-06, + "loss": 0.73061055, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 0.82177734, + "router_z_loss_mlp": 0.14068604, + "step": 9559, + "time_per_iteration": 2.6220755577087402 + }, + { + "auxiliary_loss_clip": 0.01119318, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_clip": 1.0433538, + "balance_loss_mlp": 1.01998568, + "epoch": 0.5747782955057869, + "flos": 28019984896800.0, + "grad_norm": 1.6349048796993424, + "language_loss": 0.79375517, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.81525278, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.10461426, + "step": 9560, + "time_per_iteration": 2.6229782104492188 + }, + { + "auxiliary_loss_clip": 0.0111781, + "auxiliary_loss_mlp": 0.01027142, + "balance_loss_clip": 1.04022908, + "balance_loss_mlp": 1.01540589, + "epoch": 0.5748384187584549, + "flos": 28556965238400.0, + "grad_norm": 2.2069396182183163, + "language_loss": 0.64307499, + "learning_rate": 1.614714662090588e-06, + "loss": 0.6645245, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11730957, + "step": 9561, + "time_per_iteration": 2.653477430343628 + }, + { + "auxiliary_loss_clip": 0.0112948, + "auxiliary_loss_mlp": 0.01037424, + "balance_loss_clip": 1.04696798, + "balance_loss_mlp": 1.02403712, + "epoch": 0.5748985420111228, + "flos": 21701690112960.0, + "grad_norm": 1.6363004396688932, + "language_loss": 0.71073419, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.73240316, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13391113, + "step": 9562, + "time_per_iteration": 2.594667673110962 + }, + { + "auxiliary_loss_clip": 0.01119853, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.04380965, + "balance_loss_mlp": 1.02265692, + "epoch": 0.5749586652637908, + "flos": 24240667629600.0, + "grad_norm": 1.6434312033026524, + "language_loss": 0.84042847, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86196709, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11346436, + "step": 9563, + "time_per_iteration": 2.723788022994995 + }, + { + "auxiliary_loss_clip": 0.01122967, + "auxiliary_loss_mlp": 0.01037962, + "balance_loss_clip": 1.04332829, + "balance_loss_mlp": 1.02582026, + "epoch": 0.5750187885164587, + "flos": 25974843168960.0, + "grad_norm": 2.0890717667542393, + "language_loss": 0.57254386, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.59415311, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.121521, + "step": 9564, + "time_per_iteration": 2.609788656234741 + }, + { + "auxiliary_loss_clip": 0.01113496, + "auxiliary_loss_mlp": 0.01029701, + "balance_loss_clip": 1.03994608, + "balance_loss_mlp": 1.01818538, + "epoch": 0.5750789117691267, + "flos": 22945554866880.0, + "grad_norm": 2.347342386810892, + "language_loss": 0.76079774, + "learning_rate": 1.613186112465078e-06, + "loss": 0.78222972, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11517334, + "step": 9565, + "time_per_iteration": 4.184612989425659 + }, + { + "auxiliary_loss_clip": 0.01039301, + "auxiliary_loss_mlp": 0.01004054, + "balance_loss_clip": 1.01553977, + "balance_loss_mlp": 1.00279665, + "epoch": 0.5751390350217946, + "flos": 86222867450400.0, + "grad_norm": 0.7434996312785723, + "language_loss": 0.60823429, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.62866783, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.23791504, + "router_z_loss_mlp": 0.01257324, + "step": 9566, + "time_per_iteration": 4.673164367675781 + }, + { + "auxiliary_loss_clip": 0.01117892, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.04238772, + "balance_loss_mlp": 1.02049887, + "epoch": 0.5751991582744627, + "flos": 17382920950080.0, + "grad_norm": 3.0995341637466067, + "language_loss": 0.75446469, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.77596104, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11254883, + "step": 9567, + "time_per_iteration": 2.6411094665527344 + }, + { + "auxiliary_loss_clip": 0.01117381, + "auxiliary_loss_mlp": 0.01029512, + "balance_loss_clip": 1.04134166, + "balance_loss_mlp": 1.01797271, + "epoch": 0.5752592815271306, + "flos": 22364295488640.0, + "grad_norm": 4.300837961594993, + "language_loss": 0.74912083, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.77058971, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11541748, + "step": 9568, + "time_per_iteration": 2.6202895641326904 + }, + { + "auxiliary_loss_clip": 0.01119155, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.04220474, + "balance_loss_mlp": 1.01877522, + "epoch": 0.5753194047797986, + "flos": 25530026938560.0, + "grad_norm": 3.1421689808291102, + "language_loss": 0.71376944, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.73526788, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.1192627, + "step": 9569, + "time_per_iteration": 2.655979871749878 + }, + { + "auxiliary_loss_clip": 0.01120654, + "auxiliary_loss_mlp": 0.01039687, + "balance_loss_clip": 1.04292262, + "balance_loss_mlp": 1.0266757, + "epoch": 0.5753795280324665, + "flos": 23527543556160.0, + "grad_norm": 2.229446990701181, + "language_loss": 0.55345595, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.57505941, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13006592, + "step": 9570, + "time_per_iteration": 2.585803270339966 + }, + { + "auxiliary_loss_clip": 0.01115105, + "auxiliary_loss_mlp": 0.0103647, + "balance_loss_clip": 1.03987598, + "balance_loss_mlp": 1.02517474, + "epoch": 0.5754396512851345, + "flos": 26420955952320.0, + "grad_norm": 1.5596855676734538, + "language_loss": 0.64497185, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.66648757, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.112854, + "step": 9571, + "time_per_iteration": 2.6675915718078613 + }, + { + "auxiliary_loss_clip": 0.01116891, + "auxiliary_loss_mlp": 0.01030743, + "balance_loss_clip": 1.04025006, + "balance_loss_mlp": 1.0188098, + "epoch": 0.5754997745378025, + "flos": 62259253045920.0, + "grad_norm": 3.0171949130285274, + "language_loss": 0.67043811, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.69191444, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.1192627, + "step": 9572, + "time_per_iteration": 4.4330055713653564 + }, + { + "auxiliary_loss_clip": 0.01118322, + "auxiliary_loss_mlp": 0.01030162, + "balance_loss_clip": 1.04246545, + "balance_loss_mlp": 1.01810431, + "epoch": 0.5755598977904705, + "flos": 27890186582880.0, + "grad_norm": 1.9394622205351104, + "language_loss": 0.72380549, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.74529034, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12060547, + "step": 9573, + "time_per_iteration": 2.613772392272949 + }, + { + "auxiliary_loss_clip": 0.01113765, + "auxiliary_loss_mlp": 0.01026467, + "balance_loss_clip": 1.04276824, + "balance_loss_mlp": 1.01592922, + "epoch": 0.5756200210431385, + "flos": 46945946260800.0, + "grad_norm": 2.201398260127113, + "language_loss": 0.76191598, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78331828, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10540771, + "step": 9574, + "time_per_iteration": 2.7934043407440186 + }, + { + "auxiliary_loss_clip": 0.01122049, + "auxiliary_loss_mlp": 0.01029517, + "balance_loss_clip": 1.04227555, + "balance_loss_mlp": 1.01691031, + "epoch": 0.5756801442958064, + "flos": 29175737267520.0, + "grad_norm": 3.7874549807253173, + "language_loss": 0.6651774, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.68669307, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12609863, + "step": 9575, + "time_per_iteration": 2.607403039932251 + }, + { + "auxiliary_loss_clip": 0.01114917, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.04191422, + "balance_loss_mlp": 1.01830506, + "epoch": 0.5757402675484744, + "flos": 25752759192000.0, + "grad_norm": 1.9273808118022777, + "language_loss": 0.79753256, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.81897926, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11444092, + "step": 9576, + "time_per_iteration": 2.662628173828125 + }, + { + "auxiliary_loss_clip": 0.01116304, + "auxiliary_loss_mlp": 0.01026686, + "balance_loss_clip": 1.04175329, + "balance_loss_mlp": 1.01580811, + "epoch": 0.5758003908011423, + "flos": 25086183122880.0, + "grad_norm": 1.8379648880103363, + "language_loss": 0.69517112, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.71660101, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.10882568, + "step": 9577, + "time_per_iteration": 2.6040501594543457 + }, + { + "auxiliary_loss_clip": 0.01116878, + "auxiliary_loss_mlp": 0.01032855, + "balance_loss_clip": 1.03912401, + "balance_loss_mlp": 1.02173829, + "epoch": 0.5758605140538103, + "flos": 20099987028000.0, + "grad_norm": 1.8860697660978882, + "language_loss": 0.66301894, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.68451625, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.11114502, + "step": 9578, + "time_per_iteration": 3.923504590988159 + }, + { + "auxiliary_loss_clip": 0.01113683, + "auxiliary_loss_mlp": 0.01027262, + "balance_loss_clip": 1.04020822, + "balance_loss_mlp": 1.01645577, + "epoch": 0.5759206373064782, + "flos": 25975207824480.0, + "grad_norm": 1.5696574409069164, + "language_loss": 0.72597247, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.74738193, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10803223, + "step": 9579, + "time_per_iteration": 2.678825616836548 + }, + { + "auxiliary_loss_clip": 0.01122967, + "auxiliary_loss_mlp": 0.01031361, + "balance_loss_clip": 1.04327369, + "balance_loss_mlp": 1.01853991, + "epoch": 0.5759807605591463, + "flos": 31808459069280.0, + "grad_norm": 5.32952241402487, + "language_loss": 0.64675581, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.66829908, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12841797, + "step": 9580, + "time_per_iteration": 2.6669728755950928 + }, + { + "auxiliary_loss_clip": 0.01117712, + "auxiliary_loss_mlp": 0.01036421, + "balance_loss_clip": 1.04036701, + "balance_loss_mlp": 1.02377295, + "epoch": 0.5760408838118142, + "flos": 23029453452960.0, + "grad_norm": 2.959989042540056, + "language_loss": 0.85469389, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87623519, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12646484, + "step": 9581, + "time_per_iteration": 2.623487949371338 + }, + { + "auxiliary_loss_clip": 0.01124388, + "auxiliary_loss_mlp": 0.01039104, + "balance_loss_clip": 1.04404211, + "balance_loss_mlp": 1.02683139, + "epoch": 0.5761010070644822, + "flos": 18763958162880.0, + "grad_norm": 3.257372293582189, + "language_loss": 0.67683792, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.6984728, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.1227417, + "step": 9582, + "time_per_iteration": 2.5947537422180176 + }, + { + "auxiliary_loss_clip": 0.01036839, + "auxiliary_loss_mlp": 0.010003, + "balance_loss_clip": 1.01335859, + "balance_loss_mlp": 0.99907535, + "epoch": 0.5761611303171501, + "flos": 87209161440480.0, + "grad_norm": 0.6383708507565331, + "language_loss": 0.57166523, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.5920366, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.23498535, + "router_z_loss_mlp": 0.01223755, + "step": 9583, + "time_per_iteration": 3.439363956451416 + }, + { + "auxiliary_loss_clip": 0.01117603, + "auxiliary_loss_mlp": 0.01029075, + "balance_loss_clip": 1.04128766, + "balance_loss_mlp": 1.01754701, + "epoch": 0.5762212535698181, + "flos": 19823899867200.0, + "grad_norm": 2.8161645890875526, + "language_loss": 0.82490063, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84636736, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11523438, + "step": 9584, + "time_per_iteration": 2.5724592208862305 + }, + { + "auxiliary_loss_clip": 0.01037055, + "auxiliary_loss_mlp": 0.00999542, + "balance_loss_clip": 1.01348209, + "balance_loss_mlp": 0.99834692, + "epoch": 0.5762813768224861, + "flos": 85640230484640.0, + "grad_norm": 0.6235251415302766, + "language_loss": 0.49549252, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51585853, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.23583984, + "router_z_loss_mlp": 0.01194, + "step": 9585, + "time_per_iteration": 3.341292142868042 + }, + { + "auxiliary_loss_clip": 0.01113533, + "auxiliary_loss_mlp": 0.01027908, + "balance_loss_clip": 1.03867376, + "balance_loss_mlp": 1.0165832, + "epoch": 0.5763415000751541, + "flos": 25036474770720.0, + "grad_norm": 1.823027773562025, + "language_loss": 0.84797394, + "learning_rate": 1.605165098835465e-06, + "loss": 0.86938834, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11322021, + "step": 9586, + "time_per_iteration": 2.6582772731781006 + }, + { + "auxiliary_loss_clip": 0.01117228, + "auxiliary_loss_mlp": 0.01031071, + "balance_loss_clip": 1.04103589, + "balance_loss_mlp": 1.01858997, + "epoch": 0.5764016233278221, + "flos": 19297737639360.0, + "grad_norm": 1.8733497323460127, + "language_loss": 0.8018086, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.8232916, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12475586, + "step": 9587, + "time_per_iteration": 2.5898687839508057 + }, + { + "auxiliary_loss_clip": 0.01114186, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.03964186, + "balance_loss_mlp": 1.01888692, + "epoch": 0.57646174658049, + "flos": 25346873717280.0, + "grad_norm": 1.6440756777956125, + "language_loss": 0.66037244, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68182623, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12304688, + "step": 9588, + "time_per_iteration": 2.6616404056549072 + }, + { + "auxiliary_loss_clip": 0.01118574, + "auxiliary_loss_mlp": 0.01028401, + "balance_loss_clip": 1.04174435, + "balance_loss_mlp": 1.01583099, + "epoch": 0.576521869833158, + "flos": 28735904662560.0, + "grad_norm": 1.9288354840111257, + "language_loss": 0.78890264, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.81037235, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12585449, + "step": 9589, + "time_per_iteration": 2.6756181716918945 + }, + { + "auxiliary_loss_clip": 0.01111514, + "auxiliary_loss_mlp": 0.01021973, + "balance_loss_clip": 1.03878856, + "balance_loss_mlp": 1.0110532, + "epoch": 0.5765819930858259, + "flos": 24729276689280.0, + "grad_norm": 2.105675069357401, + "language_loss": 0.79345781, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.81479263, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.10919189, + "step": 9590, + "time_per_iteration": 2.6869568824768066 + }, + { + "auxiliary_loss_clip": 0.01116229, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.04137778, + "balance_loss_mlp": 1.01879811, + "epoch": 0.5766421163384939, + "flos": 28246363705440.0, + "grad_norm": 2.252895485266773, + "language_loss": 0.62703592, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.64849758, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11126709, + "step": 9591, + "time_per_iteration": 2.6119065284729004 + }, + { + "auxiliary_loss_clip": 0.01118805, + "auxiliary_loss_mlp": 0.01039264, + "balance_loss_clip": 1.04229641, + "balance_loss_mlp": 1.02746868, + "epoch": 0.5767022395911618, + "flos": 31541245192800.0, + "grad_norm": 1.5967824116106384, + "language_loss": 0.77907991, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.80066061, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11798096, + "step": 9592, + "time_per_iteration": 2.7403197288513184 + }, + { + "auxiliary_loss_clip": 0.01037421, + "auxiliary_loss_mlp": 0.01004512, + "balance_loss_clip": 1.01374328, + "balance_loss_mlp": 1.00320935, + "epoch": 0.5767623628438299, + "flos": 83330751607200.0, + "grad_norm": 0.7346842142200595, + "language_loss": 0.59607047, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.61648977, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.23681641, + "router_z_loss_mlp": 0.01303101, + "step": 9593, + "time_per_iteration": 3.4341728687286377 + }, + { + "auxiliary_loss_clip": 0.01119107, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.04065394, + "balance_loss_mlp": 1.02205837, + "epoch": 0.5768224860964978, + "flos": 36836908336800.0, + "grad_norm": 1.6940500489851398, + "language_loss": 0.71516401, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73670447, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12878418, + "step": 9594, + "time_per_iteration": 2.710145950317383 + }, + { + "auxiliary_loss_clip": 0.01117211, + "auxiliary_loss_mlp": 0.0103037, + "balance_loss_clip": 1.04110122, + "balance_loss_mlp": 1.01936722, + "epoch": 0.5768826093491658, + "flos": 21835337568480.0, + "grad_norm": 2.4708379738820274, + "language_loss": 0.71052355, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.73199934, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11010742, + "step": 9595, + "time_per_iteration": 2.6720001697540283 + }, + { + "auxiliary_loss_clip": 0.01115633, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.03965867, + "balance_loss_mlp": 1.01711917, + "epoch": 0.5769427326018337, + "flos": 21301031367360.0, + "grad_norm": 2.1169829707255663, + "language_loss": 0.70160246, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.72305131, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12139893, + "step": 9596, + "time_per_iteration": 2.581951379776001 + }, + { + "auxiliary_loss_clip": 0.01121926, + "auxiliary_loss_mlp": 0.01037725, + "balance_loss_clip": 1.04133892, + "balance_loss_mlp": 1.02372932, + "epoch": 0.5770028558545017, + "flos": 48103846047360.0, + "grad_norm": 5.028827932302667, + "language_loss": 0.67315352, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.69475001, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13983154, + "step": 9597, + "time_per_iteration": 2.812318801879883 + }, + { + "auxiliary_loss_clip": 0.01117295, + "auxiliary_loss_mlp": 0.01031979, + "balance_loss_clip": 1.04149485, + "balance_loss_mlp": 1.02023697, + "epoch": 0.5770629791071697, + "flos": 26279448144480.0, + "grad_norm": 2.165469614590163, + "language_loss": 0.81314319, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.83463591, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11743164, + "step": 9598, + "time_per_iteration": 2.596405506134033 + }, + { + "auxiliary_loss_clip": 0.01117863, + "auxiliary_loss_mlp": 0.01033255, + "balance_loss_clip": 1.04078996, + "balance_loss_mlp": 1.02063668, + "epoch": 0.5771231023598377, + "flos": 25486112557440.0, + "grad_norm": 2.4764042145347887, + "language_loss": 0.72758108, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.74909228, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12615967, + "step": 9599, + "time_per_iteration": 2.686528444290161 + }, + { + "auxiliary_loss_clip": 0.01112486, + "auxiliary_loss_mlp": 0.01030196, + "balance_loss_clip": 1.03948045, + "balance_loss_mlp": 1.01877558, + "epoch": 0.5771832256125057, + "flos": 22057664649120.0, + "grad_norm": 1.9054512493092037, + "language_loss": 0.78319216, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.80461895, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11425781, + "step": 9600, + "time_per_iteration": 2.5887608528137207 + }, + { + "auxiliary_loss_clip": 0.01117162, + "auxiliary_loss_mlp": 0.01039295, + "balance_loss_clip": 1.04082823, + "balance_loss_mlp": 1.02712333, + "epoch": 0.5772433488651736, + "flos": 32163866363520.0, + "grad_norm": 1.6385618717223935, + "language_loss": 0.7213316, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.7428962, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12188721, + "step": 9601, + "time_per_iteration": 2.731685161590576 + }, + { + "auxiliary_loss_clip": 0.01119145, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.04332685, + "balance_loss_mlp": 1.02279389, + "epoch": 0.5773034721178416, + "flos": 24014856062880.0, + "grad_norm": 2.500063030285072, + "language_loss": 0.68285137, + "learning_rate": 1.599058274973348e-06, + "loss": 0.70439029, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11956787, + "step": 9602, + "time_per_iteration": 2.6158368587493896 + }, + { + "auxiliary_loss_clip": 0.01112759, + "auxiliary_loss_mlp": 0.01033345, + "balance_loss_clip": 1.04061544, + "balance_loss_mlp": 1.02240765, + "epoch": 0.5773635953705095, + "flos": 30606482832480.0, + "grad_norm": 1.9962662738587686, + "language_loss": 0.73479688, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.75625789, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10943604, + "step": 9603, + "time_per_iteration": 2.7101128101348877 + }, + { + "auxiliary_loss_clip": 0.01116209, + "auxiliary_loss_mlp": 0.01027642, + "balance_loss_clip": 1.04150629, + "balance_loss_mlp": 1.01628745, + "epoch": 0.5774237186231775, + "flos": 25664484739680.0, + "grad_norm": 2.0922765047885634, + "language_loss": 0.76290226, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.78434074, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11352539, + "step": 9604, + "time_per_iteration": 2.658604145050049 + }, + { + "auxiliary_loss_clip": 0.01117802, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.04045486, + "balance_loss_mlp": 1.02081239, + "epoch": 0.5774838418758454, + "flos": 18585342876960.0, + "grad_norm": 1.887969470783739, + "language_loss": 0.83252531, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.85403538, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12390137, + "step": 9605, + "time_per_iteration": 5.457365989685059 + }, + { + "auxiliary_loss_clip": 0.01123768, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.04337668, + "balance_loss_mlp": 1.01966691, + "epoch": 0.5775439651285135, + "flos": 28776131971200.0, + "grad_norm": 2.0246821719953814, + "language_loss": 0.78022134, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.80179369, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.13806152, + "step": 9606, + "time_per_iteration": 2.6903891563415527 + }, + { + "auxiliary_loss_clip": 0.01116842, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.04190278, + "balance_loss_mlp": 1.02343655, + "epoch": 0.5776040883811814, + "flos": 22012899405120.0, + "grad_norm": 2.073285445858605, + "language_loss": 0.7399596, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76147634, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11407471, + "step": 9607, + "time_per_iteration": 2.6483705043792725 + }, + { + "auxiliary_loss_clip": 0.0111978, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.04310894, + "balance_loss_mlp": 1.02106094, + "epoch": 0.5776642116338494, + "flos": 22725658823040.0, + "grad_norm": 1.5919214839306268, + "language_loss": 0.69351333, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.71503848, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11676025, + "step": 9608, + "time_per_iteration": 2.6093997955322266 + }, + { + "auxiliary_loss_clip": 0.01120075, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.0425663, + "balance_loss_mlp": 1.02312803, + "epoch": 0.5777243348865173, + "flos": 34657430359680.0, + "grad_norm": 1.8546101790539338, + "language_loss": 0.76576209, + "learning_rate": 1.596387759940665e-06, + "loss": 0.78731608, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12200928, + "step": 9609, + "time_per_iteration": 2.729616641998291 + }, + { + "auxiliary_loss_clip": 0.01118478, + "auxiliary_loss_mlp": 0.01029899, + "balance_loss_clip": 1.04054701, + "balance_loss_mlp": 1.01854479, + "epoch": 0.5777844581391853, + "flos": 29315219211360.0, + "grad_norm": 1.7796104894224885, + "language_loss": 0.77512193, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.79660565, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.11352539, + "step": 9610, + "time_per_iteration": 2.644209623336792 + }, + { + "auxiliary_loss_clip": 0.01117872, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.04147828, + "balance_loss_mlp": 1.01805973, + "epoch": 0.5778445813918534, + "flos": 21692492690400.0, + "grad_norm": 3.384496388178964, + "language_loss": 0.69306391, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.71455026, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.1270752, + "step": 9611, + "time_per_iteration": 4.181657552719116 + }, + { + "auxiliary_loss_clip": 0.0111335, + "auxiliary_loss_mlp": 0.0102418, + "balance_loss_clip": 1.03821898, + "balance_loss_mlp": 1.01196158, + "epoch": 0.5779047046445213, + "flos": 27129420538560.0, + "grad_norm": 3.574296169803358, + "language_loss": 0.82825828, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.84963357, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12231445, + "step": 9612, + "time_per_iteration": 2.7207653522491455 + }, + { + "auxiliary_loss_clip": 0.01116978, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.04158258, + "balance_loss_mlp": 1.02068043, + "epoch": 0.5779648278971893, + "flos": 26154268800480.0, + "grad_norm": 2.2981000070135673, + "language_loss": 0.79814047, + "learning_rate": 1.594862087742667e-06, + "loss": 0.81963772, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.1206665, + "step": 9613, + "time_per_iteration": 2.6483304500579834 + }, + { + "auxiliary_loss_clip": 0.01115662, + "auxiliary_loss_mlp": 0.01035028, + "balance_loss_clip": 1.03993177, + "balance_loss_mlp": 1.02361369, + "epoch": 0.5780249511498572, + "flos": 23215564435680.0, + "grad_norm": 1.9915351093806517, + "language_loss": 0.77628422, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.79779106, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11407471, + "step": 9614, + "time_per_iteration": 2.65325927734375 + }, + { + "auxiliary_loss_clip": 0.01119221, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.04074073, + "balance_loss_mlp": 1.02158403, + "epoch": 0.5780850744025252, + "flos": 14794397150400.0, + "grad_norm": 2.9377313549876076, + "language_loss": 0.80736542, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.82889247, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.11914062, + "step": 9615, + "time_per_iteration": 2.602534294128418 + }, + { + "auxiliary_loss_clip": 0.01117718, + "auxiliary_loss_mlp": 0.01029534, + "balance_loss_clip": 1.04009819, + "balance_loss_mlp": 1.01772082, + "epoch": 0.5781451976551931, + "flos": 30559732241760.0, + "grad_norm": 1.6194711350298967, + "language_loss": 0.66913795, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69061041, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.11810303, + "step": 9616, + "time_per_iteration": 2.6762824058532715 + }, + { + "auxiliary_loss_clip": 0.01114469, + "auxiliary_loss_mlp": 0.01027606, + "balance_loss_clip": 1.04096997, + "balance_loss_mlp": 1.01553595, + "epoch": 0.5782053209078611, + "flos": 23483345554080.0, + "grad_norm": 1.9475641835326034, + "language_loss": 0.77862364, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.80004442, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12078857, + "step": 9617, + "time_per_iteration": 4.012932538986206 + }, + { + "auxiliary_loss_clip": 0.01116718, + "auxiliary_loss_mlp": 0.01030853, + "balance_loss_clip": 1.04108787, + "balance_loss_mlp": 1.01848495, + "epoch": 0.578265444160529, + "flos": 31718199270240.0, + "grad_norm": 1.6370004293519818, + "language_loss": 0.74767733, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.76915312, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12365723, + "step": 9618, + "time_per_iteration": 2.7007596492767334 + }, + { + "auxiliary_loss_clip": 0.01114096, + "auxiliary_loss_mlp": 0.01024905, + "balance_loss_clip": 1.04013395, + "balance_loss_mlp": 1.01362145, + "epoch": 0.5783255674131971, + "flos": 26598801409920.0, + "grad_norm": 1.835477888524511, + "language_loss": 0.81167805, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.83306813, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11291504, + "step": 9619, + "time_per_iteration": 2.6647322177886963 + }, + { + "auxiliary_loss_clip": 0.01115873, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.03969741, + "balance_loss_mlp": 1.0176121, + "epoch": 0.578385690665865, + "flos": 30248887605120.0, + "grad_norm": 2.108188295212344, + "language_loss": 0.72506976, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.74651998, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11529541, + "step": 9620, + "time_per_iteration": 2.693394422531128 + }, + { + "auxiliary_loss_clip": 0.01117088, + "auxiliary_loss_mlp": 0.01026793, + "balance_loss_clip": 1.0408994, + "balance_loss_mlp": 1.01497924, + "epoch": 0.578445813918533, + "flos": 25884218714400.0, + "grad_norm": 1.6778618508949799, + "language_loss": 0.77275491, + "learning_rate": 1.591811481689916e-06, + "loss": 0.79419374, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11816406, + "step": 9621, + "time_per_iteration": 2.717437982559204 + }, + { + "auxiliary_loss_clip": 0.01115138, + "auxiliary_loss_mlp": 0.01028542, + "balance_loss_clip": 1.03801334, + "balance_loss_mlp": 1.01634669, + "epoch": 0.5785059371712009, + "flos": 30562406382240.0, + "grad_norm": 1.5794919943379555, + "language_loss": 0.70331299, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.72474974, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12200928, + "step": 9622, + "time_per_iteration": 2.6437788009643555 + }, + { + "auxiliary_loss_clip": 0.01037453, + "auxiliary_loss_mlp": 0.01002924, + "balance_loss_clip": 1.01373363, + "balance_loss_mlp": 1.00178945, + "epoch": 0.5785660604238689, + "flos": 86442682459680.0, + "grad_norm": 0.769672066753361, + "language_loss": 0.55920982, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.57961357, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 0.01135254, + "step": 9623, + "time_per_iteration": 3.3898138999938965 + }, + { + "auxiliary_loss_clip": 0.01120894, + "auxiliary_loss_mlp": 0.01031257, + "balance_loss_clip": 1.04162025, + "balance_loss_mlp": 1.01886487, + "epoch": 0.578626183676537, + "flos": 38620184469120.0, + "grad_norm": 3.613636505130983, + "language_loss": 0.71127212, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.73279363, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12402344, + "step": 9624, + "time_per_iteration": 2.6994495391845703 + }, + { + "auxiliary_loss_clip": 0.01119037, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.04346037, + "balance_loss_mlp": 1.02079391, + "epoch": 0.5786863069292049, + "flos": 26680633614720.0, + "grad_norm": 2.0776772187380734, + "language_loss": 0.82156163, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.8430841, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12414551, + "step": 9625, + "time_per_iteration": 2.6626851558685303 + }, + { + "auxiliary_loss_clip": 0.01114755, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.04056811, + "balance_loss_mlp": 1.01945055, + "epoch": 0.5787464301818729, + "flos": 28510417234080.0, + "grad_norm": 1.4837965465556562, + "language_loss": 0.70260859, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72407544, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12481689, + "step": 9626, + "time_per_iteration": 2.6767990589141846 + }, + { + "auxiliary_loss_clip": 0.01114672, + "auxiliary_loss_mlp": 0.01036064, + "balance_loss_clip": 1.03974378, + "balance_loss_mlp": 1.02462041, + "epoch": 0.5788065534345408, + "flos": 36612028667520.0, + "grad_norm": 1.538669233715172, + "language_loss": 0.71727216, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.73877954, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11444092, + "step": 9627, + "time_per_iteration": 2.7568676471710205 + }, + { + "auxiliary_loss_clip": 0.01114579, + "auxiliary_loss_mlp": 0.01025927, + "balance_loss_clip": 1.03955591, + "balance_loss_mlp": 1.01468551, + "epoch": 0.5788666766872088, + "flos": 29929898995200.0, + "grad_norm": 1.7003864842044525, + "language_loss": 0.8363781, + "learning_rate": 1.589143013764458e-06, + "loss": 0.85778308, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11242676, + "step": 9628, + "time_per_iteration": 2.6902663707733154 + }, + { + "auxiliary_loss_clip": 0.01115221, + "auxiliary_loss_mlp": 0.01027005, + "balance_loss_clip": 1.03867948, + "balance_loss_mlp": 1.01512539, + "epoch": 0.5789267999398767, + "flos": 28959042088800.0, + "grad_norm": 1.6812755085173459, + "language_loss": 0.71985757, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74127984, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11865234, + "step": 9629, + "time_per_iteration": 2.679180383682251 + }, + { + "auxiliary_loss_clip": 0.01117744, + "auxiliary_loss_mlp": 0.0103119, + "balance_loss_clip": 1.04210246, + "balance_loss_mlp": 1.01849461, + "epoch": 0.5789869231925447, + "flos": 25789704600960.0, + "grad_norm": 2.007599432365628, + "language_loss": 0.74520683, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.76669616, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12689209, + "step": 9630, + "time_per_iteration": 2.6419811248779297 + }, + { + "auxiliary_loss_clip": 0.01112535, + "auxiliary_loss_mlp": 0.01029887, + "balance_loss_clip": 1.03945482, + "balance_loss_mlp": 1.0183115, + "epoch": 0.5790470464452127, + "flos": 25880572159200.0, + "grad_norm": 1.7022950590415324, + "language_loss": 0.79125345, + "learning_rate": 1.587999618060523e-06, + "loss": 0.81267774, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11566162, + "step": 9631, + "time_per_iteration": 2.678556442260742 + }, + { + "auxiliary_loss_clip": 0.01114782, + "auxiliary_loss_mlp": 0.01026059, + "balance_loss_clip": 1.0388701, + "balance_loss_mlp": 1.0138104, + "epoch": 0.5791071696978807, + "flos": 28558261791360.0, + "grad_norm": 3.0869174809318087, + "language_loss": 0.74713373, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.76854211, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12243652, + "step": 9632, + "time_per_iteration": 2.6799652576446533 + }, + { + "auxiliary_loss_clip": 0.01115349, + "auxiliary_loss_mlp": 0.01027365, + "balance_loss_clip": 1.04072356, + "balance_loss_mlp": 1.01482439, + "epoch": 0.5791672929505486, + "flos": 29537141119200.0, + "grad_norm": 2.2012814936761154, + "language_loss": 0.79737973, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.81880689, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12536621, + "step": 9633, + "time_per_iteration": 2.684448719024658 + }, + { + "auxiliary_loss_clip": 0.01125217, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_clip": 1.04326367, + "balance_loss_mlp": 1.02320313, + "epoch": 0.5792274162032166, + "flos": 29711177952480.0, + "grad_norm": 1.6823021374000322, + "language_loss": 0.77763122, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.79923934, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 0.81933594, + "router_z_loss_mlp": 0.1239624, + "step": 9634, + "time_per_iteration": 2.6597704887390137 + }, + { + "auxiliary_loss_clip": 0.01119141, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.0411433, + "balance_loss_mlp": 1.02170062, + "epoch": 0.5792875394558845, + "flos": 24952333080960.0, + "grad_norm": 4.288722955373306, + "language_loss": 0.63878834, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.66031587, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.11907959, + "step": 9635, + "time_per_iteration": 2.633481979370117 + }, + { + "auxiliary_loss_clip": 0.01115463, + "auxiliary_loss_mlp": 0.01028637, + "balance_loss_clip": 1.04191875, + "balance_loss_mlp": 1.01728237, + "epoch": 0.5793476627085525, + "flos": 29358768936960.0, + "grad_norm": 1.5021822690011075, + "language_loss": 0.77025807, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.79169905, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11346436, + "step": 9636, + "time_per_iteration": 2.749763250350952 + }, + { + "auxiliary_loss_clip": 0.0111154, + "auxiliary_loss_mlp": 0.01027574, + "balance_loss_clip": 1.03942835, + "balance_loss_mlp": 1.01651144, + "epoch": 0.5794077859612206, + "flos": 26910780530400.0, + "grad_norm": 1.5795619497611089, + "language_loss": 0.68289256, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70428371, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.1105957, + "step": 9637, + "time_per_iteration": 2.6353626251220703 + }, + { + "auxiliary_loss_clip": 0.01117063, + "auxiliary_loss_mlp": 0.01032052, + "balance_loss_clip": 1.0394218, + "balance_loss_mlp": 1.02004719, + "epoch": 0.5794679092138885, + "flos": 13952325625920.0, + "grad_norm": 2.8408652008455793, + "language_loss": 0.72404683, + "learning_rate": 1.585332242234043e-06, + "loss": 0.745538, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.11993408, + "step": 9638, + "time_per_iteration": 2.705662488937378 + }, + { + "auxiliary_loss_clip": 0.01117193, + "auxiliary_loss_mlp": 0.01028921, + "balance_loss_clip": 1.04260683, + "balance_loss_mlp": 1.01786399, + "epoch": 0.5795280324665565, + "flos": 23037516391680.0, + "grad_norm": 2.2877527549878383, + "language_loss": 0.72309792, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.74455905, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11065674, + "step": 9639, + "time_per_iteration": 2.6345863342285156 + }, + { + "auxiliary_loss_clip": 0.01118209, + "auxiliary_loss_mlp": 0.01033996, + "balance_loss_clip": 1.04163253, + "balance_loss_mlp": 1.02230155, + "epoch": 0.5795881557192244, + "flos": 15869289731040.0, + "grad_norm": 2.2964358415232375, + "language_loss": 0.6947974, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.71631944, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11700439, + "step": 9640, + "time_per_iteration": 2.6007907390594482 + }, + { + "auxiliary_loss_clip": 0.01124459, + "auxiliary_loss_mlp": 0.010359, + "balance_loss_clip": 1.04318941, + "balance_loss_mlp": 1.02321041, + "epoch": 0.5796482789718924, + "flos": 24322013627040.0, + "grad_norm": 4.33908854082195, + "language_loss": 0.78291696, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.80452061, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12695312, + "step": 9641, + "time_per_iteration": 2.605834484100342 + }, + { + "auxiliary_loss_clip": 0.01117602, + "auxiliary_loss_mlp": 0.01033996, + "balance_loss_clip": 1.04208422, + "balance_loss_mlp": 1.02236676, + "epoch": 0.5797084022245603, + "flos": 26418605950080.0, + "grad_norm": 2.1461322879248375, + "language_loss": 0.7383495, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.75986552, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11639404, + "step": 9642, + "time_per_iteration": 2.6245617866516113 + }, + { + "auxiliary_loss_clip": 0.01116163, + "auxiliary_loss_mlp": 0.01036161, + "balance_loss_clip": 1.04104269, + "balance_loss_mlp": 1.02409685, + "epoch": 0.5797685254772283, + "flos": 31764139515360.0, + "grad_norm": 1.837881100026886, + "language_loss": 0.73651153, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.75803483, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12072754, + "step": 9643, + "time_per_iteration": 2.6901156902313232 + }, + { + "auxiliary_loss_clip": 0.01119278, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.04147172, + "balance_loss_mlp": 1.01950479, + "epoch": 0.5798286487298963, + "flos": 27705574739520.0, + "grad_norm": 2.644188842140724, + "language_loss": 0.66925859, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.69076765, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12115479, + "step": 9644, + "time_per_iteration": 3.994763135910034 + }, + { + "auxiliary_loss_clip": 0.01123387, + "auxiliary_loss_mlp": 0.01031467, + "balance_loss_clip": 1.04357719, + "balance_loss_mlp": 1.01851499, + "epoch": 0.5798887719825643, + "flos": 28246606809120.0, + "grad_norm": 2.330081097782168, + "language_loss": 0.85654819, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.8780967, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12939453, + "step": 9645, + "time_per_iteration": 3.955960988998413 + }, + { + "auxiliary_loss_clip": 0.01119021, + "auxiliary_loss_mlp": 0.01033208, + "balance_loss_clip": 1.04284358, + "balance_loss_mlp": 1.02175212, + "epoch": 0.5799488952352322, + "flos": 29804962754880.0, + "grad_norm": 1.9879023038049233, + "language_loss": 0.75680786, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77833009, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11450195, + "step": 9646, + "time_per_iteration": 2.716614246368408 + }, + { + "auxiliary_loss_clip": 0.01120521, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.04278553, + "balance_loss_mlp": 1.0206933, + "epoch": 0.5800090184879002, + "flos": 46852445079360.0, + "grad_norm": 2.4779116207116374, + "language_loss": 0.59343135, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.61496794, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12457275, + "step": 9647, + "time_per_iteration": 2.7506320476531982 + }, + { + "auxiliary_loss_clip": 0.0112013, + "auxiliary_loss_mlp": 0.01035769, + "balance_loss_clip": 1.04248047, + "balance_loss_mlp": 1.0233767, + "epoch": 0.5800691417405681, + "flos": 24142101788160.0, + "grad_norm": 1.7298438891429315, + "language_loss": 0.84306073, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86461973, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12384033, + "step": 9648, + "time_per_iteration": 2.681610345840454 + }, + { + "auxiliary_loss_clip": 0.01038551, + "auxiliary_loss_mlp": 0.01003148, + "balance_loss_clip": 1.0145328, + "balance_loss_mlp": 1.00192428, + "epoch": 0.5801292649932361, + "flos": 85799275924320.0, + "grad_norm": 0.8332748787680496, + "language_loss": 0.63017786, + "learning_rate": 1.581142210256242e-06, + "loss": 0.65059483, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.24035645, + "router_z_loss_mlp": 0.01222229, + "step": 9649, + "time_per_iteration": 3.323432207107544 + }, + { + "auxiliary_loss_clip": 0.01114801, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.04098666, + "balance_loss_mlp": 1.02144444, + "epoch": 0.5801893882459042, + "flos": 22860035589600.0, + "grad_norm": 2.8082348029166684, + "language_loss": 0.82525939, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84673381, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11199951, + "step": 9650, + "time_per_iteration": 2.6392087936401367 + }, + { + "auxiliary_loss_clip": 0.01119388, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.04052353, + "balance_loss_mlp": 1.01866364, + "epoch": 0.5802495114985721, + "flos": 19030483245600.0, + "grad_norm": 2.325922018954786, + "language_loss": 0.77134931, + "learning_rate": 1.580380592177698e-06, + "loss": 0.79285002, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12017822, + "step": 9651, + "time_per_iteration": 4.095627546310425 + }, + { + "auxiliary_loss_clip": 0.01121328, + "auxiliary_loss_mlp": 0.01040374, + "balance_loss_clip": 1.0433228, + "balance_loss_mlp": 1.027982, + "epoch": 0.5803096347512401, + "flos": 22275494311680.0, + "grad_norm": 2.328813973651309, + "language_loss": 0.74346602, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.76508307, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12408447, + "step": 9652, + "time_per_iteration": 2.5807998180389404 + }, + { + "auxiliary_loss_clip": 0.01119196, + "auxiliary_loss_mlp": 0.01033051, + "balance_loss_clip": 1.04117155, + "balance_loss_mlp": 1.02085018, + "epoch": 0.580369758003908, + "flos": 27934465619520.0, + "grad_norm": 3.5360436211764124, + "language_loss": 0.77108097, + "learning_rate": 1.579619037747193e-06, + "loss": 0.79260349, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12207031, + "step": 9653, + "time_per_iteration": 2.703937530517578 + }, + { + "auxiliary_loss_clip": 0.01117785, + "auxiliary_loss_mlp": 0.01031564, + "balance_loss_clip": 1.04113197, + "balance_loss_mlp": 1.01883888, + "epoch": 0.580429881256576, + "flos": 22814784138240.0, + "grad_norm": 2.139749439394806, + "language_loss": 0.74258006, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76407355, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12731934, + "step": 9654, + "time_per_iteration": 2.577524185180664 + }, + { + "auxiliary_loss_clip": 0.01115134, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.04212415, + "balance_loss_mlp": 1.02108574, + "epoch": 0.5804900045092439, + "flos": 30117225496320.0, + "grad_norm": 2.158006763092032, + "language_loss": 0.70080823, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.72228694, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11651611, + "step": 9655, + "time_per_iteration": 2.691584825515747 + }, + { + "auxiliary_loss_clip": 0.01121761, + "auxiliary_loss_mlp": 0.01034046, + "balance_loss_clip": 1.04112053, + "balance_loss_mlp": 1.02179158, + "epoch": 0.580550127761912, + "flos": 28202408807040.0, + "grad_norm": 2.185314401563615, + "language_loss": 0.69740391, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.71896207, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12255859, + "step": 9656, + "time_per_iteration": 3.952507495880127 + }, + { + "auxiliary_loss_clip": 0.01114291, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.04226136, + "balance_loss_mlp": 1.01844239, + "epoch": 0.5806102510145799, + "flos": 22544166810240.0, + "grad_norm": 1.6563708096835692, + "language_loss": 0.71900439, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.74044472, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11309814, + "step": 9657, + "time_per_iteration": 2.6725575923919678 + }, + { + "auxiliary_loss_clip": 0.01121156, + "auxiliary_loss_mlp": 0.01036501, + "balance_loss_clip": 1.04181373, + "balance_loss_mlp": 1.02373374, + "epoch": 0.5806703742672479, + "flos": 29182098480480.0, + "grad_norm": 4.002161250643305, + "language_loss": 0.70779705, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.72937357, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12768555, + "step": 9658, + "time_per_iteration": 2.7623887062072754 + }, + { + "auxiliary_loss_clip": 0.01038651, + "auxiliary_loss_mlp": 0.01002016, + "balance_loss_clip": 1.01471615, + "balance_loss_mlp": 1.00081325, + "epoch": 0.5807304975199158, + "flos": 80914036190400.0, + "grad_norm": 0.6474341927507158, + "language_loss": 0.53584933, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.556256, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.23950195, + "router_z_loss_mlp": 0.0120163, + "step": 9659, + "time_per_iteration": 3.2460479736328125 + }, + { + "auxiliary_loss_clip": 0.01120734, + "auxiliary_loss_mlp": 0.01037545, + "balance_loss_clip": 1.04270136, + "balance_loss_mlp": 1.02472973, + "epoch": 0.5807906207725838, + "flos": 38708661507840.0, + "grad_norm": 5.7328287962248945, + "language_loss": 0.62084222, + "learning_rate": 1.576954100136366e-06, + "loss": 0.64242494, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12805176, + "step": 9660, + "time_per_iteration": 2.7165298461914062 + }, + { + "auxiliary_loss_clip": 0.01117725, + "auxiliary_loss_mlp": 0.01031648, + "balance_loss_clip": 1.03927445, + "balance_loss_mlp": 1.01918483, + "epoch": 0.5808507440252517, + "flos": 29042859640320.0, + "grad_norm": 2.2765088272837413, + "language_loss": 0.65170044, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.67319423, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12475586, + "step": 9661, + "time_per_iteration": 2.627953290939331 + }, + { + "auxiliary_loss_clip": 0.01111185, + "auxiliary_loss_mlp": 0.01024872, + "balance_loss_clip": 1.03940964, + "balance_loss_mlp": 1.01416743, + "epoch": 0.5809108672779197, + "flos": 16714400051520.0, + "grad_norm": 1.761289363888515, + "language_loss": 0.74393857, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.76529908, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10699463, + "step": 9662, + "time_per_iteration": 2.613935947418213 + }, + { + "auxiliary_loss_clip": 0.01038626, + "auxiliary_loss_mlp": 0.01001289, + "balance_loss_clip": 1.01462388, + "balance_loss_mlp": 1.00010109, + "epoch": 0.5809709905305876, + "flos": 84357475492320.0, + "grad_norm": 0.8761184058874237, + "language_loss": 0.58342403, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60382313, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.23999023, + "router_z_loss_mlp": 0.01186371, + "step": 9663, + "time_per_iteration": 3.306429147720337 + }, + { + "auxiliary_loss_clip": 0.01117343, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.04244554, + "balance_loss_mlp": 1.01826191, + "epoch": 0.5810311137832557, + "flos": 24193633417920.0, + "grad_norm": 2.5123616784868457, + "language_loss": 0.81772, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.83919048, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11444092, + "step": 9664, + "time_per_iteration": 2.595613956451416 + }, + { + "auxiliary_loss_clip": 0.01117046, + "auxiliary_loss_mlp": 0.01025427, + "balance_loss_clip": 1.03836656, + "balance_loss_mlp": 1.01353037, + "epoch": 0.5810912370359237, + "flos": 35677833549120.0, + "grad_norm": 2.0419438439688053, + "language_loss": 0.81543654, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.83686131, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.11895752, + "step": 9665, + "time_per_iteration": 2.7095086574554443 + }, + { + "auxiliary_loss_clip": 0.01121643, + "auxiliary_loss_mlp": 0.0103541, + "balance_loss_clip": 1.0427773, + "balance_loss_mlp": 1.02182031, + "epoch": 0.5811513602885916, + "flos": 27803006097120.0, + "grad_norm": 1.7939280930579868, + "language_loss": 0.81369561, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83526611, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.13580322, + "step": 9666, + "time_per_iteration": 2.6277153491973877 + }, + { + "auxiliary_loss_clip": 0.01115716, + "auxiliary_loss_mlp": 0.01032332, + "balance_loss_clip": 1.04179692, + "balance_loss_mlp": 1.02084661, + "epoch": 0.5812114835412596, + "flos": 22859508864960.0, + "grad_norm": 1.8090058836362417, + "language_loss": 0.80064487, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.82212532, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11486816, + "step": 9667, + "time_per_iteration": 2.6533701419830322 + }, + { + "auxiliary_loss_clip": 0.01123182, + "auxiliary_loss_mlp": 0.01031809, + "balance_loss_clip": 1.04172707, + "balance_loss_mlp": 1.01885653, + "epoch": 0.5812716067939275, + "flos": 32252059781280.0, + "grad_norm": 1.9110421885868079, + "language_loss": 0.78824961, + "learning_rate": 1.573909419957653e-06, + "loss": 0.80979943, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.12939453, + "step": 9668, + "time_per_iteration": 2.644695281982422 + }, + { + "auxiliary_loss_clip": 0.01118404, + "auxiliary_loss_mlp": 0.01030188, + "balance_loss_clip": 1.04184043, + "balance_loss_mlp": 1.01857138, + "epoch": 0.5813317300465956, + "flos": 52955908479360.0, + "grad_norm": 1.9083628432657906, + "language_loss": 0.64302444, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66451025, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.1161499, + "step": 9669, + "time_per_iteration": 2.8008368015289307 + }, + { + "auxiliary_loss_clip": 0.01118525, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.04277325, + "balance_loss_mlp": 1.02355242, + "epoch": 0.5813918532992635, + "flos": 30244066048800.0, + "grad_norm": 1.578931305634822, + "language_loss": 0.73174071, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.7532804, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11883545, + "step": 9670, + "time_per_iteration": 2.6766974925994873 + }, + { + "auxiliary_loss_clip": 0.01120362, + "auxiliary_loss_mlp": 0.01035266, + "balance_loss_clip": 1.04181576, + "balance_loss_mlp": 1.0234046, + "epoch": 0.5814519765519315, + "flos": 27890713307520.0, + "grad_norm": 2.2331026178771354, + "language_loss": 0.79267341, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.81422973, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.11853027, + "step": 9671, + "time_per_iteration": 2.673978328704834 + }, + { + "auxiliary_loss_clip": 0.01124496, + "auxiliary_loss_mlp": 0.01039255, + "balance_loss_clip": 1.04476571, + "balance_loss_mlp": 1.02592134, + "epoch": 0.5815120998045994, + "flos": 29359336178880.0, + "grad_norm": 1.9955564783858006, + "language_loss": 0.60737759, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.62901509, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13336182, + "step": 9672, + "time_per_iteration": 2.6403276920318604 + }, + { + "auxiliary_loss_clip": 0.01116332, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.04254425, + "balance_loss_mlp": 1.01930737, + "epoch": 0.5815722230572674, + "flos": 29626185399840.0, + "grad_norm": 1.6546350724696228, + "language_loss": 0.81530786, + "learning_rate": 1.572007019492342e-06, + "loss": 0.8367784, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11425781, + "step": 9673, + "time_per_iteration": 2.6546590328216553 + }, + { + "auxiliary_loss_clip": 0.01123565, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.04358077, + "balance_loss_mlp": 1.02172303, + "epoch": 0.5816323463099353, + "flos": 27089031160800.0, + "grad_norm": 5.088425315013098, + "language_loss": 0.87942672, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.90100741, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12786865, + "step": 9674, + "time_per_iteration": 2.6258978843688965 + }, + { + "auxiliary_loss_clip": 0.01119533, + "auxiliary_loss_mlp": 0.0103159, + "balance_loss_clip": 1.04265738, + "balance_loss_mlp": 1.02030051, + "epoch": 0.5816924695626033, + "flos": 29449109770560.0, + "grad_norm": 1.5718473157053634, + "language_loss": 0.79065335, + "learning_rate": 1.571246172811984e-06, + "loss": 0.81216455, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.112854, + "step": 9675, + "time_per_iteration": 2.6387033462524414 + }, + { + "auxiliary_loss_clip": 0.01118798, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.04257417, + "balance_loss_mlp": 1.01868165, + "epoch": 0.5817525928152713, + "flos": 26019932551200.0, + "grad_norm": 2.017618913523039, + "language_loss": 0.7026487, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.72414267, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.1192627, + "step": 9676, + "time_per_iteration": 2.6482129096984863 + }, + { + "auxiliary_loss_clip": 0.01118117, + "auxiliary_loss_mlp": 0.01031365, + "balance_loss_clip": 1.04119968, + "balance_loss_mlp": 1.01934242, + "epoch": 0.5818127160679393, + "flos": 32866820599680.0, + "grad_norm": 3.536596092097616, + "language_loss": 0.63624144, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.6577363, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12011719, + "step": 9677, + "time_per_iteration": 2.6260294914245605 + }, + { + "auxiliary_loss_clip": 0.01038683, + "auxiliary_loss_mlp": 0.01002143, + "balance_loss_clip": 1.01481986, + "balance_loss_mlp": 1.00090742, + "epoch": 0.5818728393206073, + "flos": 77992105979520.0, + "grad_norm": 0.8024112034770827, + "language_loss": 0.54162651, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56203473, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.2388916, + "router_z_loss_mlp": 0.01234436, + "step": 9678, + "time_per_iteration": 3.3186490535736084 + }, + { + "auxiliary_loss_clip": 0.01038642, + "auxiliary_loss_mlp": 0.01000852, + "balance_loss_clip": 1.01464629, + "balance_loss_mlp": 0.99962497, + "epoch": 0.5819329625732752, + "flos": 79257762679680.0, + "grad_norm": 0.7355448462547455, + "language_loss": 0.56268203, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58307695, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.2401123, + "router_z_loss_mlp": 0.01226044, + "step": 9679, + "time_per_iteration": 3.1180570125579834 + }, + { + "auxiliary_loss_clip": 0.0111544, + "auxiliary_loss_mlp": 0.01027348, + "balance_loss_clip": 1.03990459, + "balance_loss_mlp": 1.01681638, + "epoch": 0.5819930858259432, + "flos": 25886933372160.0, + "grad_norm": 1.9936205010385935, + "language_loss": 0.65320456, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.67463243, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.10534668, + "step": 9680, + "time_per_iteration": 2.7267730236053467 + }, + { + "auxiliary_loss_clip": 0.01117373, + "auxiliary_loss_mlp": 0.01027449, + "balance_loss_clip": 1.04157603, + "balance_loss_mlp": 1.01591504, + "epoch": 0.5820532090786111, + "flos": 23743712010240.0, + "grad_norm": 2.100182456587793, + "language_loss": 0.83441257, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85586083, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11529541, + "step": 9681, + "time_per_iteration": 2.656505584716797 + }, + { + "auxiliary_loss_clip": 0.0111698, + "auxiliary_loss_mlp": 0.01028376, + "balance_loss_clip": 1.04133105, + "balance_loss_mlp": 1.01668739, + "epoch": 0.5821133323312792, + "flos": 21612726866880.0, + "grad_norm": 1.9393330664507318, + "language_loss": 0.7553072, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.77676082, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11694336, + "step": 9682, + "time_per_iteration": 2.629211664199829 + }, + { + "auxiliary_loss_clip": 0.01120359, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.04232395, + "balance_loss_mlp": 1.01889718, + "epoch": 0.5821734555839471, + "flos": 29982848729760.0, + "grad_norm": 2.695021102606011, + "language_loss": 0.74853969, + "learning_rate": 1.568203437579977e-06, + "loss": 0.77005255, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12030029, + "step": 9683, + "time_per_iteration": 4.067992687225342 + }, + { + "auxiliary_loss_clip": 0.01120565, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.04148328, + "balance_loss_mlp": 1.01863599, + "epoch": 0.5822335788366151, + "flos": 27311034103200.0, + "grad_norm": 4.99284647918982, + "language_loss": 0.73780251, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.75931937, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.125, + "step": 9684, + "time_per_iteration": 3.9235949516296387 + }, + { + "auxiliary_loss_clip": 0.01119372, + "auxiliary_loss_mlp": 0.0103317, + "balance_loss_clip": 1.04220557, + "balance_loss_mlp": 1.0212667, + "epoch": 0.582293702089283, + "flos": 32607467075520.0, + "grad_norm": 2.154599023064988, + "language_loss": 0.77920842, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80073386, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11914062, + "step": 9685, + "time_per_iteration": 2.6273059844970703 + }, + { + "auxiliary_loss_clip": 0.01117799, + "auxiliary_loss_mlp": 0.01035325, + "balance_loss_clip": 1.0406425, + "balance_loss_mlp": 1.02359486, + "epoch": 0.582353825341951, + "flos": 21168518395680.0, + "grad_norm": 1.7889597329936653, + "language_loss": 0.75398207, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.77551335, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.11737061, + "step": 9686, + "time_per_iteration": 2.6281003952026367 + }, + { + "auxiliary_loss_clip": 0.01036537, + "auxiliary_loss_mlp": 0.01002909, + "balance_loss_clip": 1.01265502, + "balance_loss_mlp": 1.0017879, + "epoch": 0.5824139485946189, + "flos": 67688002753920.0, + "grad_norm": 0.815412715866893, + "language_loss": 0.57382232, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59421682, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.23876953, + "router_z_loss_mlp": 0.01119995, + "step": 9687, + "time_per_iteration": 3.0412580966949463 + }, + { + "auxiliary_loss_clip": 0.01117081, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.04032183, + "balance_loss_mlp": 1.01727891, + "epoch": 0.582474071847287, + "flos": 24773758312320.0, + "grad_norm": 2.090280801522645, + "language_loss": 0.70541465, + "learning_rate": 1.566302259738727e-06, + "loss": 0.72688502, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12677002, + "step": 9688, + "time_per_iteration": 2.682446002960205 + }, + { + "auxiliary_loss_clip": 0.01119122, + "auxiliary_loss_mlp": 0.0103551, + "balance_loss_clip": 1.0423882, + "balance_loss_mlp": 1.02385712, + "epoch": 0.5825341950999549, + "flos": 29136279787200.0, + "grad_norm": 2.716233145926289, + "language_loss": 0.65446478, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.67601109, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11645508, + "step": 9689, + "time_per_iteration": 2.6266775131225586 + }, + { + "auxiliary_loss_clip": 0.01119063, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.04377913, + "balance_loss_mlp": 1.01816392, + "epoch": 0.5825943183526229, + "flos": 28202692428000.0, + "grad_norm": 1.876596964145601, + "language_loss": 0.73462832, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.75612074, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12011719, + "step": 9690, + "time_per_iteration": 4.061282634735107 + }, + { + "auxiliary_loss_clip": 0.01119929, + "auxiliary_loss_mlp": 0.01036571, + "balance_loss_clip": 1.04254985, + "balance_loss_mlp": 1.02358949, + "epoch": 0.5826544416052909, + "flos": 27890794342080.0, + "grad_norm": 2.0616485351252436, + "language_loss": 0.7579748, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.77953982, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12976074, + "step": 9691, + "time_per_iteration": 2.6652936935424805 + }, + { + "auxiliary_loss_clip": 0.01117349, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.04018593, + "balance_loss_mlp": 1.01944435, + "epoch": 0.5827145648579588, + "flos": 38439624353760.0, + "grad_norm": 1.8099150447655012, + "language_loss": 0.80858195, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.83006757, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11767578, + "step": 9692, + "time_per_iteration": 2.6992413997650146 + }, + { + "auxiliary_loss_clip": 0.01036917, + "auxiliary_loss_mlp": 0.01001729, + "balance_loss_clip": 1.01327658, + "balance_loss_mlp": 1.00055718, + "epoch": 0.5827746881106268, + "flos": 85185041830560.0, + "grad_norm": 0.762796698835319, + "language_loss": 0.5690434, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.58942986, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.2364502, + "router_z_loss_mlp": 0.01171112, + "step": 9693, + "time_per_iteration": 3.206740379333496 + }, + { + "auxiliary_loss_clip": 0.01116493, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.04001522, + "balance_loss_mlp": 1.0203979, + "epoch": 0.5828348113632947, + "flos": 28201963116960.0, + "grad_norm": 2.049949527128458, + "language_loss": 0.79220688, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.81368375, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.10797119, + "step": 9694, + "time_per_iteration": 2.6967692375183105 + }, + { + "auxiliary_loss_clip": 0.0111383, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.04159296, + "balance_loss_mlp": 1.02087426, + "epoch": 0.5828949346159628, + "flos": 26689547416320.0, + "grad_norm": 2.354158863080711, + "language_loss": 0.76144886, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.78290153, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10565186, + "step": 9695, + "time_per_iteration": 2.7599916458129883 + }, + { + "auxiliary_loss_clip": 0.01036482, + "auxiliary_loss_mlp": 0.00999913, + "balance_loss_clip": 1.01288986, + "balance_loss_mlp": 0.99872416, + "epoch": 0.5829550578686307, + "flos": 80486196700320.0, + "grad_norm": 0.7782019513668639, + "language_loss": 0.55015886, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57052279, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.23608398, + "router_z_loss_mlp": 0.01187897, + "step": 9696, + "time_per_iteration": 4.779590845108032 + }, + { + "auxiliary_loss_clip": 0.01120466, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.04411602, + "balance_loss_mlp": 1.01595771, + "epoch": 0.5830151811212987, + "flos": 19876484946240.0, + "grad_norm": 2.3421141157595247, + "language_loss": 0.75975847, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.78123975, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11694336, + "step": 9697, + "time_per_iteration": 2.627964973449707 + }, + { + "auxiliary_loss_clip": 0.01119257, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.04087806, + "balance_loss_mlp": 1.01954865, + "epoch": 0.5830753043739666, + "flos": 29491281908640.0, + "grad_norm": 2.100835545575017, + "language_loss": 0.78109145, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.80260503, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12548828, + "step": 9698, + "time_per_iteration": 2.680690050125122 + }, + { + "auxiliary_loss_clip": 0.01120176, + "auxiliary_loss_mlp": 0.01037263, + "balance_loss_clip": 1.04358518, + "balance_loss_mlp": 1.02531815, + "epoch": 0.5831354276266346, + "flos": 33017404278240.0, + "grad_norm": 1.6138705563219629, + "language_loss": 0.83790946, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.85948384, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11950684, + "step": 9699, + "time_per_iteration": 2.7576329708099365 + }, + { + "auxiliary_loss_clip": 0.01120179, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.04218364, + "balance_loss_mlp": 1.01680207, + "epoch": 0.5831955508793025, + "flos": 28825597219680.0, + "grad_norm": 2.9013022360705922, + "language_loss": 0.66273344, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68421966, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.11627197, + "step": 9700, + "time_per_iteration": 2.648005485534668 + }, + { + "auxiliary_loss_clip": 0.01117771, + "auxiliary_loss_mlp": 0.01033958, + "balance_loss_clip": 1.0411675, + "balance_loss_mlp": 1.02164364, + "epoch": 0.5832556741319705, + "flos": 30472713825120.0, + "grad_norm": 2.29546496041843, + "language_loss": 0.71598041, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.73749763, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12298584, + "step": 9701, + "time_per_iteration": 2.725123167037964 + }, + { + "auxiliary_loss_clip": 0.0111559, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.04005766, + "balance_loss_mlp": 1.02175248, + "epoch": 0.5833157973846385, + "flos": 28335205399680.0, + "grad_norm": 1.6756748322865895, + "language_loss": 0.85586166, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.87735319, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.1182251, + "step": 9702, + "time_per_iteration": 2.6362087726593018 + }, + { + "auxiliary_loss_clip": 0.01114019, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.03989506, + "balance_loss_mlp": 1.02116823, + "epoch": 0.5833759206373065, + "flos": 26817927625440.0, + "grad_norm": 1.5596585961385612, + "language_loss": 0.78019726, + "learning_rate": 1.560601200301392e-06, + "loss": 0.80166161, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11236572, + "step": 9703, + "time_per_iteration": 2.8018314838409424 + }, + { + "auxiliary_loss_clip": 0.01122472, + "auxiliary_loss_mlp": 0.01031153, + "balance_loss_clip": 1.04350162, + "balance_loss_mlp": 1.01871991, + "epoch": 0.5834360438899745, + "flos": 26554603407840.0, + "grad_norm": 2.805523153210475, + "language_loss": 0.71688735, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73842365, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12438965, + "step": 9704, + "time_per_iteration": 2.613290548324585 + }, + { + "auxiliary_loss_clip": 0.01118537, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.0425005, + "balance_loss_mlp": 1.02118361, + "epoch": 0.5834961671426424, + "flos": 19514270748960.0, + "grad_norm": 1.7158476974205996, + "language_loss": 0.8118974, + "learning_rate": 1.559841341236335e-06, + "loss": 0.83340997, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11535645, + "step": 9705, + "time_per_iteration": 2.6043102741241455 + }, + { + "auxiliary_loss_clip": 0.01117208, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.04103327, + "balance_loss_mlp": 1.01747918, + "epoch": 0.5835562903953104, + "flos": 27843314440320.0, + "grad_norm": 2.011855771161634, + "language_loss": 0.80559832, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82706034, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.1151123, + "step": 9706, + "time_per_iteration": 2.6102938652038574 + }, + { + "auxiliary_loss_clip": 0.01116207, + "auxiliary_loss_mlp": 0.01037219, + "balance_loss_clip": 1.04086959, + "balance_loss_mlp": 1.0248096, + "epoch": 0.5836164136479783, + "flos": 59143067879040.0, + "grad_norm": 2.4191031118430693, + "language_loss": 0.74876696, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.77030122, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12408447, + "step": 9707, + "time_per_iteration": 2.8694396018981934 + }, + { + "auxiliary_loss_clip": 0.01114425, + "auxiliary_loss_mlp": 0.01025984, + "balance_loss_clip": 1.04074383, + "balance_loss_mlp": 1.01456392, + "epoch": 0.5836765369006464, + "flos": 32831293295520.0, + "grad_norm": 1.8336494149745652, + "language_loss": 0.81602681, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.83743095, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11431885, + "step": 9708, + "time_per_iteration": 2.6657745838165283 + }, + { + "auxiliary_loss_clip": 0.01117942, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.0429306, + "balance_loss_mlp": 1.01516926, + "epoch": 0.5837366601533143, + "flos": 24506544435840.0, + "grad_norm": 2.36494360878255, + "language_loss": 0.78344399, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.80489898, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12384033, + "step": 9709, + "time_per_iteration": 2.672179698944092 + }, + { + "auxiliary_loss_clip": 0.01037414, + "auxiliary_loss_mlp": 0.01000113, + "balance_loss_clip": 1.01355839, + "balance_loss_mlp": 0.99901217, + "epoch": 0.5837967834059823, + "flos": 79757027784000.0, + "grad_norm": 0.7635489239540951, + "language_loss": 0.56518197, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58555722, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.23864746, + "router_z_loss_mlp": 0.01102448, + "step": 9710, + "time_per_iteration": 3.320624589920044 + }, + { + "auxiliary_loss_clip": 0.01116955, + "auxiliary_loss_mlp": 0.01030763, + "balance_loss_clip": 1.04245591, + "balance_loss_mlp": 1.01958108, + "epoch": 0.5838569066586502, + "flos": 30918542987520.0, + "grad_norm": 1.5676966086965214, + "language_loss": 0.6546272, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.67610437, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11181641, + "step": 9711, + "time_per_iteration": 2.678689956665039 + }, + { + "auxiliary_loss_clip": 0.01123251, + "auxiliary_loss_mlp": 0.01038408, + "balance_loss_clip": 1.04164171, + "balance_loss_mlp": 1.02553344, + "epoch": 0.5839170299113182, + "flos": 27124963637760.0, + "grad_norm": 2.4760928913623723, + "language_loss": 0.78481758, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.80643421, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.12878418, + "step": 9712, + "time_per_iteration": 2.6493499279022217 + }, + { + "auxiliary_loss_clip": 0.01117123, + "auxiliary_loss_mlp": 0.01029421, + "balance_loss_clip": 1.04039013, + "balance_loss_mlp": 1.01713634, + "epoch": 0.5839771531639861, + "flos": 27088625988000.0, + "grad_norm": 2.0475359661464556, + "language_loss": 0.73338121, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.75484663, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12286377, + "step": 9713, + "time_per_iteration": 2.652205228805542 + }, + { + "auxiliary_loss_clip": 0.01121012, + "auxiliary_loss_mlp": 0.01030507, + "balance_loss_clip": 1.04061592, + "balance_loss_mlp": 1.01689911, + "epoch": 0.5840372764166541, + "flos": 27356245037280.0, + "grad_norm": 2.0119475616876348, + "language_loss": 0.69250655, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71402168, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13604736, + "step": 9714, + "time_per_iteration": 2.6483826637268066 + }, + { + "auxiliary_loss_clip": 0.01118625, + "auxiliary_loss_mlp": 0.0102949, + "balance_loss_clip": 1.04019201, + "balance_loss_mlp": 1.01696122, + "epoch": 0.5840973996693221, + "flos": 24194443763520.0, + "grad_norm": 2.3527129916275067, + "language_loss": 0.80005068, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82153183, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12530518, + "step": 9715, + "time_per_iteration": 2.6539101600646973 + }, + { + "auxiliary_loss_clip": 0.01117419, + "auxiliary_loss_mlp": 0.01032534, + "balance_loss_clip": 1.04096007, + "balance_loss_mlp": 1.01975513, + "epoch": 0.5841575229219901, + "flos": 25798658919840.0, + "grad_norm": 2.568414694471323, + "language_loss": 0.72622734, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.7477268, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12756348, + "step": 9716, + "time_per_iteration": 2.6470606327056885 + }, + { + "auxiliary_loss_clip": 0.01115318, + "auxiliary_loss_mlp": 0.01028895, + "balance_loss_clip": 1.04123259, + "balance_loss_mlp": 1.01725972, + "epoch": 0.5842176461746581, + "flos": 30066868867680.0, + "grad_norm": 1.7825253181369787, + "language_loss": 0.74844432, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.76988649, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11621094, + "step": 9717, + "time_per_iteration": 2.6489686965942383 + }, + { + "auxiliary_loss_clip": 0.01116826, + "auxiliary_loss_mlp": 0.01036179, + "balance_loss_clip": 1.04068577, + "balance_loss_mlp": 1.02366781, + "epoch": 0.584277769427326, + "flos": 23344147231200.0, + "grad_norm": 2.085260260664319, + "language_loss": 0.79931837, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.82084846, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12506104, + "step": 9718, + "time_per_iteration": 2.7384610176086426 + }, + { + "auxiliary_loss_clip": 0.01116672, + "auxiliary_loss_mlp": 0.01029497, + "balance_loss_clip": 1.04193902, + "balance_loss_mlp": 1.01687896, + "epoch": 0.584337892679994, + "flos": 27668305192320.0, + "grad_norm": 2.693672904672085, + "language_loss": 0.67657888, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.6980406, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12634277, + "step": 9719, + "time_per_iteration": 2.6102609634399414 + }, + { + "auxiliary_loss_clip": 0.01118598, + "auxiliary_loss_mlp": 0.01031005, + "balance_loss_clip": 1.04075956, + "balance_loss_mlp": 1.01884007, + "epoch": 0.5843980159326619, + "flos": 38173788064800.0, + "grad_norm": 2.192214862702395, + "language_loss": 0.75743055, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.77892661, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12164307, + "step": 9720, + "time_per_iteration": 2.7112503051757812 + }, + { + "auxiliary_loss_clip": 0.01117988, + "auxiliary_loss_mlp": 0.01034892, + "balance_loss_clip": 1.04004395, + "balance_loss_mlp": 1.02298903, + "epoch": 0.58445813918533, + "flos": 27757389990240.0, + "grad_norm": 1.7678767810077591, + "language_loss": 0.83063662, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.85216546, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.11907959, + "step": 9721, + "time_per_iteration": 2.6231420040130615 + }, + { + "auxiliary_loss_clip": 0.01038778, + "auxiliary_loss_mlp": 0.01004522, + "balance_loss_clip": 1.01470637, + "balance_loss_mlp": 1.00344777, + "epoch": 0.5845182624379979, + "flos": 74050049849760.0, + "grad_norm": 0.9291673076221328, + "language_loss": 0.71387953, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73431253, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.24047852, + "router_z_loss_mlp": 0.01074982, + "step": 9722, + "time_per_iteration": 3.2824602127075195 + }, + { + "auxiliary_loss_clip": 0.01116323, + "auxiliary_loss_mlp": 0.0103466, + "balance_loss_clip": 1.0411067, + "balance_loss_mlp": 1.02284026, + "epoch": 0.5845783856906659, + "flos": 19965164571360.0, + "grad_norm": 3.0953242849147995, + "language_loss": 0.88995183, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91146165, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11816406, + "step": 9723, + "time_per_iteration": 4.095663070678711 + }, + { + "auxiliary_loss_clip": 0.01115725, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.04108334, + "balance_loss_mlp": 1.01953733, + "epoch": 0.5846385089433338, + "flos": 24505815124800.0, + "grad_norm": 1.5073861845443906, + "language_loss": 0.68102306, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.70249432, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11877441, + "step": 9724, + "time_per_iteration": 3.9850924015045166 + }, + { + "auxiliary_loss_clip": 0.01122047, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.04390931, + "balance_loss_mlp": 1.02005124, + "epoch": 0.5846986321960018, + "flos": 21122740219680.0, + "grad_norm": 2.1085922289237136, + "language_loss": 0.86489433, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88644081, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12567139, + "step": 9725, + "time_per_iteration": 2.592252731323242 + }, + { + "auxiliary_loss_clip": 0.01121492, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.04286146, + "balance_loss_mlp": 1.02322316, + "epoch": 0.5847587554486697, + "flos": 20985000518880.0, + "grad_norm": 1.6869488571726283, + "language_loss": 0.83008063, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.85164779, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12005615, + "step": 9726, + "time_per_iteration": 2.658851385116577 + }, + { + "auxiliary_loss_clip": 0.01119218, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.04254413, + "balance_loss_mlp": 1.01787615, + "epoch": 0.5848188787013378, + "flos": 29931317100000.0, + "grad_norm": 2.2871866652944903, + "language_loss": 0.66979998, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.69128442, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11340332, + "step": 9727, + "time_per_iteration": 2.6348679065704346 + }, + { + "auxiliary_loss_clip": 0.01121183, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_clip": 1.0443399, + "balance_loss_mlp": 1.02959275, + "epoch": 0.5848790019540057, + "flos": 25171054123680.0, + "grad_norm": 1.91605386413175, + "language_loss": 0.81857371, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.84020519, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12371826, + "step": 9728, + "time_per_iteration": 2.644489049911499 + }, + { + "auxiliary_loss_clip": 0.01115009, + "auxiliary_loss_mlp": 0.01034228, + "balance_loss_clip": 1.04152668, + "balance_loss_mlp": 1.02293324, + "epoch": 0.5849391252066737, + "flos": 27355677795360.0, + "grad_norm": 28.372624778803893, + "language_loss": 0.78118008, + "learning_rate": 1.550728272957027e-06, + "loss": 0.80267245, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11297607, + "step": 9729, + "time_per_iteration": 2.6310930252075195 + }, + { + "auxiliary_loss_clip": 0.01117409, + "auxiliary_loss_mlp": 0.01031132, + "balance_loss_clip": 1.04021668, + "balance_loss_mlp": 1.01867449, + "epoch": 0.5849992484593417, + "flos": 31006817439840.0, + "grad_norm": 4.866387489540371, + "language_loss": 0.70419347, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.72567892, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12451172, + "step": 9730, + "time_per_iteration": 4.160254716873169 + }, + { + "auxiliary_loss_clip": 0.01122302, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.04309034, + "balance_loss_mlp": 1.02183354, + "epoch": 0.5850593717120096, + "flos": 25704144806400.0, + "grad_norm": 2.073915781909947, + "language_loss": 0.78753138, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.80910397, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13140869, + "step": 9731, + "time_per_iteration": 2.626537561416626 + }, + { + "auxiliary_loss_clip": 0.01117625, + "auxiliary_loss_mlp": 0.01036017, + "balance_loss_clip": 1.04193711, + "balance_loss_mlp": 1.02386391, + "epoch": 0.5851194949646776, + "flos": 30873332053440.0, + "grad_norm": 2.0913976585362803, + "language_loss": 0.70534295, + "learning_rate": 1.549589825316528e-06, + "loss": 0.72687936, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.121521, + "step": 9732, + "time_per_iteration": 2.6516358852386475 + }, + { + "auxiliary_loss_clip": 0.01123642, + "auxiliary_loss_mlp": 0.01034804, + "balance_loss_clip": 1.04410553, + "balance_loss_mlp": 1.02127957, + "epoch": 0.5851796182173455, + "flos": 28781358700320.0, + "grad_norm": 2.1560815668700632, + "language_loss": 0.52690744, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.5484919, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.13513184, + "step": 9733, + "time_per_iteration": 2.6866941452026367 + }, + { + "auxiliary_loss_clip": 0.01121248, + "auxiliary_loss_mlp": 0.01033828, + "balance_loss_clip": 1.04407465, + "balance_loss_mlp": 1.02115619, + "epoch": 0.5852397414700136, + "flos": 30289439052000.0, + "grad_norm": 2.3240193528103448, + "language_loss": 0.87671214, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.89826286, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12670898, + "step": 9734, + "time_per_iteration": 2.6703591346740723 + }, + { + "auxiliary_loss_clip": 0.01115028, + "auxiliary_loss_mlp": 0.01033295, + "balance_loss_clip": 1.04278266, + "balance_loss_mlp": 1.02206516, + "epoch": 0.5852998647226815, + "flos": 24328091219040.0, + "grad_norm": 1.614443514167366, + "language_loss": 0.72162068, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.74310392, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11236572, + "step": 9735, + "time_per_iteration": 4.0578389167785645 + }, + { + "auxiliary_loss_clip": 0.01121427, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.04206455, + "balance_loss_mlp": 1.02634394, + "epoch": 0.5853599879753495, + "flos": 20402566139520.0, + "grad_norm": 2.307441857538404, + "language_loss": 0.73467267, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.75628316, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13311768, + "step": 9736, + "time_per_iteration": 2.69323468208313 + }, + { + "auxiliary_loss_clip": 0.0111671, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.04098487, + "balance_loss_mlp": 1.02032197, + "epoch": 0.5854201112280174, + "flos": 54247779859680.0, + "grad_norm": 1.6335623968023523, + "language_loss": 0.70839798, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.72988379, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11547852, + "step": 9737, + "time_per_iteration": 2.818394899368286 + }, + { + "auxiliary_loss_clip": 0.01115486, + "auxiliary_loss_mlp": 0.01037515, + "balance_loss_clip": 1.04089069, + "balance_loss_mlp": 1.02632165, + "epoch": 0.5854802344806854, + "flos": 24817915797120.0, + "grad_norm": 10.984253126740542, + "language_loss": 0.82796943, + "learning_rate": 1.547313391573169e-06, + "loss": 0.8494994, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11193848, + "step": 9738, + "time_per_iteration": 2.673487901687622 + }, + { + "auxiliary_loss_clip": 0.01124272, + "auxiliary_loss_mlp": 0.01036203, + "balance_loss_clip": 1.04409754, + "balance_loss_mlp": 1.02302432, + "epoch": 0.5855403577333533, + "flos": 25528122626400.0, + "grad_norm": 2.5260263229290003, + "language_loss": 0.68261409, + "learning_rate": 1.546934045946082e-06, + "loss": 0.70421886, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13183594, + "step": 9739, + "time_per_iteration": 2.6193392276763916 + }, + { + "auxiliary_loss_clip": 0.01118927, + "auxiliary_loss_mlp": 0.01027687, + "balance_loss_clip": 1.03992593, + "balance_loss_mlp": 1.01502657, + "epoch": 0.5856004809860214, + "flos": 24951482218080.0, + "grad_norm": 2.4801107890878535, + "language_loss": 0.59212381, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.61358988, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12652588, + "step": 9740, + "time_per_iteration": 2.7189319133758545 + }, + { + "auxiliary_loss_clip": 0.01117618, + "auxiliary_loss_mlp": 0.01029903, + "balance_loss_clip": 1.0405252, + "balance_loss_mlp": 1.0174154, + "epoch": 0.5856606042386893, + "flos": 23966120125440.0, + "grad_norm": 3.625269035286444, + "language_loss": 0.74794221, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.76941741, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12493896, + "step": 9741, + "time_per_iteration": 2.6161956787109375 + }, + { + "auxiliary_loss_clip": 0.01120168, + "auxiliary_loss_mlp": 0.01032811, + "balance_loss_clip": 1.04399908, + "balance_loss_mlp": 1.02069366, + "epoch": 0.5857207274913573, + "flos": 26461872054720.0, + "grad_norm": 2.0153437361219533, + "language_loss": 0.75920796, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.78073782, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12133789, + "step": 9742, + "time_per_iteration": 2.656010866165161 + }, + { + "auxiliary_loss_clip": 0.01116356, + "auxiliary_loss_mlp": 0.01029581, + "balance_loss_clip": 1.04061782, + "balance_loss_mlp": 1.01769626, + "epoch": 0.5857808507440253, + "flos": 28288009118880.0, + "grad_norm": 2.2547657093515276, + "language_loss": 0.75073862, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.77219796, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11877441, + "step": 9743, + "time_per_iteration": 2.6169140338897705 + }, + { + "auxiliary_loss_clip": 0.01116298, + "auxiliary_loss_mlp": 0.01026988, + "balance_loss_clip": 1.04323196, + "balance_loss_mlp": 1.01573431, + "epoch": 0.5858409739966932, + "flos": 33233046007680.0, + "grad_norm": 2.0682885706910445, + "language_loss": 0.80801535, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.82944822, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11242676, + "step": 9744, + "time_per_iteration": 2.708796977996826 + }, + { + "auxiliary_loss_clip": 0.01124161, + "auxiliary_loss_mlp": 0.01025628, + "balance_loss_clip": 1.04404306, + "balance_loss_mlp": 1.01369536, + "epoch": 0.5859010972493612, + "flos": 33989881875840.0, + "grad_norm": 3.3250525796141837, + "language_loss": 0.7171073, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.73860514, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.11932373, + "step": 9745, + "time_per_iteration": 2.6337358951568604 + }, + { + "auxiliary_loss_clip": 0.01038534, + "auxiliary_loss_mlp": 0.01002016, + "balance_loss_clip": 1.01426256, + "balance_loss_mlp": 1.00084281, + "epoch": 0.5859612205020291, + "flos": 85422117201120.0, + "grad_norm": 0.7475425414173109, + "language_loss": 0.53322399, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55362946, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.24279785, + "router_z_loss_mlp": 0.01172638, + "step": 9746, + "time_per_iteration": 3.3658359050750732 + }, + { + "auxiliary_loss_clip": 0.01122602, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.04401588, + "balance_loss_mlp": 1.01714587, + "epoch": 0.5860213437546972, + "flos": 29353906863360.0, + "grad_norm": 2.486495042115834, + "language_loss": 0.73077542, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.7523002, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12731934, + "step": 9747, + "time_per_iteration": 2.6630170345306396 + }, + { + "auxiliary_loss_clip": 0.0112005, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.04258537, + "balance_loss_mlp": 1.01961493, + "epoch": 0.5860814670073651, + "flos": 23120239976640.0, + "grad_norm": 2.8676274560328547, + "language_loss": 0.81347764, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83500326, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12890625, + "step": 9748, + "time_per_iteration": 2.591797351837158 + }, + { + "auxiliary_loss_clip": 0.01120599, + "auxiliary_loss_mlp": 0.01031129, + "balance_loss_clip": 1.04217243, + "balance_loss_mlp": 1.01861167, + "epoch": 0.5861415902600331, + "flos": 27530443939680.0, + "grad_norm": 1.744710424075137, + "language_loss": 0.72395754, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74547482, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12524414, + "step": 9749, + "time_per_iteration": 2.7321650981903076 + }, + { + "auxiliary_loss_clip": 0.0111953, + "auxiliary_loss_mlp": 0.01029265, + "balance_loss_clip": 1.04502368, + "balance_loss_mlp": 1.01720095, + "epoch": 0.586201713512701, + "flos": 17560725890400.0, + "grad_norm": 3.4327440193106313, + "language_loss": 0.74997169, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.77145964, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.1206665, + "step": 9750, + "time_per_iteration": 2.6131882667541504 + }, + { + "auxiliary_loss_clip": 0.01119518, + "auxiliary_loss_mlp": 0.01033717, + "balance_loss_clip": 1.04309988, + "balance_loss_mlp": 1.02028203, + "epoch": 0.586261836765369, + "flos": 23792245361280.0, + "grad_norm": 1.8398063655966141, + "language_loss": 0.70875418, + "learning_rate": 1.542383242598344e-06, + "loss": 0.73028648, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.13433838, + "step": 9751, + "time_per_iteration": 2.636547565460205 + }, + { + "auxiliary_loss_clip": 0.01122254, + "auxiliary_loss_mlp": 0.01034409, + "balance_loss_clip": 1.04327035, + "balance_loss_mlp": 1.02065277, + "epoch": 0.5863219600180369, + "flos": 25260625128960.0, + "grad_norm": 2.968575150068794, + "language_loss": 0.74329108, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.76485771, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13763428, + "step": 9752, + "time_per_iteration": 2.594744920730591 + }, + { + "auxiliary_loss_clip": 0.01117186, + "auxiliary_loss_mlp": 0.0102918, + "balance_loss_clip": 1.04157567, + "balance_loss_mlp": 1.01685345, + "epoch": 0.586382083270705, + "flos": 24150326796000.0, + "grad_norm": 3.1497207775471976, + "language_loss": 0.774234, + "learning_rate": 1.541625017642943e-06, + "loss": 0.79569769, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12335205, + "step": 9753, + "time_per_iteration": 2.661949396133423 + }, + { + "auxiliary_loss_clip": 0.01114203, + "auxiliary_loss_mlp": 0.01027191, + "balance_loss_clip": 1.04155087, + "balance_loss_mlp": 1.01575863, + "epoch": 0.5864422065233729, + "flos": 20134420365600.0, + "grad_norm": 1.9528464048576937, + "language_loss": 0.70994055, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.73135448, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11425781, + "step": 9754, + "time_per_iteration": 2.600734233856201 + }, + { + "auxiliary_loss_clip": 0.01117109, + "auxiliary_loss_mlp": 0.01029115, + "balance_loss_clip": 1.03989744, + "balance_loss_mlp": 1.01614475, + "epoch": 0.5865023297760409, + "flos": 24908216113440.0, + "grad_norm": 1.7681342337882062, + "language_loss": 0.71904111, + "learning_rate": 1.540866862214043e-06, + "loss": 0.74050337, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12976074, + "step": 9755, + "time_per_iteration": 2.66629958152771 + }, + { + "auxiliary_loss_clip": 0.01036589, + "auxiliary_loss_mlp": 0.0100035, + "balance_loss_clip": 1.01243544, + "balance_loss_mlp": 0.99910355, + "epoch": 0.5865624530287089, + "flos": 77301057473280.0, + "grad_norm": 0.7469073659929092, + "language_loss": 0.56890821, + "learning_rate": 1.540487810607967e-06, + "loss": 0.58927763, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.24182129, + "router_z_loss_mlp": 0.0124588, + "step": 9756, + "time_per_iteration": 3.2823550701141357 + }, + { + "auxiliary_loss_clip": 0.01115068, + "auxiliary_loss_mlp": 0.01030501, + "balance_loss_clip": 1.04104352, + "balance_loss_mlp": 1.01915216, + "epoch": 0.5866225762813768, + "flos": 32965791613920.0, + "grad_norm": 4.230630601758038, + "language_loss": 0.76238704, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.78384268, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11346436, + "step": 9757, + "time_per_iteration": 2.6593050956726074 + }, + { + "auxiliary_loss_clip": 0.01036315, + "auxiliary_loss_mlp": 0.01000942, + "balance_loss_clip": 1.01206934, + "balance_loss_mlp": 0.99971598, + "epoch": 0.5866826995340448, + "flos": 89059602522240.0, + "grad_norm": 0.8494866414186527, + "language_loss": 0.60463083, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62500346, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.24243164, + "router_z_loss_mlp": 0.01225281, + "step": 9758, + "time_per_iteration": 3.232466220855713 + }, + { + "auxiliary_loss_clip": 0.01122562, + "auxiliary_loss_mlp": 0.01028326, + "balance_loss_clip": 1.04239249, + "balance_loss_mlp": 1.01577282, + "epoch": 0.5867428227867127, + "flos": 25973141443200.0, + "grad_norm": 3.8432169203867432, + "language_loss": 0.72252786, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.74403679, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12561035, + "step": 9759, + "time_per_iteration": 2.620177984237671 + }, + { + "auxiliary_loss_clip": 0.01118859, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.04244757, + "balance_loss_mlp": 1.02128267, + "epoch": 0.5868029460393808, + "flos": 40840213893120.0, + "grad_norm": 1.7737522042709457, + "language_loss": 0.72960258, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.75111842, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11437988, + "step": 9760, + "time_per_iteration": 2.788917303085327 + }, + { + "auxiliary_loss_clip": 0.01115702, + "auxiliary_loss_mlp": 0.01029172, + "balance_loss_clip": 1.04017437, + "balance_loss_mlp": 1.01683402, + "epoch": 0.5868630692920487, + "flos": 21829057390080.0, + "grad_norm": 1.952891518596562, + "language_loss": 0.72591501, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.74736375, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12341309, + "step": 9761, + "time_per_iteration": 2.5911202430725098 + }, + { + "auxiliary_loss_clip": 0.01119571, + "auxiliary_loss_mlp": 0.01029657, + "balance_loss_clip": 1.03974557, + "balance_loss_mlp": 1.01591802, + "epoch": 0.5869231925447167, + "flos": 25666794224640.0, + "grad_norm": 2.085493310079381, + "language_loss": 0.74604273, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.76753497, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.13751221, + "step": 9762, + "time_per_iteration": 4.12530779838562 + }, + { + "auxiliary_loss_clip": 0.01115316, + "auxiliary_loss_mlp": 0.01030697, + "balance_loss_clip": 1.0413785, + "balance_loss_mlp": 1.01838243, + "epoch": 0.5869833157973846, + "flos": 91199900289600.0, + "grad_norm": 1.6442496160173679, + "language_loss": 0.72257406, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74403417, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12304688, + "step": 9763, + "time_per_iteration": 4.227200508117676 + }, + { + "auxiliary_loss_clip": 0.01113449, + "auxiliary_loss_mlp": 0.01030265, + "balance_loss_clip": 1.03903246, + "balance_loss_mlp": 1.01894045, + "epoch": 0.5870434390500526, + "flos": 21522953275200.0, + "grad_norm": 1.6459966197339422, + "language_loss": 0.80176401, + "learning_rate": 1.53745602625755e-06, + "loss": 0.82320118, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11322021, + "step": 9764, + "time_per_iteration": 2.6241564750671387 + }, + { + "auxiliary_loss_clip": 0.01118221, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.04180694, + "balance_loss_mlp": 1.01962638, + "epoch": 0.5871035623027205, + "flos": 26241976010880.0, + "grad_norm": 1.6265206150444425, + "language_loss": 0.79104674, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.81255001, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12481689, + "step": 9765, + "time_per_iteration": 2.630580186843872 + }, + { + "auxiliary_loss_clip": 0.01115541, + "auxiliary_loss_mlp": 0.01029611, + "balance_loss_clip": 1.04144144, + "balance_loss_mlp": 1.01767838, + "epoch": 0.5871636855553886, + "flos": 16535582179200.0, + "grad_norm": 3.5284451084440116, + "language_loss": 0.83320117, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.85465264, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11914062, + "step": 9766, + "time_per_iteration": 2.679126262664795 + }, + { + "auxiliary_loss_clip": 0.01120753, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.04193687, + "balance_loss_mlp": 1.02198291, + "epoch": 0.5872238088080565, + "flos": 31985534698560.0, + "grad_norm": 1.6936267333154353, + "language_loss": 0.69639075, + "learning_rate": 1.536319396136257e-06, + "loss": 0.71793377, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.11572266, + "step": 9767, + "time_per_iteration": 2.6365599632263184 + }, + { + "auxiliary_loss_clip": 0.01117521, + "auxiliary_loss_mlp": 0.01034124, + "balance_loss_clip": 1.03993404, + "balance_loss_mlp": 1.02165461, + "epoch": 0.5872839320607245, + "flos": 37418451336000.0, + "grad_norm": 2.1151779663854064, + "language_loss": 0.63443404, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.65595055, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12487793, + "step": 9768, + "time_per_iteration": 2.727522850036621 + }, + { + "auxiliary_loss_clip": 0.010355, + "auxiliary_loss_mlp": 0.01003714, + "balance_loss_clip": 1.01157749, + "balance_loss_mlp": 1.00253284, + "epoch": 0.5873440553133924, + "flos": 73581010636320.0, + "grad_norm": 0.7202769752649123, + "language_loss": 0.53859979, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.55899191, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.23901367, + "router_z_loss_mlp": 0.01180267, + "step": 9769, + "time_per_iteration": 3.2441728115081787 + }, + { + "auxiliary_loss_clip": 0.01114583, + "auxiliary_loss_mlp": 0.01031919, + "balance_loss_clip": 1.04019284, + "balance_loss_mlp": 1.0198133, + "epoch": 0.5874041785660604, + "flos": 26282730044160.0, + "grad_norm": 1.543481751038648, + "language_loss": 0.70644534, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.72791034, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12115479, + "step": 9770, + "time_per_iteration": 4.047478914260864 + }, + { + "auxiliary_loss_clip": 0.01114575, + "auxiliary_loss_mlp": 0.01032065, + "balance_loss_clip": 1.04034448, + "balance_loss_mlp": 1.02002466, + "epoch": 0.5874643018187284, + "flos": 29759751820800.0, + "grad_norm": 1.9487215607731143, + "language_loss": 0.67672825, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.69819468, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12054443, + "step": 9771, + "time_per_iteration": 2.680569648742676 + }, + { + "auxiliary_loss_clip": 0.01116855, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.03886986, + "balance_loss_mlp": 1.01809442, + "epoch": 0.5875244250713964, + "flos": 34346423653920.0, + "grad_norm": 1.9691080404958365, + "language_loss": 0.66159755, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.68307519, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12811279, + "step": 9772, + "time_per_iteration": 2.689528226852417 + }, + { + "auxiliary_loss_clip": 0.0111927, + "auxiliary_loss_mlp": 0.01038357, + "balance_loss_clip": 1.04135215, + "balance_loss_mlp": 1.02516627, + "epoch": 0.5875845483240644, + "flos": 31496723052480.0, + "grad_norm": 1.9912102123284074, + "language_loss": 0.74500841, + "learning_rate": 1.534046611017519e-06, + "loss": 0.76658469, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.13183594, + "step": 9773, + "time_per_iteration": 2.7008142471313477 + }, + { + "auxiliary_loss_clip": 0.01116683, + "auxiliary_loss_mlp": 0.01036289, + "balance_loss_clip": 1.04024482, + "balance_loss_mlp": 1.02390862, + "epoch": 0.5876446715767323, + "flos": 32831941572000.0, + "grad_norm": 2.964437800885018, + "language_loss": 0.53593051, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.55746025, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.1237793, + "step": 9774, + "time_per_iteration": 4.142822504043579 + }, + { + "auxiliary_loss_clip": 0.01118505, + "auxiliary_loss_mlp": 0.01031675, + "balance_loss_clip": 1.04247665, + "balance_loss_mlp": 1.01900327, + "epoch": 0.5877047948294003, + "flos": 44764118281440.0, + "grad_norm": 2.742997553682502, + "language_loss": 0.65019858, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.67170042, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12677002, + "step": 9775, + "time_per_iteration": 2.760451555252075 + }, + { + "auxiliary_loss_clip": 0.01114301, + "auxiliary_loss_mlp": 0.01032456, + "balance_loss_clip": 1.03874111, + "balance_loss_mlp": 1.02026665, + "epoch": 0.5877649180820682, + "flos": 32609979146880.0, + "grad_norm": 1.8477314400525553, + "language_loss": 0.7328738, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.75434136, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12188721, + "step": 9776, + "time_per_iteration": 2.6912314891815186 + }, + { + "auxiliary_loss_clip": 0.01115598, + "auxiliary_loss_mlp": 0.01031591, + "balance_loss_clip": 1.04026413, + "balance_loss_mlp": 1.02017093, + "epoch": 0.5878250413347362, + "flos": 25663593359520.0, + "grad_norm": 2.026402341050476, + "language_loss": 0.74503744, + "learning_rate": 1.532531774126821e-06, + "loss": 0.76650935, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11419678, + "step": 9777, + "time_per_iteration": 2.603787899017334 + }, + { + "auxiliary_loss_clip": 0.01115338, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.04270267, + "balance_loss_mlp": 1.02210617, + "epoch": 0.5878851645874041, + "flos": 31096266893280.0, + "grad_norm": 1.5573654491502804, + "language_loss": 0.74482548, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76631325, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11340332, + "step": 9778, + "time_per_iteration": 2.684894561767578 + }, + { + "auxiliary_loss_clip": 0.01111935, + "auxiliary_loss_mlp": 0.01028435, + "balance_loss_clip": 1.03860092, + "balance_loss_mlp": 1.01726472, + "epoch": 0.5879452878400722, + "flos": 29003361642720.0, + "grad_norm": 1.714640384895788, + "language_loss": 0.6990124, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72041607, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11181641, + "step": 9779, + "time_per_iteration": 2.6273202896118164 + }, + { + "auxiliary_loss_clip": 0.01117113, + "auxiliary_loss_mlp": 0.01029887, + "balance_loss_clip": 1.03980422, + "balance_loss_mlp": 1.01807296, + "epoch": 0.5880054110927401, + "flos": 21749048462880.0, + "grad_norm": 2.4540721342620326, + "language_loss": 0.67020988, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.69167984, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.11810303, + "step": 9780, + "time_per_iteration": 2.6610209941864014 + }, + { + "auxiliary_loss_clip": 0.01116646, + "auxiliary_loss_mlp": 0.01033887, + "balance_loss_clip": 1.04128408, + "balance_loss_mlp": 1.02181745, + "epoch": 0.5880655343454081, + "flos": 23749222360320.0, + "grad_norm": 2.158979991520472, + "language_loss": 0.72419429, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.74569964, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12078857, + "step": 9781, + "time_per_iteration": 2.6040258407592773 + }, + { + "auxiliary_loss_clip": 0.01113724, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.03883839, + "balance_loss_mlp": 1.02001619, + "epoch": 0.588125657598076, + "flos": 26109422521920.0, + "grad_norm": 1.4715571199351665, + "language_loss": 0.70356482, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72501785, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11553955, + "step": 9782, + "time_per_iteration": 2.6851658821105957 + }, + { + "auxiliary_loss_clip": 0.01118709, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.03985143, + "balance_loss_mlp": 1.0162127, + "epoch": 0.588185780850744, + "flos": 19565113584960.0, + "grad_norm": 3.028603410650041, + "language_loss": 0.70126182, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.72273064, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.11968994, + "step": 9783, + "time_per_iteration": 2.650686264038086 + }, + { + "auxiliary_loss_clip": 0.01118275, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.04060686, + "balance_loss_mlp": 1.01808262, + "epoch": 0.588245904103412, + "flos": 28953855876960.0, + "grad_norm": 2.7051165236521797, + "language_loss": 0.69057441, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.71206808, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.13018799, + "step": 9784, + "time_per_iteration": 2.7077810764312744 + }, + { + "auxiliary_loss_clip": 0.01115468, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.03815556, + "balance_loss_mlp": 1.01679218, + "epoch": 0.58830602735608, + "flos": 41246909713440.0, + "grad_norm": 2.8084599602606577, + "language_loss": 0.6939069, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.71534884, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.11950684, + "step": 9785, + "time_per_iteration": 2.70977783203125 + }, + { + "auxiliary_loss_clip": 0.01113232, + "auxiliary_loss_mlp": 0.01025773, + "balance_loss_clip": 1.03960061, + "balance_loss_mlp": 1.01480031, + "epoch": 0.588366150608748, + "flos": 20853986686560.0, + "grad_norm": 3.681959659049078, + "language_loss": 0.76958656, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.79097664, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.10980225, + "step": 9786, + "time_per_iteration": 2.643994092941284 + }, + { + "auxiliary_loss_clip": 0.01119397, + "auxiliary_loss_mlp": 0.0103285, + "balance_loss_clip": 1.04212523, + "balance_loss_mlp": 1.02069068, + "epoch": 0.5884262738614159, + "flos": 26999743776480.0, + "grad_norm": 2.1226026567702676, + "language_loss": 0.79195344, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81347597, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.1217041, + "step": 9787, + "time_per_iteration": 2.6550426483154297 + }, + { + "auxiliary_loss_clip": 0.01114944, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.03913665, + "balance_loss_mlp": 1.01762354, + "epoch": 0.5884863971140839, + "flos": 25663188186720.0, + "grad_norm": 1.7943270117781471, + "language_loss": 0.66781926, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68925631, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11126709, + "step": 9788, + "time_per_iteration": 2.6533071994781494 + }, + { + "auxiliary_loss_clip": 0.01114885, + "auxiliary_loss_mlp": 0.01030676, + "balance_loss_clip": 1.04129434, + "balance_loss_mlp": 1.01896381, + "epoch": 0.5885465203667518, + "flos": 29047316541120.0, + "grad_norm": 2.542430241743722, + "language_loss": 0.80340773, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.82486337, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11700439, + "step": 9789, + "time_per_iteration": 2.658926486968994 + }, + { + "auxiliary_loss_clip": 0.01112897, + "auxiliary_loss_mlp": 0.01028165, + "balance_loss_clip": 1.03937316, + "balance_loss_mlp": 1.01669681, + "epoch": 0.5886066436194198, + "flos": 23037921564480.0, + "grad_norm": 1.5865428186428607, + "language_loss": 0.70363557, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.72504622, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11474609, + "step": 9790, + "time_per_iteration": 2.6111032962799072 + }, + { + "auxiliary_loss_clip": 0.01114141, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.04031134, + "balance_loss_mlp": 1.02147365, + "epoch": 0.5886667668720877, + "flos": 30249333295200.0, + "grad_norm": 2.341632584402622, + "language_loss": 0.83098948, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85246509, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11950684, + "step": 9791, + "time_per_iteration": 2.6955087184906006 + }, + { + "auxiliary_loss_clip": 0.01118567, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.04230094, + "balance_loss_mlp": 1.01972139, + "epoch": 0.5887268901247558, + "flos": 26374164844320.0, + "grad_norm": 1.8743502774492002, + "language_loss": 0.76223958, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.78374815, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12579346, + "step": 9792, + "time_per_iteration": 2.6203742027282715 + }, + { + "auxiliary_loss_clip": 0.01116943, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.03930533, + "balance_loss_mlp": 1.02100646, + "epoch": 0.5887870133774237, + "flos": 24992155216800.0, + "grad_norm": 1.9109037790540033, + "language_loss": 0.68676025, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.70826155, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12188721, + "step": 9793, + "time_per_iteration": 2.7029330730438232 + }, + { + "auxiliary_loss_clip": 0.01112994, + "auxiliary_loss_mlp": 0.01028985, + "balance_loss_clip": 1.04009485, + "balance_loss_mlp": 1.01695716, + "epoch": 0.5888471366300917, + "flos": 23435622548640.0, + "grad_norm": 2.4187718484810325, + "language_loss": 0.60338187, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.62480164, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12017822, + "step": 9794, + "time_per_iteration": 2.5784943103790283 + }, + { + "auxiliary_loss_clip": 0.01118586, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.04245925, + "balance_loss_mlp": 1.02180505, + "epoch": 0.5889072598827596, + "flos": 24371073702720.0, + "grad_norm": 2.825150486193083, + "language_loss": 0.65256643, + "learning_rate": 1.525718531219257e-06, + "loss": 0.6740973, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12713623, + "step": 9795, + "time_per_iteration": 2.768798589706421 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.03957498, + "balance_loss_mlp": 1.0218544, + "epoch": 0.5889673831354276, + "flos": 25308469686240.0, + "grad_norm": 1.7183495836499771, + "language_loss": 0.74009162, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.76154923, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10955811, + "step": 9796, + "time_per_iteration": 2.6801440715789795 + }, + { + "auxiliary_loss_clip": 0.01116217, + "auxiliary_loss_mlp": 0.01033271, + "balance_loss_clip": 1.04162276, + "balance_loss_mlp": 1.02196968, + "epoch": 0.5890275063880956, + "flos": 30872643259680.0, + "grad_norm": 1.4830464769377885, + "language_loss": 0.83092809, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.85242295, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11297607, + "step": 9797, + "time_per_iteration": 2.71738600730896 + }, + { + "auxiliary_loss_clip": 0.01111845, + "auxiliary_loss_mlp": 0.01032963, + "balance_loss_clip": 1.03905892, + "balance_loss_mlp": 1.02089262, + "epoch": 0.5890876296407636, + "flos": 14355455925600.0, + "grad_norm": 1.7019914193998396, + "language_loss": 0.79177982, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.81322789, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.1206665, + "step": 9798, + "time_per_iteration": 2.6343493461608887 + }, + { + "auxiliary_loss_clip": 0.01112454, + "auxiliary_loss_mlp": 0.01032786, + "balance_loss_clip": 1.03957009, + "balance_loss_mlp": 1.02194941, + "epoch": 0.5891477528934316, + "flos": 16581360355200.0, + "grad_norm": 2.5251441865262825, + "language_loss": 0.74552333, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.7669757, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.1083374, + "step": 9799, + "time_per_iteration": 2.7290732860565186 + }, + { + "auxiliary_loss_clip": 0.01118079, + "auxiliary_loss_mlp": 0.01030086, + "balance_loss_clip": 1.04066682, + "balance_loss_mlp": 1.01727068, + "epoch": 0.5892078761460995, + "flos": 18362610623520.0, + "grad_norm": 16.450306399713533, + "language_loss": 0.76287293, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.78435457, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12811279, + "step": 9800, + "time_per_iteration": 2.6144819259643555 + }, + { + "auxiliary_loss_clip": 0.01116723, + "auxiliary_loss_mlp": 0.01036361, + "balance_loss_clip": 1.03989828, + "balance_loss_mlp": 1.02467871, + "epoch": 0.5892679993987675, + "flos": 19252283601600.0, + "grad_norm": 2.1381105196524866, + "language_loss": 0.78846633, + "learning_rate": 1.523448741022722e-06, + "loss": 0.8099972, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11700439, + "step": 9801, + "time_per_iteration": 2.633190155029297 + }, + { + "auxiliary_loss_clip": 0.01117505, + "auxiliary_loss_mlp": 0.0103292, + "balance_loss_clip": 1.04021919, + "balance_loss_mlp": 1.02067149, + "epoch": 0.5893281226514354, + "flos": 30828971982240.0, + "grad_norm": 2.559026676054318, + "language_loss": 0.66188496, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68338919, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12249756, + "step": 9802, + "time_per_iteration": 4.135786533355713 + }, + { + "auxiliary_loss_clip": 0.01114226, + "auxiliary_loss_mlp": 0.01026824, + "balance_loss_clip": 1.04016733, + "balance_loss_mlp": 1.01455092, + "epoch": 0.5893882459041034, + "flos": 23743387872000.0, + "grad_norm": 1.6413794392234016, + "language_loss": 0.78025579, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.80166626, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12261963, + "step": 9803, + "time_per_iteration": 2.5898568630218506 + }, + { + "auxiliary_loss_clip": 0.01117401, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.04095817, + "balance_loss_mlp": 1.02427757, + "epoch": 0.5894483691567713, + "flos": 25177455853920.0, + "grad_norm": 1.5374060181083058, + "language_loss": 0.72960097, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75113797, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12011719, + "step": 9804, + "time_per_iteration": 2.7339305877685547 + }, + { + "auxiliary_loss_clip": 0.01116839, + "auxiliary_loss_mlp": 0.01029109, + "balance_loss_clip": 1.0420779, + "balance_loss_mlp": 1.01740265, + "epoch": 0.5895084924094394, + "flos": 21693465105120.0, + "grad_norm": 2.9409033428417035, + "language_loss": 0.74497569, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.76643521, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11706543, + "step": 9805, + "time_per_iteration": 2.6240458488464355 + }, + { + "auxiliary_loss_clip": 0.01120297, + "auxiliary_loss_mlp": 0.01028121, + "balance_loss_clip": 1.0403403, + "balance_loss_mlp": 1.01540148, + "epoch": 0.5895686156621073, + "flos": 24551674335360.0, + "grad_norm": 2.916914314598025, + "language_loss": 0.7787174, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.80020159, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.1272583, + "step": 9806, + "time_per_iteration": 2.674489736557007 + }, + { + "auxiliary_loss_clip": 0.01114432, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.03874326, + "balance_loss_mlp": 1.01768565, + "epoch": 0.5896287389147753, + "flos": 25441306796160.0, + "grad_norm": 2.1313485551025613, + "language_loss": 0.77299875, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.79444075, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12084961, + "step": 9807, + "time_per_iteration": 2.6951847076416016 + }, + { + "auxiliary_loss_clip": 0.01118853, + "auxiliary_loss_mlp": 0.01030062, + "balance_loss_clip": 1.0420748, + "balance_loss_mlp": 1.01771784, + "epoch": 0.5896888621674432, + "flos": 17738733417120.0, + "grad_norm": 2.032888420592872, + "language_loss": 0.74288273, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.76437193, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12347412, + "step": 9808, + "time_per_iteration": 2.688472270965576 + }, + { + "auxiliary_loss_clip": 0.01119199, + "auxiliary_loss_mlp": 0.01028232, + "balance_loss_clip": 1.0411799, + "balance_loss_mlp": 1.0154469, + "epoch": 0.5897489854201112, + "flos": 25486072040160.0, + "grad_norm": 2.430597607509248, + "language_loss": 0.72141767, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.74289203, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12780762, + "step": 9809, + "time_per_iteration": 4.077239513397217 + }, + { + "auxiliary_loss_clip": 0.01119686, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.04116344, + "balance_loss_mlp": 1.01969337, + "epoch": 0.5898091086727792, + "flos": 24417581189760.0, + "grad_norm": 4.797882554158741, + "language_loss": 0.82220602, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.84372354, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12359619, + "step": 9810, + "time_per_iteration": 2.6704094409942627 + }, + { + "auxiliary_loss_clip": 0.01115765, + "auxiliary_loss_mlp": 0.01032232, + "balance_loss_clip": 1.04211116, + "balance_loss_mlp": 1.02023959, + "epoch": 0.5898692319254472, + "flos": 19831881771360.0, + "grad_norm": 1.6318653675324637, + "language_loss": 0.81245625, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.83393615, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11999512, + "step": 9811, + "time_per_iteration": 2.7109477519989014 + }, + { + "auxiliary_loss_clip": 0.01119741, + "auxiliary_loss_mlp": 0.01029445, + "balance_loss_clip": 1.04228079, + "balance_loss_mlp": 1.01651073, + "epoch": 0.5899293551781152, + "flos": 24951360666240.0, + "grad_norm": 2.4680803329601466, + "language_loss": 0.77149308, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.79298496, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12927246, + "step": 9812, + "time_per_iteration": 2.6340253353118896 + }, + { + "auxiliary_loss_clip": 0.01115658, + "auxiliary_loss_mlp": 0.01029821, + "balance_loss_clip": 1.0413779, + "balance_loss_mlp": 1.0187223, + "epoch": 0.5899894784307831, + "flos": 16937942650560.0, + "grad_norm": 1.832348607244664, + "language_loss": 0.70723975, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.72869444, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11102295, + "step": 9813, + "time_per_iteration": 4.064470529556274 + }, + { + "auxiliary_loss_clip": 0.01116437, + "auxiliary_loss_mlp": 0.01033599, + "balance_loss_clip": 1.04073071, + "balance_loss_mlp": 1.02146959, + "epoch": 0.5900496016834511, + "flos": 24506260814880.0, + "grad_norm": 1.6789427948033897, + "language_loss": 0.71970439, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74120474, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12133789, + "step": 9814, + "time_per_iteration": 2.6504695415496826 + }, + { + "auxiliary_loss_clip": 0.01117287, + "auxiliary_loss_mlp": 0.01028179, + "balance_loss_clip": 1.04132605, + "balance_loss_mlp": 1.01613927, + "epoch": 0.590109724936119, + "flos": 24720889612320.0, + "grad_norm": 1.7724314634462706, + "language_loss": 0.78542221, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.8068769, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12054443, + "step": 9815, + "time_per_iteration": 2.605790853500366 + }, + { + "auxiliary_loss_clip": 0.01122026, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.04154706, + "balance_loss_mlp": 1.0218749, + "epoch": 0.590169848188787, + "flos": 29572708940640.0, + "grad_norm": 2.3668262432889735, + "language_loss": 0.75335896, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.77493298, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13513184, + "step": 9816, + "time_per_iteration": 2.6627466678619385 + }, + { + "auxiliary_loss_clip": 0.01116137, + "auxiliary_loss_mlp": 0.0103326, + "balance_loss_clip": 1.04164898, + "balance_loss_mlp": 1.02098167, + "epoch": 0.590229971441455, + "flos": 21700798732800.0, + "grad_norm": 2.1214169055093226, + "language_loss": 0.81585681, + "learning_rate": 1.517399156051309e-06, + "loss": 0.83735079, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.1229248, + "step": 9817, + "time_per_iteration": 2.589390516281128 + }, + { + "auxiliary_loss_clip": 0.01117212, + "auxiliary_loss_mlp": 0.01032844, + "balance_loss_clip": 1.04085088, + "balance_loss_mlp": 1.02117896, + "epoch": 0.590290094694123, + "flos": 27133593818400.0, + "grad_norm": 1.729413984538033, + "language_loss": 0.76554114, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78704166, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11676025, + "step": 9818, + "time_per_iteration": 2.68143367767334 + }, + { + "auxiliary_loss_clip": 0.01114094, + "auxiliary_loss_mlp": 0.01028407, + "balance_loss_clip": 1.03996778, + "balance_loss_mlp": 1.01646781, + "epoch": 0.5903502179467909, + "flos": 23304081991680.0, + "grad_norm": 2.788912202651951, + "language_loss": 0.66618741, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.68761241, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11938477, + "step": 9819, + "time_per_iteration": 2.614985942840576 + }, + { + "auxiliary_loss_clip": 0.01116279, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.04079247, + "balance_loss_mlp": 1.0171082, + "epoch": 0.5904103411994589, + "flos": 29573195148000.0, + "grad_norm": 3.645779417669693, + "language_loss": 0.77918512, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.80063915, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12023926, + "step": 9820, + "time_per_iteration": 2.663644552230835 + }, + { + "auxiliary_loss_clip": 0.01041211, + "auxiliary_loss_mlp": 0.01006686, + "balance_loss_clip": 1.01709366, + "balance_loss_mlp": 1.00547194, + "epoch": 0.5904704644521268, + "flos": 79163613221760.0, + "grad_norm": 0.9354871822410596, + "language_loss": 0.65094733, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.6714263, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.24133301, + "router_z_loss_mlp": 0.012146, + "step": 9821, + "time_per_iteration": 3.269984006881714 + }, + { + "auxiliary_loss_clip": 0.01114307, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.04025626, + "balance_loss_mlp": 1.02127552, + "epoch": 0.5905305877047948, + "flos": 23928040232640.0, + "grad_norm": 2.71526738470659, + "language_loss": 0.61278212, + "learning_rate": 1.515509618752521e-06, + "loss": 0.63425672, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11877441, + "step": 9822, + "time_per_iteration": 2.5903117656707764 + }, + { + "auxiliary_loss_clip": 0.01117831, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.04013598, + "balance_loss_mlp": 1.0233835, + "epoch": 0.5905907109574628, + "flos": 23171042295360.0, + "grad_norm": 2.7751293477964003, + "language_loss": 0.8270793, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.84861398, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12268066, + "step": 9823, + "time_per_iteration": 2.6412763595581055 + }, + { + "auxiliary_loss_clip": 0.01115763, + "auxiliary_loss_mlp": 0.01028224, + "balance_loss_clip": 1.04069805, + "balance_loss_mlp": 1.01624322, + "epoch": 0.5906508342101308, + "flos": 27089071678080.0, + "grad_norm": 2.1665235092278117, + "language_loss": 0.7307049, + "learning_rate": 1.514753932336165e-06, + "loss": 0.75214475, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11981201, + "step": 9824, + "time_per_iteration": 2.639854669570923 + }, + { + "auxiliary_loss_clip": 0.01124753, + "auxiliary_loss_mlp": 0.01033242, + "balance_loss_clip": 1.04196453, + "balance_loss_mlp": 1.02003336, + "epoch": 0.5907109574627988, + "flos": 25485991005600.0, + "grad_norm": 2.3373215954567357, + "language_loss": 0.82973194, + "learning_rate": 1.514376116721693e-06, + "loss": 0.85131192, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 0.82714844, + "router_z_loss_mlp": 0.13214111, + "step": 9825, + "time_per_iteration": 2.6916255950927734 + }, + { + "auxiliary_loss_clip": 0.01112098, + "auxiliary_loss_mlp": 0.0103083, + "balance_loss_clip": 1.04006922, + "balance_loss_mlp": 1.02035713, + "epoch": 0.5907710807154667, + "flos": 26242543252800.0, + "grad_norm": 1.893192725704772, + "language_loss": 0.76659244, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.78802168, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10461426, + "step": 9826, + "time_per_iteration": 2.6245033740997314 + }, + { + "auxiliary_loss_clip": 0.011161, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.04114366, + "balance_loss_mlp": 1.02016437, + "epoch": 0.5908312039681347, + "flos": 26866906666560.0, + "grad_norm": 1.6475999484989559, + "language_loss": 0.72013992, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74161863, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11602783, + "step": 9827, + "time_per_iteration": 2.7174298763275146 + }, + { + "auxiliary_loss_clip": 0.01115673, + "auxiliary_loss_mlp": 0.01030862, + "balance_loss_clip": 1.03940094, + "balance_loss_mlp": 1.01960826, + "epoch": 0.5908913272208026, + "flos": 22548502159200.0, + "grad_norm": 1.630654582586103, + "language_loss": 0.79811209, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.81957746, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11242676, + "step": 9828, + "time_per_iteration": 2.599677801132202 + }, + { + "auxiliary_loss_clip": 0.01117956, + "auxiliary_loss_mlp": 0.0103309, + "balance_loss_clip": 1.04050004, + "balance_loss_mlp": 1.02068031, + "epoch": 0.5909514504734706, + "flos": 15022761305760.0, + "grad_norm": 3.4824598790248116, + "language_loss": 0.87961924, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90112972, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12420654, + "step": 9829, + "time_per_iteration": 2.695798873901367 + }, + { + "auxiliary_loss_clip": 0.01039766, + "auxiliary_loss_mlp": 0.0100458, + "balance_loss_clip": 1.01564884, + "balance_loss_mlp": 1.0033648, + "epoch": 0.5910115737261386, + "flos": 85674906925920.0, + "grad_norm": 0.7616813677328373, + "language_loss": 0.57921529, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.59965879, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.24108887, + "router_z_loss_mlp": 0.01213837, + "step": 9830, + "time_per_iteration": 3.168858766555786 + }, + { + "auxiliary_loss_clip": 0.01125499, + "auxiliary_loss_mlp": 0.01034988, + "balance_loss_clip": 1.04307044, + "balance_loss_mlp": 1.02141631, + "epoch": 0.5910716969788066, + "flos": 26866217872800.0, + "grad_norm": 3.443324245580095, + "language_loss": 0.75827801, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.77988291, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 0.82470703, + "router_z_loss_mlp": 0.13580322, + "step": 9831, + "time_per_iteration": 2.6668152809143066 + }, + { + "auxiliary_loss_clip": 0.01112892, + "auxiliary_loss_mlp": 0.01029773, + "balance_loss_clip": 1.04090667, + "balance_loss_mlp": 1.01800716, + "epoch": 0.5911318202314745, + "flos": 25931333960640.0, + "grad_norm": 2.273337197700672, + "language_loss": 0.7781356, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.79956228, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11761475, + "step": 9832, + "time_per_iteration": 2.683525323867798 + }, + { + "auxiliary_loss_clip": 0.01114206, + "auxiliary_loss_mlp": 0.01026965, + "balance_loss_clip": 1.03876376, + "balance_loss_mlp": 1.01536012, + "epoch": 0.5911919434841425, + "flos": 21745766563200.0, + "grad_norm": 1.9303810218584554, + "language_loss": 0.83669746, + "learning_rate": 1.511354255945847e-06, + "loss": 0.85810912, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.1159668, + "step": 9833, + "time_per_iteration": 2.642050266265869 + }, + { + "auxiliary_loss_clip": 0.01115829, + "auxiliary_loss_mlp": 0.01032147, + "balance_loss_clip": 1.03994596, + "balance_loss_mlp": 1.02009511, + "epoch": 0.5912520667368104, + "flos": 24861425005440.0, + "grad_norm": 1.6711051237492571, + "language_loss": 0.74550283, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76698256, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12042236, + "step": 9834, + "time_per_iteration": 2.668628215789795 + }, + { + "auxiliary_loss_clip": 0.01115858, + "auxiliary_loss_mlp": 0.01031354, + "balance_loss_clip": 1.03944445, + "balance_loss_mlp": 1.01947534, + "epoch": 0.5913121899894784, + "flos": 21878765742240.0, + "grad_norm": 3.2231467710244086, + "language_loss": 0.78367639, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.80514854, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11877441, + "step": 9835, + "time_per_iteration": 2.5797572135925293 + }, + { + "auxiliary_loss_clip": 0.01117778, + "auxiliary_loss_mlp": 0.01032217, + "balance_loss_clip": 1.04068553, + "balance_loss_mlp": 1.02029634, + "epoch": 0.5913723132421465, + "flos": 26999460155520.0, + "grad_norm": 2.6892937274851967, + "language_loss": 0.74077797, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.76227796, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.11932373, + "step": 9836, + "time_per_iteration": 2.6728076934814453 + }, + { + "auxiliary_loss_clip": 0.01117115, + "auxiliary_loss_mlp": 0.01026815, + "balance_loss_clip": 1.04096091, + "balance_loss_mlp": 1.01503086, + "epoch": 0.5914324364948144, + "flos": 19154973795840.0, + "grad_norm": 2.244840937891783, + "language_loss": 0.82270479, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.84414405, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11791992, + "step": 9837, + "time_per_iteration": 2.591420888900757 + }, + { + "auxiliary_loss_clip": 0.01116034, + "auxiliary_loss_mlp": 0.01030157, + "balance_loss_clip": 1.03953433, + "balance_loss_mlp": 1.01774096, + "epoch": 0.5914925597474824, + "flos": 27756498610080.0, + "grad_norm": 1.7967192675788208, + "language_loss": 0.7922141, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81367606, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12408447, + "step": 9838, + "time_per_iteration": 2.6425905227661133 + }, + { + "auxiliary_loss_clip": 0.01117515, + "auxiliary_loss_mlp": 0.01035172, + "balance_loss_clip": 1.04112482, + "balance_loss_mlp": 1.02321577, + "epoch": 0.5915526830001503, + "flos": 22321029384000.0, + "grad_norm": 2.2683079486167435, + "language_loss": 0.70071185, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.72223878, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.11956787, + "step": 9839, + "time_per_iteration": 2.5941758155822754 + }, + { + "auxiliary_loss_clip": 0.01117311, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.03979826, + "balance_loss_mlp": 1.02566791, + "epoch": 0.5916128062528183, + "flos": 20766441545280.0, + "grad_norm": 2.3044705299348105, + "language_loss": 0.6574012, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.67895091, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.11981201, + "step": 9840, + "time_per_iteration": 2.6777172088623047 + }, + { + "auxiliary_loss_clip": 0.01119123, + "auxiliary_loss_mlp": 0.01027841, + "balance_loss_clip": 1.04126549, + "balance_loss_mlp": 1.0155859, + "epoch": 0.5916729295054862, + "flos": 30205459431360.0, + "grad_norm": 1.8709964947617546, + "language_loss": 0.81292772, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.83439738, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12255859, + "step": 9841, + "time_per_iteration": 4.119814395904541 + }, + { + "auxiliary_loss_clip": 0.01113969, + "auxiliary_loss_mlp": 0.0102681, + "balance_loss_clip": 1.03990233, + "balance_loss_mlp": 1.0156343, + "epoch": 0.5917330527581542, + "flos": 19472139128160.0, + "grad_norm": 2.370331294221331, + "language_loss": 0.69152975, + "learning_rate": 1.507956080444291e-06, + "loss": 0.71293759, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11181641, + "step": 9842, + "time_per_iteration": 4.058468341827393 + }, + { + "auxiliary_loss_clip": 0.01115229, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.03854978, + "balance_loss_mlp": 1.02170157, + "epoch": 0.5917931760108222, + "flos": 29042373432960.0, + "grad_norm": 1.8309920933050703, + "language_loss": 0.82474297, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.84622991, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11755371, + "step": 9843, + "time_per_iteration": 2.7310831546783447 + }, + { + "auxiliary_loss_clip": 0.01115653, + "auxiliary_loss_mlp": 0.0102701, + "balance_loss_clip": 1.03935516, + "balance_loss_mlp": 1.01436758, + "epoch": 0.5918532992634902, + "flos": 28370205979200.0, + "grad_norm": 2.5921108962949306, + "language_loss": 0.81549394, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.83692062, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12634277, + "step": 9844, + "time_per_iteration": 2.608853578567505 + }, + { + "auxiliary_loss_clip": 0.01119722, + "auxiliary_loss_mlp": 0.01028152, + "balance_loss_clip": 1.04266274, + "balance_loss_mlp": 1.01636171, + "epoch": 0.5919134225161581, + "flos": 23793703983360.0, + "grad_norm": 3.5063230920000734, + "language_loss": 0.73836505, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.75984377, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.11791992, + "step": 9845, + "time_per_iteration": 2.634800672531128 + }, + { + "auxiliary_loss_clip": 0.01117941, + "auxiliary_loss_mlp": 0.01027058, + "balance_loss_clip": 1.04030252, + "balance_loss_mlp": 1.01430213, + "epoch": 0.5919735457688261, + "flos": 47346361902720.0, + "grad_norm": 1.8770466267085422, + "language_loss": 0.63842839, + "learning_rate": 1.506446264718213e-06, + "loss": 0.65987837, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12762451, + "step": 9846, + "time_per_iteration": 2.881113052368164 + }, + { + "auxiliary_loss_clip": 0.01111365, + "auxiliary_loss_mlp": 0.01024874, + "balance_loss_clip": 1.03965914, + "balance_loss_mlp": 1.01417518, + "epoch": 0.592033669021494, + "flos": 27044630572320.0, + "grad_norm": 2.129048068787904, + "language_loss": 0.765302, + "learning_rate": 1.506068857539931e-06, + "loss": 0.78666437, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10705566, + "step": 9847, + "time_per_iteration": 2.613966941833496 + }, + { + "auxiliary_loss_clip": 0.01118787, + "auxiliary_loss_mlp": 0.010307, + "balance_loss_clip": 1.04147649, + "balance_loss_mlp": 1.01815319, + "epoch": 0.592093792274162, + "flos": 27712138538880.0, + "grad_norm": 1.7653705579142818, + "language_loss": 0.6232549, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.64474976, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12542725, + "step": 9848, + "time_per_iteration": 2.6723179817199707 + }, + { + "auxiliary_loss_clip": 0.01117212, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.04009104, + "balance_loss_mlp": 1.02555668, + "epoch": 0.59215391552683, + "flos": 27489973527360.0, + "grad_norm": 1.796011971333033, + "language_loss": 0.75658029, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.77812696, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.11895752, + "step": 9849, + "time_per_iteration": 4.12116813659668 + }, + { + "auxiliary_loss_clip": 0.01118364, + "auxiliary_loss_mlp": 0.01033405, + "balance_loss_clip": 1.04115033, + "balance_loss_mlp": 1.02131712, + "epoch": 0.592214038779498, + "flos": 29894007035520.0, + "grad_norm": 2.0790028662485613, + "language_loss": 0.75309634, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.77461404, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12091064, + "step": 9850, + "time_per_iteration": 2.696634531021118 + }, + { + "auxiliary_loss_clip": 0.01113153, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.03850222, + "balance_loss_mlp": 1.02103424, + "epoch": 0.592274162032166, + "flos": 26639109753120.0, + "grad_norm": 2.0403713281705413, + "language_loss": 0.757191, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.77865225, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.1194458, + "step": 9851, + "time_per_iteration": 2.6285831928253174 + }, + { + "auxiliary_loss_clip": 0.01117156, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.04047012, + "balance_loss_mlp": 1.01782954, + "epoch": 0.5923342852848339, + "flos": 30027208800960.0, + "grad_norm": 2.025961590758011, + "language_loss": 0.70917928, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.73064798, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11901855, + "step": 9852, + "time_per_iteration": 2.6559178829193115 + }, + { + "auxiliary_loss_clip": 0.01121144, + "auxiliary_loss_mlp": 0.0103643, + "balance_loss_clip": 1.04156506, + "balance_loss_mlp": 1.02453268, + "epoch": 0.5923944085375019, + "flos": 24328496391840.0, + "grad_norm": 2.2244499329695597, + "language_loss": 0.80526394, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82683969, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.11901855, + "step": 9853, + "time_per_iteration": 4.072290420532227 + }, + { + "auxiliary_loss_clip": 0.01113099, + "auxiliary_loss_mlp": 0.01026529, + "balance_loss_clip": 1.03960347, + "balance_loss_mlp": 1.01546073, + "epoch": 0.5924545317901698, + "flos": 34969733618400.0, + "grad_norm": 2.660910910294898, + "language_loss": 0.67509043, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.69648671, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11065674, + "step": 9854, + "time_per_iteration": 2.653677225112915 + }, + { + "auxiliary_loss_clip": 0.01114513, + "auxiliary_loss_mlp": 0.01025804, + "balance_loss_clip": 1.03978753, + "balance_loss_mlp": 1.01399088, + "epoch": 0.5925146550428378, + "flos": 24239127972960.0, + "grad_norm": 2.412223305589531, + "language_loss": 0.89033914, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.91174233, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11816406, + "step": 9855, + "time_per_iteration": 2.6396336555480957 + }, + { + "auxiliary_loss_clip": 0.01113276, + "auxiliary_loss_mlp": 0.01027123, + "balance_loss_clip": 1.04014885, + "balance_loss_mlp": 1.01610184, + "epoch": 0.5925747782955058, + "flos": 18452586801600.0, + "grad_norm": 3.2855329124717985, + "language_loss": 0.87272918, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.89413315, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11029053, + "step": 9856, + "time_per_iteration": 2.6058735847473145 + }, + { + "auxiliary_loss_clip": 0.01117302, + "auxiliary_loss_mlp": 0.0103237, + "balance_loss_clip": 1.04072547, + "balance_loss_mlp": 1.02080643, + "epoch": 0.5926349015481738, + "flos": 22458201842880.0, + "grad_norm": 2.6208259297847873, + "language_loss": 0.77193785, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79343456, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11560059, + "step": 9857, + "time_per_iteration": 2.616771936416626 + }, + { + "auxiliary_loss_clip": 0.01120617, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.04551482, + "balance_loss_mlp": 1.02204001, + "epoch": 0.5926950248008417, + "flos": 28200545012160.0, + "grad_norm": 2.0984624506889444, + "language_loss": 0.6506139, + "learning_rate": 1.501918617901419e-06, + "loss": 0.67215884, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11834717, + "step": 9858, + "time_per_iteration": 2.639242172241211 + }, + { + "auxiliary_loss_clip": 0.01115056, + "auxiliary_loss_mlp": 0.01029481, + "balance_loss_clip": 1.04061818, + "balance_loss_mlp": 1.01732123, + "epoch": 0.5927551480535097, + "flos": 34207954642080.0, + "grad_norm": 1.9594506518192758, + "language_loss": 0.77070439, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79214978, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12164307, + "step": 9859, + "time_per_iteration": 2.69523024559021 + }, + { + "auxiliary_loss_clip": 0.01119733, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.04251242, + "balance_loss_mlp": 1.02224255, + "epoch": 0.5928152713061776, + "flos": 26598882444480.0, + "grad_norm": 2.513503750684433, + "language_loss": 0.74980223, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.77134883, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12683105, + "step": 9860, + "time_per_iteration": 2.663719892501831 + }, + { + "auxiliary_loss_clip": 0.01116272, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.04172421, + "balance_loss_mlp": 1.01887512, + "epoch": 0.5928753945588456, + "flos": 29671679954880.0, + "grad_norm": 6.057932620017983, + "language_loss": 0.76272857, + "learning_rate": 1.500787130195763e-06, + "loss": 0.78418744, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.10736084, + "step": 9861, + "time_per_iteration": 2.675140619277954 + }, + { + "auxiliary_loss_clip": 0.01112696, + "auxiliary_loss_mlp": 0.0102742, + "balance_loss_clip": 1.03810799, + "balance_loss_mlp": 1.01613081, + "epoch": 0.5929355178115137, + "flos": 32293016400960.0, + "grad_norm": 1.8874510068578, + "language_loss": 0.70625311, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.72765422, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11291504, + "step": 9862, + "time_per_iteration": 2.72977352142334 + }, + { + "auxiliary_loss_clip": 0.01116344, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.04057336, + "balance_loss_mlp": 1.02061021, + "epoch": 0.5929956410641816, + "flos": 30464529334560.0, + "grad_norm": 2.081612874097026, + "language_loss": 0.77841687, + "learning_rate": 1.500032899685832e-06, + "loss": 0.79990226, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11590576, + "step": 9863, + "time_per_iteration": 2.64620304107666 + }, + { + "auxiliary_loss_clip": 0.01118505, + "auxiliary_loss_mlp": 0.01037107, + "balance_loss_clip": 1.04263806, + "balance_loss_mlp": 1.02467918, + "epoch": 0.5930557643168496, + "flos": 31980348486720.0, + "grad_norm": 1.8567622978207747, + "language_loss": 0.70746279, + "learning_rate": 1.499655812861921e-06, + "loss": 0.72901893, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12420654, + "step": 9864, + "time_per_iteration": 2.7079451084136963 + }, + { + "auxiliary_loss_clip": 0.01118521, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.04118252, + "balance_loss_mlp": 1.01910162, + "epoch": 0.5931158875695175, + "flos": 33989233599360.0, + "grad_norm": 2.0790637875963736, + "language_loss": 0.67276096, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.69426084, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12371826, + "step": 9865, + "time_per_iteration": 2.678053617477417 + }, + { + "auxiliary_loss_clip": 0.01118539, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.04093432, + "balance_loss_mlp": 1.02004218, + "epoch": 0.5931760108221855, + "flos": 18807386336640.0, + "grad_norm": 2.667718125993559, + "language_loss": 0.78167671, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.80318153, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11907959, + "step": 9866, + "time_per_iteration": 2.703491449356079 + }, + { + "auxiliary_loss_clip": 0.0111346, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.04035163, + "balance_loss_mlp": 1.01787114, + "epoch": 0.5932361340748534, + "flos": 36836989371360.0, + "grad_norm": 3.0834513652000726, + "language_loss": 0.72415984, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.74558961, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11663818, + "step": 9867, + "time_per_iteration": 2.707202911376953 + }, + { + "auxiliary_loss_clip": 0.01114999, + "auxiliary_loss_mlp": 0.01031127, + "balance_loss_clip": 1.04047763, + "balance_loss_mlp": 1.01841879, + "epoch": 0.5932962573275214, + "flos": 24596196475680.0, + "grad_norm": 1.6256126835920868, + "language_loss": 0.66430533, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.68576658, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.1270752, + "step": 9868, + "time_per_iteration": 2.660451650619507 + }, + { + "auxiliary_loss_clip": 0.01119516, + "auxiliary_loss_mlp": 0.01033297, + "balance_loss_clip": 1.04250526, + "balance_loss_mlp": 1.02134597, + "epoch": 0.5933563805801894, + "flos": 31049637854400.0, + "grad_norm": 2.3301864923406983, + "language_loss": 0.75746399, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.77899206, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11950684, + "step": 9869, + "time_per_iteration": 2.656924247741699 + }, + { + "auxiliary_loss_clip": 0.01121095, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.04310334, + "balance_loss_mlp": 1.02148938, + "epoch": 0.5934165038328574, + "flos": 73210200425280.0, + "grad_norm": 1.5785698322350779, + "language_loss": 0.74236405, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76391536, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12548828, + "step": 9870, + "time_per_iteration": 2.983081102371216 + }, + { + "auxiliary_loss_clip": 0.01119647, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.04271364, + "balance_loss_mlp": 1.01786935, + "epoch": 0.5934766270855253, + "flos": 29798763611040.0, + "grad_norm": 2.2667911786440444, + "language_loss": 0.71884751, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.74034536, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12280273, + "step": 9871, + "time_per_iteration": 2.668802261352539 + }, + { + "auxiliary_loss_clip": 0.01119321, + "auxiliary_loss_mlp": 0.01028936, + "balance_loss_clip": 1.04218602, + "balance_loss_mlp": 1.0165683, + "epoch": 0.5935367503381933, + "flos": 28691058384000.0, + "grad_norm": 2.217822203942803, + "language_loss": 0.74396336, + "learning_rate": 1.496639802503271e-06, + "loss": 0.76544595, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12359619, + "step": 9872, + "time_per_iteration": 2.6567018032073975 + }, + { + "auxiliary_loss_clip": 0.0112079, + "auxiliary_loss_mlp": 0.01036242, + "balance_loss_clip": 1.04153919, + "balance_loss_mlp": 1.02340353, + "epoch": 0.5935968735908612, + "flos": 23121090839520.0, + "grad_norm": 2.1670118562244935, + "language_loss": 0.78824329, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.80981362, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.128479, + "step": 9873, + "time_per_iteration": 2.652078151702881 + }, + { + "auxiliary_loss_clip": 0.01118856, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.04174948, + "balance_loss_mlp": 1.02188718, + "epoch": 0.5936569968435292, + "flos": 31095334995840.0, + "grad_norm": 1.707748056440093, + "language_loss": 0.85008526, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87161762, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12512207, + "step": 9874, + "time_per_iteration": 2.6760270595550537 + }, + { + "auxiliary_loss_clip": 0.01037582, + "auxiliary_loss_mlp": 0.01003298, + "balance_loss_clip": 1.01373231, + "balance_loss_mlp": 1.00202453, + "epoch": 0.5937171200961973, + "flos": 80995625291520.0, + "grad_norm": 0.7133001794650273, + "language_loss": 0.59997076, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62037957, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.23852539, + "router_z_loss_mlp": 0.01273346, + "step": 9875, + "time_per_iteration": 3.358454704284668 + }, + { + "auxiliary_loss_clip": 0.01121776, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.04075611, + "balance_loss_mlp": 1.01702356, + "epoch": 0.5937772433488652, + "flos": 18184765165920.0, + "grad_norm": 2.263658302000531, + "language_loss": 0.77268434, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.79420519, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 0.81005859, + "router_z_loss_mlp": 0.13293457, + "step": 9876, + "time_per_iteration": 2.6384613513946533 + }, + { + "auxiliary_loss_clip": 0.01112183, + "auxiliary_loss_mlp": 0.0102868, + "balance_loss_clip": 1.03864014, + "balance_loss_mlp": 1.01720059, + "epoch": 0.5938373666015332, + "flos": 27530038766880.0, + "grad_norm": 2.278377592799446, + "language_loss": 0.75703323, + "learning_rate": 1.494755415907243e-06, + "loss": 0.77844191, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11474609, + "step": 9877, + "time_per_iteration": 2.653848648071289 + }, + { + "auxiliary_loss_clip": 0.01118123, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.03980565, + "balance_loss_mlp": 1.01751089, + "epoch": 0.5938974898542011, + "flos": 22501548982080.0, + "grad_norm": 2.4381104919881795, + "language_loss": 0.81290656, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.83439076, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12792969, + "step": 9878, + "time_per_iteration": 2.5913448333740234 + }, + { + "auxiliary_loss_clip": 0.01118733, + "auxiliary_loss_mlp": 0.01037905, + "balance_loss_clip": 1.04075789, + "balance_loss_mlp": 1.02584743, + "epoch": 0.5939576131068691, + "flos": 55626953277600.0, + "grad_norm": 1.784931404010155, + "language_loss": 0.71354425, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.73511064, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12054443, + "step": 9879, + "time_per_iteration": 2.8271467685699463 + }, + { + "auxiliary_loss_clip": 0.01116898, + "auxiliary_loss_mlp": 0.01034583, + "balance_loss_clip": 1.0415585, + "balance_loss_mlp": 1.02264416, + "epoch": 0.594017736359537, + "flos": 28781885424960.0, + "grad_norm": 3.104872501833952, + "language_loss": 0.57168835, + "learning_rate": 1.493625013742401e-06, + "loss": 0.59320313, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11938477, + "step": 9880, + "time_per_iteration": 2.6504907608032227 + }, + { + "auxiliary_loss_clip": 0.01118065, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.04091334, + "balance_loss_mlp": 1.0240705, + "epoch": 0.594077859612205, + "flos": 35944763804640.0, + "grad_norm": 2.0753421525996765, + "language_loss": 0.77654874, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.79809666, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12658691, + "step": 9881, + "time_per_iteration": 5.470816612243652 + }, + { + "auxiliary_loss_clip": 0.01116091, + "auxiliary_loss_mlp": 0.01027583, + "balance_loss_clip": 1.03916228, + "balance_loss_mlp": 1.01542401, + "epoch": 0.594137982864873, + "flos": 20499227668800.0, + "grad_norm": 2.2502434670481053, + "language_loss": 0.82822657, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.84966338, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12164307, + "step": 9882, + "time_per_iteration": 2.5976226329803467 + }, + { + "auxiliary_loss_clip": 0.01117008, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.04094672, + "balance_loss_mlp": 1.02312064, + "epoch": 0.594198106117541, + "flos": 15557310610560.0, + "grad_norm": 2.5917202895215365, + "language_loss": 0.79460013, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81611884, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11737061, + "step": 9883, + "time_per_iteration": 2.6154186725616455 + }, + { + "auxiliary_loss_clip": 0.01124305, + "auxiliary_loss_mlp": 0.01034486, + "balance_loss_clip": 1.04416561, + "balance_loss_mlp": 1.02143884, + "epoch": 0.5942582293702089, + "flos": 25620003116640.0, + "grad_norm": 1.8443469261789278, + "language_loss": 0.74558705, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.76717502, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13061523, + "step": 9884, + "time_per_iteration": 2.608400583267212 + }, + { + "auxiliary_loss_clip": 0.01121765, + "auxiliary_loss_mlp": 0.01032547, + "balance_loss_clip": 1.0451746, + "balance_loss_mlp": 1.02058482, + "epoch": 0.5943183526228769, + "flos": 34522000143840.0, + "grad_norm": 3.488087191167161, + "language_loss": 0.65984571, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.68138885, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11962891, + "step": 9885, + "time_per_iteration": 2.7127561569213867 + }, + { + "auxiliary_loss_clip": 0.01116725, + "auxiliary_loss_mlp": 0.01037458, + "balance_loss_clip": 1.0418849, + "balance_loss_mlp": 1.02534056, + "epoch": 0.5943784758755448, + "flos": 32475683414880.0, + "grad_norm": 2.977310886353984, + "language_loss": 0.77220035, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.79374218, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12121582, + "step": 9886, + "time_per_iteration": 2.706670045852661 + }, + { + "auxiliary_loss_clip": 0.01038503, + "auxiliary_loss_mlp": 0.01001628, + "balance_loss_clip": 1.01458549, + "balance_loss_mlp": 1.00040495, + "epoch": 0.5944385991282128, + "flos": 78326201184480.0, + "grad_norm": 0.834986975864746, + "language_loss": 0.64497614, + "learning_rate": 1.490988081420423e-06, + "loss": 0.66537744, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.23913574, + "router_z_loss_mlp": 0.01222229, + "step": 9887, + "time_per_iteration": 3.1851601600646973 + }, + { + "auxiliary_loss_clip": 0.01115836, + "auxiliary_loss_mlp": 0.01031151, + "balance_loss_clip": 1.03994, + "balance_loss_mlp": 1.01902747, + "epoch": 0.5944987223808808, + "flos": 23881816366560.0, + "grad_norm": 2.5740195836219026, + "language_loss": 0.68908286, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71055269, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12139893, + "step": 9888, + "time_per_iteration": 2.8567681312561035 + }, + { + "auxiliary_loss_clip": 0.01120502, + "auxiliary_loss_mlp": 0.01031054, + "balance_loss_clip": 1.04360008, + "balance_loss_mlp": 1.01809025, + "epoch": 0.5945588456335488, + "flos": 31942309111200.0, + "grad_norm": 2.3424908613053885, + "language_loss": 0.79393703, + "learning_rate": 1.490234845687366e-06, + "loss": 0.81545264, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12969971, + "step": 9889, + "time_per_iteration": 4.213197708129883 + }, + { + "auxiliary_loss_clip": 0.01115852, + "auxiliary_loss_mlp": 0.01029529, + "balance_loss_clip": 1.04055333, + "balance_loss_mlp": 1.01771522, + "epoch": 0.5946189688862168, + "flos": 24949051181280.0, + "grad_norm": 1.8122384050696763, + "language_loss": 0.70576018, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.72721398, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.1182251, + "step": 9890, + "time_per_iteration": 2.686626672744751 + }, + { + "auxiliary_loss_clip": 0.01118587, + "auxiliary_loss_mlp": 0.01031006, + "balance_loss_clip": 1.0408268, + "balance_loss_mlp": 1.01804233, + "epoch": 0.5946790921388847, + "flos": 16396505408160.0, + "grad_norm": 2.696683249189778, + "language_loss": 0.69177723, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71327317, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12976074, + "step": 9891, + "time_per_iteration": 2.5904300212860107 + }, + { + "auxiliary_loss_clip": 0.01116358, + "auxiliary_loss_mlp": 0.01036022, + "balance_loss_clip": 1.04096043, + "balance_loss_mlp": 1.02349305, + "epoch": 0.5947392153915527, + "flos": 24907081629600.0, + "grad_norm": 2.3173472916893485, + "language_loss": 0.53721392, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.55873775, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12542725, + "step": 9892, + "time_per_iteration": 2.6787421703338623 + }, + { + "auxiliary_loss_clip": 0.0103862, + "auxiliary_loss_mlp": 0.01002107, + "balance_loss_clip": 1.01473081, + "balance_loss_mlp": 1.00086451, + "epoch": 0.5947993386442206, + "flos": 80068520697120.0, + "grad_norm": 0.6590507013696407, + "language_loss": 0.54554617, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56595349, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.23901367, + "router_z_loss_mlp": 0.01242065, + "step": 9893, + "time_per_iteration": 4.692462205886841 + }, + { + "auxiliary_loss_clip": 0.01117094, + "auxiliary_loss_mlp": 0.01031936, + "balance_loss_clip": 1.04234099, + "balance_loss_mlp": 1.02008665, + "epoch": 0.5948594618968887, + "flos": 28288454808960.0, + "grad_norm": 1.883461798365026, + "language_loss": 0.74800038, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.76949066, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11846924, + "step": 9894, + "time_per_iteration": 2.606410503387451 + }, + { + "auxiliary_loss_clip": 0.01117653, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.04122281, + "balance_loss_mlp": 1.01927686, + "epoch": 0.5949195851495566, + "flos": 16626814392960.0, + "grad_norm": 2.0762991121342065, + "language_loss": 0.77690434, + "learning_rate": 1.487975602873434e-06, + "loss": 0.79839504, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12139893, + "step": 9895, + "time_per_iteration": 2.6303372383117676 + }, + { + "auxiliary_loss_clip": 0.01120589, + "auxiliary_loss_mlp": 0.01029341, + "balance_loss_clip": 1.0431782, + "balance_loss_mlp": 1.01675272, + "epoch": 0.5949797084022246, + "flos": 24100577926560.0, + "grad_norm": 3.9664967540354543, + "language_loss": 0.78961253, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.81111187, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12591553, + "step": 9896, + "time_per_iteration": 2.6282050609588623 + }, + { + "auxiliary_loss_clip": 0.01119027, + "auxiliary_loss_mlp": 0.01033149, + "balance_loss_clip": 1.04159629, + "balance_loss_mlp": 1.0205065, + "epoch": 0.5950398316548925, + "flos": 31449891427200.0, + "grad_norm": 1.6477386934360716, + "language_loss": 0.83450353, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.85602534, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12640381, + "step": 9897, + "time_per_iteration": 2.7059404850006104 + }, + { + "auxiliary_loss_clip": 0.01120697, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.04273677, + "balance_loss_mlp": 1.02215505, + "epoch": 0.5950999549075605, + "flos": 28112473146240.0, + "grad_norm": 1.7305285364221188, + "language_loss": 0.70491409, + "learning_rate": 1.486846243389939e-06, + "loss": 0.72646415, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12139893, + "step": 9898, + "time_per_iteration": 2.6322402954101562 + }, + { + "auxiliary_loss_clip": 0.01124841, + "auxiliary_loss_mlp": 0.01043024, + "balance_loss_clip": 1.04331911, + "balance_loss_mlp": 1.02928567, + "epoch": 0.5951600781602284, + "flos": 39592581032160.0, + "grad_norm": 2.646595331378723, + "language_loss": 0.64020312, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.66188174, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 0.81542969, + "router_z_loss_mlp": 0.13720703, + "step": 9899, + "time_per_iteration": 2.7421276569366455 + }, + { + "auxiliary_loss_clip": 0.01120044, + "auxiliary_loss_mlp": 0.01028471, + "balance_loss_clip": 1.04421997, + "balance_loss_mlp": 1.01765847, + "epoch": 0.5952202014128964, + "flos": 29042535502080.0, + "grad_norm": 2.2401105730253033, + "language_loss": 0.72214937, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.74363452, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.10821533, + "step": 9900, + "time_per_iteration": 2.6781301498413086 + }, + { + "auxiliary_loss_clip": 0.01117658, + "auxiliary_loss_mlp": 0.0103114, + "balance_loss_clip": 1.0429337, + "balance_loss_mlp": 1.01896286, + "epoch": 0.5952803246655644, + "flos": 27445127248800.0, + "grad_norm": 2.2202281736081253, + "language_loss": 0.84937656, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.87086451, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12188721, + "step": 9901, + "time_per_iteration": 2.6313259601593018 + }, + { + "auxiliary_loss_clip": 0.01038891, + "auxiliary_loss_mlp": 0.01001743, + "balance_loss_clip": 1.01503694, + "balance_loss_mlp": 1.00055146, + "epoch": 0.5953404479182324, + "flos": 62516343952800.0, + "grad_norm": 0.7886763255373797, + "language_loss": 0.58140284, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.6018092, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.23864746, + "router_z_loss_mlp": 0.01189423, + "step": 9902, + "time_per_iteration": 3.1167354583740234 + }, + { + "auxiliary_loss_clip": 0.01119751, + "auxiliary_loss_mlp": 0.01030295, + "balance_loss_clip": 1.04186916, + "balance_loss_mlp": 1.01765919, + "epoch": 0.5954005711709004, + "flos": 28202287255200.0, + "grad_norm": 1.780274003083247, + "language_loss": 0.77176851, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.79326892, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12640381, + "step": 9903, + "time_per_iteration": 2.65303897857666 + }, + { + "auxiliary_loss_clip": 0.01119819, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.04300547, + "balance_loss_mlp": 1.02083135, + "epoch": 0.5954606944235683, + "flos": 43873229784960.0, + "grad_norm": 1.7386376918802442, + "language_loss": 0.7804985, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.80202496, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11999512, + "step": 9904, + "time_per_iteration": 2.759734630584717 + }, + { + "auxiliary_loss_clip": 0.01120094, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.04102933, + "balance_loss_mlp": 1.02055001, + "epoch": 0.5955208176762363, + "flos": 37148360732640.0, + "grad_norm": 1.9601673906169244, + "language_loss": 0.72917688, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.75070733, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12402344, + "step": 9905, + "time_per_iteration": 2.685959815979004 + }, + { + "auxiliary_loss_clip": 0.01120182, + "auxiliary_loss_mlp": 0.01027843, + "balance_loss_clip": 1.04254687, + "balance_loss_mlp": 1.01545119, + "epoch": 0.5955809409289042, + "flos": 21524209310880.0, + "grad_norm": 2.444675256572268, + "language_loss": 0.69705266, + "learning_rate": 1.483835475336295e-06, + "loss": 0.71853292, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12402344, + "step": 9906, + "time_per_iteration": 2.6325666904449463 + }, + { + "auxiliary_loss_clip": 0.01118798, + "auxiliary_loss_mlp": 0.0103155, + "balance_loss_clip": 1.04238868, + "balance_loss_mlp": 1.01929545, + "epoch": 0.5956410641815723, + "flos": 29626631089920.0, + "grad_norm": 1.9116481831471799, + "language_loss": 0.75237334, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77387685, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12237549, + "step": 9907, + "time_per_iteration": 2.6697468757629395 + }, + { + "auxiliary_loss_clip": 0.01117272, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.04083228, + "balance_loss_mlp": 1.02043986, + "epoch": 0.5957011874342402, + "flos": 43604354700000.0, + "grad_norm": 1.778824736992261, + "language_loss": 0.67100775, + "learning_rate": 1.483082978767595e-06, + "loss": 0.69249892, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.11407471, + "step": 9908, + "time_per_iteration": 2.7520272731781006 + }, + { + "auxiliary_loss_clip": 0.01119481, + "auxiliary_loss_mlp": 0.01029026, + "balance_loss_clip": 1.04410172, + "balance_loss_mlp": 1.01754606, + "epoch": 0.5957613106869082, + "flos": 25923838263840.0, + "grad_norm": 5.208506603017627, + "language_loss": 0.76545489, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78693998, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11486816, + "step": 9909, + "time_per_iteration": 2.6102492809295654 + }, + { + "auxiliary_loss_clip": 0.01037516, + "auxiliary_loss_mlp": 0.01003489, + "balance_loss_clip": 1.01363373, + "balance_loss_mlp": 1.0021677, + "epoch": 0.5958214339395761, + "flos": 80461845815040.0, + "grad_norm": 1.1179496974783893, + "language_loss": 0.73353779, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75394785, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.2388916, + "router_z_loss_mlp": 0.01321411, + "step": 9910, + "time_per_iteration": 3.3501720428466797 + }, + { + "auxiliary_loss_clip": 0.01119388, + "auxiliary_loss_mlp": 0.01031852, + "balance_loss_clip": 1.0419836, + "balance_loss_mlp": 1.01969314, + "epoch": 0.5958815571922441, + "flos": 28331964017280.0, + "grad_norm": 1.5185045458968138, + "language_loss": 0.69395351, + "learning_rate": 1.481954380961799e-06, + "loss": 0.71546596, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12164307, + "step": 9911, + "time_per_iteration": 2.6989707946777344 + }, + { + "auxiliary_loss_clip": 0.01127069, + "auxiliary_loss_mlp": 0.01036169, + "balance_loss_clip": 1.04596734, + "balance_loss_mlp": 1.02318096, + "epoch": 0.595941680444912, + "flos": 20180725266240.0, + "grad_norm": 1.9379592815900646, + "language_loss": 0.65646517, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.67809761, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.12976074, + "step": 9912, + "time_per_iteration": 2.581585645675659 + }, + { + "auxiliary_loss_clip": 0.01118067, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.04178667, + "balance_loss_mlp": 1.02301502, + "epoch": 0.59600180369758, + "flos": 33945197666400.0, + "grad_norm": 2.1576333128865794, + "language_loss": 0.73400426, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75553972, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12475586, + "step": 9913, + "time_per_iteration": 2.7123377323150635 + }, + { + "auxiliary_loss_clip": 0.0112006, + "auxiliary_loss_mlp": 0.01028006, + "balance_loss_clip": 1.03993714, + "balance_loss_mlp": 1.01561451, + "epoch": 0.596061926950248, + "flos": 35986854908160.0, + "grad_norm": 2.591481489449488, + "language_loss": 0.79968655, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.82116723, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12390137, + "step": 9914, + "time_per_iteration": 2.7099974155426025 + }, + { + "auxiliary_loss_clip": 0.01116836, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.04163003, + "balance_loss_mlp": 1.01821601, + "epoch": 0.596122050202916, + "flos": 20543992912800.0, + "grad_norm": 2.1339494937064325, + "language_loss": 0.67509258, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.69655645, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11322021, + "step": 9915, + "time_per_iteration": 2.650710105895996 + }, + { + "auxiliary_loss_clip": 0.01118309, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.04034185, + "balance_loss_mlp": 1.02149224, + "epoch": 0.596182173455584, + "flos": 25620124668480.0, + "grad_norm": 1.647814748323576, + "language_loss": 0.7891717, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.81068647, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.11676025, + "step": 9916, + "time_per_iteration": 2.6813457012176514 + }, + { + "auxiliary_loss_clip": 0.01118959, + "auxiliary_loss_mlp": 0.01029776, + "balance_loss_clip": 1.04140449, + "balance_loss_mlp": 1.01788461, + "epoch": 0.5962422967082519, + "flos": 19603477098720.0, + "grad_norm": 1.8211203936391875, + "language_loss": 0.82723826, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.84872556, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.11901855, + "step": 9917, + "time_per_iteration": 2.6472055912017822 + }, + { + "auxiliary_loss_clip": 0.01117095, + "auxiliary_loss_mlp": 0.01030887, + "balance_loss_clip": 1.04197598, + "balance_loss_mlp": 1.0193119, + "epoch": 0.5963024199609199, + "flos": 14845483090080.0, + "grad_norm": 2.236340809396415, + "language_loss": 0.77916974, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.80064952, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11572266, + "step": 9918, + "time_per_iteration": 2.5988049507141113 + }, + { + "auxiliary_loss_clip": 0.01119886, + "auxiliary_loss_mlp": 0.01037669, + "balance_loss_clip": 1.04370773, + "balance_loss_mlp": 1.02516353, + "epoch": 0.5963625432135878, + "flos": 34256852648640.0, + "grad_norm": 1.519681391254201, + "language_loss": 0.78751338, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.80908895, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12512207, + "step": 9919, + "time_per_iteration": 2.711610794067383 + }, + { + "auxiliary_loss_clip": 0.01118505, + "auxiliary_loss_mlp": 0.01032747, + "balance_loss_clip": 1.04248047, + "balance_loss_mlp": 1.01983631, + "epoch": 0.5964226664662559, + "flos": 24233982278400.0, + "grad_norm": 2.3640152146797253, + "language_loss": 0.77591884, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.79743141, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12902832, + "step": 9920, + "time_per_iteration": 4.100628137588501 + }, + { + "auxiliary_loss_clip": 0.01122502, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.04503345, + "balance_loss_mlp": 1.01732755, + "epoch": 0.5964827897189238, + "flos": 15779840277600.0, + "grad_norm": 2.563872360976051, + "language_loss": 0.8299861, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.85150045, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.11608887, + "step": 9921, + "time_per_iteration": 4.029565811157227 + }, + { + "auxiliary_loss_clip": 0.01117454, + "auxiliary_loss_mlp": 0.01027238, + "balance_loss_clip": 1.04075718, + "balance_loss_mlp": 1.01519227, + "epoch": 0.5965429129715918, + "flos": 22147397723520.0, + "grad_norm": 2.782661044833965, + "language_loss": 0.80710196, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.82854891, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12054443, + "step": 9922, + "time_per_iteration": 2.6012210845947266 + }, + { + "auxiliary_loss_clip": 0.01116533, + "auxiliary_loss_mlp": 0.01027313, + "balance_loss_clip": 1.04053855, + "balance_loss_mlp": 1.01530218, + "epoch": 0.5966030362242597, + "flos": 26555535305280.0, + "grad_norm": 2.5247256776894704, + "language_loss": 0.7702722, + "learning_rate": 1.477441761580111e-06, + "loss": 0.79171062, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12023926, + "step": 9923, + "time_per_iteration": 2.665738821029663 + }, + { + "auxiliary_loss_clip": 0.0112352, + "auxiliary_loss_mlp": 0.01036114, + "balance_loss_clip": 1.04333162, + "balance_loss_mlp": 1.02240431, + "epoch": 0.5966631594769277, + "flos": 22948431593760.0, + "grad_norm": 1.938299116686148, + "language_loss": 0.75616717, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.77776349, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13702393, + "step": 9924, + "time_per_iteration": 2.605046510696411 + }, + { + "auxiliary_loss_clip": 0.0111529, + "auxiliary_loss_mlp": 0.01031563, + "balance_loss_clip": 1.04218519, + "balance_loss_mlp": 1.01927245, + "epoch": 0.5967232827295956, + "flos": 17160593869440.0, + "grad_norm": 4.091355106143247, + "language_loss": 0.66324258, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.6847111, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.1229248, + "step": 9925, + "time_per_iteration": 2.6087098121643066 + }, + { + "auxiliary_loss_clip": 0.01118603, + "auxiliary_loss_mlp": 0.01031884, + "balance_loss_clip": 1.0450356, + "balance_loss_mlp": 1.02010584, + "epoch": 0.5967834059822636, + "flos": 21033858008160.0, + "grad_norm": 2.623249635171266, + "language_loss": 0.71409881, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.73560369, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11791992, + "step": 9926, + "time_per_iteration": 2.6381607055664062 + }, + { + "auxiliary_loss_clip": 0.01120677, + "auxiliary_loss_mlp": 0.01026799, + "balance_loss_clip": 1.04255533, + "balance_loss_mlp": 1.01420426, + "epoch": 0.5968435292349316, + "flos": 51885148661280.0, + "grad_norm": 4.6933954818141075, + "language_loss": 0.70323068, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.72470546, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.1260376, + "step": 9927, + "time_per_iteration": 2.963632822036743 + }, + { + "auxiliary_loss_clip": 0.01120732, + "auxiliary_loss_mlp": 0.01027916, + "balance_loss_clip": 1.04061413, + "balance_loss_mlp": 1.01504695, + "epoch": 0.5969036524875996, + "flos": 45918817202880.0, + "grad_norm": 1.9554087518808712, + "language_loss": 0.6357615, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.65724802, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12872314, + "step": 9928, + "time_per_iteration": 4.2891809940338135 + }, + { + "auxiliary_loss_clip": 0.01114338, + "auxiliary_loss_mlp": 0.01029003, + "balance_loss_clip": 1.03955889, + "balance_loss_mlp": 1.01738, + "epoch": 0.5969637757402676, + "flos": 28245918015360.0, + "grad_norm": 1.674248347403764, + "language_loss": 0.69775116, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.71918464, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11627197, + "step": 9929, + "time_per_iteration": 2.661179304122925 + }, + { + "auxiliary_loss_clip": 0.01114448, + "auxiliary_loss_mlp": 0.0103183, + "balance_loss_clip": 1.04212928, + "balance_loss_mlp": 1.02072597, + "epoch": 0.5970238989929355, + "flos": 29310964896960.0, + "grad_norm": 3.279404828573554, + "language_loss": 0.76270521, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.784168, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11102295, + "step": 9930, + "time_per_iteration": 2.6993916034698486 + }, + { + "auxiliary_loss_clip": 0.01121249, + "auxiliary_loss_mlp": 0.01029704, + "balance_loss_clip": 1.04295015, + "balance_loss_mlp": 1.01696646, + "epoch": 0.5970840222456035, + "flos": 23704659702720.0, + "grad_norm": 1.9094257067870395, + "language_loss": 0.68545115, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.70696068, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12731934, + "step": 9931, + "time_per_iteration": 2.618650197982788 + }, + { + "auxiliary_loss_clip": 0.0103937, + "auxiliary_loss_mlp": 0.01003127, + "balance_loss_clip": 1.01520061, + "balance_loss_mlp": 1.00177062, + "epoch": 0.5971441454982714, + "flos": 76845342094560.0, + "grad_norm": 0.8568547755930622, + "language_loss": 0.64204288, + "learning_rate": 1.474059168257065e-06, + "loss": 0.66246784, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.24169922, + "router_z_loss_mlp": 0.01358032, + "step": 9932, + "time_per_iteration": 4.4895899295806885 + }, + { + "auxiliary_loss_clip": 0.01117774, + "auxiliary_loss_mlp": 0.01029645, + "balance_loss_clip": 1.04195225, + "balance_loss_mlp": 1.01735401, + "epoch": 0.5972042687509395, + "flos": 24728871516480.0, + "grad_norm": 1.8739587079789537, + "language_loss": 0.73757058, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.75904477, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12298584, + "step": 9933, + "time_per_iteration": 2.7973086833953857 + }, + { + "auxiliary_loss_clip": 0.01038826, + "auxiliary_loss_mlp": 0.01001999, + "balance_loss_clip": 1.01471257, + "balance_loss_mlp": 1.00065899, + "epoch": 0.5972643920036074, + "flos": 87429543691680.0, + "grad_norm": 1.0130608712172513, + "language_loss": 0.52025628, + "learning_rate": 1.473307699867203e-06, + "loss": 0.54066449, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.24133301, + "router_z_loss_mlp": 0.01341248, + "step": 9934, + "time_per_iteration": 3.331979513168335 + }, + { + "auxiliary_loss_clip": 0.01038714, + "auxiliary_loss_mlp": 0.01001076, + "balance_loss_clip": 1.01478446, + "balance_loss_mlp": 0.99972761, + "epoch": 0.5973245152562754, + "flos": 69420436050240.0, + "grad_norm": 0.8286015389335014, + "language_loss": 0.54204994, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56244785, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.23937988, + "router_z_loss_mlp": 0.01348877, + "step": 9935, + "time_per_iteration": 3.189910411834717 + }, + { + "auxiliary_loss_clip": 0.01117123, + "auxiliary_loss_mlp": 0.01032353, + "balance_loss_clip": 1.04133284, + "balance_loss_mlp": 1.01983023, + "epoch": 0.5973846385089433, + "flos": 29486217248640.0, + "grad_norm": 2.681228127961657, + "language_loss": 0.65947372, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.68096852, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12524414, + "step": 9936, + "time_per_iteration": 2.692321300506592 + }, + { + "auxiliary_loss_clip": 0.01120887, + "auxiliary_loss_mlp": 0.01033437, + "balance_loss_clip": 1.04300785, + "balance_loss_mlp": 1.02189159, + "epoch": 0.5974447617616113, + "flos": 21561965065440.0, + "grad_norm": 2.8112688127450305, + "language_loss": 0.67353529, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.69507849, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.11541748, + "step": 9937, + "time_per_iteration": 2.5986199378967285 + }, + { + "auxiliary_loss_clip": 0.01120127, + "auxiliary_loss_mlp": 0.01028246, + "balance_loss_clip": 1.0413928, + "balance_loss_mlp": 1.01615214, + "epoch": 0.5975048850142792, + "flos": 27933736308480.0, + "grad_norm": 2.4274531988800496, + "language_loss": 0.77199972, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.79348338, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12103271, + "step": 9938, + "time_per_iteration": 2.694666862487793 + }, + { + "auxiliary_loss_clip": 0.01118723, + "auxiliary_loss_mlp": 0.01030087, + "balance_loss_clip": 1.04101765, + "balance_loss_mlp": 1.01791549, + "epoch": 0.5975650082669473, + "flos": 29715958991520.0, + "grad_norm": 1.7684510271945109, + "language_loss": 0.7573216, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.77880973, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12176514, + "step": 9939, + "time_per_iteration": 2.6846418380737305 + }, + { + "auxiliary_loss_clip": 0.01121515, + "auxiliary_loss_mlp": 0.01027194, + "balance_loss_clip": 1.04106724, + "balance_loss_mlp": 1.01378882, + "epoch": 0.5976251315196152, + "flos": 25530715732320.0, + "grad_norm": 2.985720659314661, + "language_loss": 0.68745482, + "learning_rate": 1.471053774486878e-06, + "loss": 0.70894182, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 0.13415527, + "step": 9940, + "time_per_iteration": 2.6594934463500977 + }, + { + "auxiliary_loss_clip": 0.01113968, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.0406394, + "balance_loss_mlp": 1.02121162, + "epoch": 0.5976852547722832, + "flos": 43738893535680.0, + "grad_norm": 1.5589807012112968, + "language_loss": 0.70233452, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72379696, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11065674, + "step": 9941, + "time_per_iteration": 2.7491023540496826 + }, + { + "auxiliary_loss_clip": 0.01113319, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.03821743, + "balance_loss_mlp": 1.01944983, + "epoch": 0.5977453780249512, + "flos": 15686622717120.0, + "grad_norm": 2.49607590517325, + "language_loss": 0.77517247, + "learning_rate": 1.470302626336386e-06, + "loss": 0.79662979, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12976074, + "step": 9942, + "time_per_iteration": 2.619896411895752 + }, + { + "auxiliary_loss_clip": 0.01116777, + "auxiliary_loss_mlp": 0.01038512, + "balance_loss_clip": 1.03945982, + "balance_loss_mlp": 1.02680576, + "epoch": 0.5978055012776191, + "flos": 25574468044320.0, + "grad_norm": 3.7963234647347863, + "language_loss": 0.75619483, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.77774775, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.1171875, + "step": 9943, + "time_per_iteration": 2.6023800373077393 + }, + { + "auxiliary_loss_clip": 0.011158, + "auxiliary_loss_mlp": 0.01027132, + "balance_loss_clip": 1.04112446, + "balance_loss_mlp": 1.01596177, + "epoch": 0.5978656245302871, + "flos": 41558645730240.0, + "grad_norm": 2.0397668146396453, + "language_loss": 0.62428993, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.64571923, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11175537, + "step": 9944, + "time_per_iteration": 2.8067164421081543 + }, + { + "auxiliary_loss_clip": 0.01117889, + "auxiliary_loss_mlp": 0.01031694, + "balance_loss_clip": 1.04082155, + "balance_loss_mlp": 1.01922488, + "epoch": 0.597925747782955, + "flos": 45604650149280.0, + "grad_norm": 1.8632608219817466, + "language_loss": 0.72434127, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.74583709, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12475586, + "step": 9945, + "time_per_iteration": 2.743347406387329 + }, + { + "auxiliary_loss_clip": 0.01115216, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.03977442, + "balance_loss_mlp": 1.02157617, + "epoch": 0.5979858710356231, + "flos": 30962700472320.0, + "grad_norm": 2.3880063386382275, + "language_loss": 0.66930217, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.6907934, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12329102, + "step": 9946, + "time_per_iteration": 2.7368173599243164 + }, + { + "auxiliary_loss_clip": 0.01121333, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.04259324, + "balance_loss_mlp": 1.02292573, + "epoch": 0.598045994288291, + "flos": 16714359534240.0, + "grad_norm": 2.000804184378929, + "language_loss": 0.8894152, + "learning_rate": 1.468425107717461e-06, + "loss": 0.91098297, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12518311, + "step": 9947, + "time_per_iteration": 2.5606913566589355 + }, + { + "auxiliary_loss_clip": 0.01111084, + "auxiliary_loss_mlp": 0.01034301, + "balance_loss_clip": 1.03867304, + "balance_loss_mlp": 1.0237931, + "epoch": 0.598106117540959, + "flos": 26821776767040.0, + "grad_norm": 6.547139129429903, + "language_loss": 0.72335446, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74480832, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10498047, + "step": 9948, + "time_per_iteration": 2.644360065460205 + }, + { + "auxiliary_loss_clip": 0.01117921, + "auxiliary_loss_mlp": 0.01026593, + "balance_loss_clip": 1.04136419, + "balance_loss_mlp": 1.0138967, + "epoch": 0.5981662407936269, + "flos": 25086466743840.0, + "grad_norm": 2.0119867546410712, + "language_loss": 0.89585733, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91730249, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12695312, + "step": 9949, + "time_per_iteration": 2.6226866245269775 + }, + { + "auxiliary_loss_clip": 0.01114844, + "auxiliary_loss_mlp": 0.01026123, + "balance_loss_clip": 1.04112434, + "balance_loss_mlp": 1.01512003, + "epoch": 0.5982263640462949, + "flos": 17160229213920.0, + "grad_norm": 2.840749853869849, + "language_loss": 0.70489645, + "learning_rate": 1.467298838320673e-06, + "loss": 0.72630608, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11004639, + "step": 9950, + "time_per_iteration": 2.6683061122894287 + }, + { + "auxiliary_loss_clip": 0.01115917, + "auxiliary_loss_mlp": 0.01029587, + "balance_loss_clip": 1.03969646, + "balance_loss_mlp": 1.01710558, + "epoch": 0.5982864872989628, + "flos": 21606649274880.0, + "grad_norm": 1.8155153834481819, + "language_loss": 0.78162557, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.80308068, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12481689, + "step": 9951, + "time_per_iteration": 2.6493332386016846 + }, + { + "auxiliary_loss_clip": 0.01116284, + "auxiliary_loss_mlp": 0.01039201, + "balance_loss_clip": 1.03928375, + "balance_loss_mlp": 1.02633214, + "epoch": 0.5983466105516309, + "flos": 20456042598720.0, + "grad_norm": 1.5639210903237264, + "language_loss": 0.73943728, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.76099211, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12866211, + "step": 9952, + "time_per_iteration": 2.6220057010650635 + }, + { + "auxiliary_loss_clip": 0.01119465, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.04152369, + "balance_loss_mlp": 1.01594257, + "epoch": 0.5984067338042988, + "flos": 24456552462720.0, + "grad_norm": 2.3678626250819423, + "language_loss": 0.78852355, + "learning_rate": 1.466172750724613e-06, + "loss": 0.8100099, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.13244629, + "step": 9953, + "time_per_iteration": 2.662302017211914 + }, + { + "auxiliary_loss_clip": 0.01115495, + "auxiliary_loss_mlp": 0.01033757, + "balance_loss_clip": 1.04012787, + "balance_loss_mlp": 1.02197313, + "epoch": 0.5984668570569668, + "flos": 32116305427200.0, + "grad_norm": 1.5010757569475457, + "language_loss": 0.69752169, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.71901423, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11798096, + "step": 9954, + "time_per_iteration": 2.6592977046966553 + }, + { + "auxiliary_loss_clip": 0.01115534, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.03900301, + "balance_loss_mlp": 1.02119601, + "epoch": 0.5985269803096348, + "flos": 25128598364640.0, + "grad_norm": 2.0637029387635413, + "language_loss": 0.73202419, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.75350684, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11529541, + "step": 9955, + "time_per_iteration": 2.635538101196289 + }, + { + "auxiliary_loss_clip": 0.01116992, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.03998065, + "balance_loss_mlp": 1.01840091, + "epoch": 0.5985871035623027, + "flos": 32782111668000.0, + "grad_norm": 2.183315813776068, + "language_loss": 0.68636906, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.70784485, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12194824, + "step": 9956, + "time_per_iteration": 2.6423685550689697 + }, + { + "auxiliary_loss_clip": 0.01118477, + "auxiliary_loss_mlp": 0.01030811, + "balance_loss_clip": 1.04135561, + "balance_loss_mlp": 1.01895571, + "epoch": 0.5986472268149707, + "flos": 23927716094400.0, + "grad_norm": 2.118329497143777, + "language_loss": 0.73793912, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.75943196, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.11877441, + "step": 9957, + "time_per_iteration": 2.6320090293884277 + }, + { + "auxiliary_loss_clip": 0.0111426, + "auxiliary_loss_mlp": 0.01028527, + "balance_loss_clip": 1.04114819, + "balance_loss_mlp": 1.01673126, + "epoch": 0.5987073500676386, + "flos": 26594303991840.0, + "grad_norm": 3.8813664510078105, + "language_loss": 0.8477121, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.86913997, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11791992, + "step": 9958, + "time_per_iteration": 2.6414194107055664 + }, + { + "auxiliary_loss_clip": 0.01117886, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.04042482, + "balance_loss_mlp": 1.02335012, + "epoch": 0.5987674733203067, + "flos": 29668884262560.0, + "grad_norm": 2.1237618067419346, + "language_loss": 0.66574174, + "learning_rate": 1.463921122471864e-06, + "loss": 0.68727815, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12414551, + "step": 9959, + "time_per_iteration": 2.7019991874694824 + }, + { + "auxiliary_loss_clip": 0.01117683, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.04147816, + "balance_loss_mlp": 1.018821, + "epoch": 0.5988275965729746, + "flos": 26015394615840.0, + "grad_norm": 1.8947744670002382, + "language_loss": 0.83770788, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.85919356, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12060547, + "step": 9960, + "time_per_iteration": 5.513265609741211 + }, + { + "auxiliary_loss_clip": 0.01113896, + "auxiliary_loss_mlp": 0.01028116, + "balance_loss_clip": 1.03851116, + "balance_loss_mlp": 1.01594472, + "epoch": 0.5988877198256426, + "flos": 30649505833440.0, + "grad_norm": 1.5636135041929937, + "language_loss": 0.79195607, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.81337619, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12164307, + "step": 9961, + "time_per_iteration": 2.6646857261657715 + }, + { + "auxiliary_loss_clip": 0.011159, + "auxiliary_loss_mlp": 0.01029209, + "balance_loss_clip": 1.04033232, + "balance_loss_mlp": 1.01722217, + "epoch": 0.5989478430783105, + "flos": 32249588227200.0, + "grad_norm": 6.886639372690757, + "language_loss": 0.67601091, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69746202, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11993408, + "step": 9962, + "time_per_iteration": 2.6952648162841797 + }, + { + "auxiliary_loss_clip": 0.01117398, + "auxiliary_loss_mlp": 0.01034418, + "balance_loss_clip": 1.04046941, + "balance_loss_mlp": 1.02255094, + "epoch": 0.5990079663309785, + "flos": 31455037121760.0, + "grad_norm": 1.50252247611074, + "language_loss": 0.74388134, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76539946, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11865234, + "step": 9963, + "time_per_iteration": 2.6771087646484375 + }, + { + "auxiliary_loss_clip": 0.01114232, + "auxiliary_loss_mlp": 0.01027259, + "balance_loss_clip": 1.03927124, + "balance_loss_mlp": 1.01527286, + "epoch": 0.5990680895836464, + "flos": 44943341326560.0, + "grad_norm": 2.8742765729379354, + "language_loss": 0.67878437, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.70019925, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11987305, + "step": 9964, + "time_per_iteration": 2.757293939590454 + }, + { + "auxiliary_loss_clip": 0.01113168, + "auxiliary_loss_mlp": 0.01027855, + "balance_loss_clip": 1.04019642, + "balance_loss_mlp": 1.01559997, + "epoch": 0.5991282128363145, + "flos": 29448096838560.0, + "grad_norm": 2.3400076907367633, + "language_loss": 0.76872772, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.79013795, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12268066, + "step": 9965, + "time_per_iteration": 2.7009871006011963 + }, + { + "auxiliary_loss_clip": 0.01117219, + "auxiliary_loss_mlp": 0.01027651, + "balance_loss_clip": 1.04000974, + "balance_loss_mlp": 1.01584291, + "epoch": 0.5991883360889824, + "flos": 12572422896960.0, + "grad_norm": 2.048957646160788, + "language_loss": 0.77405679, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79550552, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.11810303, + "step": 9966, + "time_per_iteration": 2.580808162689209 + }, + { + "auxiliary_loss_clip": 0.01115778, + "auxiliary_loss_mlp": 0.01025266, + "balance_loss_clip": 1.04076433, + "balance_loss_mlp": 1.01379824, + "epoch": 0.5992484593416504, + "flos": 29225567171520.0, + "grad_norm": 1.5620838792231804, + "language_loss": 0.7346555, + "learning_rate": 1.460920090376422e-06, + "loss": 0.75606602, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11468506, + "step": 9967, + "time_per_iteration": 2.6649765968322754 + }, + { + "auxiliary_loss_clip": 0.0112339, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.04254937, + "balance_loss_mlp": 1.02165771, + "epoch": 0.5993085825943184, + "flos": 14573893347360.0, + "grad_norm": 2.1273525667457167, + "language_loss": 0.68564785, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.70722759, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 0.80810547, + "router_z_loss_mlp": 0.1293335, + "step": 9968, + "time_per_iteration": 4.023433685302734 + }, + { + "auxiliary_loss_clip": 0.01117459, + "auxiliary_loss_mlp": 0.01034803, + "balance_loss_clip": 1.03968239, + "balance_loss_mlp": 1.02170777, + "epoch": 0.5993687058469863, + "flos": 23216374781280.0, + "grad_norm": 3.1817897656076637, + "language_loss": 0.79543263, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.81695533, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13092041, + "step": 9969, + "time_per_iteration": 2.656834363937378 + }, + { + "auxiliary_loss_clip": 0.01116582, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.03953481, + "balance_loss_mlp": 1.01701784, + "epoch": 0.5994288290996543, + "flos": 17427605159520.0, + "grad_norm": 2.299739266148388, + "language_loss": 0.81375468, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83521545, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12463379, + "step": 9970, + "time_per_iteration": 2.6166861057281494 + }, + { + "auxiliary_loss_clip": 0.01118218, + "auxiliary_loss_mlp": 0.01041067, + "balance_loss_clip": 1.03946912, + "balance_loss_mlp": 1.02667892, + "epoch": 0.5994889523523222, + "flos": 23436027721440.0, + "grad_norm": 2.16753276993161, + "language_loss": 0.62111962, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64271247, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.14385986, + "step": 9971, + "time_per_iteration": 2.619271993637085 + }, + { + "auxiliary_loss_clip": 0.01113037, + "auxiliary_loss_mlp": 0.01025767, + "balance_loss_clip": 1.03976941, + "balance_loss_mlp": 1.01454389, + "epoch": 0.5995490756049903, + "flos": 34211479645440.0, + "grad_norm": 1.817052039548499, + "language_loss": 0.7875632, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.8089512, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11218262, + "step": 9972, + "time_per_iteration": 4.17358922958374 + }, + { + "auxiliary_loss_clip": 0.01123314, + "auxiliary_loss_mlp": 0.01035849, + "balance_loss_clip": 1.04133582, + "balance_loss_mlp": 1.02292061, + "epoch": 0.5996091988576582, + "flos": 35451697844160.0, + "grad_norm": 2.5197421367189277, + "language_loss": 0.76058376, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78217542, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 0.81982422, + "router_z_loss_mlp": 0.12927246, + "step": 9973, + "time_per_iteration": 2.6694507598876953 + }, + { + "auxiliary_loss_clip": 0.01117129, + "auxiliary_loss_mlp": 0.01028301, + "balance_loss_clip": 1.03935862, + "balance_loss_mlp": 1.01517618, + "epoch": 0.5996693221103262, + "flos": 25397230345920.0, + "grad_norm": 4.355229381396657, + "language_loss": 0.6516403, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.67309451, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.13128662, + "step": 9974, + "time_per_iteration": 2.7140140533447266 + }, + { + "auxiliary_loss_clip": 0.01115978, + "auxiliary_loss_mlp": 0.01028412, + "balance_loss_clip": 1.03892183, + "balance_loss_mlp": 1.01628208, + "epoch": 0.5997294453629941, + "flos": 29003118539040.0, + "grad_norm": 1.5614161940187528, + "language_loss": 0.74255055, + "learning_rate": 1.457920366566428e-06, + "loss": 0.76399446, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12121582, + "step": 9975, + "time_per_iteration": 2.6401960849761963 + }, + { + "auxiliary_loss_clip": 0.01118555, + "auxiliary_loss_mlp": 0.01029613, + "balance_loss_clip": 1.04157043, + "balance_loss_mlp": 1.01713181, + "epoch": 0.5997895686156621, + "flos": 25575440459040.0, + "grad_norm": 2.1971308599002515, + "language_loss": 0.77119756, + "learning_rate": 1.457545493441611e-06, + "loss": 0.79267925, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12493896, + "step": 9976, + "time_per_iteration": 2.7077219486236572 + }, + { + "auxiliary_loss_clip": 0.01118714, + "auxiliary_loss_mlp": 0.01037722, + "balance_loss_clip": 1.04159069, + "balance_loss_mlp": 1.02506804, + "epoch": 0.59984969186833, + "flos": 34612219425600.0, + "grad_norm": 2.653913663846696, + "language_loss": 0.74983978, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.77140415, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12664795, + "step": 9977, + "time_per_iteration": 2.710369110107422 + }, + { + "auxiliary_loss_clip": 0.01116547, + "auxiliary_loss_mlp": 0.01030048, + "balance_loss_clip": 1.03990149, + "balance_loss_mlp": 1.0180074, + "epoch": 0.5999098151209981, + "flos": 27535022392320.0, + "grad_norm": 1.7226038922413232, + "language_loss": 0.68803418, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.70950019, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.1204834, + "step": 9978, + "time_per_iteration": 2.6461093425750732 + }, + { + "auxiliary_loss_clip": 0.01124592, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.04447556, + "balance_loss_mlp": 1.01940477, + "epoch": 0.599969938373666, + "flos": 22901640485760.0, + "grad_norm": 2.386701700795437, + "language_loss": 0.81871307, + "learning_rate": 1.456420997543594e-06, + "loss": 0.84027803, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12530518, + "step": 9979, + "time_per_iteration": 2.6849188804626465 + }, + { + "auxiliary_loss_clip": 0.0111216, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.03910649, + "balance_loss_mlp": 1.02034593, + "epoch": 0.600030061626334, + "flos": 13820825586240.0, + "grad_norm": 2.0334579208454806, + "language_loss": 0.70246363, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.72391075, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12213135, + "step": 9980, + "time_per_iteration": 2.7571194171905518 + }, + { + "auxiliary_loss_clip": 0.01121439, + "auxiliary_loss_mlp": 0.01027117, + "balance_loss_clip": 1.04197741, + "balance_loss_mlp": 1.0141468, + "epoch": 0.600090184879002, + "flos": 20229258617280.0, + "grad_norm": 2.834152788961238, + "language_loss": 0.68751657, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70900214, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12976074, + "step": 9981, + "time_per_iteration": 2.6920979022979736 + }, + { + "auxiliary_loss_clip": 0.01117633, + "auxiliary_loss_mlp": 0.01033003, + "balance_loss_clip": 1.04161704, + "balance_loss_mlp": 1.02173138, + "epoch": 0.6001503081316699, + "flos": 28820127386880.0, + "grad_norm": 2.2290598005904005, + "language_loss": 0.78481579, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.80632216, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11273193, + "step": 9982, + "time_per_iteration": 2.6760599613189697 + }, + { + "auxiliary_loss_clip": 0.01116956, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.04193544, + "balance_loss_mlp": 1.02084136, + "epoch": 0.6002104313843379, + "flos": 25218736611840.0, + "grad_norm": 1.8990850053720854, + "language_loss": 0.73066467, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.75216687, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12414551, + "step": 9983, + "time_per_iteration": 2.695028781890869 + }, + { + "auxiliary_loss_clip": 0.01116436, + "auxiliary_loss_mlp": 0.01033208, + "balance_loss_clip": 1.03932095, + "balance_loss_mlp": 1.02105463, + "epoch": 0.6002705546370058, + "flos": 27400078383840.0, + "grad_norm": 6.1240512195456835, + "language_loss": 0.78721732, + "learning_rate": 1.454547250154447e-06, + "loss": 0.80871379, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12145996, + "step": 9984, + "time_per_iteration": 2.6321372985839844 + }, + { + "auxiliary_loss_clip": 0.01118835, + "auxiliary_loss_mlp": 0.0103383, + "balance_loss_clip": 1.04198301, + "balance_loss_mlp": 1.02183735, + "epoch": 0.6003306778896739, + "flos": 30739644080640.0, + "grad_norm": 2.0680734311406206, + "language_loss": 0.82925177, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.85077846, + "num_input_tokens_seen": 215077790, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11987305, + "step": 9985, + "time_per_iteration": 2.7678141593933105 + }, + { + "auxiliary_loss_clip": 0.01118492, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.04227304, + "balance_loss_mlp": 1.02268219, + "epoch": 0.6003908011423418, + "flos": 32564889764640.0, + "grad_norm": 1.798782701249638, + "language_loss": 0.71052659, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.73205709, + "num_input_tokens_seen": 215097650, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11883545, + "step": 9986, + "time_per_iteration": 2.71103572845459 + }, + { + "auxiliary_loss_clip": 0.0112067, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.04348624, + "balance_loss_mlp": 1.0211525, + "epoch": 0.6004509243950098, + "flos": 27534090494880.0, + "grad_norm": 1.7607137469723364, + "language_loss": 0.71586168, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.73740935, + "num_input_tokens_seen": 215118235, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.1295166, + "step": 9987, + "time_per_iteration": 2.8453245162963867 + }, + { + "auxiliary_loss_clip": 0.01117985, + "auxiliary_loss_mlp": 0.01034595, + "balance_loss_clip": 1.0416162, + "balance_loss_mlp": 1.02275181, + "epoch": 0.6005110476476777, + "flos": 24061444584480.0, + "grad_norm": 1.7503548522082086, + "language_loss": 0.84939462, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.87092042, + "num_input_tokens_seen": 215136755, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.1182251, + "step": 9988, + "time_per_iteration": 2.843935012817383 + }, + { + "auxiliary_loss_clip": 0.01116894, + "auxiliary_loss_mlp": 0.01034307, + "balance_loss_clip": 1.04052854, + "balance_loss_mlp": 1.02170086, + "epoch": 0.6005711709003457, + "flos": 21612645832320.0, + "grad_norm": 1.6002745889899979, + "language_loss": 0.65501785, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.67652982, + "num_input_tokens_seen": 215155225, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12597656, + "step": 9989, + "time_per_iteration": 2.6840851306915283 + }, + { + "auxiliary_loss_clip": 0.01115636, + "auxiliary_loss_mlp": 0.01029059, + "balance_loss_clip": 1.03945351, + "balance_loss_mlp": 1.01768601, + "epoch": 0.6006312941530136, + "flos": 22591282056480.0, + "grad_norm": 7.610820639898432, + "language_loss": 0.80275112, + "learning_rate": 1.452299436003257e-06, + "loss": 0.82419807, + "num_input_tokens_seen": 215174815, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.1137085, + "step": 9990, + "time_per_iteration": 2.775583267211914 + }, + { + "auxiliary_loss_clip": 0.01120838, + "auxiliary_loss_mlp": 0.01031632, + "balance_loss_clip": 1.04182291, + "balance_loss_mlp": 1.01934767, + "epoch": 0.6006914174056817, + "flos": 26101805273280.0, + "grad_norm": 2.8386377715362725, + "language_loss": 0.82962644, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.85115111, + "num_input_tokens_seen": 215192045, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12286377, + "step": 9991, + "time_per_iteration": 2.715942621231079 + }, + { + "auxiliary_loss_clip": 0.01114822, + "auxiliary_loss_mlp": 0.01031356, + "balance_loss_clip": 1.04004598, + "balance_loss_mlp": 1.01933384, + "epoch": 0.6007515406583496, + "flos": 15557594231520.0, + "grad_norm": 2.38164846303747, + "language_loss": 0.83429384, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.85575563, + "num_input_tokens_seen": 215209885, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12017822, + "step": 9992, + "time_per_iteration": 2.68017840385437 + }, + { + "auxiliary_loss_clip": 0.01115737, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.0399642, + "balance_loss_mlp": 1.01968074, + "epoch": 0.6008116639110176, + "flos": 23436311342400.0, + "grad_norm": 2.314259767671877, + "language_loss": 0.66673309, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.6882205, + "num_input_tokens_seen": 215228150, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.13317871, + "step": 9993, + "time_per_iteration": 2.612816333770752 + }, + { + "auxiliary_loss_clip": 0.01116691, + "auxiliary_loss_mlp": 0.01035838, + "balance_loss_clip": 1.03988433, + "balance_loss_mlp": 1.02355897, + "epoch": 0.6008717871636855, + "flos": 20810720581920.0, + "grad_norm": 4.557127255431152, + "language_loss": 0.80994737, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.83147264, + "num_input_tokens_seen": 215243755, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.1227417, + "step": 9994, + "time_per_iteration": 2.650660514831543 + }, + { + "auxiliary_loss_clip": 0.01113462, + "auxiliary_loss_mlp": 0.01030264, + "balance_loss_clip": 1.04139495, + "balance_loss_mlp": 1.01936805, + "epoch": 0.6009319104163535, + "flos": 24768855721440.0, + "grad_norm": 1.8355450742987942, + "language_loss": 0.72582924, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.74726641, + "num_input_tokens_seen": 215262130, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10900879, + "step": 9995, + "time_per_iteration": 2.7901790142059326 + }, + { + "auxiliary_loss_clip": 0.01117263, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.03917146, + "balance_loss_mlp": 1.01797986, + "epoch": 0.6009920336690215, + "flos": 26645227862400.0, + "grad_norm": 4.477743641921564, + "language_loss": 0.80893248, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.83040833, + "num_input_tokens_seen": 215281785, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12341309, + "step": 9996, + "time_per_iteration": 2.72623610496521 + }, + { + "auxiliary_loss_clip": 0.01118796, + "auxiliary_loss_mlp": 0.01039538, + "balance_loss_clip": 1.04248142, + "balance_loss_mlp": 1.02673483, + "epoch": 0.6010521569216895, + "flos": 27572454008640.0, + "grad_norm": 2.0030035783982902, + "language_loss": 0.78707838, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.8086617, + "num_input_tokens_seen": 215297550, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12799072, + "step": 9997, + "time_per_iteration": 2.651759624481201 + }, + { + "auxiliary_loss_clip": 0.01119858, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.04134774, + "balance_loss_mlp": 1.01988626, + "epoch": 0.6011122801743575, + "flos": 23393247824160.0, + "grad_norm": 2.3959029370722726, + "language_loss": 0.7319268, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.7534517, + "num_input_tokens_seen": 215316360, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12738037, + "step": 9998, + "time_per_iteration": 2.7497358322143555 + }, + { + "auxiliary_loss_clip": 0.01115008, + "auxiliary_loss_mlp": 0.01030076, + "balance_loss_clip": 1.04014909, + "balance_loss_mlp": 1.01796985, + "epoch": 0.6011724034270254, + "flos": 30517560103680.0, + "grad_norm": 1.6890221833369283, + "language_loss": 0.72610009, + "learning_rate": 1.448929117633027e-06, + "loss": 0.74755096, + "num_input_tokens_seen": 215336405, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12109375, + "step": 9999, + "time_per_iteration": 5.6341633796691895 + }, + { + "auxiliary_loss_clip": 0.0112044, + "auxiliary_loss_mlp": 0.01038572, + "balance_loss_clip": 1.04057193, + "balance_loss_mlp": 1.02586472, + "epoch": 0.6012325266796934, + "flos": 26597747960640.0, + "grad_norm": 1.8747572587485093, + "language_loss": 0.78326404, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.80485415, + "num_input_tokens_seen": 215356590, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12713623, + "step": 10000, + "time_per_iteration": 2.653299570083618 + }, + { + "auxiliary_loss_clip": 0.01123633, + "auxiliary_loss_mlp": 0.01037043, + "balance_loss_clip": 1.04343903, + "balance_loss_mlp": 1.02373338, + "epoch": 0.6012926499323613, + "flos": 23882950850400.0, + "grad_norm": 1.9302143092569273, + "language_loss": 0.77328098, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.79488772, + "num_input_tokens_seen": 215374295, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13317871, + "step": 10001, + "time_per_iteration": 2.654743194580078 + }, + { + "auxiliary_loss_clip": 0.01121806, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.04221725, + "balance_loss_mlp": 1.0193572, + "epoch": 0.6013527731850293, + "flos": 42538254369120.0, + "grad_norm": 1.953537790449529, + "language_loss": 0.58973324, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.61127627, + "num_input_tokens_seen": 215394535, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13128662, + "step": 10002, + "time_per_iteration": 2.8407235145568848 + }, + { + "auxiliary_loss_clip": 0.0112181, + "auxiliary_loss_mlp": 0.01033431, + "balance_loss_clip": 1.0439868, + "balance_loss_mlp": 1.02028823, + "epoch": 0.6014128964376972, + "flos": 28421332436160.0, + "grad_norm": 2.5039274169120302, + "language_loss": 0.77888274, + "learning_rate": 1.447431741055314e-06, + "loss": 0.80043519, + "num_input_tokens_seen": 215414355, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13153076, + "step": 10003, + "time_per_iteration": 2.6641201972961426 + }, + { + "auxiliary_loss_clip": 0.01121027, + "auxiliary_loss_mlp": 0.0103161, + "balance_loss_clip": 1.04289162, + "balance_loss_mlp": 1.01933765, + "epoch": 0.6014730196903653, + "flos": 30287413188000.0, + "grad_norm": 2.3703535693045796, + "language_loss": 0.77528799, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.79681432, + "num_input_tokens_seen": 215428280, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12261963, + "step": 10004, + "time_per_iteration": 2.700270175933838 + }, + { + "auxiliary_loss_clip": 0.01117738, + "auxiliary_loss_mlp": 0.01031576, + "balance_loss_clip": 1.04131281, + "balance_loss_mlp": 1.01914883, + "epoch": 0.6015331429430332, + "flos": 28201922599680.0, + "grad_norm": 1.8067726013085958, + "language_loss": 0.72493356, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.7464267, + "num_input_tokens_seen": 215448970, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12426758, + "step": 10005, + "time_per_iteration": 2.645920515060425 + }, + { + "auxiliary_loss_clip": 0.011152, + "auxiliary_loss_mlp": 0.01029606, + "balance_loss_clip": 1.04124784, + "balance_loss_mlp": 1.01767302, + "epoch": 0.6015932661957012, + "flos": 23430071681280.0, + "grad_norm": 2.6733945036477604, + "language_loss": 0.74796546, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.76941347, + "num_input_tokens_seen": 215465260, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11938477, + "step": 10006, + "time_per_iteration": 2.655233383178711 + }, + { + "auxiliary_loss_clip": 0.0111737, + "auxiliary_loss_mlp": 0.0103637, + "balance_loss_clip": 1.04016864, + "balance_loss_mlp": 1.02346563, + "epoch": 0.6016533894483691, + "flos": 22102105754880.0, + "grad_norm": 1.935840350547238, + "language_loss": 0.73659766, + "learning_rate": 1.445934699732685e-06, + "loss": 0.75813502, + "num_input_tokens_seen": 215482725, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12921143, + "step": 10007, + "time_per_iteration": 4.096645355224609 + }, + { + "auxiliary_loss_clip": 0.0111693, + "auxiliary_loss_mlp": 0.01027938, + "balance_loss_clip": 1.04206729, + "balance_loss_mlp": 1.01629114, + "epoch": 0.6017135127010371, + "flos": 19787521700160.0, + "grad_norm": 2.125045701064946, + "language_loss": 0.69710732, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.71855593, + "num_input_tokens_seen": 215500420, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11645508, + "step": 10008, + "time_per_iteration": 2.728233575820923 + }, + { + "auxiliary_loss_clip": 0.01118135, + "auxiliary_loss_mlp": 0.01026823, + "balance_loss_clip": 1.04227066, + "balance_loss_mlp": 1.01546264, + "epoch": 0.6017736359537051, + "flos": 28606592556000.0, + "grad_norm": 1.6788109867028422, + "language_loss": 0.76274931, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.78419888, + "num_input_tokens_seen": 215522260, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11358643, + "step": 10009, + "time_per_iteration": 2.7841122150421143 + }, + { + "auxiliary_loss_clip": 0.01119232, + "auxiliary_loss_mlp": 0.01032971, + "balance_loss_clip": 1.04236734, + "balance_loss_mlp": 1.02092469, + "epoch": 0.601833759206373, + "flos": 28692071316000.0, + "grad_norm": 2.698275172894321, + "language_loss": 0.74301791, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.76453996, + "num_input_tokens_seen": 215541715, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12042236, + "step": 10010, + "time_per_iteration": 4.214098930358887 + }, + { + "auxiliary_loss_clip": 0.01043289, + "auxiliary_loss_mlp": 0.0100657, + "balance_loss_clip": 1.01868749, + "balance_loss_mlp": 1.00522935, + "epoch": 0.6018938824590411, + "flos": 78081231294720.0, + "grad_norm": 0.8055443731491192, + "language_loss": 0.55078924, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57128781, + "num_input_tokens_seen": 215603020, + "router_z_loss_clip": 0.24584961, + "router_z_loss_mlp": 0.0134201, + "step": 10011, + "time_per_iteration": 3.4452896118164062 + }, + { + "auxiliary_loss_clip": 0.01119632, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.04254937, + "balance_loss_mlp": 1.02493596, + "epoch": 0.601954005711709, + "flos": 42269014628640.0, + "grad_norm": 1.4252779828664437, + "language_loss": 0.62046897, + "learning_rate": 1.44406387091556e-06, + "loss": 0.64202803, + "num_input_tokens_seen": 215625115, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.11352539, + "step": 10012, + "time_per_iteration": 2.865030527114868 + }, + { + "auxiliary_loss_clip": 0.01119077, + "auxiliary_loss_mlp": 0.01024986, + "balance_loss_clip": 1.04340637, + "balance_loss_mlp": 1.01366115, + "epoch": 0.602014128964377, + "flos": 23704862289120.0, + "grad_norm": 1.7927633052515326, + "language_loss": 0.74938524, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.77082586, + "num_input_tokens_seen": 215643730, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11328125, + "step": 10013, + "time_per_iteration": 2.662494659423828 + }, + { + "auxiliary_loss_clip": 0.01112559, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.04116738, + "balance_loss_mlp": 1.01992893, + "epoch": 0.6020742522170449, + "flos": 34568183492640.0, + "grad_norm": 1.6815971744577574, + "language_loss": 0.81505275, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.83649117, + "num_input_tokens_seen": 215664425, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11352539, + "step": 10014, + "time_per_iteration": 2.7906346321105957 + }, + { + "auxiliary_loss_clip": 0.01112465, + "auxiliary_loss_mlp": 0.01026559, + "balance_loss_clip": 1.04047263, + "balance_loss_mlp": 1.01480472, + "epoch": 0.6021343754697129, + "flos": 27756174471840.0, + "grad_norm": 1.4928963151724086, + "language_loss": 0.72654492, + "learning_rate": 1.442941626485624e-06, + "loss": 0.74793518, + "num_input_tokens_seen": 215684280, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11749268, + "step": 10015, + "time_per_iteration": 2.6759798526763916 + }, + { + "auxiliary_loss_clip": 0.01043832, + "auxiliary_loss_mlp": 0.01004383, + "balance_loss_clip": 1.01945961, + "balance_loss_mlp": 1.00302482, + "epoch": 0.6021944987223808, + "flos": 80231860968480.0, + "grad_norm": 0.8162132620676624, + "language_loss": 0.54765081, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.56813294, + "num_input_tokens_seen": 215739780, + "router_z_loss_clip": 0.24377441, + "router_z_loss_mlp": 0.01358032, + "step": 10016, + "time_per_iteration": 3.156724452972412 + }, + { + "auxiliary_loss_clip": 0.01118177, + "auxiliary_loss_mlp": 0.01029051, + "balance_loss_clip": 1.04321551, + "balance_loss_mlp": 1.0167073, + "epoch": 0.6022546219750489, + "flos": 19651402690560.0, + "grad_norm": 1.629640340666775, + "language_loss": 0.8276816, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.84915388, + "num_input_tokens_seen": 215757885, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12341309, + "step": 10017, + "time_per_iteration": 2.6906652450561523 + }, + { + "auxiliary_loss_clip": 0.0111752, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.04350543, + "balance_loss_mlp": 1.01919198, + "epoch": 0.6023147452277168, + "flos": 31131551093760.0, + "grad_norm": 1.9605149538188014, + "language_loss": 0.83602476, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.85751063, + "num_input_tokens_seen": 215776415, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11871338, + "step": 10018, + "time_per_iteration": 2.77352237701416 + }, + { + "auxiliary_loss_clip": 0.01120729, + "auxiliary_loss_mlp": 0.01035191, + "balance_loss_clip": 1.04210413, + "balance_loss_mlp": 1.02267361, + "epoch": 0.6023748684803848, + "flos": 27620906325120.0, + "grad_norm": 2.71809843233629, + "language_loss": 0.7834326, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.80499178, + "num_input_tokens_seen": 215794865, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12518311, + "step": 10019, + "time_per_iteration": 2.7360663414001465 + }, + { + "auxiliary_loss_clip": 0.01116452, + "auxiliary_loss_mlp": 0.01027041, + "balance_loss_clip": 1.04025435, + "balance_loss_mlp": 1.01560903, + "epoch": 0.6024349917330527, + "flos": 31982739006240.0, + "grad_norm": 1.5137917211856564, + "language_loss": 0.73753393, + "learning_rate": 1.441071641765681e-06, + "loss": 0.75896889, + "num_input_tokens_seen": 215816840, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11419678, + "step": 10020, + "time_per_iteration": 2.6910386085510254 + }, + { + "auxiliary_loss_clip": 0.01118026, + "auxiliary_loss_mlp": 0.01032883, + "balance_loss_clip": 1.04153049, + "balance_loss_mlp": 1.02081311, + "epoch": 0.6024951149857207, + "flos": 25931455512480.0, + "grad_norm": 3.5177853182384595, + "language_loss": 0.64181638, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.66332543, + "num_input_tokens_seen": 215836100, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12060547, + "step": 10021, + "time_per_iteration": 2.7116644382476807 + }, + { + "auxiliary_loss_clip": 0.01117193, + "auxiliary_loss_mlp": 0.01028706, + "balance_loss_clip": 1.04080224, + "balance_loss_mlp": 1.01662993, + "epoch": 0.6025552382383887, + "flos": 32876909402400.0, + "grad_norm": 2.327325818343379, + "language_loss": 0.80355519, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.82501411, + "num_input_tokens_seen": 215858480, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.1206665, + "step": 10022, + "time_per_iteration": 2.674232006072998 + }, + { + "auxiliary_loss_clip": 0.01121986, + "auxiliary_loss_mlp": 0.01026025, + "balance_loss_clip": 1.04282856, + "balance_loss_mlp": 1.01422918, + "epoch": 0.6026153614910567, + "flos": 38662356607200.0, + "grad_norm": 1.5784401840123203, + "language_loss": 0.66492784, + "learning_rate": 1.439949905155693e-06, + "loss": 0.68640792, + "num_input_tokens_seen": 215879950, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.11798096, + "step": 10023, + "time_per_iteration": 2.7938108444213867 + }, + { + "auxiliary_loss_clip": 0.01120391, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.04178309, + "balance_loss_mlp": 1.01913834, + "epoch": 0.6026754847437247, + "flos": 35768295934560.0, + "grad_norm": 4.611190120990335, + "language_loss": 0.743765, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.76528466, + "num_input_tokens_seen": 215899830, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12438965, + "step": 10024, + "time_per_iteration": 2.687462568283081 + }, + { + "auxiliary_loss_clip": 0.0111725, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.04225254, + "balance_loss_mlp": 1.02020884, + "epoch": 0.6027356079963926, + "flos": 28780831975680.0, + "grad_norm": 1.8516610056408171, + "language_loss": 0.72664177, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.74814302, + "num_input_tokens_seen": 215920440, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12689209, + "step": 10025, + "time_per_iteration": 2.7294576168060303 + }, + { + "auxiliary_loss_clip": 0.01122619, + "auxiliary_loss_mlp": 0.01036304, + "balance_loss_clip": 1.04229581, + "balance_loss_mlp": 1.02342379, + "epoch": 0.6027957312490606, + "flos": 25396987242240.0, + "grad_norm": 2.503192950634915, + "language_loss": 0.67946887, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.70105803, + "num_input_tokens_seen": 215940535, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.12884521, + "step": 10026, + "time_per_iteration": 2.6773715019226074 + }, + { + "auxiliary_loss_clip": 0.01111353, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.03826058, + "balance_loss_mlp": 1.02234757, + "epoch": 0.6028558545017285, + "flos": 24325822251360.0, + "grad_norm": 1.866957824085038, + "language_loss": 0.801543, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.82299489, + "num_input_tokens_seen": 215958045, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.1149292, + "step": 10027, + "time_per_iteration": 2.702651023864746 + }, + { + "auxiliary_loss_clip": 0.01122751, + "auxiliary_loss_mlp": 0.01032375, + "balance_loss_clip": 1.04403758, + "balance_loss_mlp": 1.02020383, + "epoch": 0.6029159777543965, + "flos": 27846150649920.0, + "grad_norm": 2.760728077998871, + "language_loss": 0.71109468, + "learning_rate": 1.438080769071171e-06, + "loss": 0.73264593, + "num_input_tokens_seen": 215977330, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12182617, + "step": 10028, + "time_per_iteration": 2.68271541595459 + }, + { + "auxiliary_loss_clip": 0.01120569, + "auxiliary_loss_mlp": 0.0103538, + "balance_loss_clip": 1.04166913, + "balance_loss_mlp": 1.02256489, + "epoch": 0.6029761010070644, + "flos": 28781358700320.0, + "grad_norm": 2.0755821469927582, + "language_loss": 0.84629536, + "learning_rate": 1.437707005721669e-06, + "loss": 0.86785483, + "num_input_tokens_seen": 215997865, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.1282959, + "step": 10029, + "time_per_iteration": 2.7007198333740234 + }, + { + "auxiliary_loss_clip": 0.01115272, + "auxiliary_loss_mlp": 0.01033691, + "balance_loss_clip": 1.04080939, + "balance_loss_mlp": 1.02176976, + "epoch": 0.6030362242597325, + "flos": 16670404635840.0, + "grad_norm": 1.9313871470728996, + "language_loss": 0.79608911, + "learning_rate": 1.437333263694373e-06, + "loss": 0.81757873, + "num_input_tokens_seen": 216016230, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11932373, + "step": 10030, + "time_per_iteration": 2.7230188846588135 + }, + { + "auxiliary_loss_clip": 0.01118686, + "auxiliary_loss_mlp": 0.0103024, + "balance_loss_clip": 1.04212785, + "balance_loss_mlp": 1.01853383, + "epoch": 0.6030963475124004, + "flos": 29801397234240.0, + "grad_norm": 2.977432885439609, + "language_loss": 0.71255332, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.73404258, + "num_input_tokens_seen": 216035785, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.11712646, + "step": 10031, + "time_per_iteration": 2.7134315967559814 + }, + { + "auxiliary_loss_clip": 0.01122765, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.04172361, + "balance_loss_mlp": 1.02088118, + "epoch": 0.6031564707650684, + "flos": 36172317614400.0, + "grad_norm": 1.7156423116902741, + "language_loss": 0.73269564, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.75426447, + "num_input_tokens_seen": 216059555, + "router_z_loss_clip": 0.81054688, + "router_z_loss_mlp": 0.13250732, + "step": 10032, + "time_per_iteration": 2.743067502975464 + }, + { + "auxiliary_loss_clip": 0.01122748, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.04532552, + "balance_loss_mlp": 1.02052689, + "epoch": 0.6032165940177363, + "flos": 20277873002880.0, + "grad_norm": 1.8303756564113316, + "language_loss": 0.68177104, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.70333242, + "num_input_tokens_seen": 216077235, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12872314, + "step": 10033, + "time_per_iteration": 2.7397003173828125 + }, + { + "auxiliary_loss_clip": 0.01119868, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.04375672, + "balance_loss_mlp": 1.01650548, + "epoch": 0.6032767172704043, + "flos": 21339354363840.0, + "grad_norm": 4.2128266571338795, + "language_loss": 0.75775254, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.77924085, + "num_input_tokens_seen": 216094985, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12463379, + "step": 10034, + "time_per_iteration": 2.6413302421569824 + }, + { + "auxiliary_loss_clip": 0.01119629, + "auxiliary_loss_mlp": 0.01028159, + "balance_loss_clip": 1.04152751, + "balance_loss_mlp": 1.01546371, + "epoch": 0.6033368405230723, + "flos": 32742167980320.0, + "grad_norm": 9.489731693794065, + "language_loss": 0.74654782, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.76802576, + "num_input_tokens_seen": 216115905, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12713623, + "step": 10035, + "time_per_iteration": 2.714552402496338 + }, + { + "auxiliary_loss_clip": 0.01118007, + "auxiliary_loss_mlp": 0.01026932, + "balance_loss_clip": 1.04289711, + "balance_loss_mlp": 1.01552963, + "epoch": 0.6033969637757403, + "flos": 20633199262560.0, + "grad_norm": 1.7341410687562564, + "language_loss": 0.86573726, + "learning_rate": 1.435091260090536e-06, + "loss": 0.88718665, + "num_input_tokens_seen": 216132420, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11401367, + "step": 10036, + "time_per_iteration": 2.8094770908355713 + }, + { + "auxiliary_loss_clip": 0.01119717, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.0427382, + "balance_loss_mlp": 1.02250671, + "epoch": 0.6034570870284083, + "flos": 27979473967200.0, + "grad_norm": 4.442556536727268, + "language_loss": 0.700423, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.72196794, + "num_input_tokens_seen": 216149800, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12255859, + "step": 10037, + "time_per_iteration": 2.6290857791900635 + }, + { + "auxiliary_loss_clip": 0.01116854, + "auxiliary_loss_mlp": 0.01027798, + "balance_loss_clip": 1.04259288, + "balance_loss_mlp": 1.01573431, + "epoch": 0.6035172102810762, + "flos": 28511956890720.0, + "grad_norm": 1.86310907041191, + "language_loss": 0.85409772, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.87554425, + "num_input_tokens_seen": 216168200, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12060547, + "step": 10038, + "time_per_iteration": 4.134376287460327 + }, + { + "auxiliary_loss_clip": 0.01120231, + "auxiliary_loss_mlp": 0.01031784, + "balance_loss_clip": 1.04164493, + "balance_loss_mlp": 1.01940966, + "epoch": 0.6035773335337442, + "flos": 25486436695680.0, + "grad_norm": 2.782434766937151, + "language_loss": 0.76383519, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.78535533, + "num_input_tokens_seen": 216187105, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.1237793, + "step": 10039, + "time_per_iteration": 4.0991926193237305 + }, + { + "auxiliary_loss_clip": 0.01116711, + "auxiliary_loss_mlp": 0.01027055, + "balance_loss_clip": 1.04106462, + "balance_loss_mlp": 1.01536107, + "epoch": 0.6036374567864121, + "flos": 30427664960160.0, + "grad_norm": 1.9226257635907793, + "language_loss": 0.71385539, + "learning_rate": 1.433597019260301e-06, + "loss": 0.73529303, + "num_input_tokens_seen": 216205440, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11694336, + "step": 10040, + "time_per_iteration": 2.7573819160461426 + }, + { + "auxiliary_loss_clip": 0.01125297, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.04516459, + "balance_loss_mlp": 1.01846516, + "epoch": 0.6036975800390801, + "flos": 28245796463520.0, + "grad_norm": 2.915985279459391, + "language_loss": 0.78399509, + "learning_rate": 1.433223512712475e-06, + "loss": 0.80557102, + "num_input_tokens_seen": 216223130, + "router_z_loss_clip": 0.80175781, + "router_z_loss_mlp": 0.13842773, + "step": 10041, + "time_per_iteration": 2.626983165740967 + }, + { + "auxiliary_loss_clip": 0.01117457, + "auxiliary_loss_mlp": 0.01028018, + "balance_loss_clip": 1.04189968, + "balance_loss_mlp": 1.01634121, + "epoch": 0.603757703291748, + "flos": 22769735273280.0, + "grad_norm": 1.7855010504563213, + "language_loss": 0.75502789, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.77648258, + "num_input_tokens_seen": 216240260, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11676025, + "step": 10042, + "time_per_iteration": 2.6635820865631104 + }, + { + "auxiliary_loss_clip": 0.01116555, + "auxiliary_loss_mlp": 0.01025474, + "balance_loss_clip": 1.04148221, + "balance_loss_mlp": 1.01411319, + "epoch": 0.6038178265444161, + "flos": 24016679340480.0, + "grad_norm": 2.045795354373741, + "language_loss": 0.84606296, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.86748326, + "num_input_tokens_seen": 216258510, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.1137085, + "step": 10043, + "time_per_iteration": 2.717973232269287 + }, + { + "auxiliary_loss_clip": 0.01119795, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.04033685, + "balance_loss_mlp": 1.01972198, + "epoch": 0.603877949797084, + "flos": 27623337361920.0, + "grad_norm": 2.0474352414448598, + "language_loss": 0.69376421, + "learning_rate": 1.432103122078974e-06, + "loss": 0.71528888, + "num_input_tokens_seen": 216277550, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.12945557, + "step": 10044, + "time_per_iteration": 2.7680323123931885 + }, + { + "auxiliary_loss_clip": 0.0112332, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.04422259, + "balance_loss_mlp": 1.01893246, + "epoch": 0.603938073049752, + "flos": 31051866304800.0, + "grad_norm": 5.354318266385468, + "language_loss": 0.78208852, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.80363965, + "num_input_tokens_seen": 216296690, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.12884521, + "step": 10045, + "time_per_iteration": 2.7015485763549805 + }, + { + "auxiliary_loss_clip": 0.01116219, + "auxiliary_loss_mlp": 0.01028254, + "balance_loss_clip": 1.04123974, + "balance_loss_mlp": 1.01614237, + "epoch": 0.6039981963024199, + "flos": 27260150749920.0, + "grad_norm": 2.5346439067365982, + "language_loss": 0.77111459, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.79255927, + "num_input_tokens_seen": 216316110, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12109375, + "step": 10046, + "time_per_iteration": 4.153211832046509 + }, + { + "auxiliary_loss_clip": 0.01115476, + "auxiliary_loss_mlp": 0.01027959, + "balance_loss_clip": 1.0391953, + "balance_loss_mlp": 1.01693821, + "epoch": 0.6040583195550879, + "flos": 25262124268320.0, + "grad_norm": 5.442029953267571, + "language_loss": 0.8715784, + "learning_rate": 1.430982925257827e-06, + "loss": 0.89301276, + "num_input_tokens_seen": 216333855, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.11029053, + "step": 10047, + "time_per_iteration": 2.8016631603240967 + }, + { + "auxiliary_loss_clip": 0.01118994, + "auxiliary_loss_mlp": 0.01026226, + "balance_loss_clip": 1.04557204, + "balance_loss_mlp": 1.01491284, + "epoch": 0.604118442807756, + "flos": 33144893107200.0, + "grad_norm": 1.6681053096804073, + "language_loss": 0.75821286, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.77966511, + "num_input_tokens_seen": 216354890, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11315918, + "step": 10048, + "time_per_iteration": 2.800579071044922 + }, + { + "auxiliary_loss_clip": 0.01126339, + "auxiliary_loss_mlp": 0.01040821, + "balance_loss_clip": 1.04321957, + "balance_loss_mlp": 1.02683187, + "epoch": 0.6041785660604239, + "flos": 37682666933760.0, + "grad_norm": 2.459387106702191, + "language_loss": 0.66100359, + "learning_rate": 1.430236235239386e-06, + "loss": 0.68267524, + "num_input_tokens_seen": 216376055, + "router_z_loss_clip": 0.83105469, + "router_z_loss_mlp": 0.13977051, + "step": 10049, + "time_per_iteration": 2.7908430099487305 + }, + { + "auxiliary_loss_clip": 0.01116816, + "auxiliary_loss_mlp": 0.01030406, + "balance_loss_clip": 1.04191196, + "balance_loss_mlp": 1.01853275, + "epoch": 0.6042386893130919, + "flos": 24328577426400.0, + "grad_norm": 1.7619677521260637, + "language_loss": 0.66932982, + "learning_rate": 1.429862922631336e-06, + "loss": 0.6908021, + "num_input_tokens_seen": 216396295, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11877441, + "step": 10050, + "time_per_iteration": 4.569811582565308 + }, + { + "auxiliary_loss_clip": 0.01121318, + "auxiliary_loss_mlp": 0.01032275, + "balance_loss_clip": 1.04462051, + "balance_loss_mlp": 1.02034187, + "epoch": 0.6042988125657598, + "flos": 39554865794880.0, + "grad_norm": 2.0495317453789297, + "language_loss": 0.69606149, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.71759742, + "num_input_tokens_seen": 216416605, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11932373, + "step": 10051, + "time_per_iteration": 2.7424967288970947 + }, + { + "auxiliary_loss_clip": 0.01113893, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.03783059, + "balance_loss_mlp": 1.01818061, + "epoch": 0.6043589358184278, + "flos": 21256630778880.0, + "grad_norm": 2.143248589338131, + "language_loss": 0.64344347, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.66488177, + "num_input_tokens_seen": 216435130, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11755371, + "step": 10052, + "time_per_iteration": 2.711028575897217 + }, + { + "auxiliary_loss_clip": 0.0111988, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.04325128, + "balance_loss_mlp": 1.01921701, + "epoch": 0.6044190590710957, + "flos": 33766987553280.0, + "grad_norm": 2.3783937925504364, + "language_loss": 0.68997753, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.7114917, + "num_input_tokens_seen": 216455640, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12335205, + "step": 10053, + "time_per_iteration": 2.672436475753784 + }, + { + "auxiliary_loss_clip": 0.01045229, + "auxiliary_loss_mlp": 0.01008173, + "balance_loss_clip": 1.02100062, + "balance_loss_mlp": 1.00685072, + "epoch": 0.6044791823237637, + "flos": 79699945286880.0, + "grad_norm": 0.7432156969071031, + "language_loss": 0.60415125, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62468529, + "num_input_tokens_seen": 216518130, + "router_z_loss_clip": 0.24255371, + "router_z_loss_mlp": 0.01322937, + "step": 10054, + "time_per_iteration": 3.4097461700439453 + }, + { + "auxiliary_loss_clip": 0.0111765, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.04273844, + "balance_loss_mlp": 1.01693225, + "epoch": 0.6045393055764317, + "flos": 29886511338720.0, + "grad_norm": 1.763835364981906, + "language_loss": 0.8536588, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.87513053, + "num_input_tokens_seen": 216536845, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12585449, + "step": 10055, + "time_per_iteration": 2.7141494750976562 + }, + { + "auxiliary_loss_clip": 0.01122461, + "auxiliary_loss_mlp": 0.01039416, + "balance_loss_clip": 1.04541802, + "balance_loss_mlp": 1.02602887, + "epoch": 0.6045994288290997, + "flos": 23250767601600.0, + "grad_norm": 2.5784943786870707, + "language_loss": 0.73668981, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.75830859, + "num_input_tokens_seen": 216551860, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13378906, + "step": 10056, + "time_per_iteration": 2.6386966705322266 + }, + { + "auxiliary_loss_clip": 0.0111731, + "auxiliary_loss_mlp": 0.01032248, + "balance_loss_clip": 1.04337239, + "balance_loss_mlp": 1.02091706, + "epoch": 0.6046595520817676, + "flos": 32429540583360.0, + "grad_norm": 2.1774213102233255, + "language_loss": 0.80509138, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.82658696, + "num_input_tokens_seen": 216574775, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11328125, + "step": 10057, + "time_per_iteration": 2.7279601097106934 + }, + { + "auxiliary_loss_clip": 0.0111593, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.04214549, + "balance_loss_mlp": 1.01836026, + "epoch": 0.6047196753344356, + "flos": 16573256899200.0, + "grad_norm": 3.6865488390768877, + "language_loss": 0.75599486, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.77746296, + "num_input_tokens_seen": 216590100, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12512207, + "step": 10058, + "time_per_iteration": 2.6275670528411865 + }, + { + "auxiliary_loss_clip": 0.01114894, + "auxiliary_loss_mlp": 0.01027663, + "balance_loss_clip": 1.04041862, + "balance_loss_mlp": 1.01555109, + "epoch": 0.6047797985871035, + "flos": 31140464895360.0, + "grad_norm": 3.1652676056242086, + "language_loss": 0.71083736, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.73226291, + "num_input_tokens_seen": 216610145, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12121582, + "step": 10059, + "time_per_iteration": 2.7704107761383057 + }, + { + "auxiliary_loss_clip": 0.01119886, + "auxiliary_loss_mlp": 0.0102881, + "balance_loss_clip": 1.04359555, + "balance_loss_mlp": 1.01696658, + "epoch": 0.6048399218397715, + "flos": 25038865290240.0, + "grad_norm": 1.579392375596526, + "language_loss": 0.76050901, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.78199601, + "num_input_tokens_seen": 216630625, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.11846924, + "step": 10060, + "time_per_iteration": 2.691845655441284 + }, + { + "auxiliary_loss_clip": 0.01117007, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.04212499, + "balance_loss_mlp": 1.02155423, + "epoch": 0.6049000450924396, + "flos": 24903151453440.0, + "grad_norm": 1.9332390419898642, + "language_loss": 0.73344398, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.75494742, + "num_input_tokens_seen": 216649255, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11798096, + "step": 10061, + "time_per_iteration": 2.6922543048858643 + }, + { + "auxiliary_loss_clip": 0.01119384, + "auxiliary_loss_mlp": 0.01028194, + "balance_loss_clip": 1.04169488, + "balance_loss_mlp": 1.01631498, + "epoch": 0.6049601683451075, + "flos": 25308591238080.0, + "grad_norm": 2.700539259978458, + "language_loss": 0.67358226, + "learning_rate": 1.425384861715639e-06, + "loss": 0.69505805, + "num_input_tokens_seen": 216668100, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.11877441, + "step": 10062, + "time_per_iteration": 2.6497504711151123 + }, + { + "auxiliary_loss_clip": 0.01117048, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.04105473, + "balance_loss_mlp": 1.02180219, + "epoch": 0.6050202915977755, + "flos": 24505490986560.0, + "grad_norm": 3.2235989871286455, + "language_loss": 0.71218181, + "learning_rate": 1.425011831266978e-06, + "loss": 0.73369133, + "num_input_tokens_seen": 216686125, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12103271, + "step": 10063, + "time_per_iteration": 2.6844022274017334 + }, + { + "auxiliary_loss_clip": 0.01113759, + "auxiliary_loss_mlp": 0.01033873, + "balance_loss_clip": 1.04047799, + "balance_loss_mlp": 1.02244639, + "epoch": 0.6050804148504434, + "flos": 19475096889600.0, + "grad_norm": 2.063944166408617, + "language_loss": 0.84474212, + "learning_rate": 1.424638822621926e-06, + "loss": 0.86621845, + "num_input_tokens_seen": 216704265, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11413574, + "step": 10064, + "time_per_iteration": 2.644134283065796 + }, + { + "auxiliary_loss_clip": 0.0111858, + "auxiliary_loss_mlp": 0.01031857, + "balance_loss_clip": 1.04290223, + "balance_loss_mlp": 1.02003145, + "epoch": 0.6051405381031114, + "flos": 21300423608160.0, + "grad_norm": 2.579934138184316, + "language_loss": 0.7926532, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.81415755, + "num_input_tokens_seen": 216721765, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.1182251, + "step": 10065, + "time_per_iteration": 2.6583707332611084 + }, + { + "auxiliary_loss_clip": 0.01123648, + "auxiliary_loss_mlp": 0.01029183, + "balance_loss_clip": 1.0447247, + "balance_loss_mlp": 1.01655316, + "epoch": 0.6052006613557793, + "flos": 13909788832320.0, + "grad_norm": 1.9455503800510279, + "language_loss": 0.78452885, + "learning_rate": 1.423892870799226e-06, + "loss": 0.80605716, + "num_input_tokens_seen": 216738295, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12628174, + "step": 10066, + "time_per_iteration": 2.6844241619110107 + }, + { + "auxiliary_loss_clip": 0.01117343, + "auxiliary_loss_mlp": 0.01032634, + "balance_loss_clip": 1.0420872, + "balance_loss_mlp": 1.02031946, + "epoch": 0.6052607846084473, + "flos": 30201812876160.0, + "grad_norm": 6.704444242965567, + "language_loss": 0.72986984, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.75136954, + "num_input_tokens_seen": 216759875, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12310791, + "step": 10067, + "time_per_iteration": 2.694626808166504 + }, + { + "auxiliary_loss_clip": 0.01120853, + "auxiliary_loss_mlp": 0.01031987, + "balance_loss_clip": 1.04494643, + "balance_loss_mlp": 1.02010143, + "epoch": 0.6053209078611153, + "flos": 25308348134400.0, + "grad_norm": 1.356491911817256, + "language_loss": 0.68719006, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.70871842, + "num_input_tokens_seen": 216780705, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11877441, + "step": 10068, + "time_per_iteration": 2.7712390422821045 + }, + { + "auxiliary_loss_clip": 0.01117055, + "auxiliary_loss_mlp": 0.01030799, + "balance_loss_clip": 1.03931594, + "balance_loss_mlp": 1.01878893, + "epoch": 0.6053810311137833, + "flos": 23126520155040.0, + "grad_norm": 3.568269262136346, + "language_loss": 0.87222409, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.89370263, + "num_input_tokens_seen": 216797625, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12011719, + "step": 10069, + "time_per_iteration": 2.7666003704071045 + }, + { + "auxiliary_loss_clip": 0.01117431, + "auxiliary_loss_mlp": 0.01026863, + "balance_loss_clip": 1.04201472, + "balance_loss_mlp": 1.01493049, + "epoch": 0.6054411543664512, + "flos": 29225486136960.0, + "grad_norm": 1.6871536797140063, + "language_loss": 0.83338809, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.85483104, + "num_input_tokens_seen": 216817610, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.1192627, + "step": 10070, + "time_per_iteration": 2.736921787261963 + }, + { + "auxiliary_loss_clip": 0.01119667, + "auxiliary_loss_mlp": 0.01034844, + "balance_loss_clip": 1.04232967, + "balance_loss_mlp": 1.02278566, + "epoch": 0.6055012776191192, + "flos": 25128881985600.0, + "grad_norm": 1.5741103325432406, + "language_loss": 0.85957485, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.88111997, + "num_input_tokens_seen": 216836835, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.1204834, + "step": 10071, + "time_per_iteration": 2.653632402420044 + }, + { + "auxiliary_loss_clip": 0.01124256, + "auxiliary_loss_mlp": 0.01036095, + "balance_loss_clip": 1.04512835, + "balance_loss_mlp": 1.02279747, + "epoch": 0.6055614008717871, + "flos": 36970798896000.0, + "grad_norm": 2.5158553593510495, + "language_loss": 0.77122486, + "learning_rate": 1.421655540088603e-06, + "loss": 0.79282838, + "num_input_tokens_seen": 216856760, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.13293457, + "step": 10072, + "time_per_iteration": 2.7969589233398438 + }, + { + "auxiliary_loss_clip": 0.0111786, + "auxiliary_loss_mlp": 0.01028728, + "balance_loss_clip": 1.04073417, + "balance_loss_mlp": 1.01568627, + "epoch": 0.6056215241244551, + "flos": 33098547689280.0, + "grad_norm": 2.3582012678377353, + "language_loss": 0.7458474, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.76731336, + "num_input_tokens_seen": 216878795, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.13043213, + "step": 10073, + "time_per_iteration": 2.724830150604248 + }, + { + "auxiliary_loss_clip": 0.01044491, + "auxiliary_loss_mlp": 0.01003644, + "balance_loss_clip": 1.02003098, + "balance_loss_mlp": 1.00240469, + "epoch": 0.6056816473771232, + "flos": 68340924499680.0, + "grad_norm": 0.7554322700033171, + "language_loss": 0.551808, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.57228941, + "num_input_tokens_seen": 216937800, + "router_z_loss_clip": 0.24462891, + "router_z_loss_mlp": 0.01239014, + "step": 10074, + "time_per_iteration": 3.3816754817962646 + }, + { + "auxiliary_loss_clip": 0.01119103, + "auxiliary_loss_mlp": 0.01030662, + "balance_loss_clip": 1.04371941, + "balance_loss_mlp": 1.01844287, + "epoch": 0.6057417706297911, + "flos": 28735418455200.0, + "grad_norm": 1.891708117347832, + "language_loss": 0.81701028, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.83850795, + "num_input_tokens_seen": 216955280, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12219238, + "step": 10075, + "time_per_iteration": 2.697728395462036 + }, + { + "auxiliary_loss_clip": 0.01118551, + "auxiliary_loss_mlp": 0.01024901, + "balance_loss_clip": 1.04125118, + "balance_loss_mlp": 1.01246762, + "epoch": 0.6058018938824591, + "flos": 33854330108160.0, + "grad_norm": 2.5359545199628077, + "language_loss": 0.78635478, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.80778933, + "num_input_tokens_seen": 216976950, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12438965, + "step": 10076, + "time_per_iteration": 2.7223007678985596 + }, + { + "auxiliary_loss_clip": 0.01120901, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.04218888, + "balance_loss_mlp": 1.01984942, + "epoch": 0.605862017135127, + "flos": 27800534543040.0, + "grad_norm": 1.7992225198259801, + "language_loss": 0.72256327, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.7440896, + "num_input_tokens_seen": 216996945, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.11889648, + "step": 10077, + "time_per_iteration": 2.6698856353759766 + }, + { + "auxiliary_loss_clip": 0.01121959, + "auxiliary_loss_mlp": 0.01037136, + "balance_loss_clip": 1.04470062, + "balance_loss_mlp": 1.02466691, + "epoch": 0.605922140387795, + "flos": 25886649751200.0, + "grad_norm": 3.990413561401416, + "language_loss": 0.55311525, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.5747062, + "num_input_tokens_seen": 217016580, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12469482, + "step": 10078, + "time_per_iteration": 5.705039978027344 + }, + { + "auxiliary_loss_clip": 0.01119352, + "auxiliary_loss_mlp": 0.01033128, + "balance_loss_clip": 1.0414722, + "balance_loss_mlp": 1.02076602, + "epoch": 0.6059822636404629, + "flos": 33273719006400.0, + "grad_norm": 2.8400242231464072, + "language_loss": 0.70255095, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.72407579, + "num_input_tokens_seen": 217037300, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12365723, + "step": 10079, + "time_per_iteration": 2.697089433670044 + }, + { + "auxiliary_loss_clip": 0.01118933, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.04241252, + "balance_loss_mlp": 1.02335525, + "epoch": 0.606042386893131, + "flos": 25174376540640.0, + "grad_norm": 4.233032268066151, + "language_loss": 0.62595391, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.64749455, + "num_input_tokens_seen": 217055805, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11785889, + "step": 10080, + "time_per_iteration": 2.7337646484375 + }, + { + "auxiliary_loss_clip": 0.0112012, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.04332519, + "balance_loss_mlp": 1.01609349, + "epoch": 0.6061025101457989, + "flos": 28067667384960.0, + "grad_norm": 2.0730386186349947, + "language_loss": 0.71219015, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.73367643, + "num_input_tokens_seen": 217074175, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12420654, + "step": 10081, + "time_per_iteration": 2.669154644012451 + }, + { + "auxiliary_loss_clip": 0.01119178, + "auxiliary_loss_mlp": 0.01028809, + "balance_loss_clip": 1.0432961, + "balance_loss_mlp": 1.01599991, + "epoch": 0.6061626333984669, + "flos": 36483405354720.0, + "grad_norm": 1.6608384937485212, + "language_loss": 0.69125307, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.71273297, + "num_input_tokens_seen": 217095695, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12811279, + "step": 10082, + "time_per_iteration": 2.8127615451812744 + }, + { + "auxiliary_loss_clip": 0.0111886, + "auxiliary_loss_mlp": 0.01032523, + "balance_loss_clip": 1.04332471, + "balance_loss_mlp": 1.02069128, + "epoch": 0.6062227566511348, + "flos": 30516628206240.0, + "grad_norm": 1.9377645841563937, + "language_loss": 0.65962338, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.6811372, + "num_input_tokens_seen": 217116260, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11828613, + "step": 10083, + "time_per_iteration": 2.8045098781585693 + }, + { + "auxiliary_loss_clip": 0.01118238, + "auxiliary_loss_mlp": 0.0103238, + "balance_loss_clip": 1.04187226, + "balance_loss_mlp": 1.02019048, + "epoch": 0.6062828799038028, + "flos": 23749830119520.0, + "grad_norm": 3.07422962150431, + "language_loss": 0.73795497, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.75946116, + "num_input_tokens_seen": 217134465, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.1217041, + "step": 10084, + "time_per_iteration": 2.65321683883667 + }, + { + "auxiliary_loss_clip": 0.01119284, + "auxiliary_loss_mlp": 0.01037017, + "balance_loss_clip": 1.0418098, + "balance_loss_mlp": 1.02498293, + "epoch": 0.6063430031564707, + "flos": 16581481907040.0, + "grad_norm": 11.489909943712812, + "language_loss": 0.72460258, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.74616557, + "num_input_tokens_seen": 217149920, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12042236, + "step": 10085, + "time_per_iteration": 4.1396803855896 + }, + { + "auxiliary_loss_clip": 0.01116955, + "auxiliary_loss_mlp": 0.01036217, + "balance_loss_clip": 1.04183447, + "balance_loss_mlp": 1.02482104, + "epoch": 0.6064031264091387, + "flos": 28376850813120.0, + "grad_norm": 1.9988649117046104, + "language_loss": 0.76032102, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.78185272, + "num_input_tokens_seen": 217168165, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.1138916, + "step": 10086, + "time_per_iteration": 2.635850429534912 + }, + { + "auxiliary_loss_clip": 0.01115296, + "auxiliary_loss_mlp": 0.01033854, + "balance_loss_clip": 1.04143047, + "balance_loss_mlp": 1.021945, + "epoch": 0.6064632496618068, + "flos": 27403724939040.0, + "grad_norm": 3.115256091951121, + "language_loss": 0.72539067, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.74688214, + "num_input_tokens_seen": 217190070, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11914062, + "step": 10087, + "time_per_iteration": 2.743974208831787 + }, + { + "auxiliary_loss_clip": 0.01115035, + "auxiliary_loss_mlp": 0.01033413, + "balance_loss_clip": 1.04127824, + "balance_loss_mlp": 1.02310729, + "epoch": 0.6065233729144747, + "flos": 30650194627200.0, + "grad_norm": 2.613746359882091, + "language_loss": 0.8397727, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.86125714, + "num_input_tokens_seen": 217209370, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.10302734, + "step": 10088, + "time_per_iteration": 2.69577956199646 + }, + { + "auxiliary_loss_clip": 0.01116626, + "auxiliary_loss_mlp": 0.01027713, + "balance_loss_clip": 1.04138708, + "balance_loss_mlp": 1.01518965, + "epoch": 0.6065834961671427, + "flos": 28647306072000.0, + "grad_norm": 2.160234654117956, + "language_loss": 0.71032262, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.73176605, + "num_input_tokens_seen": 217226990, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12524414, + "step": 10089, + "time_per_iteration": 2.6829097270965576 + }, + { + "auxiliary_loss_clip": 0.0111873, + "auxiliary_loss_mlp": 0.01037625, + "balance_loss_clip": 1.04277754, + "balance_loss_mlp": 1.02650917, + "epoch": 0.6066436194198106, + "flos": 20766806200800.0, + "grad_norm": 2.0312384542333453, + "language_loss": 0.82964045, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.85120398, + "num_input_tokens_seen": 217244585, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11120605, + "step": 10090, + "time_per_iteration": 4.057585954666138 + }, + { + "auxiliary_loss_clip": 0.01125376, + "auxiliary_loss_mlp": 0.01039981, + "balance_loss_clip": 1.04304707, + "balance_loss_mlp": 1.02693987, + "epoch": 0.6067037426724786, + "flos": 22591606194720.0, + "grad_norm": 5.134631547859637, + "language_loss": 0.7604624, + "learning_rate": 1.4145758826341e-06, + "loss": 0.782116, + "num_input_tokens_seen": 217263435, + "router_z_loss_clip": 0.82324219, + "router_z_loss_mlp": 0.13043213, + "step": 10091, + "time_per_iteration": 2.6096749305725098 + }, + { + "auxiliary_loss_clip": 0.01114348, + "auxiliary_loss_mlp": 0.01033975, + "balance_loss_clip": 1.04020178, + "balance_loss_mlp": 1.02213764, + "epoch": 0.6067638659251465, + "flos": 27266755066560.0, + "grad_norm": 1.9624154632539204, + "language_loss": 0.79768038, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.81916368, + "num_input_tokens_seen": 217283725, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11834717, + "step": 10092, + "time_per_iteration": 2.6875855922698975 + }, + { + "auxiliary_loss_clip": 0.01121306, + "auxiliary_loss_mlp": 0.01039287, + "balance_loss_clip": 1.04224825, + "balance_loss_mlp": 1.02657294, + "epoch": 0.6068239891778145, + "flos": 15192827445600.0, + "grad_norm": 1.8781455293621068, + "language_loss": 0.76039815, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.782004, + "num_input_tokens_seen": 217301120, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.1270752, + "step": 10093, + "time_per_iteration": 2.788112163543701 + }, + { + "auxiliary_loss_clip": 0.01115286, + "auxiliary_loss_mlp": 0.01029703, + "balance_loss_clip": 1.04195273, + "balance_loss_mlp": 1.0183723, + "epoch": 0.6068841124304825, + "flos": 28291574639520.0, + "grad_norm": 1.7805094445498024, + "language_loss": 0.87331414, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.89476407, + "num_input_tokens_seen": 217319585, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11322021, + "step": 10094, + "time_per_iteration": 2.6678686141967773 + }, + { + "auxiliary_loss_clip": 0.01117068, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.04137301, + "balance_loss_mlp": 1.01968861, + "epoch": 0.6069442356831505, + "flos": 22680853061760.0, + "grad_norm": 1.8098153310641245, + "language_loss": 0.72326338, + "learning_rate": 1.413086446353919e-06, + "loss": 0.74475265, + "num_input_tokens_seen": 217338880, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12164307, + "step": 10095, + "time_per_iteration": 2.678239345550537 + }, + { + "auxiliary_loss_clip": 0.01115815, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.04011798, + "balance_loss_mlp": 1.01695347, + "epoch": 0.6070043589358184, + "flos": 25575805114560.0, + "grad_norm": 1.9218587532794393, + "language_loss": 0.76615202, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.78759813, + "num_input_tokens_seen": 217357480, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.1184082, + "step": 10096, + "time_per_iteration": 2.686917781829834 + }, + { + "auxiliary_loss_clip": 0.01117766, + "auxiliary_loss_mlp": 0.01038731, + "balance_loss_clip": 1.04059482, + "balance_loss_mlp": 1.02686417, + "epoch": 0.6070644821884864, + "flos": 14266452162240.0, + "grad_norm": 2.0061802837037304, + "language_loss": 0.79347098, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.81503594, + "num_input_tokens_seen": 217374575, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.11883545, + "step": 10097, + "time_per_iteration": 2.658656358718872 + }, + { + "auxiliary_loss_clip": 0.01115776, + "auxiliary_loss_mlp": 0.01027604, + "balance_loss_clip": 1.04128122, + "balance_loss_mlp": 1.01606417, + "epoch": 0.6071246054411543, + "flos": 23563354481280.0, + "grad_norm": 1.3949252090658475, + "language_loss": 0.67121321, + "learning_rate": 1.411969602780478e-06, + "loss": 0.69264698, + "num_input_tokens_seen": 217392950, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11529541, + "step": 10098, + "time_per_iteration": 2.612755298614502 + }, + { + "auxiliary_loss_clip": 0.01115313, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.04021645, + "balance_loss_mlp": 1.01829529, + "epoch": 0.6071847286938223, + "flos": 21657167972640.0, + "grad_norm": 2.0082561134951065, + "language_loss": 0.80766076, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.829108, + "num_input_tokens_seen": 217412145, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11126709, + "step": 10099, + "time_per_iteration": 2.6542394161224365 + }, + { + "auxiliary_loss_clip": 0.01120737, + "auxiliary_loss_mlp": 0.01033713, + "balance_loss_clip": 1.04010367, + "balance_loss_mlp": 1.02069569, + "epoch": 0.6072448519464904, + "flos": 27622527016320.0, + "grad_norm": 1.8949249605047809, + "language_loss": 0.70384598, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.72539049, + "num_input_tokens_seen": 217432080, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13024902, + "step": 10100, + "time_per_iteration": 2.643720865249634 + }, + { + "auxiliary_loss_clip": 0.01119967, + "auxiliary_loss_mlp": 0.01034572, + "balance_loss_clip": 1.04309678, + "balance_loss_mlp": 1.02219796, + "epoch": 0.6073049751991583, + "flos": 23839320090240.0, + "grad_norm": 2.0782944450996674, + "language_loss": 0.7068572, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.72840255, + "num_input_tokens_seen": 217450945, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12371826, + "step": 10101, + "time_per_iteration": 2.7210497856140137 + }, + { + "auxiliary_loss_clip": 0.01114738, + "auxiliary_loss_mlp": 0.01035834, + "balance_loss_clip": 1.03981626, + "balance_loss_mlp": 1.02402091, + "epoch": 0.6073650984518263, + "flos": 34525160491680.0, + "grad_norm": 1.706182896947796, + "language_loss": 0.69433928, + "learning_rate": 1.410480790256154e-06, + "loss": 0.71584505, + "num_input_tokens_seen": 217473105, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.1182251, + "step": 10102, + "time_per_iteration": 2.692767381668091 + }, + { + "auxiliary_loss_clip": 0.01118272, + "auxiliary_loss_mlp": 0.01031653, + "balance_loss_clip": 1.04169762, + "balance_loss_mlp": 1.0200423, + "epoch": 0.6074252217044942, + "flos": 31316325006240.0, + "grad_norm": 2.5433674657733403, + "language_loss": 0.73390585, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.75540507, + "num_input_tokens_seen": 217491780, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.1161499, + "step": 10103, + "time_per_iteration": 2.7218968868255615 + }, + { + "auxiliary_loss_clip": 0.01123348, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.04388404, + "balance_loss_mlp": 1.0194869, + "epoch": 0.6074853449571622, + "flos": 27890429686560.0, + "grad_norm": 2.012715313938425, + "language_loss": 0.76716781, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.78872681, + "num_input_tokens_seen": 217510605, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.13067627, + "step": 10104, + "time_per_iteration": 2.6897337436676025 + }, + { + "auxiliary_loss_clip": 0.01043319, + "auxiliary_loss_mlp": 0.01004371, + "balance_loss_clip": 1.01876473, + "balance_loss_mlp": 1.00316906, + "epoch": 0.6075454682098301, + "flos": 81889836204960.0, + "grad_norm": 0.7118394919060538, + "language_loss": 0.55962723, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.58010411, + "num_input_tokens_seen": 217574815, + "router_z_loss_clip": 0.24560547, + "router_z_loss_mlp": 0.01200867, + "step": 10105, + "time_per_iteration": 3.2923004627227783 + }, + { + "auxiliary_loss_clip": 0.01043318, + "auxiliary_loss_mlp": 0.0100316, + "balance_loss_clip": 1.01879883, + "balance_loss_mlp": 1.0019424, + "epoch": 0.6076055914624982, + "flos": 86284643601600.0, + "grad_norm": 0.7605419274658513, + "language_loss": 0.56848389, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.58894861, + "num_input_tokens_seen": 217632375, + "router_z_loss_clip": 0.24523926, + "router_z_loss_mlp": 0.01216125, + "step": 10106, + "time_per_iteration": 3.2263870239257812 + }, + { + "auxiliary_loss_clip": 0.01115779, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.04054749, + "balance_loss_mlp": 1.02130556, + "epoch": 0.6076657147151661, + "flos": 34611206493600.0, + "grad_norm": 1.6340721462377756, + "language_loss": 0.68871248, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.71020025, + "num_input_tokens_seen": 217653055, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11694336, + "step": 10107, + "time_per_iteration": 2.6992993354797363 + }, + { + "auxiliary_loss_clip": 0.01122004, + "auxiliary_loss_mlp": 0.01028239, + "balance_loss_clip": 1.04260516, + "balance_loss_mlp": 1.01564503, + "epoch": 0.6077258379678341, + "flos": 18363258900000.0, + "grad_norm": 1.8969339748924072, + "language_loss": 0.8109858, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.8324883, + "num_input_tokens_seen": 217671520, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12573242, + "step": 10108, + "time_per_iteration": 2.690844774246216 + }, + { + "auxiliary_loss_clip": 0.01125459, + "auxiliary_loss_mlp": 0.01033268, + "balance_loss_clip": 1.04479194, + "balance_loss_mlp": 1.02001202, + "epoch": 0.607785961220502, + "flos": 44133150551040.0, + "grad_norm": 1.8628782964365929, + "language_loss": 0.71269107, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.73427826, + "num_input_tokens_seen": 217691880, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.13262939, + "step": 10109, + "time_per_iteration": 2.767090082168579 + }, + { + "auxiliary_loss_clip": 0.0111382, + "auxiliary_loss_mlp": 0.01030258, + "balance_loss_clip": 1.04020774, + "balance_loss_mlp": 1.01889122, + "epoch": 0.60784608447317, + "flos": 27484665763680.0, + "grad_norm": 2.541624622790135, + "language_loss": 0.80158275, + "learning_rate": 1.407504239132653e-06, + "loss": 0.82302356, + "num_input_tokens_seen": 217710530, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11376953, + "step": 10110, + "time_per_iteration": 2.7435970306396484 + }, + { + "auxiliary_loss_clip": 0.01119752, + "auxiliary_loss_mlp": 0.01027976, + "balance_loss_clip": 1.04187942, + "balance_loss_mlp": 1.0157454, + "epoch": 0.6079062077258379, + "flos": 29091352474080.0, + "grad_norm": 3.4713370212228543, + "language_loss": 0.70670009, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.72817737, + "num_input_tokens_seen": 217728650, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12231445, + "step": 10111, + "time_per_iteration": 2.688432216644287 + }, + { + "auxiliary_loss_clip": 0.0111961, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.0422132, + "balance_loss_mlp": 1.01796341, + "epoch": 0.6079663309785059, + "flos": 28512767236320.0, + "grad_norm": 3.3780837224782307, + "language_loss": 0.65407813, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.67558324, + "num_input_tokens_seen": 217747135, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.1293335, + "step": 10112, + "time_per_iteration": 2.6987152099609375 + }, + { + "auxiliary_loss_clip": 0.01042822, + "auxiliary_loss_mlp": 0.01002251, + "balance_loss_clip": 1.01838684, + "balance_loss_mlp": 1.0010289, + "epoch": 0.6080264542311739, + "flos": 87101965418400.0, + "grad_norm": 0.6233145668443346, + "language_loss": 0.49565807, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51610881, + "num_input_tokens_seen": 217811860, + "router_z_loss_clip": 0.24438477, + "router_z_loss_mlp": 0.01220703, + "step": 10113, + "time_per_iteration": 3.3493871688842773 + }, + { + "auxiliary_loss_clip": 0.0104206, + "auxiliary_loss_mlp": 0.01002389, + "balance_loss_clip": 1.01765943, + "balance_loss_mlp": 1.0011487, + "epoch": 0.6080865774838419, + "flos": 81179953513920.0, + "grad_norm": 0.8365804502517612, + "language_loss": 0.569785, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.59022951, + "num_input_tokens_seen": 217866510, + "router_z_loss_clip": 0.24389648, + "router_z_loss_mlp": 0.01239777, + "step": 10114, + "time_per_iteration": 3.1597707271575928 + }, + { + "auxiliary_loss_clip": 0.01121417, + "auxiliary_loss_mlp": 0.01026072, + "balance_loss_clip": 1.04206848, + "balance_loss_mlp": 1.01289308, + "epoch": 0.6081467007365099, + "flos": 23437486343520.0, + "grad_norm": 3.647546158829553, + "language_loss": 0.70070648, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.72218138, + "num_input_tokens_seen": 217885650, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.1317749, + "step": 10115, + "time_per_iteration": 2.70353102684021 + }, + { + "auxiliary_loss_clip": 0.01118152, + "auxiliary_loss_mlp": 0.01026745, + "balance_loss_clip": 1.04147267, + "balance_loss_mlp": 1.0146575, + "epoch": 0.6082068239891778, + "flos": 29490025872960.0, + "grad_norm": 1.9821303100146592, + "language_loss": 0.72511315, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.74656212, + "num_input_tokens_seen": 217905300, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12084961, + "step": 10116, + "time_per_iteration": 2.662968158721924 + }, + { + "auxiliary_loss_clip": 0.01119988, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.04105914, + "balance_loss_mlp": 1.02076912, + "epoch": 0.6082669472418458, + "flos": 45653183500320.0, + "grad_norm": 1.7138189083194297, + "language_loss": 0.53637534, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.55791432, + "num_input_tokens_seen": 217927845, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13140869, + "step": 10117, + "time_per_iteration": 4.206465721130371 + }, + { + "auxiliary_loss_clip": 0.01117173, + "auxiliary_loss_mlp": 0.01024998, + "balance_loss_clip": 1.04029727, + "balance_loss_mlp": 1.0130353, + "epoch": 0.6083270704945137, + "flos": 18406970694720.0, + "grad_norm": 1.9941615503783556, + "language_loss": 0.69737661, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.7187984, + "num_input_tokens_seen": 217946145, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.11962891, + "step": 10118, + "time_per_iteration": 4.087399959564209 + }, + { + "auxiliary_loss_clip": 0.01118729, + "auxiliary_loss_mlp": 0.01028453, + "balance_loss_clip": 1.04228282, + "balance_loss_mlp": 1.01672244, + "epoch": 0.6083871937471818, + "flos": 25219627992000.0, + "grad_norm": 1.8026210925740074, + "language_loss": 0.74563247, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.76710427, + "num_input_tokens_seen": 217965190, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11730957, + "step": 10119, + "time_per_iteration": 2.667285680770874 + }, + { + "auxiliary_loss_clip": 0.01114926, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.03940463, + "balance_loss_mlp": 1.01874685, + "epoch": 0.6084473169998497, + "flos": 26242826873760.0, + "grad_norm": 2.4043688628145747, + "language_loss": 0.67598861, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.69744253, + "num_input_tokens_seen": 217983625, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.1171875, + "step": 10120, + "time_per_iteration": 2.637146472930908 + }, + { + "auxiliary_loss_clip": 0.01120328, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.04210281, + "balance_loss_mlp": 1.01762652, + "epoch": 0.6085074402525177, + "flos": 32787378914400.0, + "grad_norm": 1.6317972963067782, + "language_loss": 0.74123758, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.76274073, + "num_input_tokens_seen": 218006005, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12359619, + "step": 10121, + "time_per_iteration": 2.744065761566162 + }, + { + "auxiliary_loss_clip": 0.01114602, + "auxiliary_loss_mlp": 0.01027849, + "balance_loss_clip": 1.03879249, + "balance_loss_mlp": 1.01657796, + "epoch": 0.6085675635051856, + "flos": 13287046109760.0, + "grad_norm": 3.2305499937117905, + "language_loss": 0.80286813, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.82429266, + "num_input_tokens_seen": 218024195, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11273193, + "step": 10122, + "time_per_iteration": 2.593848943710327 + }, + { + "auxiliary_loss_clip": 0.01118356, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.04171073, + "balance_loss_mlp": 1.01804733, + "epoch": 0.6086276867578536, + "flos": 42539834543040.0, + "grad_norm": 2.400562200670685, + "language_loss": 0.55646223, + "learning_rate": 1.402670413578284e-06, + "loss": 0.57795191, + "num_input_tokens_seen": 218047190, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12548828, + "step": 10123, + "time_per_iteration": 2.8280179500579834 + }, + { + "auxiliary_loss_clip": 0.01115107, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.04081082, + "balance_loss_mlp": 1.02050161, + "epoch": 0.6086878100105215, + "flos": 24462184364640.0, + "grad_norm": 2.091860331143924, + "language_loss": 0.73808336, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.75955939, + "num_input_tokens_seen": 218065945, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12005615, + "step": 10124, + "time_per_iteration": 2.6737124919891357 + }, + { + "auxiliary_loss_clip": 0.01118, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.04098368, + "balance_loss_mlp": 1.01852012, + "epoch": 0.6087479332631895, + "flos": 22369157562240.0, + "grad_norm": 3.4460991307641686, + "language_loss": 0.65948081, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.68097121, + "num_input_tokens_seen": 218085285, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12518311, + "step": 10125, + "time_per_iteration": 4.15421462059021 + }, + { + "auxiliary_loss_clip": 0.01117067, + "auxiliary_loss_mlp": 0.01026496, + "balance_loss_clip": 1.04192638, + "balance_loss_mlp": 1.01505828, + "epoch": 0.6088080565158575, + "flos": 29887686339840.0, + "grad_norm": 2.6083807355014414, + "language_loss": 0.76024485, + "learning_rate": 1.40155545786479e-06, + "loss": 0.78168046, + "num_input_tokens_seen": 218104735, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11437988, + "step": 10126, + "time_per_iteration": 2.7708210945129395 + }, + { + "auxiliary_loss_clip": 0.01119452, + "auxiliary_loss_mlp": 0.01026217, + "balance_loss_clip": 1.04005098, + "balance_loss_mlp": 1.01338995, + "epoch": 0.6088681797685255, + "flos": 12529521447840.0, + "grad_norm": 11.130671190631636, + "language_loss": 0.71069694, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.73215365, + "num_input_tokens_seen": 218121855, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12817383, + "step": 10127, + "time_per_iteration": 2.5982401371002197 + }, + { + "auxiliary_loss_clip": 0.01121503, + "auxiliary_loss_mlp": 0.01027905, + "balance_loss_clip": 1.04238629, + "balance_loss_mlp": 1.01510835, + "epoch": 0.6089283030211935, + "flos": 26813146586400.0, + "grad_norm": 2.4327322879913402, + "language_loss": 0.7258926, + "learning_rate": 1.400812267497691e-06, + "loss": 0.74738669, + "num_input_tokens_seen": 218137325, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12811279, + "step": 10128, + "time_per_iteration": 2.7125134468078613 + }, + { + "auxiliary_loss_clip": 0.01115906, + "auxiliary_loss_mlp": 0.01030582, + "balance_loss_clip": 1.04032278, + "balance_loss_mlp": 1.01891112, + "epoch": 0.6089884262738614, + "flos": 21701325457440.0, + "grad_norm": 2.278605583686706, + "language_loss": 0.72947395, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.75093883, + "num_input_tokens_seen": 218155530, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11682129, + "step": 10129, + "time_per_iteration": 2.613332748413086 + }, + { + "auxiliary_loss_clip": 0.01114007, + "auxiliary_loss_mlp": 0.0103016, + "balance_loss_clip": 1.03810048, + "balance_loss_mlp": 1.01798916, + "epoch": 0.6090485495265294, + "flos": 45038787337440.0, + "grad_norm": 1.5029017441033206, + "language_loss": 0.65887821, + "learning_rate": 1.400069168015626e-06, + "loss": 0.68031985, + "num_input_tokens_seen": 218182535, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12158203, + "step": 10130, + "time_per_iteration": 4.2005774974823 + }, + { + "auxiliary_loss_clip": 0.01111832, + "auxiliary_loss_mlp": 0.01025582, + "balance_loss_clip": 1.03753972, + "balance_loss_mlp": 1.01476383, + "epoch": 0.6091086727791973, + "flos": 24281016490080.0, + "grad_norm": 1.899854780776024, + "language_loss": 0.77097607, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.79235023, + "num_input_tokens_seen": 218201740, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.10809326, + "step": 10131, + "time_per_iteration": 2.646975517272949 + }, + { + "auxiliary_loss_clip": 0.01115764, + "auxiliary_loss_mlp": 0.01028188, + "balance_loss_clip": 1.04050922, + "balance_loss_mlp": 1.01743507, + "epoch": 0.6091687960318654, + "flos": 27044225399520.0, + "grad_norm": 2.418879037481608, + "language_loss": 0.77175963, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.79319912, + "num_input_tokens_seen": 218219800, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.10760498, + "step": 10132, + "time_per_iteration": 2.6488006114959717 + }, + { + "auxiliary_loss_clip": 0.011118, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.0391531, + "balance_loss_mlp": 1.01917934, + "epoch": 0.6092289192845333, + "flos": 26192510762400.0, + "grad_norm": 1.7717152448844768, + "language_loss": 0.75846159, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.77988303, + "num_input_tokens_seen": 218237585, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.1116333, + "step": 10133, + "time_per_iteration": 2.678851366043091 + }, + { + "auxiliary_loss_clip": 0.0111539, + "auxiliary_loss_mlp": 0.0102782, + "balance_loss_clip": 1.0398314, + "balance_loss_mlp": 1.0156548, + "epoch": 0.6092890425372013, + "flos": 35014174724160.0, + "grad_norm": 2.4083790500686355, + "language_loss": 0.63920569, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.66063786, + "num_input_tokens_seen": 218258700, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12176514, + "step": 10134, + "time_per_iteration": 2.7157273292541504 + }, + { + "auxiliary_loss_clip": 0.01113412, + "auxiliary_loss_mlp": 0.01027863, + "balance_loss_clip": 1.03951621, + "balance_loss_mlp": 1.0164367, + "epoch": 0.6093491657898692, + "flos": 25396338965760.0, + "grad_norm": 1.874064929798991, + "language_loss": 0.78553635, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.80694914, + "num_input_tokens_seen": 218275655, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11425781, + "step": 10135, + "time_per_iteration": 2.653467893600464 + }, + { + "auxiliary_loss_clip": 0.01116181, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.03932738, + "balance_loss_mlp": 1.01796067, + "epoch": 0.6094092890425372, + "flos": 31050853372800.0, + "grad_norm": 1.8651035493821486, + "language_loss": 0.72001183, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.74146754, + "num_input_tokens_seen": 218295720, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11437988, + "step": 10136, + "time_per_iteration": 2.6895487308502197 + }, + { + "auxiliary_loss_clip": 0.01117015, + "auxiliary_loss_mlp": 0.01028959, + "balance_loss_clip": 1.04103625, + "balance_loss_mlp": 1.01701474, + "epoch": 0.6094694122952051, + "flos": 43466777068320.0, + "grad_norm": 1.8147754616303902, + "language_loss": 0.74718195, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.76864171, + "num_input_tokens_seen": 218316745, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.1194458, + "step": 10137, + "time_per_iteration": 2.8442745208740234 + }, + { + "auxiliary_loss_clip": 0.01118283, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.04085374, + "balance_loss_mlp": 1.02263761, + "epoch": 0.6095295355478731, + "flos": 29843204716800.0, + "grad_norm": 2.3605809074729356, + "language_loss": 0.80143559, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82296789, + "num_input_tokens_seen": 218335385, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12310791, + "step": 10138, + "time_per_iteration": 2.7533836364746094 + }, + { + "auxiliary_loss_clip": 0.01109968, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.03783345, + "balance_loss_mlp": 1.01966941, + "epoch": 0.6095896588005411, + "flos": 19075248489600.0, + "grad_norm": 1.7518655351849173, + "language_loss": 0.80989355, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.83129811, + "num_input_tokens_seen": 218353320, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10803223, + "step": 10139, + "time_per_iteration": 2.766291618347168 + }, + { + "auxiliary_loss_clip": 0.01120892, + "auxiliary_loss_mlp": 0.01030955, + "balance_loss_clip": 1.04319739, + "balance_loss_mlp": 1.01886106, + "epoch": 0.6096497820532091, + "flos": 18975305060640.0, + "grad_norm": 2.2000368029129125, + "language_loss": 0.83492172, + "learning_rate": 1.396355037825315e-06, + "loss": 0.85644019, + "num_input_tokens_seen": 218365620, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12103271, + "step": 10140, + "time_per_iteration": 2.6438052654266357 + }, + { + "auxiliary_loss_clip": 0.01117181, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.04003525, + "balance_loss_mlp": 1.01773691, + "epoch": 0.6097099053058771, + "flos": 29534831634240.0, + "grad_norm": 6.531016663108932, + "language_loss": 0.75728309, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.7787509, + "num_input_tokens_seen": 218383785, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11865234, + "step": 10141, + "time_per_iteration": 2.692800521850586 + }, + { + "auxiliary_loss_clip": 0.01113444, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.03876042, + "balance_loss_mlp": 1.01726747, + "epoch": 0.609770028558545, + "flos": 23879831019840.0, + "grad_norm": 2.014957577257555, + "language_loss": 0.76371396, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.78513825, + "num_input_tokens_seen": 218399055, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11706543, + "step": 10142, + "time_per_iteration": 2.684264659881592 + }, + { + "auxiliary_loss_clip": 0.01114825, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.03965998, + "balance_loss_mlp": 1.01548088, + "epoch": 0.609830151811213, + "flos": 29223298203840.0, + "grad_norm": 1.8047554319636863, + "language_loss": 0.7683295, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.78975427, + "num_input_tokens_seen": 218419120, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12176514, + "step": 10143, + "time_per_iteration": 2.620054244995117 + }, + { + "auxiliary_loss_clip": 0.01116249, + "auxiliary_loss_mlp": 0.01031463, + "balance_loss_clip": 1.04077637, + "balance_loss_mlp": 1.01936948, + "epoch": 0.6098902750638809, + "flos": 19742229731520.0, + "grad_norm": 1.9702525291861377, + "language_loss": 0.75257063, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.77404773, + "num_input_tokens_seen": 218435290, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12097168, + "step": 10144, + "time_per_iteration": 2.598227024078369 + }, + { + "auxiliary_loss_clip": 0.01116912, + "auxiliary_loss_mlp": 0.01027479, + "balance_loss_clip": 1.03964067, + "balance_loss_mlp": 1.01560545, + "epoch": 0.609950398316549, + "flos": 54333258619680.0, + "grad_norm": 1.9853164079416832, + "language_loss": 0.73308647, + "learning_rate": 1.394498830235383e-06, + "loss": 0.75453037, + "num_input_tokens_seen": 218457880, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.11877441, + "step": 10145, + "time_per_iteration": 2.817976713180542 + }, + { + "auxiliary_loss_clip": 0.01115396, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.04008996, + "balance_loss_mlp": 1.01862991, + "epoch": 0.6100105215692169, + "flos": 28335407986080.0, + "grad_norm": 1.6817000258516892, + "language_loss": 0.69069898, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.71214968, + "num_input_tokens_seen": 218475930, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11035156, + "step": 10146, + "time_per_iteration": 2.6474037170410156 + }, + { + "auxiliary_loss_clip": 0.01114829, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.04178548, + "balance_loss_mlp": 1.01830149, + "epoch": 0.6100706448218849, + "flos": 18317926414080.0, + "grad_norm": 2.1228837643860197, + "language_loss": 0.77138615, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.79282331, + "num_input_tokens_seen": 218493675, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.10595703, + "step": 10147, + "time_per_iteration": 2.5964176654815674 + }, + { + "auxiliary_loss_clip": 0.0111418, + "auxiliary_loss_mlp": 0.01026408, + "balance_loss_clip": 1.03817189, + "balance_loss_mlp": 1.01484513, + "epoch": 0.6101307680745528, + "flos": 23965228745280.0, + "grad_norm": 1.95678275054574, + "language_loss": 0.78599238, + "learning_rate": 1.393385381096786e-06, + "loss": 0.80739826, + "num_input_tokens_seen": 218511780, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11566162, + "step": 10148, + "time_per_iteration": 2.622225046157837 + }, + { + "auxiliary_loss_clip": 0.0112168, + "auxiliary_loss_mlp": 0.01034092, + "balance_loss_clip": 1.0419879, + "balance_loss_mlp": 1.02074635, + "epoch": 0.6101908913272208, + "flos": 36528089564160.0, + "grad_norm": 4.58026308849388, + "language_loss": 0.5377183, + "learning_rate": 1.39301427737093e-06, + "loss": 0.55927598, + "num_input_tokens_seen": 218531850, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13360596, + "step": 10149, + "time_per_iteration": 2.682616710662842 + }, + { + "auxiliary_loss_clip": 0.01113981, + "auxiliary_loss_mlp": 0.01031346, + "balance_loss_clip": 1.04166937, + "balance_loss_mlp": 1.02031875, + "epoch": 0.6102510145798887, + "flos": 26598841927200.0, + "grad_norm": 2.0102345433723685, + "language_loss": 0.79842985, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.81988311, + "num_input_tokens_seen": 218551245, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11029053, + "step": 10150, + "time_per_iteration": 2.660015106201172 + }, + { + "auxiliary_loss_clip": 0.01119203, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.04210675, + "balance_loss_mlp": 1.02443576, + "epoch": 0.6103111378325567, + "flos": 25265487202560.0, + "grad_norm": 1.5935960516045826, + "language_loss": 0.69205964, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.71361649, + "num_input_tokens_seen": 218571365, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12042236, + "step": 10151, + "time_per_iteration": 2.669172525405884 + }, + { + "auxiliary_loss_clip": 0.01114189, + "auxiliary_loss_mlp": 0.01028632, + "balance_loss_clip": 1.03932285, + "balance_loss_mlp": 1.01760507, + "epoch": 0.6103712610852247, + "flos": 35852761762560.0, + "grad_norm": 1.8962650737283098, + "language_loss": 0.70842659, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.72985482, + "num_input_tokens_seen": 218588315, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11029053, + "step": 10152, + "time_per_iteration": 2.6974990367889404 + }, + { + "auxiliary_loss_clip": 0.01118661, + "auxiliary_loss_mlp": 0.01030675, + "balance_loss_clip": 1.04168856, + "balance_loss_mlp": 1.0191052, + "epoch": 0.6104313843378927, + "flos": 25396176896640.0, + "grad_norm": 2.3227980063977802, + "language_loss": 0.78348458, + "learning_rate": 1.391530092777811e-06, + "loss": 0.80497789, + "num_input_tokens_seen": 218605940, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.11572266, + "step": 10153, + "time_per_iteration": 2.604703426361084 + }, + { + "auxiliary_loss_clip": 0.01116985, + "auxiliary_loss_mlp": 0.01027841, + "balance_loss_clip": 1.04070699, + "balance_loss_mlp": 1.01599145, + "epoch": 0.6104915075905607, + "flos": 32431525930080.0, + "grad_norm": 1.622833857581848, + "language_loss": 0.79214013, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.81358838, + "num_input_tokens_seen": 218626100, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11853027, + "step": 10154, + "time_per_iteration": 2.663384199142456 + }, + { + "auxiliary_loss_clip": 0.01115524, + "auxiliary_loss_mlp": 0.01028113, + "balance_loss_clip": 1.04098713, + "balance_loss_mlp": 1.01690114, + "epoch": 0.6105516308432286, + "flos": 29182260549600.0, + "grad_norm": 3.2233929143806788, + "language_loss": 0.70142066, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.722857, + "num_input_tokens_seen": 218645060, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11212158, + "step": 10155, + "time_per_iteration": 2.640012741088867 + }, + { + "auxiliary_loss_clip": 0.01118514, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.04307222, + "balance_loss_mlp": 1.01706243, + "epoch": 0.6106117540958966, + "flos": 38531545361280.0, + "grad_norm": 1.5875947839242202, + "language_loss": 0.71351826, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.73499417, + "num_input_tokens_seen": 218667690, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12005615, + "step": 10156, + "time_per_iteration": 4.105096340179443 + }, + { + "auxiliary_loss_clip": 0.01113232, + "auxiliary_loss_mlp": 0.01028688, + "balance_loss_clip": 1.04074192, + "balance_loss_mlp": 1.01707745, + "epoch": 0.6106718773485645, + "flos": 23925973851360.0, + "grad_norm": 1.5967845840683705, + "language_loss": 0.67528009, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.69669932, + "num_input_tokens_seen": 218687505, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11608887, + "step": 10157, + "time_per_iteration": 3.991609811782837 + }, + { + "auxiliary_loss_clip": 0.01115027, + "auxiliary_loss_mlp": 0.01028075, + "balance_loss_clip": 1.03847742, + "balance_loss_mlp": 1.01616049, + "epoch": 0.6107320006012326, + "flos": 20894781237120.0, + "grad_norm": 4.418486120629499, + "language_loss": 0.72264373, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.74407482, + "num_input_tokens_seen": 218705315, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.11907959, + "step": 10158, + "time_per_iteration": 2.6652991771698 + }, + { + "auxiliary_loss_clip": 0.01118414, + "auxiliary_loss_mlp": 0.01029601, + "balance_loss_clip": 1.04123592, + "balance_loss_mlp": 1.01835978, + "epoch": 0.6107921238539005, + "flos": 36788172399360.0, + "grad_norm": 1.714387116072066, + "language_loss": 0.69164193, + "learning_rate": 1.389304508366635e-06, + "loss": 0.71312207, + "num_input_tokens_seen": 218725735, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11254883, + "step": 10159, + "time_per_iteration": 2.7001163959503174 + }, + { + "auxiliary_loss_clip": 0.01117703, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.04072809, + "balance_loss_mlp": 1.01996088, + "epoch": 0.6108522471065685, + "flos": 22500860188320.0, + "grad_norm": 1.9062765291006136, + "language_loss": 0.787853, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.80935389, + "num_input_tokens_seen": 218743215, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12420654, + "step": 10160, + "time_per_iteration": 2.6501731872558594 + }, + { + "auxiliary_loss_clip": 0.01047497, + "auxiliary_loss_mlp": 0.01005275, + "balance_loss_clip": 1.0230124, + "balance_loss_mlp": 1.00404143, + "epoch": 0.6109123703592364, + "flos": 78256888819200.0, + "grad_norm": 0.8202520771324826, + "language_loss": 0.61498672, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63551438, + "num_input_tokens_seen": 218806440, + "router_z_loss_clip": 0.24475098, + "router_z_loss_mlp": 0.01232147, + "step": 10161, + "time_per_iteration": 3.452455997467041 + }, + { + "auxiliary_loss_clip": 0.01120164, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.04265654, + "balance_loss_mlp": 1.02043939, + "epoch": 0.6109724936119044, + "flos": 25218939198240.0, + "grad_norm": 2.1778460051595, + "language_loss": 0.75973535, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.78126723, + "num_input_tokens_seen": 218825720, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12591553, + "step": 10162, + "time_per_iteration": 2.765385866165161 + }, + { + "auxiliary_loss_clip": 0.01117019, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.04125202, + "balance_loss_mlp": 1.01835084, + "epoch": 0.6110326168645723, + "flos": 38256997857120.0, + "grad_norm": 1.7565788874715202, + "language_loss": 0.7161116, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.73758519, + "num_input_tokens_seen": 218847735, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11993408, + "step": 10163, + "time_per_iteration": 2.6928980350494385 + }, + { + "auxiliary_loss_clip": 0.0111444, + "auxiliary_loss_mlp": 0.01027281, + "balance_loss_clip": 1.04019213, + "balance_loss_mlp": 1.01639152, + "epoch": 0.6110927401172404, + "flos": 30510064406880.0, + "grad_norm": 1.726714429410667, + "language_loss": 0.59258634, + "learning_rate": 1.387450491396625e-06, + "loss": 0.61400354, + "num_input_tokens_seen": 218866585, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.10894775, + "step": 10164, + "time_per_iteration": 4.046699285507202 + }, + { + "auxiliary_loss_clip": 0.0111467, + "auxiliary_loss_mlp": 0.01030062, + "balance_loss_clip": 1.03918076, + "balance_loss_mlp": 1.01861811, + "epoch": 0.6111528633699083, + "flos": 32028557699520.0, + "grad_norm": 1.7639372891677443, + "language_loss": 0.75822502, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.77967238, + "num_input_tokens_seen": 218885560, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11444092, + "step": 10165, + "time_per_iteration": 2.710186243057251 + }, + { + "auxiliary_loss_clip": 0.01116888, + "auxiliary_loss_mlp": 0.01022783, + "balance_loss_clip": 1.04355919, + "balance_loss_mlp": 1.01100516, + "epoch": 0.6112129866225763, + "flos": 27311277206880.0, + "grad_norm": 2.166539483940468, + "language_loss": 0.79236066, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.81375736, + "num_input_tokens_seen": 218905055, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11779785, + "step": 10166, + "time_per_iteration": 2.6392548084259033 + }, + { + "auxiliary_loss_clip": 0.01117374, + "auxiliary_loss_mlp": 0.01028815, + "balance_loss_clip": 1.04164124, + "balance_loss_mlp": 1.01666212, + "epoch": 0.6112731098752443, + "flos": 30783517944480.0, + "grad_norm": 2.504294897086734, + "language_loss": 0.67776662, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.69922853, + "num_input_tokens_seen": 218924030, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.121521, + "step": 10167, + "time_per_iteration": 2.6868057250976562 + }, + { + "auxiliary_loss_clip": 0.01115365, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.04200244, + "balance_loss_mlp": 1.02052903, + "epoch": 0.6113332331279122, + "flos": 27934465619520.0, + "grad_norm": 1.6015547714626546, + "language_loss": 0.78847218, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.80993962, + "num_input_tokens_seen": 218943750, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.10845947, + "step": 10168, + "time_per_iteration": 2.6273648738861084 + }, + { + "auxiliary_loss_clip": 0.01121595, + "auxiliary_loss_mlp": 0.01033172, + "balance_loss_clip": 1.04085803, + "balance_loss_mlp": 1.02029145, + "epoch": 0.6113933563805802, + "flos": 22720796749440.0, + "grad_norm": 2.6387742485951593, + "language_loss": 0.85750628, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.87905395, + "num_input_tokens_seen": 218957585, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.12878418, + "step": 10169, + "time_per_iteration": 3.920717477798462 + }, + { + "auxiliary_loss_clip": 0.01111941, + "auxiliary_loss_mlp": 0.01028032, + "balance_loss_clip": 1.03851652, + "balance_loss_mlp": 1.01747632, + "epoch": 0.6114534796332481, + "flos": 51086869966080.0, + "grad_norm": 3.392850317807021, + "language_loss": 0.79017746, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.8115772, + "num_input_tokens_seen": 218980025, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.10552979, + "step": 10170, + "time_per_iteration": 2.844118356704712 + }, + { + "auxiliary_loss_clip": 0.01120833, + "auxiliary_loss_mlp": 0.01036974, + "balance_loss_clip": 1.04030323, + "balance_loss_mlp": 1.0239687, + "epoch": 0.6115136028859162, + "flos": 26732813520960.0, + "grad_norm": 6.512917140016252, + "language_loss": 0.69270289, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.71428096, + "num_input_tokens_seen": 218998200, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.13018799, + "step": 10171, + "time_per_iteration": 2.7083518505096436 + }, + { + "auxiliary_loss_clip": 0.01119764, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.04107308, + "balance_loss_mlp": 1.02328229, + "epoch": 0.6115737261385841, + "flos": 35148267869760.0, + "grad_norm": 1.6997002389639257, + "language_loss": 0.79356313, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.8151238, + "num_input_tokens_seen": 219017910, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.13031006, + "step": 10172, + "time_per_iteration": 2.744274616241455 + }, + { + "auxiliary_loss_clip": 0.01122973, + "auxiliary_loss_mlp": 0.01034622, + "balance_loss_clip": 1.04172397, + "balance_loss_mlp": 1.02236128, + "epoch": 0.6116338493912521, + "flos": 25931131374240.0, + "grad_norm": 2.632817726204247, + "language_loss": 0.67273855, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.69431448, + "num_input_tokens_seen": 219037730, + "router_z_loss_clip": 0.8125, + "router_z_loss_mlp": 0.12255859, + "step": 10173, + "time_per_iteration": 2.639402151107788 + }, + { + "auxiliary_loss_clip": 0.01120132, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.0429163, + "balance_loss_mlp": 1.02378142, + "epoch": 0.61169397264392, + "flos": 21390116165280.0, + "grad_norm": 1.8605555702452383, + "language_loss": 0.56229913, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.58385873, + "num_input_tokens_seen": 219056755, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12054443, + "step": 10174, + "time_per_iteration": 2.6743805408477783 + }, + { + "auxiliary_loss_clip": 0.01124038, + "auxiliary_loss_mlp": 0.01033922, + "balance_loss_clip": 1.04570913, + "balance_loss_mlp": 1.02147615, + "epoch": 0.611754095896588, + "flos": 29225972344320.0, + "grad_norm": 2.0573636339407537, + "language_loss": 0.66109049, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.68267006, + "num_input_tokens_seen": 219076985, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12463379, + "step": 10175, + "time_per_iteration": 2.6546366214752197 + }, + { + "auxiliary_loss_clip": 0.01112605, + "auxiliary_loss_mlp": 0.01022177, + "balance_loss_clip": 1.03764606, + "balance_loss_mlp": 1.01147223, + "epoch": 0.6118142191492559, + "flos": 31719374271360.0, + "grad_norm": 2.070314167607778, + "language_loss": 0.83005893, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.85140669, + "num_input_tokens_seen": 219096050, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.10705566, + "step": 10176, + "time_per_iteration": 2.7303545475006104 + }, + { + "auxiliary_loss_clip": 0.0111892, + "auxiliary_loss_mlp": 0.01035259, + "balance_loss_clip": 1.04229212, + "balance_loss_mlp": 1.0221225, + "epoch": 0.611874342401924, + "flos": 30021617416320.0, + "grad_norm": 2.3070388065141585, + "language_loss": 0.77075458, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.79229641, + "num_input_tokens_seen": 219112665, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13140869, + "step": 10177, + "time_per_iteration": 2.678356885910034 + }, + { + "auxiliary_loss_clip": 0.01117154, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.0406847, + "balance_loss_mlp": 1.02021062, + "epoch": 0.6119344656545919, + "flos": 19385242263360.0, + "grad_norm": 2.9593205148167785, + "language_loss": 0.75763297, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.77912664, + "num_input_tokens_seen": 219129120, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11999512, + "step": 10178, + "time_per_iteration": 2.611431121826172 + }, + { + "auxiliary_loss_clip": 0.01118672, + "auxiliary_loss_mlp": 0.01033728, + "balance_loss_clip": 1.04212713, + "balance_loss_mlp": 1.02178335, + "epoch": 0.6119945889072599, + "flos": 26420955952320.0, + "grad_norm": 1.5809828795927678, + "language_loss": 0.67244762, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.69397163, + "num_input_tokens_seen": 219148950, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.11950684, + "step": 10179, + "time_per_iteration": 2.6991231441497803 + }, + { + "auxiliary_loss_clip": 0.01118719, + "auxiliary_loss_mlp": 0.0103381, + "balance_loss_clip": 1.04253149, + "balance_loss_mlp": 1.02240801, + "epoch": 0.6120547121599279, + "flos": 16804700367840.0, + "grad_norm": 1.9768933487537328, + "language_loss": 0.83728004, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.8588053, + "num_input_tokens_seen": 219165585, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11413574, + "step": 10180, + "time_per_iteration": 2.7131056785583496 + }, + { + "auxiliary_loss_clip": 0.01118165, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.04238963, + "balance_loss_mlp": 1.01661706, + "epoch": 0.6121148354125958, + "flos": 24501034085760.0, + "grad_norm": 1.6188927347791422, + "language_loss": 0.77595437, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.79742813, + "num_input_tokens_seen": 219183280, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.1260376, + "step": 10181, + "time_per_iteration": 2.7056331634521484 + }, + { + "auxiliary_loss_clip": 0.01119115, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.0420506, + "balance_loss_mlp": 1.01881874, + "epoch": 0.6121749586652638, + "flos": 16437988752480.0, + "grad_norm": 2.093561111116138, + "language_loss": 0.80775547, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.82925558, + "num_input_tokens_seen": 219197200, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12097168, + "step": 10182, + "time_per_iteration": 2.6222445964813232 + }, + { + "auxiliary_loss_clip": 0.01111316, + "auxiliary_loss_mlp": 0.01026363, + "balance_loss_clip": 1.03877497, + "balance_loss_mlp": 1.01651037, + "epoch": 0.6122350819179317, + "flos": 24551066576160.0, + "grad_norm": 2.042165857245921, + "language_loss": 0.82784772, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.84922445, + "num_input_tokens_seen": 219216825, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.09851074, + "step": 10183, + "time_per_iteration": 2.786637306213379 + }, + { + "auxiliary_loss_clip": 0.0104656, + "auxiliary_loss_mlp": 0.01003811, + "balance_loss_clip": 1.02235091, + "balance_loss_mlp": 1.00257432, + "epoch": 0.6122952051705998, + "flos": 79837482401280.0, + "grad_norm": 0.7033028342134909, + "language_loss": 0.62899619, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.64949989, + "num_input_tokens_seen": 219283795, + "router_z_loss_clip": 0.24182129, + "router_z_loss_mlp": 0.01235962, + "step": 10184, + "time_per_iteration": 3.4080371856689453 + }, + { + "auxiliary_loss_clip": 0.01120388, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.04418159, + "balance_loss_mlp": 1.02312922, + "epoch": 0.6123553284232677, + "flos": 24863086213920.0, + "grad_norm": 1.8390546633015241, + "language_loss": 0.82297719, + "learning_rate": 1.379669981812101e-06, + "loss": 0.84452546, + "num_input_tokens_seen": 219302385, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11309814, + "step": 10185, + "time_per_iteration": 2.7659387588500977 + }, + { + "auxiliary_loss_clip": 0.01123448, + "auxiliary_loss_mlp": 0.010348, + "balance_loss_clip": 1.04351294, + "balance_loss_mlp": 1.02296853, + "epoch": 0.6124154516759357, + "flos": 29270048794560.0, + "grad_norm": 2.6220135353165412, + "language_loss": 0.74582916, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.76741165, + "num_input_tokens_seen": 219319765, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.11828613, + "step": 10186, + "time_per_iteration": 2.6659913063049316 + }, + { + "auxiliary_loss_clip": 0.0111461, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.03970015, + "balance_loss_mlp": 1.0202682, + "epoch": 0.6124755749286036, + "flos": 26196481455840.0, + "grad_norm": 1.7653719082236732, + "language_loss": 0.78449303, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.80595064, + "num_input_tokens_seen": 219337440, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.10882568, + "step": 10187, + "time_per_iteration": 2.6757359504699707 + }, + { + "auxiliary_loss_clip": 0.01115878, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.03981042, + "balance_loss_mlp": 1.02100718, + "epoch": 0.6125356981812716, + "flos": 29137616857440.0, + "grad_norm": 2.256658506461249, + "language_loss": 0.83268815, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.85417187, + "num_input_tokens_seen": 219357525, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.1149292, + "step": 10188, + "time_per_iteration": 2.730022668838501 + }, + { + "auxiliary_loss_clip": 0.01117658, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.04080176, + "balance_loss_mlp": 1.02131212, + "epoch": 0.6125958214339395, + "flos": 17601966131040.0, + "grad_norm": 1.7778727422791019, + "language_loss": 0.75661719, + "learning_rate": 1.378189152155896e-06, + "loss": 0.77812481, + "num_input_tokens_seen": 219374855, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11785889, + "step": 10189, + "time_per_iteration": 2.5923078060150146 + }, + { + "auxiliary_loss_clip": 0.01117716, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.04039192, + "balance_loss_mlp": 1.02276683, + "epoch": 0.6126559446866076, + "flos": 28380902541120.0, + "grad_norm": 1.8474747362357415, + "language_loss": 0.74191803, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.7634443, + "num_input_tokens_seen": 219394740, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12139893, + "step": 10190, + "time_per_iteration": 2.679710865020752 + }, + { + "auxiliary_loss_clip": 0.01118464, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.04133785, + "balance_loss_mlp": 1.02142096, + "epoch": 0.6127160679392755, + "flos": 32781463391520.0, + "grad_norm": 3.034138683556873, + "language_loss": 0.68536341, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.70688599, + "num_input_tokens_seen": 219413755, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12371826, + "step": 10191, + "time_per_iteration": 2.6528284549713135 + }, + { + "auxiliary_loss_clip": 0.01118597, + "auxiliary_loss_mlp": 0.0103266, + "balance_loss_clip": 1.03996444, + "balance_loss_mlp": 1.02057195, + "epoch": 0.6127761911919435, + "flos": 32209360918560.0, + "grad_norm": 3.0371449072831473, + "language_loss": 0.73680484, + "learning_rate": 1.377078777445467e-06, + "loss": 0.75831735, + "num_input_tokens_seen": 219433560, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12103271, + "step": 10192, + "time_per_iteration": 2.707484483718872 + }, + { + "auxiliary_loss_clip": 0.01116001, + "auxiliary_loss_mlp": 0.01026844, + "balance_loss_clip": 1.0411005, + "balance_loss_mlp": 1.01552474, + "epoch": 0.6128363144446115, + "flos": 27619407185760.0, + "grad_norm": 2.0195127103529003, + "language_loss": 0.83518034, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.85660875, + "num_input_tokens_seen": 219452640, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11309814, + "step": 10193, + "time_per_iteration": 2.6502110958099365 + }, + { + "auxiliary_loss_clip": 0.01117021, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.0399971, + "balance_loss_mlp": 1.01844835, + "epoch": 0.6128964376972794, + "flos": 32652070250400.0, + "grad_norm": 2.3112241814785377, + "language_loss": 0.69759619, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.71907014, + "num_input_tokens_seen": 219468585, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.11914062, + "step": 10194, + "time_per_iteration": 2.7506232261657715 + }, + { + "auxiliary_loss_clip": 0.01044366, + "auxiliary_loss_mlp": 0.00999999, + "balance_loss_clip": 1.02037573, + "balance_loss_mlp": 0.99875283, + "epoch": 0.6129565609499474, + "flos": 80007386472000.0, + "grad_norm": 0.8237456094873066, + "language_loss": 0.58671308, + "learning_rate": 1.375968615326149e-06, + "loss": 0.60715675, + "num_input_tokens_seen": 219523015, + "router_z_loss_clip": 0.2401123, + "router_z_loss_mlp": 0.0124588, + "step": 10195, + "time_per_iteration": 4.538278579711914 + }, + { + "auxiliary_loss_clip": 0.0111934, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.04276884, + "balance_loss_mlp": 1.02513814, + "epoch": 0.6130166842026153, + "flos": 19964759398560.0, + "grad_norm": 1.930498269516621, + "language_loss": 0.69916558, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.72073132, + "num_input_tokens_seen": 219539980, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12091064, + "step": 10196, + "time_per_iteration": 2.6479978561401367 + }, + { + "auxiliary_loss_clip": 0.01117491, + "auxiliary_loss_mlp": 0.01036158, + "balance_loss_clip": 1.04147887, + "balance_loss_mlp": 1.02459455, + "epoch": 0.6130768074552834, + "flos": 28861651248480.0, + "grad_norm": 1.9807609675567364, + "language_loss": 0.71669304, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.73822957, + "num_input_tokens_seen": 219556980, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11572266, + "step": 10197, + "time_per_iteration": 4.243145227432251 + }, + { + "auxiliary_loss_clip": 0.01121518, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.04300094, + "balance_loss_mlp": 1.02522349, + "epoch": 0.6131369307079513, + "flos": 24461900743680.0, + "grad_norm": 2.260714775467706, + "language_loss": 0.78939748, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.81099236, + "num_input_tokens_seen": 219576410, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12744141, + "step": 10198, + "time_per_iteration": 2.6673436164855957 + }, + { + "auxiliary_loss_clip": 0.01120164, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.04112446, + "balance_loss_mlp": 1.01949418, + "epoch": 0.6131970539606193, + "flos": 27664982775360.0, + "grad_norm": 1.4769540208668952, + "language_loss": 0.74574661, + "learning_rate": 1.374488730519181e-06, + "loss": 0.76726443, + "num_input_tokens_seen": 219597180, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12127686, + "step": 10199, + "time_per_iteration": 2.7280385494232178 + }, + { + "auxiliary_loss_clip": 0.01120558, + "auxiliary_loss_mlp": 0.01035098, + "balance_loss_clip": 1.04125929, + "balance_loss_mlp": 1.02225316, + "epoch": 0.6132571772132872, + "flos": 32788351329120.0, + "grad_norm": 1.8980806069724823, + "language_loss": 0.61120784, + "learning_rate": 1.374118818580993e-06, + "loss": 0.63276446, + "num_input_tokens_seen": 219617630, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12835693, + "step": 10200, + "time_per_iteration": 2.7687668800354004 + }, + { + "auxiliary_loss_clip": 0.01118868, + "auxiliary_loss_mlp": 0.01034233, + "balance_loss_clip": 1.04239142, + "balance_loss_mlp": 1.02237749, + "epoch": 0.6133173004659552, + "flos": 27933898377600.0, + "grad_norm": 2.1039061063199966, + "language_loss": 0.68862164, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.71015263, + "num_input_tokens_seen": 219637025, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11871338, + "step": 10201, + "time_per_iteration": 2.668421745300293 + }, + { + "auxiliary_loss_clip": 0.01113425, + "auxiliary_loss_mlp": 0.01028781, + "balance_loss_clip": 1.0379895, + "balance_loss_mlp": 1.01649046, + "epoch": 0.6133774237186231, + "flos": 24995963841120.0, + "grad_norm": 1.8805378160631685, + "language_loss": 0.83437467, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.85579669, + "num_input_tokens_seen": 219656625, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12286377, + "step": 10202, + "time_per_iteration": 2.6726677417755127 + }, + { + "auxiliary_loss_clip": 0.01044991, + "auxiliary_loss_mlp": 0.01000928, + "balance_loss_clip": 1.02109861, + "balance_loss_mlp": 0.99971032, + "epoch": 0.6134375469712912, + "flos": 84698215531200.0, + "grad_norm": 0.9202902982932598, + "language_loss": 0.67097461, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69143379, + "num_input_tokens_seen": 219718090, + "router_z_loss_clip": 0.23876953, + "router_z_loss_mlp": 0.01216888, + "step": 10203, + "time_per_iteration": 3.3115251064300537 + }, + { + "auxiliary_loss_clip": 0.01118473, + "auxiliary_loss_mlp": 0.01029988, + "balance_loss_clip": 1.04153872, + "balance_loss_mlp": 1.0179956, + "epoch": 0.6134976702239591, + "flos": 50373421754400.0, + "grad_norm": 1.7497940212393102, + "language_loss": 0.61208552, + "learning_rate": 1.37263940830327e-06, + "loss": 0.63357019, + "num_input_tokens_seen": 219740100, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11993408, + "step": 10204, + "time_per_iteration": 4.2720725536346436 + }, + { + "auxiliary_loss_clip": 0.0111683, + "auxiliary_loss_mlp": 0.01031311, + "balance_loss_clip": 1.04146409, + "balance_loss_mlp": 1.01945019, + "epoch": 0.6135577934766271, + "flos": 27267443860320.0, + "grad_norm": 2.000863815218091, + "language_loss": 0.72398895, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.74547035, + "num_input_tokens_seen": 219761225, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11865234, + "step": 10205, + "time_per_iteration": 2.6664769649505615 + }, + { + "auxiliary_loss_clip": 0.01116611, + "auxiliary_loss_mlp": 0.01025806, + "balance_loss_clip": 1.04218006, + "balance_loss_mlp": 1.01354551, + "epoch": 0.6136179167292951, + "flos": 28953774842400.0, + "grad_norm": 1.7976673226188202, + "language_loss": 0.75817907, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.77960324, + "num_input_tokens_seen": 219780085, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12261963, + "step": 10206, + "time_per_iteration": 2.626640796661377 + }, + { + "auxiliary_loss_clip": 0.01119144, + "auxiliary_loss_mlp": 0.01032386, + "balance_loss_clip": 1.04124832, + "balance_loss_mlp": 1.02019119, + "epoch": 0.613678039981963, + "flos": 31756360197600.0, + "grad_norm": 2.134136531174584, + "language_loss": 0.75649339, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.7780087, + "num_input_tokens_seen": 219797895, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12200928, + "step": 10207, + "time_per_iteration": 2.6740777492523193 + }, + { + "auxiliary_loss_clip": 0.01117159, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.04121888, + "balance_loss_mlp": 1.01939511, + "epoch": 0.613738163234631, + "flos": 12031066689120.0, + "grad_norm": 2.230967752318878, + "language_loss": 0.82810771, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.84958625, + "num_input_tokens_seen": 219811295, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11309814, + "step": 10208, + "time_per_iteration": 4.016873836517334 + }, + { + "auxiliary_loss_clip": 0.01122475, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.0437541, + "balance_loss_mlp": 1.02096331, + "epoch": 0.613798286487299, + "flos": 40489790224320.0, + "grad_norm": 2.0090003694060083, + "language_loss": 0.72679609, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.74835747, + "num_input_tokens_seen": 219832735, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12701416, + "step": 10209, + "time_per_iteration": 2.7087597846984863 + }, + { + "auxiliary_loss_clip": 0.01116996, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_clip": 1.04186678, + "balance_loss_mlp": 1.02414274, + "epoch": 0.613858409739967, + "flos": 31271762348640.0, + "grad_norm": 1.65496616057411, + "language_loss": 0.74186599, + "learning_rate": 1.37042100685438e-06, + "loss": 0.76339585, + "num_input_tokens_seen": 219852755, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11853027, + "step": 10210, + "time_per_iteration": 2.7012546062469482 + }, + { + "auxiliary_loss_clip": 0.01043487, + "auxiliary_loss_mlp": 0.01000434, + "balance_loss_clip": 1.01961708, + "balance_loss_mlp": 0.9992618, + "epoch": 0.6139185329926349, + "flos": 81987834804480.0, + "grad_norm": 1.3720834493464578, + "language_loss": 0.64847481, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.66891402, + "num_input_tokens_seen": 219922785, + "router_z_loss_clip": 0.23852539, + "router_z_loss_mlp": 0.01171112, + "step": 10211, + "time_per_iteration": 3.399106502532959 + }, + { + "auxiliary_loss_clip": 0.01117843, + "auxiliary_loss_mlp": 0.01036909, + "balance_loss_clip": 1.04128051, + "balance_loss_mlp": 1.0247376, + "epoch": 0.6139786562453029, + "flos": 26287632635040.0, + "grad_norm": 1.8046069737992365, + "language_loss": 0.75950682, + "learning_rate": 1.369681730544801e-06, + "loss": 0.78105438, + "num_input_tokens_seen": 219942215, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12158203, + "step": 10212, + "time_per_iteration": 2.7116942405700684 + }, + { + "auxiliary_loss_clip": 0.01119096, + "auxiliary_loss_mlp": 0.01035752, + "balance_loss_clip": 1.04182601, + "balance_loss_mlp": 1.02353275, + "epoch": 0.6140387794979708, + "flos": 31809147863040.0, + "grad_norm": 1.6072649268338197, + "language_loss": 0.73774946, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.75929791, + "num_input_tokens_seen": 219963830, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12213135, + "step": 10213, + "time_per_iteration": 2.7719027996063232 + }, + { + "auxiliary_loss_clip": 0.01123743, + "auxiliary_loss_mlp": 0.01035793, + "balance_loss_clip": 1.04312921, + "balance_loss_mlp": 1.02275205, + "epoch": 0.6140989027506388, + "flos": 28912575119040.0, + "grad_norm": 2.7296676851757944, + "language_loss": 0.72786951, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.74946487, + "num_input_tokens_seen": 219983815, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13049316, + "step": 10214, + "time_per_iteration": 2.7097771167755127 + }, + { + "auxiliary_loss_clip": 0.01119613, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.04081941, + "balance_loss_mlp": 1.0188427, + "epoch": 0.6141590260033067, + "flos": 27127759330080.0, + "grad_norm": 1.7835111575099512, + "language_loss": 0.74530286, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.76680863, + "num_input_tokens_seen": 220003165, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12115479, + "step": 10215, + "time_per_iteration": 2.6872072219848633 + }, + { + "auxiliary_loss_clip": 0.01115605, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.04017735, + "balance_loss_mlp": 1.01744795, + "epoch": 0.6142191492559748, + "flos": 29129108228640.0, + "grad_norm": 1.8427468941107708, + "language_loss": 0.7830497, + "learning_rate": 1.368203464858542e-06, + "loss": 0.80450523, + "num_input_tokens_seen": 220021015, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12469482, + "step": 10216, + "time_per_iteration": 2.623987913131714 + }, + { + "auxiliary_loss_clip": 0.01116741, + "auxiliary_loss_mlp": 0.01028174, + "balance_loss_clip": 1.04128098, + "balance_loss_mlp": 1.01549637, + "epoch": 0.6142792725086427, + "flos": 18355074409440.0, + "grad_norm": 3.960396935673931, + "language_loss": 0.80280995, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.82425904, + "num_input_tokens_seen": 220035780, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12689209, + "step": 10217, + "time_per_iteration": 2.7626140117645264 + }, + { + "auxiliary_loss_clip": 0.01118116, + "auxiliary_loss_mlp": 0.01028398, + "balance_loss_clip": 1.03980124, + "balance_loss_mlp": 1.01614928, + "epoch": 0.6143393957613107, + "flos": 28468690786080.0, + "grad_norm": 2.3602829831777856, + "language_loss": 0.77916294, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.80062807, + "num_input_tokens_seen": 220054280, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12255859, + "step": 10218, + "time_per_iteration": 2.6464405059814453 + }, + { + "auxiliary_loss_clip": 0.01115607, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.04091048, + "balance_loss_mlp": 1.02121627, + "epoch": 0.6143995190139786, + "flos": 24547176917280.0, + "grad_norm": 2.254478077101289, + "language_loss": 0.82050663, + "learning_rate": 1.367095017101569e-06, + "loss": 0.84199357, + "num_input_tokens_seen": 220074120, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11865234, + "step": 10219, + "time_per_iteration": 2.685786247253418 + }, + { + "auxiliary_loss_clip": 0.01118086, + "auxiliary_loss_mlp": 0.01034137, + "balance_loss_clip": 1.03880882, + "balance_loss_mlp": 1.02178693, + "epoch": 0.6144596422666466, + "flos": 51620527890720.0, + "grad_norm": 4.301183772012151, + "language_loss": 0.66831946, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.68984169, + "num_input_tokens_seen": 220096320, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.12359619, + "step": 10220, + "time_per_iteration": 2.8066132068634033 + }, + { + "auxiliary_loss_clip": 0.01114536, + "auxiliary_loss_mlp": 0.01026942, + "balance_loss_clip": 1.03919649, + "balance_loss_mlp": 1.01537848, + "epoch": 0.6145197655193146, + "flos": 26324821147680.0, + "grad_norm": 2.4311703382773864, + "language_loss": 0.71868938, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.7401042, + "num_input_tokens_seen": 220114850, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11566162, + "step": 10221, + "time_per_iteration": 2.6768174171447754 + }, + { + "auxiliary_loss_clip": 0.01114367, + "auxiliary_loss_mlp": 0.01028405, + "balance_loss_clip": 1.03914142, + "balance_loss_mlp": 1.01668632, + "epoch": 0.6145798887719826, + "flos": 26199074561760.0, + "grad_norm": 2.1451850607685365, + "language_loss": 0.79598176, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.81740952, + "num_input_tokens_seen": 220133395, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11724854, + "step": 10222, + "time_per_iteration": 2.6453371047973633 + }, + { + "auxiliary_loss_clip": 0.0112038, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.04326928, + "balance_loss_mlp": 1.01974058, + "epoch": 0.6146400120246506, + "flos": 25352181480960.0, + "grad_norm": 5.394237006341633, + "language_loss": 0.76322293, + "learning_rate": 1.365617422821788e-06, + "loss": 0.7847411, + "num_input_tokens_seen": 220152790, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11694336, + "step": 10223, + "time_per_iteration": 2.651201009750366 + }, + { + "auxiliary_loss_clip": 0.01114506, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.04126072, + "balance_loss_mlp": 1.02043152, + "epoch": 0.6147001352773185, + "flos": 16937496960480.0, + "grad_norm": 3.2145694445220805, + "language_loss": 0.7834695, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.80493975, + "num_input_tokens_seen": 220169535, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12091064, + "step": 10224, + "time_per_iteration": 2.5754852294921875 + }, + { + "auxiliary_loss_clip": 0.0111292, + "auxiliary_loss_mlp": 0.01027024, + "balance_loss_clip": 1.0387361, + "balance_loss_mlp": 1.01572931, + "epoch": 0.6147602585299865, + "flos": 69117202311840.0, + "grad_norm": 1.2540844359639323, + "language_loss": 0.66502929, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.68642879, + "num_input_tokens_seen": 220195305, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11297607, + "step": 10225, + "time_per_iteration": 3.0568933486938477 + }, + { + "auxiliary_loss_clip": 0.0112101, + "auxiliary_loss_mlp": 0.01027324, + "balance_loss_clip": 1.04166889, + "balance_loss_mlp": 1.01541531, + "epoch": 0.6148203817826544, + "flos": 40044568821120.0, + "grad_norm": 2.3644936849633726, + "language_loss": 0.63543367, + "learning_rate": 1.364509479649357e-06, + "loss": 0.65691704, + "num_input_tokens_seen": 220215040, + "router_z_loss_clip": 0.79345703, + "router_z_loss_mlp": 0.11914062, + "step": 10226, + "time_per_iteration": 2.8058831691741943 + }, + { + "auxiliary_loss_clip": 0.0111837, + "auxiliary_loss_mlp": 0.01035052, + "balance_loss_clip": 1.04131866, + "balance_loss_mlp": 1.02150404, + "epoch": 0.6148805050353224, + "flos": 22368671354880.0, + "grad_norm": 9.895149390230698, + "language_loss": 0.75605476, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.77758884, + "num_input_tokens_seen": 220234205, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13549805, + "step": 10227, + "time_per_iteration": 2.6524627208709717 + }, + { + "auxiliary_loss_clip": 0.01119411, + "auxiliary_loss_mlp": 0.01033309, + "balance_loss_clip": 1.04011464, + "balance_loss_mlp": 1.01938581, + "epoch": 0.6149406282879903, + "flos": 17160391283040.0, + "grad_norm": 2.3802562643876617, + "language_loss": 0.61760783, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.639135, + "num_input_tokens_seen": 220252730, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13934326, + "step": 10228, + "time_per_iteration": 2.6306285858154297 + }, + { + "auxiliary_loss_clip": 0.01116512, + "auxiliary_loss_mlp": 0.01030146, + "balance_loss_clip": 1.04104304, + "balance_loss_mlp": 1.0180639, + "epoch": 0.6150007515406584, + "flos": 30737618216640.0, + "grad_norm": 1.3321602951432898, + "language_loss": 0.74347973, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.76494628, + "num_input_tokens_seen": 220273345, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12084961, + "step": 10229, + "time_per_iteration": 2.6540615558624268 + }, + { + "auxiliary_loss_clip": 0.01119857, + "auxiliary_loss_mlp": 0.0103326, + "balance_loss_clip": 1.0426923, + "balance_loss_mlp": 1.02086806, + "epoch": 0.6150608747933263, + "flos": 26777700316800.0, + "grad_norm": 4.228111271763526, + "language_loss": 0.77727628, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.79880744, + "num_input_tokens_seen": 220293845, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.1239624, + "step": 10230, + "time_per_iteration": 2.703068971633911 + }, + { + "auxiliary_loss_clip": 0.01115725, + "auxiliary_loss_mlp": 0.01028405, + "balance_loss_clip": 1.03934348, + "balance_loss_mlp": 1.01685917, + "epoch": 0.6151209980459943, + "flos": 36750092506560.0, + "grad_norm": 1.9943166697364751, + "language_loss": 0.73120898, + "learning_rate": 1.36266338983927e-06, + "loss": 0.75265026, + "num_input_tokens_seen": 220316070, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11547852, + "step": 10231, + "time_per_iteration": 2.7028656005859375 + }, + { + "auxiliary_loss_clip": 0.01117779, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.04073024, + "balance_loss_mlp": 1.019835, + "epoch": 0.6151811212986622, + "flos": 36616931258400.0, + "grad_norm": 1.7038329026499832, + "language_loss": 0.69925427, + "learning_rate": 1.362294244324858e-06, + "loss": 0.72074735, + "num_input_tokens_seen": 220335695, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11700439, + "step": 10232, + "time_per_iteration": 2.7039761543273926 + }, + { + "auxiliary_loss_clip": 0.01114319, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.04072714, + "balance_loss_mlp": 1.02128196, + "epoch": 0.6152412445513302, + "flos": 23029250866560.0, + "grad_norm": 23.288762157278132, + "language_loss": 0.91904145, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.94051397, + "num_input_tokens_seen": 220353720, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11651611, + "step": 10233, + "time_per_iteration": 2.5854668617248535 + }, + { + "auxiliary_loss_clip": 0.01114322, + "auxiliary_loss_mlp": 0.01032899, + "balance_loss_clip": 1.03997135, + "balance_loss_mlp": 1.02183604, + "epoch": 0.6153013678039982, + "flos": 31363885942560.0, + "grad_norm": 1.9551238885475368, + "language_loss": 0.71312189, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.73459405, + "num_input_tokens_seen": 220372515, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11065674, + "step": 10234, + "time_per_iteration": 4.105310916900635 + }, + { + "auxiliary_loss_clip": 0.01118145, + "auxiliary_loss_mlp": 0.01033312, + "balance_loss_clip": 1.03995085, + "balance_loss_mlp": 1.02087212, + "epoch": 0.6153614910566662, + "flos": 34789051951200.0, + "grad_norm": 3.1616915770947376, + "language_loss": 0.6683079, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.68982249, + "num_input_tokens_seen": 220393490, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12438965, + "step": 10235, + "time_per_iteration": 2.696755886077881 + }, + { + "auxiliary_loss_clip": 0.01120797, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.04106557, + "balance_loss_mlp": 1.01757288, + "epoch": 0.6154216143093342, + "flos": 28735742593440.0, + "grad_norm": 2.8528222535783683, + "language_loss": 0.8109836, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.83248854, + "num_input_tokens_seen": 220412855, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12139893, + "step": 10236, + "time_per_iteration": 4.111356973648071 + }, + { + "auxiliary_loss_clip": 0.01120197, + "auxiliary_loss_mlp": 0.01027276, + "balance_loss_clip": 1.04054403, + "balance_loss_mlp": 1.01497412, + "epoch": 0.6154817375620021, + "flos": 27757349472960.0, + "grad_norm": 1.5491419266712407, + "language_loss": 0.80608273, + "learning_rate": 1.360448879760721e-06, + "loss": 0.82755744, + "num_input_tokens_seen": 220433440, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12298584, + "step": 10237, + "time_per_iteration": 2.6231815814971924 + }, + { + "auxiliary_loss_clip": 0.01116577, + "auxiliary_loss_mlp": 0.01039617, + "balance_loss_clip": 1.04045701, + "balance_loss_mlp": 1.0275594, + "epoch": 0.6155418608146701, + "flos": 33144731038080.0, + "grad_norm": 1.814126781624277, + "language_loss": 0.76072633, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.78228825, + "num_input_tokens_seen": 220453445, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.1206665, + "step": 10238, + "time_per_iteration": 2.7626025676727295 + }, + { + "auxiliary_loss_clip": 0.01043649, + "auxiliary_loss_mlp": 0.010035, + "balance_loss_clip": 1.01943839, + "balance_loss_mlp": 1.00229335, + "epoch": 0.615601984067338, + "flos": 83965400059680.0, + "grad_norm": 0.8758906009501987, + "language_loss": 0.57661384, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.59708536, + "num_input_tokens_seen": 220509730, + "router_z_loss_clip": 0.24230957, + "router_z_loss_mlp": 0.01205444, + "step": 10239, + "time_per_iteration": 3.265937089920044 + }, + { + "auxiliary_loss_clip": 0.01117899, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.04071665, + "balance_loss_mlp": 1.01973271, + "epoch": 0.615662107320006, + "flos": 18934226889120.0, + "grad_norm": 1.9465824508067469, + "language_loss": 0.7733252, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.79482722, + "num_input_tokens_seen": 220527295, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12573242, + "step": 10240, + "time_per_iteration": 2.667250394821167 + }, + { + "auxiliary_loss_clip": 0.01120811, + "auxiliary_loss_mlp": 0.01031864, + "balance_loss_clip": 1.04271019, + "balance_loss_mlp": 1.01941228, + "epoch": 0.615722230572674, + "flos": 25702483597920.0, + "grad_norm": 5.068192424252299, + "language_loss": 0.72948515, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.75101185, + "num_input_tokens_seen": 220542730, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12451172, + "step": 10241, + "time_per_iteration": 2.6069869995117188 + }, + { + "auxiliary_loss_clip": 0.01114828, + "auxiliary_loss_mlp": 0.01029101, + "balance_loss_clip": 1.03995538, + "balance_loss_mlp": 1.0176152, + "epoch": 0.615782353825342, + "flos": 28378309435200.0, + "grad_norm": 1.824212086693982, + "language_loss": 0.71752995, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.73896921, + "num_input_tokens_seen": 220562995, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11480713, + "step": 10242, + "time_per_iteration": 2.7006165981292725 + }, + { + "auxiliary_loss_clip": 0.01118399, + "auxiliary_loss_mlp": 0.01031605, + "balance_loss_clip": 1.04231501, + "balance_loss_mlp": 1.01992857, + "epoch": 0.6158424770780099, + "flos": 25751219535360.0, + "grad_norm": 2.0930102517915206, + "language_loss": 0.72252131, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.74402136, + "num_input_tokens_seen": 220581775, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11669922, + "step": 10243, + "time_per_iteration": 4.1303699016571045 + }, + { + "auxiliary_loss_clip": 0.01042662, + "auxiliary_loss_mlp": 0.00999283, + "balance_loss_clip": 1.01859772, + "balance_loss_mlp": 0.99807239, + "epoch": 0.6159026003306779, + "flos": 85820385427200.0, + "grad_norm": 0.7553330744142531, + "language_loss": 0.56812042, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.5885399, + "num_input_tokens_seen": 220646395, + "router_z_loss_clip": 0.24047852, + "router_z_loss_mlp": 0.01210022, + "step": 10244, + "time_per_iteration": 3.3039469718933105 + }, + { + "auxiliary_loss_clip": 0.01118506, + "auxiliary_loss_mlp": 0.01032075, + "balance_loss_clip": 1.04060388, + "balance_loss_mlp": 1.01959419, + "epoch": 0.6159627235833458, + "flos": 41334657441120.0, + "grad_norm": 1.8875669513297237, + "language_loss": 0.63867694, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.66018277, + "num_input_tokens_seen": 220668335, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12481689, + "step": 10245, + "time_per_iteration": 2.765585422515869 + }, + { + "auxiliary_loss_clip": 0.01115988, + "auxiliary_loss_mlp": 0.01029927, + "balance_loss_clip": 1.03999567, + "balance_loss_mlp": 1.01848888, + "epoch": 0.6160228468360138, + "flos": 32428122478560.0, + "grad_norm": 1.7316286888445986, + "language_loss": 0.78979999, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.81125915, + "num_input_tokens_seen": 220688915, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11437988, + "step": 10246, + "time_per_iteration": 2.705803871154785 + }, + { + "auxiliary_loss_clip": 0.01123152, + "auxiliary_loss_mlp": 0.0104153, + "balance_loss_clip": 1.04317105, + "balance_loss_mlp": 1.02780271, + "epoch": 0.6160829700886818, + "flos": 20979773789760.0, + "grad_norm": 3.215378850346691, + "language_loss": 0.8739143, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.8955611, + "num_input_tokens_seen": 220703465, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.1373291, + "step": 10247, + "time_per_iteration": 3.853182315826416 + }, + { + "auxiliary_loss_clip": 0.01120546, + "auxiliary_loss_mlp": 0.01033149, + "balance_loss_clip": 1.04300177, + "balance_loss_mlp": 1.02130544, + "epoch": 0.6161430933413498, + "flos": 28825678254240.0, + "grad_norm": 1.8636329947784462, + "language_loss": 0.79970503, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.82124197, + "num_input_tokens_seen": 220722090, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.11853027, + "step": 10248, + "time_per_iteration": 2.6919877529144287 + }, + { + "auxiliary_loss_clip": 0.01115707, + "auxiliary_loss_mlp": 0.01030633, + "balance_loss_clip": 1.04105115, + "balance_loss_mlp": 1.01900411, + "epoch": 0.6162032165940178, + "flos": 28067829454080.0, + "grad_norm": 2.1182498085247414, + "language_loss": 0.87018871, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.89165211, + "num_input_tokens_seen": 220741075, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11639404, + "step": 10249, + "time_per_iteration": 2.6249513626098633 + }, + { + "auxiliary_loss_clip": 0.01118155, + "auxiliary_loss_mlp": 0.01028676, + "balance_loss_clip": 1.0413698, + "balance_loss_mlp": 1.01614141, + "epoch": 0.6162633398466857, + "flos": 48104129668320.0, + "grad_norm": 8.365456774618995, + "language_loss": 0.68766439, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.70913273, + "num_input_tokens_seen": 220763395, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12530518, + "step": 10250, + "time_per_iteration": 2.8238775730133057 + }, + { + "auxiliary_loss_clip": 0.01111399, + "auxiliary_loss_mlp": 0.0103057, + "balance_loss_clip": 1.03966808, + "balance_loss_mlp": 1.01954889, + "epoch": 0.6163234630993537, + "flos": 23482292104800.0, + "grad_norm": 2.0572465750813764, + "language_loss": 0.73866391, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.76008362, + "num_input_tokens_seen": 220780640, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11022949, + "step": 10251, + "time_per_iteration": 2.5995230674743652 + }, + { + "auxiliary_loss_clip": 0.01116693, + "auxiliary_loss_mlp": 0.01028488, + "balance_loss_clip": 1.03996539, + "balance_loss_mlp": 1.01634014, + "epoch": 0.6163835863520216, + "flos": 19476190856160.0, + "grad_norm": 2.3284079141955933, + "language_loss": 0.68337357, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.70482534, + "num_input_tokens_seen": 220797960, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12145996, + "step": 10252, + "time_per_iteration": 2.628304958343506 + }, + { + "auxiliary_loss_clip": 0.01041962, + "auxiliary_loss_mlp": 0.01006777, + "balance_loss_clip": 1.01802468, + "balance_loss_mlp": 1.00560653, + "epoch": 0.6164437096046896, + "flos": 83100199518720.0, + "grad_norm": 0.8906507376967494, + "language_loss": 0.57778835, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.59827566, + "num_input_tokens_seen": 220856930, + "router_z_loss_clip": 0.23937988, + "router_z_loss_mlp": 0.01171112, + "step": 10253, + "time_per_iteration": 3.314183473587036 + }, + { + "auxiliary_loss_clip": 0.01117558, + "auxiliary_loss_mlp": 0.01035171, + "balance_loss_clip": 1.04028487, + "balance_loss_mlp": 1.0232259, + "epoch": 0.6165038328573575, + "flos": 26065021933440.0, + "grad_norm": 1.6207282936928085, + "language_loss": 0.79713655, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.81866384, + "num_input_tokens_seen": 220877595, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.1194458, + "step": 10254, + "time_per_iteration": 2.642085552215576 + }, + { + "auxiliary_loss_clip": 0.0112223, + "auxiliary_loss_mlp": 0.0103379, + "balance_loss_clip": 1.04228139, + "balance_loss_mlp": 1.02187538, + "epoch": 0.6165639561100256, + "flos": 25749031602240.0, + "grad_norm": 2.1911132106161286, + "language_loss": 0.80353224, + "learning_rate": 1.353810600008846e-06, + "loss": 0.82509243, + "num_input_tokens_seen": 220896880, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.11914062, + "step": 10255, + "time_per_iteration": 2.6790826320648193 + }, + { + "auxiliary_loss_clip": 0.01122656, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.04375756, + "balance_loss_mlp": 1.02255106, + "epoch": 0.6166240793626935, + "flos": 30918421435680.0, + "grad_norm": 3.0040251520881234, + "language_loss": 0.65231711, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.67389613, + "num_input_tokens_seen": 220916425, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12701416, + "step": 10256, + "time_per_iteration": 2.6872293949127197 + }, + { + "auxiliary_loss_clip": 0.01117209, + "auxiliary_loss_mlp": 0.01027873, + "balance_loss_clip": 1.0424788, + "balance_loss_mlp": 1.01669097, + "epoch": 0.6166842026153615, + "flos": 24016719857760.0, + "grad_norm": 1.8329981005834335, + "language_loss": 0.72011864, + "learning_rate": 1.353073501949825e-06, + "loss": 0.74156952, + "num_input_tokens_seen": 220935050, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11181641, + "step": 10257, + "time_per_iteration": 2.737262487411499 + }, + { + "auxiliary_loss_clip": 0.01123166, + "auxiliary_loss_mlp": 0.01031679, + "balance_loss_clip": 1.04432607, + "balance_loss_mlp": 1.01884592, + "epoch": 0.6167443258680294, + "flos": 23572146731040.0, + "grad_norm": 1.9878263100904152, + "language_loss": 0.71827698, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.73982543, + "num_input_tokens_seen": 220953085, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12835693, + "step": 10258, + "time_per_iteration": 2.6089675426483154 + }, + { + "auxiliary_loss_clip": 0.01119536, + "auxiliary_loss_mlp": 0.01034865, + "balance_loss_clip": 1.04217803, + "balance_loss_mlp": 1.022861, + "epoch": 0.6168044491206974, + "flos": 30829012499520.0, + "grad_norm": 3.8445949646267703, + "language_loss": 0.64322644, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.66477048, + "num_input_tokens_seen": 220969050, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.11999512, + "step": 10259, + "time_per_iteration": 2.704962968826294 + }, + { + "auxiliary_loss_clip": 0.01116074, + "auxiliary_loss_mlp": 0.01031675, + "balance_loss_clip": 1.04183769, + "balance_loss_mlp": 1.01950955, + "epoch": 0.6168645723733654, + "flos": 16130831188320.0, + "grad_norm": 3.8097203650621165, + "language_loss": 0.71246731, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.73394477, + "num_input_tokens_seen": 220985825, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12188721, + "step": 10260, + "time_per_iteration": 2.5804836750030518 + }, + { + "auxiliary_loss_clip": 0.01126892, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.04719448, + "balance_loss_mlp": 1.02020109, + "epoch": 0.6169246956260334, + "flos": 32521259004480.0, + "grad_norm": 1.9434651047799134, + "language_loss": 0.68435764, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.7059626, + "num_input_tokens_seen": 221004465, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13409424, + "step": 10261, + "time_per_iteration": 2.7600839138031006 + }, + { + "auxiliary_loss_clip": 0.01117563, + "auxiliary_loss_mlp": 0.01034293, + "balance_loss_clip": 1.04152226, + "balance_loss_mlp": 1.02284312, + "epoch": 0.6169848188787014, + "flos": 28245958532640.0, + "grad_norm": 2.744348515272905, + "language_loss": 0.71122807, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.7327466, + "num_input_tokens_seen": 221023260, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11456299, + "step": 10262, + "time_per_iteration": 2.63266658782959 + }, + { + "auxiliary_loss_clip": 0.01117792, + "auxiliary_loss_mlp": 0.01035268, + "balance_loss_clip": 1.0413897, + "balance_loss_mlp": 1.02286434, + "epoch": 0.6170449421313693, + "flos": 28469784752640.0, + "grad_norm": 5.29066392866617, + "language_loss": 0.70082569, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.72235632, + "num_input_tokens_seen": 221043090, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12414551, + "step": 10263, + "time_per_iteration": 2.7638351917266846 + }, + { + "auxiliary_loss_clip": 0.01120105, + "auxiliary_loss_mlp": 0.01032063, + "balance_loss_clip": 1.04172349, + "balance_loss_mlp": 1.02065444, + "epoch": 0.6171050653840373, + "flos": 19341814089600.0, + "grad_norm": 2.158378180915587, + "language_loss": 0.76394862, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.78547031, + "num_input_tokens_seen": 221061435, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.11407471, + "step": 10264, + "time_per_iteration": 2.6335391998291016 + }, + { + "auxiliary_loss_clip": 0.01118731, + "auxiliary_loss_mlp": 0.0103317, + "balance_loss_clip": 1.04181969, + "balance_loss_mlp": 1.02058744, + "epoch": 0.6171651886367052, + "flos": 24459712810560.0, + "grad_norm": 2.3623005999175186, + "language_loss": 0.84952873, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87104774, + "num_input_tokens_seen": 221078705, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12579346, + "step": 10265, + "time_per_iteration": 2.647193193435669 + }, + { + "auxiliary_loss_clip": 0.01115421, + "auxiliary_loss_mlp": 0.01032773, + "balance_loss_clip": 1.04035866, + "balance_loss_mlp": 1.02057171, + "epoch": 0.6172253118893732, + "flos": 32253639955200.0, + "grad_norm": 2.7753063849842015, + "language_loss": 0.64022696, + "learning_rate": 1.349757776608153e-06, + "loss": 0.66170889, + "num_input_tokens_seen": 221099245, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12194824, + "step": 10266, + "time_per_iteration": 2.6964244842529297 + }, + { + "auxiliary_loss_clip": 0.01116844, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.03968263, + "balance_loss_mlp": 1.02100182, + "epoch": 0.6172854351420412, + "flos": 27617300287200.0, + "grad_norm": 1.7617994913015593, + "language_loss": 0.75431705, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.77581298, + "num_input_tokens_seen": 221116930, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11737061, + "step": 10267, + "time_per_iteration": 2.6251397132873535 + }, + { + "auxiliary_loss_clip": 0.01121992, + "auxiliary_loss_mlp": 0.01029679, + "balance_loss_clip": 1.0427072, + "balance_loss_mlp": 1.01716828, + "epoch": 0.6173455583947092, + "flos": 25884299748960.0, + "grad_norm": 1.9092701636115823, + "language_loss": 0.74915874, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.77067542, + "num_input_tokens_seen": 221137660, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12524414, + "step": 10268, + "time_per_iteration": 2.7908456325531006 + }, + { + "auxiliary_loss_clip": 0.01120445, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.04074836, + "balance_loss_mlp": 1.01706123, + "epoch": 0.6174056816473771, + "flos": 23794190190720.0, + "grad_norm": 1.7964510776705116, + "language_loss": 0.75102299, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.77251971, + "num_input_tokens_seen": 221156225, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.1217041, + "step": 10269, + "time_per_iteration": 2.611506462097168 + }, + { + "auxiliary_loss_clip": 0.0111564, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.03941011, + "balance_loss_mlp": 1.02054882, + "epoch": 0.6174658049000451, + "flos": 19519659547200.0, + "grad_norm": 2.806622827783305, + "language_loss": 0.76377511, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.78525424, + "num_input_tokens_seen": 221173820, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.1171875, + "step": 10270, + "time_per_iteration": 2.644735336303711 + }, + { + "auxiliary_loss_clip": 0.01119784, + "auxiliary_loss_mlp": 0.01028805, + "balance_loss_clip": 1.0426538, + "balance_loss_mlp": 1.01620436, + "epoch": 0.617525928152713, + "flos": 26727667826400.0, + "grad_norm": 2.1361473673869305, + "language_loss": 0.82214886, + "learning_rate": 1.347916569325736e-06, + "loss": 0.84363472, + "num_input_tokens_seen": 221191815, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12597656, + "step": 10271, + "time_per_iteration": 2.6083765029907227 + }, + { + "auxiliary_loss_clip": 0.01119159, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.04126596, + "balance_loss_mlp": 1.01989269, + "epoch": 0.617586051405381, + "flos": 25753529020320.0, + "grad_norm": 2.1408803357257007, + "language_loss": 0.76968157, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.79119515, + "num_input_tokens_seen": 221211205, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12310791, + "step": 10272, + "time_per_iteration": 2.6817736625671387 + }, + { + "auxiliary_loss_clip": 0.01037092, + "auxiliary_loss_mlp": 0.0100357, + "balance_loss_clip": 1.01309776, + "balance_loss_mlp": 1.00241983, + "epoch": 0.617646174658049, + "flos": 71516663717760.0, + "grad_norm": 0.8122326709611086, + "language_loss": 0.59026408, + "learning_rate": 1.347180259404513e-06, + "loss": 0.61067069, + "num_input_tokens_seen": 221268430, + "router_z_loss_clip": 0.2401123, + "router_z_loss_mlp": 0.0114975, + "step": 10273, + "time_per_iteration": 3.129714012145996 + }, + { + "auxiliary_loss_clip": 0.01115484, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.04011166, + "balance_loss_mlp": 1.01748037, + "epoch": 0.617706297910717, + "flos": 16934539199040.0, + "grad_norm": 3.568151084248545, + "language_loss": 0.72792882, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.74937832, + "num_input_tokens_seen": 221281930, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11987305, + "step": 10274, + "time_per_iteration": 4.038919687271118 + }, + { + "auxiliary_loss_clip": 0.01116252, + "auxiliary_loss_mlp": 0.0103135, + "balance_loss_clip": 1.04068637, + "balance_loss_mlp": 1.0198102, + "epoch": 0.617766421163385, + "flos": 23438053585440.0, + "grad_norm": 2.042090362123433, + "language_loss": 0.77261531, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.79409134, + "num_input_tokens_seen": 221301605, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11535645, + "step": 10275, + "time_per_iteration": 2.6661128997802734 + }, + { + "auxiliary_loss_clip": 0.01116858, + "auxiliary_loss_mlp": 0.01031876, + "balance_loss_clip": 1.04068851, + "balance_loss_mlp": 1.01990747, + "epoch": 0.6178265444160529, + "flos": 27535103426880.0, + "grad_norm": 1.713413526945013, + "language_loss": 0.79316092, + "learning_rate": 1.346075980219998e-06, + "loss": 0.81464827, + "num_input_tokens_seen": 221320105, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11981201, + "step": 10276, + "time_per_iteration": 4.05089807510376 + }, + { + "auxiliary_loss_clip": 0.0112054, + "auxiliary_loss_mlp": 0.0103264, + "balance_loss_clip": 1.04206038, + "balance_loss_mlp": 1.02000427, + "epoch": 0.6178866676687209, + "flos": 14622588767520.0, + "grad_norm": 2.21953733244639, + "language_loss": 0.80808425, + "learning_rate": 1.345707936733612e-06, + "loss": 0.82961601, + "num_input_tokens_seen": 221335915, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12646484, + "step": 10277, + "time_per_iteration": 2.594876766204834 + }, + { + "auxiliary_loss_clip": 0.0111924, + "auxiliary_loss_mlp": 0.01030818, + "balance_loss_clip": 1.0402931, + "balance_loss_mlp": 1.01792586, + "epoch": 0.6179467909213888, + "flos": 25614330697440.0, + "grad_norm": 1.7308640483689417, + "language_loss": 0.81594563, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.83744621, + "num_input_tokens_seen": 221353965, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12890625, + "step": 10278, + "time_per_iteration": 2.6607465744018555 + }, + { + "auxiliary_loss_clip": 0.01118531, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.04150784, + "balance_loss_mlp": 1.01928449, + "epoch": 0.6180069141740568, + "flos": 30917935228320.0, + "grad_norm": 2.218443428891055, + "language_loss": 0.74272549, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.76421952, + "num_input_tokens_seen": 221374080, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.1159668, + "step": 10279, + "time_per_iteration": 2.6895599365234375 + }, + { + "auxiliary_loss_clip": 0.011132, + "auxiliary_loss_mlp": 0.01027425, + "balance_loss_clip": 1.03768325, + "balance_loss_mlp": 1.01587343, + "epoch": 0.6180670374267248, + "flos": 23971711510080.0, + "grad_norm": 1.7106157349607058, + "language_loss": 0.70645559, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.72786188, + "num_input_tokens_seen": 221392910, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11553955, + "step": 10280, + "time_per_iteration": 2.632704973220825 + }, + { + "auxiliary_loss_clip": 0.01117801, + "auxiliary_loss_mlp": 0.01030734, + "balance_loss_clip": 1.04095113, + "balance_loss_mlp": 1.01835978, + "epoch": 0.6181271606793928, + "flos": 23750883568800.0, + "grad_norm": 2.333361844614956, + "language_loss": 0.72992718, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.75141257, + "num_input_tokens_seen": 221410990, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12371826, + "step": 10281, + "time_per_iteration": 2.609118938446045 + }, + { + "auxiliary_loss_clip": 0.01113766, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.04157603, + "balance_loss_mlp": 1.01913834, + "epoch": 0.6181872839320607, + "flos": 31230481590720.0, + "grad_norm": 2.2907712622082377, + "language_loss": 0.76489151, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.78633064, + "num_input_tokens_seen": 221431020, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11004639, + "step": 10282, + "time_per_iteration": 2.7219247817993164 + }, + { + "auxiliary_loss_clip": 0.01119329, + "auxiliary_loss_mlp": 0.01032537, + "balance_loss_clip": 1.04162467, + "balance_loss_mlp": 1.01828551, + "epoch": 0.6182474071847287, + "flos": 31179193064640.0, + "grad_norm": 1.8371971714908621, + "language_loss": 0.69036555, + "learning_rate": 1.343500197330931e-06, + "loss": 0.7118842, + "num_input_tokens_seen": 221453235, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.14251709, + "step": 10283, + "time_per_iteration": 4.226056337356567 + }, + { + "auxiliary_loss_clip": 0.0112296, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.04147708, + "balance_loss_mlp": 1.01755857, + "epoch": 0.6183075304373966, + "flos": 26995854117600.0, + "grad_norm": 2.5864297882668397, + "language_loss": 0.75182623, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77335858, + "num_input_tokens_seen": 221472560, + "router_z_loss_clip": 0.81396484, + "router_z_loss_mlp": 0.1270752, + "step": 10284, + "time_per_iteration": 2.623100519180298 + }, + { + "auxiliary_loss_clip": 0.01113136, + "auxiliary_loss_mlp": 0.01032762, + "balance_loss_clip": 1.04086065, + "balance_loss_mlp": 1.020787, + "epoch": 0.6183676536900646, + "flos": 27400483556640.0, + "grad_norm": 1.809813681437985, + "language_loss": 0.7532649, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.77472383, + "num_input_tokens_seen": 221492835, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11975098, + "step": 10285, + "time_per_iteration": 2.708326816558838 + }, + { + "auxiliary_loss_clip": 0.01117046, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.03994811, + "balance_loss_mlp": 1.01954985, + "epoch": 0.6184277769427327, + "flos": 28509485336640.0, + "grad_norm": 1.5862478522064014, + "language_loss": 0.72879732, + "learning_rate": 1.342396663517503e-06, + "loss": 0.75028211, + "num_input_tokens_seen": 221511870, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.11883545, + "step": 10286, + "time_per_iteration": 2.6437623500823975 + }, + { + "auxiliary_loss_clip": 0.01114791, + "auxiliary_loss_mlp": 0.0102717, + "balance_loss_clip": 1.03958309, + "balance_loss_mlp": 1.01555943, + "epoch": 0.6184879001954006, + "flos": 27712219573440.0, + "grad_norm": 1.6568333860040794, + "language_loss": 0.75733668, + "learning_rate": 1.342028868767199e-06, + "loss": 0.77875626, + "num_input_tokens_seen": 221529915, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.1161499, + "step": 10287, + "time_per_iteration": 4.053858041763306 + }, + { + "auxiliary_loss_clip": 0.01116334, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.04107594, + "balance_loss_mlp": 1.01967001, + "epoch": 0.6185480234480686, + "flos": 29092203336960.0, + "grad_norm": 2.114012797763427, + "language_loss": 0.72895443, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.7504276, + "num_input_tokens_seen": 221549745, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11309814, + "step": 10288, + "time_per_iteration": 2.651895046234131 + }, + { + "auxiliary_loss_clip": 0.01112497, + "auxiliary_loss_mlp": 0.01030039, + "balance_loss_clip": 1.04027033, + "balance_loss_mlp": 1.01892245, + "epoch": 0.6186081467007365, + "flos": 55493751512160.0, + "grad_norm": 1.6270990690956042, + "language_loss": 0.73141158, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.75283694, + "num_input_tokens_seen": 221572455, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11114502, + "step": 10289, + "time_per_iteration": 2.9064254760742188 + }, + { + "auxiliary_loss_clip": 0.01119342, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.04024673, + "balance_loss_mlp": 1.01784778, + "epoch": 0.6186682699534045, + "flos": 28736958111840.0, + "grad_norm": 2.312614049851753, + "language_loss": 0.79134876, + "learning_rate": 1.340925634274056e-06, + "loss": 0.81284201, + "num_input_tokens_seen": 221591325, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12127686, + "step": 10290, + "time_per_iteration": 2.731051445007324 + }, + { + "auxiliary_loss_clip": 0.01120741, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.04209399, + "balance_loss_mlp": 1.02038801, + "epoch": 0.6187283932060724, + "flos": 31450256082720.0, + "grad_norm": 1.7290047579948298, + "language_loss": 0.81101823, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.83255386, + "num_input_tokens_seen": 221611640, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12445068, + "step": 10291, + "time_per_iteration": 2.6492884159088135 + }, + { + "auxiliary_loss_clip": 0.01117201, + "auxiliary_loss_mlp": 0.01028462, + "balance_loss_clip": 1.04102278, + "balance_loss_mlp": 1.01707709, + "epoch": 0.6187885164587404, + "flos": 30828971982240.0, + "grad_norm": 2.1661818458061464, + "language_loss": 0.77731925, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.79877585, + "num_input_tokens_seen": 221631225, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11376953, + "step": 10292, + "time_per_iteration": 2.678192377090454 + }, + { + "auxiliary_loss_clip": 0.01123282, + "auxiliary_loss_mlp": 0.01037917, + "balance_loss_clip": 1.04331875, + "balance_loss_mlp": 1.0240593, + "epoch": 0.6188486397114084, + "flos": 32031596495520.0, + "grad_norm": 5.233242690909788, + "language_loss": 0.7349565, + "learning_rate": 1.339822624710401e-06, + "loss": 0.75656855, + "num_input_tokens_seen": 221651035, + "router_z_loss_clip": 0.79882812, + "router_z_loss_mlp": 0.1385498, + "step": 10293, + "time_per_iteration": 2.6396143436431885 + }, + { + "auxiliary_loss_clip": 0.01119326, + "auxiliary_loss_mlp": 0.01033076, + "balance_loss_clip": 1.04273129, + "balance_loss_mlp": 1.02067196, + "epoch": 0.6189087629640764, + "flos": 25530756249600.0, + "grad_norm": 1.7894428166707435, + "language_loss": 0.82958484, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.85110879, + "num_input_tokens_seen": 221671300, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12408447, + "step": 10294, + "time_per_iteration": 2.64961314201355 + }, + { + "auxiliary_loss_clip": 0.01118509, + "auxiliary_loss_mlp": 0.01030971, + "balance_loss_clip": 1.04154277, + "balance_loss_mlp": 1.01943111, + "epoch": 0.6189688862167443, + "flos": 18094951056960.0, + "grad_norm": 2.4000540431441957, + "language_loss": 0.70260656, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.72410142, + "num_input_tokens_seen": 221687320, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11547852, + "step": 10295, + "time_per_iteration": 2.5991568565368652 + }, + { + "auxiliary_loss_clip": 0.01118595, + "auxiliary_loss_mlp": 0.01031933, + "balance_loss_clip": 1.04307199, + "balance_loss_mlp": 1.01982117, + "epoch": 0.6190290094694123, + "flos": 29632465578240.0, + "grad_norm": 1.8011861719487514, + "language_loss": 0.70153177, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.723037, + "num_input_tokens_seen": 221710175, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12121582, + "step": 10296, + "time_per_iteration": 2.714170217514038 + }, + { + "auxiliary_loss_clip": 0.0112061, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.04278433, + "balance_loss_mlp": 1.02080536, + "epoch": 0.6190891327220802, + "flos": 27490540769280.0, + "grad_norm": 2.006921334643721, + "language_loss": 0.71587265, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.73741758, + "num_input_tokens_seen": 221728145, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13079834, + "step": 10297, + "time_per_iteration": 2.604538917541504 + }, + { + "auxiliary_loss_clip": 0.01038725, + "auxiliary_loss_mlp": 0.01000512, + "balance_loss_clip": 1.01482296, + "balance_loss_mlp": 0.99938095, + "epoch": 0.6191492559747482, + "flos": 82644119484480.0, + "grad_norm": 0.8869968581636619, + "language_loss": 0.64201653, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66240889, + "num_input_tokens_seen": 221786100, + "router_z_loss_clip": 0.23901367, + "router_z_loss_mlp": 0.01132965, + "step": 10298, + "time_per_iteration": 3.142324924468994 + }, + { + "auxiliary_loss_clip": 0.01118013, + "auxiliary_loss_mlp": 0.01035187, + "balance_loss_clip": 1.04090738, + "balance_loss_mlp": 1.02337945, + "epoch": 0.6192093792274163, + "flos": 27267524894880.0, + "grad_norm": 1.9597377556058155, + "language_loss": 0.74466807, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.76620007, + "num_input_tokens_seen": 221806450, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11816406, + "step": 10299, + "time_per_iteration": 2.635399580001831 + }, + { + "auxiliary_loss_clip": 0.01123541, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.04277229, + "balance_loss_mlp": 1.01823235, + "epoch": 0.6192695024800842, + "flos": 16537932181440.0, + "grad_norm": 2.8673603498431652, + "language_loss": 0.68171358, + "learning_rate": 1.337249812568732e-06, + "loss": 0.70325315, + "num_input_tokens_seen": 221823330, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.12176514, + "step": 10300, + "time_per_iteration": 2.6887402534484863 + }, + { + "auxiliary_loss_clip": 0.01121026, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.0431906, + "balance_loss_mlp": 1.02189839, + "epoch": 0.6193296257327522, + "flos": 21250431635040.0, + "grad_norm": 2.710477933062779, + "language_loss": 0.67118526, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.69273496, + "num_input_tokens_seen": 221839360, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12023926, + "step": 10301, + "time_per_iteration": 2.697054386138916 + }, + { + "auxiliary_loss_clip": 0.0111671, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.03912067, + "balance_loss_mlp": 1.02172744, + "epoch": 0.6193897489854201, + "flos": 37950569604000.0, + "grad_norm": 1.7563939298073834, + "language_loss": 0.73106468, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.75256586, + "num_input_tokens_seen": 221859465, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.11682129, + "step": 10302, + "time_per_iteration": 2.743309736251831 + }, + { + "auxiliary_loss_clip": 0.01116797, + "auxiliary_loss_mlp": 0.01029283, + "balance_loss_clip": 1.04071951, + "balance_loss_mlp": 1.01628268, + "epoch": 0.6194498722380881, + "flos": 23347388613600.0, + "grad_norm": 5.72983187902126, + "language_loss": 0.80250823, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.82396907, + "num_input_tokens_seen": 221878555, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.13000488, + "step": 10303, + "time_per_iteration": 2.6699328422546387 + }, + { + "auxiliary_loss_clip": 0.01121122, + "auxiliary_loss_mlp": 0.01027564, + "balance_loss_clip": 1.04282665, + "balance_loss_mlp": 1.01455259, + "epoch": 0.619509995490756, + "flos": 26643323550240.0, + "grad_norm": 3.6662490637803002, + "language_loss": 0.77004999, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.79153687, + "num_input_tokens_seen": 221898790, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.13024902, + "step": 10304, + "time_per_iteration": 2.6280131340026855 + }, + { + "auxiliary_loss_clip": 0.01124741, + "auxiliary_loss_mlp": 0.01029993, + "balance_loss_clip": 1.04292965, + "balance_loss_mlp": 1.01767921, + "epoch": 0.619570118743424, + "flos": 29048937232320.0, + "grad_norm": 2.8100321558155765, + "language_loss": 0.77451205, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.79605937, + "num_input_tokens_seen": 221918875, + "router_z_loss_clip": 0.81787109, + "router_z_loss_mlp": 0.12322998, + "step": 10305, + "time_per_iteration": 2.6700327396392822 + }, + { + "auxiliary_loss_clip": 0.01124396, + "auxiliary_loss_mlp": 0.0103359, + "balance_loss_clip": 1.04372954, + "balance_loss_mlp": 1.02004755, + "epoch": 0.619630241996092, + "flos": 25748626429440.0, + "grad_norm": 2.2438255365895703, + "language_loss": 0.78641903, + "learning_rate": 1.335045524968045e-06, + "loss": 0.8079989, + "num_input_tokens_seen": 221937895, + "router_z_loss_clip": 0.80615234, + "router_z_loss_mlp": 0.13562012, + "step": 10306, + "time_per_iteration": 2.607243061065674 + }, + { + "auxiliary_loss_clip": 0.01112228, + "auxiliary_loss_mlp": 0.01027228, + "balance_loss_clip": 1.03910327, + "balance_loss_mlp": 1.01623738, + "epoch": 0.61969036524876, + "flos": 33322090288320.0, + "grad_norm": 1.785891618995182, + "language_loss": 0.8013097, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.8227042, + "num_input_tokens_seen": 221955920, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10992432, + "step": 10307, + "time_per_iteration": 2.750392198562622 + }, + { + "auxiliary_loss_clip": 0.01039364, + "auxiliary_loss_mlp": 0.01005413, + "balance_loss_clip": 1.01567507, + "balance_loss_mlp": 1.00431371, + "epoch": 0.6197504885014279, + "flos": 63020673717120.0, + "grad_norm": 0.8174529406893437, + "language_loss": 0.59454107, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.6149888, + "num_input_tokens_seen": 222011405, + "router_z_loss_clip": 0.23693848, + "router_z_loss_mlp": 0.01099396, + "step": 10308, + "time_per_iteration": 3.2663967609405518 + }, + { + "auxiliary_loss_clip": 0.01115982, + "auxiliary_loss_mlp": 0.01027733, + "balance_loss_clip": 1.04147708, + "balance_loss_mlp": 1.01707613, + "epoch": 0.6198106117540959, + "flos": 37286424571680.0, + "grad_norm": 1.9659187395823952, + "language_loss": 0.68369377, + "learning_rate": 1.333943721384037e-06, + "loss": 0.70513093, + "num_input_tokens_seen": 222034545, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.10656738, + "step": 10309, + "time_per_iteration": 2.788787841796875 + }, + { + "auxiliary_loss_clip": 0.01114847, + "auxiliary_loss_mlp": 0.01034301, + "balance_loss_clip": 1.04003525, + "balance_loss_mlp": 1.02266002, + "epoch": 0.6198707350067638, + "flos": 23074178179680.0, + "grad_norm": 1.7421276572473434, + "language_loss": 0.72398818, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.7454797, + "num_input_tokens_seen": 222052690, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11639404, + "step": 10310, + "time_per_iteration": 2.6970014572143555 + }, + { + "auxiliary_loss_clip": 0.01122358, + "auxiliary_loss_mlp": 0.01032409, + "balance_loss_clip": 1.04509735, + "balance_loss_mlp": 1.01906955, + "epoch": 0.6199308582594318, + "flos": 26153701558560.0, + "grad_norm": 2.881877869946548, + "language_loss": 0.79045069, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.81199837, + "num_input_tokens_seen": 222069095, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.13342285, + "step": 10311, + "time_per_iteration": 2.6242992877960205 + }, + { + "auxiliary_loss_clip": 0.01115972, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.03900051, + "balance_loss_mlp": 1.01951647, + "epoch": 0.6199909815120999, + "flos": 22460389776000.0, + "grad_norm": 2.574431134873477, + "language_loss": 0.71955597, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.74102867, + "num_input_tokens_seen": 222087360, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11791992, + "step": 10312, + "time_per_iteration": 2.75234317779541 + }, + { + "auxiliary_loss_clip": 0.01120498, + "auxiliary_loss_mlp": 0.01032331, + "balance_loss_clip": 1.04267097, + "balance_loss_mlp": 1.02033854, + "epoch": 0.6200511047647678, + "flos": 26194293522720.0, + "grad_norm": 3.6222547947309347, + "language_loss": 0.71828806, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.73981637, + "num_input_tokens_seen": 222106130, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.11999512, + "step": 10313, + "time_per_iteration": 4.154998064041138 + }, + { + "auxiliary_loss_clip": 0.01119939, + "auxiliary_loss_mlp": 0.01031217, + "balance_loss_clip": 1.0414561, + "balance_loss_mlp": 1.01842594, + "epoch": 0.6201112280174358, + "flos": 22227568719840.0, + "grad_norm": 2.3625281599887957, + "language_loss": 0.78546154, + "learning_rate": 1.332107887401416e-06, + "loss": 0.8069731, + "num_input_tokens_seen": 222123125, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12799072, + "step": 10314, + "time_per_iteration": 2.6577229499816895 + }, + { + "auxiliary_loss_clip": 0.01115873, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.03823626, + "balance_loss_mlp": 1.0190208, + "epoch": 0.6201713512701037, + "flos": 24417621707040.0, + "grad_norm": 1.937670122529253, + "language_loss": 0.78230417, + "learning_rate": 1.331740796528812e-06, + "loss": 0.80377007, + "num_input_tokens_seen": 222140655, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.11694336, + "step": 10315, + "time_per_iteration": 3.9681973457336426 + }, + { + "auxiliary_loss_clip": 0.01121227, + "auxiliary_loss_mlp": 0.01035557, + "balance_loss_clip": 1.04325879, + "balance_loss_mlp": 1.02360034, + "epoch": 0.6202314745227717, + "flos": 27440062588800.0, + "grad_norm": 1.8881801936020484, + "language_loss": 0.76035988, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.7819277, + "num_input_tokens_seen": 222160450, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.11956787, + "step": 10316, + "time_per_iteration": 2.6366615295410156 + }, + { + "auxiliary_loss_clip": 0.01118776, + "auxiliary_loss_mlp": 0.01028121, + "balance_loss_clip": 1.03972602, + "balance_loss_mlp": 1.01624799, + "epoch": 0.6202915977754396, + "flos": 32737346424000.0, + "grad_norm": 2.8959831676166323, + "language_loss": 0.77606618, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.79753518, + "num_input_tokens_seen": 222179170, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.11865234, + "step": 10317, + "time_per_iteration": 2.6984848976135254 + }, + { + "auxiliary_loss_clip": 0.01039161, + "auxiliary_loss_mlp": 0.01002483, + "balance_loss_clip": 1.01540804, + "balance_loss_mlp": 1.00136852, + "epoch": 0.6203517210281076, + "flos": 76557916445760.0, + "grad_norm": 0.6905032920373887, + "language_loss": 0.58994555, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61036199, + "num_input_tokens_seen": 222242660, + "router_z_loss_clip": 0.2376709, + "router_z_loss_mlp": 0.0111618, + "step": 10318, + "time_per_iteration": 3.336768627166748 + }, + { + "auxiliary_loss_clip": 0.01121036, + "auxiliary_loss_mlp": 0.0103194, + "balance_loss_clip": 1.04394245, + "balance_loss_mlp": 1.01942885, + "epoch": 0.6204118442807756, + "flos": 28558140239520.0, + "grad_norm": 1.858516207314237, + "language_loss": 0.77732694, + "learning_rate": 1.330272686582143e-06, + "loss": 0.79885662, + "num_input_tokens_seen": 222262170, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12524414, + "step": 10319, + "time_per_iteration": 2.661781072616577 + }, + { + "auxiliary_loss_clip": 0.01117396, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.04176164, + "balance_loss_mlp": 1.02084601, + "epoch": 0.6204719675334436, + "flos": 24640272925920.0, + "grad_norm": 2.7599900215127304, + "language_loss": 0.65915066, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.68064892, + "num_input_tokens_seen": 222280375, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11584473, + "step": 10320, + "time_per_iteration": 2.6472787857055664 + }, + { + "auxiliary_loss_clip": 0.01115086, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.04105008, + "balance_loss_mlp": 1.01682472, + "epoch": 0.6205320907861115, + "flos": 16091535777120.0, + "grad_norm": 1.8773348919244421, + "language_loss": 0.75875843, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.78018963, + "num_input_tokens_seen": 222297325, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11218262, + "step": 10321, + "time_per_iteration": 2.593662977218628 + }, + { + "auxiliary_loss_clip": 0.01113888, + "auxiliary_loss_mlp": 0.01024856, + "balance_loss_clip": 1.03964472, + "balance_loss_mlp": 1.01358509, + "epoch": 0.6205922140387795, + "flos": 25219546957440.0, + "grad_norm": 2.065946217869035, + "language_loss": 0.73689926, + "learning_rate": 1.329171870732758e-06, + "loss": 0.75828665, + "num_input_tokens_seen": 222317095, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11279297, + "step": 10322, + "time_per_iteration": 4.160737752914429 + }, + { + "auxiliary_loss_clip": 0.01116138, + "auxiliary_loss_mlp": 0.01023293, + "balance_loss_clip": 1.04161072, + "balance_loss_mlp": 1.0122242, + "epoch": 0.6206523372914474, + "flos": 29136077200800.0, + "grad_norm": 2.4730908861904064, + "language_loss": 0.72912073, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.75051504, + "num_input_tokens_seen": 222337055, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11065674, + "step": 10323, + "time_per_iteration": 2.6331076622009277 + }, + { + "auxiliary_loss_clip": 0.0112618, + "auxiliary_loss_mlp": 0.01028163, + "balance_loss_clip": 1.04511476, + "balance_loss_mlp": 1.01614094, + "epoch": 0.6207124605441154, + "flos": 16358263446240.0, + "grad_norm": 2.6440631534961327, + "language_loss": 0.59190261, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.613446, + "num_input_tokens_seen": 222354515, + "router_z_loss_clip": 0.80957031, + "router_z_loss_mlp": 0.12030029, + "step": 10324, + "time_per_iteration": 2.6260876655578613 + }, + { + "auxiliary_loss_clip": 0.01121037, + "auxiliary_loss_mlp": 0.0103385, + "balance_loss_clip": 1.0431881, + "balance_loss_mlp": 1.02146459, + "epoch": 0.6207725837967835, + "flos": 23081633359200.0, + "grad_norm": 2.5938148601351934, + "language_loss": 0.76762748, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.78917634, + "num_input_tokens_seen": 222372755, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12384033, + "step": 10325, + "time_per_iteration": 2.631039619445801 + }, + { + "auxiliary_loss_clip": 0.01119998, + "auxiliary_loss_mlp": 0.01026133, + "balance_loss_clip": 1.04090083, + "balance_loss_mlp": 1.01387799, + "epoch": 0.6208327070494514, + "flos": 29261661717600.0, + "grad_norm": 1.9782816072225617, + "language_loss": 0.72452027, + "learning_rate": 1.327704472462003e-06, + "loss": 0.74598157, + "num_input_tokens_seen": 222391380, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.1227417, + "step": 10326, + "time_per_iteration": 4.124832391738892 + }, + { + "auxiliary_loss_clip": 0.01121564, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.04206979, + "balance_loss_mlp": 1.02249062, + "epoch": 0.6208928303021194, + "flos": 27845664442560.0, + "grad_norm": 2.308770852155016, + "language_loss": 0.74008554, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.76164854, + "num_input_tokens_seen": 222411165, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12231445, + "step": 10327, + "time_per_iteration": 2.72115159034729 + }, + { + "auxiliary_loss_clip": 0.01121474, + "auxiliary_loss_mlp": 0.01032201, + "balance_loss_clip": 1.04237652, + "balance_loss_mlp": 1.01909351, + "epoch": 0.6209529535547873, + "flos": 21432207268800.0, + "grad_norm": 2.1926291909769984, + "language_loss": 0.79176354, + "learning_rate": 1.326970926232066e-06, + "loss": 0.81330025, + "num_input_tokens_seen": 222428110, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13098145, + "step": 10328, + "time_per_iteration": 2.611830711364746 + }, + { + "auxiliary_loss_clip": 0.0111903, + "auxiliary_loss_mlp": 0.01038102, + "balance_loss_clip": 1.04119647, + "balance_loss_mlp": 1.02596688, + "epoch": 0.6210130768074553, + "flos": 26859613556160.0, + "grad_norm": 2.1774452367008372, + "language_loss": 0.77884871, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.80042005, + "num_input_tokens_seen": 222446385, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12139893, + "step": 10329, + "time_per_iteration": 2.725131034851074 + }, + { + "auxiliary_loss_clip": 0.01037575, + "auxiliary_loss_mlp": 0.01002394, + "balance_loss_clip": 1.01383376, + "balance_loss_mlp": 1.00129926, + "epoch": 0.6210732000601232, + "flos": 77699204147520.0, + "grad_norm": 0.8130944235970078, + "language_loss": 0.62130892, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64170861, + "num_input_tokens_seen": 222502150, + "router_z_loss_clip": 0.23718262, + "router_z_loss_mlp": 0.01095581, + "step": 10330, + "time_per_iteration": 3.2462406158447266 + }, + { + "auxiliary_loss_clip": 0.0112303, + "auxiliary_loss_mlp": 0.01034553, + "balance_loss_clip": 1.04191673, + "balance_loss_mlp": 1.02184486, + "epoch": 0.6211333233127913, + "flos": 29582068432320.0, + "grad_norm": 2.6542028859365474, + "language_loss": 0.78028917, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.80186498, + "num_input_tokens_seen": 222519880, + "router_z_loss_clip": 0.81103516, + "router_z_loss_mlp": 0.12713623, + "step": 10331, + "time_per_iteration": 2.6781673431396484 + }, + { + "auxiliary_loss_clip": 0.01120914, + "auxiliary_loss_mlp": 0.01033498, + "balance_loss_clip": 1.04192185, + "balance_loss_mlp": 1.02148747, + "epoch": 0.6211934465654592, + "flos": 20674804158720.0, + "grad_norm": 3.141480407853954, + "language_loss": 0.6736846, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.6952287, + "num_input_tokens_seen": 222538545, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12005615, + "step": 10332, + "time_per_iteration": 2.7431867122650146 + }, + { + "auxiliary_loss_clip": 0.01118059, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.04162657, + "balance_loss_mlp": 1.01845002, + "epoch": 0.6212535698181272, + "flos": 18628933119840.0, + "grad_norm": 1.4649225251777704, + "language_loss": 0.7636686, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.78514934, + "num_input_tokens_seen": 222556935, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11566162, + "step": 10333, + "time_per_iteration": 2.600783586502075 + }, + { + "auxiliary_loss_clip": 0.01115861, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.04104924, + "balance_loss_mlp": 1.01715589, + "epoch": 0.6213136930707951, + "flos": 16403758001280.0, + "grad_norm": 2.8401151359201933, + "language_loss": 0.6992532, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.72069919, + "num_input_tokens_seen": 222574035, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11578369, + "step": 10334, + "time_per_iteration": 2.621762990951538 + }, + { + "auxiliary_loss_clip": 0.01118254, + "auxiliary_loss_mlp": 0.01029695, + "balance_loss_clip": 1.04281592, + "balance_loss_mlp": 1.01875138, + "epoch": 0.6213738163234631, + "flos": 22099391097120.0, + "grad_norm": 2.104726505753157, + "language_loss": 0.70030057, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.72178006, + "num_input_tokens_seen": 222592290, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.10949707, + "step": 10335, + "time_per_iteration": 2.6053643226623535 + }, + { + "auxiliary_loss_clip": 0.01114932, + "auxiliary_loss_mlp": 0.01032335, + "balance_loss_clip": 1.04065895, + "balance_loss_mlp": 1.02027154, + "epoch": 0.621433939576131, + "flos": 30917084365440.0, + "grad_norm": 1.9843952742827562, + "language_loss": 0.80330497, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.8247776, + "num_input_tokens_seen": 222612805, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12072754, + "step": 10336, + "time_per_iteration": 2.6954638957977295 + }, + { + "auxiliary_loss_clip": 0.01116174, + "auxiliary_loss_mlp": 0.01031144, + "balance_loss_clip": 1.04205012, + "balance_loss_mlp": 1.0198015, + "epoch": 0.621494062828799, + "flos": 27534698254080.0, + "grad_norm": 2.064751979136123, + "language_loss": 0.73330593, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.7547791, + "num_input_tokens_seen": 222632260, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11340332, + "step": 10337, + "time_per_iteration": 2.6294169425964355 + }, + { + "auxiliary_loss_clip": 0.01121645, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.04267836, + "balance_loss_mlp": 1.02304125, + "epoch": 0.621554186081467, + "flos": 33455940330240.0, + "grad_norm": 2.41661234773059, + "language_loss": 0.63677502, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.65834814, + "num_input_tokens_seen": 222653570, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12646484, + "step": 10338, + "time_per_iteration": 2.687480926513672 + }, + { + "auxiliary_loss_clip": 0.01117864, + "auxiliary_loss_mlp": 0.01029017, + "balance_loss_clip": 1.04220748, + "balance_loss_mlp": 1.0167923, + "epoch": 0.621614309334135, + "flos": 27267686964000.0, + "grad_norm": 1.957922863412355, + "language_loss": 0.71825373, + "learning_rate": 1.322938249724991e-06, + "loss": 0.73972261, + "num_input_tokens_seen": 222672480, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12219238, + "step": 10339, + "time_per_iteration": 2.6200199127197266 + }, + { + "auxiliary_loss_clip": 0.01115293, + "auxiliary_loss_mlp": 0.01030335, + "balance_loss_clip": 1.04118276, + "balance_loss_mlp": 1.0184083, + "epoch": 0.621674432586803, + "flos": 23526895279680.0, + "grad_norm": 1.6837300492944705, + "language_loss": 0.6969164, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.7183727, + "num_input_tokens_seen": 222691200, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11932373, + "step": 10340, + "time_per_iteration": 2.6385598182678223 + }, + { + "auxiliary_loss_clip": 0.0111366, + "auxiliary_loss_mlp": 0.01024596, + "balance_loss_clip": 1.03947735, + "balance_loss_mlp": 1.01368201, + "epoch": 0.6217345558394709, + "flos": 26368695011520.0, + "grad_norm": 2.044568088099489, + "language_loss": 0.68783516, + "learning_rate": 1.322205369037788e-06, + "loss": 0.70921773, + "num_input_tokens_seen": 222709975, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.10925293, + "step": 10341, + "time_per_iteration": 2.6434547901153564 + }, + { + "auxiliary_loss_clip": 0.01120467, + "auxiliary_loss_mlp": 0.01030556, + "balance_loss_clip": 1.04257452, + "balance_loss_mlp": 1.01800907, + "epoch": 0.6217946790921389, + "flos": 21968458299360.0, + "grad_norm": 2.686394820048792, + "language_loss": 0.80921274, + "learning_rate": 1.321838967240299e-06, + "loss": 0.83072299, + "num_input_tokens_seen": 222729005, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12567139, + "step": 10342, + "time_per_iteration": 2.6680147647857666 + }, + { + "auxiliary_loss_clip": 0.01037762, + "auxiliary_loss_mlp": 0.00999918, + "balance_loss_clip": 1.0138768, + "balance_loss_mlp": 0.99872983, + "epoch": 0.6218548023448068, + "flos": 75619710116640.0, + "grad_norm": 0.7785852590836205, + "language_loss": 0.57317168, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59354848, + "num_input_tokens_seen": 222786090, + "router_z_loss_clip": 0.23876953, + "router_z_loss_mlp": 0.01187134, + "step": 10343, + "time_per_iteration": 3.215669870376587 + }, + { + "auxiliary_loss_clip": 0.01111794, + "auxiliary_loss_mlp": 0.01028386, + "balance_loss_clip": 1.03917623, + "balance_loss_mlp": 1.01756144, + "epoch": 0.6219149255974749, + "flos": 31407354633600.0, + "grad_norm": 3.9497178968402746, + "language_loss": 0.72776026, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.74916208, + "num_input_tokens_seen": 222806100, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10821533, + "step": 10344, + "time_per_iteration": 2.6879422664642334 + }, + { + "auxiliary_loss_clip": 0.01118873, + "auxiliary_loss_mlp": 0.01037888, + "balance_loss_clip": 1.04211664, + "balance_loss_mlp": 1.02671218, + "epoch": 0.6219750488501428, + "flos": 31006979508960.0, + "grad_norm": 3.231214486747169, + "language_loss": 0.60416228, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.62572992, + "num_input_tokens_seen": 222826575, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11181641, + "step": 10345, + "time_per_iteration": 2.727444648742676 + }, + { + "auxiliary_loss_clip": 0.01117616, + "auxiliary_loss_mlp": 0.01034131, + "balance_loss_clip": 1.04051995, + "balance_loss_mlp": 1.02207899, + "epoch": 0.6220351721028108, + "flos": 24461819709120.0, + "grad_norm": 3.0610363420053552, + "language_loss": 0.77896327, + "learning_rate": 1.320373617348614e-06, + "loss": 0.80048078, + "num_input_tokens_seen": 222845285, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12060547, + "step": 10346, + "time_per_iteration": 2.6143441200256348 + }, + { + "auxiliary_loss_clip": 0.01121091, + "auxiliary_loss_mlp": 0.01035923, + "balance_loss_clip": 1.04144692, + "balance_loss_mlp": 1.02268505, + "epoch": 0.6220952953554787, + "flos": 33542107884000.0, + "grad_norm": 1.845901604121337, + "language_loss": 0.7139098, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.73547995, + "num_input_tokens_seen": 222864575, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.13220215, + "step": 10347, + "time_per_iteration": 2.719193935394287 + }, + { + "auxiliary_loss_clip": 0.01113177, + "auxiliary_loss_mlp": 0.0102646, + "balance_loss_clip": 1.03825951, + "balance_loss_mlp": 1.01472962, + "epoch": 0.6221554186081467, + "flos": 23437121688000.0, + "grad_norm": 1.943035450515487, + "language_loss": 0.71823251, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.73962891, + "num_input_tokens_seen": 222884420, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11737061, + "step": 10348, + "time_per_iteration": 2.618285894393921 + }, + { + "auxiliary_loss_clip": 0.01037154, + "auxiliary_loss_mlp": 0.00999367, + "balance_loss_clip": 1.01324952, + "balance_loss_mlp": 0.9981823, + "epoch": 0.6222155418608146, + "flos": 76813015655520.0, + "grad_norm": 0.8116770468659277, + "language_loss": 0.54127347, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56163865, + "num_input_tokens_seen": 222944690, + "router_z_loss_clip": 0.23913574, + "router_z_loss_mlp": 0.01183319, + "step": 10349, + "time_per_iteration": 3.2361135482788086 + }, + { + "auxiliary_loss_clip": 0.01117394, + "auxiliary_loss_mlp": 0.01027993, + "balance_loss_clip": 1.04089046, + "balance_loss_mlp": 1.016155, + "epoch": 0.6222756651134826, + "flos": 27578085910560.0, + "grad_norm": 2.220973986400832, + "language_loss": 0.70039248, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.72184634, + "num_input_tokens_seen": 222962990, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.1184082, + "step": 10350, + "time_per_iteration": 2.6220765113830566 + }, + { + "auxiliary_loss_clip": 0.01118642, + "auxiliary_loss_mlp": 0.01030424, + "balance_loss_clip": 1.04133511, + "balance_loss_mlp": 1.01837158, + "epoch": 0.6223357883661506, + "flos": 25797808056960.0, + "grad_norm": 2.003697714758519, + "language_loss": 0.56853962, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.59003031, + "num_input_tokens_seen": 222980715, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12042236, + "step": 10351, + "time_per_iteration": 2.6956064701080322 + }, + { + "auxiliary_loss_clip": 0.01036198, + "auxiliary_loss_mlp": 0.01001373, + "balance_loss_clip": 1.01216137, + "balance_loss_mlp": 1.0001291, + "epoch": 0.6223959116188186, + "flos": 77805427754880.0, + "grad_norm": 0.8095901212967133, + "language_loss": 0.61106586, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63144153, + "num_input_tokens_seen": 223040685, + "router_z_loss_clip": 0.2401123, + "router_z_loss_mlp": 0.01242828, + "step": 10352, + "time_per_iteration": 3.219921588897705 + }, + { + "auxiliary_loss_clip": 0.01114089, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.03986108, + "balance_loss_mlp": 1.02078271, + "epoch": 0.6224560348714866, + "flos": 27534819805920.0, + "grad_norm": 3.59412812766918, + "language_loss": 0.81886363, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.84033281, + "num_input_tokens_seen": 223059000, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12042236, + "step": 10353, + "time_per_iteration": 4.113842248916626 + }, + { + "auxiliary_loss_clip": 0.01110811, + "auxiliary_loss_mlp": 0.01032442, + "balance_loss_clip": 1.03877199, + "balance_loss_mlp": 1.02164125, + "epoch": 0.6225161581241545, + "flos": 29404344526560.0, + "grad_norm": 1.6778802051634545, + "language_loss": 0.75842941, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.77986193, + "num_input_tokens_seen": 223079345, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10803223, + "step": 10354, + "time_per_iteration": 2.827578067779541 + }, + { + "auxiliary_loss_clip": 0.01115238, + "auxiliary_loss_mlp": 0.01026313, + "balance_loss_clip": 1.04016566, + "balance_loss_mlp": 1.01449311, + "epoch": 0.6225762813768225, + "flos": 24945688247040.0, + "grad_norm": 1.7127919331130703, + "language_loss": 0.78615403, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.80756962, + "num_input_tokens_seen": 223097880, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11828613, + "step": 10355, + "time_per_iteration": 4.079005479812622 + }, + { + "auxiliary_loss_clip": 0.01117495, + "auxiliary_loss_mlp": 0.01030026, + "balance_loss_clip": 1.04201663, + "balance_loss_mlp": 1.01835585, + "epoch": 0.6226364046294904, + "flos": 33187754039040.0, + "grad_norm": 2.101319387377807, + "language_loss": 0.78485274, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.80632794, + "num_input_tokens_seen": 223118185, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11682129, + "step": 10356, + "time_per_iteration": 2.661761522293091 + }, + { + "auxiliary_loss_clip": 0.01122684, + "auxiliary_loss_mlp": 0.01031997, + "balance_loss_clip": 1.04230237, + "balance_loss_mlp": 1.01937902, + "epoch": 0.6226965278821585, + "flos": 24948240835680.0, + "grad_norm": 1.9632037987245068, + "language_loss": 0.67704129, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.69858813, + "num_input_tokens_seen": 223137600, + "router_z_loss_clip": 0.80322266, + "router_z_loss_mlp": 0.12609863, + "step": 10357, + "time_per_iteration": 2.6348724365234375 + }, + { + "auxiliary_loss_clip": 0.01122964, + "auxiliary_loss_mlp": 0.01030736, + "balance_loss_clip": 1.04324651, + "balance_loss_mlp": 1.017349, + "epoch": 0.6227566511348264, + "flos": 27043212467520.0, + "grad_norm": 3.05230934769533, + "language_loss": 0.76074404, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.78228104, + "num_input_tokens_seen": 223154360, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.1338501, + "step": 10358, + "time_per_iteration": 2.6415467262268066 + }, + { + "auxiliary_loss_clip": 0.01116341, + "auxiliary_loss_mlp": 0.01027858, + "balance_loss_clip": 1.0387826, + "balance_loss_mlp": 1.0158242, + "epoch": 0.6228167743874944, + "flos": 22013304577920.0, + "grad_norm": 2.2087974128576673, + "language_loss": 0.81790656, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.83934855, + "num_input_tokens_seen": 223172255, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12042236, + "step": 10359, + "time_per_iteration": 2.648249387741089 + }, + { + "auxiliary_loss_clip": 0.01113175, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.03904438, + "balance_loss_mlp": 1.02548552, + "epoch": 0.6228768976401623, + "flos": 21650199000480.0, + "grad_norm": 3.6429390176026204, + "language_loss": 0.73365271, + "learning_rate": 1.315248145768822e-06, + "loss": 0.75516188, + "num_input_tokens_seen": 223186965, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.1227417, + "step": 10360, + "time_per_iteration": 2.568629503250122 + }, + { + "auxiliary_loss_clip": 0.0111675, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.04084778, + "balance_loss_mlp": 1.02357674, + "epoch": 0.6229370208928303, + "flos": 21835216016640.0, + "grad_norm": 2.4821744046284406, + "language_loss": 0.77325726, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.79478103, + "num_input_tokens_seen": 223206045, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.1204834, + "step": 10361, + "time_per_iteration": 2.620271921157837 + }, + { + "auxiliary_loss_clip": 0.011168, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.04148507, + "balance_loss_mlp": 1.01859188, + "epoch": 0.6229971441454982, + "flos": 21167505463680.0, + "grad_norm": 2.2730853752772933, + "language_loss": 0.6741119, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.69557738, + "num_input_tokens_seen": 223224820, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.1116333, + "step": 10362, + "time_per_iteration": 4.052792310714722 + }, + { + "auxiliary_loss_clip": 0.01117327, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.0410701, + "balance_loss_mlp": 1.01677203, + "epoch": 0.6230572673981662, + "flos": 35681074931520.0, + "grad_norm": 2.0443427628100386, + "language_loss": 0.67960316, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.7010684, + "num_input_tokens_seen": 223243205, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12432861, + "step": 10363, + "time_per_iteration": 2.684081792831421 + }, + { + "auxiliary_loss_clip": 0.01119073, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.04075003, + "balance_loss_mlp": 1.01694918, + "epoch": 0.6231173906508342, + "flos": 19920642431040.0, + "grad_norm": 2.173671706047159, + "language_loss": 0.86190975, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.88339525, + "num_input_tokens_seen": 223261370, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12536621, + "step": 10364, + "time_per_iteration": 2.6040802001953125 + }, + { + "auxiliary_loss_clip": 0.01037976, + "auxiliary_loss_mlp": 0.01002922, + "balance_loss_clip": 1.01385677, + "balance_loss_mlp": 1.00167918, + "epoch": 0.6231775139035022, + "flos": 83832198294240.0, + "grad_norm": 0.8961000708139683, + "language_loss": 0.60718077, + "learning_rate": 1.313418851605015e-06, + "loss": 0.62758982, + "num_input_tokens_seen": 223315050, + "router_z_loss_clip": 0.24133301, + "router_z_loss_mlp": 0.01242065, + "step": 10365, + "time_per_iteration": 3.2774837017059326 + }, + { + "auxiliary_loss_clip": 0.01123156, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.04235995, + "balance_loss_mlp": 1.02260041, + "epoch": 0.6232376371561702, + "flos": 24186299790240.0, + "grad_norm": 2.0546049592062694, + "language_loss": 0.753407, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.77500451, + "num_input_tokens_seen": 223332130, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13989258, + "step": 10366, + "time_per_iteration": 4.059491872787476 + }, + { + "auxiliary_loss_clip": 0.01120459, + "auxiliary_loss_mlp": 0.01038145, + "balance_loss_clip": 1.04237485, + "balance_loss_mlp": 1.02590179, + "epoch": 0.6232977604088381, + "flos": 28379970643680.0, + "grad_norm": 2.26439838302543, + "language_loss": 0.76472324, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.78630924, + "num_input_tokens_seen": 223351605, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12255859, + "step": 10367, + "time_per_iteration": 2.795848846435547 + }, + { + "auxiliary_loss_clip": 0.01116824, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.04299271, + "balance_loss_mlp": 1.02192974, + "epoch": 0.6233578836615061, + "flos": 25754055744960.0, + "grad_norm": 1.5541903904993626, + "language_loss": 0.78373367, + "learning_rate": 1.312321587418457e-06, + "loss": 0.80523789, + "num_input_tokens_seen": 223372090, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11676025, + "step": 10368, + "time_per_iteration": 2.6263134479522705 + }, + { + "auxiliary_loss_clip": 0.01122102, + "auxiliary_loss_mlp": 0.01031078, + "balance_loss_clip": 1.04356062, + "balance_loss_mlp": 1.01863861, + "epoch": 0.623418006914174, + "flos": 29003685780960.0, + "grad_norm": 2.0765614222002067, + "language_loss": 0.68218398, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.7037158, + "num_input_tokens_seen": 223390110, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12445068, + "step": 10369, + "time_per_iteration": 2.685070753097534 + }, + { + "auxiliary_loss_clip": 0.01118502, + "auxiliary_loss_mlp": 0.01035963, + "balance_loss_clip": 1.04153574, + "balance_loss_mlp": 1.02333891, + "epoch": 0.6234781301668421, + "flos": 21830191873920.0, + "grad_norm": 2.1535324804854596, + "language_loss": 0.87869048, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.90023518, + "num_input_tokens_seen": 223404205, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12628174, + "step": 10370, + "time_per_iteration": 2.5883960723876953 + }, + { + "auxiliary_loss_clip": 0.01115027, + "auxiliary_loss_mlp": 0.01026872, + "balance_loss_clip": 1.03957891, + "balance_loss_mlp": 1.01513016, + "epoch": 0.62353825341951, + "flos": 31939959108960.0, + "grad_norm": 1.6830737970685767, + "language_loss": 0.65927386, + "learning_rate": 1.311224557923402e-06, + "loss": 0.68069279, + "num_input_tokens_seen": 223424855, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11743164, + "step": 10371, + "time_per_iteration": 2.696931838989258 + }, + { + "auxiliary_loss_clip": 0.0111009, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.03798747, + "balance_loss_mlp": 1.01830792, + "epoch": 0.623598376672178, + "flos": 37997684850240.0, + "grad_norm": 1.375490728550307, + "language_loss": 0.7772842, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.79867518, + "num_input_tokens_seen": 223447225, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10693359, + "step": 10372, + "time_per_iteration": 2.7864339351654053 + }, + { + "auxiliary_loss_clip": 0.01117985, + "auxiliary_loss_mlp": 0.01032932, + "balance_loss_clip": 1.04008555, + "balance_loss_mlp": 1.02067733, + "epoch": 0.6236584999248459, + "flos": 28956084327360.0, + "grad_norm": 3.508634552172886, + "language_loss": 0.77354968, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.79505885, + "num_input_tokens_seen": 223467520, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12255859, + "step": 10373, + "time_per_iteration": 2.650606393814087 + }, + { + "auxiliary_loss_clip": 0.01113879, + "auxiliary_loss_mlp": 0.01024785, + "balance_loss_clip": 1.04078126, + "balance_loss_mlp": 1.01313221, + "epoch": 0.6237186231775139, + "flos": 26555008580640.0, + "grad_norm": 1.6161330940085055, + "language_loss": 0.69857275, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.71995938, + "num_input_tokens_seen": 223488130, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11645508, + "step": 10374, + "time_per_iteration": 2.643242835998535 + }, + { + "auxiliary_loss_clip": 0.01119938, + "auxiliary_loss_mlp": 0.01031318, + "balance_loss_clip": 1.04238081, + "balance_loss_mlp": 1.01950979, + "epoch": 0.6237787464301818, + "flos": 18228760581600.0, + "grad_norm": 6.102974184651829, + "language_loss": 0.7707305, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.792243, + "num_input_tokens_seen": 223505105, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.11816406, + "step": 10375, + "time_per_iteration": 2.6376380920410156 + }, + { + "auxiliary_loss_clip": 0.01116747, + "auxiliary_loss_mlp": 0.01026225, + "balance_loss_clip": 1.04231548, + "balance_loss_mlp": 1.01466811, + "epoch": 0.6238388696828499, + "flos": 43428494589120.0, + "grad_norm": 1.4917410867615608, + "language_loss": 0.70055187, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.72198159, + "num_input_tokens_seen": 223528065, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11553955, + "step": 10376, + "time_per_iteration": 2.805171489715576 + }, + { + "auxiliary_loss_clip": 0.01119814, + "auxiliary_loss_mlp": 0.01031297, + "balance_loss_clip": 1.04188478, + "balance_loss_mlp": 1.01841044, + "epoch": 0.6238989929355178, + "flos": 28825637736960.0, + "grad_norm": 1.7440401758509596, + "language_loss": 0.76532125, + "learning_rate": 1.309031204505301e-06, + "loss": 0.78683233, + "num_input_tokens_seen": 223547305, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12896729, + "step": 10377, + "time_per_iteration": 2.6404709815979004 + }, + { + "auxiliary_loss_clip": 0.01119132, + "auxiliary_loss_mlp": 0.01026847, + "balance_loss_clip": 1.04153883, + "balance_loss_mlp": 1.01576614, + "epoch": 0.6239591161881858, + "flos": 26951575080960.0, + "grad_norm": 1.9237699339513283, + "language_loss": 0.68132234, + "learning_rate": 1.308665737227052e-06, + "loss": 0.70278215, + "num_input_tokens_seen": 223567205, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.11083984, + "step": 10378, + "time_per_iteration": 2.653096914291382 + }, + { + "auxiliary_loss_clip": 0.01117375, + "auxiliary_loss_mlp": 0.01033336, + "balance_loss_clip": 1.04124832, + "balance_loss_mlp": 1.0212605, + "epoch": 0.6240192394408538, + "flos": 29939015383200.0, + "grad_norm": 2.4136074071531697, + "language_loss": 0.76182258, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.78332973, + "num_input_tokens_seen": 223586560, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12084961, + "step": 10379, + "time_per_iteration": 2.643841028213501 + }, + { + "auxiliary_loss_clip": 0.01114969, + "auxiliary_loss_mlp": 0.01031361, + "balance_loss_clip": 1.03975308, + "balance_loss_mlp": 1.01914227, + "epoch": 0.6240793626935217, + "flos": 34082815815360.0, + "grad_norm": 3.870971688443079, + "language_loss": 0.79619056, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.81765383, + "num_input_tokens_seen": 223610595, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12219238, + "step": 10380, + "time_per_iteration": 2.7635586261749268 + }, + { + "auxiliary_loss_clip": 0.01115179, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.04129219, + "balance_loss_mlp": 1.02088654, + "epoch": 0.6241394859461897, + "flos": 27933736308480.0, + "grad_norm": 1.5681885587171616, + "language_loss": 0.79875863, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.82023102, + "num_input_tokens_seen": 223630230, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11175537, + "step": 10381, + "time_per_iteration": 2.609360456466675 + }, + { + "auxiliary_loss_clip": 0.01117793, + "auxiliary_loss_mlp": 0.0102874, + "balance_loss_clip": 1.04083216, + "balance_loss_mlp": 1.01724803, + "epoch": 0.6241996091988576, + "flos": 15557877852480.0, + "grad_norm": 2.040519585764275, + "language_loss": 0.74038529, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.76185066, + "num_input_tokens_seen": 223648360, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.1149292, + "step": 10382, + "time_per_iteration": 2.6405141353607178 + }, + { + "auxiliary_loss_clip": 0.01114098, + "auxiliary_loss_mlp": 0.0102615, + "balance_loss_clip": 1.04000187, + "balance_loss_mlp": 1.01497447, + "epoch": 0.6242597324515257, + "flos": 31543959850560.0, + "grad_norm": 1.5923788412368691, + "language_loss": 0.78648156, + "learning_rate": 1.306838794344911e-06, + "loss": 0.80788398, + "num_input_tokens_seen": 223671255, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11175537, + "step": 10383, + "time_per_iteration": 2.678882598876953 + }, + { + "auxiliary_loss_clip": 0.01115236, + "auxiliary_loss_mlp": 0.01026052, + "balance_loss_clip": 1.04030752, + "balance_loss_mlp": 1.01523972, + "epoch": 0.6243198557041936, + "flos": 24328010184480.0, + "grad_norm": 1.8698605412104186, + "language_loss": 0.75364506, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.77505791, + "num_input_tokens_seen": 223689860, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.10809326, + "step": 10384, + "time_per_iteration": 2.6503262519836426 + }, + { + "auxiliary_loss_clip": 0.01119811, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.04143238, + "balance_loss_mlp": 1.02148676, + "epoch": 0.6243799789568616, + "flos": 22459052705760.0, + "grad_norm": 2.0473618356546317, + "language_loss": 0.66698396, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.68852007, + "num_input_tokens_seen": 223707835, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12316895, + "step": 10385, + "time_per_iteration": 2.6190152168273926 + }, + { + "auxiliary_loss_clip": 0.01037922, + "auxiliary_loss_mlp": 0.01001914, + "balance_loss_clip": 1.01390362, + "balance_loss_mlp": 1.00076318, + "epoch": 0.6244401022095295, + "flos": 80566286662080.0, + "grad_norm": 0.7548505091547077, + "language_loss": 0.62004799, + "learning_rate": 1.305742943921692e-06, + "loss": 0.64044631, + "num_input_tokens_seen": 223771875, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.01150513, + "step": 10386, + "time_per_iteration": 3.2752156257629395 + }, + { + "auxiliary_loss_clip": 0.01117415, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.03987336, + "balance_loss_mlp": 1.01930237, + "epoch": 0.6245002254621975, + "flos": 29982240970560.0, + "grad_norm": 6.298730571720408, + "language_loss": 0.72235197, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.74384022, + "num_input_tokens_seen": 223788895, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12121582, + "step": 10387, + "time_per_iteration": 2.7122554779052734 + }, + { + "auxiliary_loss_clip": 0.0112299, + "auxiliary_loss_mlp": 0.01040237, + "balance_loss_clip": 1.04169977, + "balance_loss_mlp": 1.02729726, + "epoch": 0.6245603487148654, + "flos": 35592516858240.0, + "grad_norm": 3.0810549464141586, + "language_loss": 0.65372837, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.67536068, + "num_input_tokens_seen": 223810385, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.12921143, + "step": 10388, + "time_per_iteration": 2.6649973392486572 + }, + { + "auxiliary_loss_clip": 0.01117287, + "auxiliary_loss_mlp": 0.01028573, + "balance_loss_clip": 1.04031396, + "balance_loss_mlp": 1.01762342, + "epoch": 0.6246204719675335, + "flos": 18046782361440.0, + "grad_norm": 1.7671585012561462, + "language_loss": 0.79047304, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.81193161, + "num_input_tokens_seen": 223826040, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.10955811, + "step": 10389, + "time_per_iteration": 2.6464529037475586 + }, + { + "auxiliary_loss_clip": 0.01114917, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.03983736, + "balance_loss_mlp": 1.02078271, + "epoch": 0.6246805952202014, + "flos": 15244156488960.0, + "grad_norm": 2.291739973909343, + "language_loss": 0.603302, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.62477624, + "num_input_tokens_seen": 223842300, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11737061, + "step": 10390, + "time_per_iteration": 2.5902462005615234 + }, + { + "auxiliary_loss_clip": 0.01118857, + "auxiliary_loss_mlp": 0.01033374, + "balance_loss_clip": 1.04001975, + "balance_loss_mlp": 1.0213877, + "epoch": 0.6247407184728694, + "flos": 15601751716320.0, + "grad_norm": 2.207211729321308, + "language_loss": 0.77167034, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.79319263, + "num_input_tokens_seen": 223858320, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.11981201, + "step": 10391, + "time_per_iteration": 2.6671276092529297 + }, + { + "auxiliary_loss_clip": 0.01120866, + "auxiliary_loss_mlp": 0.01032581, + "balance_loss_clip": 1.04311943, + "balance_loss_mlp": 1.02024293, + "epoch": 0.6248008417255374, + "flos": 49576966336800.0, + "grad_norm": 1.6000355384842042, + "language_loss": 0.64517552, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.66671002, + "num_input_tokens_seen": 223883545, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12347412, + "step": 10392, + "time_per_iteration": 4.266865491867065 + }, + { + "auxiliary_loss_clip": 0.0112225, + "auxiliary_loss_mlp": 0.01032489, + "balance_loss_clip": 1.04351854, + "balance_loss_mlp": 1.0204246, + "epoch": 0.6248609649782053, + "flos": 24284379424320.0, + "grad_norm": 2.589717462215586, + "language_loss": 0.76579404, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.78734136, + "num_input_tokens_seen": 223901445, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12078857, + "step": 10393, + "time_per_iteration": 2.595926523208618 + }, + { + "auxiliary_loss_clip": 0.01121392, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.04251325, + "balance_loss_mlp": 1.02202177, + "epoch": 0.6249210882308733, + "flos": 24016841409600.0, + "grad_norm": 1.8313441653008455, + "language_loss": 0.82237226, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.84393191, + "num_input_tokens_seen": 223920170, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12542725, + "step": 10394, + "time_per_iteration": 2.691876173019409 + }, + { + "auxiliary_loss_clip": 0.01121179, + "auxiliary_loss_mlp": 0.01037131, + "balance_loss_clip": 1.04204142, + "balance_loss_mlp": 1.02423286, + "epoch": 0.6249812114835412, + "flos": 17071022864160.0, + "grad_norm": 2.56546564760643, + "language_loss": 0.75032365, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.77190679, + "num_input_tokens_seen": 223936495, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12896729, + "step": 10395, + "time_per_iteration": 3.925764322280884 + }, + { + "auxiliary_loss_clip": 0.01119222, + "auxiliary_loss_mlp": 0.01029334, + "balance_loss_clip": 1.04015374, + "balance_loss_mlp": 1.01697171, + "epoch": 0.6250413347362093, + "flos": 17729860132800.0, + "grad_norm": 2.931457553083682, + "language_loss": 0.72822869, + "learning_rate": 1.302091822487119e-06, + "loss": 0.74971426, + "num_input_tokens_seen": 223950070, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12365723, + "step": 10396, + "time_per_iteration": 2.6070401668548584 + }, + { + "auxiliary_loss_clip": 0.01117824, + "auxiliary_loss_mlp": 0.01032158, + "balance_loss_clip": 1.04134798, + "balance_loss_mlp": 1.02049899, + "epoch": 0.6251014579888772, + "flos": 28019984896800.0, + "grad_norm": 4.637704345840577, + "language_loss": 0.75809109, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.77959096, + "num_input_tokens_seen": 223970065, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11657715, + "step": 10397, + "time_per_iteration": 2.6857659816741943 + }, + { + "auxiliary_loss_clip": 0.01117826, + "auxiliary_loss_mlp": 0.01033599, + "balance_loss_clip": 1.04141879, + "balance_loss_mlp": 1.02121925, + "epoch": 0.6251615812415452, + "flos": 34301901513600.0, + "grad_norm": 2.408483891497526, + "language_loss": 0.74685585, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.76837015, + "num_input_tokens_seen": 223990315, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12371826, + "step": 10398, + "time_per_iteration": 2.6876773834228516 + }, + { + "auxiliary_loss_clip": 0.01120998, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.04084682, + "balance_loss_mlp": 1.02042019, + "epoch": 0.6252217044942131, + "flos": 32609695525920.0, + "grad_norm": 1.8922469620126063, + "language_loss": 0.74264395, + "learning_rate": 1.300997001489483e-06, + "loss": 0.76419508, + "num_input_tokens_seen": 224009960, + "router_z_loss_clip": 0.80224609, + "router_z_loss_mlp": 0.13696289, + "step": 10399, + "time_per_iteration": 2.73512601852417 + }, + { + "auxiliary_loss_clip": 0.01119756, + "auxiliary_loss_mlp": 0.01034519, + "balance_loss_clip": 1.04229856, + "balance_loss_mlp": 1.02254426, + "epoch": 0.6252818277468811, + "flos": 24412638081600.0, + "grad_norm": 2.0746830406171526, + "language_loss": 0.74672222, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.76826495, + "num_input_tokens_seen": 224028870, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.11968994, + "step": 10400, + "time_per_iteration": 2.6124281883239746 + }, + { + "auxiliary_loss_clip": 0.01039934, + "auxiliary_loss_mlp": 0.01003823, + "balance_loss_clip": 1.01614761, + "balance_loss_mlp": 1.00260472, + "epoch": 0.625341950999549, + "flos": 72330217427520.0, + "grad_norm": 0.8368032720489332, + "language_loss": 0.56467044, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58510792, + "num_input_tokens_seen": 224094140, + "router_z_loss_clip": 0.23791504, + "router_z_loss_mlp": 0.01216888, + "step": 10401, + "time_per_iteration": 4.870488882064819 + }, + { + "auxiliary_loss_clip": 0.01119282, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.04094541, + "balance_loss_mlp": 1.01772738, + "epoch": 0.625402074252217, + "flos": 24595669751040.0, + "grad_norm": 2.2925053610643, + "language_loss": 0.82728338, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.84877527, + "num_input_tokens_seen": 224113235, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.1217041, + "step": 10402, + "time_per_iteration": 2.6608309745788574 + }, + { + "auxiliary_loss_clip": 0.01115404, + "auxiliary_loss_mlp": 0.0103174, + "balance_loss_clip": 1.03925693, + "balance_loss_mlp": 1.01955676, + "epoch": 0.625462197504885, + "flos": 35548197304320.0, + "grad_norm": 2.3725534468710086, + "language_loss": 0.69141793, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.71288931, + "num_input_tokens_seen": 224134530, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.1217041, + "step": 10403, + "time_per_iteration": 2.685526132583618 + }, + { + "auxiliary_loss_clip": 0.01120116, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.04142225, + "balance_loss_mlp": 1.01646721, + "epoch": 0.625522320757553, + "flos": 31853062244160.0, + "grad_norm": 1.6657302571636496, + "language_loss": 0.7131629, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.73466301, + "num_input_tokens_seen": 224154170, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13439941, + "step": 10404, + "time_per_iteration": 2.761931896209717 + }, + { + "auxiliary_loss_clip": 0.01120288, + "auxiliary_loss_mlp": 0.01034957, + "balance_loss_clip": 1.04298639, + "balance_loss_mlp": 1.02291727, + "epoch": 0.625582444010221, + "flos": 25173971367840.0, + "grad_norm": 1.8832002158010255, + "language_loss": 0.69705272, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.71860516, + "num_input_tokens_seen": 224172730, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.1204834, + "step": 10405, + "time_per_iteration": 2.632793664932251 + }, + { + "auxiliary_loss_clip": 0.01118545, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.04200315, + "balance_loss_mlp": 1.01991487, + "epoch": 0.6256425672628889, + "flos": 25040404946880.0, + "grad_norm": 1.6897907601172062, + "language_loss": 0.79086423, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.81236935, + "num_input_tokens_seen": 224192620, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12060547, + "step": 10406, + "time_per_iteration": 4.085382461547852 + }, + { + "auxiliary_loss_clip": 0.01118481, + "auxiliary_loss_mlp": 0.01032888, + "balance_loss_clip": 1.04213655, + "balance_loss_mlp": 1.0208118, + "epoch": 0.6257026905155569, + "flos": 36032673601440.0, + "grad_norm": 3.624598390081193, + "language_loss": 0.68695724, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.70847094, + "num_input_tokens_seen": 224214660, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12084961, + "step": 10407, + "time_per_iteration": 2.7692465782165527 + }, + { + "auxiliary_loss_clip": 0.01115201, + "auxiliary_loss_mlp": 0.01027838, + "balance_loss_clip": 1.04186821, + "balance_loss_mlp": 1.01681674, + "epoch": 0.6257628137682248, + "flos": 29315421797760.0, + "grad_norm": 2.032155948977993, + "language_loss": 0.85730851, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.878739, + "num_input_tokens_seen": 224234170, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11029053, + "step": 10408, + "time_per_iteration": 2.6224887371063232 + }, + { + "auxiliary_loss_clip": 0.01115957, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.04017735, + "balance_loss_mlp": 1.02018332, + "epoch": 0.6258229370208929, + "flos": 25442319728160.0, + "grad_norm": 1.9269492278834142, + "language_loss": 0.79933, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.82080412, + "num_input_tokens_seen": 224253115, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11273193, + "step": 10409, + "time_per_iteration": 2.715456247329712 + }, + { + "auxiliary_loss_clip": 0.011152, + "auxiliary_loss_mlp": 0.01028014, + "balance_loss_clip": 1.03978062, + "balance_loss_mlp": 1.01630175, + "epoch": 0.6258830602735608, + "flos": 27127556743680.0, + "grad_norm": 14.723039547096226, + "language_loss": 0.69959176, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.72102392, + "num_input_tokens_seen": 224271375, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11700439, + "step": 10410, + "time_per_iteration": 2.617158889770508 + }, + { + "auxiliary_loss_clip": 0.01115054, + "auxiliary_loss_mlp": 0.01025759, + "balance_loss_clip": 1.04129553, + "balance_loss_mlp": 1.01457167, + "epoch": 0.6259431835262288, + "flos": 30599716446720.0, + "grad_norm": 2.2071230042205974, + "language_loss": 0.67429614, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.69570428, + "num_input_tokens_seen": 224290315, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11187744, + "step": 10411, + "time_per_iteration": 2.715567111968994 + }, + { + "auxiliary_loss_clip": 0.01119054, + "auxiliary_loss_mlp": 0.0103496, + "balance_loss_clip": 1.04123151, + "balance_loss_mlp": 1.02306867, + "epoch": 0.6260033067788967, + "flos": 34474479724800.0, + "grad_norm": 2.0618699789167443, + "language_loss": 0.69884253, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.72038269, + "num_input_tokens_seen": 224310545, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11877441, + "step": 10412, + "time_per_iteration": 2.660828113555908 + }, + { + "auxiliary_loss_clip": 0.01114481, + "auxiliary_loss_mlp": 0.01031193, + "balance_loss_clip": 1.03941894, + "balance_loss_mlp": 1.02026188, + "epoch": 0.6260634300315647, + "flos": 28513212926400.0, + "grad_norm": 2.4287397527606895, + "language_loss": 0.698053, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.71950972, + "num_input_tokens_seen": 224331115, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.109375, + "step": 10413, + "time_per_iteration": 2.7210872173309326 + }, + { + "auxiliary_loss_clip": 0.01119174, + "auxiliary_loss_mlp": 0.010315, + "balance_loss_clip": 1.03888273, + "balance_loss_mlp": 1.01849449, + "epoch": 0.6261235532842326, + "flos": 22007024399520.0, + "grad_norm": 3.2737797050093103, + "language_loss": 0.80107212, + "learning_rate": 1.295526482316796e-06, + "loss": 0.82257891, + "num_input_tokens_seen": 224347525, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13018799, + "step": 10414, + "time_per_iteration": 2.6058175563812256 + }, + { + "auxiliary_loss_clip": 0.01120615, + "auxiliary_loss_mlp": 0.01038919, + "balance_loss_clip": 1.04416609, + "balance_loss_mlp": 1.02696276, + "epoch": 0.6261836765369007, + "flos": 26858884245120.0, + "grad_norm": 1.9339610805340883, + "language_loss": 0.74601197, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.76760727, + "num_input_tokens_seen": 224367045, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11962891, + "step": 10415, + "time_per_iteration": 2.6642909049987793 + }, + { + "auxiliary_loss_clip": 0.01116015, + "auxiliary_loss_mlp": 0.01029821, + "balance_loss_clip": 1.04065895, + "balance_loss_mlp": 1.01816821, + "epoch": 0.6262437997895686, + "flos": 30427340821920.0, + "grad_norm": 1.5501936466628246, + "language_loss": 0.74123108, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.76268947, + "num_input_tokens_seen": 224388860, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11663818, + "step": 10416, + "time_per_iteration": 2.669480085372925 + }, + { + "auxiliary_loss_clip": 0.01114751, + "auxiliary_loss_mlp": 0.01028273, + "balance_loss_clip": 1.04127336, + "balance_loss_mlp": 1.01659679, + "epoch": 0.6263039230422366, + "flos": 38569949392320.0, + "grad_norm": 2.6609887845860034, + "language_loss": 0.84021801, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.8616482, + "num_input_tokens_seen": 224409645, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11688232, + "step": 10417, + "time_per_iteration": 2.7784907817840576 + }, + { + "auxiliary_loss_clip": 0.01118475, + "auxiliary_loss_mlp": 0.01032401, + "balance_loss_clip": 1.04151177, + "balance_loss_mlp": 1.02007496, + "epoch": 0.6263640462949046, + "flos": 21523479999840.0, + "grad_norm": 2.3163735758019106, + "language_loss": 0.56693381, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.58844256, + "num_input_tokens_seen": 224428530, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12316895, + "step": 10418, + "time_per_iteration": 2.682764768600464 + }, + { + "auxiliary_loss_clip": 0.01121239, + "auxiliary_loss_mlp": 0.01031017, + "balance_loss_clip": 1.04070771, + "balance_loss_mlp": 1.01847649, + "epoch": 0.6264241695475725, + "flos": 24373383187680.0, + "grad_norm": 1.9218405457004561, + "language_loss": 0.84429502, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.86581755, + "num_input_tokens_seen": 224447175, + "router_z_loss_clip": 0.80517578, + "router_z_loss_mlp": 0.12530518, + "step": 10419, + "time_per_iteration": 2.6611976623535156 + }, + { + "auxiliary_loss_clip": 0.01119894, + "auxiliary_loss_mlp": 0.01032917, + "balance_loss_clip": 1.04277182, + "balance_loss_mlp": 1.02063203, + "epoch": 0.6264842928002405, + "flos": 33366126221280.0, + "grad_norm": 2.0611100833990394, + "language_loss": 0.64865553, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.67018366, + "num_input_tokens_seen": 224469445, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12298584, + "step": 10420, + "time_per_iteration": 2.698207139968872 + }, + { + "auxiliary_loss_clip": 0.01119073, + "auxiliary_loss_mlp": 0.01030633, + "balance_loss_clip": 1.03973234, + "balance_loss_mlp": 1.01784766, + "epoch": 0.6265444160529084, + "flos": 28061994965760.0, + "grad_norm": 2.7328266893367466, + "language_loss": 0.86587143, + "learning_rate": 1.292975627485741e-06, + "loss": 0.88736856, + "num_input_tokens_seen": 224486590, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12799072, + "step": 10421, + "time_per_iteration": 2.6736464500427246 + }, + { + "auxiliary_loss_clip": 0.01121331, + "auxiliary_loss_mlp": 0.01031818, + "balance_loss_clip": 1.04441428, + "balance_loss_mlp": 1.02023089, + "epoch": 0.6266045393055765, + "flos": 24328698978240.0, + "grad_norm": 3.243165174075043, + "language_loss": 0.79595327, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.81748474, + "num_input_tokens_seen": 224502795, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11584473, + "step": 10422, + "time_per_iteration": 2.65609073638916 + }, + { + "auxiliary_loss_clip": 0.0111457, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.03862846, + "balance_loss_mlp": 1.01710308, + "epoch": 0.6266646625582444, + "flos": 29759913889920.0, + "grad_norm": 1.7810961002313712, + "language_loss": 0.74347013, + "learning_rate": 1.292247052906389e-06, + "loss": 0.76491332, + "num_input_tokens_seen": 224522300, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12658691, + "step": 10423, + "time_per_iteration": 2.645258903503418 + }, + { + "auxiliary_loss_clip": 0.01115098, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.03924799, + "balance_loss_mlp": 1.01816034, + "epoch": 0.6267247858109124, + "flos": 17916781461120.0, + "grad_norm": 4.297737992197679, + "language_loss": 0.77956367, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.80101013, + "num_input_tokens_seen": 224538260, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.1138916, + "step": 10424, + "time_per_iteration": 2.6910946369171143 + }, + { + "auxiliary_loss_clip": 0.01117415, + "auxiliary_loss_mlp": 0.01029111, + "balance_loss_clip": 1.04084945, + "balance_loss_mlp": 1.01659369, + "epoch": 0.6267849090635803, + "flos": 30420533918880.0, + "grad_norm": 2.104209178220685, + "language_loss": 0.69018316, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.7116484, + "num_input_tokens_seen": 224559155, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12512207, + "step": 10425, + "time_per_iteration": 2.65643310546875 + }, + { + "auxiliary_loss_clip": 0.01112184, + "auxiliary_loss_mlp": 0.01030583, + "balance_loss_clip": 1.03942633, + "balance_loss_mlp": 1.01944256, + "epoch": 0.6268450323162483, + "flos": 30917651607360.0, + "grad_norm": 1.5374582620546104, + "language_loss": 0.74548101, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.76690865, + "num_input_tokens_seen": 224578660, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11138916, + "step": 10426, + "time_per_iteration": 2.6810591220855713 + }, + { + "auxiliary_loss_clip": 0.01117699, + "auxiliary_loss_mlp": 0.01036245, + "balance_loss_clip": 1.04060125, + "balance_loss_mlp": 1.02441967, + "epoch": 0.6269051555689162, + "flos": 31941984972960.0, + "grad_norm": 1.6320219587948526, + "language_loss": 0.80285478, + "learning_rate": 1.290790225914929e-06, + "loss": 0.82439417, + "num_input_tokens_seen": 224599080, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.1182251, + "step": 10427, + "time_per_iteration": 2.6421914100646973 + }, + { + "auxiliary_loss_clip": 0.01120401, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.04259586, + "balance_loss_mlp": 1.01996112, + "epoch": 0.6269652788215843, + "flos": 22277155520160.0, + "grad_norm": 2.0336886801575766, + "language_loss": 0.68768865, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.70921409, + "num_input_tokens_seen": 224614225, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12176514, + "step": 10428, + "time_per_iteration": 2.6049439907073975 + }, + { + "auxiliary_loss_clip": 0.0111846, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.04242146, + "balance_loss_mlp": 1.02098131, + "epoch": 0.6270254020742522, + "flos": 14355496442880.0, + "grad_norm": 1.8748399297836478, + "language_loss": 0.71845281, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.73996472, + "num_input_tokens_seen": 224632365, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11743164, + "step": 10429, + "time_per_iteration": 2.6043670177459717 + }, + { + "auxiliary_loss_clip": 0.01121041, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.04240298, + "balance_loss_mlp": 1.01752949, + "epoch": 0.6270855253269202, + "flos": 28645361242560.0, + "grad_norm": 1.6546603879422075, + "language_loss": 0.79858494, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.82009381, + "num_input_tokens_seen": 224651125, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12310791, + "step": 10430, + "time_per_iteration": 2.7202205657958984 + }, + { + "auxiliary_loss_clip": 0.01038974, + "auxiliary_loss_mlp": 0.01010588, + "balance_loss_clip": 1.0149219, + "balance_loss_mlp": 1.00942433, + "epoch": 0.6271456485795882, + "flos": 85493658016800.0, + "grad_norm": 0.7726478703017159, + "language_loss": 0.59101737, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.61151302, + "num_input_tokens_seen": 224716115, + "router_z_loss_clip": 0.24047852, + "router_z_loss_mlp": 0.01163483, + "step": 10431, + "time_per_iteration": 3.3875746726989746 + }, + { + "auxiliary_loss_clip": 0.01039338, + "auxiliary_loss_mlp": 0.01008234, + "balance_loss_clip": 1.01526904, + "balance_loss_mlp": 1.00713587, + "epoch": 0.6272057718322561, + "flos": 79503144092640.0, + "grad_norm": 0.8839880965434453, + "language_loss": 0.63706923, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.65754491, + "num_input_tokens_seen": 224782930, + "router_z_loss_clip": 0.24060059, + "router_z_loss_mlp": 0.01098633, + "step": 10432, + "time_per_iteration": 4.7900390625 + }, + { + "auxiliary_loss_clip": 0.01114032, + "auxiliary_loss_mlp": 0.01030781, + "balance_loss_clip": 1.03889763, + "balance_loss_mlp": 1.02001667, + "epoch": 0.6272658950849241, + "flos": 29760197510880.0, + "grad_norm": 2.3525787297279286, + "language_loss": 0.64901054, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.67045867, + "num_input_tokens_seen": 224802010, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.10760498, + "step": 10433, + "time_per_iteration": 2.65358304977417 + }, + { + "auxiliary_loss_clip": 0.01122601, + "auxiliary_loss_mlp": 0.01036316, + "balance_loss_clip": 1.04281175, + "balance_loss_mlp": 1.02387631, + "epoch": 0.627326018337592, + "flos": 21922112881440.0, + "grad_norm": 2.341317666629812, + "language_loss": 0.61479294, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.6363821, + "num_input_tokens_seen": 224818875, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12451172, + "step": 10434, + "time_per_iteration": 2.642552375793457 + }, + { + "auxiliary_loss_clip": 0.01116213, + "auxiliary_loss_mlp": 0.0102838, + "balance_loss_clip": 1.03884745, + "balance_loss_mlp": 1.01622093, + "epoch": 0.6273861415902601, + "flos": 24684754548960.0, + "grad_norm": 1.7067303539274556, + "language_loss": 0.84482825, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.86627418, + "num_input_tokens_seen": 224837790, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12158203, + "step": 10435, + "time_per_iteration": 3.9983971118927 + }, + { + "auxiliary_loss_clip": 0.01038579, + "auxiliary_loss_mlp": 0.01000871, + "balance_loss_clip": 1.01451683, + "balance_loss_mlp": 0.99973536, + "epoch": 0.627446264842928, + "flos": 79256749747680.0, + "grad_norm": 0.7338030297937568, + "language_loss": 0.61579192, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.63618636, + "num_input_tokens_seen": 224899685, + "router_z_loss_clip": 0.24072266, + "router_z_loss_mlp": 0.01135254, + "step": 10436, + "time_per_iteration": 3.2764387130737305 + }, + { + "auxiliary_loss_clip": 0.01119707, + "auxiliary_loss_mlp": 0.01037479, + "balance_loss_clip": 1.04300582, + "balance_loss_mlp": 1.02451515, + "epoch": 0.627506388095596, + "flos": 28777995766080.0, + "grad_norm": 1.5950080669652191, + "language_loss": 0.7767151, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.79828697, + "num_input_tokens_seen": 224918650, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12969971, + "step": 10437, + "time_per_iteration": 2.7081375122070312 + }, + { + "auxiliary_loss_clip": 0.01038246, + "auxiliary_loss_mlp": 0.01000723, + "balance_loss_clip": 1.01414025, + "balance_loss_mlp": 0.99955976, + "epoch": 0.6275665113482639, + "flos": 82466274026880.0, + "grad_norm": 0.7349642238522061, + "language_loss": 0.54358423, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.5639739, + "num_input_tokens_seen": 224981575, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 0.01161957, + "step": 10438, + "time_per_iteration": 3.188838243484497 + }, + { + "auxiliary_loss_clip": 0.0111672, + "auxiliary_loss_mlp": 0.01038966, + "balance_loss_clip": 1.04009199, + "balance_loss_mlp": 1.02690828, + "epoch": 0.6276266346009319, + "flos": 33723113689440.0, + "grad_norm": 1.79332457461059, + "language_loss": 0.83865851, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.86021531, + "num_input_tokens_seen": 225000820, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.1204834, + "step": 10439, + "time_per_iteration": 2.7367749214172363 + }, + { + "auxiliary_loss_clip": 0.01120178, + "auxiliary_loss_mlp": 0.01035926, + "balance_loss_clip": 1.04128504, + "balance_loss_mlp": 1.0228126, + "epoch": 0.6276867578535998, + "flos": 27755161539840.0, + "grad_norm": 2.1324742297201444, + "language_loss": 0.80474043, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.82630146, + "num_input_tokens_seen": 225017585, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13104248, + "step": 10440, + "time_per_iteration": 2.643648386001587 + }, + { + "auxiliary_loss_clip": 0.01112577, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.03979635, + "balance_loss_mlp": 1.01820064, + "epoch": 0.6277468811062679, + "flos": 30071244733920.0, + "grad_norm": 1.7324398886356558, + "language_loss": 0.7488122, + "learning_rate": 1.285694725799337e-06, + "loss": 0.77023041, + "num_input_tokens_seen": 225039085, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11053467, + "step": 10441, + "time_per_iteration": 4.1677563190460205 + }, + { + "auxiliary_loss_clip": 0.01117621, + "auxiliary_loss_mlp": 0.01026794, + "balance_loss_clip": 1.04142761, + "balance_loss_mlp": 1.01458716, + "epoch": 0.6278070043589358, + "flos": 24012587095200.0, + "grad_norm": 2.239993800364471, + "language_loss": 0.71841204, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.73985624, + "num_input_tokens_seen": 225058105, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12219238, + "step": 10442, + "time_per_iteration": 2.658917188644409 + }, + { + "auxiliary_loss_clip": 0.01118474, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.04148626, + "balance_loss_mlp": 1.02019644, + "epoch": 0.6278671276116038, + "flos": 26993706701760.0, + "grad_norm": 1.5694188723165834, + "language_loss": 0.71582943, + "learning_rate": 1.284967229712762e-06, + "loss": 0.73733288, + "num_input_tokens_seen": 225077605, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.11669922, + "step": 10443, + "time_per_iteration": 2.686812162399292 + }, + { + "auxiliary_loss_clip": 0.01116902, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.04172468, + "balance_loss_mlp": 1.01682615, + "epoch": 0.6279272508642717, + "flos": 28111946421600.0, + "grad_norm": 2.256479472610015, + "language_loss": 0.73643827, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.75789678, + "num_input_tokens_seen": 225097775, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12115479, + "step": 10444, + "time_per_iteration": 2.6833698749542236 + }, + { + "auxiliary_loss_clip": 0.01115303, + "auxiliary_loss_mlp": 0.01027058, + "balance_loss_clip": 1.04028893, + "balance_loss_mlp": 1.01457047, + "epoch": 0.6279873741169397, + "flos": 24189743759040.0, + "grad_norm": 2.021826891363327, + "language_loss": 0.72413135, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.74555492, + "num_input_tokens_seen": 225115585, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12493896, + "step": 10445, + "time_per_iteration": 4.076347589492798 + }, + { + "auxiliary_loss_clip": 0.01115612, + "auxiliary_loss_mlp": 0.01028714, + "balance_loss_clip": 1.0394361, + "balance_loss_mlp": 1.01691771, + "epoch": 0.6280474973696077, + "flos": 29181814859520.0, + "grad_norm": 2.161196852129675, + "language_loss": 0.69181532, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.71325856, + "num_input_tokens_seen": 225135575, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11798096, + "step": 10446, + "time_per_iteration": 2.664151668548584 + }, + { + "auxiliary_loss_clip": 0.0112372, + "auxiliary_loss_mlp": 0.01030733, + "balance_loss_clip": 1.04210067, + "balance_loss_mlp": 1.0178051, + "epoch": 0.6281076206222757, + "flos": 21924341331840.0, + "grad_norm": 1.9623485249489403, + "language_loss": 0.73166019, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.7532047, + "num_input_tokens_seen": 225154230, + "router_z_loss_clip": 0.81640625, + "router_z_loss_mlp": 0.12939453, + "step": 10447, + "time_per_iteration": 2.607853889465332 + }, + { + "auxiliary_loss_clip": 0.01037404, + "auxiliary_loss_mlp": 0.01000507, + "balance_loss_clip": 1.01342249, + "balance_loss_mlp": 0.99942195, + "epoch": 0.6281677438749437, + "flos": 81483869695680.0, + "grad_norm": 0.677765604444112, + "language_loss": 0.52324992, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54362905, + "num_input_tokens_seen": 225213650, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 0.01085663, + "step": 10448, + "time_per_iteration": 3.1179463863372803 + }, + { + "auxiliary_loss_clip": 0.01120302, + "auxiliary_loss_mlp": 0.01040214, + "balance_loss_clip": 1.04282236, + "balance_loss_mlp": 1.02774501, + "epoch": 0.6282278671276116, + "flos": 14222173125600.0, + "grad_norm": 2.0944077963701404, + "language_loss": 0.91283107, + "learning_rate": 1.282785392633079e-06, + "loss": 0.9344362, + "num_input_tokens_seen": 225230135, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12487793, + "step": 10449, + "time_per_iteration": 2.6181578636169434 + }, + { + "auxiliary_loss_clip": 0.01116473, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.04034424, + "balance_loss_mlp": 1.01734114, + "epoch": 0.6282879903802796, + "flos": 52155036678240.0, + "grad_norm": 1.6736001122664226, + "language_loss": 0.59926522, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.62071598, + "num_input_tokens_seen": 225253520, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.1126709, + "step": 10450, + "time_per_iteration": 2.8358421325683594 + }, + { + "auxiliary_loss_clip": 0.01114888, + "auxiliary_loss_mlp": 0.01030408, + "balance_loss_clip": 1.04025638, + "balance_loss_mlp": 1.01884496, + "epoch": 0.6283481136329475, + "flos": 24415393256640.0, + "grad_norm": 1.6794102143924063, + "language_loss": 0.76864475, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.79009771, + "num_input_tokens_seen": 225272460, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11572266, + "step": 10451, + "time_per_iteration": 2.6362006664276123 + }, + { + "auxiliary_loss_clip": 0.01118466, + "auxiliary_loss_mlp": 0.01030958, + "balance_loss_clip": 1.04013133, + "balance_loss_mlp": 1.01907265, + "epoch": 0.6284082368856155, + "flos": 26728275585600.0, + "grad_norm": 1.7109901504573855, + "language_loss": 0.77191067, + "learning_rate": 1.281694841064566e-06, + "loss": 0.79340494, + "num_input_tokens_seen": 225291700, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.11895752, + "step": 10452, + "time_per_iteration": 2.643629789352417 + }, + { + "auxiliary_loss_clip": 0.01117441, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.04178596, + "balance_loss_mlp": 1.02134752, + "epoch": 0.6284683601382834, + "flos": 31096185858720.0, + "grad_norm": 2.255896170783629, + "language_loss": 0.72792017, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.74943161, + "num_input_tokens_seen": 225311470, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12359619, + "step": 10453, + "time_per_iteration": 2.685941219329834 + }, + { + "auxiliary_loss_clip": 0.01116803, + "auxiliary_loss_mlp": 0.01026204, + "balance_loss_clip": 1.03898096, + "balance_loss_mlp": 1.01409221, + "epoch": 0.6285284833909515, + "flos": 20180806300800.0, + "grad_norm": 2.2140621228720945, + "language_loss": 0.80467355, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.82610363, + "num_input_tokens_seen": 225328385, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12109375, + "step": 10454, + "time_per_iteration": 2.627060890197754 + }, + { + "auxiliary_loss_clip": 0.01115399, + "auxiliary_loss_mlp": 0.01030334, + "balance_loss_clip": 1.04104948, + "balance_loss_mlp": 1.01897359, + "epoch": 0.6285886066436194, + "flos": 27845988580800.0, + "grad_norm": 2.3109278727671363, + "language_loss": 0.81815416, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.83961153, + "num_input_tokens_seen": 225348415, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11364746, + "step": 10455, + "time_per_iteration": 2.6454057693481445 + }, + { + "auxiliary_loss_clip": 0.01117424, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.0408119, + "balance_loss_mlp": 1.02006769, + "epoch": 0.6286487298962874, + "flos": 30158668323360.0, + "grad_norm": 1.5988919228428686, + "language_loss": 0.81670594, + "learning_rate": 1.280241153705706e-06, + "loss": 0.83819556, + "num_input_tokens_seen": 225367740, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11474609, + "step": 10456, + "time_per_iteration": 2.7037980556488037 + }, + { + "auxiliary_loss_clip": 0.01122503, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.04410923, + "balance_loss_mlp": 1.01671588, + "epoch": 0.6287088531489553, + "flos": 25307983478880.0, + "grad_norm": 1.650951001325769, + "language_loss": 0.71863532, + "learning_rate": 1.27987780006486e-06, + "loss": 0.74015117, + "num_input_tokens_seen": 225388405, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.1237793, + "step": 10457, + "time_per_iteration": 2.639248847961426 + }, + { + "auxiliary_loss_clip": 0.0112055, + "auxiliary_loss_mlp": 0.01032142, + "balance_loss_clip": 1.03929341, + "balance_loss_mlp": 1.01967907, + "epoch": 0.6287689764016233, + "flos": 28151484936480.0, + "grad_norm": 2.031334762865339, + "language_loss": 0.79685843, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.81838536, + "num_input_tokens_seen": 225408360, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.12457275, + "step": 10458, + "time_per_iteration": 2.661146402359009 + }, + { + "auxiliary_loss_clip": 0.01120648, + "auxiliary_loss_mlp": 0.01033314, + "balance_loss_clip": 1.04199195, + "balance_loss_mlp": 1.02157223, + "epoch": 0.6288290996542913, + "flos": 39332457679680.0, + "grad_norm": 1.7807089876848723, + "language_loss": 0.61014551, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.63168514, + "num_input_tokens_seen": 225431310, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.11749268, + "step": 10459, + "time_per_iteration": 2.8520283699035645 + }, + { + "auxiliary_loss_clip": 0.01118746, + "auxiliary_loss_mlp": 0.01032921, + "balance_loss_clip": 1.04170716, + "balance_loss_mlp": 1.02182817, + "epoch": 0.6288892229069593, + "flos": 30068003351520.0, + "grad_norm": 1.889541327576288, + "language_loss": 0.78959191, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.81110859, + "num_input_tokens_seen": 225450385, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11096191, + "step": 10460, + "time_per_iteration": 2.7249929904937744 + }, + { + "auxiliary_loss_clip": 0.01117238, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.04129732, + "balance_loss_mlp": 1.01708603, + "epoch": 0.6289493461596273, + "flos": 21790572324480.0, + "grad_norm": 1.8520386370758546, + "language_loss": 0.74298418, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.76444912, + "num_input_tokens_seen": 225467325, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.1217041, + "step": 10461, + "time_per_iteration": 2.616356134414673 + }, + { + "auxiliary_loss_clip": 0.01115176, + "auxiliary_loss_mlp": 0.01033312, + "balance_loss_clip": 1.04074144, + "balance_loss_mlp": 1.02148056, + "epoch": 0.6290094694122952, + "flos": 27266066272800.0, + "grad_norm": 1.8343661978929535, + "language_loss": 0.70203829, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.72352314, + "num_input_tokens_seen": 225487370, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.1182251, + "step": 10462, + "time_per_iteration": 2.7306981086730957 + }, + { + "auxiliary_loss_clip": 0.01111487, + "auxiliary_loss_mlp": 0.010309, + "balance_loss_clip": 1.04087567, + "balance_loss_mlp": 1.02024877, + "epoch": 0.6290695926649632, + "flos": 34657308807840.0, + "grad_norm": 2.6799660207506846, + "language_loss": 0.72293079, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.74435472, + "num_input_tokens_seen": 225506915, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10656738, + "step": 10463, + "time_per_iteration": 2.6774585247039795 + }, + { + "auxiliary_loss_clip": 0.01116697, + "auxiliary_loss_mlp": 0.01035829, + "balance_loss_clip": 1.04397845, + "balance_loss_mlp": 1.02433074, + "epoch": 0.6291297159176311, + "flos": 26241206182560.0, + "grad_norm": 1.9630730084854495, + "language_loss": 0.72817284, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.7496981, + "num_input_tokens_seen": 225525670, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11505127, + "step": 10464, + "time_per_iteration": 2.6590964794158936 + }, + { + "auxiliary_loss_clip": 0.01116215, + "auxiliary_loss_mlp": 0.01029559, + "balance_loss_clip": 1.04266906, + "balance_loss_mlp": 1.01859212, + "epoch": 0.6291898391702991, + "flos": 14889843161280.0, + "grad_norm": 2.059738216331727, + "language_loss": 0.69661593, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.71807373, + "num_input_tokens_seen": 225542235, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.10974121, + "step": 10465, + "time_per_iteration": 2.578714609146118 + }, + { + "auxiliary_loss_clip": 0.01037147, + "auxiliary_loss_mlp": 0.01004118, + "balance_loss_clip": 1.01322567, + "balance_loss_mlp": 1.0029242, + "epoch": 0.629249962422967, + "flos": 84559989623040.0, + "grad_norm": 0.6821647917044869, + "language_loss": 0.59717071, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.61758339, + "num_input_tokens_seen": 225607185, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.01193237, + "step": 10466, + "time_per_iteration": 3.368433713912964 + }, + { + "auxiliary_loss_clip": 0.01110713, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.03600645, + "balance_loss_mlp": 1.01845586, + "epoch": 0.6293100856756351, + "flos": 48904596296640.0, + "grad_norm": 2.220140683499845, + "language_loss": 0.65093362, + "learning_rate": 1.276245767820154e-06, + "loss": 0.67233467, + "num_input_tokens_seen": 225628785, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.109375, + "step": 10467, + "time_per_iteration": 2.850909948348999 + }, + { + "auxiliary_loss_clip": 0.01036768, + "auxiliary_loss_mlp": 0.01003839, + "balance_loss_clip": 1.01284039, + "balance_loss_mlp": 1.00274658, + "epoch": 0.629370208928303, + "flos": 82366452149760.0, + "grad_norm": 0.8024147810903501, + "language_loss": 0.56831813, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.58872426, + "num_input_tokens_seen": 225678980, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.01094055, + "step": 10468, + "time_per_iteration": 2.9999752044677734 + }, + { + "auxiliary_loss_clip": 0.01037041, + "auxiliary_loss_mlp": 0.01003522, + "balance_loss_clip": 1.01297188, + "balance_loss_mlp": 1.00236368, + "epoch": 0.629430332180971, + "flos": 74017237203360.0, + "grad_norm": 0.7399642387067338, + "language_loss": 0.58014268, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.60054839, + "num_input_tokens_seen": 225740295, + "router_z_loss_clip": 0.24072266, + "router_z_loss_mlp": 0.01157379, + "step": 10469, + "time_per_iteration": 3.1667914390563965 + }, + { + "auxiliary_loss_clip": 0.01037101, + "auxiliary_loss_mlp": 0.01002787, + "balance_loss_clip": 1.01284838, + "balance_loss_mlp": 1.00158095, + "epoch": 0.6294904554336389, + "flos": 81593699340960.0, + "grad_norm": 0.683342671149161, + "language_loss": 0.52074474, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54114366, + "num_input_tokens_seen": 225805615, + "router_z_loss_clip": 0.24255371, + "router_z_loss_mlp": 0.01205444, + "step": 10470, + "time_per_iteration": 3.3002984523773193 + }, + { + "auxiliary_loss_clip": 0.01114183, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.03995037, + "balance_loss_mlp": 1.01906919, + "epoch": 0.6295505786863069, + "flos": 51888025388160.0, + "grad_norm": 5.835827915010919, + "language_loss": 0.74592662, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.76736921, + "num_input_tokens_seen": 225826585, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11010742, + "step": 10471, + "time_per_iteration": 2.813547372817993 + }, + { + "auxiliary_loss_clip": 0.01119035, + "auxiliary_loss_mlp": 0.01027547, + "balance_loss_clip": 1.04255891, + "balance_loss_mlp": 1.01677012, + "epoch": 0.629610701938975, + "flos": 21211906052160.0, + "grad_norm": 2.2168991200667514, + "language_loss": 0.63429028, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.65575612, + "num_input_tokens_seen": 225844095, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.10784912, + "step": 10472, + "time_per_iteration": 4.117898464202881 + }, + { + "auxiliary_loss_clip": 0.01121768, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.04367256, + "balance_loss_mlp": 1.02037048, + "epoch": 0.6296708251916429, + "flos": 29581582224960.0, + "grad_norm": 2.525700814491294, + "language_loss": 0.69157648, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.71311504, + "num_input_tokens_seen": 225864310, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.11712646, + "step": 10473, + "time_per_iteration": 2.701866865158081 + }, + { + "auxiliary_loss_clip": 0.01115184, + "auxiliary_loss_mlp": 0.01029199, + "balance_loss_clip": 1.04011047, + "balance_loss_mlp": 1.0185957, + "epoch": 0.6297309484443109, + "flos": 23523856483680.0, + "grad_norm": 1.7414916706804322, + "language_loss": 0.74562281, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.7670666, + "num_input_tokens_seen": 225883830, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.10601807, + "step": 10474, + "time_per_iteration": 3.933262348175049 + }, + { + "auxiliary_loss_clip": 0.01116101, + "auxiliary_loss_mlp": 0.01030761, + "balance_loss_clip": 1.04066586, + "balance_loss_mlp": 1.01923954, + "epoch": 0.6297910716969788, + "flos": 37415615126400.0, + "grad_norm": 1.5540839152455317, + "language_loss": 0.66010296, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.6815716, + "num_input_tokens_seen": 225905755, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11523438, + "step": 10475, + "time_per_iteration": 2.7502052783966064 + }, + { + "auxiliary_loss_clip": 0.01111582, + "auxiliary_loss_mlp": 0.01029347, + "balance_loss_clip": 1.03986442, + "balance_loss_mlp": 1.01849926, + "epoch": 0.6298511949496468, + "flos": 17599048886880.0, + "grad_norm": 2.2687674460207914, + "language_loss": 0.89975524, + "learning_rate": 1.272979284940101e-06, + "loss": 0.92116451, + "num_input_tokens_seen": 225922155, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10845947, + "step": 10476, + "time_per_iteration": 2.5994341373443604 + }, + { + "auxiliary_loss_clip": 0.01115434, + "auxiliary_loss_mlp": 0.01032355, + "balance_loss_clip": 1.04104507, + "balance_loss_mlp": 1.02151895, + "epoch": 0.6299113182023147, + "flos": 28692192867840.0, + "grad_norm": 1.9344631484404973, + "language_loss": 0.75407624, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.77555418, + "num_input_tokens_seen": 225941060, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.10845947, + "step": 10477, + "time_per_iteration": 2.682631492614746 + }, + { + "auxiliary_loss_clip": 0.01115672, + "auxiliary_loss_mlp": 0.01026743, + "balance_loss_clip": 1.03964424, + "balance_loss_mlp": 1.01504862, + "epoch": 0.6299714414549827, + "flos": 27667616398560.0, + "grad_norm": 1.948400576424987, + "language_loss": 0.70242655, + "learning_rate": 1.272253702758138e-06, + "loss": 0.72385073, + "num_input_tokens_seen": 225960870, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11688232, + "step": 10478, + "time_per_iteration": 2.6255667209625244 + }, + { + "auxiliary_loss_clip": 0.01120422, + "auxiliary_loss_mlp": 0.01029587, + "balance_loss_clip": 1.04163015, + "balance_loss_mlp": 1.01734459, + "epoch": 0.6300315647076506, + "flos": 17694575932320.0, + "grad_norm": 2.3723304916984027, + "language_loss": 0.67678928, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.69828933, + "num_input_tokens_seen": 225977895, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12243652, + "step": 10479, + "time_per_iteration": 2.6913201808929443 + }, + { + "auxiliary_loss_clip": 0.01115886, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.04205692, + "balance_loss_mlp": 1.02110958, + "epoch": 0.6300916879603187, + "flos": 26687967242400.0, + "grad_norm": 2.007301629923282, + "language_loss": 0.73690879, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.75839567, + "num_input_tokens_seen": 225997835, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11694336, + "step": 10480, + "time_per_iteration": 2.698129653930664 + }, + { + "auxiliary_loss_clip": 0.01115618, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.03915834, + "balance_loss_mlp": 1.01768959, + "epoch": 0.6301518112129866, + "flos": 26642148549120.0, + "grad_norm": 2.2454172538581068, + "language_loss": 0.79392469, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.81538522, + "num_input_tokens_seen": 226017620, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12756348, + "step": 10481, + "time_per_iteration": 4.051872968673706 + }, + { + "auxiliary_loss_clip": 0.0103707, + "auxiliary_loss_mlp": 0.00999387, + "balance_loss_clip": 1.01331139, + "balance_loss_mlp": 0.99833679, + "epoch": 0.6302119344656546, + "flos": 54096311151360.0, + "grad_norm": 0.8998954857086725, + "language_loss": 0.61860228, + "learning_rate": 1.2708028696588e-06, + "loss": 0.63896692, + "num_input_tokens_seen": 226068755, + "router_z_loss_clip": 0.23754883, + "router_z_loss_mlp": 0.01050568, + "step": 10482, + "time_per_iteration": 3.0881853103637695 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.04080558, + "balance_loss_mlp": 1.01829219, + "epoch": 0.6302720577183225, + "flos": 13687421234400.0, + "grad_norm": 2.083827114671708, + "language_loss": 0.82532525, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.8468374, + "num_input_tokens_seen": 226084395, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.12060547, + "step": 10483, + "time_per_iteration": 2.5964088439941406 + }, + { + "auxiliary_loss_clip": 0.01111608, + "auxiliary_loss_mlp": 0.01027836, + "balance_loss_clip": 1.03998756, + "balance_loss_mlp": 1.01724458, + "epoch": 0.6303321809709905, + "flos": 34123407779520.0, + "grad_norm": 2.4096418032899654, + "language_loss": 0.72966218, + "learning_rate": 1.270077618961487e-06, + "loss": 0.75105661, + "num_input_tokens_seen": 226105890, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.105896, + "step": 10484, + "time_per_iteration": 4.15775203704834 + }, + { + "auxiliary_loss_clip": 0.01114229, + "auxiliary_loss_mlp": 0.01023653, + "balance_loss_clip": 1.03815246, + "balance_loss_mlp": 1.01206589, + "epoch": 0.6303923042236586, + "flos": 34435548969120.0, + "grad_norm": 5.226343675124479, + "language_loss": 0.74561095, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.76698971, + "num_input_tokens_seen": 226126760, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11590576, + "step": 10485, + "time_per_iteration": 2.7586019039154053 + }, + { + "auxiliary_loss_clip": 0.01119653, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.04119647, + "balance_loss_mlp": 1.01900351, + "epoch": 0.6304524274763265, + "flos": 33716468855520.0, + "grad_norm": 2.206944166463931, + "language_loss": 0.81464338, + "learning_rate": 1.269352478979093e-06, + "loss": 0.83614552, + "num_input_tokens_seen": 226147315, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.11566162, + "step": 10486, + "time_per_iteration": 2.6916487216949463 + }, + { + "auxiliary_loss_clip": 0.01116243, + "auxiliary_loss_mlp": 0.01031591, + "balance_loss_clip": 1.04171777, + "balance_loss_mlp": 1.02049875, + "epoch": 0.6305125507289945, + "flos": 21123023840640.0, + "grad_norm": 2.375152021632609, + "language_loss": 0.6327948, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.65427315, + "num_input_tokens_seen": 226165935, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11090088, + "step": 10487, + "time_per_iteration": 2.639479637145996 + }, + { + "auxiliary_loss_clip": 0.011152, + "auxiliary_loss_mlp": 0.01034438, + "balance_loss_clip": 1.04013622, + "balance_loss_mlp": 1.02357173, + "epoch": 0.6305726739816624, + "flos": 31492063565280.0, + "grad_norm": 1.893821968682402, + "language_loss": 0.67016667, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.69166303, + "num_input_tokens_seen": 226186890, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.10864258, + "step": 10488, + "time_per_iteration": 2.6827800273895264 + }, + { + "auxiliary_loss_clip": 0.01117684, + "auxiliary_loss_mlp": 0.01027394, + "balance_loss_clip": 1.04103005, + "balance_loss_mlp": 1.01610494, + "epoch": 0.6306327972343304, + "flos": 26597504856960.0, + "grad_norm": 2.5681828848802755, + "language_loss": 0.67105222, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.69250304, + "num_input_tokens_seen": 226206710, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.112854, + "step": 10489, + "time_per_iteration": 2.6834042072296143 + }, + { + "auxiliary_loss_clip": 0.01121063, + "auxiliary_loss_mlp": 0.0103298, + "balance_loss_clip": 1.04082417, + "balance_loss_mlp": 1.02069592, + "epoch": 0.6306929204869983, + "flos": 25352586653760.0, + "grad_norm": 2.2304567026197892, + "language_loss": 0.69681442, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.71835482, + "num_input_tokens_seen": 226225565, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.12286377, + "step": 10490, + "time_per_iteration": 2.6331021785736084 + }, + { + "auxiliary_loss_clip": 0.0111826, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.04134965, + "balance_loss_mlp": 1.02505732, + "epoch": 0.6307530437396663, + "flos": 28863636595200.0, + "grad_norm": 2.2024127650578067, + "language_loss": 0.78408217, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.80563557, + "num_input_tokens_seen": 226243680, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12017822, + "step": 10491, + "time_per_iteration": 2.745530366897583 + }, + { + "auxiliary_loss_clip": 0.01116519, + "auxiliary_loss_mlp": 0.01029138, + "balance_loss_clip": 1.04176736, + "balance_loss_mlp": 1.01806903, + "epoch": 0.6308131669923343, + "flos": 30162355395840.0, + "grad_norm": 2.0598963546557143, + "language_loss": 0.55718392, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.57864046, + "num_input_tokens_seen": 226264345, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.1105957, + "step": 10492, + "time_per_iteration": 2.7013254165649414 + }, + { + "auxiliary_loss_clip": 0.0111797, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.04038286, + "balance_loss_mlp": 1.02000809, + "epoch": 0.6308732902450023, + "flos": 27534455150400.0, + "grad_norm": 1.8736764834925503, + "language_loss": 0.6388104, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.66030991, + "num_input_tokens_seen": 226283165, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.11962891, + "step": 10493, + "time_per_iteration": 2.6713576316833496 + }, + { + "auxiliary_loss_clip": 0.01115365, + "auxiliary_loss_mlp": 0.01027663, + "balance_loss_clip": 1.03950775, + "balance_loss_mlp": 1.01572442, + "epoch": 0.6309334134976702, + "flos": 30071244733920.0, + "grad_norm": 1.6521371958774977, + "language_loss": 0.82926559, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.85069585, + "num_input_tokens_seen": 226304080, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11938477, + "step": 10494, + "time_per_iteration": 2.698894739151001 + }, + { + "auxiliary_loss_clip": 0.0111769, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.04103231, + "balance_loss_mlp": 1.02072585, + "epoch": 0.6309935367503382, + "flos": 50552037040320.0, + "grad_norm": 1.9486004482605956, + "language_loss": 0.79419208, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.8156938, + "num_input_tokens_seen": 226325925, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11755371, + "step": 10495, + "time_per_iteration": 2.797281503677368 + }, + { + "auxiliary_loss_clip": 0.01116382, + "auxiliary_loss_mlp": 0.01032794, + "balance_loss_clip": 1.04047632, + "balance_loss_mlp": 1.02064681, + "epoch": 0.6310536600030061, + "flos": 18448008348960.0, + "grad_norm": 2.2552477394614914, + "language_loss": 0.70453429, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.72602606, + "num_input_tokens_seen": 226344190, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.121521, + "step": 10496, + "time_per_iteration": 2.679309368133545 + }, + { + "auxiliary_loss_clip": 0.01120323, + "auxiliary_loss_mlp": 0.01033611, + "balance_loss_clip": 1.04263616, + "balance_loss_mlp": 1.02186275, + "epoch": 0.6311137832556741, + "flos": 18585221325120.0, + "grad_norm": 3.478236651094878, + "language_loss": 0.80542696, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.82696629, + "num_input_tokens_seen": 226361520, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.11749268, + "step": 10497, + "time_per_iteration": 2.583855628967285 + }, + { + "auxiliary_loss_clip": 0.01114131, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.04038453, + "balance_loss_mlp": 1.02088225, + "epoch": 0.6311739065083422, + "flos": 26867230804800.0, + "grad_norm": 2.293369260392486, + "language_loss": 0.74073839, + "learning_rate": 1.265003970256247e-06, + "loss": 0.76219428, + "num_input_tokens_seen": 226381920, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.10571289, + "step": 10498, + "time_per_iteration": 2.679286241531372 + }, + { + "auxiliary_loss_clip": 0.01117749, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.04160559, + "balance_loss_mlp": 1.02001429, + "epoch": 0.6312340297610101, + "flos": 27711895435200.0, + "grad_norm": 3.239793320959501, + "language_loss": 0.70008361, + "learning_rate": 1.264641775364217e-06, + "loss": 0.7215755, + "num_input_tokens_seen": 226400035, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11413574, + "step": 10499, + "time_per_iteration": 2.620035171508789 + }, + { + "auxiliary_loss_clip": 0.01114646, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.04178524, + "balance_loss_mlp": 1.02721381, + "epoch": 0.6312941530136781, + "flos": 29626874193600.0, + "grad_norm": 1.954336095798069, + "language_loss": 0.69853574, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.72006929, + "num_input_tokens_seen": 226418280, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11486816, + "step": 10500, + "time_per_iteration": 2.7149810791015625 + }, + { + "auxiliary_loss_clip": 0.01118492, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.04292488, + "balance_loss_mlp": 1.02007663, + "epoch": 0.631354276266346, + "flos": 26510121784800.0, + "grad_norm": 2.414622493106006, + "language_loss": 0.74451172, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.7660085, + "num_input_tokens_seen": 226436650, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11096191, + "step": 10501, + "time_per_iteration": 2.6480841636657715 + }, + { + "auxiliary_loss_clip": 0.01115008, + "auxiliary_loss_mlp": 0.01031648, + "balance_loss_clip": 1.04062712, + "balance_loss_mlp": 1.01979232, + "epoch": 0.631414399519014, + "flos": 29315664901440.0, + "grad_norm": 1.9165262950572515, + "language_loss": 0.75335872, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.77482533, + "num_input_tokens_seen": 226456275, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11853027, + "step": 10502, + "time_per_iteration": 2.6907172203063965 + }, + { + "auxiliary_loss_clip": 0.01119804, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.04191613, + "balance_loss_mlp": 1.03220177, + "epoch": 0.6314745227716819, + "flos": 29670504953760.0, + "grad_norm": 2.207392932055538, + "language_loss": 0.8550328, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.87667543, + "num_input_tokens_seen": 226473610, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12255859, + "step": 10503, + "time_per_iteration": 2.6734068393707275 + }, + { + "auxiliary_loss_clip": 0.01116923, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.04014683, + "balance_loss_mlp": 1.01835918, + "epoch": 0.6315346460243499, + "flos": 28512037925280.0, + "grad_norm": 1.8490116399196534, + "language_loss": 0.86550331, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.88697362, + "num_input_tokens_seen": 226493665, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.11743164, + "step": 10504, + "time_per_iteration": 2.754331350326538 + }, + { + "auxiliary_loss_clip": 0.01121407, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.04240847, + "balance_loss_mlp": 1.0246129, + "epoch": 0.6315947692770179, + "flos": 24722226682560.0, + "grad_norm": 1.6676697527059112, + "language_loss": 0.76431817, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.78590155, + "num_input_tokens_seen": 226511625, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.12322998, + "step": 10505, + "time_per_iteration": 2.7401912212371826 + }, + { + "auxiliary_loss_clip": 0.01117218, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.04050374, + "balance_loss_mlp": 1.02076006, + "epoch": 0.6316548925296859, + "flos": 30829701293280.0, + "grad_norm": 2.724703124347024, + "language_loss": 0.82050121, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.84200162, + "num_input_tokens_seen": 226530085, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12078857, + "step": 10506, + "time_per_iteration": 2.6536049842834473 + }, + { + "auxiliary_loss_clip": 0.0111949, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.04323626, + "balance_loss_mlp": 1.01916695, + "epoch": 0.6317150157823538, + "flos": 27980041209120.0, + "grad_norm": 2.372616494915609, + "language_loss": 0.74512303, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.76662576, + "num_input_tokens_seen": 226548115, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.1161499, + "step": 10507, + "time_per_iteration": 2.711047410964966 + }, + { + "auxiliary_loss_clip": 0.0112401, + "auxiliary_loss_mlp": 0.0103556, + "balance_loss_clip": 1.0440948, + "balance_loss_mlp": 1.02371609, + "epoch": 0.6317751390350218, + "flos": 27487826111520.0, + "grad_norm": 3.1338400063581395, + "language_loss": 0.67569566, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.69729137, + "num_input_tokens_seen": 226567955, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.1184082, + "step": 10508, + "time_per_iteration": 2.684331178665161 + }, + { + "auxiliary_loss_clip": 0.01117447, + "auxiliary_loss_mlp": 0.01034693, + "balance_loss_clip": 1.0418911, + "balance_loss_mlp": 1.02302873, + "epoch": 0.6318352622876897, + "flos": 28424492784000.0, + "grad_norm": 2.9950067274284335, + "language_loss": 0.71096343, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.73248482, + "num_input_tokens_seen": 226588205, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11669922, + "step": 10509, + "time_per_iteration": 2.709399700164795 + }, + { + "auxiliary_loss_clip": 0.01115873, + "auxiliary_loss_mlp": 0.01027043, + "balance_loss_clip": 1.04067016, + "balance_loss_mlp": 1.01618862, + "epoch": 0.6318953855403577, + "flos": 25263177717600.0, + "grad_norm": 1.620555088780621, + "language_loss": 0.79483724, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.81626642, + "num_input_tokens_seen": 226606965, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.10852051, + "step": 10510, + "time_per_iteration": 2.682727336883545 + }, + { + "auxiliary_loss_clip": 0.01118924, + "auxiliary_loss_mlp": 0.01031334, + "balance_loss_clip": 1.04110801, + "balance_loss_mlp": 1.01914501, + "epoch": 0.6319555087930258, + "flos": 27845259269760.0, + "grad_norm": 2.154094948891234, + "language_loss": 0.70383769, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.72534025, + "num_input_tokens_seen": 226627845, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12200928, + "step": 10511, + "time_per_iteration": 4.1560728549957275 + }, + { + "auxiliary_loss_clip": 0.01115923, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.04175472, + "balance_loss_mlp": 1.02092075, + "epoch": 0.6320156320456937, + "flos": 24368845252320.0, + "grad_norm": 4.119622496228729, + "language_loss": 0.8008337, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.82231534, + "num_input_tokens_seen": 226645855, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11322021, + "step": 10512, + "time_per_iteration": 2.665555000305176 + }, + { + "auxiliary_loss_clip": 0.01118498, + "auxiliary_loss_mlp": 0.01032735, + "balance_loss_clip": 1.04234934, + "balance_loss_mlp": 1.02017021, + "epoch": 0.6320757552983617, + "flos": 32962550231520.0, + "grad_norm": 2.3531131502237406, + "language_loss": 0.70499319, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.72650552, + "num_input_tokens_seen": 226665375, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12561035, + "step": 10513, + "time_per_iteration": 2.6487784385681152 + }, + { + "auxiliary_loss_clip": 0.01122358, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.04257488, + "balance_loss_mlp": 1.01857686, + "epoch": 0.6321358785510296, + "flos": 28914074258400.0, + "grad_norm": 2.487618000843097, + "language_loss": 0.66666007, + "learning_rate": 1.259212205855459e-06, + "loss": 0.68819201, + "num_input_tokens_seen": 226685270, + "router_z_loss_clip": 0.79785156, + "router_z_loss_mlp": 0.12261963, + "step": 10514, + "time_per_iteration": 3.9863638877868652 + }, + { + "auxiliary_loss_clip": 0.01114799, + "auxiliary_loss_mlp": 0.01029075, + "balance_loss_clip": 1.03969002, + "balance_loss_mlp": 1.01798213, + "epoch": 0.6321960018036976, + "flos": 31719779444160.0, + "grad_norm": 1.8605347399365253, + "language_loss": 0.73987299, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.76131165, + "num_input_tokens_seen": 226705325, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11096191, + "step": 10515, + "time_per_iteration": 2.7010834217071533 + }, + { + "auxiliary_loss_clip": 0.01113285, + "auxiliary_loss_mlp": 0.01028771, + "balance_loss_clip": 1.04014444, + "balance_loss_mlp": 1.01778555, + "epoch": 0.6322561250563655, + "flos": 27846353236320.0, + "grad_norm": 2.017845220871314, + "language_loss": 0.89768314, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.91910374, + "num_input_tokens_seen": 226723815, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.10986328, + "step": 10516, + "time_per_iteration": 2.654963970184326 + }, + { + "auxiliary_loss_clip": 0.01123554, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.04355693, + "balance_loss_mlp": 1.01926446, + "epoch": 0.6323162483090335, + "flos": 23169664707840.0, + "grad_norm": 2.2393056478851063, + "language_loss": 0.81719035, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.83875138, + "num_input_tokens_seen": 226741550, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.13293457, + "step": 10517, + "time_per_iteration": 2.6928796768188477 + }, + { + "auxiliary_loss_clip": 0.01118207, + "auxiliary_loss_mlp": 0.01033332, + "balance_loss_clip": 1.04200315, + "balance_loss_mlp": 1.02182841, + "epoch": 0.6323763715617015, + "flos": 24239452111200.0, + "grad_norm": 1.835983282960929, + "language_loss": 0.77778727, + "learning_rate": 1.257765386189541e-06, + "loss": 0.79930264, + "num_input_tokens_seen": 226761115, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.1151123, + "step": 10518, + "time_per_iteration": 2.623990297317505 + }, + { + "auxiliary_loss_clip": 0.01114353, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.04007113, + "balance_loss_mlp": 1.01882911, + "epoch": 0.6324364948143695, + "flos": 27799764714720.0, + "grad_norm": 2.1342174927194555, + "language_loss": 0.85293901, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.87438792, + "num_input_tokens_seen": 226782225, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11700439, + "step": 10519, + "time_per_iteration": 2.7797915935516357 + }, + { + "auxiliary_loss_clip": 0.01114209, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.04097438, + "balance_loss_mlp": 1.0217104, + "epoch": 0.6324966180670374, + "flos": 27132499851840.0, + "grad_norm": 2.2048882979436293, + "language_loss": 0.71961415, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.74108517, + "num_input_tokens_seen": 226802375, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.1116333, + "step": 10520, + "time_per_iteration": 2.628850221633911 + }, + { + "auxiliary_loss_clip": 0.01115203, + "auxiliary_loss_mlp": 0.01029615, + "balance_loss_clip": 1.03987372, + "balance_loss_mlp": 1.01820683, + "epoch": 0.6325567413197054, + "flos": 26465275506240.0, + "grad_norm": 1.8019423569817374, + "language_loss": 0.71904856, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.74049675, + "num_input_tokens_seen": 226822165, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11407471, + "step": 10521, + "time_per_iteration": 4.2230565547943115 + }, + { + "auxiliary_loss_clip": 0.01119783, + "auxiliary_loss_mlp": 0.01031896, + "balance_loss_clip": 1.04254794, + "balance_loss_mlp": 1.01963568, + "epoch": 0.6326168645723733, + "flos": 24327361908000.0, + "grad_norm": 1.7945921787145334, + "language_loss": 0.71781552, + "learning_rate": 1.256319016853377e-06, + "loss": 0.73933232, + "num_input_tokens_seen": 226841645, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12255859, + "step": 10522, + "time_per_iteration": 2.666364908218384 + }, + { + "auxiliary_loss_clip": 0.01119175, + "auxiliary_loss_mlp": 0.0103448, + "balance_loss_clip": 1.04229307, + "balance_loss_mlp": 1.02301228, + "epoch": 0.6326769878250413, + "flos": 24685038169920.0, + "grad_norm": 2.272362385897415, + "language_loss": 0.81830037, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.8398369, + "num_input_tokens_seen": 226860355, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.11468506, + "step": 10523, + "time_per_iteration": 4.127038955688477 + }, + { + "auxiliary_loss_clip": 0.01115099, + "auxiliary_loss_mlp": 0.01027726, + "balance_loss_clip": 1.0394094, + "balance_loss_mlp": 1.01574516, + "epoch": 0.6327371110777094, + "flos": 25350763376160.0, + "grad_norm": 2.5144540372537403, + "language_loss": 0.73763216, + "learning_rate": 1.255596001333195e-06, + "loss": 0.75906038, + "num_input_tokens_seen": 226878390, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11975098, + "step": 10524, + "time_per_iteration": 2.6492984294891357 + }, + { + "auxiliary_loss_clip": 0.01124389, + "auxiliary_loss_mlp": 0.01034541, + "balance_loss_clip": 1.04223251, + "balance_loss_mlp": 1.02149999, + "epoch": 0.6327972343303773, + "flos": 37017427934880.0, + "grad_norm": 5.03142326557376, + "language_loss": 0.84090412, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.86249346, + "num_input_tokens_seen": 226898420, + "router_z_loss_clip": 0.82226562, + "router_z_loss_mlp": 0.13043213, + "step": 10525, + "time_per_iteration": 2.685706853866577 + }, + { + "auxiliary_loss_clip": 0.01114894, + "auxiliary_loss_mlp": 0.01024311, + "balance_loss_clip": 1.03995442, + "balance_loss_mlp": 1.01299214, + "epoch": 0.6328573575830453, + "flos": 20855728929600.0, + "grad_norm": 1.7848798712997487, + "language_loss": 0.66452312, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.68591511, + "num_input_tokens_seen": 226916305, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11328125, + "step": 10526, + "time_per_iteration": 2.6617624759674072 + }, + { + "auxiliary_loss_clip": 0.01122665, + "auxiliary_loss_mlp": 0.01036565, + "balance_loss_clip": 1.04321873, + "balance_loss_mlp": 1.023422, + "epoch": 0.6329174808357132, + "flos": 30562811555040.0, + "grad_norm": 1.5925061601718018, + "language_loss": 0.73256344, + "learning_rate": 1.254511689796244e-06, + "loss": 0.75415576, + "num_input_tokens_seen": 226937705, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.13134766, + "step": 10527, + "time_per_iteration": 2.804238796234131 + }, + { + "auxiliary_loss_clip": 0.01115489, + "auxiliary_loss_mlp": 0.01030054, + "balance_loss_clip": 1.04213834, + "balance_loss_mlp": 1.01890826, + "epoch": 0.6329776040883812, + "flos": 20544033430080.0, + "grad_norm": 3.0247287270092387, + "language_loss": 0.71549278, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.73694819, + "num_input_tokens_seen": 226954880, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11151123, + "step": 10528, + "time_per_iteration": 2.6532270908355713 + }, + { + "auxiliary_loss_clip": 0.01116838, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.04108274, + "balance_loss_mlp": 1.01622462, + "epoch": 0.6330377273410491, + "flos": 16492559178240.0, + "grad_norm": 2.2108829938362735, + "language_loss": 0.66748226, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.68893677, + "num_input_tokens_seen": 226972595, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.1239624, + "step": 10529, + "time_per_iteration": 2.6458075046539307 + }, + { + "auxiliary_loss_clip": 0.01122159, + "auxiliary_loss_mlp": 0.01030548, + "balance_loss_clip": 1.04325426, + "balance_loss_mlp": 1.01798391, + "epoch": 0.6330978505937171, + "flos": 26282203319520.0, + "grad_norm": 2.8404947548549573, + "language_loss": 0.75047326, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.77200037, + "num_input_tokens_seen": 226991910, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12579346, + "step": 10530, + "time_per_iteration": 2.653903007507324 + }, + { + "auxiliary_loss_clip": 0.01119375, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.04211211, + "balance_loss_mlp": 1.01794457, + "epoch": 0.6331579738463851, + "flos": 30516952344480.0, + "grad_norm": 1.6294275853398146, + "language_loss": 0.73791802, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.75940764, + "num_input_tokens_seen": 227010175, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.11645508, + "step": 10531, + "time_per_iteration": 2.699246406555176 + }, + { + "auxiliary_loss_clip": 0.01114439, + "auxiliary_loss_mlp": 0.01024948, + "balance_loss_clip": 1.04062104, + "balance_loss_mlp": 1.01297402, + "epoch": 0.6332180970990531, + "flos": 18273323239200.0, + "grad_norm": 2.29745929090587, + "language_loss": 0.80315602, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.82454991, + "num_input_tokens_seen": 227025540, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11987305, + "step": 10532, + "time_per_iteration": 2.60502028465271 + }, + { + "auxiliary_loss_clip": 0.01113752, + "auxiliary_loss_mlp": 0.01028903, + "balance_loss_clip": 1.04025149, + "balance_loss_mlp": 1.01835918, + "epoch": 0.633278220351721, + "flos": 27707600603520.0, + "grad_norm": 1.557500253462173, + "language_loss": 0.7438429, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.7652694, + "num_input_tokens_seen": 227045520, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10546875, + "step": 10533, + "time_per_iteration": 2.69728422164917 + }, + { + "auxiliary_loss_clip": 0.01123912, + "auxiliary_loss_mlp": 0.01037028, + "balance_loss_clip": 1.04221821, + "balance_loss_mlp": 1.02404046, + "epoch": 0.633338343604389, + "flos": 15377641875360.0, + "grad_norm": 3.970792001783305, + "language_loss": 0.77058578, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.7921952, + "num_input_tokens_seen": 227059420, + "router_z_loss_clip": 0.81689453, + "router_z_loss_mlp": 0.12976074, + "step": 10534, + "time_per_iteration": 2.5879945755004883 + }, + { + "auxiliary_loss_clip": 0.0112036, + "auxiliary_loss_mlp": 0.01032921, + "balance_loss_clip": 1.0444659, + "balance_loss_mlp": 1.02150071, + "epoch": 0.6333984668570569, + "flos": 31676796960480.0, + "grad_norm": 1.766457493258628, + "language_loss": 0.85456455, + "learning_rate": 1.251621437204777e-06, + "loss": 0.87609738, + "num_input_tokens_seen": 227081310, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11425781, + "step": 10535, + "time_per_iteration": 2.7400145530700684 + }, + { + "auxiliary_loss_clip": 0.01119489, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.04256821, + "balance_loss_mlp": 1.02047205, + "epoch": 0.6334585901097249, + "flos": 28869309014400.0, + "grad_norm": 3.011203890437495, + "language_loss": 0.76631749, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.78783822, + "num_input_tokens_seen": 227100365, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12121582, + "step": 10536, + "time_per_iteration": 2.6650137901306152 + }, + { + "auxiliary_loss_clip": 0.01119307, + "auxiliary_loss_mlp": 0.01025678, + "balance_loss_clip": 1.0447638, + "balance_loss_mlp": 1.01412666, + "epoch": 0.633518713362393, + "flos": 35094872445120.0, + "grad_norm": 2.527384665634426, + "language_loss": 0.59868503, + "learning_rate": 1.250899157568855e-06, + "loss": 0.62013489, + "num_input_tokens_seen": 227119680, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11560059, + "step": 10537, + "time_per_iteration": 2.704265594482422 + }, + { + "auxiliary_loss_clip": 0.01037399, + "auxiliary_loss_mlp": 0.01000312, + "balance_loss_clip": 1.01321983, + "balance_loss_mlp": 0.99918807, + "epoch": 0.6335788366150609, + "flos": 85928669065440.0, + "grad_norm": 0.7833440718553804, + "language_loss": 0.52484441, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54522145, + "num_input_tokens_seen": 227184465, + "router_z_loss_clip": 0.24206543, + "router_z_loss_mlp": 0.01124573, + "step": 10538, + "time_per_iteration": 3.3715405464172363 + }, + { + "auxiliary_loss_clip": 0.01122667, + "auxiliary_loss_mlp": 0.0102887, + "balance_loss_clip": 1.04321551, + "balance_loss_mlp": 1.01621628, + "epoch": 0.6336389598677289, + "flos": 28958191225920.0, + "grad_norm": 3.166785286424578, + "language_loss": 0.83156657, + "learning_rate": 1.250176991556848e-06, + "loss": 0.85308194, + "num_input_tokens_seen": 227202185, + "router_z_loss_clip": 0.79394531, + "router_z_loss_mlp": 0.12652588, + "step": 10539, + "time_per_iteration": 2.626842498779297 + }, + { + "auxiliary_loss_clip": 0.01119997, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.04239452, + "balance_loss_mlp": 1.01614714, + "epoch": 0.6336990831203968, + "flos": 35724260001600.0, + "grad_norm": 1.6018257842931154, + "language_loss": 0.86497277, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.88645887, + "num_input_tokens_seen": 227222020, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12481689, + "step": 10540, + "time_per_iteration": 2.7357397079467773 + }, + { + "auxiliary_loss_clip": 0.01115403, + "auxiliary_loss_mlp": 0.01030917, + "balance_loss_clip": 1.04129422, + "balance_loss_mlp": 1.02037907, + "epoch": 0.6337592063730648, + "flos": 35504566544160.0, + "grad_norm": 1.9912097912609161, + "language_loss": 0.72790396, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.74936712, + "num_input_tokens_seen": 227240885, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.10546875, + "step": 10541, + "time_per_iteration": 2.687697649002075 + }, + { + "auxiliary_loss_clip": 0.01121683, + "auxiliary_loss_mlp": 0.01033599, + "balance_loss_clip": 1.04210019, + "balance_loss_mlp": 1.02129114, + "epoch": 0.6338193296257327, + "flos": 42350725281600.0, + "grad_norm": 4.593876151910472, + "language_loss": 0.84866643, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.87021923, + "num_input_tokens_seen": 227257880, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12304688, + "step": 10542, + "time_per_iteration": 2.7875723838806152 + }, + { + "auxiliary_loss_clip": 0.01117097, + "auxiliary_loss_mlp": 0.01028824, + "balance_loss_clip": 1.04119849, + "balance_loss_mlp": 1.01592016, + "epoch": 0.6338794528784008, + "flos": 20362217279040.0, + "grad_norm": 1.8838506162102426, + "language_loss": 0.78055906, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.80201828, + "num_input_tokens_seen": 227274840, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12908936, + "step": 10543, + "time_per_iteration": 2.588792562484741 + }, + { + "auxiliary_loss_clip": 0.01112654, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.04101789, + "balance_loss_mlp": 1.02290201, + "epoch": 0.6339395761310687, + "flos": 27267443860320.0, + "grad_norm": 1.8611982886674552, + "language_loss": 0.73420095, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.75567043, + "num_input_tokens_seen": 227294835, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11401367, + "step": 10544, + "time_per_iteration": 2.7056362628936768 + }, + { + "auxiliary_loss_clip": 0.01121893, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.04211521, + "balance_loss_mlp": 1.02483869, + "epoch": 0.6339996993837367, + "flos": 22636776611520.0, + "grad_norm": 2.3703896263007356, + "language_loss": 0.68184698, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.70343518, + "num_input_tokens_seen": 227314935, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12084961, + "step": 10545, + "time_per_iteration": 2.6553988456726074 + }, + { + "auxiliary_loss_clip": 0.01114571, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.04011798, + "balance_loss_mlp": 1.02094698, + "epoch": 0.6340598226364046, + "flos": 15824808108000.0, + "grad_norm": 2.3983251598721678, + "language_loss": 0.71018076, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.7316578, + "num_input_tokens_seen": 227332905, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12176514, + "step": 10546, + "time_per_iteration": 2.6503987312316895 + }, + { + "auxiliary_loss_clip": 0.01113609, + "auxiliary_loss_mlp": 0.0103023, + "balance_loss_clip": 1.04112673, + "balance_loss_mlp": 1.01912522, + "epoch": 0.6341199458890726, + "flos": 32565173385600.0, + "grad_norm": 1.397643320724053, + "language_loss": 0.78226292, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.80370128, + "num_input_tokens_seen": 227354915, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11108398, + "step": 10547, + "time_per_iteration": 2.6937856674194336 + }, + { + "auxiliary_loss_clip": 0.01118637, + "auxiliary_loss_mlp": 0.01035488, + "balance_loss_clip": 1.04063594, + "balance_loss_mlp": 1.02348924, + "epoch": 0.6341800691417405, + "flos": 22859427830400.0, + "grad_norm": 1.8880807706938163, + "language_loss": 0.63116121, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.65270257, + "num_input_tokens_seen": 227372990, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12005615, + "step": 10548, + "time_per_iteration": 2.6486966609954834 + }, + { + "auxiliary_loss_clip": 0.01117003, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.0424453, + "balance_loss_mlp": 1.02116716, + "epoch": 0.6342401923944085, + "flos": 32030097356160.0, + "grad_norm": 1.8514995332294126, + "language_loss": 0.61309111, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.63458312, + "num_input_tokens_seen": 227393270, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11016846, + "step": 10549, + "time_per_iteration": 2.6771607398986816 + }, + { + "auxiliary_loss_clip": 0.01116333, + "auxiliary_loss_mlp": 0.01030164, + "balance_loss_clip": 1.04075789, + "balance_loss_mlp": 1.01877952, + "epoch": 0.6343003156470765, + "flos": 30116050495200.0, + "grad_norm": 1.6491479416078667, + "language_loss": 0.73808026, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.75954521, + "num_input_tokens_seen": 227413630, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11395264, + "step": 10550, + "time_per_iteration": 2.743340015411377 + }, + { + "auxiliary_loss_clip": 0.01036081, + "auxiliary_loss_mlp": 0.01002734, + "balance_loss_clip": 1.01167989, + "balance_loss_mlp": 1.00158417, + "epoch": 0.6343604388997445, + "flos": 85177748720160.0, + "grad_norm": 0.697819313211104, + "language_loss": 0.5773862, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.59777439, + "num_input_tokens_seen": 227476630, + "router_z_loss_clip": 0.2442627, + "router_z_loss_mlp": 0.0114975, + "step": 10551, + "time_per_iteration": 4.701247215270996 + }, + { + "auxiliary_loss_clip": 0.01114146, + "auxiliary_loss_mlp": 0.01026655, + "balance_loss_clip": 1.04010749, + "balance_loss_mlp": 1.01612878, + "epoch": 0.6344205621524125, + "flos": 26822992285440.0, + "grad_norm": 1.8754392543029388, + "language_loss": 0.67006356, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.69147158, + "num_input_tokens_seen": 227496060, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.10522461, + "step": 10552, + "time_per_iteration": 2.6484344005584717 + }, + { + "auxiliary_loss_clip": 0.01118981, + "auxiliary_loss_mlp": 0.01027361, + "balance_loss_clip": 1.04019058, + "balance_loss_mlp": 1.01530278, + "epoch": 0.6344806854050804, + "flos": 24951725321760.0, + "grad_norm": 1.7662824347481478, + "language_loss": 0.82230693, + "learning_rate": 1.24512502014147e-06, + "loss": 0.84377038, + "num_input_tokens_seen": 227513440, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12060547, + "step": 10553, + "time_per_iteration": 4.014477729797363 + }, + { + "auxiliary_loss_clip": 0.01119265, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.04120529, + "balance_loss_mlp": 1.01759934, + "epoch": 0.6345408086577484, + "flos": 49433108526720.0, + "grad_norm": 2.579655279164166, + "language_loss": 0.55327559, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.57475972, + "num_input_tokens_seen": 227535395, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.11560059, + "step": 10554, + "time_per_iteration": 2.8096067905426025 + }, + { + "auxiliary_loss_clip": 0.01118491, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.04237735, + "balance_loss_mlp": 1.02105522, + "epoch": 0.6346009319104163, + "flos": 26065143485280.0, + "grad_norm": 2.1683867798873377, + "language_loss": 0.70652717, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.72804511, + "num_input_tokens_seen": 227554545, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12249756, + "step": 10555, + "time_per_iteration": 2.628371477127075 + }, + { + "auxiliary_loss_clip": 0.01035679, + "auxiliary_loss_mlp": 0.01001927, + "balance_loss_clip": 1.01131248, + "balance_loss_mlp": 1.000741, + "epoch": 0.6346610551630844, + "flos": 87082274020320.0, + "grad_norm": 0.798897626758498, + "language_loss": 0.55273509, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57311106, + "num_input_tokens_seen": 227608575, + "router_z_loss_clip": 0.24377441, + "router_z_loss_mlp": 0.01185608, + "step": 10556, + "time_per_iteration": 3.1587185859680176 + }, + { + "auxiliary_loss_clip": 0.01118294, + "auxiliary_loss_mlp": 0.01032905, + "balance_loss_clip": 1.03998792, + "balance_loss_mlp": 1.02017951, + "epoch": 0.6347211784157523, + "flos": 31007465716320.0, + "grad_norm": 2.213971651638891, + "language_loss": 0.68069834, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.70221031, + "num_input_tokens_seen": 227628175, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.12731934, + "step": 10557, + "time_per_iteration": 2.7317323684692383 + }, + { + "auxiliary_loss_clip": 0.0111547, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.04035902, + "balance_loss_mlp": 1.0189321, + "epoch": 0.6347813016684203, + "flos": 19208895945120.0, + "grad_norm": 1.800420937289931, + "language_loss": 0.702142, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.7235986, + "num_input_tokens_seen": 227645330, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11248779, + "step": 10558, + "time_per_iteration": 2.6344621181488037 + }, + { + "auxiliary_loss_clip": 0.01115406, + "auxiliary_loss_mlp": 0.01029691, + "balance_loss_clip": 1.03988671, + "balance_loss_mlp": 1.01796651, + "epoch": 0.6348414249210882, + "flos": 26191943520480.0, + "grad_norm": 4.0457819359893685, + "language_loss": 0.7842468, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.80569774, + "num_input_tokens_seen": 227665250, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11737061, + "step": 10559, + "time_per_iteration": 2.6744043827056885 + }, + { + "auxiliary_loss_clip": 0.01119587, + "auxiliary_loss_mlp": 0.01036184, + "balance_loss_clip": 1.04244256, + "balance_loss_mlp": 1.02420354, + "epoch": 0.6349015481737562, + "flos": 26421158538720.0, + "grad_norm": 1.7528619326964319, + "language_loss": 0.68006963, + "learning_rate": 1.242601136020078e-06, + "loss": 0.70162737, + "num_input_tokens_seen": 227685070, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11987305, + "step": 10560, + "time_per_iteration": 4.068682432174683 + }, + { + "auxiliary_loss_clip": 0.01116767, + "auxiliary_loss_mlp": 0.01031581, + "balance_loss_clip": 1.04066992, + "balance_loss_mlp": 1.02007151, + "epoch": 0.6349616714264241, + "flos": 27082953568800.0, + "grad_norm": 2.282761562761588, + "language_loss": 0.76844871, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.78993219, + "num_input_tokens_seen": 227704430, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11523438, + "step": 10561, + "time_per_iteration": 2.709892749786377 + }, + { + "auxiliary_loss_clip": 0.01117537, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.04037631, + "balance_loss_mlp": 1.01930022, + "epoch": 0.6350217946790921, + "flos": 31006169163360.0, + "grad_norm": 2.338792134156386, + "language_loss": 0.72087681, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.74236447, + "num_input_tokens_seen": 227724920, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.11938477, + "step": 10562, + "time_per_iteration": 2.6978862285614014 + }, + { + "auxiliary_loss_clip": 0.01120334, + "auxiliary_loss_mlp": 0.01027193, + "balance_loss_clip": 1.04222608, + "balance_loss_mlp": 1.01474702, + "epoch": 0.63508191793176, + "flos": 24061039411680.0, + "grad_norm": 1.998484592976121, + "language_loss": 0.81047535, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.8319506, + "num_input_tokens_seen": 227743400, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12438965, + "step": 10563, + "time_per_iteration": 4.114362716674805 + }, + { + "auxiliary_loss_clip": 0.01122655, + "auxiliary_loss_mlp": 0.0103391, + "balance_loss_clip": 1.04475307, + "balance_loss_mlp": 1.02172041, + "epoch": 0.6351420411844281, + "flos": 22191514691040.0, + "grad_norm": 2.7399851953367, + "language_loss": 0.81261325, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.83417892, + "num_input_tokens_seen": 227759990, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12188721, + "step": 10564, + "time_per_iteration": 2.6621437072753906 + }, + { + "auxiliary_loss_clip": 0.01119902, + "auxiliary_loss_mlp": 0.01029155, + "balance_loss_clip": 1.04372823, + "balance_loss_mlp": 1.01746678, + "epoch": 0.6352021644370961, + "flos": 41154421464000.0, + "grad_norm": 1.5965112772212715, + "language_loss": 0.72231185, + "learning_rate": 1.240799222993407e-06, + "loss": 0.74380243, + "num_input_tokens_seen": 227780835, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11688232, + "step": 10565, + "time_per_iteration": 2.7206649780273438 + }, + { + "auxiliary_loss_clip": 0.01118783, + "auxiliary_loss_mlp": 0.01027834, + "balance_loss_clip": 1.04096317, + "balance_loss_mlp": 1.01515019, + "epoch": 0.635262287689764, + "flos": 24861425005440.0, + "grad_norm": 2.176580912154983, + "language_loss": 0.6928395, + "learning_rate": 1.240438926700324e-06, + "loss": 0.71430576, + "num_input_tokens_seen": 227798580, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12683105, + "step": 10566, + "time_per_iteration": 2.6244795322418213 + }, + { + "auxiliary_loss_clip": 0.01115319, + "auxiliary_loss_mlp": 0.01033686, + "balance_loss_clip": 1.0416832, + "balance_loss_mlp": 1.02232528, + "epoch": 0.635322410942432, + "flos": 33587237783520.0, + "grad_norm": 3.7792885489708032, + "language_loss": 0.69939613, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.72088623, + "num_input_tokens_seen": 227819210, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11358643, + "step": 10567, + "time_per_iteration": 2.693978786468506 + }, + { + "auxiliary_loss_clip": 0.01115527, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.04297459, + "balance_loss_mlp": 1.02171886, + "epoch": 0.6353825341950999, + "flos": 26287592117760.0, + "grad_norm": 2.365487339187902, + "language_loss": 0.84753656, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.86901939, + "num_input_tokens_seen": 227838340, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11035156, + "step": 10568, + "time_per_iteration": 2.6943347454071045 + }, + { + "auxiliary_loss_clip": 0.01120525, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.04318476, + "balance_loss_mlp": 1.02473497, + "epoch": 0.635442657447768, + "flos": 38754115545600.0, + "grad_norm": 2.059972555043032, + "language_loss": 0.8403216, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.86189675, + "num_input_tokens_seen": 227859170, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12261963, + "step": 10569, + "time_per_iteration": 2.7823994159698486 + }, + { + "auxiliary_loss_clip": 0.01117, + "auxiliary_loss_mlp": 0.01029485, + "balance_loss_clip": 1.04155374, + "balance_loss_mlp": 1.01728439, + "epoch": 0.6355027807004359, + "flos": 24195092040000.0, + "grad_norm": 1.8985262423612255, + "language_loss": 0.69069529, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.71216011, + "num_input_tokens_seen": 227878545, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12213135, + "step": 10570, + "time_per_iteration": 2.6560089588165283 + }, + { + "auxiliary_loss_clip": 0.01118955, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.04041886, + "balance_loss_mlp": 1.01783478, + "epoch": 0.6355629039531039, + "flos": 37061220764160.0, + "grad_norm": 1.8242740354522633, + "language_loss": 0.65980566, + "learning_rate": 1.2386378775476e-06, + "loss": 0.68129313, + "num_input_tokens_seen": 227898875, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.11962891, + "step": 10571, + "time_per_iteration": 2.739461660385132 + }, + { + "auxiliary_loss_clip": 0.01122095, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.04390955, + "balance_loss_mlp": 1.01649892, + "epoch": 0.6356230272057718, + "flos": 21880467468000.0, + "grad_norm": 1.8340998359321494, + "language_loss": 0.71222907, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.73373508, + "num_input_tokens_seen": 227917130, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12011719, + "step": 10572, + "time_per_iteration": 2.6083600521087646 + }, + { + "auxiliary_loss_clip": 0.0111509, + "auxiliary_loss_mlp": 0.01031491, + "balance_loss_clip": 1.04028535, + "balance_loss_mlp": 1.0201546, + "epoch": 0.6356831504584398, + "flos": 30963875473440.0, + "grad_norm": 1.6213077430347242, + "language_loss": 0.81297243, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.83443826, + "num_input_tokens_seen": 227939550, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11328125, + "step": 10573, + "time_per_iteration": 2.74137282371521 + }, + { + "auxiliary_loss_clip": 0.0112009, + "auxiliary_loss_mlp": 0.01031669, + "balance_loss_clip": 1.04238975, + "balance_loss_mlp": 1.02006423, + "epoch": 0.6357432737111077, + "flos": 56741667994080.0, + "grad_norm": 1.6586244590950785, + "language_loss": 0.68967611, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.71119368, + "num_input_tokens_seen": 227962200, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11602783, + "step": 10574, + "time_per_iteration": 2.8404486179351807 + }, + { + "auxiliary_loss_clip": 0.01116784, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.04234707, + "balance_loss_mlp": 1.01877809, + "epoch": 0.6358033969637757, + "flos": 21077974975680.0, + "grad_norm": 4.0017069249389134, + "language_loss": 0.86432672, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.88579869, + "num_input_tokens_seen": 227979270, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11639404, + "step": 10575, + "time_per_iteration": 2.6535863876342773 + }, + { + "auxiliary_loss_clip": 0.0111557, + "auxiliary_loss_mlp": 0.01036082, + "balance_loss_clip": 1.04048193, + "balance_loss_mlp": 1.02453029, + "epoch": 0.6358635202164437, + "flos": 33099114931200.0, + "grad_norm": 1.7865082265517256, + "language_loss": 0.72212893, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.74364543, + "num_input_tokens_seen": 228000550, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11541748, + "step": 10576, + "time_per_iteration": 2.647430658340454 + }, + { + "auxiliary_loss_clip": 0.0111965, + "auxiliary_loss_mlp": 0.01031862, + "balance_loss_clip": 1.04142284, + "balance_loss_mlp": 1.01957726, + "epoch": 0.6359236434691117, + "flos": 33586994679840.0, + "grad_norm": 1.5881844411972648, + "language_loss": 0.69583827, + "learning_rate": 1.236477571455085e-06, + "loss": 0.7173534, + "num_input_tokens_seen": 228022005, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12280273, + "step": 10577, + "time_per_iteration": 2.706105947494507 + }, + { + "auxiliary_loss_clip": 0.01117708, + "auxiliary_loss_mlp": 0.01028186, + "balance_loss_clip": 1.04157531, + "balance_loss_mlp": 1.01708722, + "epoch": 0.6359837667217797, + "flos": 48014275042080.0, + "grad_norm": 2.5042332728526717, + "language_loss": 0.72028565, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.74174452, + "num_input_tokens_seen": 228043770, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11108398, + "step": 10578, + "time_per_iteration": 2.795036792755127 + }, + { + "auxiliary_loss_clip": 0.01036418, + "auxiliary_loss_mlp": 0.01000604, + "balance_loss_clip": 1.01211572, + "balance_loss_mlp": 0.99943984, + "epoch": 0.6360438899744476, + "flos": 68836462014240.0, + "grad_norm": 0.7059354082025688, + "language_loss": 0.54453993, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56491017, + "num_input_tokens_seen": 228104985, + "router_z_loss_clip": 0.24304199, + "router_z_loss_mlp": 0.01164246, + "step": 10579, + "time_per_iteration": 3.3581583499908447 + }, + { + "auxiliary_loss_clip": 0.0112011, + "auxiliary_loss_mlp": 0.01029945, + "balance_loss_clip": 1.04357719, + "balance_loss_mlp": 1.01830387, + "epoch": 0.6361040132271156, + "flos": 30472551756000.0, + "grad_norm": 1.7703864227050272, + "language_loss": 0.77488202, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.79638255, + "num_input_tokens_seen": 228125620, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11645508, + "step": 10580, + "time_per_iteration": 2.6849465370178223 + }, + { + "auxiliary_loss_clip": 0.01118258, + "auxiliary_loss_mlp": 0.01026452, + "balance_loss_clip": 1.04203236, + "balance_loss_mlp": 1.01513886, + "epoch": 0.6361641364797835, + "flos": 28381024092960.0, + "grad_norm": 3.191438029354655, + "language_loss": 0.66546589, + "learning_rate": 1.235037946268301e-06, + "loss": 0.68691301, + "num_input_tokens_seen": 228143495, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11315918, + "step": 10581, + "time_per_iteration": 2.6721138954162598 + }, + { + "auxiliary_loss_clip": 0.01115657, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.03951216, + "balance_loss_mlp": 1.01869392, + "epoch": 0.6362242597324516, + "flos": 31718969098560.0, + "grad_norm": 1.5810946941682167, + "language_loss": 0.68738925, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.70884788, + "num_input_tokens_seen": 228166500, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11517334, + "step": 10582, + "time_per_iteration": 2.7200372219085693 + }, + { + "auxiliary_loss_clip": 0.01119479, + "auxiliary_loss_mlp": 0.01038319, + "balance_loss_clip": 1.04216492, + "balance_loss_mlp": 1.02663648, + "epoch": 0.6362843829851195, + "flos": 31363480769760.0, + "grad_norm": 1.9789582339087064, + "language_loss": 0.84514892, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.86672693, + "num_input_tokens_seen": 228185325, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.11669922, + "step": 10583, + "time_per_iteration": 2.7296664714813232 + }, + { + "auxiliary_loss_clip": 0.01117123, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.04325724, + "balance_loss_mlp": 1.02117205, + "epoch": 0.6363445062377875, + "flos": 25041985120800.0, + "grad_norm": 1.8086098689484282, + "language_loss": 0.75425112, + "learning_rate": 1.233958531908538e-06, + "loss": 0.77574992, + "num_input_tokens_seen": 228204050, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11602783, + "step": 10584, + "time_per_iteration": 2.7043190002441406 + }, + { + "auxiliary_loss_clip": 0.01120561, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.04232407, + "balance_loss_mlp": 1.02106285, + "epoch": 0.6364046294904554, + "flos": 23749668050400.0, + "grad_norm": 1.7655534767179135, + "language_loss": 0.73165929, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.75320399, + "num_input_tokens_seen": 228222430, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12841797, + "step": 10585, + "time_per_iteration": 2.633521318435669 + }, + { + "auxiliary_loss_clip": 0.01117902, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.04209185, + "balance_loss_mlp": 1.01650095, + "epoch": 0.6364647527431234, + "flos": 25619273805600.0, + "grad_norm": 1.8490050785870604, + "language_loss": 0.82735723, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.84881461, + "num_input_tokens_seen": 228241925, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11334229, + "step": 10586, + "time_per_iteration": 2.7171471118927 + }, + { + "auxiliary_loss_clip": 0.01115593, + "auxiliary_loss_mlp": 0.0102625, + "balance_loss_clip": 1.04129732, + "balance_loss_mlp": 1.01471686, + "epoch": 0.6365248759957913, + "flos": 31446406941120.0, + "grad_norm": 1.7731359388404067, + "language_loss": 0.72464943, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.74606788, + "num_input_tokens_seen": 228262535, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11535645, + "step": 10587, + "time_per_iteration": 2.655313491821289 + }, + { + "auxiliary_loss_clip": 0.01118241, + "auxiliary_loss_mlp": 0.01025528, + "balance_loss_clip": 1.04233658, + "balance_loss_mlp": 1.01439345, + "epoch": 0.6365849992484593, + "flos": 27400483556640.0, + "grad_norm": 2.013133265732514, + "language_loss": 0.76474202, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.78617972, + "num_input_tokens_seen": 228281340, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11132812, + "step": 10588, + "time_per_iteration": 2.681069850921631 + }, + { + "auxiliary_loss_clip": 0.01114907, + "auxiliary_loss_mlp": 0.01028707, + "balance_loss_clip": 1.0418762, + "balance_loss_mlp": 1.01641655, + "epoch": 0.6366451225011273, + "flos": 23215078228320.0, + "grad_norm": 1.6109154711733649, + "language_loss": 0.79900438, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.82044053, + "num_input_tokens_seen": 228300865, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12298584, + "step": 10589, + "time_per_iteration": 2.636995553970337 + }, + { + "auxiliary_loss_clip": 0.01115948, + "auxiliary_loss_mlp": 0.01030921, + "balance_loss_clip": 1.04168665, + "balance_loss_mlp": 1.01929235, + "epoch": 0.6367052457537953, + "flos": 30784085186400.0, + "grad_norm": 2.795512303496472, + "language_loss": 0.67011046, + "learning_rate": 1.231800487863257e-06, + "loss": 0.69157916, + "num_input_tokens_seen": 228320815, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11627197, + "step": 10590, + "time_per_iteration": 4.169891595840454 + }, + { + "auxiliary_loss_clip": 0.01123476, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.04222095, + "balance_loss_mlp": 1.02071548, + "epoch": 0.6367653690064633, + "flos": 23438053585440.0, + "grad_norm": 1.6459367425618001, + "language_loss": 0.7930128, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.8145777, + "num_input_tokens_seen": 228339065, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.1229248, + "step": 10591, + "time_per_iteration": 2.6727054119110107 + }, + { + "auxiliary_loss_clip": 0.01116202, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.0427711, + "balance_loss_mlp": 1.01585698, + "epoch": 0.6368254922591312, + "flos": 28732379659200.0, + "grad_norm": 2.95637575850368, + "language_loss": 0.8896476, + "learning_rate": 1.231081372744317e-06, + "loss": 0.91107738, + "num_input_tokens_seen": 228359210, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10925293, + "step": 10592, + "time_per_iteration": 2.6873440742492676 + }, + { + "auxiliary_loss_clip": 0.01113364, + "auxiliary_loss_mlp": 0.01026982, + "balance_loss_clip": 1.04000092, + "balance_loss_mlp": 1.01650941, + "epoch": 0.6368856155117992, + "flos": 32296055196960.0, + "grad_norm": 1.996438451559088, + "language_loss": 0.68002576, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.70142925, + "num_input_tokens_seen": 228379630, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.10461426, + "step": 10593, + "time_per_iteration": 3.898116111755371 + }, + { + "auxiliary_loss_clip": 0.01114115, + "auxiliary_loss_mlp": 0.01033483, + "balance_loss_clip": 1.03927684, + "balance_loss_mlp": 1.02207518, + "epoch": 0.6369457387644671, + "flos": 41112613981440.0, + "grad_norm": 2.1663107119608886, + "language_loss": 0.63493496, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.65641093, + "num_input_tokens_seen": 228401410, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11413574, + "step": 10594, + "time_per_iteration": 2.7973906993865967 + }, + { + "auxiliary_loss_clip": 0.01036126, + "auxiliary_loss_mlp": 0.01001925, + "balance_loss_clip": 1.01187968, + "balance_loss_mlp": 1.0007205, + "epoch": 0.6370058620171352, + "flos": 86523218111520.0, + "grad_norm": 0.7595520060869272, + "language_loss": 0.54596394, + "learning_rate": 1.230002918781022e-06, + "loss": 0.5663445, + "num_input_tokens_seen": 228470335, + "router_z_loss_clip": 0.24230957, + "router_z_loss_mlp": 0.01203156, + "step": 10595, + "time_per_iteration": 3.3916430473327637 + }, + { + "auxiliary_loss_clip": 0.01122521, + "auxiliary_loss_mlp": 0.01035259, + "balance_loss_clip": 1.04400718, + "balance_loss_mlp": 1.02305198, + "epoch": 0.6370659852698031, + "flos": 25797483918720.0, + "grad_norm": 5.604232685096969, + "language_loss": 0.66674459, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.68832242, + "num_input_tokens_seen": 228490765, + "router_z_loss_clip": 0.78515625, + "router_z_loss_mlp": 0.12207031, + "step": 10596, + "time_per_iteration": 2.6733245849609375 + }, + { + "auxiliary_loss_clip": 0.01118233, + "auxiliary_loss_mlp": 0.01030747, + "balance_loss_clip": 1.04205668, + "balance_loss_mlp": 1.01939273, + "epoch": 0.6371261085224711, + "flos": 24639867753120.0, + "grad_norm": 2.2359955618052414, + "language_loss": 0.78942204, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.81091177, + "num_input_tokens_seen": 228509700, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.1137085, + "step": 10597, + "time_per_iteration": 2.637953758239746 + }, + { + "auxiliary_loss_clip": 0.01119212, + "auxiliary_loss_mlp": 0.01034027, + "balance_loss_clip": 1.04356706, + "balance_loss_mlp": 1.02316678, + "epoch": 0.637186231775139, + "flos": 24017732789760.0, + "grad_norm": 2.2592541340121803, + "language_loss": 0.74870044, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.77023286, + "num_input_tokens_seen": 228529050, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.10870361, + "step": 10598, + "time_per_iteration": 2.6256229877471924 + }, + { + "auxiliary_loss_clip": 0.01117292, + "auxiliary_loss_mlp": 0.0103375, + "balance_loss_clip": 1.04120708, + "balance_loss_mlp": 1.02209115, + "epoch": 0.637246355027807, + "flos": 15949622796480.0, + "grad_norm": 2.5095868616306913, + "language_loss": 0.68329191, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.70480239, + "num_input_tokens_seen": 228544665, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11663818, + "step": 10599, + "time_per_iteration": 2.6273937225341797 + }, + { + "auxiliary_loss_clip": 0.01119805, + "auxiliary_loss_mlp": 0.01029549, + "balance_loss_clip": 1.0417788, + "balance_loss_mlp": 1.01741993, + "epoch": 0.6373064782804749, + "flos": 22235185968480.0, + "grad_norm": 4.272370398635073, + "language_loss": 0.80590272, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.82739627, + "num_input_tokens_seen": 228562060, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12139893, + "step": 10600, + "time_per_iteration": 4.137358903884888 + }, + { + "auxiliary_loss_clip": 0.01115382, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.03988123, + "balance_loss_mlp": 1.0200088, + "epoch": 0.637366601533143, + "flos": 29893399276320.0, + "grad_norm": 1.7031947771877989, + "language_loss": 0.79784548, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.81931126, + "num_input_tokens_seen": 228582550, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11175537, + "step": 10601, + "time_per_iteration": 2.6725521087646484 + }, + { + "auxiliary_loss_clip": 0.01119158, + "auxiliary_loss_mlp": 0.01028058, + "balance_loss_clip": 1.04186511, + "balance_loss_mlp": 1.01672745, + "epoch": 0.6374267247858109, + "flos": 32163825846240.0, + "grad_norm": 2.6034973269263357, + "language_loss": 0.66489953, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.68637168, + "num_input_tokens_seen": 228604960, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.11346436, + "step": 10602, + "time_per_iteration": 4.156108856201172 + }, + { + "auxiliary_loss_clip": 0.01113639, + "auxiliary_loss_mlp": 0.01025927, + "balance_loss_clip": 1.03922272, + "balance_loss_mlp": 1.01460242, + "epoch": 0.6374868480384789, + "flos": 24857981036640.0, + "grad_norm": 1.818068149318954, + "language_loss": 0.79835373, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.81974941, + "num_input_tokens_seen": 228622195, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11315918, + "step": 10603, + "time_per_iteration": 2.616460084915161 + }, + { + "auxiliary_loss_clip": 0.01116575, + "auxiliary_loss_mlp": 0.01026985, + "balance_loss_clip": 1.0414046, + "balance_loss_mlp": 1.01486707, + "epoch": 0.6375469712911469, + "flos": 25619678978400.0, + "grad_norm": 1.937411347443575, + "language_loss": 0.76974857, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.79118419, + "num_input_tokens_seen": 228639735, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12115479, + "step": 10604, + "time_per_iteration": 2.699765205383301 + }, + { + "auxiliary_loss_clip": 0.01119399, + "auxiliary_loss_mlp": 0.01027201, + "balance_loss_clip": 1.04088449, + "balance_loss_mlp": 1.0155009, + "epoch": 0.6376070945438148, + "flos": 24057433373760.0, + "grad_norm": 1.9788843796205915, + "language_loss": 0.76702589, + "learning_rate": 1.226409972197281e-06, + "loss": 0.78849185, + "num_input_tokens_seen": 228658195, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.11706543, + "step": 10605, + "time_per_iteration": 2.6692845821380615 + }, + { + "auxiliary_loss_clip": 0.01120207, + "auxiliary_loss_mlp": 0.01028339, + "balance_loss_clip": 1.04305947, + "balance_loss_mlp": 1.01538134, + "epoch": 0.6376672177964828, + "flos": 26242664804640.0, + "grad_norm": 2.33687598748383, + "language_loss": 0.65893388, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.68041933, + "num_input_tokens_seen": 228677415, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12963867, + "step": 10606, + "time_per_iteration": 2.6592235565185547 + }, + { + "auxiliary_loss_clip": 0.01114554, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.04160166, + "balance_loss_mlp": 1.02111745, + "epoch": 0.6377273410491507, + "flos": 22993156320480.0, + "grad_norm": 2.058624900864306, + "language_loss": 0.75433707, + "learning_rate": 1.225691734459971e-06, + "loss": 0.77580559, + "num_input_tokens_seen": 228696450, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11175537, + "step": 10607, + "time_per_iteration": 2.642598867416382 + }, + { + "auxiliary_loss_clip": 0.01118887, + "auxiliary_loss_mlp": 0.01035431, + "balance_loss_clip": 1.04245007, + "balance_loss_mlp": 1.02398729, + "epoch": 0.6377874643018188, + "flos": 64750345488000.0, + "grad_norm": 2.0707936760696306, + "language_loss": 0.65970302, + "learning_rate": 1.225332659627278e-06, + "loss": 0.68124616, + "num_input_tokens_seen": 228721600, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11456299, + "step": 10608, + "time_per_iteration": 2.906674861907959 + }, + { + "auxiliary_loss_clip": 0.01037228, + "auxiliary_loss_mlp": 0.01000827, + "balance_loss_clip": 1.01308751, + "balance_loss_mlp": 0.99977767, + "epoch": 0.6378475875544867, + "flos": 79478671655520.0, + "grad_norm": 0.7305861773223797, + "language_loss": 0.5190748, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.53945535, + "num_input_tokens_seen": 228784535, + "router_z_loss_clip": 0.24169922, + "router_z_loss_mlp": 0.01049805, + "step": 10609, + "time_per_iteration": 3.286686897277832 + }, + { + "auxiliary_loss_clip": 0.01113233, + "auxiliary_loss_mlp": 0.0102313, + "balance_loss_clip": 1.04006398, + "balance_loss_mlp": 1.01283097, + "epoch": 0.6379077108071547, + "flos": 28068194109600.0, + "grad_norm": 5.880280490348635, + "language_loss": 0.74951196, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.77087557, + "num_input_tokens_seen": 228804110, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10296631, + "step": 10610, + "time_per_iteration": 2.7591652870178223 + }, + { + "auxiliary_loss_clip": 0.01037207, + "auxiliary_loss_mlp": 0.01001601, + "balance_loss_clip": 1.01303029, + "balance_loss_mlp": 1.00050366, + "epoch": 0.6379678340598226, + "flos": 82486937839680.0, + "grad_norm": 0.8420205940923791, + "language_loss": 0.63152528, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65191329, + "num_input_tokens_seen": 228867705, + "router_z_loss_clip": 0.24206543, + "router_z_loss_mlp": 0.0109787, + "step": 10611, + "time_per_iteration": 3.3161659240722656 + }, + { + "auxiliary_loss_clip": 0.01115713, + "auxiliary_loss_mlp": 0.01031142, + "balance_loss_clip": 1.04070699, + "balance_loss_mlp": 1.01962614, + "epoch": 0.6380279573124906, + "flos": 36216313030080.0, + "grad_norm": 2.561225448783446, + "language_loss": 0.71668303, + "learning_rate": 1.223896654187282e-06, + "loss": 0.73815155, + "num_input_tokens_seen": 228889215, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11523438, + "step": 10612, + "time_per_iteration": 2.6956582069396973 + }, + { + "auxiliary_loss_clip": 0.01037343, + "auxiliary_loss_mlp": 0.01001395, + "balance_loss_clip": 1.01305866, + "balance_loss_mlp": 1.00030231, + "epoch": 0.6380880805651585, + "flos": 81124937398080.0, + "grad_norm": 0.7099226464273044, + "language_loss": 0.57852286, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.59891027, + "num_input_tokens_seen": 228948465, + "router_z_loss_clip": 0.24304199, + "router_z_loss_mlp": 0.01093292, + "step": 10613, + "time_per_iteration": 3.1530442237854004 + }, + { + "auxiliary_loss_clip": 0.01117356, + "auxiliary_loss_mlp": 0.01028858, + "balance_loss_clip": 1.04018605, + "balance_loss_mlp": 1.01675797, + "epoch": 0.6381482038178266, + "flos": 29181369169440.0, + "grad_norm": 1.856981952236241, + "language_loss": 0.75366807, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.77513015, + "num_input_tokens_seen": 228967955, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12097168, + "step": 10614, + "time_per_iteration": 2.7411248683929443 + }, + { + "auxiliary_loss_clip": 0.01119549, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.04366779, + "balance_loss_mlp": 1.02096081, + "epoch": 0.6382083270704945, + "flos": 29581703776800.0, + "grad_norm": 3.8517665827874956, + "language_loss": 0.79293621, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.8144592, + "num_input_tokens_seen": 228985495, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11767578, + "step": 10615, + "time_per_iteration": 2.6543657779693604 + }, + { + "auxiliary_loss_clip": 0.01036692, + "auxiliary_loss_mlp": 0.01000933, + "balance_loss_clip": 1.01250339, + "balance_loss_mlp": 0.99985862, + "epoch": 0.6382684503231625, + "flos": 86359675253760.0, + "grad_norm": 0.6603652037317992, + "language_loss": 0.55577844, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57615471, + "num_input_tokens_seen": 229052995, + "router_z_loss_clip": 0.24194336, + "router_z_loss_mlp": 0.01075745, + "step": 10616, + "time_per_iteration": 3.3300654888153076 + }, + { + "auxiliary_loss_clip": 0.01116375, + "auxiliary_loss_mlp": 0.01029194, + "balance_loss_clip": 1.04099989, + "balance_loss_mlp": 1.01728499, + "epoch": 0.6383285735758305, + "flos": 20187734755680.0, + "grad_norm": 1.7072246887040199, + "language_loss": 0.84066397, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.86211962, + "num_input_tokens_seen": 229071030, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11901855, + "step": 10617, + "time_per_iteration": 2.600947856903076 + }, + { + "auxiliary_loss_clip": 0.01118129, + "auxiliary_loss_mlp": 0.0103486, + "balance_loss_clip": 1.04157758, + "balance_loss_mlp": 1.02287316, + "epoch": 0.6383886968284984, + "flos": 17604275616000.0, + "grad_norm": 2.4106418228951942, + "language_loss": 0.87051284, + "learning_rate": 1.221743529196936e-06, + "loss": 0.89204276, + "num_input_tokens_seen": 229088275, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11975098, + "step": 10618, + "time_per_iteration": 2.6572771072387695 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.04384506, + "balance_loss_mlp": 1.02465713, + "epoch": 0.6384488200811664, + "flos": 21877752810240.0, + "grad_norm": 5.006535523754019, + "language_loss": 0.73064041, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.75220621, + "num_input_tokens_seen": 229105190, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.10333252, + "step": 10619, + "time_per_iteration": 2.601531505584717 + }, + { + "auxiliary_loss_clip": 0.01122777, + "auxiliary_loss_mlp": 0.01032953, + "balance_loss_clip": 1.04289901, + "balance_loss_mlp": 1.02030516, + "epoch": 0.6385089433338343, + "flos": 22592538092160.0, + "grad_norm": 3.777549141383651, + "language_loss": 0.76266789, + "learning_rate": 1.221026056814193e-06, + "loss": 0.78422511, + "num_input_tokens_seen": 229122290, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.12664795, + "step": 10620, + "time_per_iteration": 2.6723883152008057 + }, + { + "auxiliary_loss_clip": 0.01114881, + "auxiliary_loss_mlp": 0.01026459, + "balance_loss_clip": 1.04070854, + "balance_loss_mlp": 1.0151403, + "epoch": 0.6385690665865024, + "flos": 30204770637600.0, + "grad_norm": 2.662533126177933, + "language_loss": 0.7051596, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.72657299, + "num_input_tokens_seen": 229141620, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11322021, + "step": 10621, + "time_per_iteration": 2.642717123031616 + }, + { + "auxiliary_loss_clip": 0.0111051, + "auxiliary_loss_mlp": 0.01024052, + "balance_loss_clip": 1.03950369, + "balance_loss_mlp": 1.01360917, + "epoch": 0.6386291898391703, + "flos": 24551107093440.0, + "grad_norm": 2.5426082148129723, + "language_loss": 0.78015912, + "learning_rate": 1.220308702586529e-06, + "loss": 0.80150473, + "num_input_tokens_seen": 229161570, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10449219, + "step": 10622, + "time_per_iteration": 2.6924962997436523 + }, + { + "auxiliary_loss_clip": 0.01115039, + "auxiliary_loss_mlp": 0.01030181, + "balance_loss_clip": 1.0405767, + "balance_loss_mlp": 1.01891601, + "epoch": 0.6386893130918383, + "flos": 20581748667360.0, + "grad_norm": 2.0366004756650096, + "language_loss": 0.74432617, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.76577836, + "num_input_tokens_seen": 229178465, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11273193, + "step": 10623, + "time_per_iteration": 2.5978915691375732 + }, + { + "auxiliary_loss_clip": 0.01112347, + "auxiliary_loss_mlp": 0.01030796, + "balance_loss_clip": 1.03934026, + "balance_loss_mlp": 1.02024007, + "epoch": 0.6387494363445062, + "flos": 28023671969280.0, + "grad_norm": 1.5497042528971017, + "language_loss": 0.76840407, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.78983551, + "num_input_tokens_seen": 229198975, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10552979, + "step": 10624, + "time_per_iteration": 2.667428731918335 + }, + { + "auxiliary_loss_clip": 0.0111399, + "auxiliary_loss_mlp": 0.01028964, + "balance_loss_clip": 1.03916693, + "balance_loss_mlp": 1.01776409, + "epoch": 0.6388095595971742, + "flos": 27891118480320.0, + "grad_norm": 1.628378918897642, + "language_loss": 0.80291706, + "learning_rate": 1.21923289302382e-06, + "loss": 0.8243466, + "num_input_tokens_seen": 229218825, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11199951, + "step": 10625, + "time_per_iteration": 2.675881862640381 + }, + { + "auxiliary_loss_clip": 0.01121346, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.04497278, + "balance_loss_mlp": 1.01878798, + "epoch": 0.6388696828498421, + "flos": 21123266944320.0, + "grad_norm": 2.2280193545259936, + "language_loss": 0.7275939, + "learning_rate": 1.218874349031654e-06, + "loss": 0.74911493, + "num_input_tokens_seen": 229236060, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11962891, + "step": 10626, + "time_per_iteration": 2.622248411178589 + }, + { + "auxiliary_loss_clip": 0.0111675, + "auxiliary_loss_mlp": 0.01029679, + "balance_loss_clip": 1.04033768, + "balance_loss_mlp": 1.01726365, + "epoch": 0.6389298061025102, + "flos": 20900129518080.0, + "grad_norm": 2.648396378078977, + "language_loss": 0.72449243, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.74595666, + "num_input_tokens_seen": 229255160, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12414551, + "step": 10627, + "time_per_iteration": 2.6300275325775146 + }, + { + "auxiliary_loss_clip": 0.01121312, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.04149377, + "balance_loss_mlp": 1.0183655, + "epoch": 0.6389899293551781, + "flos": 33813373488480.0, + "grad_norm": 1.8652062463397356, + "language_loss": 0.67107427, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.69260216, + "num_input_tokens_seen": 229278705, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.13110352, + "step": 10628, + "time_per_iteration": 2.6817758083343506 + }, + { + "auxiliary_loss_clip": 0.01112533, + "auxiliary_loss_mlp": 0.01026589, + "balance_loss_clip": 1.0407331, + "balance_loss_mlp": 1.01573503, + "epoch": 0.6390500526078461, + "flos": 25887419579520.0, + "grad_norm": 3.4144604707765756, + "language_loss": 0.67895752, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.70034873, + "num_input_tokens_seen": 229299990, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10852051, + "step": 10629, + "time_per_iteration": 2.76723051071167 + }, + { + "auxiliary_loss_clip": 0.01122909, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.04212022, + "balance_loss_mlp": 1.02067494, + "epoch": 0.6391101758605141, + "flos": 26333208224640.0, + "grad_norm": 1.579686296248123, + "language_loss": 0.75800711, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.77957314, + "num_input_tokens_seen": 229319230, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 0.13018799, + "step": 10630, + "time_per_iteration": 4.076160430908203 + }, + { + "auxiliary_loss_clip": 0.01114868, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.04100859, + "balance_loss_mlp": 1.01855433, + "epoch": 0.639170299113182, + "flos": 24282637181280.0, + "grad_norm": 2.036264788272965, + "language_loss": 0.70415151, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.72558737, + "num_input_tokens_seen": 229338600, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.10162354, + "step": 10631, + "time_per_iteration": 2.6643893718719482 + }, + { + "auxiliary_loss_clip": 0.01035394, + "auxiliary_loss_mlp": 0.010009, + "balance_loss_clip": 1.01146269, + "balance_loss_mlp": 0.99979562, + "epoch": 0.63923042236585, + "flos": 85265982655200.0, + "grad_norm": 0.7686588371879217, + "language_loss": 0.62938058, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.64974344, + "num_input_tokens_seen": 229402420, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 0.01105499, + "step": 10632, + "time_per_iteration": 4.715872049331665 + }, + { + "auxiliary_loss_clip": 0.01114169, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.04024637, + "balance_loss_mlp": 1.02020037, + "epoch": 0.639290545618518, + "flos": 27667981054080.0, + "grad_norm": 2.352799959176852, + "language_loss": 0.66270286, + "learning_rate": 1.216365371217893e-06, + "loss": 0.68416053, + "num_input_tokens_seen": 229419185, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11401367, + "step": 10633, + "time_per_iteration": 2.7301135063171387 + }, + { + "auxiliary_loss_clip": 0.01116658, + "auxiliary_loss_mlp": 0.01024638, + "balance_loss_clip": 1.04223835, + "balance_loss_mlp": 1.01350343, + "epoch": 0.639350668871186, + "flos": 24195497212800.0, + "grad_norm": 1.9826808882073241, + "language_loss": 0.81826043, + "learning_rate": 1.216007064569225e-06, + "loss": 0.8396734, + "num_input_tokens_seen": 229436735, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.1114502, + "step": 10634, + "time_per_iteration": 2.644118070602417 + }, + { + "auxiliary_loss_clip": 0.01117844, + "auxiliary_loss_mlp": 0.01032768, + "balance_loss_clip": 1.04276896, + "balance_loss_mlp": 1.02075183, + "epoch": 0.6394107921238539, + "flos": 25079659840800.0, + "grad_norm": 1.5568730879703305, + "language_loss": 0.74964195, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.77114803, + "num_input_tokens_seen": 229455595, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12030029, + "step": 10635, + "time_per_iteration": 2.650949239730835 + }, + { + "auxiliary_loss_clip": 0.01115862, + "auxiliary_loss_mlp": 0.01026815, + "balance_loss_clip": 1.03984404, + "balance_loss_mlp": 1.01553154, + "epoch": 0.6394709153765219, + "flos": 31451147462880.0, + "grad_norm": 1.8041741468567227, + "language_loss": 0.71432042, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.73574716, + "num_input_tokens_seen": 229476230, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.112854, + "step": 10636, + "time_per_iteration": 2.6818859577178955 + }, + { + "auxiliary_loss_clip": 0.01119833, + "auxiliary_loss_mlp": 0.010298, + "balance_loss_clip": 1.04211569, + "balance_loss_mlp": 1.0177238, + "epoch": 0.6395310386291898, + "flos": 21390075648000.0, + "grad_norm": 11.492457244501953, + "language_loss": 0.73720801, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.7587043, + "num_input_tokens_seen": 229494300, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12072754, + "step": 10637, + "time_per_iteration": 2.6743009090423584 + }, + { + "auxiliary_loss_clip": 0.01117207, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.04014468, + "balance_loss_mlp": 1.01816821, + "epoch": 0.6395911618818578, + "flos": 22681217717280.0, + "grad_norm": 2.033787102217392, + "language_loss": 0.77774632, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.79921943, + "num_input_tokens_seen": 229512985, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.1192627, + "step": 10638, + "time_per_iteration": 2.5974693298339844 + }, + { + "auxiliary_loss_clip": 0.01114998, + "auxiliary_loss_mlp": 0.01029527, + "balance_loss_clip": 1.04069984, + "balance_loss_mlp": 1.01721287, + "epoch": 0.6396512851345257, + "flos": 34613475461280.0, + "grad_norm": 2.3515146323571305, + "language_loss": 0.81611526, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.83756047, + "num_input_tokens_seen": 229534270, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12322998, + "step": 10639, + "time_per_iteration": 4.175678491592407 + }, + { + "auxiliary_loss_clip": 0.01035344, + "auxiliary_loss_mlp": 0.01000408, + "balance_loss_clip": 1.01126087, + "balance_loss_mlp": 0.99933958, + "epoch": 0.6397114083871938, + "flos": 86298500511360.0, + "grad_norm": 0.8106841894967476, + "language_loss": 0.59007466, + "learning_rate": 1.21385784946359e-06, + "loss": 0.61043215, + "num_input_tokens_seen": 229596455, + "router_z_loss_clip": 0.24108887, + "router_z_loss_mlp": 0.01069641, + "step": 10640, + "time_per_iteration": 3.22493577003479 + }, + { + "auxiliary_loss_clip": 0.01112257, + "auxiliary_loss_mlp": 0.01022973, + "balance_loss_clip": 1.04026675, + "balance_loss_mlp": 1.01245308, + "epoch": 0.6397715316398617, + "flos": 22138078749120.0, + "grad_norm": 2.003104263932562, + "language_loss": 0.78327405, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.80462635, + "num_input_tokens_seen": 229612860, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10516357, + "step": 10641, + "time_per_iteration": 2.641826629638672 + }, + { + "auxiliary_loss_clip": 0.01120788, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.04076111, + "balance_loss_mlp": 1.02114248, + "epoch": 0.6398316548925297, + "flos": 31407759806400.0, + "grad_norm": 1.906519265965143, + "language_loss": 0.63461411, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.65615445, + "num_input_tokens_seen": 229633960, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12115479, + "step": 10642, + "time_per_iteration": 4.128183841705322 + }, + { + "auxiliary_loss_clip": 0.01035889, + "auxiliary_loss_mlp": 0.01001416, + "balance_loss_clip": 1.01184821, + "balance_loss_mlp": 1.00035882, + "epoch": 0.6398917781451977, + "flos": 86898269936160.0, + "grad_norm": 0.9230727037884636, + "language_loss": 0.55923569, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.57960868, + "num_input_tokens_seen": 229686730, + "router_z_loss_clip": 0.24035645, + "router_z_loss_mlp": 0.01058197, + "step": 10643, + "time_per_iteration": 3.192310333251953 + }, + { + "auxiliary_loss_clip": 0.01120875, + "auxiliary_loss_mlp": 0.01024381, + "balance_loss_clip": 1.0426867, + "balance_loss_mlp": 1.01298451, + "epoch": 0.6399519013978656, + "flos": 25040769602400.0, + "grad_norm": 2.0347614858759586, + "language_loss": 0.76663446, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.78808701, + "num_input_tokens_seen": 229704800, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.11383057, + "step": 10644, + "time_per_iteration": 2.64668869972229 + }, + { + "auxiliary_loss_clip": 0.0111815, + "auxiliary_loss_mlp": 0.01029062, + "balance_loss_clip": 1.0441246, + "balance_loss_mlp": 1.01754642, + "epoch": 0.6400120246505336, + "flos": 29847580583040.0, + "grad_norm": 1.5671232490674571, + "language_loss": 0.82436651, + "learning_rate": 1.212067656542203e-06, + "loss": 0.84583867, + "num_input_tokens_seen": 229725265, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.1151123, + "step": 10645, + "time_per_iteration": 2.7280144691467285 + }, + { + "auxiliary_loss_clip": 0.01121914, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.04253411, + "balance_loss_mlp": 1.0229075, + "epoch": 0.6400721479032015, + "flos": 34612948736640.0, + "grad_norm": 2.6109460558114708, + "language_loss": 0.7384572, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.76003611, + "num_input_tokens_seen": 229744840, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.13085938, + "step": 10646, + "time_per_iteration": 2.7254958152770996 + }, + { + "auxiliary_loss_clip": 0.01118285, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.04121566, + "balance_loss_mlp": 1.0184871, + "epoch": 0.6401322711558696, + "flos": 21740053626720.0, + "grad_norm": 2.231461830631724, + "language_loss": 0.79849255, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.81997991, + "num_input_tokens_seen": 229759095, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.11962891, + "step": 10647, + "time_per_iteration": 2.670353889465332 + }, + { + "auxiliary_loss_clip": 0.0111593, + "auxiliary_loss_mlp": 0.01027572, + "balance_loss_clip": 1.04286981, + "balance_loss_mlp": 1.01673591, + "epoch": 0.6401923944085375, + "flos": 31763329169760.0, + "grad_norm": 1.7739136489315805, + "language_loss": 0.76043308, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.7818681, + "num_input_tokens_seen": 229777750, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10827637, + "step": 10648, + "time_per_iteration": 2.6894030570983887 + }, + { + "auxiliary_loss_clip": 0.01116211, + "auxiliary_loss_mlp": 0.01028328, + "balance_loss_clip": 1.04040766, + "balance_loss_mlp": 1.01724792, + "epoch": 0.6402525176612055, + "flos": 28780467320160.0, + "grad_norm": 2.046592807648618, + "language_loss": 0.78902483, + "learning_rate": 1.210636039936138e-06, + "loss": 0.81047022, + "num_input_tokens_seen": 229796785, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11083984, + "step": 10649, + "time_per_iteration": 2.6966440677642822 + }, + { + "auxiliary_loss_clip": 0.01116058, + "auxiliary_loss_mlp": 0.01036024, + "balance_loss_clip": 1.04146409, + "balance_loss_mlp": 1.02359021, + "epoch": 0.6403126409138734, + "flos": 22013912337120.0, + "grad_norm": 2.065149677012464, + "language_loss": 0.75313646, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.77465731, + "num_input_tokens_seen": 229815425, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12445068, + "step": 10650, + "time_per_iteration": 2.6158673763275146 + }, + { + "auxiliary_loss_clip": 0.01116826, + "auxiliary_loss_mlp": 0.01030612, + "balance_loss_clip": 1.04166162, + "balance_loss_mlp": 1.01854825, + "epoch": 0.6403727641665414, + "flos": 26819669868480.0, + "grad_norm": 1.4808639286248353, + "language_loss": 0.70334095, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.72481537, + "num_input_tokens_seen": 229834545, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12072754, + "step": 10651, + "time_per_iteration": 2.66196608543396 + }, + { + "auxiliary_loss_clip": 0.01116176, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.04078364, + "balance_loss_mlp": 1.02188492, + "epoch": 0.6404328874192093, + "flos": 30378321263520.0, + "grad_norm": 3.04830442978586, + "language_loss": 0.63783717, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.6593405, + "num_input_tokens_seen": 229849175, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12280273, + "step": 10652, + "time_per_iteration": 2.6527607440948486 + }, + { + "auxiliary_loss_clip": 0.0111695, + "auxiliary_loss_mlp": 0.01025843, + "balance_loss_clip": 1.04212117, + "balance_loss_mlp": 1.01452446, + "epoch": 0.6404930106718774, + "flos": 21472434577440.0, + "grad_norm": 3.518697070621156, + "language_loss": 0.79257393, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.81400192, + "num_input_tokens_seen": 229865400, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11315918, + "step": 10653, + "time_per_iteration": 2.7005510330200195 + }, + { + "auxiliary_loss_clip": 0.011247, + "auxiliary_loss_mlp": 0.01046487, + "balance_loss_clip": 1.04135895, + "balance_loss_mlp": 1.03323126, + "epoch": 0.6405531339245453, + "flos": 24596561131200.0, + "grad_norm": 4.192449159713711, + "language_loss": 0.70807844, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.72979027, + "num_input_tokens_seen": 229882945, + "router_z_loss_clip": 0.83349609, + "router_z_loss_mlp": 0.13250732, + "step": 10654, + "time_per_iteration": 2.7209768295288086 + }, + { + "auxiliary_loss_clip": 0.01120827, + "auxiliary_loss_mlp": 0.01037945, + "balance_loss_clip": 1.04221737, + "balance_loss_mlp": 1.02539802, + "epoch": 0.6406132571772133, + "flos": 26774337382560.0, + "grad_norm": 1.8814916689196992, + "language_loss": 0.72932088, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.75090861, + "num_input_tokens_seen": 229901590, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12554932, + "step": 10655, + "time_per_iteration": 2.677678108215332 + }, + { + "auxiliary_loss_clip": 0.01119301, + "auxiliary_loss_mlp": 0.01034941, + "balance_loss_clip": 1.04200804, + "balance_loss_mlp": 1.02299619, + "epoch": 0.6406733804298813, + "flos": 35103340556640.0, + "grad_norm": 1.6051198360015282, + "language_loss": 0.82510233, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.8466447, + "num_input_tokens_seen": 229922535, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.11962891, + "step": 10656, + "time_per_iteration": 2.750058650970459 + }, + { + "auxiliary_loss_clip": 0.01116442, + "auxiliary_loss_mlp": 0.01032818, + "balance_loss_clip": 1.0406692, + "balance_loss_mlp": 1.02194047, + "epoch": 0.6407335036825492, + "flos": 21301031367360.0, + "grad_norm": 2.25965399557124, + "language_loss": 0.72506756, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.74656016, + "num_input_tokens_seen": 229939575, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.10882568, + "step": 10657, + "time_per_iteration": 2.609736919403076 + }, + { + "auxiliary_loss_clip": 0.01117374, + "auxiliary_loss_mlp": 0.01034806, + "balance_loss_clip": 1.04157019, + "balance_loss_mlp": 1.02347505, + "epoch": 0.6407936269352172, + "flos": 26997596360640.0, + "grad_norm": 1.9639608763462315, + "language_loss": 0.77089357, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.79241538, + "num_input_tokens_seen": 229958840, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11334229, + "step": 10658, + "time_per_iteration": 2.6676158905029297 + }, + { + "auxiliary_loss_clip": 0.01120577, + "auxiliary_loss_mlp": 0.01037525, + "balance_loss_clip": 1.04311085, + "balance_loss_mlp": 1.02538383, + "epoch": 0.6408537501878852, + "flos": 28199977770240.0, + "grad_norm": 3.1644984770663296, + "language_loss": 0.76209092, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.78367198, + "num_input_tokens_seen": 229979680, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12133789, + "step": 10659, + "time_per_iteration": 2.650355577468872 + }, + { + "auxiliary_loss_clip": 0.0111602, + "auxiliary_loss_mlp": 0.01033099, + "balance_loss_clip": 1.04022121, + "balance_loss_mlp": 1.02096903, + "epoch": 0.6409138734405532, + "flos": 20099338751520.0, + "grad_norm": 5.187416465909818, + "language_loss": 0.78055382, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80204499, + "num_input_tokens_seen": 229996830, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12133789, + "step": 10660, + "time_per_iteration": 2.6478066444396973 + }, + { + "auxiliary_loss_clip": 0.01122277, + "auxiliary_loss_mlp": 0.0103658, + "balance_loss_clip": 1.04258692, + "balance_loss_mlp": 1.02426016, + "epoch": 0.6409739966932211, + "flos": 27794375916480.0, + "grad_norm": 2.4212416790309628, + "language_loss": 0.687428, + "learning_rate": 1.206344067135727e-06, + "loss": 0.70901656, + "num_input_tokens_seen": 230015115, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12329102, + "step": 10661, + "time_per_iteration": 2.6430864334106445 + }, + { + "auxiliary_loss_clip": 0.01114463, + "auxiliary_loss_mlp": 0.01038069, + "balance_loss_clip": 1.0415318, + "balance_loss_mlp": 1.02726901, + "epoch": 0.6410341199458891, + "flos": 30691232281440.0, + "grad_norm": 1.7036273361794354, + "language_loss": 0.76039797, + "learning_rate": 1.205986598033362e-06, + "loss": 0.78192329, + "num_input_tokens_seen": 230035515, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.10809326, + "step": 10662, + "time_per_iteration": 2.6849920749664307 + }, + { + "auxiliary_loss_clip": 0.01115014, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.03928018, + "balance_loss_mlp": 1.0190897, + "epoch": 0.641094243198557, + "flos": 33002899092000.0, + "grad_norm": 2.1405043195368374, + "language_loss": 0.70353854, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.7249912, + "num_input_tokens_seen": 230054355, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11157227, + "step": 10663, + "time_per_iteration": 2.6851565837860107 + }, + { + "auxiliary_loss_clip": 0.01118877, + "auxiliary_loss_mlp": 0.01042963, + "balance_loss_clip": 1.04188919, + "balance_loss_mlp": 1.02930796, + "epoch": 0.641154366451225, + "flos": 30962416851360.0, + "grad_norm": 1.928787551674507, + "language_loss": 0.67919612, + "learning_rate": 1.205271750169389e-06, + "loss": 0.70081449, + "num_input_tokens_seen": 230074605, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.13635254, + "step": 10664, + "time_per_iteration": 2.6936252117156982 + }, + { + "auxiliary_loss_clip": 0.01113215, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.03944409, + "balance_loss_mlp": 1.01842129, + "epoch": 0.6412144897038929, + "flos": 30692690903520.0, + "grad_norm": 1.8928729047352295, + "language_loss": 0.66427445, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.6857022, + "num_input_tokens_seen": 230093820, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1114502, + "step": 10665, + "time_per_iteration": 2.6832022666931152 + }, + { + "auxiliary_loss_clip": 0.01115415, + "auxiliary_loss_mlp": 0.01032298, + "balance_loss_clip": 1.04142523, + "balance_loss_mlp": 1.0208596, + "epoch": 0.641274612956561, + "flos": 28602662379840.0, + "grad_norm": 1.685436494425832, + "language_loss": 0.64542294, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.6669001, + "num_input_tokens_seen": 230114285, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11444092, + "step": 10666, + "time_per_iteration": 2.694120407104492 + }, + { + "auxiliary_loss_clip": 0.01118096, + "auxiliary_loss_mlp": 0.01037141, + "balance_loss_clip": 1.04155874, + "balance_loss_mlp": 1.02570868, + "epoch": 0.6413347362092289, + "flos": 23705348496480.0, + "grad_norm": 1.6976948131039917, + "language_loss": 0.7105006, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.73205298, + "num_input_tokens_seen": 230132760, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11437988, + "step": 10667, + "time_per_iteration": 2.6736507415771484 + }, + { + "auxiliary_loss_clip": 0.01128248, + "auxiliary_loss_mlp": 0.01043544, + "balance_loss_clip": 1.04538596, + "balance_loss_mlp": 1.03091335, + "epoch": 0.6413948594618969, + "flos": 20983501379520.0, + "grad_norm": 4.255473546951819, + "language_loss": 0.77312183, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.79483974, + "num_input_tokens_seen": 230149690, + "router_z_loss_clip": 0.82861328, + "router_z_loss_mlp": 0.12634277, + "step": 10668, + "time_per_iteration": 2.6724464893341064 + }, + { + "auxiliary_loss_clip": 0.0112035, + "auxiliary_loss_mlp": 0.01037932, + "balance_loss_clip": 1.04498947, + "balance_loss_mlp": 1.02629113, + "epoch": 0.6414549827145648, + "flos": 27174064230720.0, + "grad_norm": 1.83043914370025, + "language_loss": 0.67316228, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.69474506, + "num_input_tokens_seen": 230166950, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11639404, + "step": 10669, + "time_per_iteration": 4.153616905212402 + }, + { + "auxiliary_loss_clip": 0.01126722, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.04597878, + "balance_loss_mlp": 1.02283728, + "epoch": 0.6415151059672328, + "flos": 23965471848960.0, + "grad_norm": 1.7320256322205962, + "language_loss": 0.7819981, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.8036207, + "num_input_tokens_seen": 230184785, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.1270752, + "step": 10670, + "time_per_iteration": 2.6890194416046143 + }, + { + "auxiliary_loss_clip": 0.01121049, + "auxiliary_loss_mlp": 0.01032229, + "balance_loss_clip": 1.04142308, + "balance_loss_mlp": 1.01993871, + "epoch": 0.6415752292199008, + "flos": 18138946472640.0, + "grad_norm": 2.6889444604816024, + "language_loss": 0.88813365, + "learning_rate": 1.20277073264638e-06, + "loss": 0.90966642, + "num_input_tokens_seen": 230201385, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12298584, + "step": 10671, + "time_per_iteration": 3.946770429611206 + }, + { + "auxiliary_loss_clip": 0.01115154, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.04194486, + "balance_loss_mlp": 1.01687741, + "epoch": 0.6416353524725688, + "flos": 16759651502880.0, + "grad_norm": 1.6235861837525045, + "language_loss": 0.69422877, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.71566093, + "num_input_tokens_seen": 230220380, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11187744, + "step": 10672, + "time_per_iteration": 2.7055823802948 + }, + { + "auxiliary_loss_clip": 0.01121213, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.0417999, + "balance_loss_mlp": 1.01790822, + "epoch": 0.6416954757252368, + "flos": 29937880899360.0, + "grad_norm": 4.011933635064767, + "language_loss": 0.74311829, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.76464367, + "num_input_tokens_seen": 230239845, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.13427734, + "step": 10673, + "time_per_iteration": 2.639500856399536 + }, + { + "auxiliary_loss_clip": 0.01117114, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.04147625, + "balance_loss_mlp": 1.01917648, + "epoch": 0.6417555989779047, + "flos": 33812279521920.0, + "grad_norm": 1.9392289252453172, + "language_loss": 0.691113, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.71259952, + "num_input_tokens_seen": 230262420, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12359619, + "step": 10674, + "time_per_iteration": 2.7808351516723633 + }, + { + "auxiliary_loss_clip": 0.01121457, + "auxiliary_loss_mlp": 0.01029317, + "balance_loss_clip": 1.04082227, + "balance_loss_mlp": 1.01683605, + "epoch": 0.6418157222305727, + "flos": 25083387430560.0, + "grad_norm": 2.5791259821068966, + "language_loss": 0.66568661, + "learning_rate": 1.201342244560338e-06, + "loss": 0.68719447, + "num_input_tokens_seen": 230279950, + "router_z_loss_clip": 0.80664062, + "router_z_loss_mlp": 0.12481689, + "step": 10675, + "time_per_iteration": 2.6368062496185303 + }, + { + "auxiliary_loss_clip": 0.01119882, + "auxiliary_loss_mlp": 0.01037473, + "balance_loss_clip": 1.04403484, + "balance_loss_mlp": 1.02586198, + "epoch": 0.6418758454832406, + "flos": 27578572117920.0, + "grad_norm": 2.4117476326349214, + "language_loss": 0.66459459, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.68616813, + "num_input_tokens_seen": 230299705, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11633301, + "step": 10676, + "time_per_iteration": 2.668057680130005 + }, + { + "auxiliary_loss_clip": 0.01119761, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.04288268, + "balance_loss_mlp": 1.01791203, + "epoch": 0.6419359687359086, + "flos": 33404206114080.0, + "grad_norm": 2.066401461733894, + "language_loss": 0.75230592, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.77380955, + "num_input_tokens_seen": 230320030, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12701416, + "step": 10677, + "time_per_iteration": 2.713712215423584 + }, + { + "auxiliary_loss_clip": 0.0103687, + "auxiliary_loss_mlp": 0.01001233, + "balance_loss_clip": 1.01304054, + "balance_loss_mlp": 1.00016057, + "epoch": 0.6419960919885765, + "flos": 82061806656960.0, + "grad_norm": 0.7816381501737762, + "language_loss": 0.60751474, + "learning_rate": 1.200271196442818e-06, + "loss": 0.62789577, + "num_input_tokens_seen": 230381495, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 0.01073456, + "step": 10678, + "time_per_iteration": 3.3500335216522217 + }, + { + "auxiliary_loss_clip": 0.01115632, + "auxiliary_loss_mlp": 0.01033064, + "balance_loss_clip": 1.04174316, + "balance_loss_mlp": 1.02196562, + "epoch": 0.6420562152412446, + "flos": 24284217355200.0, + "grad_norm": 2.147571805303558, + "language_loss": 0.67410254, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.69558954, + "num_input_tokens_seen": 230401385, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11090088, + "step": 10679, + "time_per_iteration": 4.091147422790527 + }, + { + "auxiliary_loss_clip": 0.01120367, + "auxiliary_loss_mlp": 0.01029186, + "balance_loss_clip": 1.04351377, + "balance_loss_mlp": 1.0172528, + "epoch": 0.6421163384939125, + "flos": 30249373812480.0, + "grad_norm": 1.7420203552482352, + "language_loss": 0.7319954, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.75349092, + "num_input_tokens_seen": 230421340, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11938477, + "step": 10680, + "time_per_iteration": 2.6696512699127197 + }, + { + "auxiliary_loss_clip": 0.01115694, + "auxiliary_loss_mlp": 0.01026771, + "balance_loss_clip": 1.03963804, + "balance_loss_mlp": 1.01591122, + "epoch": 0.6421764617465805, + "flos": 31228820382240.0, + "grad_norm": 1.9193376728987015, + "language_loss": 0.68049866, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.70192331, + "num_input_tokens_seen": 230441270, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.10864258, + "step": 10681, + "time_per_iteration": 2.684640884399414 + }, + { + "auxiliary_loss_clip": 0.01115735, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.04064655, + "balance_loss_mlp": 1.01896477, + "epoch": 0.6422365849992484, + "flos": 17246923492320.0, + "grad_norm": 1.8330703558662411, + "language_loss": 0.7483083, + "learning_rate": 1.198843556910427e-06, + "loss": 0.7697717, + "num_input_tokens_seen": 230457455, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11639404, + "step": 10682, + "time_per_iteration": 4.083132982254028 + }, + { + "auxiliary_loss_clip": 0.01112274, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.04063439, + "balance_loss_mlp": 1.02203572, + "epoch": 0.6422967082519164, + "flos": 27312897898080.0, + "grad_norm": 2.717366100257604, + "language_loss": 0.79355538, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.8150053, + "num_input_tokens_seen": 230478955, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10693359, + "step": 10683, + "time_per_iteration": 2.701401710510254 + }, + { + "auxiliary_loss_clip": 0.01119499, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.04229999, + "balance_loss_mlp": 1.02027583, + "epoch": 0.6423568315045844, + "flos": 17872948114560.0, + "grad_norm": 1.779784174573866, + "language_loss": 0.6706751, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.69219905, + "num_input_tokens_seen": 230496425, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12615967, + "step": 10684, + "time_per_iteration": 2.653470039367676 + }, + { + "auxiliary_loss_clip": 0.01117373, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.04074025, + "balance_loss_mlp": 1.01851654, + "epoch": 0.6424169547572524, + "flos": 32743261946880.0, + "grad_norm": 4.551851792064359, + "language_loss": 0.71655011, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.73803258, + "num_input_tokens_seen": 230516245, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12341309, + "step": 10685, + "time_per_iteration": 2.7196028232574463 + }, + { + "auxiliary_loss_clip": 0.01113121, + "auxiliary_loss_mlp": 0.0103206, + "balance_loss_clip": 1.04007292, + "balance_loss_mlp": 1.02096128, + "epoch": 0.6424770780099204, + "flos": 27708127328160.0, + "grad_norm": 3.606093217565478, + "language_loss": 0.75408375, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77553558, + "num_input_tokens_seen": 230534745, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11090088, + "step": 10686, + "time_per_iteration": 2.716377019882202 + }, + { + "auxiliary_loss_clip": 0.01120826, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.04263639, + "balance_loss_mlp": 1.01773202, + "epoch": 0.6425372012625883, + "flos": 34739708254560.0, + "grad_norm": 1.904087189554099, + "language_loss": 0.68638849, + "learning_rate": 1.197059691144867e-06, + "loss": 0.70789981, + "num_input_tokens_seen": 230555895, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12591553, + "step": 10687, + "time_per_iteration": 2.734062910079956 + }, + { + "auxiliary_loss_clip": 0.01119987, + "auxiliary_loss_mlp": 0.01028916, + "balance_loss_clip": 1.04295564, + "balance_loss_mlp": 1.01732326, + "epoch": 0.6425973245152563, + "flos": 35815613767200.0, + "grad_norm": 2.9047663530736414, + "language_loss": 0.66305351, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.6845426, + "num_input_tokens_seen": 230577460, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.1159668, + "step": 10688, + "time_per_iteration": 2.7053470611572266 + }, + { + "auxiliary_loss_clip": 0.01116708, + "auxiliary_loss_mlp": 0.01029533, + "balance_loss_clip": 1.04087853, + "balance_loss_mlp": 1.0173378, + "epoch": 0.6426574477679242, + "flos": 20048536432800.0, + "grad_norm": 2.3272538916521297, + "language_loss": 0.73268151, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.75414395, + "num_input_tokens_seen": 230595030, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12213135, + "step": 10689, + "time_per_iteration": 2.670393705368042 + }, + { + "auxiliary_loss_clip": 0.0111531, + "auxiliary_loss_mlp": 0.01027034, + "balance_loss_clip": 1.04217982, + "balance_loss_mlp": 1.01599479, + "epoch": 0.6427175710205922, + "flos": 26644255447680.0, + "grad_norm": 4.664061149571956, + "language_loss": 0.72257304, + "learning_rate": 1.195989736948226e-06, + "loss": 0.7439965, + "num_input_tokens_seen": 230615135, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.1104126, + "step": 10690, + "time_per_iteration": 2.6873557567596436 + }, + { + "auxiliary_loss_clip": 0.01114273, + "auxiliary_loss_mlp": 0.01027512, + "balance_loss_clip": 1.04019332, + "balance_loss_mlp": 1.01606774, + "epoch": 0.6427776942732601, + "flos": 21702946148640.0, + "grad_norm": 2.066062798146607, + "language_loss": 0.77312064, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.7945385, + "num_input_tokens_seen": 230631965, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11437988, + "step": 10691, + "time_per_iteration": 2.67891001701355 + }, + { + "auxiliary_loss_clip": 0.01118987, + "auxiliary_loss_mlp": 0.0103284, + "balance_loss_clip": 1.04203439, + "balance_loss_mlp": 1.02143145, + "epoch": 0.6428378175259282, + "flos": 18407375867520.0, + "grad_norm": 2.2230039978190286, + "language_loss": 0.74462372, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.76614201, + "num_input_tokens_seen": 230649565, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.11419678, + "step": 10692, + "time_per_iteration": 2.642348051071167 + }, + { + "auxiliary_loss_clip": 0.01117443, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.04149806, + "balance_loss_mlp": 1.02186155, + "epoch": 0.6428979407785961, + "flos": 29092041267840.0, + "grad_norm": 2.1172604014612233, + "language_loss": 0.61849403, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.64000213, + "num_input_tokens_seen": 230669265, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.1151123, + "step": 10693, + "time_per_iteration": 2.754080057144165 + }, + { + "auxiliary_loss_clip": 0.01119165, + "auxiliary_loss_mlp": 0.01027815, + "balance_loss_clip": 1.04108953, + "balance_loss_mlp": 1.01569724, + "epoch": 0.6429580640312641, + "flos": 40178337828480.0, + "grad_norm": 1.9140917783027738, + "language_loss": 0.59849232, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.6199621, + "num_input_tokens_seen": 230690575, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12109375, + "step": 10694, + "time_per_iteration": 2.7385001182556152 + }, + { + "auxiliary_loss_clip": 0.01119347, + "auxiliary_loss_mlp": 0.01031165, + "balance_loss_clip": 1.04321015, + "balance_loss_mlp": 1.01952446, + "epoch": 0.643018187283932, + "flos": 25707750844320.0, + "grad_norm": 1.4879374571431305, + "language_loss": 0.79778886, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.81929398, + "num_input_tokens_seen": 230709420, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11651611, + "step": 10695, + "time_per_iteration": 2.7049317359924316 + }, + { + "auxiliary_loss_clip": 0.01117837, + "auxiliary_loss_mlp": 0.01035179, + "balance_loss_clip": 1.04155672, + "balance_loss_mlp": 1.02294803, + "epoch": 0.6430783105366, + "flos": 32609898112320.0, + "grad_norm": 2.038582597822046, + "language_loss": 0.73471242, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.75624263, + "num_input_tokens_seen": 230729350, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12231445, + "step": 10696, + "time_per_iteration": 2.6907382011413574 + }, + { + "auxiliary_loss_clip": 0.01114815, + "auxiliary_loss_mlp": 0.01026983, + "balance_loss_clip": 1.04265928, + "balance_loss_mlp": 1.01582491, + "epoch": 0.643138433789268, + "flos": 28915087190400.0, + "grad_norm": 1.960677497462904, + "language_loss": 0.75565898, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.77707696, + "num_input_tokens_seen": 230749220, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11157227, + "step": 10697, + "time_per_iteration": 2.7059109210968018 + }, + { + "auxiliary_loss_clip": 0.01114807, + "auxiliary_loss_mlp": 0.0102889, + "balance_loss_clip": 1.04101241, + "balance_loss_mlp": 1.01790524, + "epoch": 0.643198557041936, + "flos": 41734748944800.0, + "grad_norm": 1.9678786224298703, + "language_loss": 0.66008157, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.68151855, + "num_input_tokens_seen": 230770245, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.10986328, + "step": 10698, + "time_per_iteration": 2.849095106124878 + }, + { + "auxiliary_loss_clip": 0.01036492, + "auxiliary_loss_mlp": 0.00999697, + "balance_loss_clip": 1.01285541, + "balance_loss_mlp": 0.9986589, + "epoch": 0.643258680294604, + "flos": 82517967725760.0, + "grad_norm": 0.8553540477982379, + "language_loss": 0.63418084, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65454274, + "num_input_tokens_seen": 230837030, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 0.01038361, + "step": 10699, + "time_per_iteration": 3.2547831535339355 + }, + { + "auxiliary_loss_clip": 0.01114533, + "auxiliary_loss_mlp": 0.01024875, + "balance_loss_clip": 1.0419991, + "balance_loss_mlp": 1.01404524, + "epoch": 0.6433188035472719, + "flos": 30732877694880.0, + "grad_norm": 1.761210153656652, + "language_loss": 0.69571626, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.71711034, + "num_input_tokens_seen": 230856845, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10821533, + "step": 10700, + "time_per_iteration": 2.7408053874969482 + }, + { + "auxiliary_loss_clip": 0.01118161, + "auxiliary_loss_mlp": 0.01027373, + "balance_loss_clip": 1.04200554, + "balance_loss_mlp": 1.01572585, + "epoch": 0.6433789267999399, + "flos": 30473078480640.0, + "grad_norm": 2.0298940595380457, + "language_loss": 0.73621953, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.75767481, + "num_input_tokens_seen": 230878785, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11639404, + "step": 10701, + "time_per_iteration": 2.6970014572143555 + }, + { + "auxiliary_loss_clip": 0.01118835, + "auxiliary_loss_mlp": 0.01029186, + "balance_loss_clip": 1.04092312, + "balance_loss_mlp": 1.01655579, + "epoch": 0.6434390500526078, + "flos": 21433908994560.0, + "grad_norm": 4.260208642299309, + "language_loss": 0.81968033, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.84116054, + "num_input_tokens_seen": 230895445, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12628174, + "step": 10702, + "time_per_iteration": 2.6974453926086426 + }, + { + "auxiliary_loss_clip": 0.01113751, + "auxiliary_loss_mlp": 0.01033912, + "balance_loss_clip": 1.03975511, + "balance_loss_mlp": 1.0225271, + "epoch": 0.6434991733052758, + "flos": 25434013685760.0, + "grad_norm": 2.0362139975927147, + "language_loss": 0.7401216, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.76159823, + "num_input_tokens_seen": 230911375, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11383057, + "step": 10703, + "time_per_iteration": 2.645693302154541 + }, + { + "auxiliary_loss_clip": 0.01037232, + "auxiliary_loss_mlp": 0.01000514, + "balance_loss_clip": 1.01353467, + "balance_loss_mlp": 0.99941087, + "epoch": 0.6435592965579437, + "flos": 80648605074240.0, + "grad_norm": 0.6604807212254856, + "language_loss": 0.54632711, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56670451, + "num_input_tokens_seen": 230975990, + "router_z_loss_clip": 0.23718262, + "router_z_loss_mlp": 0.0110321, + "step": 10704, + "time_per_iteration": 3.230151653289795 + }, + { + "auxiliary_loss_clip": 0.01116983, + "auxiliary_loss_mlp": 0.01025909, + "balance_loss_clip": 1.04083896, + "balance_loss_mlp": 1.01540685, + "epoch": 0.6436194198106118, + "flos": 29003037504480.0, + "grad_norm": 1.9697510410581511, + "language_loss": 0.76889843, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.79032731, + "num_input_tokens_seen": 230997110, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.10498047, + "step": 10705, + "time_per_iteration": 2.700821876525879 + }, + { + "auxiliary_loss_clip": 0.01117808, + "auxiliary_loss_mlp": 0.01035459, + "balance_loss_clip": 1.04184091, + "balance_loss_mlp": 1.02472448, + "epoch": 0.6436795430632797, + "flos": 24684997652640.0, + "grad_norm": 1.8219253844838148, + "language_loss": 0.79356551, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.81509817, + "num_input_tokens_seen": 231015590, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.10742188, + "step": 10706, + "time_per_iteration": 2.7123754024505615 + }, + { + "auxiliary_loss_clip": 0.01115714, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.04002666, + "balance_loss_mlp": 1.0208993, + "epoch": 0.6437396663159477, + "flos": 24772421242080.0, + "grad_norm": 2.176270489655497, + "language_loss": 0.8024416, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.82392788, + "num_input_tokens_seen": 231033800, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12011719, + "step": 10707, + "time_per_iteration": 2.7326788902282715 + }, + { + "auxiliary_loss_clip": 0.01117601, + "auxiliary_loss_mlp": 0.01029357, + "balance_loss_clip": 1.04155648, + "balance_loss_mlp": 1.01825857, + "epoch": 0.6437997895686156, + "flos": 29136928063680.0, + "grad_norm": 1.773424625984761, + "language_loss": 0.85696846, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.878438, + "num_input_tokens_seen": 231053160, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11102295, + "step": 10708, + "time_per_iteration": 2.7009549140930176 + }, + { + "auxiliary_loss_clip": 0.01127951, + "auxiliary_loss_mlp": 0.01042291, + "balance_loss_clip": 1.04645002, + "balance_loss_mlp": 1.02948213, + "epoch": 0.6438599128212836, + "flos": 23170637122560.0, + "grad_norm": 2.816021629111908, + "language_loss": 0.65895689, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.68065929, + "num_input_tokens_seen": 231069470, + "router_z_loss_clip": 0.81494141, + "router_z_loss_mlp": 0.12817383, + "step": 10709, + "time_per_iteration": 4.092857360839844 + }, + { + "auxiliary_loss_clip": 0.01115631, + "auxiliary_loss_mlp": 0.01028808, + "balance_loss_clip": 1.04048038, + "balance_loss_mlp": 1.01714945, + "epoch": 0.6439200360739517, + "flos": 29402764352640.0, + "grad_norm": 1.9329471307642985, + "language_loss": 0.80560547, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.82704985, + "num_input_tokens_seen": 231088205, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11657715, + "step": 10710, + "time_per_iteration": 2.766697406768799 + }, + { + "auxiliary_loss_clip": 0.01112903, + "auxiliary_loss_mlp": 0.01025868, + "balance_loss_clip": 1.03870106, + "balance_loss_mlp": 1.01470351, + "epoch": 0.6439801593266196, + "flos": 38928192896160.0, + "grad_norm": 2.003177601194097, + "language_loss": 0.6646468, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.6860345, + "num_input_tokens_seen": 231107850, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11157227, + "step": 10711, + "time_per_iteration": 4.148433685302734 + }, + { + "auxiliary_loss_clip": 0.0112115, + "auxiliary_loss_mlp": 0.01030162, + "balance_loss_clip": 1.04462063, + "balance_loss_mlp": 1.01805043, + "epoch": 0.6440402825792876, + "flos": 33099520104000.0, + "grad_norm": 1.787434560772675, + "language_loss": 0.78673309, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.80824625, + "num_input_tokens_seen": 231127200, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12103271, + "step": 10712, + "time_per_iteration": 2.766991376876831 + }, + { + "auxiliary_loss_clip": 0.01120592, + "auxiliary_loss_mlp": 0.01032641, + "balance_loss_clip": 1.04173517, + "balance_loss_mlp": 1.02084565, + "epoch": 0.6441004058319555, + "flos": 25218777129120.0, + "grad_norm": 1.9535327068189736, + "language_loss": 0.82840538, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.84993774, + "num_input_tokens_seen": 231146360, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.11804199, + "step": 10713, + "time_per_iteration": 2.6748206615448 + }, + { + "auxiliary_loss_clip": 0.01116263, + "auxiliary_loss_mlp": 0.01034533, + "balance_loss_clip": 1.0437386, + "balance_loss_mlp": 1.02283263, + "epoch": 0.6441605290846235, + "flos": 31757859336960.0, + "grad_norm": 1.427923676622322, + "language_loss": 0.78608167, + "learning_rate": 1.187440012188684e-06, + "loss": 0.80758971, + "num_input_tokens_seen": 231168350, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11700439, + "step": 10714, + "time_per_iteration": 2.7245676517486572 + }, + { + "auxiliary_loss_clip": 0.01115893, + "auxiliary_loss_mlp": 0.01024015, + "balance_loss_clip": 1.04169631, + "balance_loss_mlp": 1.01326847, + "epoch": 0.6442206523372914, + "flos": 30382778164320.0, + "grad_norm": 2.8303163385787853, + "language_loss": 0.81409502, + "learning_rate": 1.187084157517583e-06, + "loss": 0.83549404, + "num_input_tokens_seen": 231188385, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.10742188, + "step": 10715, + "time_per_iteration": 2.703778028488159 + }, + { + "auxiliary_loss_clip": 0.01115906, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.03937149, + "balance_loss_mlp": 1.01816201, + "epoch": 0.6442807755899594, + "flos": 30696053837760.0, + "grad_norm": 1.9444833476994994, + "language_loss": 0.81375217, + "learning_rate": 1.186728333672332e-06, + "loss": 0.83521146, + "num_input_tokens_seen": 231209880, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11859131, + "step": 10716, + "time_per_iteration": 2.730769395828247 + }, + { + "auxiliary_loss_clip": 0.01120712, + "auxiliary_loss_mlp": 0.01035901, + "balance_loss_clip": 1.04227877, + "balance_loss_mlp": 1.02257371, + "epoch": 0.6443408988426274, + "flos": 33366328807680.0, + "grad_norm": 2.1328050268454417, + "language_loss": 0.7784965, + "learning_rate": 1.186372540666424e-06, + "loss": 0.80006266, + "num_input_tokens_seen": 231230765, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.13323975, + "step": 10717, + "time_per_iteration": 2.710514783859253 + }, + { + "auxiliary_loss_clip": 0.01115081, + "auxiliary_loss_mlp": 0.01031871, + "balance_loss_clip": 1.04225731, + "balance_loss_mlp": 1.02039099, + "epoch": 0.6444010220952954, + "flos": 34078764087360.0, + "grad_norm": 1.9988702580228401, + "language_loss": 0.68195742, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.70342696, + "num_input_tokens_seen": 231252350, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11480713, + "step": 10718, + "time_per_iteration": 2.73842716217041 + }, + { + "auxiliary_loss_clip": 0.01035309, + "auxiliary_loss_mlp": 0.01004314, + "balance_loss_clip": 1.01177013, + "balance_loss_mlp": 1.00329888, + "epoch": 0.6444611453479633, + "flos": 86899161316320.0, + "grad_norm": 0.7833491648509169, + "language_loss": 0.49587151, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51626772, + "num_input_tokens_seen": 231313865, + "router_z_loss_clip": 0.23547363, + "router_z_loss_mlp": 0.01014709, + "step": 10719, + "time_per_iteration": 4.886744260787964 + }, + { + "auxiliary_loss_clip": 0.01120406, + "auxiliary_loss_mlp": 0.01034834, + "balance_loss_clip": 1.04325819, + "balance_loss_mlp": 1.02279949, + "epoch": 0.6445212686006313, + "flos": 27705534222240.0, + "grad_norm": 1.932934751843708, + "language_loss": 0.78197861, + "learning_rate": 1.18530534681967e-06, + "loss": 0.80353093, + "num_input_tokens_seen": 231331710, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12042236, + "step": 10720, + "time_per_iteration": 2.7089037895202637 + }, + { + "auxiliary_loss_clip": 0.01115798, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.04064679, + "balance_loss_mlp": 1.02027643, + "epoch": 0.6445813918532992, + "flos": 25842451749120.0, + "grad_norm": 1.8852412225972206, + "language_loss": 0.77270311, + "learning_rate": 1.18494967730604e-06, + "loss": 0.79418927, + "num_input_tokens_seen": 231350705, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12542725, + "step": 10721, + "time_per_iteration": 2.656339168548584 + }, + { + "auxiliary_loss_clip": 0.01119345, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.04269505, + "balance_loss_mlp": 1.02118301, + "epoch": 0.6446415151059672, + "flos": 30739563046080.0, + "grad_norm": 1.9536867966288918, + "language_loss": 0.72677219, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.74830014, + "num_input_tokens_seen": 231369550, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.1227417, + "step": 10722, + "time_per_iteration": 4.017050743103027 + }, + { + "auxiliary_loss_clip": 0.01116102, + "auxiliary_loss_mlp": 0.01028715, + "balance_loss_clip": 1.04279876, + "balance_loss_mlp": 1.01742613, + "epoch": 0.6447016383586353, + "flos": 30874101881760.0, + "grad_norm": 1.5879175330496906, + "language_loss": 0.77786291, + "learning_rate": 1.184238431012635e-06, + "loss": 0.79931104, + "num_input_tokens_seen": 231389285, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11279297, + "step": 10723, + "time_per_iteration": 2.7230312824249268 + }, + { + "auxiliary_loss_clip": 0.01121921, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.04344702, + "balance_loss_mlp": 1.02400804, + "epoch": 0.6447617616113032, + "flos": 33804540721440.0, + "grad_norm": 1.7739032909061592, + "language_loss": 0.58829385, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.60987186, + "num_input_tokens_seen": 231408820, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.11877441, + "step": 10724, + "time_per_iteration": 2.690718650817871 + }, + { + "auxiliary_loss_clip": 0.01117278, + "auxiliary_loss_mlp": 0.01033104, + "balance_loss_clip": 1.04382408, + "balance_loss_mlp": 1.02195239, + "epoch": 0.6448218848639712, + "flos": 28112554180800.0, + "grad_norm": 1.8141042235211402, + "language_loss": 0.8372702, + "learning_rate": 1.183527308454271e-06, + "loss": 0.85877401, + "num_input_tokens_seen": 231428100, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11157227, + "step": 10725, + "time_per_iteration": 2.6659789085388184 + }, + { + "auxiliary_loss_clip": 0.01117298, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.04100919, + "balance_loss_mlp": 1.02333069, + "epoch": 0.6448820081166391, + "flos": 29890725135840.0, + "grad_norm": 2.181414109681825, + "language_loss": 0.81729406, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.83882666, + "num_input_tokens_seen": 231445810, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12615967, + "step": 10726, + "time_per_iteration": 2.645305633544922 + }, + { + "auxiliary_loss_clip": 0.01119434, + "auxiliary_loss_mlp": 0.01031062, + "balance_loss_clip": 1.04099095, + "balance_loss_mlp": 1.01895595, + "epoch": 0.6449421313693071, + "flos": 27356164002720.0, + "grad_norm": 1.8785090409742153, + "language_loss": 0.81414485, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.83564985, + "num_input_tokens_seen": 231463570, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12097168, + "step": 10727, + "time_per_iteration": 2.6711087226867676 + }, + { + "auxiliary_loss_clip": 0.01124935, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.04359245, + "balance_loss_mlp": 1.02268803, + "epoch": 0.645002254621975, + "flos": 24679811440800.0, + "grad_norm": 2.09361727228697, + "language_loss": 0.7890349, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.81063849, + "num_input_tokens_seen": 231482155, + "router_z_loss_clip": 0.81347656, + "router_z_loss_mlp": 0.12731934, + "step": 10728, + "time_per_iteration": 2.6357789039611816 + }, + { + "auxiliary_loss_clip": 0.0111946, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.04164374, + "balance_loss_mlp": 1.02117729, + "epoch": 0.645062377874643, + "flos": 33989800841280.0, + "grad_norm": 2.531207422045739, + "language_loss": 0.73938048, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.76091504, + "num_input_tokens_seen": 231502465, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12811279, + "step": 10729, + "time_per_iteration": 2.741767644882202 + }, + { + "auxiliary_loss_clip": 0.01120083, + "auxiliary_loss_mlp": 0.01032521, + "balance_loss_clip": 1.04247427, + "balance_loss_mlp": 1.01968861, + "epoch": 0.645122501127311, + "flos": 30873291536160.0, + "grad_norm": 1.6744385452841384, + "language_loss": 0.6646598, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.68618584, + "num_input_tokens_seen": 231522740, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12823486, + "step": 10730, + "time_per_iteration": 2.6470391750335693 + }, + { + "auxiliary_loss_clip": 0.011205, + "auxiliary_loss_mlp": 0.01034002, + "balance_loss_clip": 1.04329419, + "balance_loss_mlp": 1.02084088, + "epoch": 0.645182624379979, + "flos": 22948593662880.0, + "grad_norm": 1.6457978357242622, + "language_loss": 0.63766932, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.65921438, + "num_input_tokens_seen": 231542050, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.13165283, + "step": 10731, + "time_per_iteration": 2.6493489742279053 + }, + { + "auxiliary_loss_clip": 0.01115525, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.04112887, + "balance_loss_mlp": 1.02191508, + "epoch": 0.6452427476326469, + "flos": 22368833424000.0, + "grad_norm": 1.7021910872371322, + "language_loss": 0.681871, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.7033655, + "num_input_tokens_seen": 231560380, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12017822, + "step": 10732, + "time_per_iteration": 2.6828112602233887 + }, + { + "auxiliary_loss_clip": 0.01116224, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.0423758, + "balance_loss_mlp": 1.02031732, + "epoch": 0.6453028708853149, + "flos": 27802074199680.0, + "grad_norm": 2.27042072705785, + "language_loss": 0.75812513, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.77960724, + "num_input_tokens_seen": 231580810, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11676025, + "step": 10733, + "time_per_iteration": 2.7093896865844727 + }, + { + "auxiliary_loss_clip": 0.01124353, + "auxiliary_loss_mlp": 0.01039334, + "balance_loss_clip": 1.04528582, + "balance_loss_mlp": 1.0266856, + "epoch": 0.6453629941379828, + "flos": 29219692165920.0, + "grad_norm": 2.245885218485429, + "language_loss": 0.66873467, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.69037151, + "num_input_tokens_seen": 231600585, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12664795, + "step": 10734, + "time_per_iteration": 2.64091420173645 + }, + { + "auxiliary_loss_clip": 0.01115585, + "auxiliary_loss_mlp": 0.01037683, + "balance_loss_clip": 1.04391408, + "balance_loss_mlp": 1.02574468, + "epoch": 0.6454231173906508, + "flos": 21568488347520.0, + "grad_norm": 2.3433683238709184, + "language_loss": 0.74058664, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.76211929, + "num_input_tokens_seen": 231618765, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.1192627, + "step": 10735, + "time_per_iteration": 2.663362979888916 + }, + { + "auxiliary_loss_clip": 0.01119373, + "auxiliary_loss_mlp": 0.01036621, + "balance_loss_clip": 1.04280043, + "balance_loss_mlp": 1.02428889, + "epoch": 0.6454832406433189, + "flos": 28422264333600.0, + "grad_norm": 1.857909544616582, + "language_loss": 0.75219154, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.7737515, + "num_input_tokens_seen": 231638525, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12347412, + "step": 10736, + "time_per_iteration": 2.7032947540283203 + }, + { + "auxiliary_loss_clip": 0.01122339, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.04389453, + "balance_loss_mlp": 1.01844311, + "epoch": 0.6455433638959868, + "flos": 24595791302880.0, + "grad_norm": 2.099995408815855, + "language_loss": 0.70308548, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.72462094, + "num_input_tokens_seen": 231656785, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12744141, + "step": 10737, + "time_per_iteration": 2.631904363632202 + }, + { + "auxiliary_loss_clip": 0.01035781, + "auxiliary_loss_mlp": 0.01002776, + "balance_loss_clip": 1.01229346, + "balance_loss_mlp": 1.0017395, + "epoch": 0.6456034871486548, + "flos": 81183438000000.0, + "grad_norm": 0.7860337879537697, + "language_loss": 0.58436733, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.6047529, + "num_input_tokens_seen": 231719075, + "router_z_loss_clip": 0.23498535, + "router_z_loss_mlp": 0.01036835, + "step": 10738, + "time_per_iteration": 3.3817055225372314 + }, + { + "auxiliary_loss_clip": 0.01117209, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.04195321, + "balance_loss_mlp": 1.01844704, + "epoch": 0.6456636104013227, + "flos": 29536654911840.0, + "grad_norm": 2.294115881392437, + "language_loss": 0.74364138, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.76511842, + "num_input_tokens_seen": 231737810, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12054443, + "step": 10739, + "time_per_iteration": 2.656351089477539 + }, + { + "auxiliary_loss_clip": 0.01123101, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.04427385, + "balance_loss_mlp": 1.0192914, + "epoch": 0.6457237336539907, + "flos": 28826286013440.0, + "grad_norm": 2.401885384846356, + "language_loss": 0.71364784, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.73519987, + "num_input_tokens_seen": 231756140, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12805176, + "step": 10740, + "time_per_iteration": 2.748746633529663 + }, + { + "auxiliary_loss_clip": 0.01035622, + "auxiliary_loss_mlp": 0.01003296, + "balance_loss_clip": 1.01203847, + "balance_loss_mlp": 1.00223255, + "epoch": 0.6457838569066586, + "flos": 80344648375200.0, + "grad_norm": 0.6623322170575772, + "language_loss": 0.55253565, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57292479, + "num_input_tokens_seen": 231823665, + "router_z_loss_clip": 0.23620605, + "router_z_loss_mlp": 0.01064301, + "step": 10741, + "time_per_iteration": 3.2513227462768555 + }, + { + "auxiliary_loss_clip": 0.01118502, + "auxiliary_loss_mlp": 0.0103183, + "balance_loss_clip": 1.04389286, + "balance_loss_mlp": 1.02051139, + "epoch": 0.6458439801593266, + "flos": 27310709964960.0, + "grad_norm": 1.722104449840828, + "language_loss": 0.80748308, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.82898641, + "num_input_tokens_seen": 231844500, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11334229, + "step": 10742, + "time_per_iteration": 2.6941418647766113 + }, + { + "auxiliary_loss_clip": 0.01114564, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.04128647, + "balance_loss_mlp": 1.01953518, + "epoch": 0.6459041034119946, + "flos": 30248320363200.0, + "grad_norm": 1.9308544358252415, + "language_loss": 0.8194207, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.84087676, + "num_input_tokens_seen": 231864510, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.1151123, + "step": 10743, + "time_per_iteration": 2.7073707580566406 + }, + { + "auxiliary_loss_clip": 0.01116897, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.04259539, + "balance_loss_mlp": 1.0174042, + "epoch": 0.6459642266646626, + "flos": 22361135140800.0, + "grad_norm": 2.190476532320624, + "language_loss": 0.71859419, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.74005771, + "num_input_tokens_seen": 231881555, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.1204834, + "step": 10744, + "time_per_iteration": 2.6266555786132812 + }, + { + "auxiliary_loss_clip": 0.01117775, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.04160571, + "balance_loss_mlp": 1.01781082, + "epoch": 0.6460243499173305, + "flos": 53179734699360.0, + "grad_norm": 1.7715603695169577, + "language_loss": 0.66499418, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.68646342, + "num_input_tokens_seen": 231905945, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11340332, + "step": 10745, + "time_per_iteration": 2.8954055309295654 + }, + { + "auxiliary_loss_clip": 0.0111912, + "auxiliary_loss_mlp": 0.01036903, + "balance_loss_clip": 1.04347527, + "balance_loss_mlp": 1.02423739, + "epoch": 0.6460844731699985, + "flos": 23481562793760.0, + "grad_norm": 2.3039901752024017, + "language_loss": 0.73039055, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.7519508, + "num_input_tokens_seen": 231922535, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12658691, + "step": 10746, + "time_per_iteration": 2.604607343673706 + }, + { + "auxiliary_loss_clip": 0.01121284, + "auxiliary_loss_mlp": 0.01036322, + "balance_loss_clip": 1.04374003, + "balance_loss_mlp": 1.02437174, + "epoch": 0.6461445964226664, + "flos": 33499206434880.0, + "grad_norm": 1.5531287804324359, + "language_loss": 0.66688061, + "learning_rate": 1.175713157660413e-06, + "loss": 0.68845665, + "num_input_tokens_seen": 231944800, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.11950684, + "step": 10747, + "time_per_iteration": 2.7365880012512207 + }, + { + "auxiliary_loss_clip": 0.01119438, + "auxiliary_loss_mlp": 0.01040174, + "balance_loss_clip": 1.0436666, + "balance_loss_mlp": 1.02854514, + "epoch": 0.6462047196753344, + "flos": 24765249683520.0, + "grad_norm": 2.1039273030847014, + "language_loss": 0.67232662, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.69392276, + "num_input_tokens_seen": 231962970, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11627197, + "step": 10748, + "time_per_iteration": 4.266455173492432 + }, + { + "auxiliary_loss_clip": 0.01122771, + "auxiliary_loss_mlp": 0.01042973, + "balance_loss_clip": 1.04414213, + "balance_loss_mlp": 1.0298841, + "epoch": 0.6462648429280025, + "flos": 26866866149280.0, + "grad_norm": 2.6824184329169865, + "language_loss": 0.76047468, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.78213215, + "num_input_tokens_seen": 231981195, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13079834, + "step": 10749, + "time_per_iteration": 2.6319477558135986 + }, + { + "auxiliary_loss_clip": 0.01120288, + "auxiliary_loss_mlp": 0.01034835, + "balance_loss_clip": 1.04234278, + "balance_loss_mlp": 1.02255654, + "epoch": 0.6463249661806704, + "flos": 33899257421280.0, + "grad_norm": 1.78848794190529, + "language_loss": 0.76678163, + "learning_rate": 1.17464876058473e-06, + "loss": 0.78833282, + "num_input_tokens_seen": 232001735, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.1229248, + "step": 10750, + "time_per_iteration": 4.009809732437134 + }, + { + "auxiliary_loss_clip": 0.01123398, + "auxiliary_loss_mlp": 0.01034511, + "balance_loss_clip": 1.0439043, + "balance_loss_mlp": 1.021106, + "epoch": 0.6463850894333384, + "flos": 26906607250560.0, + "grad_norm": 2.101266600879209, + "language_loss": 0.68406391, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.705643, + "num_input_tokens_seen": 232019830, + "router_z_loss_clip": 0.79492188, + "router_z_loss_mlp": 0.1340332, + "step": 10751, + "time_per_iteration": 2.6142067909240723 + }, + { + "auxiliary_loss_clip": 0.011188, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.04173625, + "balance_loss_mlp": 1.01878595, + "epoch": 0.6464452126860063, + "flos": 25753853158560.0, + "grad_norm": 2.3006941152488967, + "language_loss": 0.71152616, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.73302722, + "num_input_tokens_seen": 232039625, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12518311, + "step": 10752, + "time_per_iteration": 2.6749801635742188 + }, + { + "auxiliary_loss_clip": 0.01124389, + "auxiliary_loss_mlp": 0.01038328, + "balance_loss_clip": 1.04530931, + "balance_loss_mlp": 1.02438021, + "epoch": 0.6465053359386743, + "flos": 19557658405440.0, + "grad_norm": 1.736252283309113, + "language_loss": 0.78134179, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.80296898, + "num_input_tokens_seen": 232055855, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.13952637, + "step": 10753, + "time_per_iteration": 2.594012498855591 + }, + { + "auxiliary_loss_clip": 0.01118919, + "auxiliary_loss_mlp": 0.01042327, + "balance_loss_clip": 1.0440942, + "balance_loss_mlp": 1.03024542, + "epoch": 0.6465654591913422, + "flos": 28551819543840.0, + "grad_norm": 2.1146012565736423, + "language_loss": 0.85189235, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.87350476, + "num_input_tokens_seen": 232073475, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12097168, + "step": 10754, + "time_per_iteration": 2.6523382663726807 + }, + { + "auxiliary_loss_clip": 0.01118839, + "auxiliary_loss_mlp": 0.01033926, + "balance_loss_clip": 1.04341531, + "balance_loss_mlp": 1.02158153, + "epoch": 0.6466255824440102, + "flos": 18762985748160.0, + "grad_norm": 2.758591755109987, + "language_loss": 0.59547198, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.61699963, + "num_input_tokens_seen": 232091090, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12359619, + "step": 10755, + "time_per_iteration": 2.644794464111328 + }, + { + "auxiliary_loss_clip": 0.0111752, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.04190969, + "balance_loss_mlp": 1.0202775, + "epoch": 0.6466857056966782, + "flos": 19831436081280.0, + "grad_norm": 2.077649341525584, + "language_loss": 0.67984033, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.70134002, + "num_input_tokens_seen": 232107320, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12176514, + "step": 10756, + "time_per_iteration": 2.5970356464385986 + }, + { + "auxiliary_loss_clip": 0.01124987, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.04425955, + "balance_loss_mlp": 1.0207088, + "epoch": 0.6467458289493462, + "flos": 25841803472640.0, + "grad_norm": 4.762032319995731, + "language_loss": 0.7455892, + "learning_rate": 1.172166263444844e-06, + "loss": 0.76716965, + "num_input_tokens_seen": 232123930, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.12347412, + "step": 10757, + "time_per_iteration": 2.6701531410217285 + }, + { + "auxiliary_loss_clip": 0.01119035, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.04446769, + "balance_loss_mlp": 1.02242327, + "epoch": 0.6468059522020141, + "flos": 21924705987360.0, + "grad_norm": 1.6421214426815163, + "language_loss": 0.74270725, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.76424068, + "num_input_tokens_seen": 232142905, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11889648, + "step": 10758, + "time_per_iteration": 4.0740883350372314 + }, + { + "auxiliary_loss_clip": 0.01120187, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.04294598, + "balance_loss_mlp": 1.01808667, + "epoch": 0.6468660754546821, + "flos": 21829827218400.0, + "grad_norm": 2.954288173476964, + "language_loss": 0.67684603, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.69836122, + "num_input_tokens_seen": 232162230, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.13256836, + "step": 10759, + "time_per_iteration": 2.694554090499878 + }, + { + "auxiliary_loss_clip": 0.01123767, + "auxiliary_loss_mlp": 0.01034787, + "balance_loss_clip": 1.04421329, + "balance_loss_mlp": 1.02201331, + "epoch": 0.64692619870735, + "flos": 27577964358720.0, + "grad_norm": 1.7790524451011154, + "language_loss": 0.7565766, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.77816212, + "num_input_tokens_seen": 232182700, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.12774658, + "step": 10760, + "time_per_iteration": 2.687027931213379 + }, + { + "auxiliary_loss_clip": 0.01116337, + "auxiliary_loss_mlp": 0.0102919, + "balance_loss_clip": 1.0408988, + "balance_loss_mlp": 1.01738226, + "epoch": 0.646986321960018, + "flos": 60525158541120.0, + "grad_norm": 1.9099486818033558, + "language_loss": 0.65355438, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.67500967, + "num_input_tokens_seen": 232208235, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11816406, + "step": 10761, + "time_per_iteration": 4.188171625137329 + }, + { + "auxiliary_loss_clip": 0.011205, + "auxiliary_loss_mlp": 0.0103188, + "balance_loss_clip": 1.04383898, + "balance_loss_mlp": 1.01930928, + "epoch": 0.6470464452126861, + "flos": 26732813520960.0, + "grad_norm": 2.9830335827145515, + "language_loss": 0.69797611, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.71949995, + "num_input_tokens_seen": 232228720, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12579346, + "step": 10762, + "time_per_iteration": 2.7514586448669434 + }, + { + "auxiliary_loss_clip": 0.0112257, + "auxiliary_loss_mlp": 0.01036461, + "balance_loss_clip": 1.04367757, + "balance_loss_mlp": 1.0236218, + "epoch": 0.647106568465354, + "flos": 22093637643360.0, + "grad_norm": 3.215405007757571, + "language_loss": 0.82654405, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.8481344, + "num_input_tokens_seen": 232244655, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12835693, + "step": 10763, + "time_per_iteration": 2.6420645713806152 + }, + { + "auxiliary_loss_clip": 0.01036325, + "auxiliary_loss_mlp": 0.01003859, + "balance_loss_clip": 1.01264346, + "balance_loss_mlp": 1.00257719, + "epoch": 0.647166691718022, + "flos": 84780493426080.0, + "grad_norm": 0.8380355895122396, + "language_loss": 0.57748282, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.59788465, + "num_input_tokens_seen": 232308685, + "router_z_loss_clip": 0.23693848, + "router_z_loss_mlp": 0.01281738, + "step": 10764, + "time_per_iteration": 3.439426898956299 + }, + { + "auxiliary_loss_clip": 0.01117332, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.04196143, + "balance_loss_mlp": 1.02049017, + "epoch": 0.6472268149706899, + "flos": 41603937698880.0, + "grad_norm": 2.6139703968456085, + "language_loss": 0.60687888, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.62838304, + "num_input_tokens_seen": 232327520, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12585449, + "step": 10765, + "time_per_iteration": 2.9425106048583984 + }, + { + "auxiliary_loss_clip": 0.01118135, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.04328418, + "balance_loss_mlp": 1.01771045, + "epoch": 0.6472869382233579, + "flos": 34301901513600.0, + "grad_norm": 2.105325114728327, + "language_loss": 0.63090038, + "learning_rate": 1.168976742243437e-06, + "loss": 0.65237719, + "num_input_tokens_seen": 232349025, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.1182251, + "step": 10766, + "time_per_iteration": 2.7103326320648193 + }, + { + "auxiliary_loss_clip": 0.01117651, + "auxiliary_loss_mlp": 0.01029809, + "balance_loss_clip": 1.04205346, + "balance_loss_mlp": 1.01766181, + "epoch": 0.6473470614760258, + "flos": 27445086731520.0, + "grad_norm": 1.7519255925864852, + "language_loss": 0.75855827, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.78003287, + "num_input_tokens_seen": 232367835, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.121521, + "step": 10767, + "time_per_iteration": 2.7074780464172363 + }, + { + "auxiliary_loss_clip": 0.01117004, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.04171419, + "balance_loss_mlp": 1.02202487, + "epoch": 0.6474071847286939, + "flos": 17739908418240.0, + "grad_norm": 2.2246656159529565, + "language_loss": 0.77800018, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.79951346, + "num_input_tokens_seen": 232385840, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12310791, + "step": 10768, + "time_per_iteration": 2.6029365062713623 + }, + { + "auxiliary_loss_clip": 0.01118296, + "auxiliary_loss_mlp": 0.0102841, + "balance_loss_clip": 1.04247963, + "balance_loss_mlp": 1.01635242, + "epoch": 0.6474673079813618, + "flos": 29404871251200.0, + "grad_norm": 1.7217350302121774, + "language_loss": 0.7180692, + "learning_rate": 1.167914135250663e-06, + "loss": 0.73953629, + "num_input_tokens_seen": 232406205, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12054443, + "step": 10769, + "time_per_iteration": 2.725680112838745 + }, + { + "auxiliary_loss_clip": 0.01116213, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.04278958, + "balance_loss_mlp": 1.02532005, + "epoch": 0.6475274312340298, + "flos": 18273525825600.0, + "grad_norm": 1.895568365428386, + "language_loss": 0.71718884, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.73872501, + "num_input_tokens_seen": 232424995, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12097168, + "step": 10770, + "time_per_iteration": 2.5935535430908203 + }, + { + "auxiliary_loss_clip": 0.01122522, + "auxiliary_loss_mlp": 0.01031984, + "balance_loss_clip": 1.04215431, + "balance_loss_mlp": 1.01834035, + "epoch": 0.6475875544866977, + "flos": 30560745173760.0, + "grad_norm": 2.0484616009451466, + "language_loss": 0.73313308, + "learning_rate": 1.167205888330325e-06, + "loss": 0.75467819, + "num_input_tokens_seen": 232445870, + "router_z_loss_clip": 0.80273438, + "router_z_loss_mlp": 0.13653564, + "step": 10771, + "time_per_iteration": 2.7752232551574707 + }, + { + "auxiliary_loss_clip": 0.01119254, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.04416478, + "balance_loss_mlp": 1.02342522, + "epoch": 0.6476476777393657, + "flos": 20099014613280.0, + "grad_norm": 2.5167407356213274, + "language_loss": 0.74241394, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.76396495, + "num_input_tokens_seen": 232464285, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12432861, + "step": 10772, + "time_per_iteration": 2.612769603729248 + }, + { + "auxiliary_loss_clip": 0.01115716, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.04143524, + "balance_loss_mlp": 1.02001011, + "epoch": 0.6477078009920336, + "flos": 31496560983360.0, + "grad_norm": 1.6301841983733898, + "language_loss": 0.83294988, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.85441756, + "num_input_tokens_seen": 232485815, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11035156, + "step": 10773, + "time_per_iteration": 2.7314651012420654 + }, + { + "auxiliary_loss_clip": 0.01114249, + "auxiliary_loss_mlp": 0.01028506, + "balance_loss_clip": 1.04052949, + "balance_loss_mlp": 1.01671076, + "epoch": 0.6477679242447016, + "flos": 21568204726560.0, + "grad_norm": 1.5598060049468947, + "language_loss": 0.78386879, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.8052963, + "num_input_tokens_seen": 232504875, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11791992, + "step": 10774, + "time_per_iteration": 2.6458702087402344 + }, + { + "auxiliary_loss_clip": 0.01119357, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.04158139, + "balance_loss_mlp": 1.02534747, + "epoch": 0.6478280474973696, + "flos": 25663714911360.0, + "grad_norm": 4.305410152559784, + "language_loss": 0.69106019, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.71263069, + "num_input_tokens_seen": 232521945, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12347412, + "step": 10775, + "time_per_iteration": 2.661445379257202 + }, + { + "auxiliary_loss_clip": 0.01123567, + "auxiliary_loss_mlp": 0.01034003, + "balance_loss_clip": 1.04358292, + "balance_loss_mlp": 1.02183819, + "epoch": 0.6478881707500376, + "flos": 26377041571200.0, + "grad_norm": 1.9996831767689753, + "language_loss": 0.66050768, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.68208337, + "num_input_tokens_seen": 232541500, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12158203, + "step": 10776, + "time_per_iteration": 2.6721818447113037 + }, + { + "auxiliary_loss_clip": 0.01121311, + "auxiliary_loss_mlp": 0.01037478, + "balance_loss_clip": 1.0426271, + "balance_loss_mlp": 1.02453828, + "epoch": 0.6479482940027056, + "flos": 22503453294240.0, + "grad_norm": 2.52347488605493, + "language_loss": 0.78268874, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.80427665, + "num_input_tokens_seen": 232559720, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12945557, + "step": 10777, + "time_per_iteration": 2.6232011318206787 + }, + { + "auxiliary_loss_clip": 0.01121983, + "auxiliary_loss_mlp": 0.01034165, + "balance_loss_clip": 1.0454452, + "balance_loss_mlp": 1.02148151, + "epoch": 0.6480084172553735, + "flos": 27044711606880.0, + "grad_norm": 1.7741143453286496, + "language_loss": 0.74088955, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.76245105, + "num_input_tokens_seen": 232579370, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12695312, + "step": 10778, + "time_per_iteration": 2.7685697078704834 + }, + { + "auxiliary_loss_clip": 0.01116383, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.04145145, + "balance_loss_mlp": 1.02049446, + "epoch": 0.6480685405080415, + "flos": 29670991161120.0, + "grad_norm": 1.7616889952584163, + "language_loss": 0.77661335, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.79809999, + "num_input_tokens_seen": 232600495, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11785889, + "step": 10779, + "time_per_iteration": 2.6778547763824463 + }, + { + "auxiliary_loss_clip": 0.01036745, + "auxiliary_loss_mlp": 0.01002529, + "balance_loss_clip": 1.01307499, + "balance_loss_mlp": 1.00122309, + "epoch": 0.6481286637607094, + "flos": 73078828287840.0, + "grad_norm": 0.8574847602036497, + "language_loss": 0.59388697, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61427975, + "num_input_tokens_seen": 232663165, + "router_z_loss_clip": 0.23669434, + "router_z_loss_mlp": 0.01306915, + "step": 10780, + "time_per_iteration": 3.2869973182678223 + }, + { + "auxiliary_loss_clip": 0.01115877, + "auxiliary_loss_mlp": 0.0103082, + "balance_loss_clip": 1.04174566, + "balance_loss_mlp": 1.01926875, + "epoch": 0.6481887870133775, + "flos": 31096185858720.0, + "grad_norm": 2.8447998946033786, + "language_loss": 0.79475844, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.81622547, + "num_input_tokens_seen": 232683385, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11553955, + "step": 10781, + "time_per_iteration": 2.6854233741760254 + }, + { + "auxiliary_loss_clip": 0.01122437, + "auxiliary_loss_mlp": 0.01035296, + "balance_loss_clip": 1.04464078, + "balance_loss_mlp": 1.02140832, + "epoch": 0.6482489102660454, + "flos": 24319542072960.0, + "grad_norm": 2.8099727343118284, + "language_loss": 0.78977871, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.81135607, + "num_input_tokens_seen": 232699095, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.13891602, + "step": 10782, + "time_per_iteration": 2.6766397953033447 + }, + { + "auxiliary_loss_clip": 0.01121561, + "auxiliary_loss_mlp": 0.01034037, + "balance_loss_clip": 1.04383564, + "balance_loss_mlp": 1.02128756, + "epoch": 0.6483090335187134, + "flos": 32920904818080.0, + "grad_norm": 2.513202660743191, + "language_loss": 0.64467621, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.66623223, + "num_input_tokens_seen": 232717920, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12744141, + "step": 10783, + "time_per_iteration": 2.6959593296051025 + }, + { + "auxiliary_loss_clip": 0.01123897, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.0450983, + "balance_loss_mlp": 1.02033591, + "epoch": 0.6483691567713813, + "flos": 30600931965120.0, + "grad_norm": 2.0029447198277333, + "language_loss": 0.88818705, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.90976363, + "num_input_tokens_seen": 232737605, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13433838, + "step": 10784, + "time_per_iteration": 2.7038116455078125 + }, + { + "auxiliary_loss_clip": 0.01118774, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.04365504, + "balance_loss_mlp": 1.01736271, + "epoch": 0.6484292800240493, + "flos": 19652294070720.0, + "grad_norm": 3.3806330049732285, + "language_loss": 0.73274857, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.75423533, + "num_input_tokens_seen": 232755110, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12548828, + "step": 10785, + "time_per_iteration": 2.641063928604126 + }, + { + "auxiliary_loss_clip": 0.01116836, + "auxiliary_loss_mlp": 0.01034223, + "balance_loss_clip": 1.04246569, + "balance_loss_mlp": 1.02175963, + "epoch": 0.6484894032767172, + "flos": 35191736560800.0, + "grad_norm": 2.0098948038907363, + "language_loss": 0.69475538, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.71626592, + "num_input_tokens_seen": 232779040, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12463379, + "step": 10786, + "time_per_iteration": 2.7547082901000977 + }, + { + "auxiliary_loss_clip": 0.0111616, + "auxiliary_loss_mlp": 0.01031512, + "balance_loss_clip": 1.04006791, + "balance_loss_mlp": 1.01935267, + "epoch": 0.6485495265293852, + "flos": 36883820996640.0, + "grad_norm": 2.2110962447470763, + "language_loss": 0.71456361, + "learning_rate": 1.161544469455041e-06, + "loss": 0.73604029, + "num_input_tokens_seen": 232800515, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12164307, + "step": 10787, + "time_per_iteration": 2.7765963077545166 + }, + { + "auxiliary_loss_clip": 0.01122901, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.04369378, + "balance_loss_mlp": 1.0192126, + "epoch": 0.6486096497820532, + "flos": 24504437537280.0, + "grad_norm": 2.642401833511337, + "language_loss": 0.84255701, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.8641057, + "num_input_tokens_seen": 232818450, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.12756348, + "step": 10788, + "time_per_iteration": 4.165992736816406 + }, + { + "auxiliary_loss_clip": 0.01121213, + "auxiliary_loss_mlp": 0.01029646, + "balance_loss_clip": 1.04471302, + "balance_loss_mlp": 1.01726031, + "epoch": 0.6486697730347212, + "flos": 20900615725440.0, + "grad_norm": 1.9859583217463708, + "language_loss": 0.77605754, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.79756612, + "num_input_tokens_seen": 232834785, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12402344, + "step": 10789, + "time_per_iteration": 4.0334107875823975 + }, + { + "auxiliary_loss_clip": 0.01115931, + "auxiliary_loss_mlp": 0.01029311, + "balance_loss_clip": 1.04210556, + "balance_loss_mlp": 1.01771212, + "epoch": 0.6487298962873892, + "flos": 47481670566720.0, + "grad_norm": 1.6577767386719795, + "language_loss": 0.76376927, + "learning_rate": 1.160483857897479e-06, + "loss": 0.7852217, + "num_input_tokens_seen": 232856050, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11602783, + "step": 10790, + "time_per_iteration": 2.863938808441162 + }, + { + "auxiliary_loss_clip": 0.01118785, + "auxiliary_loss_mlp": 0.01036154, + "balance_loss_clip": 1.04437923, + "balance_loss_mlp": 1.02488875, + "epoch": 0.6487900195400571, + "flos": 14578107144480.0, + "grad_norm": 2.1498473576455432, + "language_loss": 0.60233486, + "learning_rate": 1.160130384362823e-06, + "loss": 0.6238842, + "num_input_tokens_seen": 232873945, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11279297, + "step": 10791, + "time_per_iteration": 2.6183440685272217 + }, + { + "auxiliary_loss_clip": 0.01116911, + "auxiliary_loss_mlp": 0.01030687, + "balance_loss_clip": 1.04080081, + "balance_loss_mlp": 1.01866508, + "epoch": 0.6488501427927251, + "flos": 27265012823520.0, + "grad_norm": 1.7845432084679385, + "language_loss": 0.86207503, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.88355106, + "num_input_tokens_seen": 232892160, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12023926, + "step": 10792, + "time_per_iteration": 2.7833526134490967 + }, + { + "auxiliary_loss_clip": 0.01123652, + "auxiliary_loss_mlp": 0.0103621, + "balance_loss_clip": 1.04470801, + "balance_loss_mlp": 1.02320409, + "epoch": 0.648910266045393, + "flos": 27132905024640.0, + "grad_norm": 2.6585402533167457, + "language_loss": 0.78273422, + "learning_rate": 1.159423532850735e-06, + "loss": 0.80433279, + "num_input_tokens_seen": 232911725, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.13006592, + "step": 10793, + "time_per_iteration": 2.6249523162841797 + }, + { + "auxiliary_loss_clip": 0.01124446, + "auxiliary_loss_mlp": 0.01027845, + "balance_loss_clip": 1.04656005, + "balance_loss_mlp": 1.01563787, + "epoch": 0.6489703892980611, + "flos": 30873007915200.0, + "grad_norm": 1.7769123167633334, + "language_loss": 0.74641067, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.76793355, + "num_input_tokens_seen": 232929085, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12219238, + "step": 10794, + "time_per_iteration": 2.7248148918151855 + }, + { + "auxiliary_loss_clip": 0.01118367, + "auxiliary_loss_mlp": 0.01034687, + "balance_loss_clip": 1.04189086, + "balance_loss_mlp": 1.02232552, + "epoch": 0.649030512550729, + "flos": 29982970281600.0, + "grad_norm": 2.0345759052972707, + "language_loss": 0.69930315, + "learning_rate": 1.158716808837621e-06, + "loss": 0.72083366, + "num_input_tokens_seen": 232949455, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12365723, + "step": 10795, + "time_per_iteration": 2.699338674545288 + }, + { + "auxiliary_loss_clip": 0.01122142, + "auxiliary_loss_mlp": 0.01035982, + "balance_loss_clip": 1.04466331, + "balance_loss_mlp": 1.02297592, + "epoch": 0.649090635803397, + "flos": 32024749075200.0, + "grad_norm": 2.072622223419372, + "language_loss": 0.53890443, + "learning_rate": 1.158363494676679e-06, + "loss": 0.5604856, + "num_input_tokens_seen": 232969445, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13000488, + "step": 10796, + "time_per_iteration": 2.6827712059020996 + }, + { + "auxiliary_loss_clip": 0.01117186, + "auxiliary_loss_mlp": 0.01029577, + "balance_loss_clip": 1.04176474, + "balance_loss_mlp": 1.0183835, + "epoch": 0.6491507590560649, + "flos": 30428880478560.0, + "grad_norm": 1.826240265092876, + "language_loss": 0.77657306, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.79804063, + "num_input_tokens_seen": 232988900, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11199951, + "step": 10797, + "time_per_iteration": 2.6786508560180664 + }, + { + "auxiliary_loss_clip": 0.01116872, + "auxiliary_loss_mlp": 0.01029102, + "balance_loss_clip": 1.04489708, + "balance_loss_mlp": 1.01831973, + "epoch": 0.6492108823087329, + "flos": 23793622948800.0, + "grad_norm": 2.9706095668547214, + "language_loss": 0.70959711, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.73105681, + "num_input_tokens_seen": 233005060, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10784912, + "step": 10798, + "time_per_iteration": 4.102623462677002 + }, + { + "auxiliary_loss_clip": 0.01116987, + "auxiliary_loss_mlp": 0.01029634, + "balance_loss_clip": 1.04174209, + "balance_loss_mlp": 1.01840496, + "epoch": 0.6492710055614008, + "flos": 24061444584480.0, + "grad_norm": 1.8844952549462286, + "language_loss": 0.76701319, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.78847933, + "num_input_tokens_seen": 233023375, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11224365, + "step": 10799, + "time_per_iteration": 2.6338725090026855 + }, + { + "auxiliary_loss_clip": 0.01118594, + "auxiliary_loss_mlp": 0.01034066, + "balance_loss_clip": 1.0396893, + "balance_loss_mlp": 1.02159631, + "epoch": 0.6493311288140688, + "flos": 29671882541280.0, + "grad_norm": 1.8942354516987874, + "language_loss": 0.7177524, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.73927897, + "num_input_tokens_seen": 233043130, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12463379, + "step": 10800, + "time_per_iteration": 4.012060165405273 + }, + { + "auxiliary_loss_clip": 0.0103753, + "auxiliary_loss_mlp": 0.01003513, + "balance_loss_clip": 1.01397109, + "balance_loss_mlp": 1.0022049, + "epoch": 0.6493912520667368, + "flos": 86555949723360.0, + "grad_norm": 0.7693288395892733, + "language_loss": 0.60254395, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62295437, + "num_input_tokens_seen": 233110560, + "router_z_loss_clip": 0.23583984, + "router_z_loss_mlp": 0.01308441, + "step": 10801, + "time_per_iteration": 3.358583688735962 + }, + { + "auxiliary_loss_clip": 0.01125152, + "auxiliary_loss_mlp": 0.01040296, + "balance_loss_clip": 1.04561317, + "balance_loss_mlp": 1.02766001, + "epoch": 0.6494513753194048, + "flos": 30917570572800.0, + "grad_norm": 1.9834126852364948, + "language_loss": 0.78492671, + "learning_rate": 1.156244280393614e-06, + "loss": 0.80658114, + "num_input_tokens_seen": 233130080, + "router_z_loss_clip": 0.79589844, + "router_z_loss_mlp": 0.12640381, + "step": 10802, + "time_per_iteration": 2.7615368366241455 + }, + { + "auxiliary_loss_clip": 0.0111767, + "auxiliary_loss_mlp": 0.01037092, + "balance_loss_clip": 1.04115784, + "balance_loss_mlp": 1.0241338, + "epoch": 0.6495114985720728, + "flos": 30117225496320.0, + "grad_norm": 6.250497120175174, + "language_loss": 0.75003898, + "learning_rate": 1.155891189918541e-06, + "loss": 0.7715866, + "num_input_tokens_seen": 233150235, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12969971, + "step": 10803, + "time_per_iteration": 2.7104790210723877 + }, + { + "auxiliary_loss_clip": 0.01119689, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.0429951, + "balance_loss_mlp": 1.02229476, + "epoch": 0.6495716218247407, + "flos": 28860192626400.0, + "grad_norm": 2.865523887056756, + "language_loss": 0.70186049, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.72340083, + "num_input_tokens_seen": 233166710, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.1204834, + "step": 10804, + "time_per_iteration": 2.680574655532837 + }, + { + "auxiliary_loss_clip": 0.01118542, + "auxiliary_loss_mlp": 0.01031115, + "balance_loss_clip": 1.04310584, + "balance_loss_mlp": 1.01850235, + "epoch": 0.6496317450774087, + "flos": 27266957652960.0, + "grad_norm": 2.466015112449997, + "language_loss": 0.72601664, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.74751318, + "num_input_tokens_seen": 233185445, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12615967, + "step": 10805, + "time_per_iteration": 2.672950267791748 + }, + { + "auxiliary_loss_clip": 0.01116796, + "auxiliary_loss_mlp": 0.01028456, + "balance_loss_clip": 1.04054427, + "balance_loss_mlp": 1.0172925, + "epoch": 0.6496918683300766, + "flos": 37239673980960.0, + "grad_norm": 2.6675870599628935, + "language_loss": 0.65882754, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.68028009, + "num_input_tokens_seen": 233205805, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11157227, + "step": 10806, + "time_per_iteration": 2.726806402206421 + }, + { + "auxiliary_loss_clip": 0.0112122, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.04240799, + "balance_loss_mlp": 1.01963592, + "epoch": 0.6497519915827447, + "flos": 15201822281760.0, + "grad_norm": 2.7004272715903124, + "language_loss": 0.78672844, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.80826551, + "num_input_tokens_seen": 233224215, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.12860107, + "step": 10807, + "time_per_iteration": 2.6506693363189697 + }, + { + "auxiliary_loss_clip": 0.0103675, + "auxiliary_loss_mlp": 0.01002631, + "balance_loss_clip": 1.01306129, + "balance_loss_mlp": 1.00136852, + "epoch": 0.6498121148354126, + "flos": 84307726622880.0, + "grad_norm": 0.7870822169665513, + "language_loss": 0.58884072, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.60923451, + "num_input_tokens_seen": 233294440, + "router_z_loss_clip": 0.23693848, + "router_z_loss_mlp": 0.01262665, + "step": 10808, + "time_per_iteration": 3.436079502105713 + }, + { + "auxiliary_loss_clip": 0.01118382, + "auxiliary_loss_mlp": 0.01024333, + "balance_loss_clip": 1.04516363, + "balance_loss_mlp": 1.01182222, + "epoch": 0.6498722380880806, + "flos": 45023795943840.0, + "grad_norm": 1.800028065878609, + "language_loss": 0.63092172, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.65234888, + "num_input_tokens_seen": 233316125, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12512207, + "step": 10809, + "time_per_iteration": 2.7874786853790283 + }, + { + "auxiliary_loss_clip": 0.01115877, + "auxiliary_loss_mlp": 0.01032388, + "balance_loss_clip": 1.0422976, + "balance_loss_mlp": 1.02032375, + "epoch": 0.6499323613407485, + "flos": 35408634325920.0, + "grad_norm": 1.6386287894384948, + "language_loss": 0.81448525, + "learning_rate": 1.153420453586008e-06, + "loss": 0.8359679, + "num_input_tokens_seen": 233336140, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.1204834, + "step": 10810, + "time_per_iteration": 2.8748550415039062 + }, + { + "auxiliary_loss_clip": 0.0111495, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.04109764, + "balance_loss_mlp": 1.02262616, + "epoch": 0.6499924845934165, + "flos": 24549607954080.0, + "grad_norm": 1.6994550840885703, + "language_loss": 0.71641284, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.73790073, + "num_input_tokens_seen": 233356095, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11199951, + "step": 10811, + "time_per_iteration": 2.648045539855957 + }, + { + "auxiliary_loss_clip": 0.0111669, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.04445481, + "balance_loss_mlp": 1.01659918, + "epoch": 0.6500526078460844, + "flos": 29799654991200.0, + "grad_norm": 1.588299817453223, + "language_loss": 0.77578032, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.79722542, + "num_input_tokens_seen": 233376830, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11218262, + "step": 10812, + "time_per_iteration": 2.7219796180725098 + }, + { + "auxiliary_loss_clip": 0.0111897, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.04260254, + "balance_loss_mlp": 1.02405941, + "epoch": 0.6501127310987524, + "flos": 28468771820640.0, + "grad_norm": 1.8224626125516048, + "language_loss": 0.85521889, + "learning_rate": 1.152362047854413e-06, + "loss": 0.87677413, + "num_input_tokens_seen": 233395275, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12493896, + "step": 10813, + "time_per_iteration": 2.6317052841186523 + }, + { + "auxiliary_loss_clip": 0.01117919, + "auxiliary_loss_mlp": 0.0103115, + "balance_loss_clip": 1.04286695, + "balance_loss_mlp": 1.01949084, + "epoch": 0.6501728543514204, + "flos": 22814946207360.0, + "grad_norm": 1.7813494421077563, + "language_loss": 0.79703987, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.81853056, + "num_input_tokens_seen": 233413345, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11657715, + "step": 10814, + "time_per_iteration": 2.630075693130493 + }, + { + "auxiliary_loss_clip": 0.01120253, + "auxiliary_loss_mlp": 0.01035488, + "balance_loss_clip": 1.04342675, + "balance_loss_mlp": 1.02325714, + "epoch": 0.6502329776040884, + "flos": 53934625738080.0, + "grad_norm": 1.8079428182240531, + "language_loss": 0.65557235, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.67712975, + "num_input_tokens_seen": 233436105, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12225342, + "step": 10815, + "time_per_iteration": 2.8137364387512207 + }, + { + "auxiliary_loss_clip": 0.01122359, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.04348373, + "balance_loss_mlp": 1.01964676, + "epoch": 0.6502931008567564, + "flos": 17783660730240.0, + "grad_norm": 2.312873649933893, + "language_loss": 0.75261652, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.77417326, + "num_input_tokens_seen": 233452320, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.13684082, + "step": 10816, + "time_per_iteration": 2.638749122619629 + }, + { + "auxiliary_loss_clip": 0.01119302, + "auxiliary_loss_mlp": 0.01029382, + "balance_loss_clip": 1.04498029, + "balance_loss_mlp": 1.01744914, + "epoch": 0.6503532241094243, + "flos": 26106505277760.0, + "grad_norm": 2.0925613685053253, + "language_loss": 0.73135561, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.75284243, + "num_input_tokens_seen": 233469920, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11938477, + "step": 10817, + "time_per_iteration": 2.633671760559082 + }, + { + "auxiliary_loss_clip": 0.01118037, + "auxiliary_loss_mlp": 0.01033125, + "balance_loss_clip": 1.04159832, + "balance_loss_mlp": 1.02088833, + "epoch": 0.6504133473620923, + "flos": 91202858051040.0, + "grad_norm": 1.6575296490610532, + "language_loss": 0.71778727, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.73929894, + "num_input_tokens_seen": 233499780, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12219238, + "step": 10818, + "time_per_iteration": 3.041456460952759 + }, + { + "auxiliary_loss_clip": 0.01121081, + "auxiliary_loss_mlp": 0.01029432, + "balance_loss_clip": 1.04399085, + "balance_loss_mlp": 1.01718354, + "epoch": 0.6504734706147602, + "flos": 24054637681440.0, + "grad_norm": 2.232475876576847, + "language_loss": 0.64606202, + "learning_rate": 1.150246104600249e-06, + "loss": 0.66756713, + "num_input_tokens_seen": 233518235, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12243652, + "step": 10819, + "time_per_iteration": 2.6508517265319824 + }, + { + "auxiliary_loss_clip": 0.01120912, + "auxiliary_loss_mlp": 0.01031077, + "balance_loss_clip": 1.04439008, + "balance_loss_mlp": 1.01891148, + "epoch": 0.6505335938674283, + "flos": 31184906001120.0, + "grad_norm": 1.970988819201739, + "language_loss": 0.83822882, + "learning_rate": 1.14989356009286e-06, + "loss": 0.85974872, + "num_input_tokens_seen": 233535215, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12145996, + "step": 10820, + "time_per_iteration": 2.6859493255615234 + }, + { + "auxiliary_loss_clip": 0.01121245, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.04363394, + "balance_loss_mlp": 1.01828027, + "epoch": 0.6505937171200962, + "flos": 21745563976800.0, + "grad_norm": 2.30301327411564, + "language_loss": 0.77762318, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.79913938, + "num_input_tokens_seen": 233552775, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12109375, + "step": 10821, + "time_per_iteration": 2.5896949768066406 + }, + { + "auxiliary_loss_clip": 0.01115653, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.04288423, + "balance_loss_mlp": 1.01854491, + "epoch": 0.6506538403727642, + "flos": 25219263336480.0, + "grad_norm": 1.6374018783195148, + "language_loss": 0.80284023, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82429194, + "num_input_tokens_seen": 233572080, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.10980225, + "step": 10822, + "time_per_iteration": 2.7000834941864014 + }, + { + "auxiliary_loss_clip": 0.01118373, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.04308593, + "balance_loss_mlp": 1.01685071, + "epoch": 0.6507139636254321, + "flos": 14302506191040.0, + "grad_norm": 1.8353276141528774, + "language_loss": 0.87064338, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.89211309, + "num_input_tokens_seen": 233589155, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11743164, + "step": 10823, + "time_per_iteration": 2.6684014797210693 + }, + { + "auxiliary_loss_clip": 0.0111811, + "auxiliary_loss_mlp": 0.01031735, + "balance_loss_clip": 1.04172814, + "balance_loss_mlp": 1.01945627, + "epoch": 0.6507740868781001, + "flos": 32655109046400.0, + "grad_norm": 2.5728989591764666, + "language_loss": 0.66499346, + "learning_rate": 1.148483704558183e-06, + "loss": 0.68649191, + "num_input_tokens_seen": 233608180, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12280273, + "step": 10824, + "time_per_iteration": 2.7174196243286133 + }, + { + "auxiliary_loss_clip": 0.01119891, + "auxiliary_loss_mlp": 0.01031958, + "balance_loss_clip": 1.0423162, + "balance_loss_mlp": 1.01978052, + "epoch": 0.650834210130768, + "flos": 20098528405920.0, + "grad_norm": 2.5619109391258728, + "language_loss": 0.87159574, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.89311427, + "num_input_tokens_seen": 233625750, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12176514, + "step": 10825, + "time_per_iteration": 2.680433750152588 + }, + { + "auxiliary_loss_clip": 0.0112048, + "auxiliary_loss_mlp": 0.01028593, + "balance_loss_clip": 1.04139066, + "balance_loss_mlp": 1.01548576, + "epoch": 0.650894333383436, + "flos": 20900170035360.0, + "grad_norm": 2.3870612206449873, + "language_loss": 0.73523635, + "learning_rate": 1.147778970474885e-06, + "loss": 0.7567271, + "num_input_tokens_seen": 233644235, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13110352, + "step": 10826, + "time_per_iteration": 2.599599599838257 + }, + { + "auxiliary_loss_clip": 0.01119761, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.04458106, + "balance_loss_mlp": 1.02203286, + "epoch": 0.650954456636104, + "flos": 22859387313120.0, + "grad_norm": 1.9185347501933885, + "language_loss": 0.68989682, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.71142888, + "num_input_tokens_seen": 233662845, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11401367, + "step": 10827, + "time_per_iteration": 2.685746908187866 + }, + { + "auxiliary_loss_clip": 0.01118468, + "auxiliary_loss_mlp": 0.01029881, + "balance_loss_clip": 1.04233503, + "balance_loss_mlp": 1.01834762, + "epoch": 0.651014579888772, + "flos": 29930344685280.0, + "grad_norm": 2.1392306809370583, + "language_loss": 0.76968479, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.79116833, + "num_input_tokens_seen": 233681990, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11541748, + "step": 10828, + "time_per_iteration": 5.608430862426758 + }, + { + "auxiliary_loss_clip": 0.0111819, + "auxiliary_loss_mlp": 0.01026639, + "balance_loss_clip": 1.04387677, + "balance_loss_mlp": 1.01550472, + "epoch": 0.65107470314144, + "flos": 29359619799840.0, + "grad_norm": 2.083281351118252, + "language_loss": 0.89509577, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.91654408, + "num_input_tokens_seen": 233698930, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11132812, + "step": 10829, + "time_per_iteration": 2.6849021911621094 + }, + { + "auxiliary_loss_clip": 0.01037509, + "auxiliary_loss_mlp": 0.01000369, + "balance_loss_clip": 1.01392126, + "balance_loss_mlp": 0.99923283, + "epoch": 0.6511348263941079, + "flos": 88441357217760.0, + "grad_norm": 0.6449768283725459, + "language_loss": 0.55374444, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57412314, + "num_input_tokens_seen": 233769825, + "router_z_loss_clip": 0.23608398, + "router_z_loss_mlp": 0.0113678, + "step": 10830, + "time_per_iteration": 3.423495054244995 + }, + { + "auxiliary_loss_clip": 0.01122354, + "auxiliary_loss_mlp": 0.01034429, + "balance_loss_clip": 1.04412103, + "balance_loss_mlp": 1.02141726, + "epoch": 0.6511949496467759, + "flos": 28513496547360.0, + "grad_norm": 1.9879932394883244, + "language_loss": 0.74898887, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.77055669, + "num_input_tokens_seen": 233787095, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.13006592, + "step": 10831, + "time_per_iteration": 2.817826986312866 + }, + { + "auxiliary_loss_clip": 0.01037963, + "auxiliary_loss_mlp": 0.01001259, + "balance_loss_clip": 1.0144074, + "balance_loss_mlp": 1.00012779, + "epoch": 0.6512550728994438, + "flos": 82160210429280.0, + "grad_norm": 0.6415882547254353, + "language_loss": 0.50999683, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53038907, + "num_input_tokens_seen": 233853050, + "router_z_loss_clip": 0.23571777, + "router_z_loss_mlp": 0.01131439, + "step": 10832, + "time_per_iteration": 3.3187427520751953 + }, + { + "auxiliary_loss_clip": 0.01120345, + "auxiliary_loss_mlp": 0.01035084, + "balance_loss_clip": 1.04133415, + "balance_loss_mlp": 1.02240562, + "epoch": 0.6513151961521119, + "flos": 25797200297760.0, + "grad_norm": 2.8104651938509493, + "language_loss": 0.83482289, + "learning_rate": 1.145313419848316e-06, + "loss": 0.85637712, + "num_input_tokens_seen": 233871385, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.12683105, + "step": 10833, + "time_per_iteration": 2.683347702026367 + }, + { + "auxiliary_loss_clip": 0.01120465, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.0450002, + "balance_loss_mlp": 1.01972604, + "epoch": 0.6513753194047798, + "flos": 18496379630880.0, + "grad_norm": 2.6519355610864856, + "language_loss": 0.83548057, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.85700548, + "num_input_tokens_seen": 233888175, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12310791, + "step": 10834, + "time_per_iteration": 2.595048189163208 + }, + { + "auxiliary_loss_clip": 0.01120724, + "auxiliary_loss_mlp": 0.01037178, + "balance_loss_clip": 1.04381323, + "balance_loss_mlp": 1.02548325, + "epoch": 0.6514354426574478, + "flos": 36883334789280.0, + "grad_norm": 3.251263785335873, + "language_loss": 0.77428067, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.79585975, + "num_input_tokens_seen": 233911470, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.11694336, + "step": 10835, + "time_per_iteration": 2.792365312576294 + }, + { + "auxiliary_loss_clip": 0.01121707, + "auxiliary_loss_mlp": 0.01035186, + "balance_loss_clip": 1.04634535, + "balance_loss_mlp": 1.02324152, + "epoch": 0.6514955659101157, + "flos": 29536087669920.0, + "grad_norm": 1.4873923058800091, + "language_loss": 0.77380961, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.79537857, + "num_input_tokens_seen": 233932135, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.1194458, + "step": 10836, + "time_per_iteration": 2.6958630084991455 + }, + { + "auxiliary_loss_clip": 0.01119104, + "auxiliary_loss_mlp": 0.01034658, + "balance_loss_clip": 1.04301167, + "balance_loss_mlp": 1.02316606, + "epoch": 0.6515556891627837, + "flos": 15103459026720.0, + "grad_norm": 3.4886569719942986, + "language_loss": 0.82495844, + "learning_rate": 1.143905246497783e-06, + "loss": 0.84649605, + "num_input_tokens_seen": 233947880, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.1149292, + "step": 10837, + "time_per_iteration": 4.116978645324707 + }, + { + "auxiliary_loss_clip": 0.01117375, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.04336536, + "balance_loss_mlp": 1.01867914, + "epoch": 0.6516158124154516, + "flos": 60525644748480.0, + "grad_norm": 2.4416864545381096, + "language_loss": 0.58215499, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.60364956, + "num_input_tokens_seen": 233971475, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.1340332, + "step": 10838, + "time_per_iteration": 2.984656572341919 + }, + { + "auxiliary_loss_clip": 0.01038009, + "auxiliary_loss_mlp": 0.0100138, + "balance_loss_clip": 1.01445103, + "balance_loss_mlp": 1.00019956, + "epoch": 0.6516759356681197, + "flos": 72849410683200.0, + "grad_norm": 0.7261972763675723, + "language_loss": 0.60789561, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.62828952, + "num_input_tokens_seen": 234030690, + "router_z_loss_clip": 0.23583984, + "router_z_loss_mlp": 0.01178741, + "step": 10839, + "time_per_iteration": 3.281259536743164 + }, + { + "auxiliary_loss_clip": 0.01118324, + "auxiliary_loss_mlp": 0.01030143, + "balance_loss_clip": 1.04366183, + "balance_loss_mlp": 1.01874089, + "epoch": 0.6517360589207876, + "flos": 45698840124480.0, + "grad_norm": 1.9366001418655148, + "language_loss": 0.68155241, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.70303702, + "num_input_tokens_seen": 234052470, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11401367, + "step": 10840, + "time_per_iteration": 4.17349648475647 + }, + { + "auxiliary_loss_clip": 0.01116745, + "auxiliary_loss_mlp": 0.01033124, + "balance_loss_clip": 1.04113221, + "balance_loss_mlp": 1.02221036, + "epoch": 0.6517961821734556, + "flos": 30962822024160.0, + "grad_norm": 2.1961947717600157, + "language_loss": 0.73459721, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.75609589, + "num_input_tokens_seen": 234071495, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.10913086, + "step": 10841, + "time_per_iteration": 2.7183151245117188 + }, + { + "auxiliary_loss_clip": 0.01118452, + "auxiliary_loss_mlp": 0.01039291, + "balance_loss_clip": 1.0411284, + "balance_loss_mlp": 1.02729249, + "epoch": 0.6518563054261236, + "flos": 35102206072800.0, + "grad_norm": 1.5181005261931775, + "language_loss": 0.62573701, + "learning_rate": 1.142145760331648e-06, + "loss": 0.64731443, + "num_input_tokens_seen": 234092325, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.11993408, + "step": 10842, + "time_per_iteration": 2.6933445930480957 + }, + { + "auxiliary_loss_clip": 0.01037787, + "auxiliary_loss_mlp": 0.01003063, + "balance_loss_clip": 1.01414776, + "balance_loss_mlp": 1.00188327, + "epoch": 0.6519164286787915, + "flos": 84103625967840.0, + "grad_norm": 0.8187023913897465, + "language_loss": 0.56110072, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58150923, + "num_input_tokens_seen": 234148005, + "router_z_loss_clip": 0.23657227, + "router_z_loss_mlp": 0.01178741, + "step": 10843, + "time_per_iteration": 3.0610599517822266 + }, + { + "auxiliary_loss_clip": 0.01120607, + "auxiliary_loss_mlp": 0.0103729, + "balance_loss_clip": 1.04138494, + "balance_loss_mlp": 1.02456462, + "epoch": 0.6519765519314595, + "flos": 24943905486720.0, + "grad_norm": 4.842092569344506, + "language_loss": 0.82629085, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.84786975, + "num_input_tokens_seen": 234164280, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.1272583, + "step": 10844, + "time_per_iteration": 2.6420655250549316 + }, + { + "auxiliary_loss_clip": 0.01120431, + "auxiliary_loss_mlp": 0.01032025, + "balance_loss_clip": 1.04392624, + "balance_loss_mlp": 1.0192697, + "epoch": 0.6520366751841274, + "flos": 34657916567040.0, + "grad_norm": 2.1225921496834284, + "language_loss": 0.59706998, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.61859459, + "num_input_tokens_seen": 234185090, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12738037, + "step": 10845, + "time_per_iteration": 2.7044787406921387 + }, + { + "auxiliary_loss_clip": 0.01118254, + "auxiliary_loss_mlp": 0.01030761, + "balance_loss_clip": 1.04237962, + "balance_loss_mlp": 1.01899517, + "epoch": 0.6520967984367955, + "flos": 27178156476000.0, + "grad_norm": 2.2732341415273165, + "language_loss": 0.79574126, + "learning_rate": 1.140738756857194e-06, + "loss": 0.81723142, + "num_input_tokens_seen": 234204050, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11773682, + "step": 10846, + "time_per_iteration": 2.8028578758239746 + }, + { + "auxiliary_loss_clip": 0.01037426, + "auxiliary_loss_mlp": 0.01002791, + "balance_loss_clip": 1.0137496, + "balance_loss_mlp": 1.00161874, + "epoch": 0.6521569216894634, + "flos": 84091679720640.0, + "grad_norm": 0.7066867074890204, + "language_loss": 0.60207736, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.62247956, + "num_input_tokens_seen": 234269790, + "router_z_loss_clip": 0.23693848, + "router_z_loss_mlp": 0.01171875, + "step": 10847, + "time_per_iteration": 3.3768296241760254 + }, + { + "auxiliary_loss_clip": 0.01123078, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.04458976, + "balance_loss_mlp": 1.02536142, + "epoch": 0.6522170449421314, + "flos": 35545928336640.0, + "grad_norm": 1.7373901698838072, + "language_loss": 0.80797797, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.8295846, + "num_input_tokens_seen": 234290135, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12219238, + "step": 10848, + "time_per_iteration": 2.7806954383850098 + }, + { + "auxiliary_loss_clip": 0.01117404, + "auxiliary_loss_mlp": 0.01036079, + "balance_loss_clip": 1.04225194, + "balance_loss_mlp": 1.024122, + "epoch": 0.6522771681947993, + "flos": 32521137452640.0, + "grad_norm": 2.9904036390100432, + "language_loss": 0.74758965, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.76912439, + "num_input_tokens_seen": 234309535, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11962891, + "step": 10849, + "time_per_iteration": 2.7038064002990723 + }, + { + "auxiliary_loss_clip": 0.01115391, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.04221725, + "balance_loss_mlp": 1.02397466, + "epoch": 0.6523372914474673, + "flos": 31408043427360.0, + "grad_norm": 1.5639330946039645, + "language_loss": 0.68010271, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.70161331, + "num_input_tokens_seen": 234328755, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11694336, + "step": 10850, + "time_per_iteration": 2.7036283016204834 + }, + { + "auxiliary_loss_clip": 0.01116169, + "auxiliary_loss_mlp": 0.01028288, + "balance_loss_clip": 1.04303861, + "balance_loss_mlp": 1.0163554, + "epoch": 0.6523974147001352, + "flos": 30294665781120.0, + "grad_norm": 1.7064746075395363, + "language_loss": 0.66972488, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.69116944, + "num_input_tokens_seen": 234348655, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11932373, + "step": 10851, + "time_per_iteration": 2.7571685314178467 + }, + { + "auxiliary_loss_clip": 0.01120474, + "auxiliary_loss_mlp": 0.01032411, + "balance_loss_clip": 1.04265976, + "balance_loss_mlp": 1.02078271, + "epoch": 0.6524575379528033, + "flos": 32113752838560.0, + "grad_norm": 2.1290573240328183, + "language_loss": 0.73882008, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.76034892, + "num_input_tokens_seen": 234367445, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.11633301, + "step": 10852, + "time_per_iteration": 2.6978759765625 + }, + { + "auxiliary_loss_clip": 0.01121421, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.04350078, + "balance_loss_mlp": 1.01884544, + "epoch": 0.6525176612054712, + "flos": 23787261735840.0, + "grad_norm": 1.8781063534121247, + "language_loss": 0.66799784, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.68952787, + "num_input_tokens_seen": 234384825, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12719727, + "step": 10853, + "time_per_iteration": 2.688202142715454 + }, + { + "auxiliary_loss_clip": 0.0103718, + "auxiliary_loss_mlp": 0.01001046, + "balance_loss_clip": 1.01355124, + "balance_loss_mlp": 0.99987441, + "epoch": 0.6525777844581392, + "flos": 87497762090400.0, + "grad_norm": 0.7201741527103782, + "language_loss": 0.63005453, + "learning_rate": 1.137926314758634e-06, + "loss": 0.65043688, + "num_input_tokens_seen": 234450630, + "router_z_loss_clip": 0.2364502, + "router_z_loss_mlp": 0.01170349, + "step": 10854, + "time_per_iteration": 3.368222951889038 + }, + { + "auxiliary_loss_clip": 0.01118973, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.04226255, + "balance_loss_mlp": 1.01996231, + "epoch": 0.6526379077108072, + "flos": 32522474522880.0, + "grad_norm": 1.7791220621557473, + "language_loss": 0.77679294, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.79831314, + "num_input_tokens_seen": 234473505, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.13079834, + "step": 10855, + "time_per_iteration": 2.810715913772583 + }, + { + "auxiliary_loss_clip": 0.01114186, + "auxiliary_loss_mlp": 0.01027716, + "balance_loss_clip": 1.04068732, + "balance_loss_mlp": 1.01602793, + "epoch": 0.6526980309634751, + "flos": 27845623925280.0, + "grad_norm": 2.0988942037840457, + "language_loss": 0.79332161, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.81474066, + "num_input_tokens_seen": 234492485, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11688232, + "step": 10856, + "time_per_iteration": 2.6566977500915527 + }, + { + "auxiliary_loss_clip": 0.01117995, + "auxiliary_loss_mlp": 0.01029958, + "balance_loss_clip": 1.04193664, + "balance_loss_mlp": 1.01652312, + "epoch": 0.6527581542161431, + "flos": 34611773735520.0, + "grad_norm": 2.7027582708406652, + "language_loss": 0.73615885, + "learning_rate": 1.136872187988815e-06, + "loss": 0.75763834, + "num_input_tokens_seen": 234512645, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.13452148, + "step": 10857, + "time_per_iteration": 2.71317458152771 + }, + { + "auxiliary_loss_clip": 0.01117919, + "auxiliary_loss_mlp": 0.01029722, + "balance_loss_clip": 1.04207134, + "balance_loss_mlp": 1.01877904, + "epoch": 0.652818277468811, + "flos": 22413963323520.0, + "grad_norm": 2.6775564853662597, + "language_loss": 0.62718213, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.64865851, + "num_input_tokens_seen": 234529310, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.10955811, + "step": 10858, + "time_per_iteration": 2.592968225479126 + }, + { + "auxiliary_loss_clip": 0.01113174, + "auxiliary_loss_mlp": 0.01031648, + "balance_loss_clip": 1.04020214, + "balance_loss_mlp": 1.02026343, + "epoch": 0.6528784007214791, + "flos": 22007996814240.0, + "grad_norm": 1.6462748543611525, + "language_loss": 0.78172183, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.80317008, + "num_input_tokens_seen": 234546685, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11395264, + "step": 10859, + "time_per_iteration": 2.6277737617492676 + }, + { + "auxiliary_loss_clip": 0.01120713, + "auxiliary_loss_mlp": 0.01029051, + "balance_loss_clip": 1.04287493, + "balance_loss_mlp": 1.01689792, + "epoch": 0.652938523974147, + "flos": 27310750482240.0, + "grad_norm": 1.4977741582121045, + "language_loss": 0.67431796, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.69581568, + "num_input_tokens_seen": 234566255, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12145996, + "step": 10860, + "time_per_iteration": 2.6468443870544434 + }, + { + "auxiliary_loss_clip": 0.01121831, + "auxiliary_loss_mlp": 0.01027155, + "balance_loss_clip": 1.04326916, + "balance_loss_mlp": 1.01540744, + "epoch": 0.652998647226815, + "flos": 20455191735840.0, + "grad_norm": 1.7709350995553923, + "language_loss": 0.66218203, + "learning_rate": 1.135467143909712e-06, + "loss": 0.68367195, + "num_input_tokens_seen": 234585405, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.11755371, + "step": 10861, + "time_per_iteration": 2.6663589477539062 + }, + { + "auxiliary_loss_clip": 0.01120398, + "auxiliary_loss_mlp": 0.01034871, + "balance_loss_clip": 1.04306471, + "balance_loss_mlp": 1.02181125, + "epoch": 0.6530587704794829, + "flos": 43650862187040.0, + "grad_norm": 1.7868078171703243, + "language_loss": 0.64486802, + "learning_rate": 1.135115964814572e-06, + "loss": 0.6664207, + "num_input_tokens_seen": 234608095, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.13043213, + "step": 10862, + "time_per_iteration": 2.7851078510284424 + }, + { + "auxiliary_loss_clip": 0.01118327, + "auxiliary_loss_mlp": 0.01034437, + "balance_loss_clip": 1.04355085, + "balance_loss_mlp": 1.02252245, + "epoch": 0.6531188937321509, + "flos": 23569634659680.0, + "grad_norm": 1.6643573904309603, + "language_loss": 0.76865101, + "learning_rate": 1.13476481851592e-06, + "loss": 0.79017866, + "num_input_tokens_seen": 234627335, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11920166, + "step": 10863, + "time_per_iteration": 2.6130917072296143 + }, + { + "auxiliary_loss_clip": 0.01115902, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.04152393, + "balance_loss_mlp": 1.02002466, + "epoch": 0.6531790169848188, + "flos": 27935154413280.0, + "grad_norm": 1.8731160605544417, + "language_loss": 0.74532843, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.76680064, + "num_input_tokens_seen": 234646540, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11291504, + "step": 10864, + "time_per_iteration": 2.682276487350464 + }, + { + "auxiliary_loss_clip": 0.01115643, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.04159629, + "balance_loss_mlp": 1.02218246, + "epoch": 0.6532391402374869, + "flos": 36074764704960.0, + "grad_norm": 1.8082860995009382, + "language_loss": 0.86137784, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.88286906, + "num_input_tokens_seen": 234665470, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11291504, + "step": 10865, + "time_per_iteration": 2.727509021759033 + }, + { + "auxiliary_loss_clip": 0.01122192, + "auxiliary_loss_mlp": 0.01041227, + "balance_loss_clip": 1.04370606, + "balance_loss_mlp": 1.02919304, + "epoch": 0.6532992634901548, + "flos": 28194143281920.0, + "grad_norm": 1.6715416501759865, + "language_loss": 0.81344306, + "learning_rate": 1.133711576532051e-06, + "loss": 0.83507729, + "num_input_tokens_seen": 234683955, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12030029, + "step": 10866, + "time_per_iteration": 2.6614959239959717 + }, + { + "auxiliary_loss_clip": 0.01117865, + "auxiliary_loss_mlp": 0.01029643, + "balance_loss_clip": 1.04402697, + "balance_loss_mlp": 1.01838374, + "epoch": 0.6533593867428228, + "flos": 31807851310080.0, + "grad_norm": 1.4846201570735182, + "language_loss": 0.82425368, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.84572875, + "num_input_tokens_seen": 234704595, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11273193, + "step": 10867, + "time_per_iteration": 4.15783166885376 + }, + { + "auxiliary_loss_clip": 0.01117122, + "auxiliary_loss_mlp": 0.01026677, + "balance_loss_clip": 1.04112864, + "balance_loss_mlp": 1.01531672, + "epoch": 0.6534195099954908, + "flos": 25883773024320.0, + "grad_norm": 1.9700059733235946, + "language_loss": 0.813851, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.835289, + "num_input_tokens_seen": 234724090, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11364746, + "step": 10868, + "time_per_iteration": 3.991034984588623 + }, + { + "auxiliary_loss_clip": 0.01121545, + "auxiliary_loss_mlp": 0.01029437, + "balance_loss_clip": 1.04396653, + "balance_loss_mlp": 1.01655614, + "epoch": 0.6534796332481587, + "flos": 23972683924800.0, + "grad_norm": 2.030447426290313, + "language_loss": 0.79826021, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.81976998, + "num_input_tokens_seen": 234742560, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12878418, + "step": 10869, + "time_per_iteration": 2.6700193881988525 + }, + { + "auxiliary_loss_clip": 0.01121054, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.04508281, + "balance_loss_mlp": 1.01918054, + "epoch": 0.6535397565008267, + "flos": 29312707140000.0, + "grad_norm": 2.109235000186185, + "language_loss": 0.72453809, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.74606228, + "num_input_tokens_seen": 234762315, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.1217041, + "step": 10870, + "time_per_iteration": 2.6606335639953613 + }, + { + "auxiliary_loss_clip": 0.01121349, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.04643023, + "balance_loss_mlp": 1.02313888, + "epoch": 0.6535998797534947, + "flos": 30021131208960.0, + "grad_norm": 2.6646375738614987, + "language_loss": 0.74367368, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.7652359, + "num_input_tokens_seen": 234781300, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11737061, + "step": 10871, + "time_per_iteration": 2.6812593936920166 + }, + { + "auxiliary_loss_clip": 0.01114034, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.04200947, + "balance_loss_mlp": 1.01980472, + "epoch": 0.6536600030061627, + "flos": 28509525853920.0, + "grad_norm": 2.019821873118289, + "language_loss": 0.55759436, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.57904506, + "num_input_tokens_seen": 234801040, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11230469, + "step": 10872, + "time_per_iteration": 2.6664788722991943 + }, + { + "auxiliary_loss_clip": 0.01115989, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.04292381, + "balance_loss_mlp": 1.02212119, + "epoch": 0.6537201262588306, + "flos": 29136806511840.0, + "grad_norm": 1.8975402075628736, + "language_loss": 0.74545687, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.76695061, + "num_input_tokens_seen": 234821415, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11260986, + "step": 10873, + "time_per_iteration": 2.6961569786071777 + }, + { + "auxiliary_loss_clip": 0.01120133, + "auxiliary_loss_mlp": 0.01029798, + "balance_loss_clip": 1.044976, + "balance_loss_mlp": 1.01784158, + "epoch": 0.6537802495114986, + "flos": 29717822786400.0, + "grad_norm": 1.8245946359136478, + "language_loss": 0.75767028, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.77916962, + "num_input_tokens_seen": 234843795, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11950684, + "step": 10874, + "time_per_iteration": 2.710151195526123 + }, + { + "auxiliary_loss_clip": 0.0111842, + "auxiliary_loss_mlp": 0.01034012, + "balance_loss_clip": 1.04391348, + "balance_loss_mlp": 1.02229404, + "epoch": 0.6538403727641665, + "flos": 34162824742560.0, + "grad_norm": 1.5299512508092665, + "language_loss": 0.81577361, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.83729792, + "num_input_tokens_seen": 234862350, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11712646, + "step": 10875, + "time_per_iteration": 2.6560463905334473 + }, + { + "auxiliary_loss_clip": 0.01115809, + "auxiliary_loss_mlp": 0.01036058, + "balance_loss_clip": 1.04036319, + "balance_loss_mlp": 1.02467895, + "epoch": 0.6539004960168345, + "flos": 33633664236000.0, + "grad_norm": 1.853515618137958, + "language_loss": 0.69900751, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.72052622, + "num_input_tokens_seen": 234881790, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.1137085, + "step": 10876, + "time_per_iteration": 2.7026946544647217 + }, + { + "auxiliary_loss_clip": 0.01116983, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.04216337, + "balance_loss_mlp": 1.01915908, + "epoch": 0.6539606192695024, + "flos": 17729900650080.0, + "grad_norm": 2.1106109344179007, + "language_loss": 0.79605103, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.81753361, + "num_input_tokens_seen": 234897775, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12109375, + "step": 10877, + "time_per_iteration": 4.067800760269165 + }, + { + "auxiliary_loss_clip": 0.0111854, + "auxiliary_loss_mlp": 0.01027752, + "balance_loss_clip": 1.0438869, + "balance_loss_mlp": 1.01572394, + "epoch": 0.6540207425221705, + "flos": 26376393294720.0, + "grad_norm": 2.2599031929024327, + "language_loss": 0.80160379, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.82306671, + "num_input_tokens_seen": 234918395, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12030029, + "step": 10878, + "time_per_iteration": 2.6669299602508545 + }, + { + "auxiliary_loss_clip": 0.01115415, + "auxiliary_loss_mlp": 0.01026727, + "balance_loss_clip": 1.04070008, + "balance_loss_mlp": 1.01440692, + "epoch": 0.6540808657748384, + "flos": 21560141787840.0, + "grad_norm": 4.482629813719554, + "language_loss": 0.84371173, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.86513311, + "num_input_tokens_seen": 234936260, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12322998, + "step": 10879, + "time_per_iteration": 2.6065011024475098 + }, + { + "auxiliary_loss_clip": 0.01118809, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.04159856, + "balance_loss_mlp": 1.02042735, + "epoch": 0.6541409890275064, + "flos": 17739179107200.0, + "grad_norm": 3.932426237509112, + "language_loss": 0.71730679, + "learning_rate": 1.128800362199601e-06, + "loss": 0.73882639, + "num_input_tokens_seen": 234952110, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12750244, + "step": 10880, + "time_per_iteration": 3.8388359546661377 + }, + { + "auxiliary_loss_clip": 0.01114734, + "auxiliary_loss_mlp": 0.01029408, + "balance_loss_clip": 1.04105806, + "balance_loss_mlp": 1.01788068, + "epoch": 0.6542011122801744, + "flos": 20944854244800.0, + "grad_norm": 2.0371689891727036, + "language_loss": 0.84562117, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.86706257, + "num_input_tokens_seen": 234970810, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11529541, + "step": 10881, + "time_per_iteration": 2.650381565093994 + }, + { + "auxiliary_loss_clip": 0.01120605, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.04336953, + "balance_loss_mlp": 1.0198977, + "epoch": 0.6542612355328423, + "flos": 22191150035520.0, + "grad_norm": 2.009372882070416, + "language_loss": 0.78003561, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.80157149, + "num_input_tokens_seen": 234989565, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.13092041, + "step": 10882, + "time_per_iteration": 2.614466428756714 + }, + { + "auxiliary_loss_clip": 0.01122095, + "auxiliary_loss_mlp": 0.0102917, + "balance_loss_clip": 1.04528451, + "balance_loss_mlp": 1.01655746, + "epoch": 0.6543213587855103, + "flos": 24150691451520.0, + "grad_norm": 3.972931937124708, + "language_loss": 0.8215493, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.84306192, + "num_input_tokens_seen": 235007955, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12615967, + "step": 10883, + "time_per_iteration": 2.6542294025421143 + }, + { + "auxiliary_loss_clip": 0.01123478, + "auxiliary_loss_mlp": 0.01034039, + "balance_loss_clip": 1.04600716, + "balance_loss_mlp": 1.02164769, + "epoch": 0.6543814820381783, + "flos": 25752556605600.0, + "grad_norm": 3.189743336931546, + "language_loss": 0.85629272, + "learning_rate": 1.127398345803988e-06, + "loss": 0.87786788, + "num_input_tokens_seen": 235024860, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12390137, + "step": 10884, + "time_per_iteration": 2.634124279022217 + }, + { + "auxiliary_loss_clip": 0.01121567, + "auxiliary_loss_mlp": 0.01033551, + "balance_loss_clip": 1.04492867, + "balance_loss_mlp": 1.02134442, + "epoch": 0.6544416052908463, + "flos": 24640718616000.0, + "grad_norm": 2.2255296051039126, + "language_loss": 0.80208659, + "learning_rate": 1.127047924394715e-06, + "loss": 0.82363772, + "num_input_tokens_seen": 235043815, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12213135, + "step": 10885, + "time_per_iteration": 2.6548190116882324 + }, + { + "auxiliary_loss_clip": 0.01117062, + "auxiliary_loss_mlp": 0.01027871, + "balance_loss_clip": 1.04297853, + "balance_loss_mlp": 1.01528311, + "epoch": 0.6545017285435142, + "flos": 28514144823840.0, + "grad_norm": 1.9757530540429278, + "language_loss": 0.72652996, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.74797928, + "num_input_tokens_seen": 235062985, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12585449, + "step": 10886, + "time_per_iteration": 2.6381547451019287 + }, + { + "auxiliary_loss_clip": 0.01116392, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.04268563, + "balance_loss_mlp": 1.01817107, + "epoch": 0.6545618517961822, + "flos": 23348158441920.0, + "grad_norm": 2.95710940599453, + "language_loss": 0.78409785, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.80555749, + "num_input_tokens_seen": 235081670, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.1138916, + "step": 10887, + "time_per_iteration": 2.6505813598632812 + }, + { + "auxiliary_loss_clip": 0.01118297, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.04389858, + "balance_loss_mlp": 1.01954818, + "epoch": 0.6546219750488501, + "flos": 18227788166880.0, + "grad_norm": 3.2702130026974148, + "language_loss": 0.78921402, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.81070507, + "num_input_tokens_seen": 235098510, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11260986, + "step": 10888, + "time_per_iteration": 2.7085378170013428 + }, + { + "auxiliary_loss_clip": 0.01113221, + "auxiliary_loss_mlp": 0.01027563, + "balance_loss_clip": 1.04074931, + "balance_loss_mlp": 1.0166142, + "epoch": 0.6546820983015181, + "flos": 44319504637440.0, + "grad_norm": 1.6585758638402859, + "language_loss": 0.66269982, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.68410766, + "num_input_tokens_seen": 235119990, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10955811, + "step": 10889, + "time_per_iteration": 2.7582201957702637 + }, + { + "auxiliary_loss_clip": 0.01116369, + "auxiliary_loss_mlp": 0.01031018, + "balance_loss_clip": 1.04100919, + "balance_loss_mlp": 1.01869226, + "epoch": 0.654742221554186, + "flos": 24906149732160.0, + "grad_norm": 1.4234908778774717, + "language_loss": 0.79673064, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.81820446, + "num_input_tokens_seen": 235139255, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12335205, + "step": 10890, + "time_per_iteration": 2.7139222621917725 + }, + { + "auxiliary_loss_clip": 0.01116804, + "auxiliary_loss_mlp": 0.0102845, + "balance_loss_clip": 1.04122186, + "balance_loss_mlp": 1.01650572, + "epoch": 0.6548023448068541, + "flos": 30338985335040.0, + "grad_norm": 2.0997110839795026, + "language_loss": 0.65569842, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.67715091, + "num_input_tokens_seen": 235158455, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11938477, + "step": 10891, + "time_per_iteration": 2.6851961612701416 + }, + { + "auxiliary_loss_clip": 0.01117052, + "auxiliary_loss_mlp": 0.01032876, + "balance_loss_clip": 1.0427742, + "balance_loss_mlp": 1.02181363, + "epoch": 0.654862468059522, + "flos": 26146611034560.0, + "grad_norm": 1.8018458912670283, + "language_loss": 0.79510975, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.81660903, + "num_input_tokens_seen": 235177350, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11065674, + "step": 10892, + "time_per_iteration": 2.6948394775390625 + }, + { + "auxiliary_loss_clip": 0.01123572, + "auxiliary_loss_mlp": 0.01032998, + "balance_loss_clip": 1.04535341, + "balance_loss_mlp": 1.0213449, + "epoch": 0.65492259131219, + "flos": 32430837136320.0, + "grad_norm": 1.9838629755684585, + "language_loss": 0.7834059, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.80497164, + "num_input_tokens_seen": 235196435, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.11651611, + "step": 10893, + "time_per_iteration": 2.672057867050171 + }, + { + "auxiliary_loss_clip": 0.01123879, + "auxiliary_loss_mlp": 0.01033061, + "balance_loss_clip": 1.04586053, + "balance_loss_mlp": 1.01991773, + "epoch": 0.6549827145648579, + "flos": 26236546695360.0, + "grad_norm": 1.7299109542053226, + "language_loss": 0.70351815, + "learning_rate": 1.123895622914766e-06, + "loss": 0.72508752, + "num_input_tokens_seen": 235215430, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.13146973, + "step": 10894, + "time_per_iteration": 2.7237274646759033 + }, + { + "auxiliary_loss_clip": 0.0112012, + "auxiliary_loss_mlp": 0.01031059, + "balance_loss_clip": 1.04254019, + "balance_loss_mlp": 1.0191021, + "epoch": 0.6550428378175259, + "flos": 27570387627360.0, + "grad_norm": 4.093827237097143, + "language_loss": 0.6335057, + "learning_rate": 1.123545533127549e-06, + "loss": 0.6550175, + "num_input_tokens_seen": 235232015, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.11968994, + "step": 10895, + "time_per_iteration": 2.6495413780212402 + }, + { + "auxiliary_loss_clip": 0.01117525, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.04250264, + "balance_loss_mlp": 1.02280426, + "epoch": 0.655102961070194, + "flos": 15647003167680.0, + "grad_norm": 2.1771984004033267, + "language_loss": 0.78639627, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.80791736, + "num_input_tokens_seen": 235248115, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11767578, + "step": 10896, + "time_per_iteration": 2.6976442337036133 + }, + { + "auxiliary_loss_clip": 0.01117105, + "auxiliary_loss_mlp": 0.01029776, + "balance_loss_clip": 1.04302907, + "balance_loss_mlp": 1.01878524, + "epoch": 0.6551630843228619, + "flos": 30250386744480.0, + "grad_norm": 2.7651843770231936, + "language_loss": 0.7066201, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.72808886, + "num_input_tokens_seen": 235270785, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.10992432, + "step": 10897, + "time_per_iteration": 2.676210641860962 + }, + { + "auxiliary_loss_clip": 0.0111721, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.04046845, + "balance_loss_mlp": 1.0194627, + "epoch": 0.6552232075755299, + "flos": 20407185109440.0, + "grad_norm": 1.657285275522394, + "language_loss": 0.75475508, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.77623641, + "num_input_tokens_seen": 235287905, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.11474609, + "step": 10898, + "time_per_iteration": 2.6388347148895264 + }, + { + "auxiliary_loss_clip": 0.0111829, + "auxiliary_loss_mlp": 0.0103214, + "balance_loss_clip": 1.04340482, + "balance_loss_mlp": 1.02030873, + "epoch": 0.6552833308281978, + "flos": 26864759250720.0, + "grad_norm": 2.658386007417234, + "language_loss": 0.73655969, + "learning_rate": 1.122145506463827e-06, + "loss": 0.75806397, + "num_input_tokens_seen": 235305525, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11828613, + "step": 10899, + "time_per_iteration": 2.6875016689300537 + }, + { + "auxiliary_loss_clip": 0.01116742, + "auxiliary_loss_mlp": 0.01028115, + "balance_loss_clip": 1.04139543, + "balance_loss_mlp": 1.01673651, + "epoch": 0.6553434540808658, + "flos": 30338701714080.0, + "grad_norm": 4.020259960145913, + "language_loss": 0.55878782, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.58023638, + "num_input_tokens_seen": 235324415, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.1137085, + "step": 10900, + "time_per_iteration": 2.6640474796295166 + }, + { + "auxiliary_loss_clip": 0.01118201, + "auxiliary_loss_mlp": 0.01033274, + "balance_loss_clip": 1.0429529, + "balance_loss_mlp": 1.02051258, + "epoch": 0.6554035773335337, + "flos": 28334638157760.0, + "grad_norm": 2.057227758471475, + "language_loss": 0.76912403, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.79063874, + "num_input_tokens_seen": 235341595, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12750244, + "step": 10901, + "time_per_iteration": 2.6424412727355957 + }, + { + "auxiliary_loss_clip": 0.01114224, + "auxiliary_loss_mlp": 0.01026694, + "balance_loss_clip": 1.04053605, + "balance_loss_mlp": 1.01436234, + "epoch": 0.6554637005862017, + "flos": 27800777646720.0, + "grad_norm": 18.048938864439634, + "language_loss": 0.73247349, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.75388265, + "num_input_tokens_seen": 235361700, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12335205, + "step": 10902, + "time_per_iteration": 2.7149860858917236 + }, + { + "auxiliary_loss_clip": 0.01116632, + "auxiliary_loss_mlp": 0.0102974, + "balance_loss_clip": 1.04333353, + "balance_loss_mlp": 1.01829004, + "epoch": 0.6555238238388696, + "flos": 26243191529280.0, + "grad_norm": 1.6773991452570414, + "language_loss": 0.67593539, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.69739908, + "num_input_tokens_seen": 235382065, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11450195, + "step": 10903, + "time_per_iteration": 2.691830635070801 + }, + { + "auxiliary_loss_clip": 0.01120827, + "auxiliary_loss_mlp": 0.01032371, + "balance_loss_clip": 1.04197741, + "balance_loss_mlp": 1.01979411, + "epoch": 0.6555839470915377, + "flos": 37240605878400.0, + "grad_norm": 1.8914802771221997, + "language_loss": 0.66920292, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.69073492, + "num_input_tokens_seen": 235402130, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.12573242, + "step": 10904, + "time_per_iteration": 2.691589117050171 + }, + { + "auxiliary_loss_clip": 0.01116224, + "auxiliary_loss_mlp": 0.01036591, + "balance_loss_clip": 1.04008186, + "balance_loss_mlp": 1.02346575, + "epoch": 0.6556440703442056, + "flos": 30069826629120.0, + "grad_norm": 2.1812811654295037, + "language_loss": 0.90710467, + "learning_rate": 1.120046465383464e-06, + "loss": 0.9286328, + "num_input_tokens_seen": 235420435, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.13104248, + "step": 10905, + "time_per_iteration": 2.6657984256744385 + }, + { + "auxiliary_loss_clip": 0.01115796, + "auxiliary_loss_mlp": 0.01031506, + "balance_loss_clip": 1.04243684, + "balance_loss_mlp": 1.01979327, + "epoch": 0.6557041935968736, + "flos": 28958636916000.0, + "grad_norm": 1.9100864391789083, + "language_loss": 0.75284797, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.77432096, + "num_input_tokens_seen": 235439960, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.1171875, + "step": 10906, + "time_per_iteration": 2.6585371494293213 + }, + { + "auxiliary_loss_clip": 0.01119678, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.04236364, + "balance_loss_mlp": 1.02578068, + "epoch": 0.6557643168495415, + "flos": 13547412565920.0, + "grad_norm": 2.6236927334770197, + "language_loss": 0.74694014, + "learning_rate": 1.119347051825267e-06, + "loss": 0.76851755, + "num_input_tokens_seen": 235457495, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12286377, + "step": 10907, + "time_per_iteration": 4.052000284194946 + }, + { + "auxiliary_loss_clip": 0.01115761, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.03999305, + "balance_loss_mlp": 1.01501811, + "epoch": 0.6558244401022095, + "flos": 36839825580960.0, + "grad_norm": 1.5054446655350602, + "language_loss": 0.72154933, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74298656, + "num_input_tokens_seen": 235479525, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.1295166, + "step": 10908, + "time_per_iteration": 4.15909218788147 + }, + { + "auxiliary_loss_clip": 0.01120167, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.04428172, + "balance_loss_mlp": 1.01813507, + "epoch": 0.6558845633548775, + "flos": 21879332984160.0, + "grad_norm": 2.4523448737972617, + "language_loss": 0.81383574, + "learning_rate": 1.118647771844861e-06, + "loss": 0.83534682, + "num_input_tokens_seen": 235496305, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12799072, + "step": 10909, + "time_per_iteration": 2.613409996032715 + }, + { + "auxiliary_loss_clip": 0.01118637, + "auxiliary_loss_mlp": 0.01039835, + "balance_loss_clip": 1.04202747, + "balance_loss_mlp": 1.02669823, + "epoch": 0.6559446866075455, + "flos": 26727951447360.0, + "grad_norm": 2.4086093200367857, + "language_loss": 0.63699919, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.65858388, + "num_input_tokens_seen": 235512545, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.13140869, + "step": 10910, + "time_per_iteration": 2.8133115768432617 + }, + { + "auxiliary_loss_clip": 0.01125051, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.04474258, + "balance_loss_mlp": 1.02069342, + "epoch": 0.6560048098602135, + "flos": 17239306243680.0, + "grad_norm": 2.9655739671505166, + "language_loss": 0.76227319, + "learning_rate": 1.117948625548313e-06, + "loss": 0.78386456, + "num_input_tokens_seen": 235526045, + "router_z_loss_clip": 0.80371094, + "router_z_loss_mlp": 0.1340332, + "step": 10911, + "time_per_iteration": 2.643989324569702 + }, + { + "auxiliary_loss_clip": 0.01112993, + "auxiliary_loss_mlp": 0.01027208, + "balance_loss_clip": 1.04036951, + "balance_loss_mlp": 1.01622915, + "epoch": 0.6560649331128814, + "flos": 22948836766560.0, + "grad_norm": 1.5387429771648165, + "language_loss": 0.75333673, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.77473873, + "num_input_tokens_seen": 235545285, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10986328, + "step": 10912, + "time_per_iteration": 2.658460855484009 + }, + { + "auxiliary_loss_clip": 0.01126833, + "auxiliary_loss_mlp": 0.0103575, + "balance_loss_clip": 1.04575849, + "balance_loss_mlp": 1.02267265, + "epoch": 0.6561250563655494, + "flos": 20808775752480.0, + "grad_norm": 1.5385229318316072, + "language_loss": 0.77257043, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.79419625, + "num_input_tokens_seen": 235563150, + "router_z_loss_clip": 0.81152344, + "router_z_loss_mlp": 0.13079834, + "step": 10913, + "time_per_iteration": 2.6047141551971436 + }, + { + "auxiliary_loss_clip": 0.01113388, + "auxiliary_loss_mlp": 0.01027297, + "balance_loss_clip": 1.04078269, + "balance_loss_mlp": 1.01635981, + "epoch": 0.6561851796182173, + "flos": 27622689085440.0, + "grad_norm": 2.085255327561184, + "language_loss": 0.71509039, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.73649728, + "num_input_tokens_seen": 235582535, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.109375, + "step": 10914, + "time_per_iteration": 2.729095220565796 + }, + { + "auxiliary_loss_clip": 0.01116549, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.04160225, + "balance_loss_mlp": 1.01870453, + "epoch": 0.6562453028708853, + "flos": 23476133478240.0, + "grad_norm": 2.165402875084535, + "language_loss": 0.74120474, + "learning_rate": 1.116550734430958e-06, + "loss": 0.76267672, + "num_input_tokens_seen": 235601490, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11938477, + "step": 10915, + "time_per_iteration": 2.697504997253418 + }, + { + "auxiliary_loss_clip": 0.0111343, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.0400641, + "balance_loss_mlp": 1.01558995, + "epoch": 0.6563054261235532, + "flos": 29043021709440.0, + "grad_norm": 1.6040480683962606, + "language_loss": 0.79755521, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.81896907, + "num_input_tokens_seen": 235619165, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12371826, + "step": 10916, + "time_per_iteration": 4.06892991065979 + }, + { + "auxiliary_loss_clip": 0.01116922, + "auxiliary_loss_mlp": 0.01029109, + "balance_loss_clip": 1.04179859, + "balance_loss_mlp": 1.01826668, + "epoch": 0.6563655493762213, + "flos": 23477065375680.0, + "grad_norm": 1.87917588689534, + "language_loss": 0.76246339, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.78392375, + "num_input_tokens_seen": 235637115, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.1083374, + "step": 10917, + "time_per_iteration": 2.6713027954101562 + }, + { + "auxiliary_loss_clip": 0.01115476, + "auxiliary_loss_mlp": 0.01031332, + "balance_loss_clip": 1.04124737, + "balance_loss_mlp": 1.01947701, + "epoch": 0.6564256726288892, + "flos": 31184419793760.0, + "grad_norm": 2.123785614931082, + "language_loss": 0.69540167, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.71686983, + "num_input_tokens_seen": 235656330, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11853027, + "step": 10918, + "time_per_iteration": 2.675291061401367 + }, + { + "auxiliary_loss_clip": 0.01113907, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.04229271, + "balance_loss_mlp": 1.02286792, + "epoch": 0.6564857958815572, + "flos": 27089314781760.0, + "grad_norm": 1.6439295375428395, + "language_loss": 0.76419628, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78567314, + "num_input_tokens_seen": 235674510, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10906982, + "step": 10919, + "time_per_iteration": 3.8749918937683105 + }, + { + "auxiliary_loss_clip": 0.0103753, + "auxiliary_loss_mlp": 0.01001927, + "balance_loss_clip": 1.01400423, + "balance_loss_mlp": 1.00070047, + "epoch": 0.6565459191342251, + "flos": 70922843982720.0, + "grad_norm": 0.7236039203404884, + "language_loss": 0.52958894, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.5499835, + "num_input_tokens_seen": 235735050, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 0.01225281, + "step": 10920, + "time_per_iteration": 3.298863410949707 + }, + { + "auxiliary_loss_clip": 0.01116444, + "auxiliary_loss_mlp": 0.01031688, + "balance_loss_clip": 1.04333639, + "balance_loss_mlp": 1.01935029, + "epoch": 0.6566060423868931, + "flos": 37594838171520.0, + "grad_norm": 2.135931144851268, + "language_loss": 0.65764594, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.67912734, + "num_input_tokens_seen": 235757545, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12347412, + "step": 10921, + "time_per_iteration": 2.6934008598327637 + }, + { + "auxiliary_loss_clip": 0.01115418, + "auxiliary_loss_mlp": 0.01033267, + "balance_loss_clip": 1.04145038, + "balance_loss_mlp": 1.02101862, + "epoch": 0.6566661656395612, + "flos": 28513212926400.0, + "grad_norm": 1.8134926957333941, + "language_loss": 0.80962157, + "learning_rate": 1.114105715254205e-06, + "loss": 0.83110845, + "num_input_tokens_seen": 235777265, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12261963, + "step": 10922, + "time_per_iteration": 2.662278890609741 + }, + { + "auxiliary_loss_clip": 0.01118339, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.04336905, + "balance_loss_mlp": 1.02117074, + "epoch": 0.6567262888922291, + "flos": 31403343422880.0, + "grad_norm": 4.860938253403201, + "language_loss": 0.71439064, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.73590541, + "num_input_tokens_seen": 235796565, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11975098, + "step": 10923, + "time_per_iteration": 2.676006317138672 + }, + { + "auxiliary_loss_clip": 0.01119201, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.04357278, + "balance_loss_mlp": 1.02144682, + "epoch": 0.6567864121448971, + "flos": 20894578650720.0, + "grad_norm": 2.844130157224003, + "language_loss": 0.81036341, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.83189046, + "num_input_tokens_seen": 235814805, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12060547, + "step": 10924, + "time_per_iteration": 2.6134040355682373 + }, + { + "auxiliary_loss_clip": 0.01115276, + "auxiliary_loss_mlp": 0.01027563, + "balance_loss_clip": 1.04116559, + "balance_loss_mlp": 1.0166254, + "epoch": 0.656846535397565, + "flos": 27356204520000.0, + "grad_norm": 1.6594029142538904, + "language_loss": 0.72292936, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.74435771, + "num_input_tokens_seen": 235833405, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.109375, + "step": 10925, + "time_per_iteration": 2.6675782203674316 + }, + { + "auxiliary_loss_clip": 0.01117178, + "auxiliary_loss_mlp": 0.01026842, + "balance_loss_clip": 1.04203463, + "balance_loss_mlp": 1.0158031, + "epoch": 0.656906658650233, + "flos": 21603610478880.0, + "grad_norm": 2.8596346879172008, + "language_loss": 0.72860408, + "learning_rate": 1.112709300197942e-06, + "loss": 0.75004423, + "num_input_tokens_seen": 235848530, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.1104126, + "step": 10926, + "time_per_iteration": 2.6003198623657227 + }, + { + "auxiliary_loss_clip": 0.0111807, + "auxiliary_loss_mlp": 0.01032746, + "balance_loss_clip": 1.04099476, + "balance_loss_mlp": 1.01985395, + "epoch": 0.6569667819029009, + "flos": 25838318986560.0, + "grad_norm": 1.732507234416387, + "language_loss": 0.72576833, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.74727648, + "num_input_tokens_seen": 235867225, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12908936, + "step": 10927, + "time_per_iteration": 2.638855457305908 + }, + { + "auxiliary_loss_clip": 0.01037892, + "auxiliary_loss_mlp": 0.01001783, + "balance_loss_clip": 1.01441705, + "balance_loss_mlp": 1.00059056, + "epoch": 0.6570269051555689, + "flos": 83903097183840.0, + "grad_norm": 0.7246817653367312, + "language_loss": 0.64479744, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66519415, + "num_input_tokens_seen": 235932925, + "router_z_loss_clip": 0.23486328, + "router_z_loss_mlp": 0.01190948, + "step": 10928, + "time_per_iteration": 3.257270574569702 + }, + { + "auxiliary_loss_clip": 0.01116044, + "auxiliary_loss_mlp": 0.01032277, + "balance_loss_clip": 1.04118514, + "balance_loss_mlp": 1.02030277, + "epoch": 0.6570870284082369, + "flos": 32115616633440.0, + "grad_norm": 1.6689651295647145, + "language_loss": 0.77684331, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.79832649, + "num_input_tokens_seen": 235952680, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11987305, + "step": 10929, + "time_per_iteration": 2.7094578742980957 + }, + { + "auxiliary_loss_clip": 0.01117051, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.04261971, + "balance_loss_mlp": 1.01842856, + "epoch": 0.6571471516609049, + "flos": 31939108246080.0, + "grad_norm": 1.8407770209128407, + "language_loss": 0.654755, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.67622828, + "num_input_tokens_seen": 235972075, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11846924, + "step": 10930, + "time_per_iteration": 2.646021842956543 + }, + { + "auxiliary_loss_clip": 0.01117505, + "auxiliary_loss_mlp": 0.01028766, + "balance_loss_clip": 1.04210877, + "balance_loss_mlp": 1.01619518, + "epoch": 0.6572072749135728, + "flos": 24863086213920.0, + "grad_norm": 1.839041419697913, + "language_loss": 0.70885372, + "learning_rate": 1.110964538515258e-06, + "loss": 0.7303164, + "num_input_tokens_seen": 235990340, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12579346, + "step": 10931, + "time_per_iteration": 2.7428698539733887 + }, + { + "auxiliary_loss_clip": 0.01120299, + "auxiliary_loss_mlp": 0.01034956, + "balance_loss_clip": 1.04303122, + "balance_loss_mlp": 1.02323198, + "epoch": 0.6572673981662408, + "flos": 20900251069920.0, + "grad_norm": 2.3999712930113724, + "language_loss": 0.68608057, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.70763314, + "num_input_tokens_seen": 236007470, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.1171875, + "step": 10932, + "time_per_iteration": 2.5883216857910156 + }, + { + "auxiliary_loss_clip": 0.011163, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.04179466, + "balance_loss_mlp": 1.01521337, + "epoch": 0.6573275214189087, + "flos": 50366169161280.0, + "grad_norm": 12.467983322197474, + "language_loss": 0.80208713, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.82351792, + "num_input_tokens_seen": 236029030, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11572266, + "step": 10933, + "time_per_iteration": 2.8020317554473877 + }, + { + "auxiliary_loss_clip": 0.01120054, + "auxiliary_loss_mlp": 0.01035649, + "balance_loss_clip": 1.04310453, + "balance_loss_mlp": 1.02301836, + "epoch": 0.6573876446715767, + "flos": 27931062168000.0, + "grad_norm": 1.7634492271974187, + "language_loss": 0.73917961, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.76073658, + "num_input_tokens_seen": 236047160, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12634277, + "step": 10934, + "time_per_iteration": 2.6325740814208984 + }, + { + "auxiliary_loss_clip": 0.01118114, + "auxiliary_loss_mlp": 0.01036727, + "balance_loss_clip": 1.04374957, + "balance_loss_mlp": 1.02450824, + "epoch": 0.6574477679242448, + "flos": 53712825382080.0, + "grad_norm": 1.7582973208313757, + "language_loss": 0.76172364, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.78327209, + "num_input_tokens_seen": 236069215, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12213135, + "step": 10935, + "time_per_iteration": 2.832562208175659 + }, + { + "auxiliary_loss_clip": 0.01118618, + "auxiliary_loss_mlp": 0.01038956, + "balance_loss_clip": 1.0424149, + "balance_loss_mlp": 1.02574158, + "epoch": 0.6575078911769127, + "flos": 29982524591520.0, + "grad_norm": 1.7370836386095176, + "language_loss": 0.78903866, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.81061441, + "num_input_tokens_seen": 236088335, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.13226318, + "step": 10936, + "time_per_iteration": 2.7364821434020996 + }, + { + "auxiliary_loss_clip": 0.01114512, + "auxiliary_loss_mlp": 0.01031274, + "balance_loss_clip": 1.04164672, + "balance_loss_mlp": 1.01978254, + "epoch": 0.6575680144295807, + "flos": 25531485560640.0, + "grad_norm": 2.9479840213289634, + "language_loss": 0.69328117, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.71473908, + "num_input_tokens_seen": 236108540, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11480713, + "step": 10937, + "time_per_iteration": 2.6745264530181885 + }, + { + "auxiliary_loss_clip": 0.01115449, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.04161549, + "balance_loss_mlp": 1.01713419, + "epoch": 0.6576281376822486, + "flos": 13330798421760.0, + "grad_norm": 2.2794207750049353, + "language_loss": 0.6846422, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.70608664, + "num_input_tokens_seen": 236124495, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11877441, + "step": 10938, + "time_per_iteration": 2.683805227279663 + }, + { + "auxiliary_loss_clip": 0.01117967, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.04223728, + "balance_loss_mlp": 1.01873136, + "epoch": 0.6576882609349166, + "flos": 23527259935200.0, + "grad_norm": 2.1597590362383903, + "language_loss": 0.71268904, + "learning_rate": 1.108174673550927e-06, + "loss": 0.73417699, + "num_input_tokens_seen": 236142550, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12097168, + "step": 10939, + "time_per_iteration": 2.6012425422668457 + }, + { + "auxiliary_loss_clip": 0.01120794, + "auxiliary_loss_mlp": 0.01028433, + "balance_loss_clip": 1.04333258, + "balance_loss_mlp": 1.01555872, + "epoch": 0.6577483841875845, + "flos": 24673976952480.0, + "grad_norm": 2.5480633623450304, + "language_loss": 0.7774961, + "learning_rate": 1.107826092473037e-06, + "loss": 0.79898834, + "num_input_tokens_seen": 236156620, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12860107, + "step": 10940, + "time_per_iteration": 2.6271250247955322 + }, + { + "auxiliary_loss_clip": 0.01120508, + "auxiliary_loss_mlp": 0.01030393, + "balance_loss_clip": 1.04210043, + "balance_loss_mlp": 1.01869273, + "epoch": 0.6578085074402525, + "flos": 42404404327200.0, + "grad_norm": 2.0883222825303376, + "language_loss": 0.68737817, + "learning_rate": 1.107477545226471e-06, + "loss": 0.70888722, + "num_input_tokens_seen": 236177095, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.11706543, + "step": 10941, + "time_per_iteration": 2.705223798751831 + }, + { + "auxiliary_loss_clip": 0.01115902, + "auxiliary_loss_mlp": 0.01025778, + "balance_loss_clip": 1.04128313, + "balance_loss_mlp": 1.01403618, + "epoch": 0.6578686306929205, + "flos": 28640134513440.0, + "grad_norm": 2.566804298225174, + "language_loss": 0.68554133, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.70695817, + "num_input_tokens_seen": 236194695, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11743164, + "step": 10942, + "time_per_iteration": 2.6599223613739014 + }, + { + "auxiliary_loss_clip": 0.01125053, + "auxiliary_loss_mlp": 0.01037152, + "balance_loss_clip": 1.04513729, + "balance_loss_mlp": 1.02409244, + "epoch": 0.6579287539455885, + "flos": 22057502580000.0, + "grad_norm": 2.6554047826149687, + "language_loss": 0.71699357, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.73861563, + "num_input_tokens_seen": 236213885, + "router_z_loss_clip": 0.79833984, + "router_z_loss_mlp": 0.1305542, + "step": 10943, + "time_per_iteration": 2.6140849590301514 + }, + { + "auxiliary_loss_clip": 0.01115722, + "auxiliary_loss_mlp": 0.01030638, + "balance_loss_clip": 1.04173255, + "balance_loss_mlp": 1.01881218, + "epoch": 0.6579888771982564, + "flos": 34972975000800.0, + "grad_norm": 1.66293266979719, + "language_loss": 0.59405565, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.61551929, + "num_input_tokens_seen": 236237315, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11846924, + "step": 10944, + "time_per_iteration": 2.708383321762085 + }, + { + "auxiliary_loss_clip": 0.01121431, + "auxiliary_loss_mlp": 0.01036932, + "balance_loss_clip": 1.04247689, + "balance_loss_mlp": 1.02460539, + "epoch": 0.6580490004509244, + "flos": 30562122761280.0, + "grad_norm": 1.5528566391697844, + "language_loss": 0.72532374, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.74690729, + "num_input_tokens_seen": 236256345, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.12322998, + "step": 10945, + "time_per_iteration": 2.628861665725708 + }, + { + "auxiliary_loss_clip": 0.01117169, + "auxiliary_loss_mlp": 0.01025719, + "balance_loss_clip": 1.04329622, + "balance_loss_mlp": 1.01464415, + "epoch": 0.6581091237035923, + "flos": 53091136108800.0, + "grad_norm": 2.773905246721577, + "language_loss": 0.70743626, + "learning_rate": 1.105735316926046e-06, + "loss": 0.72886515, + "num_input_tokens_seen": 236281890, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11077881, + "step": 10946, + "time_per_iteration": 4.261468410491943 + }, + { + "auxiliary_loss_clip": 0.0111862, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.0434221, + "balance_loss_mlp": 1.02021098, + "epoch": 0.6581692469562603, + "flos": 27351788136480.0, + "grad_norm": 3.0392684605770417, + "language_loss": 0.82093847, + "learning_rate": 1.105386972944934e-06, + "loss": 0.84244758, + "num_input_tokens_seen": 236298370, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12078857, + "step": 10947, + "time_per_iteration": 3.998410701751709 + }, + { + "auxiliary_loss_clip": 0.01119139, + "auxiliary_loss_mlp": 0.01026837, + "balance_loss_clip": 1.04217887, + "balance_loss_mlp": 1.01569724, + "epoch": 0.6582293702089284, + "flos": 30334609468800.0, + "grad_norm": 1.6771159922335475, + "language_loss": 0.77275276, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.79421258, + "num_input_tokens_seen": 236317380, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11126709, + "step": 10948, + "time_per_iteration": 2.659198760986328 + }, + { + "auxiliary_loss_clip": 0.01117097, + "auxiliary_loss_mlp": 0.01027465, + "balance_loss_clip": 1.0434742, + "balance_loss_mlp": 1.01617002, + "epoch": 0.6582894934615963, + "flos": 28113202457280.0, + "grad_norm": 4.569297507369188, + "language_loss": 0.79086792, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.81231356, + "num_input_tokens_seen": 236336210, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11303711, + "step": 10949, + "time_per_iteration": 2.670285701751709 + }, + { + "auxiliary_loss_clip": 0.01037731, + "auxiliary_loss_mlp": 0.01002248, + "balance_loss_clip": 1.01418722, + "balance_loss_mlp": 1.00108433, + "epoch": 0.6583496167142643, + "flos": 88528456668960.0, + "grad_norm": 0.7384811620453093, + "language_loss": 0.61824024, + "learning_rate": 1.104342144597323e-06, + "loss": 0.63863999, + "num_input_tokens_seen": 236403090, + "router_z_loss_clip": 0.2355957, + "router_z_loss_mlp": 0.0116272, + "step": 10950, + "time_per_iteration": 3.3665168285369873 + }, + { + "auxiliary_loss_clip": 0.01113492, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.04081345, + "balance_loss_mlp": 1.0234673, + "epoch": 0.6584097399669322, + "flos": 15958252977120.0, + "grad_norm": 2.051201554259015, + "language_loss": 0.67229074, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.69376975, + "num_input_tokens_seen": 236420475, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10943604, + "step": 10951, + "time_per_iteration": 2.608583450317383 + }, + { + "auxiliary_loss_clip": 0.01118695, + "auxiliary_loss_mlp": 0.01034612, + "balance_loss_clip": 1.04422128, + "balance_loss_mlp": 1.02295935, + "epoch": 0.6584698632196002, + "flos": 35012189377440.0, + "grad_norm": 1.6139892782361627, + "language_loss": 0.765683, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.78721607, + "num_input_tokens_seen": 236441915, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11663818, + "step": 10952, + "time_per_iteration": 2.7712934017181396 + }, + { + "auxiliary_loss_clip": 0.0111534, + "auxiliary_loss_mlp": 0.01028918, + "balance_loss_clip": 1.04266572, + "balance_loss_mlp": 1.01733685, + "epoch": 0.6585299864722681, + "flos": 17471924713440.0, + "grad_norm": 2.051289895529144, + "language_loss": 0.73292822, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.75437081, + "num_input_tokens_seen": 236460340, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11578369, + "step": 10953, + "time_per_iteration": 2.647163152694702 + }, + { + "auxiliary_loss_clip": 0.01116406, + "auxiliary_loss_mlp": 0.01038127, + "balance_loss_clip": 1.04175854, + "balance_loss_mlp": 1.0259738, + "epoch": 0.6585901097249361, + "flos": 32699266531200.0, + "grad_norm": 1.9101841937604316, + "language_loss": 0.78584135, + "learning_rate": 1.102949515683546e-06, + "loss": 0.80738664, + "num_input_tokens_seen": 236478280, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.121521, + "step": 10954, + "time_per_iteration": 2.689093589782715 + }, + { + "auxiliary_loss_clip": 0.0111802, + "auxiliary_loss_mlp": 0.01038009, + "balance_loss_clip": 1.04239798, + "balance_loss_mlp": 1.02563512, + "epoch": 0.658650232977604, + "flos": 22858131277440.0, + "grad_norm": 2.3826495844391395, + "language_loss": 0.69341952, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.71497983, + "num_input_tokens_seen": 236493225, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12384033, + "step": 10955, + "time_per_iteration": 2.613771677017212 + }, + { + "auxiliary_loss_clip": 0.011138, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.04202747, + "balance_loss_mlp": 1.02228642, + "epoch": 0.6587103562302721, + "flos": 30204568051200.0, + "grad_norm": 2.293966107810187, + "language_loss": 0.8043555, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.82581997, + "num_input_tokens_seen": 236514420, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10369873, + "step": 10956, + "time_per_iteration": 4.1202921867370605 + }, + { + "auxiliary_loss_clip": 0.0111815, + "auxiliary_loss_mlp": 0.01041332, + "balance_loss_clip": 1.04409432, + "balance_loss_mlp": 1.02904201, + "epoch": 0.65877047948294, + "flos": 27266917135680.0, + "grad_norm": 3.235996856161176, + "language_loss": 0.81193149, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.83352637, + "num_input_tokens_seen": 236532785, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12280273, + "step": 10957, + "time_per_iteration": 2.7491180896759033 + }, + { + "auxiliary_loss_clip": 0.01116874, + "auxiliary_loss_mlp": 0.01032213, + "balance_loss_clip": 1.04424644, + "balance_loss_mlp": 1.021752, + "epoch": 0.658830602735608, + "flos": 55131739901280.0, + "grad_norm": 1.6274253492578037, + "language_loss": 0.75846976, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.77996063, + "num_input_tokens_seen": 236553330, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10467529, + "step": 10958, + "time_per_iteration": 4.29412317276001 + }, + { + "auxiliary_loss_clip": 0.01115152, + "auxiliary_loss_mlp": 0.01030317, + "balance_loss_clip": 1.04233718, + "balance_loss_mlp": 1.0189209, + "epoch": 0.6588907259882759, + "flos": 24283933734240.0, + "grad_norm": 1.677858616743974, + "language_loss": 0.74842894, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.76988363, + "num_input_tokens_seen": 236572960, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11401367, + "step": 10959, + "time_per_iteration": 2.6763556003570557 + }, + { + "auxiliary_loss_clip": 0.01115388, + "auxiliary_loss_mlp": 0.01029822, + "balance_loss_clip": 1.04169655, + "balance_loss_mlp": 1.01844335, + "epoch": 0.6589508492409439, + "flos": 29448339942240.0, + "grad_norm": 1.5017367591324793, + "language_loss": 0.65010977, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.67156184, + "num_input_tokens_seen": 236594090, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11383057, + "step": 10960, + "time_per_iteration": 2.685896396636963 + }, + { + "auxiliary_loss_clip": 0.01122052, + "auxiliary_loss_mlp": 0.01032516, + "balance_loss_clip": 1.04334545, + "balance_loss_mlp": 1.0197432, + "epoch": 0.659010972493612, + "flos": 22235469589440.0, + "grad_norm": 2.082228670449021, + "language_loss": 0.82388633, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.84543204, + "num_input_tokens_seen": 236610190, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12780762, + "step": 10961, + "time_per_iteration": 2.607327699661255 + }, + { + "auxiliary_loss_clip": 0.01117263, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.04338682, + "balance_loss_mlp": 1.01872742, + "epoch": 0.6590710957462799, + "flos": 33678105341760.0, + "grad_norm": 2.1668105953267554, + "language_loss": 0.73538935, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.75686526, + "num_input_tokens_seen": 236631575, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11608887, + "step": 10962, + "time_per_iteration": 2.7303378582000732 + }, + { + "auxiliary_loss_clip": 0.01117182, + "auxiliary_loss_mlp": 0.01032747, + "balance_loss_clip": 1.04152012, + "balance_loss_mlp": 1.02096367, + "epoch": 0.6591312189989479, + "flos": 24773677277760.0, + "grad_norm": 2.0094913709624023, + "language_loss": 0.79569674, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.81719601, + "num_input_tokens_seen": 236649815, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11791992, + "step": 10963, + "time_per_iteration": 2.6323208808898926 + }, + { + "auxiliary_loss_clip": 0.01114236, + "auxiliary_loss_mlp": 0.01031596, + "balance_loss_clip": 1.04106402, + "balance_loss_mlp": 1.02000904, + "epoch": 0.6591913422516158, + "flos": 15023612168640.0, + "grad_norm": 1.8532325446184668, + "language_loss": 0.78203022, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.80348855, + "num_input_tokens_seen": 236668335, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11584473, + "step": 10964, + "time_per_iteration": 2.6414291858673096 + }, + { + "auxiliary_loss_clip": 0.01116382, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.04030967, + "balance_loss_mlp": 1.02375269, + "epoch": 0.6592514655042838, + "flos": 31584875952960.0, + "grad_norm": 1.647486245121565, + "language_loss": 0.73681617, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.75833058, + "num_input_tokens_seen": 236688945, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11309814, + "step": 10965, + "time_per_iteration": 2.6419756412506104 + }, + { + "auxiliary_loss_clip": 0.01121563, + "auxiliary_loss_mlp": 0.01032942, + "balance_loss_clip": 1.04315758, + "balance_loss_mlp": 1.02131867, + "epoch": 0.6593115887569517, + "flos": 17159986110240.0, + "grad_norm": 2.1880401494969806, + "language_loss": 0.73910481, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.76064986, + "num_input_tokens_seen": 236707055, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.11633301, + "step": 10966, + "time_per_iteration": 2.6972463130950928 + }, + { + "auxiliary_loss_clip": 0.01116597, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.04178786, + "balance_loss_mlp": 1.01810515, + "epoch": 0.6593717120096197, + "flos": 30161139877440.0, + "grad_norm": 1.7774935145963955, + "language_loss": 0.77124202, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.79271674, + "num_input_tokens_seen": 236725900, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12780762, + "step": 10967, + "time_per_iteration": 2.6517322063446045 + }, + { + "auxiliary_loss_clip": 0.01036813, + "auxiliary_loss_mlp": 0.01004098, + "balance_loss_clip": 1.01335776, + "balance_loss_mlp": 1.00296283, + "epoch": 0.6594318352622877, + "flos": 67790417736960.0, + "grad_norm": 0.6973840335666345, + "language_loss": 0.48475444, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50516355, + "num_input_tokens_seen": 236788415, + "router_z_loss_clip": 0.23449707, + "router_z_loss_mlp": 0.01136017, + "step": 10968, + "time_per_iteration": 3.3002984523773193 + }, + { + "auxiliary_loss_clip": 0.01118287, + "auxiliary_loss_mlp": 0.01031226, + "balance_loss_clip": 1.04191554, + "balance_loss_mlp": 1.01926351, + "epoch": 0.6594919585149557, + "flos": 21300747746400.0, + "grad_norm": 1.8819513121588094, + "language_loss": 0.78751945, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.80901456, + "num_input_tokens_seen": 236805155, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11950684, + "step": 10969, + "time_per_iteration": 2.621917963027954 + }, + { + "auxiliary_loss_clip": 0.01114614, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.04038858, + "balance_loss_mlp": 1.01903415, + "epoch": 0.6595520817676236, + "flos": 22235834244960.0, + "grad_norm": 2.127491393006265, + "language_loss": 0.6509462, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.67239851, + "num_input_tokens_seen": 236824360, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1159668, + "step": 10970, + "time_per_iteration": 2.589158535003662 + }, + { + "auxiliary_loss_clip": 0.01116745, + "auxiliary_loss_mlp": 0.01027167, + "balance_loss_clip": 1.04128027, + "balance_loss_mlp": 1.01550865, + "epoch": 0.6596122050202916, + "flos": 27088342367040.0, + "grad_norm": 1.5642469919204864, + "language_loss": 0.76399732, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.78543639, + "num_input_tokens_seen": 236844640, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11663818, + "step": 10971, + "time_per_iteration": 2.6635966300964355 + }, + { + "auxiliary_loss_clip": 0.01119673, + "auxiliary_loss_mlp": 0.01030822, + "balance_loss_clip": 1.04327822, + "balance_loss_mlp": 1.01936638, + "epoch": 0.6596723282729595, + "flos": 17294403394080.0, + "grad_norm": 3.299310324705195, + "language_loss": 0.70232981, + "learning_rate": 1.096689432978629e-06, + "loss": 0.72383481, + "num_input_tokens_seen": 236861160, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11450195, + "step": 10972, + "time_per_iteration": 2.5815951824188232 + }, + { + "auxiliary_loss_clip": 0.01115833, + "auxiliary_loss_mlp": 0.01023725, + "balance_loss_clip": 1.04199958, + "balance_loss_mlp": 1.011953, + "epoch": 0.6597324515256275, + "flos": 37282008188160.0, + "grad_norm": 2.104595606711167, + "language_loss": 0.55605686, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.57745248, + "num_input_tokens_seen": 236880465, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11773682, + "step": 10973, + "time_per_iteration": 2.7598965167999268 + }, + { + "auxiliary_loss_clip": 0.01121943, + "auxiliary_loss_mlp": 0.01036115, + "balance_loss_clip": 1.04265404, + "balance_loss_mlp": 1.02418804, + "epoch": 0.6597925747782956, + "flos": 21523925689920.0, + "grad_norm": 1.9768064271599537, + "language_loss": 0.78992337, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.81150395, + "num_input_tokens_seen": 236897730, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.11920166, + "step": 10974, + "time_per_iteration": 2.6303164958953857 + }, + { + "auxiliary_loss_clip": 0.01117292, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.04223442, + "balance_loss_mlp": 1.02261472, + "epoch": 0.6598526980309635, + "flos": 27844205820480.0, + "grad_norm": 4.770343384320929, + "language_loss": 0.68611729, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.70763433, + "num_input_tokens_seen": 236917300, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11804199, + "step": 10975, + "time_per_iteration": 2.7328567504882812 + }, + { + "auxiliary_loss_clip": 0.01115523, + "auxiliary_loss_mlp": 0.01029056, + "balance_loss_clip": 1.04022455, + "balance_loss_mlp": 1.01767182, + "epoch": 0.6599128212836315, + "flos": 25707386188800.0, + "grad_norm": 2.478718278564229, + "language_loss": 0.70477998, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.72622579, + "num_input_tokens_seen": 236935590, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.1138916, + "step": 10976, + "time_per_iteration": 2.624711275100708 + }, + { + "auxiliary_loss_clip": 0.01112461, + "auxiliary_loss_mlp": 0.01027203, + "balance_loss_clip": 1.04000759, + "balance_loss_mlp": 1.0148108, + "epoch": 0.6599729445362994, + "flos": 27044103847680.0, + "grad_norm": 1.9541990428505642, + "language_loss": 0.67424721, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.69564378, + "num_input_tokens_seen": 236952830, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.1239624, + "step": 10977, + "time_per_iteration": 2.651459217071533 + }, + { + "auxiliary_loss_clip": 0.01121931, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.04340529, + "balance_loss_mlp": 1.01773977, + "epoch": 0.6600330677889674, + "flos": 22146587377920.0, + "grad_norm": 2.290147238286102, + "language_loss": 0.81620222, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.83772832, + "num_input_tokens_seen": 236971930, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12939453, + "step": 10978, + "time_per_iteration": 2.6241543292999268 + }, + { + "auxiliary_loss_clip": 0.01120738, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.04331195, + "balance_loss_mlp": 1.02474189, + "epoch": 0.6600931910416353, + "flos": 22146627895200.0, + "grad_norm": 2.2556749105208653, + "language_loss": 0.67232645, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.69390941, + "num_input_tokens_seen": 236989920, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12805176, + "step": 10979, + "time_per_iteration": 2.7102601528167725 + }, + { + "auxiliary_loss_clip": 0.01117267, + "auxiliary_loss_mlp": 0.01026862, + "balance_loss_clip": 1.04038727, + "balance_loss_mlp": 1.01473844, + "epoch": 0.6601533142943034, + "flos": 21256549744320.0, + "grad_norm": 4.6574486631198795, + "language_loss": 0.73142046, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.75286174, + "num_input_tokens_seen": 237006570, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12133789, + "step": 10980, + "time_per_iteration": 2.6174087524414062 + }, + { + "auxiliary_loss_clip": 0.01112976, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.04133439, + "balance_loss_mlp": 1.01870525, + "epoch": 0.6602134375469713, + "flos": 34434211898880.0, + "grad_norm": 1.75684149215006, + "language_loss": 0.73053813, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.75196457, + "num_input_tokens_seen": 237028415, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10968018, + "step": 10981, + "time_per_iteration": 2.719058036804199 + }, + { + "auxiliary_loss_clip": 0.01117499, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.04152989, + "balance_loss_mlp": 1.02127409, + "epoch": 0.6602735607996393, + "flos": 35897202868320.0, + "grad_norm": 2.3792986296735923, + "language_loss": 0.68818015, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.7096833, + "num_input_tokens_seen": 237046595, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11547852, + "step": 10982, + "time_per_iteration": 2.751922130584717 + }, + { + "auxiliary_loss_clip": 0.01115683, + "auxiliary_loss_mlp": 0.01028413, + "balance_loss_clip": 1.04236376, + "balance_loss_mlp": 1.01679027, + "epoch": 0.6603336840523072, + "flos": 22681015130880.0, + "grad_norm": 1.6371369546032515, + "language_loss": 0.6974318, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.71887273, + "num_input_tokens_seen": 237066150, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.1161499, + "step": 10983, + "time_per_iteration": 2.6208670139312744 + }, + { + "auxiliary_loss_clip": 0.01116506, + "auxiliary_loss_mlp": 0.01029849, + "balance_loss_clip": 1.04065776, + "balance_loss_mlp": 1.01791024, + "epoch": 0.6603938073049752, + "flos": 40578631918560.0, + "grad_norm": 2.7266114825477823, + "language_loss": 0.71123874, + "learning_rate": 1.092522205413239e-06, + "loss": 0.73270226, + "num_input_tokens_seen": 237087060, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.1194458, + "step": 10984, + "time_per_iteration": 2.72477126121521 + }, + { + "auxiliary_loss_clip": 0.01115195, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.04194474, + "balance_loss_mlp": 1.02043879, + "epoch": 0.6604539305576431, + "flos": 21212068121280.0, + "grad_norm": 1.665899679726056, + "language_loss": 0.83744633, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.85891867, + "num_input_tokens_seen": 237103825, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11602783, + "step": 10985, + "time_per_iteration": 4.0131330490112305 + }, + { + "auxiliary_loss_clip": 0.01118251, + "auxiliary_loss_mlp": 0.0102979, + "balance_loss_clip": 1.04300487, + "balance_loss_mlp": 1.01745176, + "epoch": 0.6605140538103111, + "flos": 25930766718720.0, + "grad_norm": 2.4614057219411474, + "language_loss": 0.74167311, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.76315355, + "num_input_tokens_seen": 237121740, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12341309, + "step": 10986, + "time_per_iteration": 2.6508748531341553 + }, + { + "auxiliary_loss_clip": 0.01114217, + "auxiliary_loss_mlp": 0.01027012, + "balance_loss_clip": 1.04184878, + "balance_loss_mlp": 1.015306, + "epoch": 0.6605741770629792, + "flos": 16937942650560.0, + "grad_norm": 1.8838119676206215, + "language_loss": 0.789226, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.81063831, + "num_input_tokens_seen": 237139565, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11706543, + "step": 10987, + "time_per_iteration": 4.047450542449951 + }, + { + "auxiliary_loss_clip": 0.0103667, + "auxiliary_loss_mlp": 0.01000943, + "balance_loss_clip": 1.01305664, + "balance_loss_mlp": 0.99988139, + "epoch": 0.6606343003156471, + "flos": 84578870675520.0, + "grad_norm": 0.8215318610719542, + "language_loss": 0.54090041, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.56127656, + "num_input_tokens_seen": 237201055, + "router_z_loss_clip": 0.23608398, + "router_z_loss_mlp": 0.01062775, + "step": 10988, + "time_per_iteration": 3.3478105068206787 + }, + { + "auxiliary_loss_clip": 0.01116889, + "auxiliary_loss_mlp": 0.01030372, + "balance_loss_clip": 1.04360843, + "balance_loss_mlp": 1.01981592, + "epoch": 0.6606944235683151, + "flos": 33278256941760.0, + "grad_norm": 1.7206315583248888, + "language_loss": 0.77203453, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.79350716, + "num_input_tokens_seen": 237221805, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.10559082, + "step": 10989, + "time_per_iteration": 2.684788465499878 + }, + { + "auxiliary_loss_clip": 0.01119793, + "auxiliary_loss_mlp": 0.01031244, + "balance_loss_clip": 1.04636431, + "balance_loss_mlp": 1.01957345, + "epoch": 0.660754546820983, + "flos": 16805024506080.0, + "grad_norm": 2.6496489253777584, + "language_loss": 0.77237368, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.7938841, + "num_input_tokens_seen": 237238270, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11676025, + "step": 10990, + "time_per_iteration": 2.594313383102417 + }, + { + "auxiliary_loss_clip": 0.0111844, + "auxiliary_loss_mlp": 0.01031878, + "balance_loss_clip": 1.04217362, + "balance_loss_mlp": 1.01983821, + "epoch": 0.660814670073651, + "flos": 19163198803680.0, + "grad_norm": 2.585279480921569, + "language_loss": 0.60566515, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.6271683, + "num_input_tokens_seen": 237255400, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12036133, + "step": 10991, + "time_per_iteration": 2.5770621299743652 + }, + { + "auxiliary_loss_clip": 0.01121149, + "auxiliary_loss_mlp": 0.01037022, + "balance_loss_clip": 1.04326224, + "balance_loss_mlp": 1.02485085, + "epoch": 0.6608747933263189, + "flos": 25441266278880.0, + "grad_norm": 3.031092586595328, + "language_loss": 0.68700087, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.70858258, + "num_input_tokens_seen": 237273105, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12164307, + "step": 10992, + "time_per_iteration": 2.6541783809661865 + }, + { + "auxiliary_loss_clip": 0.01118759, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.04110432, + "balance_loss_mlp": 1.01831698, + "epoch": 0.660934916578987, + "flos": 25175267920800.0, + "grad_norm": 1.8169663197543537, + "language_loss": 0.87993008, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.90142411, + "num_input_tokens_seen": 237292650, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12322998, + "step": 10993, + "time_per_iteration": 2.694080352783203 + }, + { + "auxiliary_loss_clip": 0.01123192, + "auxiliary_loss_mlp": 0.01031674, + "balance_loss_clip": 1.04359663, + "balance_loss_mlp": 1.01835883, + "epoch": 0.6609950398316549, + "flos": 30644481690720.0, + "grad_norm": 1.6907235332546562, + "language_loss": 0.6741538, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.69570243, + "num_input_tokens_seen": 237312865, + "router_z_loss_clip": 0.79541016, + "router_z_loss_mlp": 0.13317871, + "step": 10994, + "time_per_iteration": 2.728668689727783 + }, + { + "auxiliary_loss_clip": 0.01120923, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.04428482, + "balance_loss_mlp": 1.01967287, + "epoch": 0.6610551630843229, + "flos": 22770261997920.0, + "grad_norm": 6.132451764226712, + "language_loss": 0.76546133, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.78698891, + "num_input_tokens_seen": 237331210, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12164307, + "step": 10995, + "time_per_iteration": 2.637943744659424 + }, + { + "auxiliary_loss_clip": 0.01117815, + "auxiliary_loss_mlp": 0.01030853, + "balance_loss_clip": 1.04257059, + "balance_loss_mlp": 1.01989186, + "epoch": 0.6611152863369908, + "flos": 28379889609120.0, + "grad_norm": 2.335785296026084, + "language_loss": 0.74619257, + "learning_rate": 1.088359933123053e-06, + "loss": 0.76767921, + "num_input_tokens_seen": 237349455, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.10961914, + "step": 10996, + "time_per_iteration": 4.1829822063446045 + }, + { + "auxiliary_loss_clip": 0.01117581, + "auxiliary_loss_mlp": 0.0103497, + "balance_loss_clip": 1.04342484, + "balance_loss_mlp": 1.02339506, + "epoch": 0.6611754095896588, + "flos": 27039809016000.0, + "grad_norm": 2.004814576737352, + "language_loss": 0.6887241, + "learning_rate": 1.088013301487126e-06, + "loss": 0.71024966, + "num_input_tokens_seen": 237367100, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11578369, + "step": 10997, + "time_per_iteration": 4.015078544616699 + }, + { + "auxiliary_loss_clip": 0.0112133, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.04287767, + "balance_loss_mlp": 1.01770627, + "epoch": 0.6612355328423267, + "flos": 17071590106080.0, + "grad_norm": 2.9251872042800016, + "language_loss": 0.68801612, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.70951951, + "num_input_tokens_seen": 237384840, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.11291504, + "step": 10998, + "time_per_iteration": 2.6611692905426025 + }, + { + "auxiliary_loss_clip": 0.01036608, + "auxiliary_loss_mlp": 0.01003892, + "balance_loss_clip": 1.01304734, + "balance_loss_mlp": 1.00287247, + "epoch": 0.6612956560949947, + "flos": 74986108763040.0, + "grad_norm": 0.6543643153799212, + "language_loss": 0.51050389, + "learning_rate": 1.087320141976297e-06, + "loss": 0.53090894, + "num_input_tokens_seen": 237443355, + "router_z_loss_clip": 0.23547363, + "router_z_loss_mlp": 0.01019287, + "step": 10999, + "time_per_iteration": 3.241877794265747 + }, + { + "auxiliary_loss_clip": 0.01119781, + "auxiliary_loss_mlp": 0.01034174, + "balance_loss_clip": 1.04236019, + "balance_loss_mlp": 1.02261114, + "epoch": 0.6613557793476627, + "flos": 26376717432960.0, + "grad_norm": 2.7603473866673807, + "language_loss": 0.70917583, + "learning_rate": 1.086973614127679e-06, + "loss": 0.73071539, + "num_input_tokens_seen": 237459205, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.11566162, + "step": 11000, + "time_per_iteration": 2.678215742111206 + }, + { + "auxiliary_loss_clip": 0.01113341, + "auxiliary_loss_mlp": 0.01032883, + "balance_loss_clip": 1.04087782, + "balance_loss_mlp": 1.02203536, + "epoch": 0.6614159026003307, + "flos": 41512543416000.0, + "grad_norm": 1.7226694325304535, + "language_loss": 0.65454423, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.67600644, + "num_input_tokens_seen": 237483580, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10852051, + "step": 11001, + "time_per_iteration": 2.7475051879882812 + }, + { + "auxiliary_loss_clip": 0.0111526, + "auxiliary_loss_mlp": 0.01028513, + "balance_loss_clip": 1.04127312, + "balance_loss_mlp": 1.01681304, + "epoch": 0.6614760258529987, + "flos": 29403817801920.0, + "grad_norm": 1.796321731889505, + "language_loss": 0.7274459, + "learning_rate": 1.086280662309739e-06, + "loss": 0.74888361, + "num_input_tokens_seen": 237502860, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11700439, + "step": 11002, + "time_per_iteration": 2.6751558780670166 + }, + { + "auxiliary_loss_clip": 0.0111501, + "auxiliary_loss_mlp": 0.01032942, + "balance_loss_clip": 1.04159951, + "balance_loss_mlp": 1.0212059, + "epoch": 0.6615361491056666, + "flos": 18185129821440.0, + "grad_norm": 2.1597764407894045, + "language_loss": 0.78981555, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.81129515, + "num_input_tokens_seen": 237521030, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11724854, + "step": 11003, + "time_per_iteration": 2.6366446018218994 + }, + { + "auxiliary_loss_clip": 0.01119183, + "auxiliary_loss_mlp": 0.01034654, + "balance_loss_clip": 1.0431571, + "balance_loss_mlp": 1.02176726, + "epoch": 0.6615962723583346, + "flos": 18674589744000.0, + "grad_norm": 2.220291252263821, + "language_loss": 0.68475056, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.70628893, + "num_input_tokens_seen": 237539585, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12884521, + "step": 11004, + "time_per_iteration": 2.650607109069824 + }, + { + "auxiliary_loss_clip": 0.01119631, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.04171693, + "balance_loss_mlp": 1.0222559, + "epoch": 0.6616563956110025, + "flos": 22857523518240.0, + "grad_norm": 4.004931640183714, + "language_loss": 0.6953963, + "learning_rate": 1.085241494478132e-06, + "loss": 0.71694291, + "num_input_tokens_seen": 237557655, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12786865, + "step": 11005, + "time_per_iteration": 2.6225993633270264 + }, + { + "auxiliary_loss_clip": 0.01116173, + "auxiliary_loss_mlp": 0.01029908, + "balance_loss_clip": 1.04184186, + "balance_loss_mlp": 1.01851189, + "epoch": 0.6617165188636706, + "flos": 29889550134720.0, + "grad_norm": 1.717964323841668, + "language_loss": 0.78267479, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80413562, + "num_input_tokens_seen": 237577000, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11401367, + "step": 11006, + "time_per_iteration": 2.810224771499634 + }, + { + "auxiliary_loss_clip": 0.01116465, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.04189229, + "balance_loss_mlp": 1.02084589, + "epoch": 0.6617766421163385, + "flos": 27308116859040.0, + "grad_norm": 1.5749784630659789, + "language_loss": 0.76184797, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.78334188, + "num_input_tokens_seen": 237597960, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12091064, + "step": 11007, + "time_per_iteration": 2.6564390659332275 + }, + { + "auxiliary_loss_clip": 0.01118602, + "auxiliary_loss_mlp": 0.01029127, + "balance_loss_clip": 1.04406619, + "balance_loss_mlp": 1.01764107, + "epoch": 0.6618367653690065, + "flos": 25441711968960.0, + "grad_norm": 1.6855307170538927, + "language_loss": 0.78244191, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.8039192, + "num_input_tokens_seen": 237616385, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11486816, + "step": 11008, + "time_per_iteration": 2.6530189514160156 + }, + { + "auxiliary_loss_clip": 0.0112181, + "auxiliary_loss_mlp": 0.01032651, + "balance_loss_clip": 1.04295015, + "balance_loss_mlp": 1.01969278, + "epoch": 0.6618968886216744, + "flos": 21612402728640.0, + "grad_norm": 2.1750565190648357, + "language_loss": 0.81655741, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.83810204, + "num_input_tokens_seen": 237634930, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12969971, + "step": 11009, + "time_per_iteration": 2.621440887451172 + }, + { + "auxiliary_loss_clip": 0.01035472, + "auxiliary_loss_mlp": 0.01002818, + "balance_loss_clip": 1.01201367, + "balance_loss_mlp": 1.00177443, + "epoch": 0.6619570118743424, + "flos": 81797307438240.0, + "grad_norm": 0.9764625804759026, + "language_loss": 0.67328501, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.69366789, + "num_input_tokens_seen": 237693175, + "router_z_loss_clip": 0.23449707, + "router_z_loss_mlp": 0.01043701, + "step": 11010, + "time_per_iteration": 3.2135257720947266 + }, + { + "auxiliary_loss_clip": 0.01117191, + "auxiliary_loss_mlp": 0.01030107, + "balance_loss_clip": 1.04162669, + "balance_loss_mlp": 1.01788807, + "epoch": 0.6620171351270103, + "flos": 22769937859680.0, + "grad_norm": 1.5884362724968062, + "language_loss": 0.70934767, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.73082066, + "num_input_tokens_seen": 237713160, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12213135, + "step": 11011, + "time_per_iteration": 2.6333374977111816 + }, + { + "auxiliary_loss_clip": 0.01117339, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.0441308, + "balance_loss_mlp": 1.01992989, + "epoch": 0.6620772583796783, + "flos": 29493145703520.0, + "grad_norm": 1.6906455469372808, + "language_loss": 0.72832739, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.74980855, + "num_input_tokens_seen": 237733600, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10845947, + "step": 11012, + "time_per_iteration": 2.6873815059661865 + }, + { + "auxiliary_loss_clip": 0.01111739, + "auxiliary_loss_mlp": 0.01033583, + "balance_loss_clip": 1.04197669, + "balance_loss_mlp": 1.02291977, + "epoch": 0.6621373816323463, + "flos": 28825678254240.0, + "grad_norm": 2.31902492208415, + "language_loss": 0.79300314, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.81445634, + "num_input_tokens_seen": 237752135, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10662842, + "step": 11013, + "time_per_iteration": 2.7074358463287354 + }, + { + "auxiliary_loss_clip": 0.01117955, + "auxiliary_loss_mlp": 0.01026125, + "balance_loss_clip": 1.04347479, + "balance_loss_mlp": 1.01450229, + "epoch": 0.6621975048850143, + "flos": 22503048121440.0, + "grad_norm": 4.280783606052068, + "language_loss": 0.70684582, + "learning_rate": 1.082125865538971e-06, + "loss": 0.72828662, + "num_input_tokens_seen": 237770735, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.1161499, + "step": 11014, + "time_per_iteration": 2.618462085723877 + }, + { + "auxiliary_loss_clip": 0.01115203, + "auxiliary_loss_mlp": 0.01027821, + "balance_loss_clip": 1.0433799, + "balance_loss_mlp": 1.01742005, + "epoch": 0.6622576281376823, + "flos": 17160269731200.0, + "grad_norm": 2.0749097544028645, + "language_loss": 0.76711434, + "learning_rate": 1.081779858400137e-06, + "loss": 0.78854454, + "num_input_tokens_seen": 237789005, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10406494, + "step": 11015, + "time_per_iteration": 2.645413637161255 + }, + { + "auxiliary_loss_clip": 0.01115138, + "auxiliary_loss_mlp": 0.01026635, + "balance_loss_clip": 1.04155123, + "balance_loss_mlp": 1.01514304, + "epoch": 0.6623177513903502, + "flos": 20766319993440.0, + "grad_norm": 2.3399089721972213, + "language_loss": 0.82361096, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.8450287, + "num_input_tokens_seen": 237807740, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11486816, + "step": 11016, + "time_per_iteration": 2.587484359741211 + }, + { + "auxiliary_loss_clip": 0.01116121, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.03935814, + "balance_loss_mlp": 1.01791239, + "epoch": 0.6623778746430182, + "flos": 21074206868640.0, + "grad_norm": 2.1568866480270525, + "language_loss": 0.69492841, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.71638513, + "num_input_tokens_seen": 237826340, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11651611, + "step": 11017, + "time_per_iteration": 2.617372989654541 + }, + { + "auxiliary_loss_clip": 0.01114642, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.04093432, + "balance_loss_mlp": 1.02011812, + "epoch": 0.6624379978956861, + "flos": 59539553344800.0, + "grad_norm": 1.8472425862138746, + "language_loss": 0.77099276, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.79245907, + "num_input_tokens_seen": 237848305, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11871338, + "step": 11018, + "time_per_iteration": 2.855618953704834 + }, + { + "auxiliary_loss_clip": 0.01114528, + "auxiliary_loss_mlp": 0.01035284, + "balance_loss_clip": 1.04022932, + "balance_loss_mlp": 1.02347684, + "epoch": 0.6624981211483542, + "flos": 23125709809440.0, + "grad_norm": 2.7372210107019015, + "language_loss": 0.83372641, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.85522449, + "num_input_tokens_seen": 237867020, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11816406, + "step": 11019, + "time_per_iteration": 2.635023593902588 + }, + { + "auxiliary_loss_clip": 0.01113484, + "auxiliary_loss_mlp": 0.01028942, + "balance_loss_clip": 1.04174972, + "balance_loss_mlp": 1.01793909, + "epoch": 0.6625582444010221, + "flos": 28378066331520.0, + "grad_norm": 1.569334693268699, + "language_loss": 0.71675813, + "learning_rate": 1.080050345253328e-06, + "loss": 0.73818237, + "num_input_tokens_seen": 237886710, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10998535, + "step": 11020, + "time_per_iteration": 2.631714105606079 + }, + { + "auxiliary_loss_clip": 0.01119839, + "auxiliary_loss_mlp": 0.01029741, + "balance_loss_clip": 1.04144382, + "balance_loss_mlp": 1.01702142, + "epoch": 0.6626183676536901, + "flos": 26106302691360.0, + "grad_norm": 1.9254090650414317, + "language_loss": 0.72663295, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.74812877, + "num_input_tokens_seen": 237904795, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12731934, + "step": 11021, + "time_per_iteration": 2.6576130390167236 + }, + { + "auxiliary_loss_clip": 0.01118292, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.04338408, + "balance_loss_mlp": 1.01959062, + "epoch": 0.662678490906358, + "flos": 17779771071360.0, + "grad_norm": 2.167018446962411, + "language_loss": 0.83156401, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.85306227, + "num_input_tokens_seen": 237921320, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11950684, + "step": 11022, + "time_per_iteration": 2.6055729389190674 + }, + { + "auxiliary_loss_clip": 0.01121443, + "auxiliary_loss_mlp": 0.01032217, + "balance_loss_clip": 1.04077172, + "balance_loss_mlp": 1.0191102, + "epoch": 0.662738614159026, + "flos": 19514027645280.0, + "grad_norm": 2.6022865397124235, + "language_loss": 0.7313906, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.75292718, + "num_input_tokens_seen": 237933525, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13116455, + "step": 11023, + "time_per_iteration": 2.6142165660858154 + }, + { + "auxiliary_loss_clip": 0.01114474, + "auxiliary_loss_mlp": 0.01027811, + "balance_loss_clip": 1.03978264, + "balance_loss_mlp": 1.01637268, + "epoch": 0.6627987374116939, + "flos": 23838226123680.0, + "grad_norm": 1.8771406389122627, + "language_loss": 0.75018454, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.77160734, + "num_input_tokens_seen": 237953395, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11437988, + "step": 11024, + "time_per_iteration": 2.6008777618408203 + }, + { + "auxiliary_loss_clip": 0.01116894, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.04201221, + "balance_loss_mlp": 1.01650596, + "epoch": 0.662858860664362, + "flos": 19160524663200.0, + "grad_norm": 2.2930878824813274, + "language_loss": 0.69391787, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.7153765, + "num_input_tokens_seen": 237971445, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12463379, + "step": 11025, + "time_per_iteration": 4.0497047901153564 + }, + { + "auxiliary_loss_clip": 0.01118566, + "auxiliary_loss_mlp": 0.01032285, + "balance_loss_clip": 1.04396009, + "balance_loss_mlp": 1.02052498, + "epoch": 0.6629189839170299, + "flos": 24592509403200.0, + "grad_norm": 1.5344848182836759, + "language_loss": 0.78725088, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.80875939, + "num_input_tokens_seen": 237989965, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11755371, + "step": 11026, + "time_per_iteration": 4.193035125732422 + }, + { + "auxiliary_loss_clip": 0.01117075, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.04409063, + "balance_loss_mlp": 1.01651323, + "epoch": 0.6629791071696979, + "flos": 25521761413440.0, + "grad_norm": 1.616006800293124, + "language_loss": 0.76180398, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.78325069, + "num_input_tokens_seen": 238006820, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11090088, + "step": 11027, + "time_per_iteration": 2.7327258586883545 + }, + { + "auxiliary_loss_clip": 0.01118112, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.04305744, + "balance_loss_mlp": 1.02100945, + "epoch": 0.6630392304223659, + "flos": 25437741275520.0, + "grad_norm": 3.098311187872314, + "language_loss": 0.70682997, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.72834325, + "num_input_tokens_seen": 238022560, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12194824, + "step": 11028, + "time_per_iteration": 2.639160394668579 + }, + { + "auxiliary_loss_clip": 0.01116101, + "auxiliary_loss_mlp": 0.01033833, + "balance_loss_clip": 1.04179382, + "balance_loss_mlp": 1.02361059, + "epoch": 0.6630993536750338, + "flos": 25619192771040.0, + "grad_norm": 2.9226833456996157, + "language_loss": 0.7974447, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.8189441, + "num_input_tokens_seen": 238041895, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.10223389, + "step": 11029, + "time_per_iteration": 2.6213369369506836 + }, + { + "auxiliary_loss_clip": 0.0111714, + "auxiliary_loss_mlp": 0.01030669, + "balance_loss_clip": 1.04093695, + "balance_loss_mlp": 1.01823592, + "epoch": 0.6631594769277018, + "flos": 22279586556960.0, + "grad_norm": 1.9875144414235786, + "language_loss": 0.7592743, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.78075236, + "num_input_tokens_seen": 238060445, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12432861, + "step": 11030, + "time_per_iteration": 2.582068920135498 + }, + { + "auxiliary_loss_clip": 0.01122174, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.04313445, + "balance_loss_mlp": 1.0195322, + "epoch": 0.6632196001803697, + "flos": 21744510527520.0, + "grad_norm": 2.414302086340152, + "language_loss": 0.74664277, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.76817954, + "num_input_tokens_seen": 238077080, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.11962891, + "step": 11031, + "time_per_iteration": 2.689486265182495 + }, + { + "auxiliary_loss_clip": 0.01117074, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.04086339, + "balance_loss_mlp": 1.01870823, + "epoch": 0.6632797234330378, + "flos": 15460243908480.0, + "grad_norm": 2.8739403239676635, + "language_loss": 0.75181627, + "learning_rate": 1.075903075048228e-06, + "loss": 0.77329326, + "num_input_tokens_seen": 238091045, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11914062, + "step": 11032, + "time_per_iteration": 2.5469260215759277 + }, + { + "auxiliary_loss_clip": 0.01116839, + "auxiliary_loss_mlp": 0.01030117, + "balance_loss_clip": 1.04215384, + "balance_loss_mlp": 1.01888108, + "epoch": 0.6633398466857057, + "flos": 28778522490720.0, + "grad_norm": 2.4245557303289504, + "language_loss": 0.80382168, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.82529128, + "num_input_tokens_seen": 238110220, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11236572, + "step": 11033, + "time_per_iteration": 2.6741788387298584 + }, + { + "auxiliary_loss_clip": 0.01115393, + "auxiliary_loss_mlp": 0.01028954, + "balance_loss_clip": 1.04017687, + "balance_loss_mlp": 1.01691377, + "epoch": 0.6633999699383737, + "flos": 25174781713440.0, + "grad_norm": 1.878000748860832, + "language_loss": 0.80350381, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.82494724, + "num_input_tokens_seen": 238130400, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12042236, + "step": 11034, + "time_per_iteration": 2.6098859310150146 + }, + { + "auxiliary_loss_clip": 0.01113397, + "auxiliary_loss_mlp": 0.01026351, + "balance_loss_clip": 1.04107499, + "balance_loss_mlp": 1.01544356, + "epoch": 0.6634600931910416, + "flos": 26597099684160.0, + "grad_norm": 1.692633303742492, + "language_loss": 0.75770229, + "learning_rate": 1.074867045054166e-06, + "loss": 0.77909976, + "num_input_tokens_seen": 238148165, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10906982, + "step": 11035, + "time_per_iteration": 4.105140686035156 + }, + { + "auxiliary_loss_clip": 0.0111727, + "auxiliary_loss_mlp": 0.01025744, + "balance_loss_clip": 1.04054761, + "balance_loss_mlp": 1.01413274, + "epoch": 0.6635202164437096, + "flos": 22857847656480.0, + "grad_norm": 2.1684332155134594, + "language_loss": 0.83297217, + "learning_rate": 1.074521771867622e-06, + "loss": 0.85440224, + "num_input_tokens_seen": 238166360, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.11602783, + "step": 11036, + "time_per_iteration": 2.600832939147949 + }, + { + "auxiliary_loss_clip": 0.01035464, + "auxiliary_loss_mlp": 0.01001042, + "balance_loss_clip": 1.01211667, + "balance_loss_mlp": 1.00004625, + "epoch": 0.6635803396963775, + "flos": 73483133588640.0, + "grad_norm": 0.7849296207187254, + "language_loss": 0.52291965, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54328477, + "num_input_tokens_seen": 238227630, + "router_z_loss_clip": 0.23388672, + "router_z_loss_mlp": 0.00994873, + "step": 11037, + "time_per_iteration": 4.6303675174713135 + }, + { + "auxiliary_loss_clip": 0.011178, + "auxiliary_loss_mlp": 0.01035354, + "balance_loss_clip": 1.04298997, + "balance_loss_mlp": 1.02323604, + "epoch": 0.6636404629490456, + "flos": 35590490994240.0, + "grad_norm": 1.6469297286820175, + "language_loss": 0.79003251, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.81156403, + "num_input_tokens_seen": 238248435, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12121582, + "step": 11038, + "time_per_iteration": 2.7600820064544678 + }, + { + "auxiliary_loss_clip": 0.01116805, + "auxiliary_loss_mlp": 0.01037007, + "balance_loss_clip": 1.04258156, + "balance_loss_mlp": 1.02457929, + "epoch": 0.6637005862017135, + "flos": 47476889527680.0, + "grad_norm": 2.3484930051467865, + "language_loss": 0.64036667, + "learning_rate": 1.073486162925716e-06, + "loss": 0.66190481, + "num_input_tokens_seen": 238268755, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12408447, + "step": 11039, + "time_per_iteration": 2.904700517654419 + }, + { + "auxiliary_loss_clip": 0.01117964, + "auxiliary_loss_mlp": 0.01027107, + "balance_loss_clip": 1.04079688, + "balance_loss_mlp": 1.01591945, + "epoch": 0.6637607094543815, + "flos": 27800494025760.0, + "grad_norm": 1.639380160532237, + "language_loss": 0.64020741, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.66165811, + "num_input_tokens_seen": 238290120, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11187744, + "step": 11040, + "time_per_iteration": 2.665140390396118 + }, + { + "auxiliary_loss_clip": 0.01113178, + "auxiliary_loss_mlp": 0.01030458, + "balance_loss_clip": 1.0397315, + "balance_loss_mlp": 1.01952088, + "epoch": 0.6638208327070495, + "flos": 22102632479520.0, + "grad_norm": 2.4882852679314835, + "language_loss": 0.7134223, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.73485869, + "num_input_tokens_seen": 238309290, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.10943604, + "step": 11041, + "time_per_iteration": 2.7209033966064453 + }, + { + "auxiliary_loss_clip": 0.01114005, + "auxiliary_loss_mlp": 0.01034415, + "balance_loss_clip": 1.04026818, + "balance_loss_mlp": 1.02266097, + "epoch": 0.6638809559597174, + "flos": 35904901151520.0, + "grad_norm": 2.2752444647792007, + "language_loss": 0.61171007, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.63319427, + "num_input_tokens_seen": 238327280, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11749268, + "step": 11042, + "time_per_iteration": 2.6707794666290283 + }, + { + "auxiliary_loss_clip": 0.01118155, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.04086828, + "balance_loss_mlp": 1.01437759, + "epoch": 0.6639410792123854, + "flos": 34255069888320.0, + "grad_norm": 1.9849568689778212, + "language_loss": 0.6905697, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.71201658, + "num_input_tokens_seen": 238346330, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12158203, + "step": 11043, + "time_per_iteration": 2.6879498958587646 + }, + { + "auxiliary_loss_clip": 0.01111418, + "auxiliary_loss_mlp": 0.01027048, + "balance_loss_clip": 1.04133534, + "balance_loss_mlp": 1.01670098, + "epoch": 0.6640012024650533, + "flos": 31184338759200.0, + "grad_norm": 1.5771734932398995, + "language_loss": 0.84035903, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.86174369, + "num_input_tokens_seen": 238364650, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10345459, + "step": 11044, + "time_per_iteration": 2.645658016204834 + }, + { + "auxiliary_loss_clip": 0.01115315, + "auxiliary_loss_mlp": 0.01030963, + "balance_loss_clip": 1.04152513, + "balance_loss_mlp": 1.01909542, + "epoch": 0.6640613257177214, + "flos": 18140202508320.0, + "grad_norm": 3.10160814916164, + "language_loss": 0.69538414, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.71684694, + "num_input_tokens_seen": 238381630, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11865234, + "step": 11045, + "time_per_iteration": 2.602783203125 + }, + { + "auxiliary_loss_clip": 0.01117385, + "auxiliary_loss_mlp": 0.01027073, + "balance_loss_clip": 1.04200578, + "balance_loss_mlp": 1.01583731, + "epoch": 0.6641214489703893, + "flos": 28334678675040.0, + "grad_norm": 1.5520732245586806, + "language_loss": 0.64315689, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.66460145, + "num_input_tokens_seen": 238402595, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11236572, + "step": 11046, + "time_per_iteration": 2.6247870922088623 + }, + { + "auxiliary_loss_clip": 0.01114773, + "auxiliary_loss_mlp": 0.01024913, + "balance_loss_clip": 1.04109526, + "balance_loss_mlp": 1.01379681, + "epoch": 0.6641815722230573, + "flos": 46054936212480.0, + "grad_norm": 1.7359982337688016, + "language_loss": 0.71570736, + "learning_rate": 1.070726085914088e-06, + "loss": 0.73710424, + "num_input_tokens_seen": 238426860, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11108398, + "step": 11047, + "time_per_iteration": 2.7856993675231934 + }, + { + "auxiliary_loss_clip": 0.0111782, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.04361331, + "balance_loss_mlp": 1.02179468, + "epoch": 0.6642416954757252, + "flos": 21879414018720.0, + "grad_norm": 1.871904004507359, + "language_loss": 0.77127242, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79278618, + "num_input_tokens_seen": 238443990, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11767578, + "step": 11048, + "time_per_iteration": 2.615037202835083 + }, + { + "auxiliary_loss_clip": 0.01034294, + "auxiliary_loss_mlp": 0.01000418, + "balance_loss_clip": 1.01108801, + "balance_loss_mlp": 0.99934667, + "epoch": 0.6643018187283932, + "flos": 63445109755680.0, + "grad_norm": 0.7547287654315894, + "language_loss": 0.55008459, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57043171, + "num_input_tokens_seen": 238503045, + "router_z_loss_clip": 0.2322998, + "router_z_loss_mlp": 0.0107193, + "step": 11049, + "time_per_iteration": 3.270216703414917 + }, + { + "auxiliary_loss_clip": 0.01115109, + "auxiliary_loss_mlp": 0.01026358, + "balance_loss_clip": 1.04157877, + "balance_loss_mlp": 1.01567101, + "epoch": 0.6643619419810611, + "flos": 36883375306560.0, + "grad_norm": 1.6767592864416716, + "language_loss": 0.64445788, + "learning_rate": 1.069691638104648e-06, + "loss": 0.66587257, + "num_input_tokens_seen": 238527320, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.10687256, + "step": 11050, + "time_per_iteration": 2.720186948776245 + }, + { + "auxiliary_loss_clip": 0.01111157, + "auxiliary_loss_mlp": 0.01029437, + "balance_loss_clip": 1.03900838, + "balance_loss_mlp": 1.01855278, + "epoch": 0.6644220652337292, + "flos": 28023509900160.0, + "grad_norm": 5.324254240044988, + "language_loss": 0.79068589, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.81209183, + "num_input_tokens_seen": 238546030, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10882568, + "step": 11051, + "time_per_iteration": 2.64228892326355 + }, + { + "auxiliary_loss_clip": 0.01116446, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.04303086, + "balance_loss_mlp": 1.01919591, + "epoch": 0.6644821884863971, + "flos": 25797808056960.0, + "grad_norm": 1.8610492557124396, + "language_loss": 0.85153258, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.87299675, + "num_input_tokens_seen": 238564175, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10766602, + "step": 11052, + "time_per_iteration": 2.7714314460754395 + }, + { + "auxiliary_loss_clip": 0.01118717, + "auxiliary_loss_mlp": 0.01034018, + "balance_loss_clip": 1.04231882, + "balance_loss_mlp": 1.02151322, + "epoch": 0.6645423117390651, + "flos": 24639341028480.0, + "grad_norm": 2.644331783484058, + "language_loss": 0.7445991, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.76612651, + "num_input_tokens_seen": 238581010, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12506104, + "step": 11053, + "time_per_iteration": 2.6175742149353027 + }, + { + "auxiliary_loss_clip": 0.01113397, + "auxiliary_loss_mlp": 0.01027553, + "balance_loss_clip": 1.04099762, + "balance_loss_mlp": 1.01697898, + "epoch": 0.6646024349917331, + "flos": 29714662438560.0, + "grad_norm": 1.6867050087511442, + "language_loss": 0.79863119, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.8200407, + "num_input_tokens_seen": 238601365, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10577393, + "step": 11054, + "time_per_iteration": 2.6651053428649902 + }, + { + "auxiliary_loss_clip": 0.01113185, + "auxiliary_loss_mlp": 0.01029097, + "balance_loss_clip": 1.04047763, + "balance_loss_mlp": 1.01885653, + "epoch": 0.664662558244401, + "flos": 22947864351840.0, + "grad_norm": 1.7723198746373192, + "language_loss": 0.74191427, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76333714, + "num_input_tokens_seen": 238619850, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10229492, + "step": 11055, + "time_per_iteration": 2.6190810203552246 + }, + { + "auxiliary_loss_clip": 0.01116823, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.04171836, + "balance_loss_mlp": 1.02768564, + "epoch": 0.664722681497069, + "flos": 23126114982240.0, + "grad_norm": 2.5390803466382255, + "language_loss": 0.72915101, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.75072038, + "num_input_tokens_seen": 238637635, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12426758, + "step": 11056, + "time_per_iteration": 2.6382577419281006 + }, + { + "auxiliary_loss_clip": 0.01113716, + "auxiliary_loss_mlp": 0.01029743, + "balance_loss_clip": 1.04063904, + "balance_loss_mlp": 1.01835871, + "epoch": 0.6647828047497369, + "flos": 23881006020960.0, + "grad_norm": 1.9619198029710292, + "language_loss": 0.70055068, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.72198522, + "num_input_tokens_seen": 238656200, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11376953, + "step": 11057, + "time_per_iteration": 2.609318971633911 + }, + { + "auxiliary_loss_clip": 0.01115877, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.04187775, + "balance_loss_mlp": 1.01791942, + "epoch": 0.664842928002405, + "flos": 28247133533760.0, + "grad_norm": 2.565400705093193, + "language_loss": 0.8038162, + "learning_rate": 1.066934663776291e-06, + "loss": 0.82527232, + "num_input_tokens_seen": 238675005, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11816406, + "step": 11058, + "time_per_iteration": 2.644052743911743 + }, + { + "auxiliary_loss_clip": 0.01034452, + "auxiliary_loss_mlp": 0.01000155, + "balance_loss_clip": 1.01127028, + "balance_loss_mlp": 0.99907529, + "epoch": 0.6649030512550729, + "flos": 79612724283840.0, + "grad_norm": 0.8040380918104872, + "language_loss": 0.62601268, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64635873, + "num_input_tokens_seen": 238731425, + "router_z_loss_clip": 0.23181152, + "router_z_loss_mlp": 0.01081085, + "step": 11059, + "time_per_iteration": 3.128100633621216 + }, + { + "auxiliary_loss_clip": 0.01113714, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.04093528, + "balance_loss_mlp": 1.0226438, + "epoch": 0.6649631745077409, + "flos": 24641569478880.0, + "grad_norm": 1.579095779034342, + "language_loss": 0.78649902, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.80796981, + "num_input_tokens_seen": 238752020, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.1071167, + "step": 11060, + "time_per_iteration": 2.6043989658355713 + }, + { + "auxiliary_loss_clip": 0.01118426, + "auxiliary_loss_mlp": 0.01033279, + "balance_loss_clip": 1.04355466, + "balance_loss_mlp": 1.02153111, + "epoch": 0.6650232977604088, + "flos": 21034222663680.0, + "grad_norm": 1.9485612405178332, + "language_loss": 0.78934687, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.81086397, + "num_input_tokens_seen": 238769665, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11743164, + "step": 11061, + "time_per_iteration": 2.662640333175659 + }, + { + "auxiliary_loss_clip": 0.01114492, + "auxiliary_loss_mlp": 0.01025932, + "balance_loss_clip": 1.04177547, + "balance_loss_mlp": 1.01508427, + "epoch": 0.6650834210130768, + "flos": 12210249216960.0, + "grad_norm": 2.0054751920958895, + "language_loss": 0.5660342, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.58743846, + "num_input_tokens_seen": 238782180, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.10845947, + "step": 11062, + "time_per_iteration": 2.5436384677886963 + }, + { + "auxiliary_loss_clip": 0.01117672, + "auxiliary_loss_mlp": 0.01025581, + "balance_loss_clip": 1.04037702, + "balance_loss_mlp": 1.01343322, + "epoch": 0.6651435442657447, + "flos": 12752861460480.0, + "grad_norm": 1.8942818048374246, + "language_loss": 0.76023072, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.78166324, + "num_input_tokens_seen": 238800315, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12133789, + "step": 11063, + "time_per_iteration": 2.639509439468384 + }, + { + "auxiliary_loss_clip": 0.01118172, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.043926, + "balance_loss_mlp": 1.02245593, + "epoch": 0.6652036675184128, + "flos": 27264769719840.0, + "grad_norm": 1.3736313528436597, + "language_loss": 0.70366716, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.72518337, + "num_input_tokens_seen": 238822250, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11004639, + "step": 11064, + "time_per_iteration": 4.22507905960083 + }, + { + "auxiliary_loss_clip": 0.01034308, + "auxiliary_loss_mlp": 0.01001334, + "balance_loss_clip": 1.01116097, + "balance_loss_mlp": 1.00022507, + "epoch": 0.6652637907710807, + "flos": 64560553783200.0, + "grad_norm": 0.8419416212238006, + "language_loss": 0.63077748, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65113389, + "num_input_tokens_seen": 238877190, + "router_z_loss_clip": 0.23181152, + "router_z_loss_mlp": 0.0111084, + "step": 11065, + "time_per_iteration": 3.2049405574798584 + }, + { + "auxiliary_loss_clip": 0.01114993, + "auxiliary_loss_mlp": 0.01029127, + "balance_loss_clip": 1.04121685, + "balance_loss_mlp": 1.01798058, + "epoch": 0.6653239140237487, + "flos": 28193211384480.0, + "grad_norm": 2.48940030705564, + "language_loss": 0.62609375, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.64753497, + "num_input_tokens_seen": 238896010, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.1114502, + "step": 11066, + "time_per_iteration": 3.8936049938201904 + }, + { + "auxiliary_loss_clip": 0.01115696, + "auxiliary_loss_mlp": 0.01026833, + "balance_loss_clip": 1.04028296, + "balance_loss_mlp": 1.01465011, + "epoch": 0.6653840372764167, + "flos": 31675986614880.0, + "grad_norm": 1.56145740134215, + "language_loss": 0.69879168, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72021693, + "num_input_tokens_seen": 238918990, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.1217041, + "step": 11067, + "time_per_iteration": 2.669098138809204 + }, + { + "auxiliary_loss_clip": 0.01034118, + "auxiliary_loss_mlp": 0.0100119, + "balance_loss_clip": 1.01077986, + "balance_loss_mlp": 1.00018501, + "epoch": 0.6654441605290846, + "flos": 80585890675200.0, + "grad_norm": 0.9227799519160401, + "language_loss": 0.72074789, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74110097, + "num_input_tokens_seen": 238975735, + "router_z_loss_clip": 0.23364258, + "router_z_loss_mlp": 0.01004791, + "step": 11068, + "time_per_iteration": 3.2184455394744873 + }, + { + "auxiliary_loss_clip": 0.01034173, + "auxiliary_loss_mlp": 0.01000434, + "balance_loss_clip": 1.01083004, + "balance_loss_mlp": 0.99941057, + "epoch": 0.6655042837817526, + "flos": 79550948132640.0, + "grad_norm": 0.7852076360609419, + "language_loss": 0.5786683, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.59901434, + "num_input_tokens_seen": 239042360, + "router_z_loss_clip": 0.23364258, + "router_z_loss_mlp": 0.01023102, + "step": 11069, + "time_per_iteration": 3.360135078430176 + }, + { + "auxiliary_loss_clip": 0.01034068, + "auxiliary_loss_mlp": 0.01000063, + "balance_loss_clip": 1.01083505, + "balance_loss_mlp": 0.9991163, + "epoch": 0.6655644070344205, + "flos": 84205109053440.0, + "grad_norm": 0.7516692602187284, + "language_loss": 0.63522196, + "learning_rate": 1.062803450204029e-06, + "loss": 0.65556324, + "num_input_tokens_seen": 239109410, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.00944519, + "step": 11070, + "time_per_iteration": 3.2960383892059326 + }, + { + "auxiliary_loss_clip": 0.0111302, + "auxiliary_loss_mlp": 0.01025484, + "balance_loss_clip": 1.03898489, + "balance_loss_mlp": 1.01426601, + "epoch": 0.6656245302870886, + "flos": 44313913252800.0, + "grad_norm": 1.7044575225407241, + "language_loss": 0.59294128, + "learning_rate": 1.062459413096116e-06, + "loss": 0.61432636, + "num_input_tokens_seen": 239135345, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11212158, + "step": 11071, + "time_per_iteration": 2.8031771183013916 + }, + { + "auxiliary_loss_clip": 0.01118645, + "auxiliary_loss_mlp": 0.01025377, + "balance_loss_clip": 1.04424429, + "balance_loss_mlp": 1.01515484, + "epoch": 0.6656846535397565, + "flos": 26594547095520.0, + "grad_norm": 2.0654362895035234, + "language_loss": 0.72482085, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.74626106, + "num_input_tokens_seen": 239154340, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.10217285, + "step": 11072, + "time_per_iteration": 2.6929001808166504 + }, + { + "auxiliary_loss_clip": 0.01116293, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.04348075, + "balance_loss_mlp": 1.01856828, + "epoch": 0.6657447767924245, + "flos": 45743726920320.0, + "grad_norm": 1.812212008011444, + "language_loss": 0.70957422, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.73104453, + "num_input_tokens_seen": 239177815, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12164307, + "step": 11073, + "time_per_iteration": 2.804222345352173 + }, + { + "auxiliary_loss_clip": 0.01117675, + "auxiliary_loss_mlp": 0.01029537, + "balance_loss_clip": 1.04128039, + "balance_loss_mlp": 1.01831377, + "epoch": 0.6658049000450924, + "flos": 20545005844800.0, + "grad_norm": 3.8779658786273825, + "language_loss": 0.55750132, + "learning_rate": 1.061427515134354e-06, + "loss": 0.57897341, + "num_input_tokens_seen": 239195735, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11218262, + "step": 11074, + "time_per_iteration": 2.6059741973876953 + }, + { + "auxiliary_loss_clip": 0.01117011, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.0433588, + "balance_loss_mlp": 1.01788259, + "epoch": 0.6658650232977604, + "flos": 40891097246400.0, + "grad_norm": 1.5334009452761999, + "language_loss": 0.72323251, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74468976, + "num_input_tokens_seen": 239217535, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.10827637, + "step": 11075, + "time_per_iteration": 4.134704351425171 + }, + { + "auxiliary_loss_clip": 0.01112169, + "auxiliary_loss_mlp": 0.01031361, + "balance_loss_clip": 1.04041839, + "balance_loss_mlp": 1.02087057, + "epoch": 0.6659251465504283, + "flos": 46010616658560.0, + "grad_norm": 2.1451605388926964, + "language_loss": 0.66101587, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.68245113, + "num_input_tokens_seen": 239241975, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.1048584, + "step": 11076, + "time_per_iteration": 4.1381995677948 + }, + { + "auxiliary_loss_clip": 0.01114527, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.04140019, + "balance_loss_mlp": 1.02216411, + "epoch": 0.6659852698030964, + "flos": 30375930744000.0, + "grad_norm": 1.7304768300124382, + "language_loss": 0.75386798, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.77535427, + "num_input_tokens_seen": 239262025, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11938477, + "step": 11077, + "time_per_iteration": 2.6986114978790283 + }, + { + "auxiliary_loss_clip": 0.01113809, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.03969216, + "balance_loss_mlp": 1.01996589, + "epoch": 0.6660453930557643, + "flos": 29715634853280.0, + "grad_norm": 2.4208199960743935, + "language_loss": 0.66628861, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.68773693, + "num_input_tokens_seen": 239282775, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11053467, + "step": 11078, + "time_per_iteration": 2.622459888458252 + }, + { + "auxiliary_loss_clip": 0.01116662, + "auxiliary_loss_mlp": 0.01032947, + "balance_loss_clip": 1.04069972, + "balance_loss_mlp": 1.02081752, + "epoch": 0.6661055163084323, + "flos": 12930706918080.0, + "grad_norm": 2.130140913206462, + "language_loss": 0.70225555, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.72375166, + "num_input_tokens_seen": 239299775, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12127686, + "step": 11079, + "time_per_iteration": 2.631436586380005 + }, + { + "auxiliary_loss_clip": 0.01113194, + "auxiliary_loss_mlp": 0.01021891, + "balance_loss_clip": 1.040622, + "balance_loss_mlp": 1.01135921, + "epoch": 0.6661656395611003, + "flos": 29355568071840.0, + "grad_norm": 1.652553949025578, + "language_loss": 0.80556035, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.82691121, + "num_input_tokens_seen": 239319660, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10534668, + "step": 11080, + "time_per_iteration": 2.6216142177581787 + }, + { + "auxiliary_loss_clip": 0.01112603, + "auxiliary_loss_mlp": 0.01026515, + "balance_loss_clip": 1.04225254, + "balance_loss_mlp": 1.01572645, + "epoch": 0.6662257628137682, + "flos": 28109272281120.0, + "grad_norm": 2.0521162986658243, + "language_loss": 0.78103745, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.80242866, + "num_input_tokens_seen": 239339215, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10797119, + "step": 11081, + "time_per_iteration": 2.674983024597168 + }, + { + "auxiliary_loss_clip": 0.01118867, + "auxiliary_loss_mlp": 0.01027274, + "balance_loss_clip": 1.04344356, + "balance_loss_mlp": 1.01544833, + "epoch": 0.6662858860664362, + "flos": 30205702535040.0, + "grad_norm": 3.198702167088906, + "language_loss": 0.79953331, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.82099473, + "num_input_tokens_seen": 239358545, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11816406, + "step": 11082, + "time_per_iteration": 2.684290647506714 + }, + { + "auxiliary_loss_clip": 0.01114585, + "auxiliary_loss_mlp": 0.01035432, + "balance_loss_clip": 1.04049516, + "balance_loss_mlp": 1.02478671, + "epoch": 0.6663460093191041, + "flos": 24415231187520.0, + "grad_norm": 1.8562691766010888, + "language_loss": 0.83933413, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.8608343, + "num_input_tokens_seen": 239376665, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.10650635, + "step": 11083, + "time_per_iteration": 2.673398733139038 + }, + { + "auxiliary_loss_clip": 0.01121889, + "auxiliary_loss_mlp": 0.01034323, + "balance_loss_clip": 1.04514146, + "balance_loss_mlp": 1.0225575, + "epoch": 0.6664061325717722, + "flos": 21745523459520.0, + "grad_norm": 3.469286723813715, + "language_loss": 0.85429341, + "learning_rate": 1.057990170638731e-06, + "loss": 0.87585557, + "num_input_tokens_seen": 239394345, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11773682, + "step": 11084, + "time_per_iteration": 2.706836223602295 + }, + { + "auxiliary_loss_clip": 0.01119517, + "auxiliary_loss_mlp": 0.01026873, + "balance_loss_clip": 1.04324949, + "balance_loss_mlp": 1.01443303, + "epoch": 0.6664662558244401, + "flos": 22057299993600.0, + "grad_norm": 2.430653006656931, + "language_loss": 0.73573935, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.75720328, + "num_input_tokens_seen": 239410605, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12426758, + "step": 11085, + "time_per_iteration": 2.6476569175720215 + }, + { + "auxiliary_loss_clip": 0.01116504, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.04274929, + "balance_loss_mlp": 1.01833701, + "epoch": 0.6665263790771081, + "flos": 26325469424160.0, + "grad_norm": 4.462447753812555, + "language_loss": 0.80925524, + "learning_rate": 1.057303129975894e-06, + "loss": 0.83072114, + "num_input_tokens_seen": 239427155, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11749268, + "step": 11086, + "time_per_iteration": 2.6426491737365723 + }, + { + "auxiliary_loss_clip": 0.01116155, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.04214549, + "balance_loss_mlp": 1.02183783, + "epoch": 0.666586502329776, + "flos": 29537262671040.0, + "grad_norm": 2.3506616535624243, + "language_loss": 0.7455799, + "learning_rate": 1.056959663258702e-06, + "loss": 0.76707917, + "num_input_tokens_seen": 239445510, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11938477, + "step": 11087, + "time_per_iteration": 2.6197357177734375 + }, + { + "auxiliary_loss_clip": 0.0111513, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.04140759, + "balance_loss_mlp": 1.02016234, + "epoch": 0.666646625582444, + "flos": 27089193229920.0, + "grad_norm": 6.24460209005094, + "language_loss": 0.64952534, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.67099148, + "num_input_tokens_seen": 239464805, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11334229, + "step": 11088, + "time_per_iteration": 2.633019208908081 + }, + { + "auxiliary_loss_clip": 0.01117044, + "auxiliary_loss_mlp": 0.01027982, + "balance_loss_clip": 1.0417515, + "balance_loss_mlp": 1.01643646, + "epoch": 0.6667067488351119, + "flos": 22280599488960.0, + "grad_norm": 2.3542091046804234, + "language_loss": 0.64074916, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.66219944, + "num_input_tokens_seen": 239483890, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11547852, + "step": 11089, + "time_per_iteration": 2.5828969478607178 + }, + { + "auxiliary_loss_clip": 0.0111531, + "auxiliary_loss_mlp": 0.01028278, + "balance_loss_clip": 1.04167461, + "balance_loss_mlp": 1.01689327, + "epoch": 0.66676687208778, + "flos": 21031953696000.0, + "grad_norm": 2.300356656337637, + "language_loss": 0.80966294, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.83109885, + "num_input_tokens_seen": 239500080, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.1137085, + "step": 11090, + "time_per_iteration": 2.6651368141174316 + }, + { + "auxiliary_loss_clip": 0.01119478, + "auxiliary_loss_mlp": 0.01031462, + "balance_loss_clip": 1.04341888, + "balance_loss_mlp": 1.02005923, + "epoch": 0.6668269953404479, + "flos": 24101914996800.0, + "grad_norm": 3.025719750032797, + "language_loss": 0.78019357, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.80170292, + "num_input_tokens_seen": 239517335, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11413574, + "step": 11091, + "time_per_iteration": 2.5943121910095215 + }, + { + "auxiliary_loss_clip": 0.01115914, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.04203057, + "balance_loss_mlp": 1.02017617, + "epoch": 0.6668871185931159, + "flos": 25085413294560.0, + "grad_norm": 1.951166580393909, + "language_loss": 0.79441226, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.81588662, + "num_input_tokens_seen": 239536240, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11346436, + "step": 11092, + "time_per_iteration": 2.6408064365386963 + }, + { + "auxiliary_loss_clip": 0.01033536, + "auxiliary_loss_mlp": 0.01006071, + "balance_loss_clip": 1.01030171, + "balance_loss_mlp": 1.00500047, + "epoch": 0.6669472418457839, + "flos": 70880023568160.0, + "grad_norm": 0.7602491221075982, + "language_loss": 0.57636809, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.59676415, + "num_input_tokens_seen": 239598000, + "router_z_loss_clip": 0.23254395, + "router_z_loss_mlp": 0.0107193, + "step": 11093, + "time_per_iteration": 3.2961177825927734 + }, + { + "auxiliary_loss_clip": 0.01114534, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.04157817, + "balance_loss_mlp": 1.02046847, + "epoch": 0.6670073650984518, + "flos": 31805339238720.0, + "grad_norm": 1.6742550902987743, + "language_loss": 0.7681399, + "learning_rate": 1.054556398252703e-06, + "loss": 0.78960127, + "num_input_tokens_seen": 239617650, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11132812, + "step": 11094, + "time_per_iteration": 2.7025325298309326 + }, + { + "auxiliary_loss_clip": 0.01115083, + "auxiliary_loss_mlp": 0.01031593, + "balance_loss_clip": 1.04141974, + "balance_loss_mlp": 1.01985717, + "epoch": 0.6670674883511198, + "flos": 39555919244160.0, + "grad_norm": 1.7286277946750763, + "language_loss": 0.73405159, + "learning_rate": 1.05421321798155e-06, + "loss": 0.75551832, + "num_input_tokens_seen": 239639825, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11749268, + "step": 11095, + "time_per_iteration": 2.732914924621582 + }, + { + "auxiliary_loss_clip": 0.01116373, + "auxiliary_loss_mlp": 0.01034675, + "balance_loss_clip": 1.04338169, + "balance_loss_mlp": 1.02362466, + "epoch": 0.6671276116037878, + "flos": 22009820091840.0, + "grad_norm": 2.153830117799027, + "language_loss": 0.73588109, + "learning_rate": 1.053870073574727e-06, + "loss": 0.75739157, + "num_input_tokens_seen": 239656300, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11053467, + "step": 11096, + "time_per_iteration": 2.6198818683624268 + }, + { + "auxiliary_loss_clip": 0.01115524, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.04314363, + "balance_loss_mlp": 1.01763105, + "epoch": 0.6671877348564558, + "flos": 29001011640480.0, + "grad_norm": 1.7593570708938877, + "language_loss": 0.64717615, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.66861856, + "num_input_tokens_seen": 239676655, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11083984, + "step": 11097, + "time_per_iteration": 2.6697089672088623 + }, + { + "auxiliary_loss_clip": 0.01117384, + "auxiliary_loss_mlp": 0.01030929, + "balance_loss_clip": 1.04184413, + "balance_loss_mlp": 1.01988435, + "epoch": 0.6672478581091237, + "flos": 25525083830400.0, + "grad_norm": 1.8162816762025116, + "language_loss": 0.75648004, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.77796316, + "num_input_tokens_seen": 239695430, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11035156, + "step": 11098, + "time_per_iteration": 2.6768131256103516 + }, + { + "auxiliary_loss_clip": 0.01119724, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.04429615, + "balance_loss_mlp": 1.02179539, + "epoch": 0.6673079813617917, + "flos": 33990206014080.0, + "grad_norm": 2.0664189579896726, + "language_loss": 0.74521637, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.76673704, + "num_input_tokens_seen": 239717070, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.10546875, + "step": 11099, + "time_per_iteration": 2.6822309494018555 + }, + { + "auxiliary_loss_clip": 0.01112954, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.04036117, + "balance_loss_mlp": 1.01967382, + "epoch": 0.6673681046144596, + "flos": 26376960536640.0, + "grad_norm": 2.169793080185186, + "language_loss": 0.78286338, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80430055, + "num_input_tokens_seen": 239737105, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11083984, + "step": 11100, + "time_per_iteration": 2.6014254093170166 + }, + { + "auxiliary_loss_clip": 0.01115569, + "auxiliary_loss_mlp": 0.01033288, + "balance_loss_clip": 1.04206359, + "balance_loss_mlp": 1.02220201, + "epoch": 0.6674282278671276, + "flos": 25489597043520.0, + "grad_norm": 2.3451259880180126, + "language_loss": 0.59779793, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.61928654, + "num_input_tokens_seen": 239757835, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11071777, + "step": 11101, + "time_per_iteration": 2.671022415161133 + }, + { + "auxiliary_loss_clip": 0.01121157, + "auxiliary_loss_mlp": 0.01031624, + "balance_loss_clip": 1.04304838, + "balance_loss_mlp": 1.01949465, + "epoch": 0.6674883511197955, + "flos": 28826407565280.0, + "grad_norm": 2.063143527343714, + "language_loss": 0.7169441, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.73847187, + "num_input_tokens_seen": 239775425, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12139893, + "step": 11102, + "time_per_iteration": 2.64231538772583 + }, + { + "auxiliary_loss_clip": 0.01115137, + "auxiliary_loss_mlp": 0.01028143, + "balance_loss_clip": 1.04072428, + "balance_loss_mlp": 1.01724124, + "epoch": 0.6675484743724636, + "flos": 23927675577120.0, + "grad_norm": 1.787962215923408, + "language_loss": 0.84727979, + "learning_rate": 1.051469068021034e-06, + "loss": 0.86871254, + "num_input_tokens_seen": 239794605, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.10913086, + "step": 11103, + "time_per_iteration": 2.721510648727417 + }, + { + "auxiliary_loss_clip": 0.01116933, + "auxiliary_loss_mlp": 0.01025832, + "balance_loss_clip": 1.04197657, + "balance_loss_mlp": 1.01491284, + "epoch": 0.6676085976251315, + "flos": 17472127299840.0, + "grad_norm": 1.9165602088720544, + "language_loss": 0.77597976, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.79740751, + "num_input_tokens_seen": 239812135, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.10925293, + "step": 11104, + "time_per_iteration": 4.1511805057525635 + }, + { + "auxiliary_loss_clip": 0.0112003, + "auxiliary_loss_mlp": 0.01027044, + "balance_loss_clip": 1.04348159, + "balance_loss_mlp": 1.01595187, + "epoch": 0.6676687208777995, + "flos": 46496916233280.0, + "grad_norm": 1.5133005573737883, + "language_loss": 0.57804942, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.59952009, + "num_input_tokens_seen": 239835845, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11096191, + "step": 11105, + "time_per_iteration": 4.151297092437744 + }, + { + "auxiliary_loss_clip": 0.01121368, + "auxiliary_loss_mlp": 0.01031242, + "balance_loss_clip": 1.04284811, + "balance_loss_mlp": 1.01886225, + "epoch": 0.6677288441304675, + "flos": 29263201374240.0, + "grad_norm": 1.8785225684456301, + "language_loss": 0.7309612, + "learning_rate": 1.0504406049066e-06, + "loss": 0.7524873, + "num_input_tokens_seen": 239853820, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12384033, + "step": 11106, + "time_per_iteration": 2.6828441619873047 + }, + { + "auxiliary_loss_clip": 0.01116375, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.04223931, + "balance_loss_mlp": 1.01638544, + "epoch": 0.6677889673831354, + "flos": 29493348289920.0, + "grad_norm": 1.7380346206852426, + "language_loss": 0.76396656, + "learning_rate": 1.0500978558659e-06, + "loss": 0.78540689, + "num_input_tokens_seen": 239873365, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11260986, + "step": 11107, + "time_per_iteration": 2.6581599712371826 + }, + { + "auxiliary_loss_clip": 0.01111791, + "auxiliary_loss_mlp": 0.01027599, + "balance_loss_clip": 1.04039693, + "balance_loss_mlp": 1.01616096, + "epoch": 0.6678490906358034, + "flos": 27222394995360.0, + "grad_norm": 2.2706396601513976, + "language_loss": 0.89692199, + "learning_rate": 1.049755142845583e-06, + "loss": 0.91831589, + "num_input_tokens_seen": 239891215, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11425781, + "step": 11108, + "time_per_iteration": 2.6025278568267822 + }, + { + "auxiliary_loss_clip": 0.01114158, + "auxiliary_loss_mlp": 0.01022787, + "balance_loss_clip": 1.04101074, + "balance_loss_mlp": 1.0126121, + "epoch": 0.6679092138884714, + "flos": 45025619221440.0, + "grad_norm": 1.4211914693020293, + "language_loss": 0.82798111, + "learning_rate": 1.049412465858646e-06, + "loss": 0.84935051, + "num_input_tokens_seen": 239913490, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.10174561, + "step": 11109, + "time_per_iteration": 2.790868043899536 + }, + { + "auxiliary_loss_clip": 0.01115141, + "auxiliary_loss_mlp": 0.01026393, + "balance_loss_clip": 1.04104054, + "balance_loss_mlp": 1.01439452, + "epoch": 0.6679693371411394, + "flos": 22146830481600.0, + "grad_norm": 17.074378264404235, + "language_loss": 0.6942898, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.71570516, + "num_input_tokens_seen": 239931565, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11993408, + "step": 11110, + "time_per_iteration": 2.595590114593506 + }, + { + "auxiliary_loss_clip": 0.01117381, + "auxiliary_loss_mlp": 0.01029667, + "balance_loss_clip": 1.04130495, + "balance_loss_mlp": 1.01730454, + "epoch": 0.6680294603938073, + "flos": 33187794556320.0, + "grad_norm": 1.4920535696309072, + "language_loss": 0.73307228, + "learning_rate": 1.04872722003689e-06, + "loss": 0.75454283, + "num_input_tokens_seen": 239952395, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12359619, + "step": 11111, + "time_per_iteration": 2.6742379665374756 + }, + { + "auxiliary_loss_clip": 0.01112109, + "auxiliary_loss_mlp": 0.01024581, + "balance_loss_clip": 1.039891, + "balance_loss_mlp": 1.01348317, + "epoch": 0.6680895836464753, + "flos": 26509392473760.0, + "grad_norm": 1.9531618278019376, + "language_loss": 0.64866179, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.67002869, + "num_input_tokens_seen": 239968910, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11114502, + "step": 11112, + "time_per_iteration": 2.606269121170044 + }, + { + "auxiliary_loss_clip": 0.01115068, + "auxiliary_loss_mlp": 0.01029004, + "balance_loss_clip": 1.0412569, + "balance_loss_mlp": 1.01748252, + "epoch": 0.6681497068991432, + "flos": 23972562372960.0, + "grad_norm": 2.5353024657628813, + "language_loss": 0.63171297, + "learning_rate": 1.048042118504569e-06, + "loss": 0.65315372, + "num_input_tokens_seen": 239987680, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11523438, + "step": 11113, + "time_per_iteration": 2.627105712890625 + }, + { + "auxiliary_loss_clip": 0.01115008, + "auxiliary_loss_mlp": 0.01031773, + "balance_loss_clip": 1.04248321, + "balance_loss_mlp": 1.02032304, + "epoch": 0.6682098301518112, + "flos": 21254766984000.0, + "grad_norm": 2.0481825375897467, + "language_loss": 0.66328931, + "learning_rate": 1.047699621879422e-06, + "loss": 0.68475711, + "num_input_tokens_seen": 240005790, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11462402, + "step": 11114, + "time_per_iteration": 4.106088399887085 + }, + { + "auxiliary_loss_clip": 0.01114844, + "auxiliary_loss_mlp": 0.01031941, + "balance_loss_clip": 1.0404942, + "balance_loss_mlp": 1.02027023, + "epoch": 0.6682699534044791, + "flos": 27575735908320.0, + "grad_norm": 1.661503896945599, + "language_loss": 0.78134215, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.80281007, + "num_input_tokens_seen": 240025895, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11682129, + "step": 11115, + "time_per_iteration": 2.6584396362304688 + }, + { + "auxiliary_loss_clip": 0.01113736, + "auxiliary_loss_mlp": 0.01028321, + "balance_loss_clip": 1.03892207, + "balance_loss_mlp": 1.01670361, + "epoch": 0.6683300766571472, + "flos": 30339025852320.0, + "grad_norm": 1.8791428434864796, + "language_loss": 0.79808229, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.81950283, + "num_input_tokens_seen": 240044880, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.1161499, + "step": 11116, + "time_per_iteration": 4.062463760375977 + }, + { + "auxiliary_loss_clip": 0.01116202, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.04046631, + "balance_loss_mlp": 1.01854134, + "epoch": 0.6683901999098151, + "flos": 33100370966880.0, + "grad_norm": 2.696942843414755, + "language_loss": 0.78915012, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.81062013, + "num_input_tokens_seen": 240065785, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12268066, + "step": 11117, + "time_per_iteration": 2.7402055263519287 + }, + { + "auxiliary_loss_clip": 0.01117347, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.04237163, + "balance_loss_mlp": 1.01864147, + "epoch": 0.6684503231624831, + "flos": 25305957614880.0, + "grad_norm": 2.442060248999044, + "language_loss": 0.6581524, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.67963862, + "num_input_tokens_seen": 240085130, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12634277, + "step": 11118, + "time_per_iteration": 2.6393749713897705 + }, + { + "auxiliary_loss_clip": 0.01114151, + "auxiliary_loss_mlp": 0.01028347, + "balance_loss_clip": 1.04098642, + "balance_loss_mlp": 1.01774311, + "epoch": 0.668510446415151, + "flos": 26554360304160.0, + "grad_norm": 2.210400812083563, + "language_loss": 0.69158411, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.71300906, + "num_input_tokens_seen": 240105495, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.1060791, + "step": 11119, + "time_per_iteration": 2.615647554397583 + }, + { + "auxiliary_loss_clip": 0.01115127, + "auxiliary_loss_mlp": 0.01027582, + "balance_loss_clip": 1.04135501, + "balance_loss_mlp": 1.01638782, + "epoch": 0.668570569667819, + "flos": 36840068684640.0, + "grad_norm": 3.7035909380773355, + "language_loss": 0.67153227, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.69295943, + "num_input_tokens_seen": 240125455, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11193848, + "step": 11120, + "time_per_iteration": 2.6996686458587646 + }, + { + "auxiliary_loss_clip": 0.01115557, + "auxiliary_loss_mlp": 0.01029293, + "balance_loss_clip": 1.04178023, + "balance_loss_mlp": 1.01767588, + "epoch": 0.668630692920487, + "flos": 29493307772640.0, + "grad_norm": 1.688159537632396, + "language_loss": 0.71930081, + "learning_rate": 1.045303157347638e-06, + "loss": 0.7407493, + "num_input_tokens_seen": 240143870, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11627197, + "step": 11121, + "time_per_iteration": 2.6554908752441406 + }, + { + "auxiliary_loss_clip": 0.0111745, + "auxiliary_loss_mlp": 0.01035576, + "balance_loss_clip": 1.04177737, + "balance_loss_mlp": 1.02380395, + "epoch": 0.668690816173155, + "flos": 21300990850080.0, + "grad_norm": 3.843670526320748, + "language_loss": 0.70019186, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.72172219, + "num_input_tokens_seen": 240161020, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11761475, + "step": 11122, + "time_per_iteration": 2.5818448066711426 + }, + { + "auxiliary_loss_clip": 0.01115233, + "auxiliary_loss_mlp": 0.01034572, + "balance_loss_clip": 1.04113483, + "balance_loss_mlp": 1.02319336, + "epoch": 0.668750939425823, + "flos": 30512009236320.0, + "grad_norm": 12.364248008566834, + "language_loss": 0.71469605, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.73619413, + "num_input_tokens_seen": 240179820, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.1138916, + "step": 11123, + "time_per_iteration": 2.6700918674468994 + }, + { + "auxiliary_loss_clip": 0.01119893, + "auxiliary_loss_mlp": 0.01035448, + "balance_loss_clip": 1.04411197, + "balance_loss_mlp": 1.02351487, + "epoch": 0.6688110626784909, + "flos": 29403250560000.0, + "grad_norm": 1.6014676924112212, + "language_loss": 0.7912643, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.81281769, + "num_input_tokens_seen": 240200130, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.1192627, + "step": 11124, + "time_per_iteration": 2.637176513671875 + }, + { + "auxiliary_loss_clip": 0.01116692, + "auxiliary_loss_mlp": 0.01036603, + "balance_loss_clip": 1.04221511, + "balance_loss_mlp": 1.02512944, + "epoch": 0.6688711859311589, + "flos": 26551362025440.0, + "grad_norm": 1.6406470760694927, + "language_loss": 0.74515128, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.76668417, + "num_input_tokens_seen": 240217945, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11474609, + "step": 11125, + "time_per_iteration": 2.744235038757324 + }, + { + "auxiliary_loss_clip": 0.01118583, + "auxiliary_loss_mlp": 0.01032257, + "balance_loss_clip": 1.04313469, + "balance_loss_mlp": 1.02105689, + "epoch": 0.6689313091838268, + "flos": 27978987759840.0, + "grad_norm": 5.187792871774873, + "language_loss": 0.65983057, + "learning_rate": 1.043592482774116e-06, + "loss": 0.68133903, + "num_input_tokens_seen": 240237220, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11187744, + "step": 11126, + "time_per_iteration": 2.611361026763916 + }, + { + "auxiliary_loss_clip": 0.01113846, + "auxiliary_loss_mlp": 0.01027161, + "balance_loss_clip": 1.03929043, + "balance_loss_mlp": 1.01593757, + "epoch": 0.6689914324364948, + "flos": 25485383246400.0, + "grad_norm": 1.5672738885725996, + "language_loss": 0.71121228, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.73262239, + "num_input_tokens_seen": 240256000, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11224365, + "step": 11127, + "time_per_iteration": 2.677302360534668 + }, + { + "auxiliary_loss_clip": 0.01119579, + "auxiliary_loss_mlp": 0.01029455, + "balance_loss_clip": 1.04149377, + "balance_loss_mlp": 1.01673579, + "epoch": 0.6690515556891627, + "flos": 27757511542080.0, + "grad_norm": 2.0375156640964125, + "language_loss": 0.79926246, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.8207528, + "num_input_tokens_seen": 240275845, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.1272583, + "step": 11128, + "time_per_iteration": 2.619791269302368 + }, + { + "auxiliary_loss_clip": 0.01116645, + "auxiliary_loss_mlp": 0.01028795, + "balance_loss_clip": 1.04107773, + "balance_loss_mlp": 1.01684415, + "epoch": 0.6691116789418308, + "flos": 28469258028000.0, + "grad_norm": 1.8512624796627464, + "language_loss": 0.80877852, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.83023286, + "num_input_tokens_seen": 240294095, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.1194458, + "step": 11129, + "time_per_iteration": 2.7270724773406982 + }, + { + "auxiliary_loss_clip": 0.01110382, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.039101, + "balance_loss_mlp": 1.02303386, + "epoch": 0.6691718021944987, + "flos": 39593310343200.0, + "grad_norm": 1.7192991268830526, + "language_loss": 0.70425284, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.72569126, + "num_input_tokens_seen": 240313460, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10424805, + "step": 11130, + "time_per_iteration": 2.720263957977295 + }, + { + "auxiliary_loss_clip": 0.01114779, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.04285121, + "balance_loss_mlp": 1.02285767, + "epoch": 0.6692319254471667, + "flos": 28957664501280.0, + "grad_norm": 2.019954248034977, + "language_loss": 0.70252889, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.72401524, + "num_input_tokens_seen": 240333540, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11010742, + "step": 11131, + "time_per_iteration": 2.754228353500366 + }, + { + "auxiliary_loss_clip": 0.01117076, + "auxiliary_loss_mlp": 0.01032727, + "balance_loss_clip": 1.04124618, + "balance_loss_mlp": 1.02003694, + "epoch": 0.6692920486998346, + "flos": 17604559236960.0, + "grad_norm": 2.5238812134923196, + "language_loss": 0.65194762, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.67344564, + "num_input_tokens_seen": 240350085, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12689209, + "step": 11132, + "time_per_iteration": 2.5825672149658203 + }, + { + "auxiliary_loss_clip": 0.01117141, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.04114592, + "balance_loss_mlp": 1.01834881, + "epoch": 0.6693521719525026, + "flos": 26243475150240.0, + "grad_norm": 3.1301544686351988, + "language_loss": 0.74595535, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.76743436, + "num_input_tokens_seen": 240370015, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12420654, + "step": 11133, + "time_per_iteration": 2.6433424949645996 + }, + { + "auxiliary_loss_clip": 0.01121654, + "auxiliary_loss_mlp": 0.01029047, + "balance_loss_clip": 1.04353952, + "balance_loss_mlp": 1.01662564, + "epoch": 0.6694122952051706, + "flos": 31002320021760.0, + "grad_norm": 1.9548178620660446, + "language_loss": 0.66187245, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.68337947, + "num_input_tokens_seen": 240390770, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12426758, + "step": 11134, + "time_per_iteration": 2.6569151878356934 + }, + { + "auxiliary_loss_clip": 0.01124025, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.04404449, + "balance_loss_mlp": 1.01909184, + "epoch": 0.6694724184578386, + "flos": 31313286210240.0, + "grad_norm": 2.1843113942516337, + "language_loss": 0.77273977, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.79429972, + "num_input_tokens_seen": 240409590, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12884521, + "step": 11135, + "time_per_iteration": 2.6714727878570557 + }, + { + "auxiliary_loss_clip": 0.01111914, + "auxiliary_loss_mlp": 0.01027859, + "balance_loss_clip": 1.03948534, + "balance_loss_mlp": 1.01612318, + "epoch": 0.6695325417105066, + "flos": 21608999277120.0, + "grad_norm": 1.7694007191988075, + "language_loss": 0.74274659, + "learning_rate": 1.040173855277898e-06, + "loss": 0.7641443, + "num_input_tokens_seen": 240428180, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11737061, + "step": 11136, + "time_per_iteration": 2.6577374935150146 + }, + { + "auxiliary_loss_clip": 0.01121565, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.04336572, + "balance_loss_mlp": 1.01929724, + "epoch": 0.6695926649631745, + "flos": 29846689202880.0, + "grad_norm": 1.833475178729137, + "language_loss": 0.6214779, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.64301062, + "num_input_tokens_seen": 240447815, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12408447, + "step": 11137, + "time_per_iteration": 2.63664174079895 + }, + { + "auxiliary_loss_clip": 0.01115127, + "auxiliary_loss_mlp": 0.01029243, + "balance_loss_clip": 1.04074264, + "balance_loss_mlp": 1.01739979, + "epoch": 0.6696527882158425, + "flos": 29626914710880.0, + "grad_norm": 1.9624282790122938, + "language_loss": 0.65866029, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.68010402, + "num_input_tokens_seen": 240468635, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11846924, + "step": 11138, + "time_per_iteration": 2.670001983642578 + }, + { + "auxiliary_loss_clip": 0.01111076, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.03923535, + "balance_loss_mlp": 1.02037978, + "epoch": 0.6697129114685104, + "flos": 28068194109600.0, + "grad_norm": 1.759410301883216, + "language_loss": 0.73011231, + "learning_rate": 1.039148976175053e-06, + "loss": 0.7515378, + "num_input_tokens_seen": 240488550, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11083984, + "step": 11139, + "time_per_iteration": 2.6289267539978027 + }, + { + "auxiliary_loss_clip": 0.01113375, + "auxiliary_loss_mlp": 0.01028386, + "balance_loss_clip": 1.04183578, + "balance_loss_mlp": 1.01763964, + "epoch": 0.6697730347211784, + "flos": 27623215810080.0, + "grad_norm": 3.127907628068095, + "language_loss": 0.70866513, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.73008275, + "num_input_tokens_seen": 240508330, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10754395, + "step": 11140, + "time_per_iteration": 2.675149440765381 + }, + { + "auxiliary_loss_clip": 0.01117246, + "auxiliary_loss_mlp": 0.01027185, + "balance_loss_clip": 1.04022706, + "balance_loss_mlp": 1.014763, + "epoch": 0.6698331579738463, + "flos": 35236663873920.0, + "grad_norm": 1.927153525691557, + "language_loss": 0.76075906, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.78220338, + "num_input_tokens_seen": 240528470, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12420654, + "step": 11141, + "time_per_iteration": 2.6659603118896484 + }, + { + "auxiliary_loss_clip": 0.01115968, + "auxiliary_loss_mlp": 0.01033963, + "balance_loss_clip": 1.0405966, + "balance_loss_mlp": 1.02217329, + "epoch": 0.6698932812265144, + "flos": 29537789395680.0, + "grad_norm": 1.984117047878091, + "language_loss": 0.82262582, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.84412515, + "num_input_tokens_seen": 240547815, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11798096, + "step": 11142, + "time_per_iteration": 2.6703763008117676 + }, + { + "auxiliary_loss_clip": 0.01112271, + "auxiliary_loss_mlp": 0.01026193, + "balance_loss_clip": 1.03869164, + "balance_loss_mlp": 1.01452243, + "epoch": 0.6699534044791823, + "flos": 26954735428800.0, + "grad_norm": 1.871558687932224, + "language_loss": 0.70139039, + "learning_rate": 1.037782980862959e-06, + "loss": 0.7227751, + "num_input_tokens_seen": 240567765, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11657715, + "step": 11143, + "time_per_iteration": 4.091835975646973 + }, + { + "auxiliary_loss_clip": 0.01111169, + "auxiliary_loss_mlp": 0.01028186, + "balance_loss_clip": 1.03962433, + "balance_loss_mlp": 1.01777899, + "epoch": 0.6700135277318503, + "flos": 30740089770720.0, + "grad_norm": 2.646808242715723, + "language_loss": 0.70222461, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.72361815, + "num_input_tokens_seen": 240590750, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10412598, + "step": 11144, + "time_per_iteration": 2.720658779144287 + }, + { + "auxiliary_loss_clip": 0.01113762, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.04045284, + "balance_loss_mlp": 1.01873112, + "epoch": 0.6700736509845182, + "flos": 28602014103360.0, + "grad_norm": 2.6155577925590174, + "language_loss": 0.74244022, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.76388389, + "num_input_tokens_seen": 240608875, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11883545, + "step": 11145, + "time_per_iteration": 3.968918800354004 + }, + { + "auxiliary_loss_clip": 0.01118261, + "auxiliary_loss_mlp": 0.01027881, + "balance_loss_clip": 1.04112601, + "balance_loss_mlp": 1.01587021, + "epoch": 0.6701337742371862, + "flos": 29760886304640.0, + "grad_norm": 1.6750327278498474, + "language_loss": 0.70754725, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.72900867, + "num_input_tokens_seen": 240628565, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12017822, + "step": 11146, + "time_per_iteration": 2.6323776245117188 + }, + { + "auxiliary_loss_clip": 0.0111023, + "auxiliary_loss_mlp": 0.01026783, + "balance_loss_clip": 1.03928196, + "balance_loss_mlp": 1.01561344, + "epoch": 0.6701938974898543, + "flos": 18050793572160.0, + "grad_norm": 2.749772862308014, + "language_loss": 0.7836414, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.80501157, + "num_input_tokens_seen": 240646325, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11181641, + "step": 11147, + "time_per_iteration": 2.664903402328491 + }, + { + "auxiliary_loss_clip": 0.01117525, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.04337502, + "balance_loss_mlp": 1.02565694, + "epoch": 0.6702540207425222, + "flos": 24592185264960.0, + "grad_norm": 1.7085003204379487, + "language_loss": 0.70246601, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.72401369, + "num_input_tokens_seen": 240666145, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11584473, + "step": 11148, + "time_per_iteration": 2.6177303791046143 + }, + { + "auxiliary_loss_clip": 0.01114949, + "auxiliary_loss_mlp": 0.01032216, + "balance_loss_clip": 1.03949702, + "balance_loss_mlp": 1.02096856, + "epoch": 0.6703141439951902, + "flos": 25885677336480.0, + "grad_norm": 3.680719070926072, + "language_loss": 0.70228535, + "learning_rate": 1.035735082774636e-06, + "loss": 0.72375697, + "num_input_tokens_seen": 240685570, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11254883, + "step": 11149, + "time_per_iteration": 2.6606245040893555 + }, + { + "auxiliary_loss_clip": 0.01114685, + "auxiliary_loss_mlp": 0.01026643, + "balance_loss_clip": 1.03921568, + "balance_loss_mlp": 1.01606894, + "epoch": 0.6703742672478581, + "flos": 28202125186080.0, + "grad_norm": 2.3524465277054656, + "language_loss": 0.73306704, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.75448036, + "num_input_tokens_seen": 240706945, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.10571289, + "step": 11150, + "time_per_iteration": 2.680752992630005 + }, + { + "auxiliary_loss_clip": 0.01115493, + "auxiliary_loss_mlp": 0.01030612, + "balance_loss_clip": 1.04134881, + "balance_loss_mlp": 1.01923323, + "epoch": 0.6704343905005261, + "flos": 27489933010080.0, + "grad_norm": 1.7936389596652726, + "language_loss": 0.7829743, + "learning_rate": 1.035052742460671e-06, + "loss": 0.80443537, + "num_input_tokens_seen": 240727990, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.1137085, + "step": 11151, + "time_per_iteration": 2.6451892852783203 + }, + { + "auxiliary_loss_clip": 0.01034538, + "auxiliary_loss_mlp": 0.0100134, + "balance_loss_clip": 1.0111289, + "balance_loss_mlp": 1.00019729, + "epoch": 0.670494513753194, + "flos": 79059381311520.0, + "grad_norm": 0.7961891103386175, + "language_loss": 0.55442774, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57478648, + "num_input_tokens_seen": 240790380, + "router_z_loss_clip": 0.23449707, + "router_z_loss_mlp": 0.01143646, + "step": 11152, + "time_per_iteration": 3.3327550888061523 + }, + { + "auxiliary_loss_clip": 0.01116178, + "auxiliary_loss_mlp": 0.01034517, + "balance_loss_clip": 1.04009116, + "balance_loss_mlp": 1.02298319, + "epoch": 0.670554637005862, + "flos": 28689194589120.0, + "grad_norm": 1.988125198996298, + "language_loss": 0.8092041, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.83071107, + "num_input_tokens_seen": 240811545, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11529541, + "step": 11153, + "time_per_iteration": 4.140769720077515 + }, + { + "auxiliary_loss_clip": 0.01114996, + "auxiliary_loss_mlp": 0.01031555, + "balance_loss_clip": 1.03970504, + "balance_loss_mlp": 1.02049828, + "epoch": 0.67061476025853, + "flos": 23749222360320.0, + "grad_norm": 1.7569925432388058, + "language_loss": 0.76458776, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.78605324, + "num_input_tokens_seen": 240831380, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.1105957, + "step": 11154, + "time_per_iteration": 2.6919662952423096 + }, + { + "auxiliary_loss_clip": 0.01121369, + "auxiliary_loss_mlp": 0.0103541, + "balance_loss_clip": 1.04297185, + "balance_loss_mlp": 1.0230006, + "epoch": 0.670674883511198, + "flos": 25038662703840.0, + "grad_norm": 1.735886370066072, + "language_loss": 0.76105034, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.78261805, + "num_input_tokens_seen": 240851855, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12414551, + "step": 11155, + "time_per_iteration": 4.068148374557495 + }, + { + "auxiliary_loss_clip": 0.01115922, + "auxiliary_loss_mlp": 0.01032057, + "balance_loss_clip": 1.04148197, + "balance_loss_mlp": 1.02033901, + "epoch": 0.6707350067638659, + "flos": 31095942755040.0, + "grad_norm": 1.9966527151916915, + "language_loss": 0.81851262, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.8399924, + "num_input_tokens_seen": 240869980, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11706543, + "step": 11156, + "time_per_iteration": 2.63678240776062 + }, + { + "auxiliary_loss_clip": 0.01114416, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.0405525, + "balance_loss_mlp": 1.01971412, + "epoch": 0.6707951300165339, + "flos": 27178196993280.0, + "grad_norm": 2.1830078635605625, + "language_loss": 0.74641466, + "learning_rate": 1.033006600114165e-06, + "loss": 0.76786906, + "num_input_tokens_seen": 240888680, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11309814, + "step": 11157, + "time_per_iteration": 2.73564076423645 + }, + { + "auxiliary_loss_clip": 0.01118366, + "auxiliary_loss_mlp": 0.01037257, + "balance_loss_clip": 1.04296672, + "balance_loss_mlp": 1.02572954, + "epoch": 0.6708552532692018, + "flos": 29266604825760.0, + "grad_norm": 1.5917976039790713, + "language_loss": 0.74446356, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.76601982, + "num_input_tokens_seen": 240909050, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11535645, + "step": 11158, + "time_per_iteration": 2.688915252685547 + }, + { + "auxiliary_loss_clip": 0.01118349, + "auxiliary_loss_mlp": 0.01031881, + "balance_loss_clip": 1.04143929, + "balance_loss_mlp": 1.02001953, + "epoch": 0.6709153765218698, + "flos": 30429042547680.0, + "grad_norm": 5.6821275475231365, + "language_loss": 0.81662554, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.83812785, + "num_input_tokens_seen": 240930035, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.11846924, + "step": 11159, + "time_per_iteration": 2.688549041748047 + }, + { + "auxiliary_loss_clip": 0.01114794, + "auxiliary_loss_mlp": 0.01029215, + "balance_loss_clip": 1.04000497, + "balance_loss_mlp": 1.01799703, + "epoch": 0.6709754997745379, + "flos": 21390156682560.0, + "grad_norm": 2.2167485213211435, + "language_loss": 0.77188373, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.79332387, + "num_input_tokens_seen": 240948895, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11218262, + "step": 11160, + "time_per_iteration": 2.6188509464263916 + }, + { + "auxiliary_loss_clip": 0.01113041, + "auxiliary_loss_mlp": 0.01026682, + "balance_loss_clip": 1.03989756, + "balance_loss_mlp": 1.01551247, + "epoch": 0.6710356230272058, + "flos": 26955424222560.0, + "grad_norm": 1.8458958215400674, + "language_loss": 0.73668605, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.75808322, + "num_input_tokens_seen": 240967770, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11169434, + "step": 11161, + "time_per_iteration": 2.6198368072509766 + }, + { + "auxiliary_loss_clip": 0.0112084, + "auxiliary_loss_mlp": 0.01037898, + "balance_loss_clip": 1.04224634, + "balance_loss_mlp": 1.025756, + "epoch": 0.6710957462798738, + "flos": 29537060084640.0, + "grad_norm": 1.8796930966079282, + "language_loss": 0.68079096, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.70237827, + "num_input_tokens_seen": 240988985, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12145996, + "step": 11162, + "time_per_iteration": 2.6603481769561768 + }, + { + "auxiliary_loss_clip": 0.01116153, + "auxiliary_loss_mlp": 0.0103474, + "balance_loss_clip": 1.04254103, + "balance_loss_mlp": 1.02406478, + "epoch": 0.6711558695325417, + "flos": 23298814745280.0, + "grad_norm": 2.027340964431601, + "language_loss": 0.70046175, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72197062, + "num_input_tokens_seen": 241005455, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.10675049, + "step": 11163, + "time_per_iteration": 2.6206650733947754 + }, + { + "auxiliary_loss_clip": 0.01114782, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.04318798, + "balance_loss_mlp": 1.01932025, + "epoch": 0.6712159927852097, + "flos": 31186040484960.0, + "grad_norm": 1.5953288901403544, + "language_loss": 0.75454366, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.77599257, + "num_input_tokens_seen": 241026175, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10797119, + "step": 11164, + "time_per_iteration": 2.7261838912963867 + }, + { + "auxiliary_loss_clip": 0.01115653, + "auxiliary_loss_mlp": 0.01030442, + "balance_loss_clip": 1.0418843, + "balance_loss_mlp": 1.01860428, + "epoch": 0.6712761160378776, + "flos": 27124477430400.0, + "grad_norm": 2.164745249884758, + "language_loss": 0.65493572, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.67639673, + "num_input_tokens_seen": 241044040, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1184082, + "step": 11165, + "time_per_iteration": 2.6218297481536865 + }, + { + "auxiliary_loss_clip": 0.01114347, + "auxiliary_loss_mlp": 0.01029063, + "balance_loss_clip": 1.04187644, + "balance_loss_mlp": 1.01808417, + "epoch": 0.6713362392905456, + "flos": 27400402522080.0, + "grad_norm": 2.074566236311615, + "language_loss": 0.7170226, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.73845673, + "num_input_tokens_seen": 241063615, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10980225, + "step": 11166, + "time_per_iteration": 2.6885502338409424 + }, + { + "auxiliary_loss_clip": 0.01114439, + "auxiliary_loss_mlp": 0.01026686, + "balance_loss_clip": 1.04303336, + "balance_loss_mlp": 1.0162257, + "epoch": 0.6713963625432136, + "flos": 31274355454560.0, + "grad_norm": 2.0102936432134086, + "language_loss": 0.76964211, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.79105335, + "num_input_tokens_seen": 241082520, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10449219, + "step": 11167, + "time_per_iteration": 2.6531357765197754 + }, + { + "auxiliary_loss_clip": 0.01115237, + "auxiliary_loss_mlp": 0.01035926, + "balance_loss_clip": 1.04024875, + "balance_loss_mlp": 1.02467895, + "epoch": 0.6714564857958816, + "flos": 42716059309440.0, + "grad_norm": 2.4454157036545787, + "language_loss": 0.68464112, + "learning_rate": 1.029258769662629e-06, + "loss": 0.70615274, + "num_input_tokens_seen": 241103505, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11236572, + "step": 11168, + "time_per_iteration": 2.8007986545562744 + }, + { + "auxiliary_loss_clip": 0.01119228, + "auxiliary_loss_mlp": 0.01034992, + "balance_loss_clip": 1.04185867, + "balance_loss_mlp": 1.02232575, + "epoch": 0.6715166090485495, + "flos": 32067853110720.0, + "grad_norm": 2.77809852146324, + "language_loss": 0.73059475, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.75213689, + "num_input_tokens_seen": 241122885, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12670898, + "step": 11169, + "time_per_iteration": 2.675177812576294 + }, + { + "auxiliary_loss_clip": 0.01115944, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.03991461, + "balance_loss_mlp": 1.02051449, + "epoch": 0.6715767323012175, + "flos": 19430372162880.0, + "grad_norm": 2.2092668501151365, + "language_loss": 0.76190126, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78338593, + "num_input_tokens_seen": 241140865, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12005615, + "step": 11170, + "time_per_iteration": 2.6492085456848145 + }, + { + "auxiliary_loss_clip": 0.01117737, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.04086328, + "balance_loss_mlp": 1.01882219, + "epoch": 0.6716368555538854, + "flos": 21343041436320.0, + "grad_norm": 2.189475539175994, + "language_loss": 0.74582422, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.76730716, + "num_input_tokens_seen": 241158225, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11737061, + "step": 11171, + "time_per_iteration": 2.5832467079162598 + }, + { + "auxiliary_loss_clip": 0.0111848, + "auxiliary_loss_mlp": 0.01037699, + "balance_loss_clip": 1.04241109, + "balance_loss_mlp": 1.02607632, + "epoch": 0.6716969788065534, + "flos": 20452355526240.0, + "grad_norm": 2.0167064788784486, + "language_loss": 0.864012, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.88557374, + "num_input_tokens_seen": 241175215, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11633301, + "step": 11172, + "time_per_iteration": 2.6195504665374756 + }, + { + "auxiliary_loss_clip": 0.01114459, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.03992569, + "balance_loss_mlp": 1.02091432, + "epoch": 0.6717571020592215, + "flos": 27710801468640.0, + "grad_norm": 1.8612237121344304, + "language_loss": 0.63710487, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.65857506, + "num_input_tokens_seen": 241195250, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11639404, + "step": 11173, + "time_per_iteration": 2.6608619689941406 + }, + { + "auxiliary_loss_clip": 0.01122276, + "auxiliary_loss_mlp": 0.01034066, + "balance_loss_clip": 1.04171109, + "balance_loss_mlp": 1.02093458, + "epoch": 0.6718172253118894, + "flos": 22859225244000.0, + "grad_norm": 6.24736979181557, + "language_loss": 0.71775883, + "learning_rate": 1.02721637475002e-06, + "loss": 0.7393223, + "num_input_tokens_seen": 241210720, + "router_z_loss_clip": 0.80566406, + "router_z_loss_mlp": 0.13116455, + "step": 11174, + "time_per_iteration": 2.5937225818634033 + }, + { + "auxiliary_loss_clip": 0.0111329, + "auxiliary_loss_mlp": 0.010285, + "balance_loss_clip": 1.04005122, + "balance_loss_mlp": 1.01737809, + "epoch": 0.6718773485645574, + "flos": 19074478661280.0, + "grad_norm": 2.56687589348572, + "language_loss": 0.68340003, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.70481795, + "num_input_tokens_seen": 241227395, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11120605, + "step": 11175, + "time_per_iteration": 2.6380391120910645 + }, + { + "auxiliary_loss_clip": 0.0111463, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.04187143, + "balance_loss_mlp": 1.0220319, + "epoch": 0.6719374718172253, + "flos": 23616587836800.0, + "grad_norm": 2.4649003959386144, + "language_loss": 0.73665988, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.75813627, + "num_input_tokens_seen": 241246355, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.10974121, + "step": 11176, + "time_per_iteration": 2.6215968132019043 + }, + { + "auxiliary_loss_clip": 0.01116461, + "auxiliary_loss_mlp": 0.01027351, + "balance_loss_clip": 1.04055798, + "balance_loss_mlp": 1.01532865, + "epoch": 0.6719975950698933, + "flos": 26821736249760.0, + "grad_norm": 1.6804118750291084, + "language_loss": 0.72465885, + "learning_rate": 1.026195675108182e-06, + "loss": 0.74609691, + "num_input_tokens_seen": 241264180, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12023926, + "step": 11177, + "time_per_iteration": 2.6165952682495117 + }, + { + "auxiliary_loss_clip": 0.01115899, + "auxiliary_loss_mlp": 0.01033183, + "balance_loss_clip": 1.0405128, + "balance_loss_mlp": 1.02073765, + "epoch": 0.6720577183225612, + "flos": 30783720530880.0, + "grad_norm": 2.215075902408288, + "language_loss": 0.76465565, + "learning_rate": 1.025855515730551e-06, + "loss": 0.78614646, + "num_input_tokens_seen": 241282245, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12432861, + "step": 11178, + "time_per_iteration": 2.750185251235962 + }, + { + "auxiliary_loss_clip": 0.01117691, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.04211855, + "balance_loss_mlp": 1.02073932, + "epoch": 0.6721178415752292, + "flos": 20677154160960.0, + "grad_norm": 1.638502838193656, + "language_loss": 0.70339978, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.72489399, + "num_input_tokens_seen": 241300745, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.10986328, + "step": 11179, + "time_per_iteration": 2.696667194366455 + }, + { + "auxiliary_loss_clip": 0.01114689, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.04116035, + "balance_loss_mlp": 1.01879656, + "epoch": 0.6721779648278972, + "flos": 26285080046400.0, + "grad_norm": 1.959052885332888, + "language_loss": 0.73741376, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.7588613, + "num_input_tokens_seen": 241319320, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11273193, + "step": 11180, + "time_per_iteration": 2.6523118019104004 + }, + { + "auxiliary_loss_clip": 0.01116107, + "auxiliary_loss_mlp": 0.01029935, + "balance_loss_clip": 1.0424999, + "balance_loss_mlp": 1.01854491, + "epoch": 0.6722380880805652, + "flos": 27582988501440.0, + "grad_norm": 1.4233749494491492, + "language_loss": 0.75308514, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.77454555, + "num_input_tokens_seen": 241342225, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11395264, + "step": 11181, + "time_per_iteration": 2.692600965499878 + }, + { + "auxiliary_loss_clip": 0.01117769, + "auxiliary_loss_mlp": 0.01027064, + "balance_loss_clip": 1.04136491, + "balance_loss_mlp": 1.01572704, + "epoch": 0.6722982113332331, + "flos": 19431628198560.0, + "grad_norm": 2.660827487267969, + "language_loss": 0.74207896, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.76352727, + "num_input_tokens_seen": 241358240, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11340332, + "step": 11182, + "time_per_iteration": 4.158527374267578 + }, + { + "auxiliary_loss_clip": 0.01113437, + "auxiliary_loss_mlp": 0.01032809, + "balance_loss_clip": 1.04083169, + "balance_loss_mlp": 1.02201438, + "epoch": 0.6723583345859011, + "flos": 25131353539680.0, + "grad_norm": 1.7175777040751286, + "language_loss": 0.69667715, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.71813965, + "num_input_tokens_seen": 241378420, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10803223, + "step": 11183, + "time_per_iteration": 2.690467119216919 + }, + { + "auxiliary_loss_clip": 0.0111566, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.04054964, + "balance_loss_mlp": 1.01812243, + "epoch": 0.672418457838569, + "flos": 26510000232960.0, + "grad_norm": 1.5426368666655181, + "language_loss": 0.77755934, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.79901576, + "num_input_tokens_seen": 241397185, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11859131, + "step": 11184, + "time_per_iteration": 3.991856336593628 + }, + { + "auxiliary_loss_clip": 0.01124202, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.04380262, + "balance_loss_mlp": 1.0210768, + "epoch": 0.672478581091237, + "flos": 26198264216160.0, + "grad_norm": 2.0240847974589733, + "language_loss": 0.66095239, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.68253398, + "num_input_tokens_seen": 241415785, + "router_z_loss_clip": 0.80419922, + "router_z_loss_mlp": 0.12878418, + "step": 11185, + "time_per_iteration": 2.6688954830169678 + }, + { + "auxiliary_loss_clip": 0.01116777, + "auxiliary_loss_mlp": 0.01030114, + "balance_loss_clip": 1.04096699, + "balance_loss_mlp": 1.01827645, + "epoch": 0.6725387043439051, + "flos": 37640089622880.0, + "grad_norm": 1.814374645809007, + "language_loss": 0.80644464, + "learning_rate": 1.023135571620345e-06, + "loss": 0.82791352, + "num_input_tokens_seen": 241437390, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.1184082, + "step": 11186, + "time_per_iteration": 2.7217838764190674 + }, + { + "auxiliary_loss_clip": 0.01117118, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.0448302, + "balance_loss_mlp": 1.02306128, + "epoch": 0.672598827596573, + "flos": 29353825828800.0, + "grad_norm": 3.2813902410069753, + "language_loss": 0.80284965, + "learning_rate": 1.022795745163813e-06, + "loss": 0.82435715, + "num_input_tokens_seen": 241458085, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.10577393, + "step": 11187, + "time_per_iteration": 2.7002594470977783 + }, + { + "auxiliary_loss_clip": 0.01123366, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.04451084, + "balance_loss_mlp": 1.01935792, + "epoch": 0.672658950849241, + "flos": 26688169828800.0, + "grad_norm": 2.13609138532122, + "language_loss": 0.70553362, + "learning_rate": 1.022455955762965e-06, + "loss": 0.72708517, + "num_input_tokens_seen": 241476880, + "router_z_loss_clip": 0.78955078, + "router_z_loss_mlp": 0.12420654, + "step": 11188, + "time_per_iteration": 2.6511363983154297 + }, + { + "auxiliary_loss_clip": 0.01114658, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.04256678, + "balance_loss_mlp": 1.020123, + "epoch": 0.6727190741019089, + "flos": 28335813158880.0, + "grad_norm": 2.0270659012473744, + "language_loss": 0.75725764, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.77871966, + "num_input_tokens_seen": 241496535, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11407471, + "step": 11189, + "time_per_iteration": 2.670344829559326 + }, + { + "auxiliary_loss_clip": 0.01119515, + "auxiliary_loss_mlp": 0.01029676, + "balance_loss_clip": 1.04117596, + "balance_loss_mlp": 1.01695681, + "epoch": 0.6727791973545769, + "flos": 19252729291680.0, + "grad_norm": 2.3561952656412024, + "language_loss": 0.74953127, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.77102315, + "num_input_tokens_seen": 241513465, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.1272583, + "step": 11190, + "time_per_iteration": 2.603914737701416 + }, + { + "auxiliary_loss_clip": 0.01115709, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.0416007, + "balance_loss_mlp": 1.01882958, + "epoch": 0.6728393206072448, + "flos": 25929308096640.0, + "grad_norm": 1.6994817787628274, + "language_loss": 0.77268815, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.79415721, + "num_input_tokens_seen": 241534125, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12365723, + "step": 11191, + "time_per_iteration": 2.717360496520996 + }, + { + "auxiliary_loss_clip": 0.01114054, + "auxiliary_loss_mlp": 0.01031504, + "balance_loss_clip": 1.04131544, + "balance_loss_mlp": 1.01976156, + "epoch": 0.6728994438599128, + "flos": 39197837809440.0, + "grad_norm": 1.705223683710179, + "language_loss": 0.86473846, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.88619411, + "num_input_tokens_seen": 241556340, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11743164, + "step": 11192, + "time_per_iteration": 2.6838953495025635 + }, + { + "auxiliary_loss_clip": 0.01117545, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.0420804, + "balance_loss_mlp": 1.02116179, + "epoch": 0.6729595671125808, + "flos": 28201922599680.0, + "grad_norm": 1.9949850018204445, + "language_loss": 0.75679374, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.778301, + "num_input_tokens_seen": 241575185, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12036133, + "step": 11193, + "time_per_iteration": 4.076962232589722 + }, + { + "auxiliary_loss_clip": 0.01118731, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.04397118, + "balance_loss_mlp": 1.02147603, + "epoch": 0.6730196903652488, + "flos": 17828385456960.0, + "grad_norm": 2.727850172482577, + "language_loss": 0.78671819, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.80823708, + "num_input_tokens_seen": 241592970, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11669922, + "step": 11194, + "time_per_iteration": 2.6687381267547607 + }, + { + "auxiliary_loss_clip": 0.01117465, + "auxiliary_loss_mlp": 0.0102875, + "balance_loss_clip": 1.04147875, + "balance_loss_mlp": 1.01754999, + "epoch": 0.6730798136179167, + "flos": 25753488503040.0, + "grad_norm": 2.082758317078022, + "language_loss": 0.90260577, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.92406797, + "num_input_tokens_seen": 241610245, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11206055, + "step": 11195, + "time_per_iteration": 3.986628293991089 + }, + { + "auxiliary_loss_clip": 0.01115918, + "auxiliary_loss_mlp": 0.01034829, + "balance_loss_clip": 1.04135263, + "balance_loss_mlp": 1.0235939, + "epoch": 0.6731399368705847, + "flos": 35369744087520.0, + "grad_norm": 1.7466177293851135, + "language_loss": 0.7265017, + "learning_rate": 1.019738976106662e-06, + "loss": 0.7480092, + "num_input_tokens_seen": 241630350, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11230469, + "step": 11196, + "time_per_iteration": 2.7414379119873047 + }, + { + "auxiliary_loss_clip": 0.01035049, + "auxiliary_loss_mlp": 0.01002055, + "balance_loss_clip": 1.01170075, + "balance_loss_mlp": 1.00084925, + "epoch": 0.6732000601232526, + "flos": 79000232433120.0, + "grad_norm": 0.7808748760023327, + "language_loss": 0.56539434, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58576536, + "num_input_tokens_seen": 241692380, + "router_z_loss_clip": 0.23376465, + "router_z_loss_mlp": 0.01204681, + "step": 11197, + "time_per_iteration": 3.2140402793884277 + }, + { + "auxiliary_loss_clip": 0.01115753, + "auxiliary_loss_mlp": 0.01028465, + "balance_loss_clip": 1.04315388, + "balance_loss_mlp": 1.01736116, + "epoch": 0.6732601833759206, + "flos": 20989214316000.0, + "grad_norm": 2.13326229539135, + "language_loss": 0.75723076, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.77867293, + "num_input_tokens_seen": 241710430, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11108398, + "step": 11198, + "time_per_iteration": 2.645470142364502 + }, + { + "auxiliary_loss_clip": 0.01116576, + "auxiliary_loss_mlp": 0.01028911, + "balance_loss_clip": 1.04032815, + "balance_loss_mlp": 1.01682949, + "epoch": 0.6733203066285887, + "flos": 22767628374720.0, + "grad_norm": 2.1079743355543528, + "language_loss": 0.81521666, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.83667153, + "num_input_tokens_seen": 241724775, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12078857, + "step": 11199, + "time_per_iteration": 2.6140644550323486 + }, + { + "auxiliary_loss_clip": 0.01119536, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.04256797, + "balance_loss_mlp": 1.02283525, + "epoch": 0.6733804298812566, + "flos": 43695748982880.0, + "grad_norm": 2.0676660257196815, + "language_loss": 0.71434689, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.73589385, + "num_input_tokens_seen": 241744440, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12310791, + "step": 11200, + "time_per_iteration": 2.7516636848449707 + }, + { + "auxiliary_loss_clip": 0.01119418, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.04389763, + "balance_loss_mlp": 1.02366281, + "epoch": 0.6734405531339246, + "flos": 75215114844480.0, + "grad_norm": 1.7307657652978405, + "language_loss": 0.64812851, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.66967618, + "num_input_tokens_seen": 241771705, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11694336, + "step": 11201, + "time_per_iteration": 3.008693218231201 + }, + { + "auxiliary_loss_clip": 0.01119118, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.04165983, + "balance_loss_mlp": 1.02202189, + "epoch": 0.6735006763865925, + "flos": 25041336844320.0, + "grad_norm": 1.9240846554640962, + "language_loss": 0.6309303, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.65246445, + "num_input_tokens_seen": 241790830, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12261963, + "step": 11202, + "time_per_iteration": 2.6472649574279785 + }, + { + "auxiliary_loss_clip": 0.01116208, + "auxiliary_loss_mlp": 0.01024455, + "balance_loss_clip": 1.04083061, + "balance_loss_mlp": 1.01347005, + "epoch": 0.6735607996392605, + "flos": 16983234619200.0, + "grad_norm": 2.4707973287273495, + "language_loss": 0.74836856, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.76977515, + "num_input_tokens_seen": 241808165, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.10992432, + "step": 11203, + "time_per_iteration": 2.6334474086761475 + }, + { + "auxiliary_loss_clip": 0.01121932, + "auxiliary_loss_mlp": 0.01030236, + "balance_loss_clip": 1.0426755, + "balance_loss_mlp": 1.01745701, + "epoch": 0.6736209228919284, + "flos": 22948472111040.0, + "grad_norm": 1.9724813449176015, + "language_loss": 0.67405194, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.69557363, + "num_input_tokens_seen": 241826925, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12786865, + "step": 11204, + "time_per_iteration": 2.641618013381958 + }, + { + "auxiliary_loss_clip": 0.01124708, + "auxiliary_loss_mlp": 0.01031083, + "balance_loss_clip": 1.04637218, + "balance_loss_mlp": 1.01885223, + "epoch": 0.6736810461445965, + "flos": 24859561210560.0, + "grad_norm": 2.388005913451771, + "language_loss": 0.74072981, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.76228774, + "num_input_tokens_seen": 241845525, + "router_z_loss_clip": 0.78222656, + "router_z_loss_mlp": 0.12243652, + "step": 11205, + "time_per_iteration": 2.688725709915161 + }, + { + "auxiliary_loss_clip": 0.01112388, + "auxiliary_loss_mlp": 0.01030536, + "balance_loss_clip": 1.03947091, + "balance_loss_mlp": 1.01927674, + "epoch": 0.6737411693972644, + "flos": 36615999360960.0, + "grad_norm": 1.8285018544922527, + "language_loss": 0.71832359, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.73975283, + "num_input_tokens_seen": 241866815, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11273193, + "step": 11206, + "time_per_iteration": 2.7265048027038574 + }, + { + "auxiliary_loss_clip": 0.01122603, + "auxiliary_loss_mlp": 0.01035651, + "balance_loss_clip": 1.04336274, + "balance_loss_mlp": 1.02346778, + "epoch": 0.6738012926499324, + "flos": 31051258545600.0, + "grad_norm": 2.1271214136335472, + "language_loss": 0.67283189, + "learning_rate": 1.016007014855092e-06, + "loss": 0.69441444, + "num_input_tokens_seen": 241887050, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.12188721, + "step": 11207, + "time_per_iteration": 2.7553415298461914 + }, + { + "auxiliary_loss_clip": 0.01115065, + "auxiliary_loss_mlp": 0.01031328, + "balance_loss_clip": 1.04211068, + "balance_loss_mlp": 1.0200088, + "epoch": 0.6738614159026003, + "flos": 25352019411840.0, + "grad_norm": 2.6867715657550173, + "language_loss": 0.74282783, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.76429176, + "num_input_tokens_seen": 241904280, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11309814, + "step": 11208, + "time_per_iteration": 2.6580300331115723 + }, + { + "auxiliary_loss_clip": 0.011179, + "auxiliary_loss_mlp": 0.01033605, + "balance_loss_clip": 1.04186451, + "balance_loss_mlp": 1.02089083, + "epoch": 0.6739215391552683, + "flos": 23876062912800.0, + "grad_norm": 1.9530125350467915, + "language_loss": 0.75632125, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.77783632, + "num_input_tokens_seen": 241919190, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.1272583, + "step": 11209, + "time_per_iteration": 2.59318470954895 + }, + { + "auxiliary_loss_clip": 0.01113388, + "auxiliary_loss_mlp": 0.01029638, + "balance_loss_clip": 1.04193425, + "balance_loss_mlp": 1.01878428, + "epoch": 0.6739816624079362, + "flos": 29758900957920.0, + "grad_norm": 1.9397414670083089, + "language_loss": 0.66509485, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.68652511, + "num_input_tokens_seen": 241940525, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10845947, + "step": 11210, + "time_per_iteration": 2.6595256328582764 + }, + { + "auxiliary_loss_clip": 0.01111485, + "auxiliary_loss_mlp": 0.01029034, + "balance_loss_clip": 1.03919411, + "balance_loss_mlp": 1.01855564, + "epoch": 0.6740417856606042, + "flos": 27489082147200.0, + "grad_norm": 2.2927157585577347, + "language_loss": 0.80337697, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82478213, + "num_input_tokens_seen": 241959290, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.10473633, + "step": 11211, + "time_per_iteration": 2.642467498779297 + }, + { + "auxiliary_loss_clip": 0.01114278, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.04156387, + "balance_loss_mlp": 1.02178431, + "epoch": 0.6741019089132723, + "flos": 31451674187520.0, + "grad_norm": 1.6449863446484239, + "language_loss": 0.7655654, + "learning_rate": 1.014312160327143e-06, + "loss": 0.78704238, + "num_input_tokens_seen": 241980715, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11645508, + "step": 11212, + "time_per_iteration": 2.741905927658081 + }, + { + "auxiliary_loss_clip": 0.01116846, + "auxiliary_loss_mlp": 0.01028306, + "balance_loss_clip": 1.04121065, + "balance_loss_mlp": 1.01611638, + "epoch": 0.6741620321659402, + "flos": 25753204882080.0, + "grad_norm": 2.3812119343808864, + "language_loss": 0.78073084, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.80218238, + "num_input_tokens_seen": 241999985, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12200928, + "step": 11213, + "time_per_iteration": 2.6090941429138184 + }, + { + "auxiliary_loss_clip": 0.01119743, + "auxiliary_loss_mlp": 0.01033691, + "balance_loss_clip": 1.04285157, + "balance_loss_mlp": 1.02123988, + "epoch": 0.6742221554186082, + "flos": 25307537788800.0, + "grad_norm": 2.3241565799248174, + "language_loss": 0.68102431, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.70255864, + "num_input_tokens_seen": 242018990, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12457275, + "step": 11214, + "time_per_iteration": 2.664027452468872 + }, + { + "auxiliary_loss_clip": 0.01116866, + "auxiliary_loss_mlp": 0.01032947, + "balance_loss_clip": 1.04184687, + "balance_loss_mlp": 1.02178943, + "epoch": 0.6742822786712761, + "flos": 46093866968160.0, + "grad_norm": 1.6753170500051575, + "language_loss": 0.72708917, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.74858725, + "num_input_tokens_seen": 242039340, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11151123, + "step": 11215, + "time_per_iteration": 2.7510716915130615 + }, + { + "auxiliary_loss_clip": 0.01115834, + "auxiliary_loss_mlp": 0.01029562, + "balance_loss_clip": 1.04040384, + "balance_loss_mlp": 1.01870203, + "epoch": 0.6743424019239441, + "flos": 45471164762880.0, + "grad_norm": 3.17300835363302, + "language_loss": 0.67437834, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.69583231, + "num_input_tokens_seen": 242062215, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.10852051, + "step": 11216, + "time_per_iteration": 2.7969818115234375 + }, + { + "auxiliary_loss_clip": 0.01033209, + "auxiliary_loss_mlp": 0.0100061, + "balance_loss_clip": 1.010131, + "balance_loss_mlp": 0.99946868, + "epoch": 0.674402525176612, + "flos": 80529260218560.0, + "grad_norm": 0.6764027317724526, + "language_loss": 0.56284392, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.5831821, + "num_input_tokens_seen": 242131130, + "router_z_loss_clip": 0.23071289, + "router_z_loss_mlp": 0.01142883, + "step": 11217, + "time_per_iteration": 3.3341753482818604 + }, + { + "auxiliary_loss_clip": 0.01115014, + "auxiliary_loss_mlp": 0.0103003, + "balance_loss_clip": 1.04105473, + "balance_loss_mlp": 1.01826358, + "epoch": 0.67446264842928, + "flos": 32289167259360.0, + "grad_norm": 2.24769096615662, + "language_loss": 0.74740946, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.76885986, + "num_input_tokens_seen": 242149720, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11773682, + "step": 11218, + "time_per_iteration": 2.6709113121032715 + }, + { + "auxiliary_loss_clip": 0.01118551, + "auxiliary_loss_mlp": 0.01041542, + "balance_loss_clip": 1.0426923, + "balance_loss_mlp": 1.0282141, + "epoch": 0.674522771681948, + "flos": 28958555881440.0, + "grad_norm": 1.7220385579004915, + "language_loss": 0.66014194, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.68174291, + "num_input_tokens_seen": 242168875, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.13330078, + "step": 11219, + "time_per_iteration": 2.7101011276245117 + }, + { + "auxiliary_loss_clip": 0.01117173, + "auxiliary_loss_mlp": 0.01029942, + "balance_loss_clip": 1.04163766, + "balance_loss_mlp": 1.01830697, + "epoch": 0.674582894934616, + "flos": 30205580983200.0, + "grad_norm": 1.8848859157458069, + "language_loss": 0.74703866, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.76850986, + "num_input_tokens_seen": 242188465, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11627197, + "step": 11220, + "time_per_iteration": 2.706925392150879 + }, + { + "auxiliary_loss_clip": 0.01116718, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.04133856, + "balance_loss_mlp": 1.01623297, + "epoch": 0.6746430181872839, + "flos": 30293855435520.0, + "grad_norm": 1.536637518868168, + "language_loss": 0.70221752, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.72366619, + "num_input_tokens_seen": 242208675, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11895752, + "step": 11221, + "time_per_iteration": 2.731182813644409 + }, + { + "auxiliary_loss_clip": 0.01115258, + "auxiliary_loss_mlp": 0.01026665, + "balance_loss_clip": 1.04218435, + "balance_loss_mlp": 1.01644325, + "epoch": 0.6747031414399519, + "flos": 20588758156800.0, + "grad_norm": 2.235204135689417, + "language_loss": 0.5826506, + "learning_rate": 1.010925256180498e-06, + "loss": 0.60406983, + "num_input_tokens_seen": 242227440, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10217285, + "step": 11222, + "time_per_iteration": 4.04677939414978 + }, + { + "auxiliary_loss_clip": 0.01115991, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.04166007, + "balance_loss_mlp": 1.02442861, + "epoch": 0.6747632646926198, + "flos": 27802276786080.0, + "grad_norm": 1.7720903995630926, + "language_loss": 0.76517564, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.78669393, + "num_input_tokens_seen": 242245240, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1138916, + "step": 11223, + "time_per_iteration": 2.7158987522125244 + }, + { + "auxiliary_loss_clip": 0.01119555, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.04377329, + "balance_loss_mlp": 1.01770365, + "epoch": 0.6748233879452878, + "flos": 24460361087040.0, + "grad_norm": 1.8702366926639256, + "language_loss": 0.7524724, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.77395803, + "num_input_tokens_seen": 242263435, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11303711, + "step": 11224, + "time_per_iteration": 4.115102052688599 + }, + { + "auxiliary_loss_clip": 0.01113222, + "auxiliary_loss_mlp": 0.01027444, + "balance_loss_clip": 1.04115891, + "balance_loss_mlp": 1.01734734, + "epoch": 0.6748835111979558, + "flos": 28068275144160.0, + "grad_norm": 1.8489207357308142, + "language_loss": 0.63312918, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.65453589, + "num_input_tokens_seen": 242282765, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10095215, + "step": 11225, + "time_per_iteration": 2.645449161529541 + }, + { + "auxiliary_loss_clip": 0.01111516, + "auxiliary_loss_mlp": 0.01035103, + "balance_loss_clip": 1.04127336, + "balance_loss_mlp": 1.02511883, + "epoch": 0.6749436344506238, + "flos": 14883076775520.0, + "grad_norm": 2.1065074349207262, + "language_loss": 0.64016408, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.66163027, + "num_input_tokens_seen": 242298980, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.09985352, + "step": 11226, + "time_per_iteration": 2.632826566696167 + }, + { + "auxiliary_loss_clip": 0.01118294, + "auxiliary_loss_mlp": 0.01029168, + "balance_loss_clip": 1.0439322, + "balance_loss_mlp": 1.01769423, + "epoch": 0.6750037577032918, + "flos": 14487766310880.0, + "grad_norm": 2.4371042382079633, + "language_loss": 0.7171526, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.7386272, + "num_input_tokens_seen": 242315420, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11480713, + "step": 11227, + "time_per_iteration": 2.6428654193878174 + }, + { + "auxiliary_loss_clip": 0.01112866, + "auxiliary_loss_mlp": 0.01027562, + "balance_loss_clip": 1.04055858, + "balance_loss_mlp": 1.01630855, + "epoch": 0.6750638809559597, + "flos": 20767049304480.0, + "grad_norm": 3.1618670744897845, + "language_loss": 0.71429092, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.73569524, + "num_input_tokens_seen": 242332805, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11242676, + "step": 11228, + "time_per_iteration": 2.6669414043426514 + }, + { + "auxiliary_loss_clip": 0.01032905, + "auxiliary_loss_mlp": 0.01000551, + "balance_loss_clip": 1.00976825, + "balance_loss_mlp": 0.99937499, + "epoch": 0.6751240042086277, + "flos": 86574425603040.0, + "grad_norm": 0.7629968758322329, + "language_loss": 0.53192091, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55225551, + "num_input_tokens_seen": 242396160, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 0.01174927, + "step": 11229, + "time_per_iteration": 3.359409809112549 + }, + { + "auxiliary_loss_clip": 0.01114336, + "auxiliary_loss_mlp": 0.0102831, + "balance_loss_clip": 1.04083586, + "balance_loss_mlp": 1.01731324, + "epoch": 0.6751841274612956, + "flos": 27668345709600.0, + "grad_norm": 1.8905759817750551, + "language_loss": 0.80335081, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.82477731, + "num_input_tokens_seen": 242414660, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11004639, + "step": 11230, + "time_per_iteration": 2.6539406776428223 + }, + { + "auxiliary_loss_clip": 0.01111476, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.04003131, + "balance_loss_mlp": 1.01745939, + "epoch": 0.6752442507139637, + "flos": 25976423342880.0, + "grad_norm": 1.623882811469789, + "language_loss": 0.6582129, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.67960417, + "num_input_tokens_seen": 242434225, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10205078, + "step": 11231, + "time_per_iteration": 2.7105813026428223 + }, + { + "auxiliary_loss_clip": 0.01121078, + "auxiliary_loss_mlp": 0.01035785, + "balance_loss_clip": 1.04397655, + "balance_loss_mlp": 1.02320254, + "epoch": 0.6753043739666316, + "flos": 34479706453920.0, + "grad_norm": 4.823718580870561, + "language_loss": 0.66589606, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.68746471, + "num_input_tokens_seen": 242454355, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12579346, + "step": 11232, + "time_per_iteration": 2.699289083480835 + }, + { + "auxiliary_loss_clip": 0.01113426, + "auxiliary_loss_mlp": 0.01028811, + "balance_loss_clip": 1.04055333, + "balance_loss_mlp": 1.01763487, + "epoch": 0.6753644972192996, + "flos": 26064049518720.0, + "grad_norm": 1.752121930797283, + "language_loss": 0.72797859, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.74940091, + "num_input_tokens_seen": 242474935, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11175537, + "step": 11233, + "time_per_iteration": 4.228402614593506 + }, + { + "auxiliary_loss_clip": 0.01114499, + "auxiliary_loss_mlp": 0.0103139, + "balance_loss_clip": 1.04093957, + "balance_loss_mlp": 1.02020264, + "epoch": 0.6754246204719675, + "flos": 32387003789760.0, + "grad_norm": 2.8371680005205806, + "language_loss": 0.76874614, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.790205, + "num_input_tokens_seen": 242495530, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11193848, + "step": 11234, + "time_per_iteration": 3.935969829559326 + }, + { + "auxiliary_loss_clip": 0.01117359, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.04404593, + "balance_loss_mlp": 1.02074194, + "epoch": 0.6754847437246355, + "flos": 31184703414720.0, + "grad_norm": 1.7147582585689756, + "language_loss": 0.75154191, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.77303767, + "num_input_tokens_seen": 242514550, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.1149292, + "step": 11235, + "time_per_iteration": 2.698873519897461 + }, + { + "auxiliary_loss_clip": 0.01032322, + "auxiliary_loss_mlp": 0.01000802, + "balance_loss_clip": 1.00911999, + "balance_loss_mlp": 0.99974263, + "epoch": 0.6755448669773034, + "flos": 72618899112000.0, + "grad_norm": 0.7785217112394481, + "language_loss": 0.51426554, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.5345968, + "num_input_tokens_seen": 242569200, + "router_z_loss_clip": 0.2322998, + "router_z_loss_mlp": 0.01059723, + "step": 11236, + "time_per_iteration": 3.2589287757873535 + }, + { + "auxiliary_loss_clip": 0.01115678, + "auxiliary_loss_mlp": 0.01031174, + "balance_loss_clip": 1.04198122, + "balance_loss_mlp": 1.01818061, + "epoch": 0.6756049902299714, + "flos": 28424249680320.0, + "grad_norm": 2.439005632328731, + "language_loss": 0.75764632, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.77911484, + "num_input_tokens_seen": 242586950, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.13000488, + "step": 11237, + "time_per_iteration": 2.6974124908447266 + }, + { + "auxiliary_loss_clip": 0.01115809, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.04242802, + "balance_loss_mlp": 1.02196491, + "epoch": 0.6756651134826394, + "flos": 38529357428160.0, + "grad_norm": 6.350463437587197, + "language_loss": 0.77493346, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.79642558, + "num_input_tokens_seen": 242607380, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11444092, + "step": 11238, + "time_per_iteration": 2.729529619216919 + }, + { + "auxiliary_loss_clip": 0.01117413, + "auxiliary_loss_mlp": 0.01031309, + "balance_loss_clip": 1.03969967, + "balance_loss_mlp": 1.01978731, + "epoch": 0.6757252367353074, + "flos": 33278135389920.0, + "grad_norm": 2.157828910987456, + "language_loss": 0.66927314, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.69076031, + "num_input_tokens_seen": 242628025, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11517334, + "step": 11239, + "time_per_iteration": 2.7056610584259033 + }, + { + "auxiliary_loss_clip": 0.01114912, + "auxiliary_loss_mlp": 0.01026884, + "balance_loss_clip": 1.04248166, + "balance_loss_mlp": 1.01556528, + "epoch": 0.6757853599879754, + "flos": 20537348078880.0, + "grad_norm": 2.129645158707132, + "language_loss": 0.83234185, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.85375977, + "num_input_tokens_seen": 242643825, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11334229, + "step": 11240, + "time_per_iteration": 2.6184585094451904 + }, + { + "auxiliary_loss_clip": 0.01121531, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.04226351, + "balance_loss_mlp": 1.01591992, + "epoch": 0.6758454832406433, + "flos": 28333503673920.0, + "grad_norm": 4.398225396397525, + "language_loss": 0.74659586, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.76810539, + "num_input_tokens_seen": 242661820, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.1350708, + "step": 11241, + "time_per_iteration": 2.725621461868286 + }, + { + "auxiliary_loss_clip": 0.01116382, + "auxiliary_loss_mlp": 0.01035659, + "balance_loss_clip": 1.04226685, + "balance_loss_mlp": 1.02426851, + "epoch": 0.6759056064933113, + "flos": 19876201325280.0, + "grad_norm": 2.3112901644271675, + "language_loss": 0.80708122, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.8286016, + "num_input_tokens_seen": 242679890, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11395264, + "step": 11242, + "time_per_iteration": 2.6506242752075195 + }, + { + "auxiliary_loss_clip": 0.01114241, + "auxiliary_loss_mlp": 0.01025216, + "balance_loss_clip": 1.0404321, + "balance_loss_mlp": 1.01432037, + "epoch": 0.6759657297459792, + "flos": 31630775680800.0, + "grad_norm": 1.824789925636672, + "language_loss": 0.73023587, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.75163043, + "num_input_tokens_seen": 242699495, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.10894775, + "step": 11243, + "time_per_iteration": 2.6833724975585938 + }, + { + "auxiliary_loss_clip": 0.01115891, + "auxiliary_loss_mlp": 0.01033617, + "balance_loss_clip": 1.04221761, + "balance_loss_mlp": 1.02296519, + "epoch": 0.6760258529986473, + "flos": 28066006176480.0, + "grad_norm": 1.6569528269479445, + "language_loss": 0.72850096, + "learning_rate": 1.003487287162221e-06, + "loss": 0.74999607, + "num_input_tokens_seen": 242719500, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.10644531, + "step": 11244, + "time_per_iteration": 2.6926004886627197 + }, + { + "auxiliary_loss_clip": 0.01118058, + "auxiliary_loss_mlp": 0.01038889, + "balance_loss_clip": 1.04259133, + "balance_loss_mlp": 1.02710557, + "epoch": 0.6760859762513152, + "flos": 25574670630720.0, + "grad_norm": 1.9748052439383137, + "language_loss": 0.85487437, + "learning_rate": 1.003149631190393e-06, + "loss": 0.87644386, + "num_input_tokens_seen": 242738325, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11779785, + "step": 11245, + "time_per_iteration": 2.649773597717285 + }, + { + "auxiliary_loss_clip": 0.01120932, + "auxiliary_loss_mlp": 0.01031632, + "balance_loss_clip": 1.04214048, + "balance_loss_mlp": 1.01997972, + "epoch": 0.6761460995039832, + "flos": 28824989460480.0, + "grad_norm": 3.7256291355953746, + "language_loss": 0.73693919, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.75846487, + "num_input_tokens_seen": 242756620, + "router_z_loss_clip": 0.78759766, + "router_z_loss_mlp": 0.11651611, + "step": 11246, + "time_per_iteration": 2.6466164588928223 + }, + { + "auxiliary_loss_clip": 0.01114799, + "auxiliary_loss_mlp": 0.01026167, + "balance_loss_clip": 1.03959751, + "balance_loss_mlp": 1.01472294, + "epoch": 0.6762062227566511, + "flos": 25345253026080.0, + "grad_norm": 1.9068740365195809, + "language_loss": 0.88009167, + "learning_rate": 1.002474432661539e-06, + "loss": 0.9015013, + "num_input_tokens_seen": 242774505, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11450195, + "step": 11247, + "time_per_iteration": 2.663343667984009 + }, + { + "auxiliary_loss_clip": 0.01032601, + "auxiliary_loss_mlp": 0.01001371, + "balance_loss_clip": 1.0093013, + "balance_loss_mlp": 1.00022984, + "epoch": 0.6762663460093191, + "flos": 64447205484960.0, + "grad_norm": 0.8260417299738363, + "language_loss": 0.54049313, + "learning_rate": 1.002136890130115e-06, + "loss": 0.56083286, + "num_input_tokens_seen": 242828645, + "router_z_loss_clip": 0.23327637, + "router_z_loss_mlp": 0.01141357, + "step": 11248, + "time_per_iteration": 3.283714532852173 + }, + { + "auxiliary_loss_clip": 0.0111327, + "auxiliary_loss_mlp": 0.01025659, + "balance_loss_clip": 1.04285085, + "balance_loss_mlp": 1.01467967, + "epoch": 0.676326469261987, + "flos": 28914479431200.0, + "grad_norm": 1.592103797565027, + "language_loss": 0.73567617, + "learning_rate": 1.001799385437761e-06, + "loss": 0.75706542, + "num_input_tokens_seen": 242850100, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10980225, + "step": 11249, + "time_per_iteration": 2.6580140590667725 + }, + { + "auxiliary_loss_clip": 0.01116606, + "auxiliary_loss_mlp": 0.01036975, + "balance_loss_clip": 1.04010761, + "balance_loss_mlp": 1.02485108, + "epoch": 0.676386592514655, + "flos": 17160107662080.0, + "grad_norm": 2.166292695962504, + "language_loss": 0.74380445, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.76534033, + "num_input_tokens_seen": 242867775, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12133789, + "step": 11250, + "time_per_iteration": 2.664628505706787 + }, + { + "auxiliary_loss_clip": 0.0111831, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.04259825, + "balance_loss_mlp": 1.01913977, + "epoch": 0.676446715767323, + "flos": 24907041112320.0, + "grad_norm": 4.421087820442605, + "language_loss": 0.75218916, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.77367443, + "num_input_tokens_seen": 242886865, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11077881, + "step": 11251, + "time_per_iteration": 2.6330642700195312 + }, + { + "auxiliary_loss_clip": 0.01116544, + "auxiliary_loss_mlp": 0.01031039, + "balance_loss_clip": 1.04335129, + "balance_loss_mlp": 1.01936817, + "epoch": 0.676506839019991, + "flos": 25975977652800.0, + "grad_norm": 1.6539338285665708, + "language_loss": 0.70144206, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.72291791, + "num_input_tokens_seen": 242906705, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11669922, + "step": 11252, + "time_per_iteration": 2.6632423400878906 + }, + { + "auxiliary_loss_clip": 0.01118142, + "auxiliary_loss_mlp": 0.01030902, + "balance_loss_clip": 1.04344034, + "balance_loss_mlp": 1.0197922, + "epoch": 0.676566962272659, + "flos": 36524605078080.0, + "grad_norm": 1.9016904025198453, + "language_loss": 0.66906679, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.69055724, + "num_input_tokens_seen": 242925215, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11108398, + "step": 11253, + "time_per_iteration": 2.686908006668091 + }, + { + "auxiliary_loss_clip": 0.01119396, + "auxiliary_loss_mlp": 0.01035771, + "balance_loss_clip": 1.04311013, + "balance_loss_mlp": 1.02295613, + "epoch": 0.6766270855253269, + "flos": 21879495053280.0, + "grad_norm": 2.3198134351509627, + "language_loss": 0.77189314, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.79344481, + "num_input_tokens_seen": 242944750, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12817383, + "step": 11254, + "time_per_iteration": 2.6502037048339844 + }, + { + "auxiliary_loss_clip": 0.01115422, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.03984809, + "balance_loss_mlp": 1.01983035, + "epoch": 0.6766872087779949, + "flos": 28193454488160.0, + "grad_norm": 2.1405698635119563, + "language_loss": 0.72140574, + "learning_rate": 9.997751526206835e-07, + "loss": 0.74287474, + "num_input_tokens_seen": 242963860, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11639404, + "step": 11255, + "time_per_iteration": 2.6416702270507812 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.03965163, + "balance_loss_mlp": 1.02583671, + "epoch": 0.6767473320306628, + "flos": 31985777802240.0, + "grad_norm": 2.2172954871308295, + "language_loss": 0.75463188, + "learning_rate": 9.994379131600828e-07, + "loss": 0.77616304, + "num_input_tokens_seen": 242983050, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12158203, + "step": 11256, + "time_per_iteration": 2.6590681076049805 + }, + { + "auxiliary_loss_clip": 0.01118701, + "auxiliary_loss_mlp": 0.01031816, + "balance_loss_clip": 1.04324436, + "balance_loss_mlp": 1.02043176, + "epoch": 0.6768074552833309, + "flos": 22413436598880.0, + "grad_norm": 2.7190494503192406, + "language_loss": 0.65623927, + "learning_rate": 9.991007116408965e-07, + "loss": 0.67774451, + "num_input_tokens_seen": 243001125, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11383057, + "step": 11257, + "time_per_iteration": 2.5975985527038574 + }, + { + "auxiliary_loss_clip": 0.01113498, + "auxiliary_loss_mlp": 0.01029994, + "balance_loss_clip": 1.04124475, + "balance_loss_mlp": 1.01881838, + "epoch": 0.6768675785359988, + "flos": 28113486078240.0, + "grad_norm": 2.060300880020235, + "language_loss": 0.75271279, + "learning_rate": 9.987635480759109e-07, + "loss": 0.77414775, + "num_input_tokens_seen": 243021865, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11175537, + "step": 11258, + "time_per_iteration": 2.6513450145721436 + }, + { + "auxiliary_loss_clip": 0.01114599, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.04253983, + "balance_loss_mlp": 1.02001381, + "epoch": 0.6769277017886668, + "flos": 40312147353120.0, + "grad_norm": 1.6999923843367142, + "language_loss": 0.66750836, + "learning_rate": 9.984264224779127e-07, + "loss": 0.68896222, + "num_input_tokens_seen": 243042970, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10778809, + "step": 11259, + "time_per_iteration": 2.7409327030181885 + }, + { + "auxiliary_loss_clip": 0.01116618, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.0417459, + "balance_loss_mlp": 1.01955009, + "epoch": 0.6769878250413347, + "flos": 25439443001280.0, + "grad_norm": 3.504982488631024, + "language_loss": 0.85403693, + "learning_rate": 9.980893348596839e-07, + "loss": 0.87551498, + "num_input_tokens_seen": 243058470, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11639404, + "step": 11260, + "time_per_iteration": 2.7992539405822754 + }, + { + "auxiliary_loss_clip": 0.01116792, + "auxiliary_loss_mlp": 0.0103694, + "balance_loss_clip": 1.04004049, + "balance_loss_mlp": 1.02495313, + "epoch": 0.6770479482940027, + "flos": 19029875486400.0, + "grad_norm": 2.219505215419358, + "language_loss": 0.77856779, + "learning_rate": 9.977522852340081e-07, + "loss": 0.80010509, + "num_input_tokens_seen": 243076630, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11993408, + "step": 11261, + "time_per_iteration": 4.032003402709961 + }, + { + "auxiliary_loss_clip": 0.01115249, + "auxiliary_loss_mlp": 0.01033504, + "balance_loss_clip": 1.03949594, + "balance_loss_mlp": 1.0216428, + "epoch": 0.6771080715466706, + "flos": 22721566577760.0, + "grad_norm": 1.7303675235909826, + "language_loss": 0.88009816, + "learning_rate": 9.97415273613666e-07, + "loss": 0.9015857, + "num_input_tokens_seen": 243092260, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11859131, + "step": 11262, + "time_per_iteration": 2.5849571228027344 + }, + { + "auxiliary_loss_clip": 0.01121529, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.0447787, + "balance_loss_mlp": 1.01926661, + "epoch": 0.6771681947993387, + "flos": 15246303904800.0, + "grad_norm": 2.2752926328178606, + "language_loss": 0.74398172, + "learning_rate": 9.97078300011439e-07, + "loss": 0.76550847, + "num_input_tokens_seen": 243109405, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11889648, + "step": 11263, + "time_per_iteration": 2.637463331222534 + }, + { + "auxiliary_loss_clip": 0.01120023, + "auxiliary_loss_mlp": 0.01034637, + "balance_loss_clip": 1.04249728, + "balance_loss_mlp": 1.02209616, + "epoch": 0.6772283180520066, + "flos": 27133674852960.0, + "grad_norm": 2.1574167181962993, + "language_loss": 0.67918897, + "learning_rate": 9.967413644401016e-07, + "loss": 0.70073557, + "num_input_tokens_seen": 243128135, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12536621, + "step": 11264, + "time_per_iteration": 4.1127424240112305 + }, + { + "auxiliary_loss_clip": 0.01118606, + "auxiliary_loss_mlp": 0.0103139, + "balance_loss_clip": 1.0435431, + "balance_loss_mlp": 1.01898074, + "epoch": 0.6772884413046746, + "flos": 19697586039360.0, + "grad_norm": 1.841424957634415, + "language_loss": 0.73078787, + "learning_rate": 9.964044669124324e-07, + "loss": 0.75228786, + "num_input_tokens_seen": 243146785, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.1239624, + "step": 11265, + "time_per_iteration": 2.619436025619507 + }, + { + "auxiliary_loss_clip": 0.0111461, + "auxiliary_loss_mlp": 0.01028427, + "balance_loss_clip": 1.04143262, + "balance_loss_mlp": 1.01727509, + "epoch": 0.6773485645573426, + "flos": 23349333443040.0, + "grad_norm": 1.7350466651622012, + "language_loss": 0.61693376, + "learning_rate": 9.96067607441207e-07, + "loss": 0.63836408, + "num_input_tokens_seen": 243165275, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.1114502, + "step": 11266, + "time_per_iteration": 2.6357438564300537 + }, + { + "auxiliary_loss_clip": 0.01119675, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.04291618, + "balance_loss_mlp": 1.02128303, + "epoch": 0.6774086878100105, + "flos": 17249395046400.0, + "grad_norm": 2.6957778565511292, + "language_loss": 0.70837891, + "learning_rate": 9.957307860391976e-07, + "loss": 0.72991389, + "num_input_tokens_seen": 243182845, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.12548828, + "step": 11267, + "time_per_iteration": 2.59226393699646 + }, + { + "auxiliary_loss_clip": 0.01118396, + "auxiliary_loss_mlp": 0.01029802, + "balance_loss_clip": 1.04128277, + "balance_loss_mlp": 1.01816177, + "epoch": 0.6774688110626785, + "flos": 27085303571040.0, + "grad_norm": 1.986100481875949, + "language_loss": 0.71031868, + "learning_rate": 9.953940027191785e-07, + "loss": 0.73180068, + "num_input_tokens_seen": 243201475, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11627197, + "step": 11268, + "time_per_iteration": 2.662665605545044 + }, + { + "auxiliary_loss_clip": 0.01121078, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.0459764, + "balance_loss_mlp": 1.01986432, + "epoch": 0.6775289343153464, + "flos": 28113526595520.0, + "grad_norm": 1.5564326240950832, + "language_loss": 0.76924646, + "learning_rate": 9.950572574939194e-07, + "loss": 0.79077625, + "num_input_tokens_seen": 243221850, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12030029, + "step": 11269, + "time_per_iteration": 2.6797356605529785 + }, + { + "auxiliary_loss_clip": 0.01117916, + "auxiliary_loss_mlp": 0.01037613, + "balance_loss_clip": 1.04026401, + "balance_loss_mlp": 1.02529287, + "epoch": 0.6775890575680145, + "flos": 22321839729600.0, + "grad_norm": 2.558215782583242, + "language_loss": 0.7465381, + "learning_rate": 9.94720550376189e-07, + "loss": 0.76809347, + "num_input_tokens_seen": 243239855, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12329102, + "step": 11270, + "time_per_iteration": 2.7310500144958496 + }, + { + "auxiliary_loss_clip": 0.01117137, + "auxiliary_loss_mlp": 0.0103527, + "balance_loss_clip": 1.04272008, + "balance_loss_mlp": 1.02275324, + "epoch": 0.6776491808206824, + "flos": 30916071433440.0, + "grad_norm": 1.768079394948642, + "language_loss": 0.72965646, + "learning_rate": 9.94383881378756e-07, + "loss": 0.75118053, + "num_input_tokens_seen": 243260085, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12506104, + "step": 11271, + "time_per_iteration": 2.677117347717285 + }, + { + "auxiliary_loss_clip": 0.01117112, + "auxiliary_loss_mlp": 0.01032967, + "balance_loss_clip": 1.04196835, + "balance_loss_mlp": 1.02116561, + "epoch": 0.6777093040733504, + "flos": 31760371408320.0, + "grad_norm": 1.632194722763618, + "language_loss": 0.67891735, + "learning_rate": 9.94047250514387e-07, + "loss": 0.70041817, + "num_input_tokens_seen": 243280065, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11791992, + "step": 11272, + "time_per_iteration": 4.176357746124268 + }, + { + "auxiliary_loss_clip": 0.01120458, + "auxiliary_loss_mlp": 0.01034882, + "balance_loss_clip": 1.04247975, + "balance_loss_mlp": 1.02166796, + "epoch": 0.6777694273260183, + "flos": 21967891057440.0, + "grad_norm": 1.799137930292594, + "language_loss": 0.74008918, + "learning_rate": 9.937106577958481e-07, + "loss": 0.76164258, + "num_input_tokens_seen": 243297775, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.13226318, + "step": 11273, + "time_per_iteration": 2.7662336826324463 + }, + { + "auxiliary_loss_clip": 0.01114129, + "auxiliary_loss_mlp": 0.01036835, + "balance_loss_clip": 1.0411309, + "balance_loss_mlp": 1.02514029, + "epoch": 0.6778295505786863, + "flos": 28603027035360.0, + "grad_norm": 2.14418814815779, + "language_loss": 0.69954932, + "learning_rate": 9.933741032359015e-07, + "loss": 0.72105896, + "num_input_tokens_seen": 243315760, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11706543, + "step": 11274, + "time_per_iteration": 4.054638862609863 + }, + { + "auxiliary_loss_clip": 0.01118882, + "auxiliary_loss_mlp": 0.01031129, + "balance_loss_clip": 1.04283762, + "balance_loss_mlp": 1.01875544, + "epoch": 0.6778896738313542, + "flos": 23303838888000.0, + "grad_norm": 1.8787042088616064, + "language_loss": 0.65602279, + "learning_rate": 9.930375868473093e-07, + "loss": 0.6775229, + "num_input_tokens_seen": 243335715, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12365723, + "step": 11275, + "time_per_iteration": 2.6275253295898438 + }, + { + "auxiliary_loss_clip": 0.01118134, + "auxiliary_loss_mlp": 0.01031509, + "balance_loss_clip": 1.04341996, + "balance_loss_mlp": 1.02021396, + "epoch": 0.6779497970840223, + "flos": 31852495002240.0, + "grad_norm": 1.6566065564384644, + "language_loss": 0.72601163, + "learning_rate": 9.927011086428335e-07, + "loss": 0.74750805, + "num_input_tokens_seen": 243356935, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11297607, + "step": 11276, + "time_per_iteration": 2.711636543273926 + }, + { + "auxiliary_loss_clip": 0.01117214, + "auxiliary_loss_mlp": 0.01030258, + "balance_loss_clip": 1.04270244, + "balance_loss_mlp": 1.01770568, + "epoch": 0.6780099203366902, + "flos": 24061363549920.0, + "grad_norm": 2.605557461145345, + "language_loss": 0.77065003, + "learning_rate": 9.923646686352317e-07, + "loss": 0.79212475, + "num_input_tokens_seen": 243375625, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12548828, + "step": 11277, + "time_per_iteration": 2.5944483280181885 + }, + { + "auxiliary_loss_clip": 0.01120683, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.04362607, + "balance_loss_mlp": 1.01815701, + "epoch": 0.6780700435893582, + "flos": 22226029063200.0, + "grad_norm": 3.848754755459437, + "language_loss": 0.83439654, + "learning_rate": 9.920282668372627e-07, + "loss": 0.85590482, + "num_input_tokens_seen": 243390195, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.11981201, + "step": 11278, + "time_per_iteration": 2.572887897491455 + }, + { + "auxiliary_loss_clip": 0.01114666, + "auxiliary_loss_mlp": 0.01029576, + "balance_loss_clip": 1.04214466, + "balance_loss_mlp": 1.01865053, + "epoch": 0.6781301668420262, + "flos": 30964321163520.0, + "grad_norm": 1.8532057155243329, + "language_loss": 0.70242739, + "learning_rate": 9.916919032616844e-07, + "loss": 0.7238698, + "num_input_tokens_seen": 243411690, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10919189, + "step": 11279, + "time_per_iteration": 2.6651995182037354 + }, + { + "auxiliary_loss_clip": 0.01118194, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.04218078, + "balance_loss_mlp": 1.01870513, + "epoch": 0.6781902900946941, + "flos": 29310397655040.0, + "grad_norm": 1.957159017482297, + "language_loss": 0.73740155, + "learning_rate": 9.913555779212485e-07, + "loss": 0.75889683, + "num_input_tokens_seen": 243430280, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12646484, + "step": 11280, + "time_per_iteration": 2.649887800216675 + }, + { + "auxiliary_loss_clip": 0.01118843, + "auxiliary_loss_mlp": 0.01026952, + "balance_loss_clip": 1.04132628, + "balance_loss_mlp": 1.01509702, + "epoch": 0.6782504133473621, + "flos": 23972886511200.0, + "grad_norm": 2.0128980986881455, + "language_loss": 0.70270026, + "learning_rate": 9.910192908287104e-07, + "loss": 0.72415817, + "num_input_tokens_seen": 243448690, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.11853027, + "step": 11281, + "time_per_iteration": 2.600231885910034 + }, + { + "auxiliary_loss_clip": 0.01114319, + "auxiliary_loss_mlp": 0.0102679, + "balance_loss_clip": 1.04175854, + "balance_loss_mlp": 1.01582837, + "epoch": 0.67831053660003, + "flos": 30423653749440.0, + "grad_norm": 1.4902153257146875, + "language_loss": 0.6363821, + "learning_rate": 9.906830419968217e-07, + "loss": 0.65779316, + "num_input_tokens_seen": 243470695, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10961914, + "step": 11282, + "time_per_iteration": 2.808094024658203 + }, + { + "auxiliary_loss_clip": 0.01121683, + "auxiliary_loss_mlp": 0.01036826, + "balance_loss_clip": 1.04337025, + "balance_loss_mlp": 1.02433848, + "epoch": 0.6783706598526981, + "flos": 38081664470880.0, + "grad_norm": 1.6219169986730473, + "language_loss": 0.74354327, + "learning_rate": 9.90346831438334e-07, + "loss": 0.76512837, + "num_input_tokens_seen": 243493345, + "router_z_loss_clip": 0.78320312, + "router_z_loss_mlp": 0.12487793, + "step": 11283, + "time_per_iteration": 2.6903412342071533 + }, + { + "auxiliary_loss_clip": 0.01116549, + "auxiliary_loss_mlp": 0.01027003, + "balance_loss_clip": 1.04196477, + "balance_loss_mlp": 1.01564181, + "epoch": 0.678430783105366, + "flos": 43246962059040.0, + "grad_norm": 1.6426646908452927, + "language_loss": 0.56919134, + "learning_rate": 9.900106591659948e-07, + "loss": 0.59062684, + "num_input_tokens_seen": 243515670, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11352539, + "step": 11284, + "time_per_iteration": 2.765821933746338 + }, + { + "auxiliary_loss_clip": 0.01117344, + "auxiliary_loss_mlp": 0.01030967, + "balance_loss_clip": 1.04135728, + "balance_loss_mlp": 1.01889133, + "epoch": 0.678490906358034, + "flos": 17605572168960.0, + "grad_norm": 3.7854217648378783, + "language_loss": 0.75356317, + "learning_rate": 9.896745251925535e-07, + "loss": 0.77504635, + "num_input_tokens_seen": 243533625, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12084961, + "step": 11285, + "time_per_iteration": 2.6089890003204346 + }, + { + "auxiliary_loss_clip": 0.01117001, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.04368663, + "balance_loss_mlp": 1.01952517, + "epoch": 0.6785510296107019, + "flos": 29665359259200.0, + "grad_norm": 2.5053350666090544, + "language_loss": 0.66576487, + "learning_rate": 9.893384295307557e-07, + "loss": 0.68724966, + "num_input_tokens_seen": 243553040, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.1194458, + "step": 11286, + "time_per_iteration": 2.7170703411102295 + }, + { + "auxiliary_loss_clip": 0.01116448, + "auxiliary_loss_mlp": 0.01027262, + "balance_loss_clip": 1.03995585, + "balance_loss_mlp": 1.01515627, + "epoch": 0.6786111528633699, + "flos": 32918392746720.0, + "grad_norm": 5.14356405025053, + "language_loss": 0.53142178, + "learning_rate": 9.890023721933447e-07, + "loss": 0.55285895, + "num_input_tokens_seen": 243572590, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12109375, + "step": 11287, + "time_per_iteration": 2.7292118072509766 + }, + { + "auxiliary_loss_clip": 0.01117313, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.04291832, + "balance_loss_mlp": 1.01723313, + "epoch": 0.6786712761160378, + "flos": 29672490300480.0, + "grad_norm": 1.7624666292998072, + "language_loss": 0.77371919, + "learning_rate": 9.886663531930655e-07, + "loss": 0.79518098, + "num_input_tokens_seen": 243594140, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11627197, + "step": 11288, + "time_per_iteration": 2.676567792892456 + }, + { + "auxiliary_loss_clip": 0.01122116, + "auxiliary_loss_mlp": 0.01036652, + "balance_loss_clip": 1.04545605, + "balance_loss_mlp": 1.02517796, + "epoch": 0.6787313993687059, + "flos": 27979879140000.0, + "grad_norm": 2.0026346618392554, + "language_loss": 0.73436999, + "learning_rate": 9.883303725426593e-07, + "loss": 0.75595772, + "num_input_tokens_seen": 243615170, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11474609, + "step": 11289, + "time_per_iteration": 2.6802937984466553 + }, + { + "auxiliary_loss_clip": 0.01118587, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.04248726, + "balance_loss_mlp": 1.02272379, + "epoch": 0.6787915226213738, + "flos": 32785555636800.0, + "grad_norm": 1.730550810823376, + "language_loss": 0.79956603, + "learning_rate": 9.879944302548682e-07, + "loss": 0.82110316, + "num_input_tokens_seen": 243635675, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.1239624, + "step": 11290, + "time_per_iteration": 2.7025742530822754 + }, + { + "auxiliary_loss_clip": 0.0111384, + "auxiliary_loss_mlp": 0.01026085, + "balance_loss_clip": 1.04217303, + "balance_loss_mlp": 1.01506984, + "epoch": 0.6788516458740418, + "flos": 24414988083840.0, + "grad_norm": 1.5315025586609143, + "language_loss": 0.74971187, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77111113, + "num_input_tokens_seen": 243654950, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11016846, + "step": 11291, + "time_per_iteration": 2.66496205329895 + }, + { + "auxiliary_loss_clip": 0.01120673, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.04350376, + "balance_loss_mlp": 1.02220273, + "epoch": 0.6789117691267098, + "flos": 35053834790880.0, + "grad_norm": 1.9031679927182101, + "language_loss": 0.75635237, + "learning_rate": 9.873226608180785e-07, + "loss": 0.7779026, + "num_input_tokens_seen": 243674970, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12145996, + "step": 11292, + "time_per_iteration": 2.6784865856170654 + }, + { + "auxiliary_loss_clip": 0.01118408, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.04315567, + "balance_loss_mlp": 1.01877022, + "epoch": 0.6789718923793777, + "flos": 28557208342080.0, + "grad_norm": 1.8411111618220577, + "language_loss": 0.84079617, + "learning_rate": 9.869868336945556e-07, + "loss": 0.86229396, + "num_input_tokens_seen": 243693440, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12597656, + "step": 11293, + "time_per_iteration": 2.665151357650757 + }, + { + "auxiliary_loss_clip": 0.01123702, + "auxiliary_loss_mlp": 0.01035566, + "balance_loss_clip": 1.04459584, + "balance_loss_mlp": 1.02251887, + "epoch": 0.6790320156320457, + "flos": 24951725321760.0, + "grad_norm": 2.2791906291115884, + "language_loss": 0.79531997, + "learning_rate": 9.866510449845929e-07, + "loss": 0.81691265, + "num_input_tokens_seen": 243710055, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13031006, + "step": 11294, + "time_per_iteration": 2.619770050048828 + }, + { + "auxiliary_loss_clip": 0.01117828, + "auxiliary_loss_mlp": 0.01025554, + "balance_loss_clip": 1.04232979, + "balance_loss_mlp": 1.01416981, + "epoch": 0.6790921388847136, + "flos": 29489418113760.0, + "grad_norm": 1.675994059278062, + "language_loss": 0.79092181, + "learning_rate": 9.86315294700924e-07, + "loss": 0.81235564, + "num_input_tokens_seen": 243728635, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11383057, + "step": 11295, + "time_per_iteration": 2.672743558883667 + }, + { + "auxiliary_loss_clip": 0.01112321, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.04030776, + "balance_loss_mlp": 1.0198983, + "epoch": 0.6791522621373817, + "flos": 26732408348160.0, + "grad_norm": 3.371712409374502, + "language_loss": 0.711272, + "learning_rate": 9.859795828562823e-07, + "loss": 0.73269665, + "num_input_tokens_seen": 243748330, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10241699, + "step": 11296, + "time_per_iteration": 2.6605498790740967 + }, + { + "auxiliary_loss_clip": 0.01117207, + "auxiliary_loss_mlp": 0.01026299, + "balance_loss_clip": 1.0424552, + "balance_loss_mlp": 1.01482534, + "epoch": 0.6792123853900496, + "flos": 30293733883680.0, + "grad_norm": 2.804314608195454, + "language_loss": 0.70639515, + "learning_rate": 9.856439094633949e-07, + "loss": 0.72783023, + "num_input_tokens_seen": 243769380, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11468506, + "step": 11297, + "time_per_iteration": 2.685973882675171 + }, + { + "auxiliary_loss_clip": 0.01121551, + "auxiliary_loss_mlp": 0.01033161, + "balance_loss_clip": 1.04384398, + "balance_loss_mlp": 1.02094841, + "epoch": 0.6792725086427176, + "flos": 21434192615520.0, + "grad_norm": 1.9685073408779654, + "language_loss": 0.66035879, + "learning_rate": 9.853082745349918e-07, + "loss": 0.68190593, + "num_input_tokens_seen": 243785510, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12225342, + "step": 11298, + "time_per_iteration": 2.6200051307678223 + }, + { + "auxiliary_loss_clip": 0.01118639, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.04244399, + "balance_loss_mlp": 1.01707268, + "epoch": 0.6793326318953855, + "flos": 32875086124800.0, + "grad_norm": 1.691929085780661, + "language_loss": 0.71621281, + "learning_rate": 9.84972678083801e-07, + "loss": 0.73768222, + "num_input_tokens_seen": 243805545, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11230469, + "step": 11299, + "time_per_iteration": 2.693502902984619 + }, + { + "auxiliary_loss_clip": 0.0112032, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.04366469, + "balance_loss_mlp": 1.02185047, + "epoch": 0.6793927551480535, + "flos": 29673300646080.0, + "grad_norm": 1.3462625592450994, + "language_loss": 0.77358139, + "learning_rate": 9.846371201225488e-07, + "loss": 0.79512322, + "num_input_tokens_seen": 243825185, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.12017822, + "step": 11300, + "time_per_iteration": 2.6799726486206055 + }, + { + "auxiliary_loss_clip": 0.01117893, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.04215062, + "balance_loss_mlp": 1.02086854, + "epoch": 0.6794528784007214, + "flos": 13954310972640.0, + "grad_norm": 2.0875026524246563, + "language_loss": 0.62875938, + "learning_rate": 9.843016006639577e-07, + "loss": 0.65026557, + "num_input_tokens_seen": 243841600, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11859131, + "step": 11301, + "time_per_iteration": 4.078989744186401 + }, + { + "auxiliary_loss_clip": 0.011167, + "auxiliary_loss_mlp": 0.01033079, + "balance_loss_clip": 1.04131496, + "balance_loss_mlp": 1.02153921, + "epoch": 0.6795130016533895, + "flos": 30785543808480.0, + "grad_norm": 2.7827759717959077, + "language_loss": 0.83232653, + "learning_rate": 9.839661197207525e-07, + "loss": 0.85382432, + "num_input_tokens_seen": 243862250, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11541748, + "step": 11302, + "time_per_iteration": 2.657463550567627 + }, + { + "auxiliary_loss_clip": 0.01118672, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.0419805, + "balance_loss_mlp": 1.02130437, + "epoch": 0.6795731249060574, + "flos": 22325121629280.0, + "grad_norm": 1.749508716707796, + "language_loss": 0.69640911, + "learning_rate": 9.83630677305654e-07, + "loss": 0.71792698, + "num_input_tokens_seen": 243880560, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11804199, + "step": 11303, + "time_per_iteration": 4.040615797042847 + }, + { + "auxiliary_loss_clip": 0.01121706, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.04392958, + "balance_loss_mlp": 1.01981497, + "epoch": 0.6796332481587254, + "flos": 24770760033600.0, + "grad_norm": 2.292360372296921, + "language_loss": 0.70320725, + "learning_rate": 9.832952734313813e-07, + "loss": 0.72474229, + "num_input_tokens_seen": 243900635, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.11987305, + "step": 11304, + "time_per_iteration": 2.6036224365234375 + }, + { + "auxiliary_loss_clip": 0.01120739, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.04456091, + "balance_loss_mlp": 1.02178526, + "epoch": 0.6796933714113934, + "flos": 28780913010240.0, + "grad_norm": 1.8967436784566176, + "language_loss": 0.72642028, + "learning_rate": 9.829599081106536e-07, + "loss": 0.74796796, + "num_input_tokens_seen": 243920160, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12243652, + "step": 11305, + "time_per_iteration": 2.6847856044769287 + }, + { + "auxiliary_loss_clip": 0.01117113, + "auxiliary_loss_mlp": 0.01026399, + "balance_loss_clip": 1.04146612, + "balance_loss_mlp": 1.01432884, + "epoch": 0.6797534946640613, + "flos": 33092794235520.0, + "grad_norm": 2.517025363516103, + "language_loss": 0.65643728, + "learning_rate": 9.826245813561882e-07, + "loss": 0.67787242, + "num_input_tokens_seen": 243939015, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.1206665, + "step": 11306, + "time_per_iteration": 2.6384408473968506 + }, + { + "auxiliary_loss_clip": 0.01116431, + "auxiliary_loss_mlp": 0.01029753, + "balance_loss_clip": 1.04159832, + "balance_loss_mlp": 1.01764691, + "epoch": 0.6798136179167293, + "flos": 27000027397440.0, + "grad_norm": 1.9224312237337744, + "language_loss": 0.79903734, + "learning_rate": 9.822892931807021e-07, + "loss": 0.82049912, + "num_input_tokens_seen": 243958470, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12115479, + "step": 11307, + "time_per_iteration": 2.654578924179077 + }, + { + "auxiliary_loss_clip": 0.01118024, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.0432272, + "balance_loss_mlp": 1.02013421, + "epoch": 0.6798737411693972, + "flos": 21339881088480.0, + "grad_norm": 1.722871032434004, + "language_loss": 0.89243913, + "learning_rate": 9.819540435969066e-07, + "loss": 0.91394216, + "num_input_tokens_seen": 243975450, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12139893, + "step": 11308, + "time_per_iteration": 2.586475133895874 + }, + { + "auxiliary_loss_clip": 0.01118533, + "auxiliary_loss_mlp": 0.01036422, + "balance_loss_clip": 1.04197264, + "balance_loss_mlp": 1.02414942, + "epoch": 0.6799338644220653, + "flos": 27933452687520.0, + "grad_norm": 1.9835471746188313, + "language_loss": 0.71334767, + "learning_rate": 9.816188326175154e-07, + "loss": 0.73489726, + "num_input_tokens_seen": 243994355, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12280273, + "step": 11309, + "time_per_iteration": 2.7330033779144287 + }, + { + "auxiliary_loss_clip": 0.0112006, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.04337156, + "balance_loss_mlp": 1.02868414, + "epoch": 0.6799939876747332, + "flos": 28285416012960.0, + "grad_norm": 1.9419563083012101, + "language_loss": 0.84451717, + "learning_rate": 9.812836602552411e-07, + "loss": 0.86612368, + "num_input_tokens_seen": 244011620, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11901855, + "step": 11310, + "time_per_iteration": 2.699157953262329 + }, + { + "auxiliary_loss_clip": 0.01116734, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.04348624, + "balance_loss_mlp": 1.01882541, + "epoch": 0.6800541109274012, + "flos": 23793785017920.0, + "grad_norm": 2.33789782821594, + "language_loss": 0.83105081, + "learning_rate": 9.80948526522792e-07, + "loss": 0.85251582, + "num_input_tokens_seen": 244029925, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.10931396, + "step": 11311, + "time_per_iteration": 2.6287841796875 + }, + { + "auxiliary_loss_clip": 0.01121151, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.04093599, + "balance_loss_mlp": 1.0181632, + "epoch": 0.6801142341800691, + "flos": 27182937515040.0, + "grad_norm": 2.4596420722824885, + "language_loss": 0.7610997, + "learning_rate": 9.806134314328767e-07, + "loss": 0.78262848, + "num_input_tokens_seen": 244051225, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.13568115, + "step": 11312, + "time_per_iteration": 4.089852333068848 + }, + { + "auxiliary_loss_clip": 0.01037549, + "auxiliary_loss_mlp": 0.01002184, + "balance_loss_clip": 1.01390541, + "balance_loss_mlp": 1.00090528, + "epoch": 0.6801743574327371, + "flos": 83846622445920.0, + "grad_norm": 0.6578955233822473, + "language_loss": 0.57222927, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59262657, + "num_input_tokens_seen": 244115930, + "router_z_loss_clip": 0.23669434, + "router_z_loss_mlp": 0.01278687, + "step": 11313, + "time_per_iteration": 4.669026136398315 + }, + { + "auxiliary_loss_clip": 0.01117809, + "auxiliary_loss_mlp": 0.01028749, + "balance_loss_clip": 1.04077148, + "balance_loss_mlp": 1.01661897, + "epoch": 0.680234480685405, + "flos": 35948410359840.0, + "grad_norm": 1.8191620169534408, + "language_loss": 0.68859136, + "learning_rate": 9.799433572314754e-07, + "loss": 0.7100569, + "num_input_tokens_seen": 244137320, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12145996, + "step": 11314, + "time_per_iteration": 2.6786272525787354 + }, + { + "auxiliary_loss_clip": 0.01115688, + "auxiliary_loss_mlp": 0.01030059, + "balance_loss_clip": 1.0405221, + "balance_loss_mlp": 1.01917505, + "epoch": 0.6802946039380731, + "flos": 19422147155040.0, + "grad_norm": 2.2763528339608903, + "language_loss": 0.81477225, + "learning_rate": 9.796083781453972e-07, + "loss": 0.83622968, + "num_input_tokens_seen": 244152755, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.10894775, + "step": 11315, + "time_per_iteration": 2.7880470752716064 + }, + { + "auxiliary_loss_clip": 0.01116617, + "auxiliary_loss_mlp": 0.01031812, + "balance_loss_clip": 1.04095864, + "balance_loss_mlp": 1.02010584, + "epoch": 0.680354727190741, + "flos": 26867230804800.0, + "grad_norm": 1.777643331356982, + "language_loss": 0.70475709, + "learning_rate": 9.792734377526718e-07, + "loss": 0.72624147, + "num_input_tokens_seen": 244171480, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11706543, + "step": 11316, + "time_per_iteration": 2.6375274658203125 + }, + { + "auxiliary_loss_clip": 0.01116134, + "auxiliary_loss_mlp": 0.01028543, + "balance_loss_clip": 1.04176378, + "balance_loss_mlp": 1.01716459, + "epoch": 0.680414850443409, + "flos": 22502237775840.0, + "grad_norm": 2.1302885201422237, + "language_loss": 0.66423666, + "learning_rate": 9.789385360660003e-07, + "loss": 0.68568349, + "num_input_tokens_seen": 244187920, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.1138916, + "step": 11317, + "time_per_iteration": 2.6913840770721436 + }, + { + "auxiliary_loss_clip": 0.01121075, + "auxiliary_loss_mlp": 0.01042181, + "balance_loss_clip": 1.04501224, + "balance_loss_mlp": 1.03045654, + "epoch": 0.680474973696077, + "flos": 32163339638880.0, + "grad_norm": 1.454586372462139, + "language_loss": 0.74995142, + "learning_rate": 9.78603673098082e-07, + "loss": 0.77158403, + "num_input_tokens_seen": 244209565, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11724854, + "step": 11318, + "time_per_iteration": 2.674344539642334 + }, + { + "auxiliary_loss_clip": 0.01113819, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.04029799, + "balance_loss_mlp": 1.01758528, + "epoch": 0.6805350969487449, + "flos": 22369886873280.0, + "grad_norm": 1.6883973485769734, + "language_loss": 0.68052751, + "learning_rate": 9.782688488616143e-07, + "loss": 0.7019521, + "num_input_tokens_seen": 244228015, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.1105957, + "step": 11319, + "time_per_iteration": 2.6909115314483643 + }, + { + "auxiliary_loss_clip": 0.01115557, + "auxiliary_loss_mlp": 0.01039931, + "balance_loss_clip": 1.04052854, + "balance_loss_mlp": 1.02734828, + "epoch": 0.6805952202014129, + "flos": 24327767080800.0, + "grad_norm": 1.903780105152037, + "language_loss": 0.7674638, + "learning_rate": 9.779340633692945e-07, + "loss": 0.78901869, + "num_input_tokens_seen": 244245615, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12585449, + "step": 11320, + "time_per_iteration": 2.7092530727386475 + }, + { + "auxiliary_loss_clip": 0.01115741, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.04045606, + "balance_loss_mlp": 1.0209825, + "epoch": 0.6806553434540809, + "flos": 30779466216480.0, + "grad_norm": 1.7546935448037104, + "language_loss": 0.74719006, + "learning_rate": 9.77599316633817e-07, + "loss": 0.76867735, + "num_input_tokens_seen": 244263625, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11999512, + "step": 11321, + "time_per_iteration": 2.67781662940979 + }, + { + "auxiliary_loss_clip": 0.0111768, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.04184592, + "balance_loss_mlp": 1.02060723, + "epoch": 0.6807154667067489, + "flos": 21701771147520.0, + "grad_norm": 2.3701259421558776, + "language_loss": 0.72456419, + "learning_rate": 9.772646086678758e-07, + "loss": 0.74606556, + "num_input_tokens_seen": 244282745, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11865234, + "step": 11322, + "time_per_iteration": 2.6194114685058594 + }, + { + "auxiliary_loss_clip": 0.01116702, + "auxiliary_loss_mlp": 0.01027802, + "balance_loss_clip": 1.04097223, + "balance_loss_mlp": 1.01533818, + "epoch": 0.6807755899594168, + "flos": 27088504436160.0, + "grad_norm": 1.8755861307534774, + "language_loss": 0.78250474, + "learning_rate": 9.769299394841638e-07, + "loss": 0.80394977, + "num_input_tokens_seen": 244303770, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12451172, + "step": 11323, + "time_per_iteration": 2.647538661956787 + }, + { + "auxiliary_loss_clip": 0.01036609, + "auxiliary_loss_mlp": 0.01001022, + "balance_loss_clip": 1.01304042, + "balance_loss_mlp": 0.99976742, + "epoch": 0.6808357132120848, + "flos": 83745422981280.0, + "grad_norm": 0.7470785155262656, + "language_loss": 0.57103097, + "learning_rate": 9.765953090953714e-07, + "loss": 0.5914073, + "num_input_tokens_seen": 244355910, + "router_z_loss_clip": 0.23583984, + "router_z_loss_mlp": 0.01254272, + "step": 11324, + "time_per_iteration": 3.063577175140381 + }, + { + "auxiliary_loss_clip": 0.01118129, + "auxiliary_loss_mlp": 0.01031717, + "balance_loss_clip": 1.04211271, + "balance_loss_mlp": 1.01951587, + "epoch": 0.6808958364647527, + "flos": 29093824028160.0, + "grad_norm": 2.0694712027392086, + "language_loss": 0.68212891, + "learning_rate": 9.76260717514186e-07, + "loss": 0.70362741, + "num_input_tokens_seen": 244376610, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12200928, + "step": 11325, + "time_per_iteration": 2.6747896671295166 + }, + { + "auxiliary_loss_clip": 0.01121749, + "auxiliary_loss_mlp": 0.01030575, + "balance_loss_clip": 1.04257894, + "balance_loss_mlp": 1.018291, + "epoch": 0.6809559597174207, + "flos": 21611713934880.0, + "grad_norm": 2.4896106064648618, + "language_loss": 0.70059323, + "learning_rate": 9.759261647532974e-07, + "loss": 0.72211647, + "num_input_tokens_seen": 244393000, + "router_z_loss_clip": 0.79101562, + "router_z_loss_mlp": 0.12280273, + "step": 11326, + "time_per_iteration": 2.639349937438965 + }, + { + "auxiliary_loss_clip": 0.01117163, + "auxiliary_loss_mlp": 0.01032313, + "balance_loss_clip": 1.04123354, + "balance_loss_mlp": 1.02019584, + "epoch": 0.6810160829700886, + "flos": 27444803110560.0, + "grad_norm": 1.8553037464037319, + "language_loss": 0.72893471, + "learning_rate": 9.75591650825392e-07, + "loss": 0.75042951, + "num_input_tokens_seen": 244409515, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12115479, + "step": 11327, + "time_per_iteration": 2.611401319503784 + }, + { + "auxiliary_loss_clip": 0.0111459, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.04057348, + "balance_loss_mlp": 1.01787746, + "epoch": 0.6810762062227567, + "flos": 20539576529280.0, + "grad_norm": 2.024635081353341, + "language_loss": 0.77442575, + "learning_rate": 9.752571757431526e-07, + "loss": 0.7958703, + "num_input_tokens_seen": 244427165, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11993408, + "step": 11328, + "time_per_iteration": 2.6546525955200195 + }, + { + "auxiliary_loss_clip": 0.01117707, + "auxiliary_loss_mlp": 0.01026958, + "balance_loss_clip": 1.04235077, + "balance_loss_mlp": 1.01543021, + "epoch": 0.6811363294754246, + "flos": 15513801402240.0, + "grad_norm": 1.9201168370205588, + "language_loss": 0.6451751, + "learning_rate": 9.74922739519265e-07, + "loss": 0.66662174, + "num_input_tokens_seen": 244445705, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11523438, + "step": 11329, + "time_per_iteration": 2.7000248432159424 + }, + { + "auxiliary_loss_clip": 0.01119441, + "auxiliary_loss_mlp": 0.01028328, + "balance_loss_clip": 1.04286778, + "balance_loss_mlp": 1.01604927, + "epoch": 0.6811964527280926, + "flos": 21611997555840.0, + "grad_norm": 1.9648918962048376, + "language_loss": 0.79050076, + "learning_rate": 9.745883421664096e-07, + "loss": 0.81197846, + "num_input_tokens_seen": 244460415, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12286377, + "step": 11330, + "time_per_iteration": 2.6234419345855713 + }, + { + "auxiliary_loss_clip": 0.01118359, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.04263854, + "balance_loss_mlp": 1.01739335, + "epoch": 0.6812565759807605, + "flos": 30338782748640.0, + "grad_norm": 33.58608652592222, + "language_loss": 0.63749242, + "learning_rate": 9.742539836972665e-07, + "loss": 0.65897089, + "num_input_tokens_seen": 244480555, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12097168, + "step": 11331, + "time_per_iteration": 2.6911747455596924 + }, + { + "auxiliary_loss_clip": 0.01118631, + "auxiliary_loss_mlp": 0.01031434, + "balance_loss_clip": 1.04306436, + "balance_loss_mlp": 1.01890481, + "epoch": 0.6813166992334285, + "flos": 20945664590400.0, + "grad_norm": 1.6820319745112342, + "language_loss": 0.72814739, + "learning_rate": 9.739196641245148e-07, + "loss": 0.74964803, + "num_input_tokens_seen": 244498540, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12518311, + "step": 11332, + "time_per_iteration": 2.6434853076934814 + }, + { + "auxiliary_loss_clip": 0.01120143, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.04217219, + "balance_loss_mlp": 1.0204196, + "epoch": 0.6813768224860965, + "flos": 22992791664960.0, + "grad_norm": 2.541381892888783, + "language_loss": 0.74739945, + "learning_rate": 9.735853834608326e-07, + "loss": 0.76892817, + "num_input_tokens_seen": 244517015, + "router_z_loss_clip": 0.78027344, + "router_z_loss_mlp": 0.12298584, + "step": 11333, + "time_per_iteration": 2.599519729614258 + }, + { + "auxiliary_loss_clip": 0.011219, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.04387105, + "balance_loss_mlp": 1.01840961, + "epoch": 0.6814369457387645, + "flos": 29935409345280.0, + "grad_norm": 1.5537897934332046, + "language_loss": 0.7192533, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74077791, + "num_input_tokens_seen": 244537450, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12158203, + "step": 11334, + "time_per_iteration": 2.685589075088501 + }, + { + "auxiliary_loss_clip": 0.01116796, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.04248488, + "balance_loss_mlp": 1.02216482, + "epoch": 0.6814970689914325, + "flos": 22235550624000.0, + "grad_norm": 1.649763454625303, + "language_loss": 0.85680646, + "learning_rate": 9.729169389113791e-07, + "loss": 0.87831295, + "num_input_tokens_seen": 244555640, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11688232, + "step": 11335, + "time_per_iteration": 2.62638521194458 + }, + { + "auxiliary_loss_clip": 0.01112073, + "auxiliary_loss_mlp": 0.010295, + "balance_loss_clip": 1.04119635, + "balance_loss_mlp": 1.018718, + "epoch": 0.6815571922441004, + "flos": 30784530876480.0, + "grad_norm": 1.6712498001432607, + "language_loss": 0.81837213, + "learning_rate": 9.725827750509542e-07, + "loss": 0.83978784, + "num_input_tokens_seen": 244574005, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10791016, + "step": 11336, + "time_per_iteration": 2.7071571350097656 + }, + { + "auxiliary_loss_clip": 0.01113079, + "auxiliary_loss_mlp": 0.01031132, + "balance_loss_clip": 1.04104304, + "balance_loss_mlp": 1.01985478, + "epoch": 0.6816173154967684, + "flos": 23740956835200.0, + "grad_norm": 1.8809932005367334, + "language_loss": 0.81351864, + "learning_rate": 9.72248650150294e-07, + "loss": 0.83496076, + "num_input_tokens_seen": 244591395, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11279297, + "step": 11337, + "time_per_iteration": 2.660698175430298 + }, + { + "auxiliary_loss_clip": 0.01114209, + "auxiliary_loss_mlp": 0.01030544, + "balance_loss_clip": 1.04141283, + "balance_loss_mlp": 1.01938581, + "epoch": 0.6816774387494363, + "flos": 21879778674240.0, + "grad_norm": 1.7232504075256572, + "language_loss": 0.72679138, + "learning_rate": 9.719145642220673e-07, + "loss": 0.74823892, + "num_input_tokens_seen": 244610400, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11151123, + "step": 11338, + "time_per_iteration": 2.6267266273498535 + }, + { + "auxiliary_loss_clip": 0.01117272, + "auxiliary_loss_mlp": 0.01033968, + "balance_loss_clip": 1.04249477, + "balance_loss_mlp": 1.02174354, + "epoch": 0.6817375620021043, + "flos": 27129217952160.0, + "grad_norm": 1.436006781054529, + "language_loss": 0.7786237, + "learning_rate": 9.715805172789435e-07, + "loss": 0.80013609, + "num_input_tokens_seen": 244630400, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12213135, + "step": 11339, + "time_per_iteration": 2.6413116455078125 + }, + { + "auxiliary_loss_clip": 0.01117402, + "auxiliary_loss_mlp": 0.01034906, + "balance_loss_clip": 1.04230332, + "balance_loss_mlp": 1.02287769, + "epoch": 0.6817976852547722, + "flos": 30962254782240.0, + "grad_norm": 1.994975474000205, + "language_loss": 0.70423824, + "learning_rate": 9.712465093335901e-07, + "loss": 0.72576129, + "num_input_tokens_seen": 244649155, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12030029, + "step": 11340, + "time_per_iteration": 4.155214548110962 + }, + { + "auxiliary_loss_clip": 0.01122079, + "auxiliary_loss_mlp": 0.0103543, + "balance_loss_clip": 1.04380536, + "balance_loss_mlp": 1.02392066, + "epoch": 0.6818578085074403, + "flos": 27169526295360.0, + "grad_norm": 2.500919111958517, + "language_loss": 0.8373816, + "learning_rate": 9.709125403986722e-07, + "loss": 0.85895663, + "num_input_tokens_seen": 244665470, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.11523438, + "step": 11341, + "time_per_iteration": 2.627960443496704 + }, + { + "auxiliary_loss_clip": 0.01120733, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.04326403, + "balance_loss_mlp": 1.02128863, + "epoch": 0.6819179317601082, + "flos": 23571903627360.0, + "grad_norm": 1.7040632109850715, + "language_loss": 0.6799711, + "learning_rate": 9.705786104868531e-07, + "loss": 0.70152223, + "num_input_tokens_seen": 244684390, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.13098145, + "step": 11342, + "time_per_iteration": 2.6348695755004883 + }, + { + "auxiliary_loss_clip": 0.01114571, + "auxiliary_loss_mlp": 0.01028252, + "balance_loss_clip": 1.04018044, + "balance_loss_mlp": 1.01617026, + "epoch": 0.6819780550127762, + "flos": 25751462639040.0, + "grad_norm": 1.6983434818643413, + "language_loss": 0.74852461, + "learning_rate": 9.702447196107963e-07, + "loss": 0.76995277, + "num_input_tokens_seen": 244703370, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12091064, + "step": 11343, + "time_per_iteration": 3.9119811058044434 + }, + { + "auxiliary_loss_clip": 0.01120344, + "auxiliary_loss_mlp": 0.01033725, + "balance_loss_clip": 1.0449481, + "balance_loss_mlp": 1.0218935, + "epoch": 0.6820381782654441, + "flos": 36260673101280.0, + "grad_norm": 2.167046553537311, + "language_loss": 0.79490471, + "learning_rate": 9.699108677831639e-07, + "loss": 0.81644535, + "num_input_tokens_seen": 244723325, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11834717, + "step": 11344, + "time_per_iteration": 2.698660373687744 + }, + { + "auxiliary_loss_clip": 0.01117337, + "auxiliary_loss_mlp": 0.01031725, + "balance_loss_clip": 1.04152942, + "balance_loss_mlp": 1.01984596, + "epoch": 0.6820983015181121, + "flos": 35681925794400.0, + "grad_norm": 3.6807285316667735, + "language_loss": 0.66272306, + "learning_rate": 9.695770550166136e-07, + "loss": 0.68421364, + "num_input_tokens_seen": 244745650, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11871338, + "step": 11345, + "time_per_iteration": 2.6731863021850586 + }, + { + "auxiliary_loss_clip": 0.01121092, + "auxiliary_loss_mlp": 0.01032248, + "balance_loss_clip": 1.04337716, + "balance_loss_mlp": 1.01997519, + "epoch": 0.6821584247707801, + "flos": 23027994830880.0, + "grad_norm": 2.7726189860424917, + "language_loss": 0.64878249, + "learning_rate": 9.692432813238054e-07, + "loss": 0.67031586, + "num_input_tokens_seen": 244760270, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12280273, + "step": 11346, + "time_per_iteration": 2.590502977371216 + }, + { + "auxiliary_loss_clip": 0.01117502, + "auxiliary_loss_mlp": 0.01030809, + "balance_loss_clip": 1.04136205, + "balance_loss_mlp": 1.01835704, + "epoch": 0.6822185480234481, + "flos": 26020337724000.0, + "grad_norm": 1.6109494116518586, + "language_loss": 0.78597057, + "learning_rate": 9.689095467173952e-07, + "loss": 0.80745363, + "num_input_tokens_seen": 244779565, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12469482, + "step": 11347, + "time_per_iteration": 2.60721755027771 + }, + { + "auxiliary_loss_clip": 0.01035326, + "auxiliary_loss_mlp": 0.01002113, + "balance_loss_clip": 1.01190925, + "balance_loss_mlp": 1.00090551, + "epoch": 0.6822786712761161, + "flos": 77469664991040.0, + "grad_norm": 0.7203257751425013, + "language_loss": 0.52471125, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54508561, + "num_input_tokens_seen": 244838480, + "router_z_loss_clip": 0.23425293, + "router_z_loss_mlp": 0.01206207, + "step": 11348, + "time_per_iteration": 3.232893943786621 + }, + { + "auxiliary_loss_clip": 0.01114187, + "auxiliary_loss_mlp": 0.01032761, + "balance_loss_clip": 1.04069448, + "balance_loss_mlp": 1.0215497, + "epoch": 0.682338794528784, + "flos": 25708399120800.0, + "grad_norm": 2.0764658338657966, + "language_loss": 0.79675889, + "learning_rate": 9.682421948143873e-07, + "loss": 0.81822836, + "num_input_tokens_seen": 244855265, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11206055, + "step": 11349, + "time_per_iteration": 2.6510465145111084 + }, + { + "auxiliary_loss_clip": 0.0112305, + "auxiliary_loss_mlp": 0.01024167, + "balance_loss_clip": 1.04234147, + "balance_loss_mlp": 1.01048768, + "epoch": 0.682398917781452, + "flos": 44274091116960.0, + "grad_norm": 1.8040518383333752, + "language_loss": 0.73754835, + "learning_rate": 9.67908577543096e-07, + "loss": 0.75902051, + "num_input_tokens_seen": 244875555, + "router_z_loss_clip": 0.80712891, + "router_z_loss_mlp": 0.13690186, + "step": 11350, + "time_per_iteration": 2.744630813598633 + }, + { + "auxiliary_loss_clip": 0.01115955, + "auxiliary_loss_mlp": 0.01026824, + "balance_loss_clip": 1.04219115, + "balance_loss_mlp": 1.01432467, + "epoch": 0.6824590410341199, + "flos": 30333961192320.0, + "grad_norm": 1.5243125865449068, + "language_loss": 0.79319787, + "learning_rate": 9.675749994088161e-07, + "loss": 0.81462562, + "num_input_tokens_seen": 244895270, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.125, + "step": 11351, + "time_per_iteration": 4.093381881713867 + }, + { + "auxiliary_loss_clip": 0.01116792, + "auxiliary_loss_mlp": 0.01030009, + "balance_loss_clip": 1.0413307, + "balance_loss_mlp": 1.01833284, + "epoch": 0.6825191642867879, + "flos": 27396877518720.0, + "grad_norm": 1.667433065761816, + "language_loss": 0.73016137, + "learning_rate": 9.672414604241954e-07, + "loss": 0.75162935, + "num_input_tokens_seen": 244914535, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11682129, + "step": 11352, + "time_per_iteration": 2.6601319313049316 + }, + { + "auxiliary_loss_clip": 0.01119286, + "auxiliary_loss_mlp": 0.01033874, + "balance_loss_clip": 1.04219723, + "balance_loss_mlp": 1.02077246, + "epoch": 0.6825792875394558, + "flos": 35904333909600.0, + "grad_norm": 1.4977212167240084, + "language_loss": 0.80097866, + "learning_rate": 9.669079606018814e-07, + "loss": 0.82251024, + "num_input_tokens_seen": 244936095, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.13110352, + "step": 11353, + "time_per_iteration": 4.189020156860352 + }, + { + "auxiliary_loss_clip": 0.01117693, + "auxiliary_loss_mlp": 0.01024056, + "balance_loss_clip": 1.04266369, + "balance_loss_mlp": 1.01236773, + "epoch": 0.6826394107921239, + "flos": 22904193074400.0, + "grad_norm": 1.9166520555907056, + "language_loss": 0.7830053, + "learning_rate": 9.665744999545218e-07, + "loss": 0.8044228, + "num_input_tokens_seen": 244955290, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11700439, + "step": 11354, + "time_per_iteration": 2.67802095413208 + }, + { + "auxiliary_loss_clip": 0.01114624, + "auxiliary_loss_mlp": 0.01026983, + "balance_loss_clip": 1.04130411, + "balance_loss_mlp": 1.01550364, + "epoch": 0.6826995340447918, + "flos": 20276535932640.0, + "grad_norm": 2.00643555906443, + "language_loss": 0.62070686, + "learning_rate": 9.662410784947599e-07, + "loss": 0.64212298, + "num_input_tokens_seen": 244972935, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11480713, + "step": 11355, + "time_per_iteration": 2.6066319942474365 + }, + { + "auxiliary_loss_clip": 0.01115547, + "auxiliary_loss_mlp": 0.01025556, + "balance_loss_clip": 1.04062319, + "balance_loss_mlp": 1.01408243, + "epoch": 0.6827596572974598, + "flos": 25439321449440.0, + "grad_norm": 2.244854361286307, + "language_loss": 0.81939399, + "learning_rate": 9.659076962352398e-07, + "loss": 0.84080499, + "num_input_tokens_seen": 244989440, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11480713, + "step": 11356, + "time_per_iteration": 2.622572422027588 + }, + { + "auxiliary_loss_clip": 0.01119528, + "auxiliary_loss_mlp": 0.01030724, + "balance_loss_clip": 1.04356647, + "balance_loss_mlp": 1.01905334, + "epoch": 0.6828197805501277, + "flos": 27757349472960.0, + "grad_norm": 2.100484029592945, + "language_loss": 0.78942466, + "learning_rate": 9.655743531886052e-07, + "loss": 0.81092715, + "num_input_tokens_seen": 245007830, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11676025, + "step": 11357, + "time_per_iteration": 2.635382890701294 + }, + { + "auxiliary_loss_clip": 0.0103515, + "auxiliary_loss_mlp": 0.01003343, + "balance_loss_clip": 1.01176798, + "balance_loss_mlp": 1.00231361, + "epoch": 0.6828799038027957, + "flos": 87422979892320.0, + "grad_norm": 0.8205087462600233, + "language_loss": 0.59657979, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61696482, + "num_input_tokens_seen": 245070720, + "router_z_loss_clip": 0.23376465, + "router_z_loss_mlp": 0.01029205, + "step": 11358, + "time_per_iteration": 3.3285043239593506 + }, + { + "auxiliary_loss_clip": 0.0112308, + "auxiliary_loss_mlp": 0.01034491, + "balance_loss_clip": 1.04396522, + "balance_loss_mlp": 1.0216459, + "epoch": 0.6829400270554637, + "flos": 24195456695520.0, + "grad_norm": 1.853714704664582, + "language_loss": 0.78241652, + "learning_rate": 9.64907784784544e-07, + "loss": 0.80399221, + "num_input_tokens_seen": 245089070, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12854004, + "step": 11359, + "time_per_iteration": 2.6176676750183105 + }, + { + "auxiliary_loss_clip": 0.01116679, + "auxiliary_loss_mlp": 0.01032294, + "balance_loss_clip": 1.04163647, + "balance_loss_mlp": 1.02009845, + "epoch": 0.6830001503081317, + "flos": 26820966421440.0, + "grad_norm": 1.9243500586925233, + "language_loss": 0.81498814, + "learning_rate": 9.645745594523958e-07, + "loss": 0.83647788, + "num_input_tokens_seen": 245106500, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12188721, + "step": 11360, + "time_per_iteration": 2.6595537662506104 + }, + { + "auxiliary_loss_clip": 0.01121077, + "auxiliary_loss_mlp": 0.01028975, + "balance_loss_clip": 1.04455638, + "balance_loss_mlp": 1.01697659, + "epoch": 0.6830602735607997, + "flos": 29671598920320.0, + "grad_norm": 2.072819646848577, + "language_loss": 0.75248313, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77398372, + "num_input_tokens_seen": 245125260, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12005615, + "step": 11361, + "time_per_iteration": 2.815425395965576 + }, + { + "auxiliary_loss_clip": 0.01033611, + "auxiliary_loss_mlp": 0.01002924, + "balance_loss_clip": 1.01033688, + "balance_loss_mlp": 1.00180578, + "epoch": 0.6831203968134676, + "flos": 70393926579840.0, + "grad_norm": 0.8949223235250661, + "language_loss": 0.59606612, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61643147, + "num_input_tokens_seen": 245188730, + "router_z_loss_clip": 0.23303223, + "router_z_loss_mlp": 0.01118469, + "step": 11362, + "time_per_iteration": 3.365798234939575 + }, + { + "auxiliary_loss_clip": 0.01118438, + "auxiliary_loss_mlp": 0.01028034, + "balance_loss_clip": 1.04126787, + "balance_loss_mlp": 1.01517117, + "epoch": 0.6831805200661356, + "flos": 17556674162400.0, + "grad_norm": 2.8286953978283766, + "language_loss": 0.75461304, + "learning_rate": 9.635751190871074e-07, + "loss": 0.77607781, + "num_input_tokens_seen": 245205065, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12860107, + "step": 11363, + "time_per_iteration": 2.638418674468994 + }, + { + "auxiliary_loss_clip": 0.01116083, + "auxiliary_loss_mlp": 0.01036048, + "balance_loss_clip": 1.04153943, + "balance_loss_mlp": 1.02385259, + "epoch": 0.6832406433188035, + "flos": 27845542890720.0, + "grad_norm": 2.240639978294842, + "language_loss": 0.89550579, + "learning_rate": 9.632420508845063e-07, + "loss": 0.91702712, + "num_input_tokens_seen": 245224265, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12194824, + "step": 11364, + "time_per_iteration": 2.6402878761291504 + }, + { + "auxiliary_loss_clip": 0.01116006, + "auxiliary_loss_mlp": 0.01029031, + "balance_loss_clip": 1.04255748, + "balance_loss_mlp": 1.01799846, + "epoch": 0.6833007665714715, + "flos": 21429046920960.0, + "grad_norm": 2.3469718950487355, + "language_loss": 0.88383245, + "learning_rate": 9.629090219958697e-07, + "loss": 0.90528274, + "num_input_tokens_seen": 245243360, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.1104126, + "step": 11365, + "time_per_iteration": 2.6206939220428467 + }, + { + "auxiliary_loss_clip": 0.01123607, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.04465997, + "balance_loss_mlp": 1.02122259, + "epoch": 0.6833608898241395, + "flos": 27389584408320.0, + "grad_norm": 2.3645296939943408, + "language_loss": 0.81089872, + "learning_rate": 9.625760324338272e-07, + "loss": 0.83246988, + "num_input_tokens_seen": 245256350, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.12304688, + "step": 11366, + "time_per_iteration": 2.6246330738067627 + }, + { + "auxiliary_loss_clip": 0.01118331, + "auxiliary_loss_mlp": 0.01029831, + "balance_loss_clip": 1.04123616, + "balance_loss_mlp": 1.01748085, + "epoch": 0.6834210130768075, + "flos": 29937556761120.0, + "grad_norm": 1.484551645782622, + "language_loss": 0.76588488, + "learning_rate": 9.622430822110062e-07, + "loss": 0.78736651, + "num_input_tokens_seen": 245277575, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12353516, + "step": 11367, + "time_per_iteration": 2.668044328689575 + }, + { + "auxiliary_loss_clip": 0.01119346, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.04308248, + "balance_loss_mlp": 1.02297115, + "epoch": 0.6834811363294754, + "flos": 24461536088160.0, + "grad_norm": 2.329907903155012, + "language_loss": 0.69339234, + "learning_rate": 9.619101713400312e-07, + "loss": 0.71493733, + "num_input_tokens_seen": 245296615, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12182617, + "step": 11368, + "time_per_iteration": 2.634514570236206 + }, + { + "auxiliary_loss_clip": 0.01115415, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.04059064, + "balance_loss_mlp": 1.01732731, + "epoch": 0.6835412595821434, + "flos": 30250022088960.0, + "grad_norm": 1.6713675163121287, + "language_loss": 0.73251462, + "learning_rate": 9.615772998335261e-07, + "loss": 0.75395715, + "num_input_tokens_seen": 245316275, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.1151123, + "step": 11369, + "time_per_iteration": 2.6373119354248047 + }, + { + "auxiliary_loss_clip": 0.01115863, + "auxiliary_loss_mlp": 0.01028538, + "balance_loss_clip": 1.03996623, + "balance_loss_mlp": 1.0168258, + "epoch": 0.6836013828348113, + "flos": 23794271225280.0, + "grad_norm": 2.1025637850271965, + "language_loss": 0.78499556, + "learning_rate": 9.612444677041138e-07, + "loss": 0.80643952, + "num_input_tokens_seen": 245334595, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11724854, + "step": 11370, + "time_per_iteration": 2.5954840183258057 + }, + { + "auxiliary_loss_clip": 0.01032651, + "auxiliary_loss_mlp": 0.01001654, + "balance_loss_clip": 1.0093025, + "balance_loss_mlp": 1.00056028, + "epoch": 0.6836615060874793, + "flos": 71214732882720.0, + "grad_norm": 0.7481049852034605, + "language_loss": 0.59825623, + "learning_rate": 9.609116749644162e-07, + "loss": 0.6185993, + "num_input_tokens_seen": 245389750, + "router_z_loss_clip": 0.23352051, + "router_z_loss_mlp": 0.01094055, + "step": 11371, + "time_per_iteration": 3.135230302810669 + }, + { + "auxiliary_loss_clip": 0.01113144, + "auxiliary_loss_mlp": 0.01027153, + "balance_loss_clip": 1.04120302, + "balance_loss_mlp": 1.01602495, + "epoch": 0.6837216293401474, + "flos": 14845402055520.0, + "grad_norm": 1.802022827393426, + "language_loss": 0.63618475, + "learning_rate": 9.605789216270511e-07, + "loss": 0.65758771, + "num_input_tokens_seen": 245407530, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11126709, + "step": 11372, + "time_per_iteration": 2.6266884803771973 + }, + { + "auxiliary_loss_clip": 0.01114444, + "auxiliary_loss_mlp": 0.01025038, + "balance_loss_clip": 1.04049945, + "balance_loss_mlp": 1.01332521, + "epoch": 0.6837817525928153, + "flos": 27000189466560.0, + "grad_norm": 1.5393566633328342, + "language_loss": 0.71839535, + "learning_rate": 9.602462077046375e-07, + "loss": 0.7397902, + "num_input_tokens_seen": 245427000, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11706543, + "step": 11373, + "time_per_iteration": 2.6321003437042236 + }, + { + "auxiliary_loss_clip": 0.01032614, + "auxiliary_loss_mlp": 0.01001353, + "balance_loss_clip": 1.0092442, + "balance_loss_mlp": 1.00026035, + "epoch": 0.6838418758454833, + "flos": 79319585698560.0, + "grad_norm": 1.1977597222490877, + "language_loss": 0.56703317, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58737278, + "num_input_tokens_seen": 245491620, + "router_z_loss_clip": 0.23376465, + "router_z_loss_mlp": 0.01094055, + "step": 11374, + "time_per_iteration": 3.4467668533325195 + }, + { + "auxiliary_loss_clip": 0.01119034, + "auxiliary_loss_mlp": 0.01027532, + "balance_loss_clip": 1.04341006, + "balance_loss_mlp": 1.01576602, + "epoch": 0.6839019990981512, + "flos": 25662053702880.0, + "grad_norm": 1.6179778906073745, + "language_loss": 0.73758042, + "learning_rate": 9.595808981551312e-07, + "loss": 0.75904608, + "num_input_tokens_seen": 245511285, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11761475, + "step": 11375, + "time_per_iteration": 2.7863004207611084 + }, + { + "auxiliary_loss_clip": 0.0111369, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.03952277, + "balance_loss_mlp": 1.0200274, + "epoch": 0.6839621223508192, + "flos": 30426814097280.0, + "grad_norm": 3.264572934772481, + "language_loss": 0.70401168, + "learning_rate": 9.592483025532651e-07, + "loss": 0.72546333, + "num_input_tokens_seen": 245532910, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11456299, + "step": 11376, + "time_per_iteration": 2.7085120677948 + }, + { + "auxiliary_loss_clip": 0.01117479, + "auxiliary_loss_mlp": 0.01031995, + "balance_loss_clip": 1.04078197, + "balance_loss_mlp": 1.0201335, + "epoch": 0.6840222456034871, + "flos": 32163582742560.0, + "grad_norm": 1.8010275375622617, + "language_loss": 0.74564803, + "learning_rate": 9.58915746416808e-07, + "loss": 0.76714271, + "num_input_tokens_seen": 245550540, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11865234, + "step": 11377, + "time_per_iteration": 2.668550491333008 + }, + { + "auxiliary_loss_clip": 0.01032592, + "auxiliary_loss_mlp": 0.01000621, + "balance_loss_clip": 1.00921655, + "balance_loss_mlp": 0.99958694, + "epoch": 0.6840823688561551, + "flos": 80518644691200.0, + "grad_norm": 0.7194281374679111, + "language_loss": 0.56799185, + "learning_rate": 9.585832297583707e-07, + "loss": 0.58832395, + "num_input_tokens_seen": 245619570, + "router_z_loss_clip": 0.23400879, + "router_z_loss_mlp": 0.01034546, + "step": 11378, + "time_per_iteration": 3.375122308731079 + }, + { + "auxiliary_loss_clip": 0.01114273, + "auxiliary_loss_mlp": 0.01029083, + "balance_loss_clip": 1.03910911, + "balance_loss_mlp": 1.01737714, + "epoch": 0.684142492108823, + "flos": 26109422521920.0, + "grad_norm": 7.861765367199408, + "language_loss": 0.78213799, + "learning_rate": 9.58250752590561e-07, + "loss": 0.80357152, + "num_input_tokens_seen": 245637980, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11700439, + "step": 11379, + "time_per_iteration": 2.6700997352600098 + }, + { + "auxiliary_loss_clip": 0.01111739, + "auxiliary_loss_mlp": 0.01026212, + "balance_loss_clip": 1.04215074, + "balance_loss_mlp": 1.01566219, + "epoch": 0.6842026153614911, + "flos": 22941908311680.0, + "grad_norm": 2.064542587074884, + "language_loss": 0.68858677, + "learning_rate": 9.57918314925988e-07, + "loss": 0.7099663, + "num_input_tokens_seen": 245655690, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10552979, + "step": 11380, + "time_per_iteration": 4.184069395065308 + }, + { + "auxiliary_loss_clip": 0.01114326, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.03970575, + "balance_loss_mlp": 1.01810968, + "epoch": 0.684262738614159, + "flos": 23972481338400.0, + "grad_norm": 1.9854579615692658, + "language_loss": 0.78178644, + "learning_rate": 9.575859167772568e-07, + "loss": 0.80323088, + "num_input_tokens_seen": 245671525, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12017822, + "step": 11381, + "time_per_iteration": 2.6045804023742676 + }, + { + "auxiliary_loss_clip": 0.01032348, + "auxiliary_loss_mlp": 0.01002387, + "balance_loss_clip": 1.00902081, + "balance_loss_mlp": 1.00130546, + "epoch": 0.684322861866827, + "flos": 76086318293280.0, + "grad_norm": 0.8684112360550512, + "language_loss": 0.67165852, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69200587, + "num_input_tokens_seen": 245724115, + "router_z_loss_clip": 0.2331543, + "router_z_loss_mlp": 0.01081848, + "step": 11382, + "time_per_iteration": 4.5601818561553955 + }, + { + "auxiliary_loss_clip": 0.01032558, + "auxiliary_loss_mlp": 0.01002106, + "balance_loss_clip": 1.00931609, + "balance_loss_mlp": 1.00097752, + "epoch": 0.6843829851194949, + "flos": 80295061574880.0, + "grad_norm": 0.8391389047523863, + "language_loss": 0.58053827, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60088497, + "num_input_tokens_seen": 245789245, + "router_z_loss_clip": 0.23254395, + "router_z_loss_mlp": 0.01128387, + "step": 11383, + "time_per_iteration": 3.300856351852417 + }, + { + "auxiliary_loss_clip": 0.01114339, + "auxiliary_loss_mlp": 0.0102688, + "balance_loss_clip": 1.03992486, + "balance_loss_mlp": 1.01575232, + "epoch": 0.6844431083721629, + "flos": 33989476703040.0, + "grad_norm": 1.725173362451117, + "language_loss": 0.79875386, + "learning_rate": 9.565889595521517e-07, + "loss": 0.82016605, + "num_input_tokens_seen": 245812420, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11126709, + "step": 11384, + "time_per_iteration": 2.700589179992676 + }, + { + "auxiliary_loss_clip": 0.01115933, + "auxiliary_loss_mlp": 0.01033746, + "balance_loss_clip": 1.03907609, + "balance_loss_mlp": 1.02201605, + "epoch": 0.684503231624831, + "flos": 22275048621600.0, + "grad_norm": 1.9309129900480855, + "language_loss": 0.77153707, + "learning_rate": 9.562567195928187e-07, + "loss": 0.79303384, + "num_input_tokens_seen": 245829135, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11737061, + "step": 11385, + "time_per_iteration": 2.6059389114379883 + }, + { + "auxiliary_loss_clip": 0.01121058, + "auxiliary_loss_mlp": 0.01036217, + "balance_loss_clip": 1.04134202, + "balance_loss_mlp": 1.02343154, + "epoch": 0.6845633548774989, + "flos": 21523723103520.0, + "grad_norm": 2.5363283588046066, + "language_loss": 0.84627491, + "learning_rate": 9.55924519212335e-07, + "loss": 0.86784756, + "num_input_tokens_seen": 245847140, + "router_z_loss_clip": 0.79736328, + "router_z_loss_mlp": 0.12786865, + "step": 11386, + "time_per_iteration": 2.6012487411499023 + }, + { + "auxiliary_loss_clip": 0.01116276, + "auxiliary_loss_mlp": 0.01032749, + "balance_loss_clip": 1.04175973, + "balance_loss_mlp": 1.02198994, + "epoch": 0.6846234781301669, + "flos": 25390018270080.0, + "grad_norm": 2.255751648476092, + "language_loss": 0.83548832, + "learning_rate": 9.555923584232984e-07, + "loss": 0.8569786, + "num_input_tokens_seen": 245862855, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.10760498, + "step": 11387, + "time_per_iteration": 2.634169816970825 + }, + { + "auxiliary_loss_clip": 0.0111309, + "auxiliary_loss_mlp": 0.01026213, + "balance_loss_clip": 1.03942156, + "balance_loss_mlp": 1.01484048, + "epoch": 0.6846836013828348, + "flos": 44051885588160.0, + "grad_norm": 1.595842931239859, + "language_loss": 0.72255743, + "learning_rate": 9.552602372383047e-07, + "loss": 0.74395043, + "num_input_tokens_seen": 245885415, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11376953, + "step": 11388, + "time_per_iteration": 2.7490127086639404 + }, + { + "auxiliary_loss_clip": 0.01112674, + "auxiliary_loss_mlp": 0.01024476, + "balance_loss_clip": 1.03988564, + "balance_loss_mlp": 1.01328182, + "epoch": 0.6847437246355028, + "flos": 52644253497120.0, + "grad_norm": 1.858885761966365, + "language_loss": 0.6269685, + "learning_rate": 9.549281556699469e-07, + "loss": 0.64833999, + "num_input_tokens_seen": 245906285, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11199951, + "step": 11389, + "time_per_iteration": 2.8240349292755127 + }, + { + "auxiliary_loss_clip": 0.01031839, + "auxiliary_loss_mlp": 0.01002407, + "balance_loss_clip": 1.0086019, + "balance_loss_mlp": 1.00135612, + "epoch": 0.6848038478881707, + "flos": 87445703736000.0, + "grad_norm": 0.7278966463057681, + "language_loss": 0.56022382, + "learning_rate": 9.54596113730818e-07, + "loss": 0.58056629, + "num_input_tokens_seen": 245967620, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.01051331, + "step": 11390, + "time_per_iteration": 3.3365719318389893 + }, + { + "auxiliary_loss_clip": 0.01115978, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.04157233, + "balance_loss_mlp": 1.01992083, + "epoch": 0.6848639711408387, + "flos": 24328131736320.0, + "grad_norm": 1.91673848065148, + "language_loss": 0.87999535, + "learning_rate": 9.542641114335109e-07, + "loss": 0.90146899, + "num_input_tokens_seen": 245985075, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11462402, + "step": 11391, + "time_per_iteration": 4.1025025844573975 + }, + { + "auxiliary_loss_clip": 0.01118429, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.04253125, + "balance_loss_mlp": 1.02097189, + "epoch": 0.6849240943935067, + "flos": 32784502187520.0, + "grad_norm": 1.5793040077149705, + "language_loss": 0.78902507, + "learning_rate": 9.539321487906117e-07, + "loss": 0.81053114, + "num_input_tokens_seen": 246003560, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11206055, + "step": 11392, + "time_per_iteration": 4.0390520095825195 + }, + { + "auxiliary_loss_clip": 0.01113012, + "auxiliary_loss_mlp": 0.01027144, + "balance_loss_clip": 1.04085863, + "balance_loss_mlp": 1.015885, + "epoch": 0.6849842176461747, + "flos": 16759408399200.0, + "grad_norm": 2.3426355748568417, + "language_loss": 0.71265739, + "learning_rate": 9.536002258147104e-07, + "loss": 0.73405898, + "num_input_tokens_seen": 246019600, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11260986, + "step": 11393, + "time_per_iteration": 2.666780710220337 + }, + { + "auxiliary_loss_clip": 0.01117364, + "auxiliary_loss_mlp": 0.01031108, + "balance_loss_clip": 1.04030609, + "balance_loss_mlp": 1.01850164, + "epoch": 0.6850443408988426, + "flos": 30472511238720.0, + "grad_norm": 1.7799130910872285, + "language_loss": 0.64625359, + "learning_rate": 9.532683425183936e-07, + "loss": 0.66773832, + "num_input_tokens_seen": 246038920, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12609863, + "step": 11394, + "time_per_iteration": 2.7056713104248047 + }, + { + "auxiliary_loss_clip": 0.011167, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.04126453, + "balance_loss_mlp": 1.01935542, + "epoch": 0.6851044641515106, + "flos": 33854249073600.0, + "grad_norm": 1.7468888397734623, + "language_loss": 0.80455947, + "learning_rate": 9.529364989142468e-07, + "loss": 0.82603508, + "num_input_tokens_seen": 246060490, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11505127, + "step": 11395, + "time_per_iteration": 2.703622341156006 + }, + { + "auxiliary_loss_clip": 0.01116439, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.04216003, + "balance_loss_mlp": 1.01885426, + "epoch": 0.6851645874041785, + "flos": 29713649506560.0, + "grad_norm": 2.7302145342272865, + "language_loss": 0.73196381, + "learning_rate": 9.526046950148527e-07, + "loss": 0.75343978, + "num_input_tokens_seen": 246081465, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12304688, + "step": 11396, + "time_per_iteration": 2.6423087120056152 + }, + { + "auxiliary_loss_clip": 0.01117562, + "auxiliary_loss_mlp": 0.01030051, + "balance_loss_clip": 1.04065037, + "balance_loss_mlp": 1.01792753, + "epoch": 0.6852247106568465, + "flos": 18398299996800.0, + "grad_norm": 2.6729741057808543, + "language_loss": 0.79458135, + "learning_rate": 9.522729308327931e-07, + "loss": 0.8160575, + "num_input_tokens_seen": 246096110, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12121582, + "step": 11397, + "time_per_iteration": 2.637789011001587 + }, + { + "auxiliary_loss_clip": 0.01113825, + "auxiliary_loss_mlp": 0.01031024, + "balance_loss_clip": 1.03827822, + "balance_loss_mlp": 1.01869226, + "epoch": 0.6852848339095146, + "flos": 22903099107840.0, + "grad_norm": 1.7206493286703755, + "language_loss": 0.71193242, + "learning_rate": 9.519412063806493e-07, + "loss": 0.73338091, + "num_input_tokens_seen": 246114785, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12341309, + "step": 11398, + "time_per_iteration": 2.6804919242858887 + }, + { + "auxiliary_loss_clip": 0.01111264, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.03859639, + "balance_loss_mlp": 1.02007496, + "epoch": 0.6853449571621825, + "flos": 33988585322880.0, + "grad_norm": 1.7672829989155672, + "language_loss": 0.70644456, + "learning_rate": 9.516095216709996e-07, + "loss": 0.72786593, + "num_input_tokens_seen": 246136375, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10803223, + "step": 11399, + "time_per_iteration": 2.715528964996338 + }, + { + "auxiliary_loss_clip": 0.01116683, + "auxiliary_loss_mlp": 0.01032536, + "balance_loss_clip": 1.04181242, + "balance_loss_mlp": 1.020818, + "epoch": 0.6854050804148505, + "flos": 22147073585280.0, + "grad_norm": 1.5333806378181647, + "language_loss": 0.70289844, + "learning_rate": 9.512778767164217e-07, + "loss": 0.72439063, + "num_input_tokens_seen": 246155090, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11724854, + "step": 11400, + "time_per_iteration": 2.6663057804107666 + }, + { + "auxiliary_loss_clip": 0.01125003, + "auxiliary_loss_mlp": 0.01034002, + "balance_loss_clip": 1.04327846, + "balance_loss_mlp": 1.02024519, + "epoch": 0.6854652036675184, + "flos": 19920926052000.0, + "grad_norm": 1.9871536214381589, + "language_loss": 0.77664888, + "learning_rate": 9.509462715294927e-07, + "loss": 0.79823893, + "num_input_tokens_seen": 246172645, + "router_z_loss_clip": 0.81884766, + "router_z_loss_mlp": 0.13757324, + "step": 11401, + "time_per_iteration": 2.6153512001037598 + }, + { + "auxiliary_loss_clip": 0.01111642, + "auxiliary_loss_mlp": 0.01027167, + "balance_loss_clip": 1.03852546, + "balance_loss_mlp": 1.01571095, + "epoch": 0.6855253269201864, + "flos": 18230259720960.0, + "grad_norm": 2.6713061318114786, + "language_loss": 0.75302052, + "learning_rate": 9.50614706122786e-07, + "loss": 0.77440864, + "num_input_tokens_seen": 246189055, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11456299, + "step": 11402, + "time_per_iteration": 2.6221821308135986 + }, + { + "auxiliary_loss_clip": 0.01115228, + "auxiliary_loss_mlp": 0.01036736, + "balance_loss_clip": 1.03923154, + "balance_loss_mlp": 1.02464819, + "epoch": 0.6855854501728543, + "flos": 28111014524160.0, + "grad_norm": 2.0476756884322334, + "language_loss": 0.7286557, + "learning_rate": 9.502831805088742e-07, + "loss": 0.75017536, + "num_input_tokens_seen": 246207990, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12097168, + "step": 11403, + "time_per_iteration": 2.623107671737671 + }, + { + "auxiliary_loss_clip": 0.01111298, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.03926897, + "balance_loss_mlp": 1.02016735, + "epoch": 0.6856455734255223, + "flos": 16172233498080.0, + "grad_norm": 2.0670934809859314, + "language_loss": 0.81743544, + "learning_rate": 9.499516947003294e-07, + "loss": 0.83886474, + "num_input_tokens_seen": 246221595, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11462402, + "step": 11404, + "time_per_iteration": 2.613532304763794 + }, + { + "auxiliary_loss_clip": 0.0111526, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.04151726, + "balance_loss_mlp": 1.02082634, + "epoch": 0.6857056966781903, + "flos": 28471526995680.0, + "grad_norm": 2.463305723164111, + "language_loss": 0.77658862, + "learning_rate": 9.496202487097222e-07, + "loss": 0.79806244, + "num_input_tokens_seen": 246242970, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11303711, + "step": 11405, + "time_per_iteration": 2.7276484966278076 + }, + { + "auxiliary_loss_clip": 0.01031417, + "auxiliary_loss_mlp": 0.01001913, + "balance_loss_clip": 1.0081985, + "balance_loss_mlp": 1.00083041, + "epoch": 0.6857658199308583, + "flos": 75473137648800.0, + "grad_norm": 0.7868268325829036, + "language_loss": 0.60972869, + "learning_rate": 9.492888425496199e-07, + "loss": 0.63006204, + "num_input_tokens_seen": 246300405, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.01082611, + "step": 11406, + "time_per_iteration": 3.3604319095611572 + }, + { + "auxiliary_loss_clip": 0.0111348, + "auxiliary_loss_mlp": 0.0103144, + "balance_loss_clip": 1.03851402, + "balance_loss_mlp": 1.01869678, + "epoch": 0.6858259431835262, + "flos": 20321584797600.0, + "grad_norm": 3.2415004584758784, + "language_loss": 0.77007127, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79152048, + "num_input_tokens_seen": 246318780, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12750244, + "step": 11407, + "time_per_iteration": 2.6764585971832275 + }, + { + "auxiliary_loss_clip": 0.01116674, + "auxiliary_loss_mlp": 0.01036378, + "balance_loss_clip": 1.04041886, + "balance_loss_mlp": 1.02370548, + "epoch": 0.6858860664361942, + "flos": 26689425864480.0, + "grad_norm": 2.1838224473257277, + "language_loss": 0.71323544, + "learning_rate": 9.486261497711991e-07, + "loss": 0.73476601, + "num_input_tokens_seen": 246339405, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12677002, + "step": 11408, + "time_per_iteration": 2.6681573390960693 + }, + { + "auxiliary_loss_clip": 0.01116196, + "auxiliary_loss_mlp": 0.01026403, + "balance_loss_clip": 1.03918064, + "balance_loss_mlp": 1.01423168, + "epoch": 0.6859461896888621, + "flos": 18629216740800.0, + "grad_norm": 2.129759484142482, + "language_loss": 0.70450795, + "learning_rate": 9.482948631780087e-07, + "loss": 0.72593397, + "num_input_tokens_seen": 246357055, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.121521, + "step": 11409, + "time_per_iteration": 2.6292874813079834 + }, + { + "auxiliary_loss_clip": 0.01109743, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.04013348, + "balance_loss_mlp": 1.01584363, + "epoch": 0.6860063129415301, + "flos": 22720756232160.0, + "grad_norm": 1.700051865324992, + "language_loss": 0.78080034, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80216306, + "num_input_tokens_seen": 246374050, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10687256, + "step": 11410, + "time_per_iteration": 2.600632667541504 + }, + { + "auxiliary_loss_clip": 0.01116662, + "auxiliary_loss_mlp": 0.01030321, + "balance_loss_clip": 1.03881311, + "balance_loss_mlp": 1.01800668, + "epoch": 0.6860664361941982, + "flos": 29225769757920.0, + "grad_norm": 2.8032155271594617, + "language_loss": 0.71882755, + "learning_rate": 9.476324096464821e-07, + "loss": 0.74029732, + "num_input_tokens_seen": 246392910, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12310791, + "step": 11411, + "time_per_iteration": 2.6671509742736816 + }, + { + "auxiliary_loss_clip": 0.0111662, + "auxiliary_loss_mlp": 0.01027645, + "balance_loss_clip": 1.04087591, + "balance_loss_mlp": 1.01523542, + "epoch": 0.6861265594468661, + "flos": 24902016969600.0, + "grad_norm": 4.874594066907659, + "language_loss": 0.7019347, + "learning_rate": 9.473012427332654e-07, + "loss": 0.72337735, + "num_input_tokens_seen": 246411540, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12420654, + "step": 11412, + "time_per_iteration": 2.637484550476074 + }, + { + "auxiliary_loss_clip": 0.01114986, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.04014993, + "balance_loss_mlp": 1.01730299, + "epoch": 0.6861866826995341, + "flos": 13945235101920.0, + "grad_norm": 3.4249728823118732, + "language_loss": 0.71486795, + "learning_rate": 9.469701157384919e-07, + "loss": 0.73631382, + "num_input_tokens_seen": 246423295, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12304688, + "step": 11413, + "time_per_iteration": 2.593785524368286 + }, + { + "auxiliary_loss_clip": 0.01116517, + "auxiliary_loss_mlp": 0.01030985, + "balance_loss_clip": 1.04124904, + "balance_loss_mlp": 1.01984549, + "epoch": 0.686246805952202, + "flos": 19519943168160.0, + "grad_norm": 1.697965761959784, + "language_loss": 0.74016243, + "learning_rate": 9.466390286747164e-07, + "loss": 0.76163745, + "num_input_tokens_seen": 246441045, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.1114502, + "step": 11414, + "time_per_iteration": 2.614924669265747 + }, + { + "auxiliary_loss_clip": 0.01119588, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_clip": 1.04285979, + "balance_loss_mlp": 1.01752806, + "epoch": 0.68630692920487, + "flos": 24192701520480.0, + "grad_norm": 2.897432486127864, + "language_loss": 0.86759198, + "learning_rate": 9.46307981554495e-07, + "loss": 0.88908529, + "num_input_tokens_seen": 246456905, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12219238, + "step": 11415, + "time_per_iteration": 2.6236469745635986 + }, + { + "auxiliary_loss_clip": 0.01117203, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.04121888, + "balance_loss_mlp": 1.0195483, + "epoch": 0.6863670524575379, + "flos": 32074943634720.0, + "grad_norm": 1.5877667685940497, + "language_loss": 0.67497128, + "learning_rate": 9.459769743903801e-07, + "loss": 0.6964612, + "num_input_tokens_seen": 246477545, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12255859, + "step": 11416, + "time_per_iteration": 2.656240463256836 + }, + { + "auxiliary_loss_clip": 0.01115998, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.04069746, + "balance_loss_mlp": 1.01825464, + "epoch": 0.686427175710206, + "flos": 23394868515360.0, + "grad_norm": 1.935239803453437, + "language_loss": 0.76177478, + "learning_rate": 9.456460071949237e-07, + "loss": 0.78323066, + "num_input_tokens_seen": 246496705, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11340332, + "step": 11417, + "time_per_iteration": 2.710344076156616 + }, + { + "auxiliary_loss_clip": 0.0111521, + "auxiliary_loss_mlp": 0.01029275, + "balance_loss_clip": 1.04061139, + "balance_loss_mlp": 1.01803374, + "epoch": 0.6864872989628739, + "flos": 23081957497440.0, + "grad_norm": 2.1496043772442435, + "language_loss": 0.77119613, + "learning_rate": 9.45315079980678e-07, + "loss": 0.79264092, + "num_input_tokens_seen": 246514860, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11248779, + "step": 11418, + "time_per_iteration": 2.6101038455963135 + }, + { + "auxiliary_loss_clip": 0.01116359, + "auxiliary_loss_mlp": 0.01026878, + "balance_loss_clip": 1.04222584, + "balance_loss_mlp": 1.01557136, + "epoch": 0.6865474222155419, + "flos": 31672380576960.0, + "grad_norm": 1.6775658176418111, + "language_loss": 0.76470834, + "learning_rate": 9.449841927601887e-07, + "loss": 0.7861408, + "num_input_tokens_seen": 246536145, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11303711, + "step": 11419, + "time_per_iteration": 2.7113144397735596 + }, + { + "auxiliary_loss_clip": 0.01113216, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.03960371, + "balance_loss_mlp": 1.02072668, + "epoch": 0.6866075454682098, + "flos": 22546881468000.0, + "grad_norm": 1.66468279640096, + "language_loss": 0.71586847, + "learning_rate": 9.446533455460044e-07, + "loss": 0.73731768, + "num_input_tokens_seen": 246553265, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.10974121, + "step": 11420, + "time_per_iteration": 4.264700412750244 + }, + { + "auxiliary_loss_clip": 0.01112894, + "auxiliary_loss_mlp": 0.01025825, + "balance_loss_clip": 1.03940284, + "balance_loss_mlp": 1.01479805, + "epoch": 0.6866676687208778, + "flos": 41780770224480.0, + "grad_norm": 1.383068390660397, + "language_loss": 0.74589044, + "learning_rate": 9.443225383506712e-07, + "loss": 0.7672776, + "num_input_tokens_seen": 246575130, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11022949, + "step": 11421, + "time_per_iteration": 2.744471311569214 + }, + { + "auxiliary_loss_clip": 0.01111688, + "auxiliary_loss_mlp": 0.01024486, + "balance_loss_clip": 1.03931236, + "balance_loss_mlp": 1.01350677, + "epoch": 0.6867277919735457, + "flos": 26505381263040.0, + "grad_norm": 1.827360569292347, + "language_loss": 0.76960588, + "learning_rate": 9.439917711867338e-07, + "loss": 0.79096764, + "num_input_tokens_seen": 246593095, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10980225, + "step": 11422, + "time_per_iteration": 4.050010919570923 + }, + { + "auxiliary_loss_clip": 0.01117452, + "auxiliary_loss_mlp": 0.0103754, + "balance_loss_clip": 1.04216111, + "balance_loss_mlp": 1.02527332, + "epoch": 0.6867879152262137, + "flos": 29404628147520.0, + "grad_norm": 1.7724853885266132, + "language_loss": 0.77289164, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79444158, + "num_input_tokens_seen": 246612165, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12261963, + "step": 11423, + "time_per_iteration": 2.693286895751953 + }, + { + "auxiliary_loss_clip": 0.01118305, + "auxiliary_loss_mlp": 0.01028488, + "balance_loss_clip": 1.04313159, + "balance_loss_mlp": 1.01648951, + "epoch": 0.6868480384788818, + "flos": 26375785535520.0, + "grad_norm": 2.7932553903078525, + "language_loss": 0.72815043, + "learning_rate": 9.433303570032129e-07, + "loss": 0.74961829, + "num_input_tokens_seen": 246632065, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11993408, + "step": 11424, + "time_per_iteration": 2.637113094329834 + }, + { + "auxiliary_loss_clip": 0.01113915, + "auxiliary_loss_mlp": 0.01029006, + "balance_loss_clip": 1.03952837, + "balance_loss_mlp": 1.01769352, + "epoch": 0.6869081617315497, + "flos": 32074862600160.0, + "grad_norm": 1.9004755955149033, + "language_loss": 0.65293837, + "learning_rate": 9.429997100087112e-07, + "loss": 0.67436755, + "num_input_tokens_seen": 246651245, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11315918, + "step": 11425, + "time_per_iteration": 2.7313077449798584 + }, + { + "auxiliary_loss_clip": 0.01114635, + "auxiliary_loss_mlp": 0.01026201, + "balance_loss_clip": 1.04170752, + "balance_loss_mlp": 1.01468563, + "epoch": 0.6869682849842177, + "flos": 25752556605600.0, + "grad_norm": 3.4319100366431905, + "language_loss": 0.71513665, + "learning_rate": 9.426691030957657e-07, + "loss": 0.73654509, + "num_input_tokens_seen": 246672225, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11523438, + "step": 11426, + "time_per_iteration": 2.660304307937622 + }, + { + "auxiliary_loss_clip": 0.01113099, + "auxiliary_loss_mlp": 0.01027103, + "balance_loss_clip": 1.03922617, + "balance_loss_mlp": 1.01553392, + "epoch": 0.6870284082368856, + "flos": 20856174619680.0, + "grad_norm": 2.2468982844066647, + "language_loss": 0.85200882, + "learning_rate": 9.423385362769136e-07, + "loss": 0.87341088, + "num_input_tokens_seen": 246688385, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11566162, + "step": 11427, + "time_per_iteration": 2.633735179901123 + }, + { + "auxiliary_loss_clip": 0.0111356, + "auxiliary_loss_mlp": 0.01027575, + "balance_loss_clip": 1.04088593, + "balance_loss_mlp": 1.01633322, + "epoch": 0.6870885314895536, + "flos": 33321887701920.0, + "grad_norm": 1.448101693899427, + "language_loss": 0.76109105, + "learning_rate": 9.420080095646909e-07, + "loss": 0.78250241, + "num_input_tokens_seen": 246710730, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11242676, + "step": 11428, + "time_per_iteration": 2.676217555999756 + }, + { + "auxiliary_loss_clip": 0.01116376, + "auxiliary_loss_mlp": 0.01032207, + "balance_loss_clip": 1.03970468, + "balance_loss_mlp": 1.01918876, + "epoch": 0.6871486547422215, + "flos": 25397878622400.0, + "grad_norm": 1.7668275082273912, + "language_loss": 0.72598207, + "learning_rate": 9.4167752297163e-07, + "loss": 0.74746788, + "num_input_tokens_seen": 246730350, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13031006, + "step": 11429, + "time_per_iteration": 2.6269609928131104 + }, + { + "auxiliary_loss_clip": 0.01118449, + "auxiliary_loss_mlp": 0.01024477, + "balance_loss_clip": 1.0432303, + "balance_loss_mlp": 1.01304531, + "epoch": 0.6872087779948896, + "flos": 36794817233280.0, + "grad_norm": 1.7403415372087387, + "language_loss": 0.82888228, + "learning_rate": 9.413470765102643e-07, + "loss": 0.85031152, + "num_input_tokens_seen": 246751700, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11431885, + "step": 11430, + "time_per_iteration": 4.19898247718811 + }, + { + "auxiliary_loss_clip": 0.01114299, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.03992844, + "balance_loss_mlp": 1.02245784, + "epoch": 0.6872689012475575, + "flos": 25263744959520.0, + "grad_norm": 2.0906956949829407, + "language_loss": 0.70291054, + "learning_rate": 9.410166701931225e-07, + "loss": 0.72439539, + "num_input_tokens_seen": 246769860, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.1171875, + "step": 11431, + "time_per_iteration": 2.618067741394043 + }, + { + "auxiliary_loss_clip": 0.01115244, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.03917086, + "balance_loss_mlp": 1.01742446, + "epoch": 0.6873290245002255, + "flos": 31140910585440.0, + "grad_norm": 3.1453140157017496, + "language_loss": 0.80082619, + "learning_rate": 9.406863040327355e-07, + "loss": 0.82227278, + "num_input_tokens_seen": 246789905, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11993408, + "step": 11432, + "time_per_iteration": 3.937964677810669 + }, + { + "auxiliary_loss_clip": 0.01112425, + "auxiliary_loss_mlp": 0.01027336, + "balance_loss_clip": 1.04075432, + "balance_loss_mlp": 1.01602948, + "epoch": 0.6873891477528934, + "flos": 30739076838720.0, + "grad_norm": 1.543833518421042, + "language_loss": 0.67886603, + "learning_rate": 9.403559780416295e-07, + "loss": 0.70026362, + "num_input_tokens_seen": 246808815, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11297607, + "step": 11433, + "time_per_iteration": 2.6525144577026367 + }, + { + "auxiliary_loss_clip": 0.01118372, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.043957, + "balance_loss_mlp": 1.02442014, + "epoch": 0.6874492710055614, + "flos": 42894107353440.0, + "grad_norm": 3.5358875946258363, + "language_loss": 0.72747982, + "learning_rate": 9.400256922323309e-07, + "loss": 0.74902463, + "num_input_tokens_seen": 246829775, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11682129, + "step": 11434, + "time_per_iteration": 2.757920980453491 + }, + { + "auxiliary_loss_clip": 0.01115265, + "auxiliary_loss_mlp": 0.01026478, + "balance_loss_clip": 1.04185438, + "balance_loss_mlp": 1.01461661, + "epoch": 0.6875093942582293, + "flos": 21746455356960.0, + "grad_norm": 1.820993514097884, + "language_loss": 0.80334675, + "learning_rate": 9.396954466173657e-07, + "loss": 0.82476413, + "num_input_tokens_seen": 246848045, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11859131, + "step": 11435, + "time_per_iteration": 2.6041793823242188 + }, + { + "auxiliary_loss_clip": 0.01116502, + "auxiliary_loss_mlp": 0.01031308, + "balance_loss_clip": 1.04003096, + "balance_loss_mlp": 1.0189991, + "epoch": 0.6875695175108973, + "flos": 25263704442240.0, + "grad_norm": 3.4568353305447244, + "language_loss": 0.81017387, + "learning_rate": 9.393652412092538e-07, + "loss": 0.83165205, + "num_input_tokens_seen": 246866095, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.1229248, + "step": 11436, + "time_per_iteration": 2.640101909637451 + }, + { + "auxiliary_loss_clip": 0.01109227, + "auxiliary_loss_mlp": 0.0102757, + "balance_loss_clip": 1.03925586, + "balance_loss_mlp": 1.0174017, + "epoch": 0.6876296407635654, + "flos": 30961809092160.0, + "grad_norm": 1.7793262293749563, + "language_loss": 0.82077765, + "learning_rate": 9.390350760205183e-07, + "loss": 0.84214556, + "num_input_tokens_seen": 246883975, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10174561, + "step": 11437, + "time_per_iteration": 2.6830413341522217 + }, + { + "auxiliary_loss_clip": 0.01122456, + "auxiliary_loss_mlp": 0.01041664, + "balance_loss_clip": 1.04219162, + "balance_loss_mlp": 1.02914667, + "epoch": 0.6876897640162333, + "flos": 28336056262560.0, + "grad_norm": 2.6929396693113947, + "language_loss": 0.78208363, + "learning_rate": 9.387049510636793e-07, + "loss": 0.80372488, + "num_input_tokens_seen": 246901560, + "router_z_loss_clip": 0.80126953, + "router_z_loss_mlp": 0.12530518, + "step": 11438, + "time_per_iteration": 2.662090539932251 + }, + { + "auxiliary_loss_clip": 0.01110107, + "auxiliary_loss_mlp": 0.01028914, + "balance_loss_clip": 1.0391171, + "balance_loss_mlp": 1.01766706, + "epoch": 0.6877498872689013, + "flos": 33097777860960.0, + "grad_norm": 1.6829060281324693, + "language_loss": 0.72662222, + "learning_rate": 9.383748663512554e-07, + "loss": 0.74801242, + "num_input_tokens_seen": 246922655, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11236572, + "step": 11439, + "time_per_iteration": 2.6899657249450684 + }, + { + "auxiliary_loss_clip": 0.0111489, + "auxiliary_loss_mlp": 0.0102629, + "balance_loss_clip": 1.04054546, + "balance_loss_mlp": 1.0142678, + "epoch": 0.6878100105215692, + "flos": 14131386601920.0, + "grad_norm": 1.9416286504805826, + "language_loss": 0.75402892, + "learning_rate": 9.380448218957623e-07, + "loss": 0.77544069, + "num_input_tokens_seen": 246940100, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12030029, + "step": 11440, + "time_per_iteration": 2.623150587081909 + }, + { + "auxiliary_loss_clip": 0.01111606, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.03897738, + "balance_loss_mlp": 1.01921189, + "epoch": 0.6878701337742372, + "flos": 24773920381440.0, + "grad_norm": 3.2508560123120533, + "language_loss": 0.71758819, + "learning_rate": 9.377148177097167e-07, + "loss": 0.73900872, + "num_input_tokens_seen": 246958545, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11248779, + "step": 11441, + "time_per_iteration": 2.6222567558288574 + }, + { + "auxiliary_loss_clip": 0.0111855, + "auxiliary_loss_mlp": 0.01032414, + "balance_loss_clip": 1.04100907, + "balance_loss_mlp": 1.01963472, + "epoch": 0.6879302570269051, + "flos": 16887626539200.0, + "grad_norm": 1.9602739990364562, + "language_loss": 0.66515404, + "learning_rate": 9.373848538056317e-07, + "loss": 0.68666363, + "num_input_tokens_seen": 246974805, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12780762, + "step": 11442, + "time_per_iteration": 2.6046814918518066 + }, + { + "auxiliary_loss_clip": 0.0111578, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.04237056, + "balance_loss_mlp": 1.02131617, + "epoch": 0.6879903802795732, + "flos": 26020418758560.0, + "grad_norm": 2.7765259302296057, + "language_loss": 0.69376719, + "learning_rate": 9.370549301960189e-07, + "loss": 0.71525276, + "num_input_tokens_seen": 246992505, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11456299, + "step": 11443, + "time_per_iteration": 2.645885705947876 + }, + { + "auxiliary_loss_clip": 0.01119922, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.04402637, + "balance_loss_mlp": 1.01835787, + "epoch": 0.6880505035322411, + "flos": 30691232281440.0, + "grad_norm": 2.1402268680124195, + "language_loss": 0.76414454, + "learning_rate": 9.367250468933893e-07, + "loss": 0.78564847, + "num_input_tokens_seen": 247013370, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12109375, + "step": 11444, + "time_per_iteration": 2.651892900466919 + }, + { + "auxiliary_loss_clip": 0.01113295, + "auxiliary_loss_mlp": 0.01026856, + "balance_loss_clip": 1.04076588, + "balance_loss_mlp": 1.01635385, + "epoch": 0.6881106267849091, + "flos": 28328398496640.0, + "grad_norm": 2.2151067336219494, + "language_loss": 0.76388597, + "learning_rate": 9.363952039102536e-07, + "loss": 0.7852875, + "num_input_tokens_seen": 247029855, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.1050415, + "step": 11445, + "time_per_iteration": 2.644603729248047 + }, + { + "auxiliary_loss_clip": 0.01032261, + "auxiliary_loss_mlp": 0.010006, + "balance_loss_clip": 1.00922513, + "balance_loss_mlp": 0.99947035, + "epoch": 0.688170750037577, + "flos": 59161503241440.0, + "grad_norm": 0.816752002391949, + "language_loss": 0.58337796, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60370654, + "num_input_tokens_seen": 247085030, + "router_z_loss_clip": 0.23022461, + "router_z_loss_mlp": 0.0112915, + "step": 11446, + "time_per_iteration": 3.2923731803894043 + }, + { + "auxiliary_loss_clip": 0.0111619, + "auxiliary_loss_mlp": 0.01030353, + "balance_loss_clip": 1.03962779, + "balance_loss_mlp": 1.01837218, + "epoch": 0.688230873290245, + "flos": 27800818164000.0, + "grad_norm": 2.852156592131606, + "language_loss": 0.75836062, + "learning_rate": 9.357356389524886e-07, + "loss": 0.77982605, + "num_input_tokens_seen": 247104840, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.11981201, + "step": 11447, + "time_per_iteration": 2.6925487518310547 + }, + { + "auxiliary_loss_clip": 0.01117762, + "auxiliary_loss_mlp": 0.01033152, + "balance_loss_clip": 1.04097533, + "balance_loss_mlp": 1.02150524, + "epoch": 0.6882909965429129, + "flos": 27400564591200.0, + "grad_norm": 2.2340175019917625, + "language_loss": 0.73311853, + "learning_rate": 9.354059170028705e-07, + "loss": 0.7546277, + "num_input_tokens_seen": 247121905, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11651611, + "step": 11448, + "time_per_iteration": 2.643812656402588 + }, + { + "auxiliary_loss_clip": 0.01116937, + "auxiliary_loss_mlp": 0.01033903, + "balance_loss_clip": 1.03926969, + "balance_loss_mlp": 1.02176774, + "epoch": 0.688351119795581, + "flos": 31986912286080.0, + "grad_norm": 3.093225535729463, + "language_loss": 0.74656069, + "learning_rate": 9.350762354227673e-07, + "loss": 0.76806909, + "num_input_tokens_seen": 247142375, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12139893, + "step": 11449, + "time_per_iteration": 2.734964370727539 + }, + { + "auxiliary_loss_clip": 0.01112261, + "auxiliary_loss_mlp": 0.01033006, + "balance_loss_clip": 1.03944767, + "balance_loss_mlp": 1.02178264, + "epoch": 0.6884112430482489, + "flos": 27534293081280.0, + "grad_norm": 1.8630733790182452, + "language_loss": 0.70301765, + "learning_rate": 9.34746594224679e-07, + "loss": 0.72447032, + "num_input_tokens_seen": 247161095, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11224365, + "step": 11450, + "time_per_iteration": 2.6995437145233154 + }, + { + "auxiliary_loss_clip": 0.01120142, + "auxiliary_loss_mlp": 0.01035805, + "balance_loss_clip": 1.04112875, + "balance_loss_mlp": 1.02274513, + "epoch": 0.6884713663009169, + "flos": 21159928732320.0, + "grad_norm": 2.0608530214167926, + "language_loss": 0.76119959, + "learning_rate": 9.344169934211068e-07, + "loss": 0.78275907, + "num_input_tokens_seen": 247178565, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13037109, + "step": 11451, + "time_per_iteration": 2.609112024307251 + }, + { + "auxiliary_loss_clip": 0.01115976, + "auxiliary_loss_mlp": 0.01031035, + "balance_loss_clip": 1.04041016, + "balance_loss_mlp": 1.01947141, + "epoch": 0.6885314895535849, + "flos": 32300025890400.0, + "grad_norm": 1.7300609725121316, + "language_loss": 0.69251359, + "learning_rate": 9.340874330245505e-07, + "loss": 0.71398371, + "num_input_tokens_seen": 247202345, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11566162, + "step": 11452, + "time_per_iteration": 2.7287025451660156 + }, + { + "auxiliary_loss_clip": 0.01114287, + "auxiliary_loss_mlp": 0.01035017, + "balance_loss_clip": 1.04038835, + "balance_loss_mlp": 1.02179646, + "epoch": 0.6885916128062528, + "flos": 25040445464160.0, + "grad_norm": 1.8479874667568195, + "language_loss": 0.7188946, + "learning_rate": 9.337579130475042e-07, + "loss": 0.74038762, + "num_input_tokens_seen": 247219240, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.13226318, + "step": 11453, + "time_per_iteration": 2.651822805404663 + }, + { + "auxiliary_loss_clip": 0.0103268, + "auxiliary_loss_mlp": 0.01001311, + "balance_loss_clip": 1.00948429, + "balance_loss_mlp": 1.00017631, + "epoch": 0.6886517360589208, + "flos": 86288897916000.0, + "grad_norm": 0.8048595136421254, + "language_loss": 0.50627142, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52661133, + "num_input_tokens_seen": 247272010, + "router_z_loss_clip": 0.23205566, + "router_z_loss_mlp": 0.01135254, + "step": 11454, + "time_per_iteration": 3.1700680255889893 + }, + { + "auxiliary_loss_clip": 0.01113387, + "auxiliary_loss_mlp": 0.01032176, + "balance_loss_clip": 1.0423317, + "balance_loss_mlp": 1.0206486, + "epoch": 0.6887118593115887, + "flos": 21833635842720.0, + "grad_norm": 2.0173528537730956, + "language_loss": 0.75550807, + "learning_rate": 9.330989944019263e-07, + "loss": 0.77696365, + "num_input_tokens_seen": 247290630, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11541748, + "step": 11455, + "time_per_iteration": 2.599975824356079 + }, + { + "auxiliary_loss_clip": 0.01116559, + "auxiliary_loss_mlp": 0.01034533, + "balance_loss_clip": 1.03904295, + "balance_loss_mlp": 1.02212906, + "epoch": 0.6887719825642568, + "flos": 21296331362880.0, + "grad_norm": 2.4259420142414343, + "language_loss": 0.72785091, + "learning_rate": 9.327695957583803e-07, + "loss": 0.74936187, + "num_input_tokens_seen": 247304800, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12402344, + "step": 11456, + "time_per_iteration": 2.5969104766845703 + }, + { + "auxiliary_loss_clip": 0.01114864, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.04231322, + "balance_loss_mlp": 1.02492833, + "epoch": 0.6888321058169247, + "flos": 28150998729120.0, + "grad_norm": 1.7589096608529877, + "language_loss": 0.81052566, + "learning_rate": 9.32440237584319e-07, + "loss": 0.83203864, + "num_input_tokens_seen": 247323450, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11505127, + "step": 11457, + "time_per_iteration": 2.607407331466675 + }, + { + "auxiliary_loss_clip": 0.01121268, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.04349279, + "balance_loss_mlp": 1.01761639, + "epoch": 0.6888922290695927, + "flos": 28514793100320.0, + "grad_norm": 1.6133937078785219, + "language_loss": 0.76002336, + "learning_rate": 9.321109198922301e-07, + "loss": 0.7815336, + "num_input_tokens_seen": 247343845, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.121521, + "step": 11458, + "time_per_iteration": 2.657959222793579 + }, + { + "auxiliary_loss_clip": 0.0111655, + "auxiliary_loss_mlp": 0.01029953, + "balance_loss_clip": 1.04135132, + "balance_loss_mlp": 1.01883662, + "epoch": 0.6889523523222606, + "flos": 21514647232800.0, + "grad_norm": 2.8812038368000663, + "language_loss": 0.68176985, + "learning_rate": 9.31781642694603e-07, + "loss": 0.70323491, + "num_input_tokens_seen": 247356650, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11114502, + "step": 11459, + "time_per_iteration": 4.0141308307647705 + }, + { + "auxiliary_loss_clip": 0.01115944, + "auxiliary_loss_mlp": 0.01029823, + "balance_loss_clip": 1.04197693, + "balance_loss_mlp": 1.01859903, + "epoch": 0.6890124755749286, + "flos": 30783801565440.0, + "grad_norm": 1.5637683448392463, + "language_loss": 0.68258721, + "learning_rate": 9.314524060039221e-07, + "loss": 0.70404488, + "num_input_tokens_seen": 247377340, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11218262, + "step": 11460, + "time_per_iteration": 2.66701078414917 + }, + { + "auxiliary_loss_clip": 0.0111885, + "auxiliary_loss_mlp": 0.01033999, + "balance_loss_clip": 1.03960562, + "balance_loss_mlp": 1.02147603, + "epoch": 0.6890725988275965, + "flos": 24684916618080.0, + "grad_norm": 2.0284106717096497, + "language_loss": 0.7696836, + "learning_rate": 9.311232098326731e-07, + "loss": 0.79121202, + "num_input_tokens_seen": 247395805, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.12512207, + "step": 11461, + "time_per_iteration": 3.9849703311920166 + }, + { + "auxiliary_loss_clip": 0.01116472, + "auxiliary_loss_mlp": 0.01035773, + "balance_loss_clip": 1.04200482, + "balance_loss_mlp": 1.02395391, + "epoch": 0.6891327220802645, + "flos": 17735816172960.0, + "grad_norm": 4.894573071862417, + "language_loss": 0.69243503, + "learning_rate": 9.307940541933401e-07, + "loss": 0.71395755, + "num_input_tokens_seen": 247413165, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.1182251, + "step": 11462, + "time_per_iteration": 2.7288658618927 + }, + { + "auxiliary_loss_clip": 0.0111797, + "auxiliary_loss_mlp": 0.01027342, + "balance_loss_clip": 1.04246616, + "balance_loss_mlp": 1.0155766, + "epoch": 0.6891928453329325, + "flos": 25794323570880.0, + "grad_norm": 1.4482391879738508, + "language_loss": 0.87241876, + "learning_rate": 9.304649390984034e-07, + "loss": 0.89387184, + "num_input_tokens_seen": 247433140, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11755371, + "step": 11463, + "time_per_iteration": 2.6784543991088867 + }, + { + "auxiliary_loss_clip": 0.0111166, + "auxiliary_loss_mlp": 0.01027431, + "balance_loss_clip": 1.0403024, + "balance_loss_mlp": 1.01701188, + "epoch": 0.6892529685856005, + "flos": 21790774910880.0, + "grad_norm": 1.7177225888782426, + "language_loss": 0.68318206, + "learning_rate": 9.301358645603428e-07, + "loss": 0.70457304, + "num_input_tokens_seen": 247451265, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10424805, + "step": 11464, + "time_per_iteration": 2.690781831741333 + }, + { + "auxiliary_loss_clip": 0.01115908, + "auxiliary_loss_mlp": 0.0103725, + "balance_loss_clip": 1.04127347, + "balance_loss_mlp": 1.02504897, + "epoch": 0.6893130918382685, + "flos": 36527238701280.0, + "grad_norm": 2.0751112960654905, + "language_loss": 0.65438986, + "learning_rate": 9.298068305916373e-07, + "loss": 0.67592144, + "num_input_tokens_seen": 247471645, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12213135, + "step": 11465, + "time_per_iteration": 2.70017147064209 + }, + { + "auxiliary_loss_clip": 0.01118572, + "auxiliary_loss_mlp": 0.01034078, + "balance_loss_clip": 1.04104638, + "balance_loss_mlp": 1.02218723, + "epoch": 0.6893732150909364, + "flos": 29759265613440.0, + "grad_norm": 1.4944134447802435, + "language_loss": 0.72569919, + "learning_rate": 9.294778372047649e-07, + "loss": 0.74722564, + "num_input_tokens_seen": 247491170, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.11895752, + "step": 11466, + "time_per_iteration": 2.7454237937927246 + }, + { + "auxiliary_loss_clip": 0.01117075, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.04143345, + "balance_loss_mlp": 1.01744533, + "epoch": 0.6894333383436044, + "flos": 20722243543200.0, + "grad_norm": 1.7187085942650793, + "language_loss": 0.72207868, + "learning_rate": 9.291488844121995e-07, + "loss": 0.74354029, + "num_input_tokens_seen": 247509005, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11639404, + "step": 11467, + "time_per_iteration": 2.5870466232299805 + }, + { + "auxiliary_loss_clip": 0.01118137, + "auxiliary_loss_mlp": 0.01036592, + "balance_loss_clip": 1.04072678, + "balance_loss_mlp": 1.0234375, + "epoch": 0.6894934615962723, + "flos": 23166423325440.0, + "grad_norm": 1.9851003058482073, + "language_loss": 0.80791664, + "learning_rate": 9.288199722264156e-07, + "loss": 0.82946396, + "num_input_tokens_seen": 247527050, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.13153076, + "step": 11468, + "time_per_iteration": 2.637598991394043 + }, + { + "auxiliary_loss_clip": 0.01117759, + "auxiliary_loss_mlp": 0.01036122, + "balance_loss_clip": 1.04142046, + "balance_loss_mlp": 1.02368796, + "epoch": 0.6895535848489404, + "flos": 42136704243360.0, + "grad_norm": 1.9376592356205145, + "language_loss": 0.65942568, + "learning_rate": 9.284911006598875e-07, + "loss": 0.68096447, + "num_input_tokens_seen": 247547765, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12445068, + "step": 11469, + "time_per_iteration": 2.74756121635437 + }, + { + "auxiliary_loss_clip": 0.01032909, + "auxiliary_loss_mlp": 0.01000743, + "balance_loss_clip": 1.00972176, + "balance_loss_mlp": 0.99960995, + "epoch": 0.6896137081016083, + "flos": 61101440644320.0, + "grad_norm": 0.8074252804140568, + "language_loss": 0.55232668, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57266319, + "num_input_tokens_seen": 247603515, + "router_z_loss_clip": 0.23181152, + "router_z_loss_mlp": 0.01134491, + "step": 11470, + "time_per_iteration": 4.646167278289795 + }, + { + "auxiliary_loss_clip": 0.01112932, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.04164886, + "balance_loss_mlp": 1.02185678, + "epoch": 0.6896738313542763, + "flos": 24328577426400.0, + "grad_norm": 2.746799752282894, + "language_loss": 0.78352982, + "learning_rate": 9.278334794344715e-07, + "loss": 0.80498141, + "num_input_tokens_seen": 247622110, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10345459, + "step": 11471, + "time_per_iteration": 3.9401159286499023 + }, + { + "auxiliary_loss_clip": 0.01115445, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.04092097, + "balance_loss_mlp": 1.02044606, + "epoch": 0.6897339546069442, + "flos": 26509595060160.0, + "grad_norm": 1.836076686024447, + "language_loss": 0.78561938, + "learning_rate": 9.275047298005232e-07, + "loss": 0.80709922, + "num_input_tokens_seen": 247641905, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12091064, + "step": 11472, + "time_per_iteration": 2.652211904525757 + }, + { + "auxiliary_loss_clip": 0.01113697, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.03961217, + "balance_loss_mlp": 1.02017212, + "epoch": 0.6897940778596122, + "flos": 24192458416800.0, + "grad_norm": 1.8203903436525517, + "language_loss": 0.76473945, + "learning_rate": 9.271760208357024e-07, + "loss": 0.78618908, + "num_input_tokens_seen": 247660945, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11083984, + "step": 11473, + "time_per_iteration": 2.6104202270507812 + }, + { + "auxiliary_loss_clip": 0.01120767, + "auxiliary_loss_mlp": 0.01035943, + "balance_loss_clip": 1.04296231, + "balance_loss_mlp": 1.02335453, + "epoch": 0.6898542011122801, + "flos": 21121605735840.0, + "grad_norm": 1.9890125720544352, + "language_loss": 0.75423694, + "learning_rate": 9.268473525524751e-07, + "loss": 0.77580404, + "num_input_tokens_seen": 247678395, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12597656, + "step": 11474, + "time_per_iteration": 2.7269446849823 + }, + { + "auxiliary_loss_clip": 0.01117283, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.04228878, + "balance_loss_mlp": 1.0215416, + "epoch": 0.6899143243649482, + "flos": 30205256844960.0, + "grad_norm": 1.820905650325317, + "language_loss": 0.74095249, + "learning_rate": 9.26518724963303e-07, + "loss": 0.76245743, + "num_input_tokens_seen": 247698380, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11669922, + "step": 11475, + "time_per_iteration": 2.6588826179504395 + }, + { + "auxiliary_loss_clip": 0.01116547, + "auxiliary_loss_mlp": 0.01031057, + "balance_loss_clip": 1.04074407, + "balance_loss_mlp": 1.01928484, + "epoch": 0.6899744476176161, + "flos": 21029684728320.0, + "grad_norm": 2.0301996489640968, + "language_loss": 0.88591605, + "learning_rate": 9.261901380806491e-07, + "loss": 0.90739214, + "num_input_tokens_seen": 247716370, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11767578, + "step": 11476, + "time_per_iteration": 2.647932767868042 + }, + { + "auxiliary_loss_clip": 0.01115077, + "auxiliary_loss_mlp": 0.01034095, + "balance_loss_clip": 1.04077446, + "balance_loss_mlp": 1.02257323, + "epoch": 0.6900345708702841, + "flos": 31006533818880.0, + "grad_norm": 1.4538652926870932, + "language_loss": 0.70126569, + "learning_rate": 9.258615919169724e-07, + "loss": 0.7227574, + "num_input_tokens_seen": 247737335, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11529541, + "step": 11477, + "time_per_iteration": 2.6779918670654297 + }, + { + "auxiliary_loss_clip": 0.01121264, + "auxiliary_loss_mlp": 0.01038674, + "balance_loss_clip": 1.04244626, + "balance_loss_mlp": 1.0261029, + "epoch": 0.6900946941229521, + "flos": 28595247717600.0, + "grad_norm": 2.120761791304101, + "language_loss": 0.68501812, + "learning_rate": 9.255330864847313e-07, + "loss": 0.70661747, + "num_input_tokens_seen": 247756680, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12579346, + "step": 11478, + "time_per_iteration": 2.68829083442688 + }, + { + "auxiliary_loss_clip": 0.0111907, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.04269695, + "balance_loss_mlp": 1.01975155, + "epoch": 0.69015481737562, + "flos": 21744064837440.0, + "grad_norm": 2.057917803805536, + "language_loss": 0.76436567, + "learning_rate": 9.252046217963843e-07, + "loss": 0.78586781, + "num_input_tokens_seen": 247774265, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.1138916, + "step": 11479, + "time_per_iteration": 2.589125633239746 + }, + { + "auxiliary_loss_clip": 0.01119208, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.0429076, + "balance_loss_mlp": 1.01708829, + "epoch": 0.690214940628288, + "flos": 21300261539040.0, + "grad_norm": 5.148423862786902, + "language_loss": 0.78547412, + "learning_rate": 9.248761978643856e-07, + "loss": 0.80696076, + "num_input_tokens_seen": 247792395, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12365723, + "step": 11480, + "time_per_iteration": 2.6403048038482666 + }, + { + "auxiliary_loss_clip": 0.01117103, + "auxiliary_loss_mlp": 0.0102727, + "balance_loss_clip": 1.04323244, + "balance_loss_mlp": 1.01497364, + "epoch": 0.6902750638809559, + "flos": 36078330225600.0, + "grad_norm": 2.796736254068018, + "language_loss": 0.75580621, + "learning_rate": 9.245478147011885e-07, + "loss": 0.77724999, + "num_input_tokens_seen": 247811985, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12310791, + "step": 11481, + "time_per_iteration": 2.7270994186401367 + }, + { + "auxiliary_loss_clip": 0.01115018, + "auxiliary_loss_mlp": 0.01029906, + "balance_loss_clip": 1.04056573, + "balance_loss_mlp": 1.01811612, + "epoch": 0.690335187133624, + "flos": 31185189622080.0, + "grad_norm": 1.8625305241973367, + "language_loss": 0.68852055, + "learning_rate": 9.24219472319246e-07, + "loss": 0.70996976, + "num_input_tokens_seen": 247831880, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11798096, + "step": 11482, + "time_per_iteration": 2.666243553161621 + }, + { + "auxiliary_loss_clip": 0.01116763, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.04169679, + "balance_loss_mlp": 1.01833832, + "epoch": 0.6903953103862919, + "flos": 27440913451680.0, + "grad_norm": 2.4511819033360114, + "language_loss": 0.82746935, + "learning_rate": 9.238911707310096e-07, + "loss": 0.84893274, + "num_input_tokens_seen": 247851170, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11242676, + "step": 11483, + "time_per_iteration": 2.650461196899414 + }, + { + "auxiliary_loss_clip": 0.01116383, + "auxiliary_loss_mlp": 0.01031383, + "balance_loss_clip": 1.04025674, + "balance_loss_mlp": 1.01995087, + "epoch": 0.6904554336389599, + "flos": 31848645860640.0, + "grad_norm": 2.0274534036034053, + "language_loss": 0.65336919, + "learning_rate": 9.235629099489273e-07, + "loss": 0.67484683, + "num_input_tokens_seen": 247868950, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11444092, + "step": 11484, + "time_per_iteration": 2.660215377807617 + }, + { + "auxiliary_loss_clip": 0.01113064, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.03946829, + "balance_loss_mlp": 1.02449369, + "epoch": 0.6905155568916278, + "flos": 38036980261440.0, + "grad_norm": 2.27989987124702, + "language_loss": 0.736458, + "learning_rate": 9.232346899854479e-07, + "loss": 0.75794834, + "num_input_tokens_seen": 247889805, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11468506, + "step": 11485, + "time_per_iteration": 2.731595754623413 + }, + { + "auxiliary_loss_clip": 0.0111781, + "auxiliary_loss_mlp": 0.01036898, + "balance_loss_clip": 1.04201984, + "balance_loss_mlp": 1.02469134, + "epoch": 0.6905756801442958, + "flos": 21523723103520.0, + "grad_norm": 1.7948476845392365, + "language_loss": 0.84919608, + "learning_rate": 9.22906510853017e-07, + "loss": 0.87074316, + "num_input_tokens_seen": 247908585, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12200928, + "step": 11486, + "time_per_iteration": 2.61350679397583 + }, + { + "auxiliary_loss_clip": 0.01116358, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.04128051, + "balance_loss_mlp": 1.02518344, + "epoch": 0.6906358033969637, + "flos": 27264161960640.0, + "grad_norm": 1.6381672612443716, + "language_loss": 0.72662431, + "learning_rate": 9.225783725640786e-07, + "loss": 0.74815941, + "num_input_tokens_seen": 247928480, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11968994, + "step": 11487, + "time_per_iteration": 2.7091782093048096 + }, + { + "auxiliary_loss_clip": 0.01034015, + "auxiliary_loss_mlp": 0.0100386, + "balance_loss_clip": 1.01064491, + "balance_loss_mlp": 1.00268698, + "epoch": 0.6906959266496318, + "flos": 85106086352640.0, + "grad_norm": 0.8886289426481915, + "language_loss": 0.66670984, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68708861, + "num_input_tokens_seen": 247988855, + "router_z_loss_clip": 0.23364258, + "router_z_loss_mlp": 0.01171875, + "step": 11488, + "time_per_iteration": 3.2591466903686523 + }, + { + "auxiliary_loss_clip": 0.0112036, + "auxiliary_loss_mlp": 0.01033602, + "balance_loss_clip": 1.04113233, + "balance_loss_mlp": 1.02039409, + "epoch": 0.6907560499022997, + "flos": 26154147248640.0, + "grad_norm": 1.9762654092809744, + "language_loss": 0.7510379, + "learning_rate": 9.219222185664519e-07, + "loss": 0.77257752, + "num_input_tokens_seen": 248007685, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13214111, + "step": 11489, + "time_per_iteration": 2.67834210395813 + }, + { + "auxiliary_loss_clip": 0.01118468, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.04154801, + "balance_loss_mlp": 1.02314031, + "epoch": 0.6908161731549677, + "flos": 17560928476800.0, + "grad_norm": 2.3409520129605377, + "language_loss": 0.6238479, + "learning_rate": 9.215942028826445e-07, + "loss": 0.64539003, + "num_input_tokens_seen": 248025145, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12609863, + "step": 11490, + "time_per_iteration": 2.621415138244629 + }, + { + "auxiliary_loss_clip": 0.01115678, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.03972435, + "balance_loss_mlp": 1.0182575, + "epoch": 0.6908762964076357, + "flos": 24417378603360.0, + "grad_norm": 2.294834097016354, + "language_loss": 0.72835773, + "learning_rate": 9.212662280920937e-07, + "loss": 0.74981344, + "num_input_tokens_seen": 248043750, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11657715, + "step": 11491, + "time_per_iteration": 2.687647819519043 + }, + { + "auxiliary_loss_clip": 0.01115434, + "auxiliary_loss_mlp": 0.01041238, + "balance_loss_clip": 1.04065132, + "balance_loss_mlp": 1.02935243, + "epoch": 0.6909364196603036, + "flos": 35104556075040.0, + "grad_norm": 1.459963562511414, + "language_loss": 0.7056396, + "learning_rate": 9.20938294207235e-07, + "loss": 0.72720635, + "num_input_tokens_seen": 248065765, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11877441, + "step": 11492, + "time_per_iteration": 2.7047579288482666 + }, + { + "auxiliary_loss_clip": 0.01121529, + "auxiliary_loss_mlp": 0.01036868, + "balance_loss_clip": 1.04269469, + "balance_loss_mlp": 1.02431583, + "epoch": 0.6909965429129716, + "flos": 27489041629920.0, + "grad_norm": 2.0524101014168377, + "language_loss": 0.74402273, + "learning_rate": 9.206104012405049e-07, + "loss": 0.76560676, + "num_input_tokens_seen": 248083810, + "router_z_loss_clip": 0.7890625, + "router_z_loss_mlp": 0.12548828, + "step": 11493, + "time_per_iteration": 2.643944501876831 + }, + { + "auxiliary_loss_clip": 0.01117297, + "auxiliary_loss_mlp": 0.01030566, + "balance_loss_clip": 1.04269743, + "balance_loss_mlp": 1.01894927, + "epoch": 0.6910566661656395, + "flos": 22458769084800.0, + "grad_norm": 2.0046235436460367, + "language_loss": 0.74626207, + "learning_rate": 9.20282549204336e-07, + "loss": 0.76774067, + "num_input_tokens_seen": 248103185, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.1161499, + "step": 11494, + "time_per_iteration": 2.616917133331299 + }, + { + "auxiliary_loss_clip": 0.01113783, + "auxiliary_loss_mlp": 0.01029755, + "balance_loss_clip": 1.04012394, + "balance_loss_mlp": 1.01804841, + "epoch": 0.6911167894183076, + "flos": 37551247928640.0, + "grad_norm": 1.7021527612776384, + "language_loss": 0.68285823, + "learning_rate": 9.19954738111161e-07, + "loss": 0.70429367, + "num_input_tokens_seen": 248125665, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11706543, + "step": 11495, + "time_per_iteration": 2.810861110687256 + }, + { + "auxiliary_loss_clip": 0.01116088, + "auxiliary_loss_mlp": 0.01027146, + "balance_loss_clip": 1.04071188, + "balance_loss_mlp": 1.01512957, + "epoch": 0.6911769126709755, + "flos": 16759813572000.0, + "grad_norm": 2.185186613468139, + "language_loss": 0.73963618, + "learning_rate": 9.196269679734119e-07, + "loss": 0.76106858, + "num_input_tokens_seen": 248142545, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12011719, + "step": 11496, + "time_per_iteration": 2.6263365745544434 + }, + { + "auxiliary_loss_clip": 0.01114597, + "auxiliary_loss_mlp": 0.01029075, + "balance_loss_clip": 1.04027581, + "balance_loss_mlp": 1.01772034, + "epoch": 0.6912370359236435, + "flos": 21434840892000.0, + "grad_norm": 1.7312376488071772, + "language_loss": 0.8020668, + "learning_rate": 9.19299238803515e-07, + "loss": 0.82350349, + "num_input_tokens_seen": 248160225, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11358643, + "step": 11497, + "time_per_iteration": 2.5689210891723633 + }, + { + "auxiliary_loss_clip": 0.01119493, + "auxiliary_loss_mlp": 0.01033979, + "balance_loss_clip": 1.04198623, + "balance_loss_mlp": 1.02207041, + "epoch": 0.6912971591763114, + "flos": 26955302670720.0, + "grad_norm": 1.646354385867694, + "language_loss": 0.80837107, + "learning_rate": 9.189715506138993e-07, + "loss": 0.82990581, + "num_input_tokens_seen": 248180430, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.11907959, + "step": 11498, + "time_per_iteration": 2.6553609371185303 + }, + { + "auxiliary_loss_clip": 0.01112637, + "auxiliary_loss_mlp": 0.01033718, + "balance_loss_clip": 1.0397948, + "balance_loss_mlp": 1.02174377, + "epoch": 0.6913572824289794, + "flos": 36571963428000.0, + "grad_norm": 1.8567886618807785, + "language_loss": 0.85880959, + "learning_rate": 9.186439034169915e-07, + "loss": 0.88027322, + "num_input_tokens_seen": 248202365, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11981201, + "step": 11499, + "time_per_iteration": 4.161834001541138 + }, + { + "auxiliary_loss_clip": 0.01113216, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.04120922, + "balance_loss_mlp": 1.02029252, + "epoch": 0.6914174056816473, + "flos": 24951806356320.0, + "grad_norm": 2.063065014307985, + "language_loss": 0.75611961, + "learning_rate": 9.183162972252145e-07, + "loss": 0.7775718, + "num_input_tokens_seen": 248221750, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11700439, + "step": 11500, + "time_per_iteration": 2.679208278656006 + }, + { + "auxiliary_loss_clip": 0.01118727, + "auxiliary_loss_mlp": 0.01028143, + "balance_loss_clip": 1.04331839, + "balance_loss_mlp": 1.01653767, + "epoch": 0.6914775289343154, + "flos": 25969535405280.0, + "grad_norm": 1.939494963024516, + "language_loss": 0.77110362, + "learning_rate": 9.179887320509921e-07, + "loss": 0.79257232, + "num_input_tokens_seen": 248239535, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11608887, + "step": 11501, + "time_per_iteration": 4.0873637199401855 + }, + { + "auxiliary_loss_clip": 0.01118009, + "auxiliary_loss_mlp": 0.01038162, + "balance_loss_clip": 1.04120111, + "balance_loss_mlp": 1.0256803, + "epoch": 0.6915376521869833, + "flos": 29136401339040.0, + "grad_norm": 1.968128682554595, + "language_loss": 0.73752302, + "learning_rate": 9.176612079067458e-07, + "loss": 0.75908476, + "num_input_tokens_seen": 248259055, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12475586, + "step": 11502, + "time_per_iteration": 2.633044481277466 + }, + { + "auxiliary_loss_clip": 0.0111922, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.04174948, + "balance_loss_mlp": 1.02167106, + "epoch": 0.6915977754396513, + "flos": 14044125081600.0, + "grad_norm": 2.0035630618131446, + "language_loss": 0.73545766, + "learning_rate": 9.173337248048953e-07, + "loss": 0.7569921, + "num_input_tokens_seen": 248276765, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12548828, + "step": 11503, + "time_per_iteration": 2.6768991947174072 + }, + { + "auxiliary_loss_clip": 0.01117542, + "auxiliary_loss_mlp": 0.01031444, + "balance_loss_clip": 1.04237473, + "balance_loss_mlp": 1.0197736, + "epoch": 0.6916578986923193, + "flos": 27578288496960.0, + "grad_norm": 1.896156552599378, + "language_loss": 0.77220315, + "learning_rate": 9.170062827578575e-07, + "loss": 0.79369295, + "num_input_tokens_seen": 248295310, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11676025, + "step": 11504, + "time_per_iteration": 2.7065963745117188 + }, + { + "auxiliary_loss_clip": 0.01115912, + "auxiliary_loss_mlp": 0.01027442, + "balance_loss_clip": 1.04103398, + "balance_loss_mlp": 1.01559305, + "epoch": 0.6917180219449872, + "flos": 28647306072000.0, + "grad_norm": 1.8523807323855286, + "language_loss": 0.73724771, + "learning_rate": 9.166788817780499e-07, + "loss": 0.75868124, + "num_input_tokens_seen": 248315230, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11846924, + "step": 11505, + "time_per_iteration": 2.6686313152313232 + }, + { + "auxiliary_loss_clip": 0.01114601, + "auxiliary_loss_mlp": 0.01034745, + "balance_loss_clip": 1.04066408, + "balance_loss_mlp": 1.02256179, + "epoch": 0.6917781451976552, + "flos": 28958717950560.0, + "grad_norm": 2.1432796606122095, + "language_loss": 0.8755554, + "learning_rate": 9.163515218778886e-07, + "loss": 0.89704883, + "num_input_tokens_seen": 248332980, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12194824, + "step": 11506, + "time_per_iteration": 2.6347815990448 + }, + { + "auxiliary_loss_clip": 0.01114308, + "auxiliary_loss_mlp": 0.01025326, + "balance_loss_clip": 1.04101276, + "balance_loss_mlp": 1.01393569, + "epoch": 0.6918382684503231, + "flos": 38396682387360.0, + "grad_norm": 2.404875304422641, + "language_loss": 0.69812822, + "learning_rate": 9.160242030697856e-07, + "loss": 0.71952456, + "num_input_tokens_seen": 248352865, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.1138916, + "step": 11507, + "time_per_iteration": 2.7230570316314697 + }, + { + "auxiliary_loss_clip": 0.01116297, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.03930521, + "balance_loss_mlp": 1.02114201, + "epoch": 0.6918983917029912, + "flos": 26418241294560.0, + "grad_norm": 1.903721554531631, + "language_loss": 0.7734552, + "learning_rate": 9.156969253661538e-07, + "loss": 0.79494762, + "num_input_tokens_seen": 248371125, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11804199, + "step": 11508, + "time_per_iteration": 2.654860734939575 + }, + { + "auxiliary_loss_clip": 0.01112396, + "auxiliary_loss_mlp": 0.0102906, + "balance_loss_clip": 1.04144001, + "balance_loss_mlp": 1.01808727, + "epoch": 0.6919585149556591, + "flos": 31177248235200.0, + "grad_norm": 2.039915230970794, + "language_loss": 0.75055701, + "learning_rate": 9.153696887794027e-07, + "loss": 0.77197158, + "num_input_tokens_seen": 248390455, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10980225, + "step": 11509, + "time_per_iteration": 4.14886212348938 + }, + { + "auxiliary_loss_clip": 0.011168, + "auxiliary_loss_mlp": 0.01032371, + "balance_loss_clip": 1.04277706, + "balance_loss_mlp": 1.02102172, + "epoch": 0.6920186382083271, + "flos": 28869916773600.0, + "grad_norm": 1.5639721267644302, + "language_loss": 0.64124554, + "learning_rate": 9.150424933219425e-07, + "loss": 0.66273725, + "num_input_tokens_seen": 248411305, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11346436, + "step": 11510, + "time_per_iteration": 2.719867706298828 + }, + { + "auxiliary_loss_clip": 0.01121284, + "auxiliary_loss_mlp": 0.01032071, + "balance_loss_clip": 1.04293704, + "balance_loss_mlp": 1.01938105, + "epoch": 0.692078761460995, + "flos": 23259357264960.0, + "grad_norm": 1.94561462570418, + "language_loss": 0.75313342, + "learning_rate": 9.147153390061788e-07, + "loss": 0.77466697, + "num_input_tokens_seen": 248430190, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12701416, + "step": 11511, + "time_per_iteration": 3.875725746154785 + }, + { + "auxiliary_loss_clip": 0.01115131, + "auxiliary_loss_mlp": 0.01032068, + "balance_loss_clip": 1.04180849, + "balance_loss_mlp": 1.02131534, + "epoch": 0.692138884713663, + "flos": 35414873987040.0, + "grad_norm": 1.6201962655454873, + "language_loss": 0.62310457, + "learning_rate": 9.143882258445184e-07, + "loss": 0.64457655, + "num_input_tokens_seen": 248450830, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.10760498, + "step": 11512, + "time_per_iteration": 2.7032527923583984 + }, + { + "auxiliary_loss_clip": 0.01116942, + "auxiliary_loss_mlp": 0.0103211, + "balance_loss_clip": 1.0405674, + "balance_loss_mlp": 1.02024889, + "epoch": 0.6921990079663309, + "flos": 18007041260160.0, + "grad_norm": 1.8484922176147405, + "language_loss": 0.82789463, + "learning_rate": 9.140611538493666e-07, + "loss": 0.84938514, + "num_input_tokens_seen": 248468585, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.1184082, + "step": 11513, + "time_per_iteration": 2.6392812728881836 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01031732, + "balance_loss_clip": 1.04193699, + "balance_loss_mlp": 1.02071106, + "epoch": 0.692259131218999, + "flos": 29092122302400.0, + "grad_norm": 1.711637198202488, + "language_loss": 0.78400135, + "learning_rate": 9.137341230331233e-07, + "loss": 0.80546588, + "num_input_tokens_seen": 248490535, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11016846, + "step": 11514, + "time_per_iteration": 2.6423463821411133 + }, + { + "auxiliary_loss_clip": 0.01117353, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.0403409, + "balance_loss_mlp": 1.01784742, + "epoch": 0.6923192544716669, + "flos": 23348563614720.0, + "grad_norm": 11.427648408785794, + "language_loss": 0.74680489, + "learning_rate": 9.134071334081907e-07, + "loss": 0.76827586, + "num_input_tokens_seen": 248508575, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.11901855, + "step": 11515, + "time_per_iteration": 2.64958119392395 + }, + { + "auxiliary_loss_clip": 0.01113004, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.04104686, + "balance_loss_mlp": 1.01743865, + "epoch": 0.6923793777243349, + "flos": 34257257821440.0, + "grad_norm": 3.2791212107920176, + "language_loss": 0.53774828, + "learning_rate": 9.130801849869694e-07, + "loss": 0.55916393, + "num_input_tokens_seen": 248527025, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11126709, + "step": 11516, + "time_per_iteration": 2.685434579849243 + }, + { + "auxiliary_loss_clip": 0.01111829, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.04105115, + "balance_loss_mlp": 1.01740813, + "epoch": 0.6924395009770029, + "flos": 20232175861440.0, + "grad_norm": 1.7088237587411423, + "language_loss": 0.73117918, + "learning_rate": 9.127532777818557e-07, + "loss": 0.75258744, + "num_input_tokens_seen": 248544275, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11602783, + "step": 11517, + "time_per_iteration": 2.5954747200012207 + }, + { + "auxiliary_loss_clip": 0.0111714, + "auxiliary_loss_mlp": 0.01036072, + "balance_loss_clip": 1.04168987, + "balance_loss_mlp": 1.02406192, + "epoch": 0.6924996242296708, + "flos": 20322435660480.0, + "grad_norm": 1.9615859482513698, + "language_loss": 0.76866841, + "learning_rate": 9.124264118052465e-07, + "loss": 0.79020053, + "num_input_tokens_seen": 248561870, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12011719, + "step": 11518, + "time_per_iteration": 2.583205461502075 + }, + { + "auxiliary_loss_clip": 0.01119808, + "auxiliary_loss_mlp": 0.0103157, + "balance_loss_clip": 1.04224086, + "balance_loss_mlp": 1.01899314, + "epoch": 0.6925597474823388, + "flos": 42407888813280.0, + "grad_norm": 1.4263179352800166, + "language_loss": 0.64640987, + "learning_rate": 9.120995870695376e-07, + "loss": 0.66792363, + "num_input_tokens_seen": 248588190, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12573242, + "step": 11519, + "time_per_iteration": 2.7616562843322754 + }, + { + "auxiliary_loss_clip": 0.01115384, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.04066825, + "balance_loss_mlp": 1.02535868, + "epoch": 0.6926198707350067, + "flos": 26688007759680.0, + "grad_norm": 2.878362467880682, + "language_loss": 0.62425715, + "learning_rate": 9.117728035871212e-07, + "loss": 0.64578032, + "num_input_tokens_seen": 248606460, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11578369, + "step": 11520, + "time_per_iteration": 2.6314268112182617 + }, + { + "auxiliary_loss_clip": 0.01123037, + "auxiliary_loss_mlp": 0.01030122, + "balance_loss_clip": 1.04283547, + "balance_loss_mlp": 1.01758075, + "epoch": 0.6926799939876748, + "flos": 15869654386560.0, + "grad_norm": 2.1781464637858274, + "language_loss": 0.77946413, + "learning_rate": 9.114460613703887e-07, + "loss": 0.80099565, + "num_input_tokens_seen": 248623715, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12554932, + "step": 11521, + "time_per_iteration": 2.6175200939178467 + }, + { + "auxiliary_loss_clip": 0.01120043, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.0414263, + "balance_loss_mlp": 1.01808822, + "epoch": 0.6927401172403427, + "flos": 20452031388000.0, + "grad_norm": 1.797879596736714, + "language_loss": 0.82276702, + "learning_rate": 9.111193604317304e-07, + "loss": 0.84427094, + "num_input_tokens_seen": 248640575, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.12249756, + "step": 11522, + "time_per_iteration": 2.6100919246673584 + }, + { + "auxiliary_loss_clip": 0.01118604, + "auxiliary_loss_mlp": 0.01030651, + "balance_loss_clip": 1.04436135, + "balance_loss_mlp": 1.01934433, + "epoch": 0.6928002404930107, + "flos": 31366195427520.0, + "grad_norm": 1.6798283653897224, + "language_loss": 0.76947522, + "learning_rate": 9.107927007835361e-07, + "loss": 0.79096776, + "num_input_tokens_seen": 248663535, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11303711, + "step": 11523, + "time_per_iteration": 2.6673927307128906 + }, + { + "auxiliary_loss_clip": 0.01113388, + "auxiliary_loss_mlp": 0.01030555, + "balance_loss_clip": 1.04054475, + "balance_loss_mlp": 1.01974297, + "epoch": 0.6928603637456786, + "flos": 22681258234560.0, + "grad_norm": 2.2387384560346337, + "language_loss": 0.68470007, + "learning_rate": 9.104660824381915e-07, + "loss": 0.70613945, + "num_input_tokens_seen": 248681125, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.1081543, + "step": 11524, + "time_per_iteration": 2.6858184337615967 + }, + { + "auxiliary_loss_clip": 0.01116028, + "auxiliary_loss_mlp": 0.01030254, + "balance_loss_clip": 1.04032111, + "balance_loss_mlp": 1.01749873, + "epoch": 0.6929204869983466, + "flos": 27089598402720.0, + "grad_norm": 2.042037624205232, + "language_loss": 0.64380836, + "learning_rate": 9.101395054080815e-07, + "loss": 0.66527122, + "num_input_tokens_seen": 248700555, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12756348, + "step": 11525, + "time_per_iteration": 2.633944034576416 + }, + { + "auxiliary_loss_clip": 0.01119026, + "auxiliary_loss_mlp": 0.01038833, + "balance_loss_clip": 1.0442667, + "balance_loss_mlp": 1.02725744, + "epoch": 0.6929806102510145, + "flos": 21835013430240.0, + "grad_norm": 2.2489878442497653, + "language_loss": 0.70311081, + "learning_rate": 9.098129697055907e-07, + "loss": 0.72468942, + "num_input_tokens_seen": 248716095, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11584473, + "step": 11526, + "time_per_iteration": 2.5983452796936035 + }, + { + "auxiliary_loss_clip": 0.01113807, + "auxiliary_loss_mlp": 0.01028134, + "balance_loss_clip": 1.04071903, + "balance_loss_mlp": 1.01707101, + "epoch": 0.6930407335036826, + "flos": 24105277931040.0, + "grad_norm": 1.5952450593186724, + "language_loss": 0.76477152, + "learning_rate": 9.094864753431022e-07, + "loss": 0.78619093, + "num_input_tokens_seen": 248735330, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.1105957, + "step": 11527, + "time_per_iteration": 2.603560209274292 + }, + { + "auxiliary_loss_clip": 0.01114505, + "auxiliary_loss_mlp": 0.01026118, + "balance_loss_clip": 1.0410403, + "balance_loss_mlp": 1.01537168, + "epoch": 0.6931008567563505, + "flos": 26288321428800.0, + "grad_norm": 1.6331430075113926, + "language_loss": 0.79945415, + "learning_rate": 9.091600223329952e-07, + "loss": 0.82086039, + "num_input_tokens_seen": 248754530, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10748291, + "step": 11528, + "time_per_iteration": 2.6491963863372803 + }, + { + "auxiliary_loss_clip": 0.01112727, + "auxiliary_loss_mlp": 0.01031026, + "balance_loss_clip": 1.04177856, + "balance_loss_mlp": 1.01984417, + "epoch": 0.6931609800090185, + "flos": 32030745632640.0, + "grad_norm": 1.8087074942561032, + "language_loss": 0.76210719, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78354472, + "num_input_tokens_seen": 248775825, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11175537, + "step": 11529, + "time_per_iteration": 2.653275966644287 + }, + { + "auxiliary_loss_clip": 0.01113284, + "auxiliary_loss_mlp": 0.01032518, + "balance_loss_clip": 1.04156923, + "balance_loss_mlp": 1.02072251, + "epoch": 0.6932211032616865, + "flos": 39466307721600.0, + "grad_norm": 3.386028393711648, + "language_loss": 0.71983969, + "learning_rate": 9.085072404194436e-07, + "loss": 0.74129772, + "num_input_tokens_seen": 248796180, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11798096, + "step": 11530, + "time_per_iteration": 2.740816831588745 + }, + { + "auxiliary_loss_clip": 0.01123566, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.04429138, + "balance_loss_mlp": 1.02072978, + "epoch": 0.6932812265143544, + "flos": 26904905524800.0, + "grad_norm": 2.0533046639170855, + "language_loss": 0.78339171, + "learning_rate": 9.081809115407513e-07, + "loss": 0.80496711, + "num_input_tokens_seen": 248814735, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13238525, + "step": 11531, + "time_per_iteration": 2.6092326641082764 + }, + { + "auxiliary_loss_clip": 0.01113849, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.04148376, + "balance_loss_mlp": 1.01960039, + "epoch": 0.6933413497670224, + "flos": 32038200812160.0, + "grad_norm": 2.9724975149035378, + "language_loss": 0.69178814, + "learning_rate": 9.078546240639484e-07, + "loss": 0.71322513, + "num_input_tokens_seen": 248839140, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.10241699, + "step": 11532, + "time_per_iteration": 2.730666399002075 + }, + { + "auxiliary_loss_clip": 0.01117175, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.04177523, + "balance_loss_mlp": 1.01691401, + "epoch": 0.6934014730196904, + "flos": 23882950850400.0, + "grad_norm": 1.5258981213386507, + "language_loss": 0.67107213, + "learning_rate": 9.075283780014082e-07, + "loss": 0.69253403, + "num_input_tokens_seen": 248858300, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12091064, + "step": 11533, + "time_per_iteration": 2.60685396194458 + }, + { + "auxiliary_loss_clip": 0.01119237, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.04309106, + "balance_loss_mlp": 1.0192802, + "epoch": 0.6934615962723584, + "flos": 26990789457600.0, + "grad_norm": 3.707603067017058, + "language_loss": 0.58304322, + "learning_rate": 9.072021733655007e-07, + "loss": 0.60454965, + "num_input_tokens_seen": 248876310, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12127686, + "step": 11534, + "time_per_iteration": 2.632432699203491 + }, + { + "auxiliary_loss_clip": 0.01115554, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.04103649, + "balance_loss_mlp": 1.0193994, + "epoch": 0.6935217195250263, + "flos": 26064576243360.0, + "grad_norm": 2.1195283648562424, + "language_loss": 0.70953941, + "learning_rate": 9.068760101685971e-07, + "loss": 0.73100603, + "num_input_tokens_seen": 248895650, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11694336, + "step": 11535, + "time_per_iteration": 2.6036672592163086 + }, + { + "auxiliary_loss_clip": 0.01035492, + "auxiliary_loss_mlp": 0.01002581, + "balance_loss_clip": 1.01221871, + "balance_loss_mlp": 1.00139666, + "epoch": 0.6935818427776943, + "flos": 78169343677920.0, + "grad_norm": 0.714124182397666, + "language_loss": 0.59035176, + "learning_rate": 9.065498884230638e-07, + "loss": 0.6107325, + "num_input_tokens_seen": 248963920, + "router_z_loss_clip": 0.23291016, + "router_z_loss_mlp": 0.01184082, + "step": 11536, + "time_per_iteration": 3.360724687576294 + }, + { + "auxiliary_loss_clip": 0.01121708, + "auxiliary_loss_mlp": 0.01031911, + "balance_loss_clip": 1.04402852, + "balance_loss_mlp": 1.01954269, + "epoch": 0.6936419660303622, + "flos": 24773231587680.0, + "grad_norm": 2.55727191919697, + "language_loss": 0.72717953, + "learning_rate": 9.062238081412692e-07, + "loss": 0.74871576, + "num_input_tokens_seen": 248983380, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12384033, + "step": 11537, + "time_per_iteration": 2.6187803745269775 + }, + { + "auxiliary_loss_clip": 0.01035585, + "auxiliary_loss_mlp": 0.01001671, + "balance_loss_clip": 1.01225019, + "balance_loss_mlp": 1.00049424, + "epoch": 0.6937020892830302, + "flos": 81976854621600.0, + "grad_norm": 0.7507475217979882, + "language_loss": 0.5553081, + "learning_rate": 9.058977693355767e-07, + "loss": 0.57568067, + "num_input_tokens_seen": 249044680, + "router_z_loss_clip": 0.23352051, + "router_z_loss_mlp": 0.01176453, + "step": 11538, + "time_per_iteration": 4.687795162200928 + }, + { + "auxiliary_loss_clip": 0.01111612, + "auxiliary_loss_mlp": 0.01032558, + "balance_loss_clip": 1.04125571, + "balance_loss_mlp": 1.02169859, + "epoch": 0.6937622125356981, + "flos": 29136482373600.0, + "grad_norm": 1.850312145390749, + "language_loss": 0.7763043, + "learning_rate": 9.055717720183505e-07, + "loss": 0.797746, + "num_input_tokens_seen": 249061060, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10845947, + "step": 11539, + "time_per_iteration": 2.661590337753296 + }, + { + "auxiliary_loss_clip": 0.01116562, + "auxiliary_loss_mlp": 0.01028668, + "balance_loss_clip": 1.0431788, + "balance_loss_mlp": 1.01789141, + "epoch": 0.6938223357883662, + "flos": 35057805484320.0, + "grad_norm": 3.1720438506743225, + "language_loss": 0.64003336, + "learning_rate": 9.05245816201953e-07, + "loss": 0.66148561, + "num_input_tokens_seen": 249081430, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.10778809, + "step": 11540, + "time_per_iteration": 2.6856091022491455 + }, + { + "auxiliary_loss_clip": 0.0111435, + "auxiliary_loss_mlp": 0.01028958, + "balance_loss_clip": 1.04153633, + "balance_loss_mlp": 1.01772857, + "epoch": 0.6938824590410341, + "flos": 34965803442240.0, + "grad_norm": 1.5417142684796568, + "language_loss": 0.86883485, + "learning_rate": 9.049199018987437e-07, + "loss": 0.89026797, + "num_input_tokens_seen": 249103020, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11230469, + "step": 11541, + "time_per_iteration": 4.117698431015015 + }, + { + "auxiliary_loss_clip": 0.01117196, + "auxiliary_loss_mlp": 0.01031438, + "balance_loss_clip": 1.04239154, + "balance_loss_mlp": 1.01999354, + "epoch": 0.6939425822937021, + "flos": 23163627633120.0, + "grad_norm": 2.098482298103812, + "language_loss": 0.84089577, + "learning_rate": 9.04594029121081e-07, + "loss": 0.86238205, + "num_input_tokens_seen": 249120810, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11437988, + "step": 11542, + "time_per_iteration": 2.649977922439575 + }, + { + "auxiliary_loss_clip": 0.01117315, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.04184532, + "balance_loss_mlp": 1.01815605, + "epoch": 0.6940027055463701, + "flos": 28157359942080.0, + "grad_norm": 2.051504431579933, + "language_loss": 0.75287944, + "learning_rate": 9.04268197881323e-07, + "loss": 0.7743572, + "num_input_tokens_seen": 249138050, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12316895, + "step": 11543, + "time_per_iteration": 2.6283628940582275 + }, + { + "auxiliary_loss_clip": 0.01114416, + "auxiliary_loss_mlp": 0.01034013, + "balance_loss_clip": 1.04136503, + "balance_loss_mlp": 1.02301562, + "epoch": 0.694062828799038, + "flos": 22191109518240.0, + "grad_norm": 2.1921520314370344, + "language_loss": 0.76094073, + "learning_rate": 9.039424081918241e-07, + "loss": 0.78242505, + "num_input_tokens_seen": 249155570, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.10986328, + "step": 11544, + "time_per_iteration": 2.5807065963745117 + }, + { + "auxiliary_loss_clip": 0.01120095, + "auxiliary_loss_mlp": 0.01031213, + "balance_loss_clip": 1.04418063, + "balance_loss_mlp": 1.02013254, + "epoch": 0.694122952051706, + "flos": 21745685528640.0, + "grad_norm": 1.8939929084007914, + "language_loss": 0.71136332, + "learning_rate": 9.036166600649388e-07, + "loss": 0.73287636, + "num_input_tokens_seen": 249172960, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11071777, + "step": 11545, + "time_per_iteration": 2.6294376850128174 + }, + { + "auxiliary_loss_clip": 0.01112524, + "auxiliary_loss_mlp": 0.01025637, + "balance_loss_clip": 1.04198325, + "balance_loss_mlp": 1.0154686, + "epoch": 0.694183075304374, + "flos": 25886973889440.0, + "grad_norm": 2.2679645167002187, + "language_loss": 0.79717076, + "learning_rate": 9.0329095351302e-07, + "loss": 0.81855237, + "num_input_tokens_seen": 249192450, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10174561, + "step": 11546, + "time_per_iteration": 2.6576831340789795 + }, + { + "auxiliary_loss_clip": 0.01117222, + "auxiliary_loss_mlp": 0.01029294, + "balance_loss_clip": 1.04334497, + "balance_loss_mlp": 1.01757574, + "epoch": 0.694243198557042, + "flos": 29358971523360.0, + "grad_norm": 1.4178902449474917, + "language_loss": 0.78574061, + "learning_rate": 9.029652885484194e-07, + "loss": 0.80720574, + "num_input_tokens_seen": 249214320, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11712646, + "step": 11547, + "time_per_iteration": 2.6929168701171875 + }, + { + "auxiliary_loss_clip": 0.01116915, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.04327583, + "balance_loss_mlp": 1.02275169, + "epoch": 0.6943033218097099, + "flos": 25797402884160.0, + "grad_norm": 2.6183569851773103, + "language_loss": 0.80952132, + "learning_rate": 9.026396651834834e-07, + "loss": 0.83103967, + "num_input_tokens_seen": 249230925, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12158203, + "step": 11548, + "time_per_iteration": 4.126654386520386 + }, + { + "auxiliary_loss_clip": 0.01034982, + "auxiliary_loss_mlp": 0.01001788, + "balance_loss_clip": 1.01172042, + "balance_loss_mlp": 1.0006249, + "epoch": 0.6943634450623779, + "flos": 70541593014240.0, + "grad_norm": 0.6897610684265518, + "language_loss": 0.53730404, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55767179, + "num_input_tokens_seen": 249293975, + "router_z_loss_clip": 0.23278809, + "router_z_loss_mlp": 0.01161194, + "step": 11549, + "time_per_iteration": 3.2821385860443115 + }, + { + "auxiliary_loss_clip": 0.01115255, + "auxiliary_loss_mlp": 0.01028467, + "balance_loss_clip": 1.04098535, + "balance_loss_mlp": 1.01681995, + "epoch": 0.6944235683150458, + "flos": 37327664812320.0, + "grad_norm": 1.4161597250549272, + "language_loss": 0.73676789, + "learning_rate": 9.01988543302e-07, + "loss": 0.75820512, + "num_input_tokens_seen": 249315285, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11657715, + "step": 11550, + "time_per_iteration": 3.9683263301849365 + }, + { + "auxiliary_loss_clip": 0.01118251, + "auxiliary_loss_mlp": 0.01032683, + "balance_loss_clip": 1.04258657, + "balance_loss_mlp": 1.02119756, + "epoch": 0.6944836915677138, + "flos": 24061606653600.0, + "grad_norm": 2.1649616503798166, + "language_loss": 0.73987305, + "learning_rate": 9.016630448101425e-07, + "loss": 0.76138234, + "num_input_tokens_seen": 249333505, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11474609, + "step": 11551, + "time_per_iteration": 2.6003451347351074 + }, + { + "auxiliary_loss_clip": 0.01116559, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.04227471, + "balance_loss_mlp": 1.02534902, + "epoch": 0.6945438148203817, + "flos": 30338904300480.0, + "grad_norm": 1.5399450025850348, + "language_loss": 0.84736252, + "learning_rate": 9.01337587967333e-07, + "loss": 0.86889803, + "num_input_tokens_seen": 249354180, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11639404, + "step": 11552, + "time_per_iteration": 2.645019292831421 + }, + { + "auxiliary_loss_clip": 0.01114753, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.04142833, + "balance_loss_mlp": 1.0210824, + "epoch": 0.6946039380730498, + "flos": 40666339128960.0, + "grad_norm": 1.722280997130254, + "language_loss": 0.67723119, + "learning_rate": 9.010121727859117e-07, + "loss": 0.69870472, + "num_input_tokens_seen": 249377035, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11529541, + "step": 11553, + "time_per_iteration": 2.7330193519592285 + }, + { + "auxiliary_loss_clip": 0.01122425, + "auxiliary_loss_mlp": 0.01031075, + "balance_loss_clip": 1.04400015, + "balance_loss_mlp": 1.01857567, + "epoch": 0.6946640613257177, + "flos": 25442319728160.0, + "grad_norm": 1.5842071028199614, + "language_loss": 0.7908656, + "learning_rate": 9.006867992782195e-07, + "loss": 0.81240058, + "num_input_tokens_seen": 249396155, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12512207, + "step": 11554, + "time_per_iteration": 2.690096616744995 + }, + { + "auxiliary_loss_clip": 0.01117893, + "auxiliary_loss_mlp": 0.01027535, + "balance_loss_clip": 1.04191291, + "balance_loss_mlp": 1.01583445, + "epoch": 0.6947241845783857, + "flos": 23616668871360.0, + "grad_norm": 1.8793117776204182, + "language_loss": 0.72548062, + "learning_rate": 9.003614674565934e-07, + "loss": 0.74693489, + "num_input_tokens_seen": 249414555, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11694336, + "step": 11555, + "time_per_iteration": 2.6141555309295654 + }, + { + "auxiliary_loss_clip": 0.01113943, + "auxiliary_loss_mlp": 0.01030511, + "balance_loss_clip": 1.03979278, + "balance_loss_mlp": 1.01954997, + "epoch": 0.6947843078310536, + "flos": 33093158891040.0, + "grad_norm": 1.6956904078992647, + "language_loss": 0.77966374, + "learning_rate": 9.000361773333705e-07, + "loss": 0.8011083, + "num_input_tokens_seen": 249433570, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.10961914, + "step": 11556, + "time_per_iteration": 2.6375374794006348 + }, + { + "auxiliary_loss_clip": 0.01114482, + "auxiliary_loss_mlp": 0.01034784, + "balance_loss_clip": 1.03993177, + "balance_loss_mlp": 1.02355433, + "epoch": 0.6948444310837216, + "flos": 34880203130400.0, + "grad_norm": 4.467597432958066, + "language_loss": 0.60981107, + "learning_rate": 8.997109289208869e-07, + "loss": 0.63130379, + "num_input_tokens_seen": 249453735, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11230469, + "step": 11557, + "time_per_iteration": 2.7107787132263184 + }, + { + "auxiliary_loss_clip": 0.01114144, + "auxiliary_loss_mlp": 0.01034724, + "balance_loss_clip": 1.04176354, + "balance_loss_mlp": 1.02340531, + "epoch": 0.6949045543363896, + "flos": 19119284422560.0, + "grad_norm": 1.8141985808021042, + "language_loss": 0.85363448, + "learning_rate": 8.993857222314752e-07, + "loss": 0.8751232, + "num_input_tokens_seen": 249470805, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11315918, + "step": 11558, + "time_per_iteration": 2.6323063373565674 + }, + { + "auxiliary_loss_clip": 0.01118782, + "auxiliary_loss_mlp": 0.01030594, + "balance_loss_clip": 1.04297233, + "balance_loss_mlp": 1.01833904, + "epoch": 0.6949646775890576, + "flos": 28381267196640.0, + "grad_norm": 1.532762663424574, + "language_loss": 0.70391333, + "learning_rate": 8.990605572774664e-07, + "loss": 0.72540712, + "num_input_tokens_seen": 249491150, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12268066, + "step": 11559, + "time_per_iteration": 2.670863628387451 + }, + { + "auxiliary_loss_clip": 0.01114996, + "auxiliary_loss_mlp": 0.01029689, + "balance_loss_clip": 1.04141486, + "balance_loss_mlp": 1.0182631, + "epoch": 0.6950248008417256, + "flos": 27311358241440.0, + "grad_norm": 1.4939661411377245, + "language_loss": 0.78672612, + "learning_rate": 8.987354340711921e-07, + "loss": 0.808173, + "num_input_tokens_seen": 249511560, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11425781, + "step": 11560, + "time_per_iteration": 2.6034963130950928 + }, + { + "auxiliary_loss_clip": 0.01115679, + "auxiliary_loss_mlp": 0.01035342, + "balance_loss_clip": 1.04249823, + "balance_loss_mlp": 1.02413678, + "epoch": 0.6950849240943935, + "flos": 28647306072000.0, + "grad_norm": 1.5451286520663918, + "language_loss": 0.76687413, + "learning_rate": 8.9841035262498e-07, + "loss": 0.78838438, + "num_input_tokens_seen": 249531910, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11199951, + "step": 11561, + "time_per_iteration": 2.619532346725464 + }, + { + "auxiliary_loss_clip": 0.01112617, + "auxiliary_loss_mlp": 0.01031749, + "balance_loss_clip": 1.03937638, + "balance_loss_mlp": 1.01933289, + "epoch": 0.6951450473470615, + "flos": 21256347157920.0, + "grad_norm": 1.8600961652003611, + "language_loss": 0.78635865, + "learning_rate": 8.980853129511577e-07, + "loss": 0.80780232, + "num_input_tokens_seen": 249550300, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12420654, + "step": 11562, + "time_per_iteration": 2.5880041122436523 + }, + { + "auxiliary_loss_clip": 0.01115951, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.04088652, + "balance_loss_mlp": 1.02303112, + "epoch": 0.6952051705997294, + "flos": 24996693152160.0, + "grad_norm": 2.70950698874215, + "language_loss": 0.69575047, + "learning_rate": 8.977603150620515e-07, + "loss": 0.71725464, + "num_input_tokens_seen": 249567740, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11437988, + "step": 11563, + "time_per_iteration": 2.62556529045105 + }, + { + "auxiliary_loss_clip": 0.0111262, + "auxiliary_loss_mlp": 0.01026426, + "balance_loss_clip": 1.04181433, + "balance_loss_mlp": 1.01562524, + "epoch": 0.6952652938523974, + "flos": 17069685793920.0, + "grad_norm": 2.0355811063444165, + "language_loss": 0.73388374, + "learning_rate": 8.974353589699846e-07, + "loss": 0.75527418, + "num_input_tokens_seen": 249582700, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10791016, + "step": 11564, + "time_per_iteration": 2.5852982997894287 + }, + { + "auxiliary_loss_clip": 0.01130242, + "auxiliary_loss_mlp": 0.01034086, + "balance_loss_clip": 1.04604006, + "balance_loss_mlp": 1.01998353, + "epoch": 0.6953254171050653, + "flos": 37771103455200.0, + "grad_norm": 3.57389869980693, + "language_loss": 0.72089159, + "learning_rate": 8.971104446872785e-07, + "loss": 0.74253488, + "num_input_tokens_seen": 249602920, + "router_z_loss_clip": 0.84228516, + "router_z_loss_mlp": 0.14099121, + "step": 11565, + "time_per_iteration": 2.7034895420074463 + }, + { + "auxiliary_loss_clip": 0.01035119, + "auxiliary_loss_mlp": 0.01000895, + "balance_loss_clip": 1.01186395, + "balance_loss_mlp": 0.9997108, + "epoch": 0.6953855403577334, + "flos": 75250567464480.0, + "grad_norm": 0.9132863500413975, + "language_loss": 0.58438635, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60474646, + "num_input_tokens_seen": 249660400, + "router_z_loss_clip": 0.23291016, + "router_z_loss_mlp": 0.01181793, + "step": 11566, + "time_per_iteration": 3.103910446166992 + }, + { + "auxiliary_loss_clip": 0.01119572, + "auxiliary_loss_mlp": 0.01031269, + "balance_loss_clip": 1.0418427, + "balance_loss_mlp": 1.01894855, + "epoch": 0.6954456636104013, + "flos": 28112473146240.0, + "grad_norm": 1.9378295003263921, + "language_loss": 0.74207526, + "learning_rate": 8.964607415992338e-07, + "loss": 0.76358372, + "num_input_tokens_seen": 249679335, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.12322998, + "step": 11567, + "time_per_iteration": 2.6378748416900635 + }, + { + "auxiliary_loss_clip": 0.01113227, + "auxiliary_loss_mlp": 0.01030813, + "balance_loss_clip": 1.04055405, + "balance_loss_mlp": 1.01886809, + "epoch": 0.6955057868630693, + "flos": 29185339862880.0, + "grad_norm": 1.3264337715726566, + "language_loss": 0.7710849, + "learning_rate": 8.961359528185313e-07, + "loss": 0.79252535, + "num_input_tokens_seen": 249701805, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11956787, + "step": 11568, + "time_per_iteration": 2.8055942058563232 + }, + { + "auxiliary_loss_clip": 0.01115827, + "auxiliary_loss_mlp": 0.01030842, + "balance_loss_clip": 1.04325891, + "balance_loss_mlp": 1.01987481, + "epoch": 0.6955659101157372, + "flos": 27570792800160.0, + "grad_norm": 1.9373439906670016, + "language_loss": 0.73124689, + "learning_rate": 8.958112058964649e-07, + "loss": 0.75271362, + "num_input_tokens_seen": 249720550, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10986328, + "step": 11569, + "time_per_iteration": 2.6497185230255127 + }, + { + "auxiliary_loss_clip": 0.01117071, + "auxiliary_loss_mlp": 0.01029504, + "balance_loss_clip": 1.04139388, + "balance_loss_mlp": 1.01767802, + "epoch": 0.6956260333684052, + "flos": 29983659075360.0, + "grad_norm": 1.4635517708974077, + "language_loss": 0.77056509, + "learning_rate": 8.954865008453471e-07, + "loss": 0.79203081, + "num_input_tokens_seen": 249740325, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11828613, + "step": 11570, + "time_per_iteration": 2.6434669494628906 + }, + { + "auxiliary_loss_clip": 0.01116949, + "auxiliary_loss_mlp": 0.01031384, + "balance_loss_clip": 1.04128695, + "balance_loss_mlp": 1.01956463, + "epoch": 0.6956861566210732, + "flos": 31538044327680.0, + "grad_norm": 2.0760101734027674, + "language_loss": 0.74609578, + "learning_rate": 8.95161837677493e-07, + "loss": 0.7675792, + "num_input_tokens_seen": 249760570, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11816406, + "step": 11571, + "time_per_iteration": 2.6339802742004395 + }, + { + "auxiliary_loss_clip": 0.011105, + "auxiliary_loss_mlp": 0.01032624, + "balance_loss_clip": 1.04032731, + "balance_loss_mlp": 1.02134728, + "epoch": 0.6957462798737412, + "flos": 18670659567840.0, + "grad_norm": 1.9221626228711763, + "language_loss": 0.74410486, + "learning_rate": 8.948372164052118e-07, + "loss": 0.76553607, + "num_input_tokens_seen": 249778290, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11273193, + "step": 11572, + "time_per_iteration": 2.5938503742218018 + }, + { + "auxiliary_loss_clip": 0.01113525, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.03928065, + "balance_loss_mlp": 1.01665509, + "epoch": 0.6958064031264092, + "flos": 44229650011200.0, + "grad_norm": 3.2195938881033688, + "language_loss": 0.69957072, + "learning_rate": 8.94512637040814e-07, + "loss": 0.72099119, + "num_input_tokens_seen": 249800925, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11877441, + "step": 11573, + "time_per_iteration": 2.7297885417938232 + }, + { + "auxiliary_loss_clip": 0.01120842, + "auxiliary_loss_mlp": 0.01037468, + "balance_loss_clip": 1.04413295, + "balance_loss_mlp": 1.02505791, + "epoch": 0.6958665263790771, + "flos": 23438377723680.0, + "grad_norm": 2.425708687836548, + "language_loss": 0.74525517, + "learning_rate": 8.941880995966095e-07, + "loss": 0.76683825, + "num_input_tokens_seen": 249820500, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12414551, + "step": 11574, + "time_per_iteration": 2.6279587745666504 + }, + { + "auxiliary_loss_clip": 0.01118357, + "auxiliary_loss_mlp": 0.01027731, + "balance_loss_clip": 1.0415709, + "balance_loss_mlp": 1.0164001, + "epoch": 0.6959266496317451, + "flos": 26595681579360.0, + "grad_norm": 1.5846063043644065, + "language_loss": 0.74456143, + "learning_rate": 8.938636040849014e-07, + "loss": 0.76602226, + "num_input_tokens_seen": 249839845, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.11340332, + "step": 11575, + "time_per_iteration": 2.609726905822754 + }, + { + "auxiliary_loss_clip": 0.01117335, + "auxiliary_loss_mlp": 0.01027967, + "balance_loss_clip": 1.04143381, + "balance_loss_mlp": 1.01546788, + "epoch": 0.695986772884413, + "flos": 25085210708160.0, + "grad_norm": 2.5287948503227633, + "language_loss": 0.78713256, + "learning_rate": 8.935391505179966e-07, + "loss": 0.80858564, + "num_input_tokens_seen": 249857400, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12493896, + "step": 11576, + "time_per_iteration": 2.6226625442504883 + }, + { + "auxiliary_loss_clip": 0.01118269, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.04053748, + "balance_loss_mlp": 1.02187634, + "epoch": 0.696046896137081, + "flos": 18225073509120.0, + "grad_norm": 2.221878466118558, + "language_loss": 0.57210606, + "learning_rate": 8.932147389081985e-07, + "loss": 0.59361994, + "num_input_tokens_seen": 249871645, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.11236572, + "step": 11577, + "time_per_iteration": 2.5848677158355713 + }, + { + "auxiliary_loss_clip": 0.01111435, + "auxiliary_loss_mlp": 0.01021827, + "balance_loss_clip": 1.04087877, + "balance_loss_mlp": 1.01178956, + "epoch": 0.696107019389749, + "flos": 37509805101600.0, + "grad_norm": 1.3426798168941898, + "language_loss": 0.76691413, + "learning_rate": 8.928903692678081e-07, + "loss": 0.78824675, + "num_input_tokens_seen": 249894215, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10040283, + "step": 11578, + "time_per_iteration": 4.140595197677612 + }, + { + "auxiliary_loss_clip": 0.01115967, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.04138577, + "balance_loss_mlp": 1.02149808, + "epoch": 0.696167142642417, + "flos": 25351857342720.0, + "grad_norm": 1.8222501616329758, + "language_loss": 0.79667294, + "learning_rate": 8.925660416091254e-07, + "loss": 0.81816256, + "num_input_tokens_seen": 249912850, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11499023, + "step": 11579, + "time_per_iteration": 2.675703763961792 + }, + { + "auxiliary_loss_clip": 0.01111034, + "auxiliary_loss_mlp": 0.01027789, + "balance_loss_clip": 1.03907108, + "balance_loss_mlp": 1.0160048, + "epoch": 0.6962272658950849, + "flos": 27534171529440.0, + "grad_norm": 1.743417349731438, + "language_loss": 0.72739363, + "learning_rate": 8.922417559444502e-07, + "loss": 0.74878186, + "num_input_tokens_seen": 249932650, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11791992, + "step": 11580, + "time_per_iteration": 3.9145867824554443 + }, + { + "auxiliary_loss_clip": 0.01117563, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.04108083, + "balance_loss_mlp": 1.01871455, + "epoch": 0.6962873891477529, + "flos": 27088788057120.0, + "grad_norm": 2.3653830682056656, + "language_loss": 0.65607202, + "learning_rate": 8.919175122860787e-07, + "loss": 0.67756075, + "num_input_tokens_seen": 249951205, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12591553, + "step": 11581, + "time_per_iteration": 2.6470179557800293 + }, + { + "auxiliary_loss_clip": 0.0111567, + "auxiliary_loss_mlp": 0.01029501, + "balance_loss_clip": 1.04135704, + "balance_loss_mlp": 1.01813984, + "epoch": 0.6963475124004208, + "flos": 15239821140000.0, + "grad_norm": 2.116365534113475, + "language_loss": 0.7664417, + "learning_rate": 8.915933106463056e-07, + "loss": 0.78789341, + "num_input_tokens_seen": 249967045, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11364746, + "step": 11582, + "time_per_iteration": 2.6090846061706543 + }, + { + "auxiliary_loss_clip": 0.0111464, + "auxiliary_loss_mlp": 0.01028637, + "balance_loss_clip": 1.04094696, + "balance_loss_mlp": 1.0177114, + "epoch": 0.6964076356530888, + "flos": 20944611141120.0, + "grad_norm": 2.3747811690297436, + "language_loss": 0.69830751, + "learning_rate": 8.91269151037425e-07, + "loss": 0.71974033, + "num_input_tokens_seen": 249984565, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.10931396, + "step": 11583, + "time_per_iteration": 2.587376832962036 + }, + { + "auxiliary_loss_clip": 0.01116986, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.0426861, + "balance_loss_mlp": 1.01908684, + "epoch": 0.6964677589057569, + "flos": 24327564494400.0, + "grad_norm": 1.6163627336909543, + "language_loss": 0.82362473, + "learning_rate": 8.909450334717301e-07, + "loss": 0.84510422, + "num_input_tokens_seen": 250004235, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11889648, + "step": 11584, + "time_per_iteration": 2.6264116764068604 + }, + { + "auxiliary_loss_clip": 0.01119362, + "auxiliary_loss_mlp": 0.01033273, + "balance_loss_clip": 1.04296339, + "balance_loss_mlp": 1.02113795, + "epoch": 0.6965278821584248, + "flos": 27801304371360.0, + "grad_norm": 2.3810783828308053, + "language_loss": 0.79952496, + "learning_rate": 8.906209579615107e-07, + "loss": 0.82105136, + "num_input_tokens_seen": 250017645, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12145996, + "step": 11585, + "time_per_iteration": 2.618903160095215 + }, + { + "auxiliary_loss_clip": 0.01111935, + "auxiliary_loss_mlp": 0.01031206, + "balance_loss_clip": 1.0406208, + "balance_loss_mlp": 1.02019083, + "epoch": 0.6965880054110928, + "flos": 24462022295520.0, + "grad_norm": 1.6670779649190164, + "language_loss": 0.77847362, + "learning_rate": 8.90296924519055e-07, + "loss": 0.79990494, + "num_input_tokens_seen": 250037640, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11022949, + "step": 11586, + "time_per_iteration": 2.614198923110962 + }, + { + "auxiliary_loss_clip": 0.01109257, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.03999925, + "balance_loss_mlp": 1.01762223, + "epoch": 0.6966481286637607, + "flos": 26732570417280.0, + "grad_norm": 1.7112181895655183, + "language_loss": 0.78407502, + "learning_rate": 8.899729331566519e-07, + "loss": 0.80545282, + "num_input_tokens_seen": 250056490, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10894775, + "step": 11587, + "time_per_iteration": 2.603569746017456 + }, + { + "auxiliary_loss_clip": 0.01111197, + "auxiliary_loss_mlp": 0.01033922, + "balance_loss_clip": 1.04021716, + "balance_loss_mlp": 1.02240086, + "epoch": 0.6967082519164287, + "flos": 19075207972320.0, + "grad_norm": 2.1682798312551723, + "language_loss": 0.72646582, + "learning_rate": 8.896489838865857e-07, + "loss": 0.74791706, + "num_input_tokens_seen": 250074285, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11517334, + "step": 11588, + "time_per_iteration": 4.019607305526733 + }, + { + "auxiliary_loss_clip": 0.01113108, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.03932905, + "balance_loss_mlp": 1.01721025, + "epoch": 0.6967683751690966, + "flos": 29315097659520.0, + "grad_norm": 3.149569066170148, + "language_loss": 0.7509011, + "learning_rate": 8.893250767211413e-07, + "loss": 0.77231205, + "num_input_tokens_seen": 250093350, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.10778809, + "step": 11589, + "time_per_iteration": 2.6225130558013916 + }, + { + "auxiliary_loss_clip": 0.01115685, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.04210949, + "balance_loss_mlp": 1.02186632, + "epoch": 0.6968284984217646, + "flos": 37857635664480.0, + "grad_norm": 1.8816049734583147, + "language_loss": 0.63470304, + "learning_rate": 8.890012116726012e-07, + "loss": 0.65618837, + "num_input_tokens_seen": 250114170, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10974121, + "step": 11590, + "time_per_iteration": 3.9407460689544678 + }, + { + "auxiliary_loss_clip": 0.01033673, + "auxiliary_loss_mlp": 0.01000409, + "balance_loss_clip": 1.01066494, + "balance_loss_mlp": 0.99931204, + "epoch": 0.6968886216744326, + "flos": 82514037549600.0, + "grad_norm": 0.7535944937390867, + "language_loss": 0.61251748, + "learning_rate": 8.88677388753248e-07, + "loss": 0.6328584, + "num_input_tokens_seen": 250178250, + "router_z_loss_clip": 0.23022461, + "router_z_loss_mlp": 0.01098633, + "step": 11591, + "time_per_iteration": 3.3878488540649414 + }, + { + "auxiliary_loss_clip": 0.01117586, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.0437777, + "balance_loss_mlp": 1.01877713, + "epoch": 0.6969487449271006, + "flos": 30339025852320.0, + "grad_norm": 3.753340625699957, + "language_loss": 0.69693083, + "learning_rate": 8.883536079753582e-07, + "loss": 0.71841645, + "num_input_tokens_seen": 250198420, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12207031, + "step": 11592, + "time_per_iteration": 2.6567015647888184 + }, + { + "auxiliary_loss_clip": 0.0111539, + "auxiliary_loss_mlp": 0.01027486, + "balance_loss_clip": 1.0424521, + "balance_loss_mlp": 1.01674485, + "epoch": 0.6970088681797685, + "flos": 35103178487520.0, + "grad_norm": 1.5951166019463607, + "language_loss": 0.62615418, + "learning_rate": 8.880298693512109e-07, + "loss": 0.64758289, + "num_input_tokens_seen": 250220650, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.10748291, + "step": 11593, + "time_per_iteration": 2.738513946533203 + }, + { + "auxiliary_loss_clip": 0.0110977, + "auxiliary_loss_mlp": 0.01025385, + "balance_loss_clip": 1.03966331, + "balance_loss_mlp": 1.01498389, + "epoch": 0.6970689914324365, + "flos": 33322414426560.0, + "grad_norm": 1.4021418127358625, + "language_loss": 0.5411182, + "learning_rate": 8.877061728930832e-07, + "loss": 0.56246972, + "num_input_tokens_seen": 250241750, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10406494, + "step": 11594, + "time_per_iteration": 2.6861448287963867 + }, + { + "auxiliary_loss_clip": 0.01113476, + "auxiliary_loss_mlp": 0.01027783, + "balance_loss_clip": 1.04030621, + "balance_loss_mlp": 1.01709545, + "epoch": 0.6971291146851044, + "flos": 23349900684960.0, + "grad_norm": 2.1440532100377023, + "language_loss": 0.76734626, + "learning_rate": 8.87382518613248e-07, + "loss": 0.78875887, + "num_input_tokens_seen": 250259445, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10693359, + "step": 11595, + "time_per_iteration": 2.628298282623291 + }, + { + "auxiliary_loss_clip": 0.01118517, + "auxiliary_loss_mlp": 0.0102912, + "balance_loss_clip": 1.0426172, + "balance_loss_mlp": 1.01691842, + "epoch": 0.6971892379377724, + "flos": 17827372524960.0, + "grad_norm": 2.4117305329979426, + "language_loss": 0.71804357, + "learning_rate": 8.870589065239793e-07, + "loss": 0.73951995, + "num_input_tokens_seen": 250275640, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12219238, + "step": 11596, + "time_per_iteration": 2.6412227153778076 + }, + { + "auxiliary_loss_clip": 0.01117129, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.04316592, + "balance_loss_mlp": 1.02095139, + "epoch": 0.6972493611904405, + "flos": 27219923441280.0, + "grad_norm": 1.6842924196625744, + "language_loss": 0.76229322, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78379023, + "num_input_tokens_seen": 250296435, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11627197, + "step": 11597, + "time_per_iteration": 2.6748976707458496 + }, + { + "auxiliary_loss_clip": 0.01111854, + "auxiliary_loss_mlp": 0.01030321, + "balance_loss_clip": 1.03929901, + "balance_loss_mlp": 1.01912689, + "epoch": 0.6973094844431084, + "flos": 21256144571520.0, + "grad_norm": 1.9287364763704589, + "language_loss": 0.74692851, + "learning_rate": 8.864118089662267e-07, + "loss": 0.76835024, + "num_input_tokens_seen": 250314035, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11187744, + "step": 11598, + "time_per_iteration": 2.663529396057129 + }, + { + "auxiliary_loss_clip": 0.0111778, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.04128981, + "balance_loss_mlp": 1.02082705, + "epoch": 0.6973696076957764, + "flos": 33233127042240.0, + "grad_norm": 1.8364424323701893, + "language_loss": 0.89707518, + "learning_rate": 8.860883235222791e-07, + "loss": 0.91858494, + "num_input_tokens_seen": 250332995, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.1237793, + "step": 11599, + "time_per_iteration": 2.6957411766052246 + }, + { + "auxiliary_loss_clip": 0.01122053, + "auxiliary_loss_mlp": 0.01040803, + "balance_loss_clip": 1.04369807, + "balance_loss_mlp": 1.0282805, + "epoch": 0.6974297309484443, + "flos": 26866582528320.0, + "grad_norm": 1.9544252156225297, + "language_loss": 0.69672203, + "learning_rate": 8.85764880317974e-07, + "loss": 0.71835053, + "num_input_tokens_seen": 250352120, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.12530518, + "step": 11600, + "time_per_iteration": 2.7324962615966797 + }, + { + "auxiliary_loss_clip": 0.01112588, + "auxiliary_loss_mlp": 0.01031283, + "balance_loss_clip": 1.03914738, + "balance_loss_mlp": 1.02004147, + "epoch": 0.6974898542011123, + "flos": 34613353909440.0, + "grad_norm": 1.6834956243247592, + "language_loss": 0.76491678, + "learning_rate": 8.854414793655771e-07, + "loss": 0.7863555, + "num_input_tokens_seen": 250371705, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11242676, + "step": 11601, + "time_per_iteration": 2.7157371044158936 + }, + { + "auxiliary_loss_clip": 0.01108723, + "auxiliary_loss_mlp": 0.01030753, + "balance_loss_clip": 1.03837454, + "balance_loss_mlp": 1.01995897, + "epoch": 0.6975499774537802, + "flos": 18585707532480.0, + "grad_norm": 1.8007721862183133, + "language_loss": 0.722929, + "learning_rate": 8.851181206773508e-07, + "loss": 0.74432379, + "num_input_tokens_seen": 250390485, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10797119, + "step": 11602, + "time_per_iteration": 2.6364097595214844 + }, + { + "auxiliary_loss_clip": 0.01112808, + "auxiliary_loss_mlp": 0.01035819, + "balance_loss_clip": 1.04021454, + "balance_loss_mlp": 1.02473247, + "epoch": 0.6976101007064482, + "flos": 27037215910080.0, + "grad_norm": 2.674551797848341, + "language_loss": 0.7630291, + "learning_rate": 8.847948042655567e-07, + "loss": 0.78451538, + "num_input_tokens_seen": 250407020, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11083984, + "step": 11603, + "time_per_iteration": 2.6518213748931885 + }, + { + "auxiliary_loss_clip": 0.0111286, + "auxiliary_loss_mlp": 0.01029299, + "balance_loss_clip": 1.03969717, + "balance_loss_mlp": 1.01778936, + "epoch": 0.6976702239591162, + "flos": 27178278027840.0, + "grad_norm": 1.6758580334032547, + "language_loss": 0.6210261, + "learning_rate": 8.844715301424557e-07, + "loss": 0.64244771, + "num_input_tokens_seen": 250425880, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11523438, + "step": 11604, + "time_per_iteration": 2.6446664333343506 + }, + { + "auxiliary_loss_clip": 0.01116229, + "auxiliary_loss_mlp": 0.01032032, + "balance_loss_clip": 1.04116488, + "balance_loss_mlp": 1.0195266, + "epoch": 0.6977303472117842, + "flos": 31540921054560.0, + "grad_norm": 2.2226910201327907, + "language_loss": 0.81525987, + "learning_rate": 8.841482983203057e-07, + "loss": 0.83674246, + "num_input_tokens_seen": 250442925, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12493896, + "step": 11605, + "time_per_iteration": 2.6880433559417725 + }, + { + "auxiliary_loss_clip": 0.01114073, + "auxiliary_loss_mlp": 0.01031017, + "balance_loss_clip": 1.04051995, + "balance_loss_mlp": 1.01982379, + "epoch": 0.6977904704644521, + "flos": 25574954251680.0, + "grad_norm": 1.7431307524549893, + "language_loss": 0.70931977, + "learning_rate": 8.838251088113638e-07, + "loss": 0.73077071, + "num_input_tokens_seen": 250461220, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11193848, + "step": 11606, + "time_per_iteration": 2.6203627586364746 + }, + { + "auxiliary_loss_clip": 0.01116523, + "auxiliary_loss_mlp": 0.01030224, + "balance_loss_clip": 1.0413686, + "balance_loss_mlp": 1.01897073, + "epoch": 0.6978505937171201, + "flos": 26911631393280.0, + "grad_norm": 2.8120964967345583, + "language_loss": 0.82511204, + "learning_rate": 8.835019616278856e-07, + "loss": 0.84657955, + "num_input_tokens_seen": 250480975, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11242676, + "step": 11607, + "time_per_iteration": 2.623278856277466 + }, + { + "auxiliary_loss_clip": 0.01120685, + "auxiliary_loss_mlp": 0.01032787, + "balance_loss_clip": 1.04285443, + "balance_loss_mlp": 1.02099693, + "epoch": 0.697910716969788, + "flos": 24457686946560.0, + "grad_norm": 2.570247899577739, + "language_loss": 0.78997743, + "learning_rate": 8.831788567821265e-07, + "loss": 0.81151211, + "num_input_tokens_seen": 250497980, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.11791992, + "step": 11608, + "time_per_iteration": 2.649773597717285 + }, + { + "auxiliary_loss_clip": 0.01116246, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.04131961, + "balance_loss_mlp": 1.01835227, + "epoch": 0.697970840222456, + "flos": 19386498299040.0, + "grad_norm": 2.3008078498476805, + "language_loss": 0.90048051, + "learning_rate": 8.828557942863357e-07, + "loss": 0.92194164, + "num_input_tokens_seen": 250511910, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11517334, + "step": 11609, + "time_per_iteration": 2.594604969024658 + }, + { + "auxiliary_loss_clip": 0.01115404, + "auxiliary_loss_mlp": 0.01028752, + "balance_loss_clip": 1.03960085, + "balance_loss_mlp": 1.01655078, + "epoch": 0.698030963475124, + "flos": 25887338544960.0, + "grad_norm": 1.888696025255902, + "language_loss": 0.64002794, + "learning_rate": 8.82532774152765e-07, + "loss": 0.66146946, + "num_input_tokens_seen": 250531090, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12194824, + "step": 11610, + "time_per_iteration": 2.6592023372650146 + }, + { + "auxiliary_loss_clip": 0.0111258, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.03994226, + "balance_loss_mlp": 1.01821852, + "epoch": 0.698091086727792, + "flos": 41195256531840.0, + "grad_norm": 1.7185426841399283, + "language_loss": 0.84821486, + "learning_rate": 8.822097963936643e-07, + "loss": 0.86962968, + "num_input_tokens_seen": 250551565, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10681152, + "step": 11611, + "time_per_iteration": 2.7343714237213135 + }, + { + "auxiliary_loss_clip": 0.01115743, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.04029965, + "balance_loss_mlp": 1.01895046, + "epoch": 0.69815120998046, + "flos": 19386052608960.0, + "grad_norm": 1.969222478539419, + "language_loss": 0.71014172, + "learning_rate": 8.818868610212793e-07, + "loss": 0.73160064, + "num_input_tokens_seen": 250569625, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11212158, + "step": 11612, + "time_per_iteration": 2.6221132278442383 + }, + { + "auxiliary_loss_clip": 0.01112806, + "auxiliary_loss_mlp": 0.01030577, + "balance_loss_clip": 1.04012299, + "balance_loss_mlp": 1.01922858, + "epoch": 0.6982113332331279, + "flos": 23119632217440.0, + "grad_norm": 1.6765326251958368, + "language_loss": 0.81134987, + "learning_rate": 8.815639680478573e-07, + "loss": 0.8327837, + "num_input_tokens_seen": 250586960, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11346436, + "step": 11613, + "time_per_iteration": 2.604675769805908 + }, + { + "auxiliary_loss_clip": 0.01113981, + "auxiliary_loss_mlp": 0.01032111, + "balance_loss_clip": 1.04131711, + "balance_loss_mlp": 1.02126265, + "epoch": 0.6982714564857959, + "flos": 29760481131840.0, + "grad_norm": 2.393231531345377, + "language_loss": 0.75395691, + "learning_rate": 8.812411174856411e-07, + "loss": 0.7754178, + "num_input_tokens_seen": 250605080, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.10845947, + "step": 11614, + "time_per_iteration": 2.6392571926116943 + }, + { + "auxiliary_loss_clip": 0.01115025, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.04145145, + "balance_loss_mlp": 1.0202539, + "epoch": 0.6983315797384638, + "flos": 24506179780320.0, + "grad_norm": 2.870020771167638, + "language_loss": 0.77601492, + "learning_rate": 8.809183093468746e-07, + "loss": 0.79748577, + "num_input_tokens_seen": 250623965, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11798096, + "step": 11615, + "time_per_iteration": 2.64693021774292 + }, + { + "auxiliary_loss_clip": 0.01110688, + "auxiliary_loss_mlp": 0.01027753, + "balance_loss_clip": 1.04026699, + "balance_loss_mlp": 1.01642871, + "epoch": 0.6983917029911318, + "flos": 16487737621920.0, + "grad_norm": 2.095700673203189, + "language_loss": 0.72888207, + "learning_rate": 8.80595543643797e-07, + "loss": 0.75026655, + "num_input_tokens_seen": 250640675, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11322021, + "step": 11616, + "time_per_iteration": 2.5783236026763916 + }, + { + "auxiliary_loss_clip": 0.01115259, + "auxiliary_loss_mlp": 0.01035211, + "balance_loss_clip": 1.04263711, + "balance_loss_mlp": 1.02391624, + "epoch": 0.6984518262437998, + "flos": 26866947183840.0, + "grad_norm": 1.6855335688468587, + "language_loss": 0.84300697, + "learning_rate": 8.802728203886487e-07, + "loss": 0.86451161, + "num_input_tokens_seen": 250660295, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11291504, + "step": 11617, + "time_per_iteration": 4.076253414154053 + }, + { + "auxiliary_loss_clip": 0.01117912, + "auxiliary_loss_mlp": 0.0103431, + "balance_loss_clip": 1.04281044, + "balance_loss_mlp": 1.02263951, + "epoch": 0.6985119494964678, + "flos": 22903787901600.0, + "grad_norm": 2.461852016118952, + "language_loss": 0.59614527, + "learning_rate": 8.799501395936682e-07, + "loss": 0.61766744, + "num_input_tokens_seen": 250678155, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11669922, + "step": 11618, + "time_per_iteration": 2.601287364959717 + }, + { + "auxiliary_loss_clip": 0.01114923, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.0418123, + "balance_loss_mlp": 1.02211976, + "epoch": 0.6985720727491357, + "flos": 27311682379680.0, + "grad_norm": 1.7280767786488291, + "language_loss": 0.83099699, + "learning_rate": 8.796275012710903e-07, + "loss": 0.85248083, + "num_input_tokens_seen": 250697230, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11352539, + "step": 11619, + "time_per_iteration": 2.6274123191833496 + }, + { + "auxiliary_loss_clip": 0.01109355, + "auxiliary_loss_mlp": 0.01027778, + "balance_loss_clip": 1.03845525, + "balance_loss_mlp": 1.01736486, + "epoch": 0.6986321960018037, + "flos": 48282177712320.0, + "grad_norm": 1.8782316519030475, + "language_loss": 0.67421752, + "learning_rate": 8.793049054331494e-07, + "loss": 0.69558889, + "num_input_tokens_seen": 250719865, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10406494, + "step": 11620, + "time_per_iteration": 4.203247785568237 + }, + { + "auxiliary_loss_clip": 0.01115658, + "auxiliary_loss_mlp": 0.01030112, + "balance_loss_clip": 1.04031944, + "balance_loss_mlp": 1.01822066, + "epoch": 0.6986923192544716, + "flos": 21924057710880.0, + "grad_norm": 2.453894399082652, + "language_loss": 0.72821915, + "learning_rate": 8.789823520920794e-07, + "loss": 0.74967682, + "num_input_tokens_seen": 250736565, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11889648, + "step": 11621, + "time_per_iteration": 2.5969460010528564 + }, + { + "auxiliary_loss_clip": 0.01118045, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.0415647, + "balance_loss_mlp": 1.02217031, + "epoch": 0.6987524425071396, + "flos": 31230522108000.0, + "grad_norm": 1.7438887667163836, + "language_loss": 0.68225563, + "learning_rate": 8.7865984126011e-07, + "loss": 0.70377851, + "num_input_tokens_seen": 250757235, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12084961, + "step": 11622, + "time_per_iteration": 2.6788289546966553 + }, + { + "auxiliary_loss_clip": 0.01109551, + "auxiliary_loss_mlp": 0.01025101, + "balance_loss_clip": 1.03873658, + "balance_loss_mlp": 1.01404393, + "epoch": 0.6988125657598077, + "flos": 21389994613440.0, + "grad_norm": 2.1683378423789064, + "language_loss": 0.62889379, + "learning_rate": 8.783373729494721e-07, + "loss": 0.6502403, + "num_input_tokens_seen": 250775585, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.1105957, + "step": 11623, + "time_per_iteration": 2.6428399085998535 + }, + { + "auxiliary_loss_clip": 0.01116597, + "auxiliary_loss_mlp": 0.0102724, + "balance_loss_clip": 1.03963518, + "balance_loss_mlp": 1.01525402, + "epoch": 0.6988726890124756, + "flos": 47791502271360.0, + "grad_norm": 1.722280594681301, + "language_loss": 0.61097777, + "learning_rate": 8.780149471723932e-07, + "loss": 0.63241613, + "num_input_tokens_seen": 250795725, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.11987305, + "step": 11624, + "time_per_iteration": 2.7783446311950684 + }, + { + "auxiliary_loss_clip": 0.01115351, + "auxiliary_loss_mlp": 0.01037356, + "balance_loss_clip": 1.03883195, + "balance_loss_mlp": 1.02542353, + "epoch": 0.6989328122651436, + "flos": 24640070339520.0, + "grad_norm": 1.7372830860106254, + "language_loss": 0.78452945, + "learning_rate": 8.776925639411017e-07, + "loss": 0.8060565, + "num_input_tokens_seen": 250814555, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11938477, + "step": 11625, + "time_per_iteration": 2.613520622253418 + }, + { + "auxiliary_loss_clip": 0.0111139, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.0407294, + "balance_loss_mlp": 1.02111423, + "epoch": 0.6989929355178115, + "flos": 26643080446560.0, + "grad_norm": 1.8798911043258448, + "language_loss": 0.66274107, + "learning_rate": 8.773702232678188e-07, + "loss": 0.68417513, + "num_input_tokens_seen": 250833105, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10894775, + "step": 11626, + "time_per_iteration": 2.658764123916626 + }, + { + "auxiliary_loss_clip": 0.01115192, + "auxiliary_loss_mlp": 0.01033394, + "balance_loss_clip": 1.04089057, + "balance_loss_mlp": 1.02132452, + "epoch": 0.6990530587704795, + "flos": 32119668361440.0, + "grad_norm": 1.7978463139299996, + "language_loss": 0.7011016, + "learning_rate": 8.770479251647697e-07, + "loss": 0.72258747, + "num_input_tokens_seen": 250852570, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.1206665, + "step": 11627, + "time_per_iteration": 4.14333176612854 + }, + { + "auxiliary_loss_clip": 0.01112925, + "auxiliary_loss_mlp": 0.01028124, + "balance_loss_clip": 1.04219961, + "balance_loss_mlp": 1.01765776, + "epoch": 0.6991131820231474, + "flos": 24195254109120.0, + "grad_norm": 2.2343009736055386, + "language_loss": 0.62671852, + "learning_rate": 8.767256696441768e-07, + "loss": 0.64812905, + "num_input_tokens_seen": 250870500, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10461426, + "step": 11628, + "time_per_iteration": 2.625478982925415 + }, + { + "auxiliary_loss_clip": 0.01114563, + "auxiliary_loss_mlp": 0.01032511, + "balance_loss_clip": 1.03948665, + "balance_loss_mlp": 1.02073944, + "epoch": 0.6991733052758154, + "flos": 41469358345920.0, + "grad_norm": 2.499376834383407, + "language_loss": 0.68400139, + "learning_rate": 8.764034567182581e-07, + "loss": 0.70547211, + "num_input_tokens_seen": 250892745, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11773682, + "step": 11629, + "time_per_iteration": 4.023114442825317 + }, + { + "auxiliary_loss_clip": 0.01116724, + "auxiliary_loss_mlp": 0.01036636, + "balance_loss_clip": 1.0434587, + "balance_loss_mlp": 1.02393985, + "epoch": 0.6992334285284834, + "flos": 19075167455040.0, + "grad_norm": 1.9378026609213734, + "language_loss": 0.72395819, + "learning_rate": 8.760812863992337e-07, + "loss": 0.74549186, + "num_input_tokens_seen": 250910225, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.1270752, + "step": 11630, + "time_per_iteration": 2.6130146980285645 + }, + { + "auxiliary_loss_clip": 0.0111509, + "auxiliary_loss_mlp": 0.01040928, + "balance_loss_clip": 1.04267812, + "balance_loss_mlp": 1.02991855, + "epoch": 0.6992935517811514, + "flos": 26510486440320.0, + "grad_norm": 1.6077984347454277, + "language_loss": 0.74105799, + "learning_rate": 8.757591586993196e-07, + "loss": 0.76261818, + "num_input_tokens_seen": 250929715, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11010742, + "step": 11631, + "time_per_iteration": 2.6221225261688232 + }, + { + "auxiliary_loss_clip": 0.01118066, + "auxiliary_loss_mlp": 0.01030104, + "balance_loss_clip": 1.04272079, + "balance_loss_mlp": 1.01780772, + "epoch": 0.6993536750338193, + "flos": 24545110536000.0, + "grad_norm": 2.2348477028158915, + "language_loss": 0.89308953, + "learning_rate": 8.7543707363073e-07, + "loss": 0.91457129, + "num_input_tokens_seen": 250944230, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.1229248, + "step": 11632, + "time_per_iteration": 2.6070079803466797 + }, + { + "auxiliary_loss_clip": 0.01117555, + "auxiliary_loss_mlp": 0.01033164, + "balance_loss_clip": 1.04239237, + "balance_loss_mlp": 1.02183342, + "epoch": 0.6994137982864873, + "flos": 26858073899520.0, + "grad_norm": 2.0923672337722814, + "language_loss": 0.80127048, + "learning_rate": 8.751150312056792e-07, + "loss": 0.82277775, + "num_input_tokens_seen": 250961865, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11322021, + "step": 11633, + "time_per_iteration": 2.6522507667541504 + }, + { + "auxiliary_loss_clip": 0.01119866, + "auxiliary_loss_mlp": 0.01030909, + "balance_loss_clip": 1.04242682, + "balance_loss_mlp": 1.01826644, + "epoch": 0.6994739215391552, + "flos": 31138844204160.0, + "grad_norm": 3.899346590517035, + "language_loss": 0.66629899, + "learning_rate": 8.747930314363794e-07, + "loss": 0.68780673, + "num_input_tokens_seen": 250982025, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12634277, + "step": 11634, + "time_per_iteration": 2.728402614593506 + }, + { + "auxiliary_loss_clip": 0.01033024, + "auxiliary_loss_mlp": 0.01000866, + "balance_loss_clip": 1.00979698, + "balance_loss_mlp": 0.9997493, + "epoch": 0.6995340447918232, + "flos": 72147631448160.0, + "grad_norm": 0.6962929843344302, + "language_loss": 0.53184497, + "learning_rate": 8.744710743350412e-07, + "loss": 0.55218387, + "num_input_tokens_seen": 251046900, + "router_z_loss_clip": 0.23217773, + "router_z_loss_mlp": 0.01117706, + "step": 11635, + "time_per_iteration": 3.337064743041992 + }, + { + "auxiliary_loss_clip": 0.01113865, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.04068923, + "balance_loss_mlp": 1.01976836, + "epoch": 0.6995941680444913, + "flos": 21924179262720.0, + "grad_norm": 2.8599778505381135, + "language_loss": 0.82065237, + "learning_rate": 8.741491599138726e-07, + "loss": 0.84210694, + "num_input_tokens_seen": 251065050, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11810303, + "step": 11636, + "time_per_iteration": 2.589277505874634 + }, + { + "auxiliary_loss_clip": 0.0111637, + "auxiliary_loss_mlp": 0.01025277, + "balance_loss_clip": 1.04140949, + "balance_loss_mlp": 1.01429784, + "epoch": 0.6996542912971592, + "flos": 26820358662240.0, + "grad_norm": 2.05155688634537, + "language_loss": 0.82719308, + "learning_rate": 8.738272881850801e-07, + "loss": 0.84860957, + "num_input_tokens_seen": 251083355, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.10980225, + "step": 11637, + "time_per_iteration": 2.651906967163086 + }, + { + "auxiliary_loss_clip": 0.01114914, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.04259682, + "balance_loss_mlp": 1.02079177, + "epoch": 0.6997144145498272, + "flos": 14259361638240.0, + "grad_norm": 2.091943404114221, + "language_loss": 0.6781497, + "learning_rate": 8.735054591608704e-07, + "loss": 0.69962156, + "num_input_tokens_seen": 251096420, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11474609, + "step": 11638, + "time_per_iteration": 2.673550844192505 + }, + { + "auxiliary_loss_clip": 0.01119166, + "auxiliary_loss_mlp": 0.01029658, + "balance_loss_clip": 1.04129386, + "balance_loss_mlp": 1.01716483, + "epoch": 0.6997745378024951, + "flos": 36127349784000.0, + "grad_norm": 2.112496368758864, + "language_loss": 0.78420854, + "learning_rate": 8.731836728534459e-07, + "loss": 0.80569679, + "num_input_tokens_seen": 251115410, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12487793, + "step": 11639, + "time_per_iteration": 2.7123239040374756 + }, + { + "auxiliary_loss_clip": 0.01118417, + "auxiliary_loss_mlp": 0.01038131, + "balance_loss_clip": 1.04299521, + "balance_loss_mlp": 1.0260191, + "epoch": 0.6998346610551631, + "flos": 25485909971040.0, + "grad_norm": 2.337948897232547, + "language_loss": 0.82349956, + "learning_rate": 8.728619292750093e-07, + "loss": 0.845065, + "num_input_tokens_seen": 251133530, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12115479, + "step": 11640, + "time_per_iteration": 2.630797863006592 + }, + { + "auxiliary_loss_clip": 0.01112613, + "auxiliary_loss_mlp": 0.01026139, + "balance_loss_clip": 1.03904843, + "balance_loss_mlp": 1.01475477, + "epoch": 0.699894784307831, + "flos": 33144812072640.0, + "grad_norm": 1.8772359718159122, + "language_loss": 0.75307441, + "learning_rate": 8.725402284377619e-07, + "loss": 0.77446193, + "num_input_tokens_seen": 251153985, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11383057, + "step": 11641, + "time_per_iteration": 2.6716017723083496 + }, + { + "auxiliary_loss_clip": 0.01117052, + "auxiliary_loss_mlp": 0.0102452, + "balance_loss_clip": 1.04268813, + "balance_loss_mlp": 1.0122478, + "epoch": 0.699954907560499, + "flos": 25531039870560.0, + "grad_norm": 2.1933391179444315, + "language_loss": 0.77689338, + "learning_rate": 8.722185703539022e-07, + "loss": 0.79830909, + "num_input_tokens_seen": 251173225, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12280273, + "step": 11642, + "time_per_iteration": 2.603062152862549 + }, + { + "auxiliary_loss_clip": 0.01118986, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.04210794, + "balance_loss_mlp": 1.02013183, + "epoch": 0.700015030813167, + "flos": 34968923272800.0, + "grad_norm": 2.3011642102579772, + "language_loss": 0.74896908, + "learning_rate": 8.718969550356266e-07, + "loss": 0.77049458, + "num_input_tokens_seen": 251192485, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.13439941, + "step": 11643, + "time_per_iteration": 2.720353841781616 + }, + { + "auxiliary_loss_clip": 0.01115957, + "auxiliary_loss_mlp": 0.01027726, + "balance_loss_clip": 1.04060149, + "balance_loss_mlp": 1.01582265, + "epoch": 0.700075154065835, + "flos": 35636998481280.0, + "grad_norm": 1.6548809631372976, + "language_loss": 0.60090822, + "learning_rate": 8.715753824951315e-07, + "loss": 0.62234509, + "num_input_tokens_seen": 251214965, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11895752, + "step": 11644, + "time_per_iteration": 2.67402982711792 + }, + { + "auxiliary_loss_clip": 0.01111612, + "auxiliary_loss_mlp": 0.01030869, + "balance_loss_clip": 1.03975642, + "balance_loss_mlp": 1.01972306, + "epoch": 0.7001352773185029, + "flos": 28202003634240.0, + "grad_norm": 1.7451690483677909, + "language_loss": 0.81927335, + "learning_rate": 8.712538527446119e-07, + "loss": 0.84069818, + "num_input_tokens_seen": 251234500, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11138916, + "step": 11645, + "time_per_iteration": 2.683897018432617 + }, + { + "auxiliary_loss_clip": 0.01114764, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.04161251, + "balance_loss_mlp": 1.01758718, + "epoch": 0.7001954005711709, + "flos": 26018392894560.0, + "grad_norm": 1.6809618143228087, + "language_loss": 0.68319011, + "learning_rate": 8.709323657962584e-07, + "loss": 0.70463014, + "num_input_tokens_seen": 251254360, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11663818, + "step": 11646, + "time_per_iteration": 2.6415724754333496 + }, + { + "auxiliary_loss_clip": 0.0111432, + "auxiliary_loss_mlp": 0.01035516, + "balance_loss_clip": 1.04191542, + "balance_loss_mlp": 1.0241375, + "epoch": 0.7002555238238388, + "flos": 29938893831360.0, + "grad_norm": 1.7740116301732243, + "language_loss": 0.71081781, + "learning_rate": 8.706109216622635e-07, + "loss": 0.7323162, + "num_input_tokens_seen": 251274790, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11376953, + "step": 11647, + "time_per_iteration": 2.6527092456817627 + }, + { + "auxiliary_loss_clip": 0.01118294, + "auxiliary_loss_mlp": 0.01030452, + "balance_loss_clip": 1.04357052, + "balance_loss_mlp": 1.01837039, + "epoch": 0.7003156470765068, + "flos": 47658259988640.0, + "grad_norm": 1.925485025204529, + "language_loss": 0.71380967, + "learning_rate": 8.702895203548155e-07, + "loss": 0.73529708, + "num_input_tokens_seen": 251296275, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12084961, + "step": 11648, + "time_per_iteration": 2.7664754390716553 + }, + { + "auxiliary_loss_clip": 0.01111815, + "auxiliary_loss_mlp": 0.01028271, + "balance_loss_clip": 1.03863597, + "balance_loss_mlp": 1.01715434, + "epoch": 0.7003757703291749, + "flos": 35147417006880.0, + "grad_norm": 1.5600367966789295, + "language_loss": 0.7774269, + "learning_rate": 8.699681618861014e-07, + "loss": 0.79882777, + "num_input_tokens_seen": 251317375, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11114502, + "step": 11649, + "time_per_iteration": 2.6993844509124756 + }, + { + "auxiliary_loss_clip": 0.01114232, + "auxiliary_loss_mlp": 0.01031891, + "balance_loss_clip": 1.04088557, + "balance_loss_mlp": 1.02033377, + "epoch": 0.7004358935818428, + "flos": 19467236537280.0, + "grad_norm": 1.9737989135121774, + "language_loss": 0.78311193, + "learning_rate": 8.69646846268308e-07, + "loss": 0.80457318, + "num_input_tokens_seen": 251333570, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11547852, + "step": 11650, + "time_per_iteration": 2.612800121307373 + }, + { + "auxiliary_loss_clip": 0.0111269, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.03886032, + "balance_loss_mlp": 1.01918912, + "epoch": 0.7004960168345108, + "flos": 24906757491360.0, + "grad_norm": 2.0290927850369482, + "language_loss": 0.78109086, + "learning_rate": 8.693255735136194e-07, + "loss": 0.80252534, + "num_input_tokens_seen": 251351070, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11572266, + "step": 11651, + "time_per_iteration": 2.624992847442627 + }, + { + "auxiliary_loss_clip": 0.01117677, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.04242027, + "balance_loss_mlp": 1.01855326, + "epoch": 0.7005561400871787, + "flos": 21167262360000.0, + "grad_norm": 1.7026727783696713, + "language_loss": 0.69653797, + "learning_rate": 8.690043436342198e-07, + "loss": 0.71801043, + "num_input_tokens_seen": 251370005, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11022949, + "step": 11652, + "time_per_iteration": 2.7521636486053467 + }, + { + "auxiliary_loss_clip": 0.01114963, + "auxiliary_loss_mlp": 0.01031035, + "balance_loss_clip": 1.04148376, + "balance_loss_mlp": 1.01935267, + "epoch": 0.7006162633398467, + "flos": 30873980329920.0, + "grad_norm": 1.4678505466785694, + "language_loss": 0.74682176, + "learning_rate": 8.686831566422874e-07, + "loss": 0.76828176, + "num_input_tokens_seen": 251391210, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11676025, + "step": 11653, + "time_per_iteration": 2.6574459075927734 + }, + { + "auxiliary_loss_clip": 0.01114683, + "auxiliary_loss_mlp": 0.01032949, + "balance_loss_clip": 1.03972435, + "balance_loss_mlp": 1.02024746, + "epoch": 0.7006763865925146, + "flos": 25220235751200.0, + "grad_norm": 2.159365256046675, + "language_loss": 0.70656407, + "learning_rate": 8.68362012550003e-07, + "loss": 0.7280404, + "num_input_tokens_seen": 251411505, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12701416, + "step": 11654, + "time_per_iteration": 2.719242572784424 + }, + { + "auxiliary_loss_clip": 0.01116992, + "auxiliary_loss_mlp": 0.01029032, + "balance_loss_clip": 1.04130304, + "balance_loss_mlp": 1.0160327, + "epoch": 0.7007365098451827, + "flos": 24460766259840.0, + "grad_norm": 2.847029947187659, + "language_loss": 0.73304701, + "learning_rate": 8.680409113695453e-07, + "loss": 0.7545073, + "num_input_tokens_seen": 251428975, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12988281, + "step": 11655, + "time_per_iteration": 2.6376395225524902 + }, + { + "auxiliary_loss_clip": 0.01125164, + "auxiliary_loss_mlp": 0.01038219, + "balance_loss_clip": 1.04501629, + "balance_loss_mlp": 1.02556515, + "epoch": 0.7007966330978506, + "flos": 25352586653760.0, + "grad_norm": 2.268760614866713, + "language_loss": 0.70301878, + "learning_rate": 8.677198531130889e-07, + "loss": 0.72465265, + "num_input_tokens_seen": 251446940, + "router_z_loss_clip": 0.80078125, + "router_z_loss_mlp": 0.12664795, + "step": 11656, + "time_per_iteration": 4.11397910118103 + }, + { + "auxiliary_loss_clip": 0.01112573, + "auxiliary_loss_mlp": 0.01031657, + "balance_loss_clip": 1.03984809, + "balance_loss_mlp": 1.02071953, + "epoch": 0.7008567563505186, + "flos": 36165713297760.0, + "grad_norm": 1.5531761871494112, + "language_loss": 0.77833223, + "learning_rate": 8.673988377928092e-07, + "loss": 0.79977453, + "num_input_tokens_seen": 251466205, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.10931396, + "step": 11657, + "time_per_iteration": 2.727994680404663 + }, + { + "auxiliary_loss_clip": 0.01118829, + "auxiliary_loss_mlp": 0.01034728, + "balance_loss_clip": 1.0418694, + "balance_loss_mlp": 1.02200842, + "epoch": 0.7009168796031865, + "flos": 20855445308640.0, + "grad_norm": 1.9050625467721318, + "language_loss": 0.77881062, + "learning_rate": 8.670778654208797e-07, + "loss": 0.80034626, + "num_input_tokens_seen": 251484820, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.1272583, + "step": 11658, + "time_per_iteration": 2.6204888820648193 + }, + { + "auxiliary_loss_clip": 0.01111127, + "auxiliary_loss_mlp": 0.01026709, + "balance_loss_clip": 1.03906107, + "balance_loss_mlp": 1.01531291, + "epoch": 0.7009770028558545, + "flos": 24951846873600.0, + "grad_norm": 1.867080299387141, + "language_loss": 0.82658708, + "learning_rate": 8.667569360094713e-07, + "loss": 0.84796542, + "num_input_tokens_seen": 251502670, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11395264, + "step": 11659, + "time_per_iteration": 3.903287649154663 + }, + { + "auxiliary_loss_clip": 0.011136, + "auxiliary_loss_mlp": 0.01026702, + "balance_loss_clip": 1.04119527, + "balance_loss_mlp": 1.015329, + "epoch": 0.7010371261085224, + "flos": 23482818829440.0, + "grad_norm": 1.9853014582727155, + "language_loss": 0.68973386, + "learning_rate": 8.664360495707526e-07, + "loss": 0.71113688, + "num_input_tokens_seen": 251521630, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.1137085, + "step": 11660, + "time_per_iteration": 2.5915496349334717 + }, + { + "auxiliary_loss_clip": 0.01115525, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.03924036, + "balance_loss_mlp": 1.02074134, + "epoch": 0.7010972493611904, + "flos": 27000189466560.0, + "grad_norm": 1.8205904246735511, + "language_loss": 0.81135309, + "learning_rate": 8.661152061168924e-07, + "loss": 0.83284038, + "num_input_tokens_seen": 251540105, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12457275, + "step": 11661, + "time_per_iteration": 2.6609418392181396 + }, + { + "auxiliary_loss_clip": 0.01112626, + "auxiliary_loss_mlp": 0.01033671, + "balance_loss_clip": 1.03834784, + "balance_loss_mlp": 1.02230406, + "epoch": 0.7011573726138585, + "flos": 38304639828000.0, + "grad_norm": 1.7727271236164612, + "language_loss": 0.79021198, + "learning_rate": 8.657944056600579e-07, + "loss": 0.81167495, + "num_input_tokens_seen": 251560530, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11358643, + "step": 11662, + "time_per_iteration": 2.6705822944641113 + }, + { + "auxiliary_loss_clip": 0.0111622, + "auxiliary_loss_mlp": 0.01025084, + "balance_loss_clip": 1.040573, + "balance_loss_mlp": 1.01316929, + "epoch": 0.7012174958665264, + "flos": 22147154619840.0, + "grad_norm": 2.193683264732396, + "language_loss": 0.83459973, + "learning_rate": 8.654736482124134e-07, + "loss": 0.85601282, + "num_input_tokens_seen": 251577930, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11914062, + "step": 11663, + "time_per_iteration": 2.5943877696990967 + }, + { + "auxiliary_loss_clip": 0.01033523, + "auxiliary_loss_mlp": 0.01001653, + "balance_loss_clip": 1.010077, + "balance_loss_mlp": 1.00054765, + "epoch": 0.7012776191191944, + "flos": 74006864779680.0, + "grad_norm": 0.827435285639522, + "language_loss": 0.5372045, + "learning_rate": 8.651529337861209e-07, + "loss": 0.55755633, + "num_input_tokens_seen": 251638820, + "router_z_loss_clip": 0.23449707, + "router_z_loss_mlp": 0.01106262, + "step": 11664, + "time_per_iteration": 3.2152979373931885 + }, + { + "auxiliary_loss_clip": 0.01116577, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.04066086, + "balance_loss_mlp": 1.01902819, + "epoch": 0.7013377423718623, + "flos": 33188280763680.0, + "grad_norm": 5.99389750547643, + "language_loss": 0.78400397, + "learning_rate": 8.64832262393344e-07, + "loss": 0.80548197, + "num_input_tokens_seen": 251658070, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12194824, + "step": 11665, + "time_per_iteration": 2.6509287357330322 + }, + { + "auxiliary_loss_clip": 0.01114867, + "auxiliary_loss_mlp": 0.01027954, + "balance_loss_clip": 1.04055953, + "balance_loss_mlp": 1.01671302, + "epoch": 0.7013978656245303, + "flos": 20186276133600.0, + "grad_norm": 2.3130214369553155, + "language_loss": 0.76740229, + "learning_rate": 8.645116340462404e-07, + "loss": 0.78883052, + "num_input_tokens_seen": 251671575, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11230469, + "step": 11666, + "time_per_iteration": 2.6145012378692627 + }, + { + "auxiliary_loss_clip": 0.01114966, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.0418551, + "balance_loss_mlp": 1.01721323, + "epoch": 0.7014579888771982, + "flos": 28240731803520.0, + "grad_norm": 2.1595151704827056, + "language_loss": 0.8140409, + "learning_rate": 8.641910487569695e-07, + "loss": 0.83547795, + "num_input_tokens_seen": 251689350, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11529541, + "step": 11667, + "time_per_iteration": 4.181663274765015 + }, + { + "auxiliary_loss_clip": 0.01114331, + "auxiliary_loss_mlp": 0.0103593, + "balance_loss_clip": 1.04055548, + "balance_loss_mlp": 1.02415776, + "epoch": 0.7015181121298663, + "flos": 30605591452320.0, + "grad_norm": 2.314282706682072, + "language_loss": 0.65051818, + "learning_rate": 8.638705065376879e-07, + "loss": 0.67202079, + "num_input_tokens_seen": 251704635, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11773682, + "step": 11668, + "time_per_iteration": 3.9639530181884766 + }, + { + "auxiliary_loss_clip": 0.01115411, + "auxiliary_loss_mlp": 0.01027542, + "balance_loss_clip": 1.0399282, + "balance_loss_mlp": 1.01570415, + "epoch": 0.7015782353825342, + "flos": 28465246817280.0, + "grad_norm": 4.732957277925665, + "language_loss": 0.7672767, + "learning_rate": 8.635500074005519e-07, + "loss": 0.7887063, + "num_input_tokens_seen": 251723035, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11846924, + "step": 11669, + "time_per_iteration": 2.6874735355377197 + }, + { + "auxiliary_loss_clip": 0.01033476, + "auxiliary_loss_mlp": 0.01002697, + "balance_loss_clip": 1.01016736, + "balance_loss_mlp": 1.0015974, + "epoch": 0.7016383586352022, + "flos": 85899178836000.0, + "grad_norm": 0.7003744810291169, + "language_loss": 0.54474509, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56510687, + "num_input_tokens_seen": 251791630, + "router_z_loss_clip": 0.23303223, + "router_z_loss_mlp": 0.01100159, + "step": 11670, + "time_per_iteration": 3.395071029663086 + }, + { + "auxiliary_loss_clip": 0.01115415, + "auxiliary_loss_mlp": 0.01035365, + "balance_loss_clip": 1.0413841, + "balance_loss_mlp": 1.02307498, + "epoch": 0.7016984818878701, + "flos": 24150488865120.0, + "grad_norm": 1.6395636196814287, + "language_loss": 0.81630826, + "learning_rate": 8.629091384213218e-07, + "loss": 0.83781606, + "num_input_tokens_seen": 251809840, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12298584, + "step": 11671, + "time_per_iteration": 2.6596224308013916 + }, + { + "auxiliary_loss_clip": 0.01116532, + "auxiliary_loss_mlp": 0.01031358, + "balance_loss_clip": 1.04218197, + "balance_loss_mlp": 1.01944911, + "epoch": 0.7017586051405381, + "flos": 15736209517440.0, + "grad_norm": 2.0991236135179467, + "language_loss": 0.7482723, + "learning_rate": 8.625887686035313e-07, + "loss": 0.76975125, + "num_input_tokens_seen": 251827550, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11907959, + "step": 11672, + "time_per_iteration": 2.6698949337005615 + }, + { + "auxiliary_loss_clip": 0.011134, + "auxiliary_loss_mlp": 0.01027763, + "balance_loss_clip": 1.03930879, + "balance_loss_mlp": 1.0158422, + "epoch": 0.701818728393206, + "flos": 22369562735040.0, + "grad_norm": 12.173538244763424, + "language_loss": 0.8680647, + "learning_rate": 8.622684419164883e-07, + "loss": 0.8894763, + "num_input_tokens_seen": 251844880, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11932373, + "step": 11673, + "time_per_iteration": 2.5949833393096924 + }, + { + "auxiliary_loss_clip": 0.01110539, + "auxiliary_loss_mlp": 0.01026949, + "balance_loss_clip": 1.03896463, + "balance_loss_mlp": 1.01516485, + "epoch": 0.701878851645874, + "flos": 21211987086720.0, + "grad_norm": 1.8415662021785413, + "language_loss": 0.7337485, + "learning_rate": 8.619481583723399e-07, + "loss": 0.75512338, + "num_input_tokens_seen": 251861025, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11791992, + "step": 11674, + "time_per_iteration": 2.6462104320526123 + }, + { + "auxiliary_loss_clip": 0.01113125, + "auxiliary_loss_mlp": 0.01030886, + "balance_loss_clip": 1.0424515, + "balance_loss_mlp": 1.01994252, + "epoch": 0.701938974898542, + "flos": 29181936411360.0, + "grad_norm": 1.690580029938793, + "language_loss": 0.72065628, + "learning_rate": 8.616279179832329e-07, + "loss": 0.74209642, + "num_input_tokens_seen": 251880175, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.109375, + "step": 11675, + "time_per_iteration": 2.698366403579712 + }, + { + "auxiliary_loss_clip": 0.01115054, + "auxiliary_loss_mlp": 0.01026475, + "balance_loss_clip": 1.04031563, + "balance_loss_mlp": 1.01484597, + "epoch": 0.70199909815121, + "flos": 26595397958400.0, + "grad_norm": 2.6631619283971473, + "language_loss": 0.51247096, + "learning_rate": 8.613077207613078e-07, + "loss": 0.53388625, + "num_input_tokens_seen": 251899005, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11627197, + "step": 11676, + "time_per_iteration": 2.634352207183838 + }, + { + "auxiliary_loss_clip": 0.0103337, + "auxiliary_loss_mlp": 0.0100212, + "balance_loss_clip": 1.0100565, + "balance_loss_mlp": 1.00099921, + "epoch": 0.702059221403878, + "flos": 87506959512960.0, + "grad_norm": 0.7355689452129813, + "language_loss": 0.59196281, + "learning_rate": 8.609875667187079e-07, + "loss": 0.6123178, + "num_input_tokens_seen": 251966790, + "router_z_loss_clip": 0.2331543, + "router_z_loss_mlp": 0.01121521, + "step": 11677, + "time_per_iteration": 3.315483570098877 + }, + { + "auxiliary_loss_clip": 0.01116441, + "auxiliary_loss_mlp": 0.01030106, + "balance_loss_clip": 1.040573, + "balance_loss_mlp": 1.01810718, + "epoch": 0.7021193446565459, + "flos": 34301698927200.0, + "grad_norm": 2.4337761495072745, + "language_loss": 0.62618041, + "learning_rate": 8.606674558675737e-07, + "loss": 0.64764589, + "num_input_tokens_seen": 251989315, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12005615, + "step": 11678, + "time_per_iteration": 2.6927499771118164 + }, + { + "auxiliary_loss_clip": 0.01114148, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.04045713, + "balance_loss_mlp": 1.01977837, + "epoch": 0.7021794679092139, + "flos": 27973639478880.0, + "grad_norm": 1.6448083152733783, + "language_loss": 0.79172373, + "learning_rate": 8.603473882200444e-07, + "loss": 0.81317639, + "num_input_tokens_seen": 252006620, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11334229, + "step": 11679, + "time_per_iteration": 2.6158225536346436 + }, + { + "auxiliary_loss_clip": 0.01113963, + "auxiliary_loss_mlp": 0.01035036, + "balance_loss_clip": 1.04275155, + "balance_loss_mlp": 1.02473009, + "epoch": 0.7022395911618818, + "flos": 22058353442880.0, + "grad_norm": 2.713033620977271, + "language_loss": 0.70570064, + "learning_rate": 8.600273637882567e-07, + "loss": 0.72719061, + "num_input_tokens_seen": 252024570, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10308838, + "step": 11680, + "time_per_iteration": 2.6156129837036133 + }, + { + "auxiliary_loss_clip": 0.01118355, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.04211307, + "balance_loss_mlp": 1.01909268, + "epoch": 0.7022997144145499, + "flos": 19564708412160.0, + "grad_norm": 1.8587558181253154, + "language_loss": 0.74912959, + "learning_rate": 8.597073825843446e-07, + "loss": 0.77062523, + "num_input_tokens_seen": 252042775, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12097168, + "step": 11681, + "time_per_iteration": 2.6453754901885986 + }, + { + "auxiliary_loss_clip": 0.01113175, + "auxiliary_loss_mlp": 0.01031731, + "balance_loss_clip": 1.03956461, + "balance_loss_mlp": 1.02105546, + "epoch": 0.7023598376672178, + "flos": 32297594853600.0, + "grad_norm": 1.628674686312999, + "language_loss": 0.76917422, + "learning_rate": 8.593874446204434e-07, + "loss": 0.79062331, + "num_input_tokens_seen": 252063690, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10675049, + "step": 11682, + "time_per_iteration": 2.680609703063965 + }, + { + "auxiliary_loss_clip": 0.01117281, + "auxiliary_loss_mlp": 0.01034232, + "balance_loss_clip": 1.04096699, + "balance_loss_mlp": 1.02267504, + "epoch": 0.7024199609198858, + "flos": 20811368858400.0, + "grad_norm": 2.950153384251145, + "language_loss": 0.73260736, + "learning_rate": 8.590675499086841e-07, + "loss": 0.75412244, + "num_input_tokens_seen": 252080335, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11547852, + "step": 11683, + "time_per_iteration": 2.608729600906372 + }, + { + "auxiliary_loss_clip": 0.01115867, + "auxiliary_loss_mlp": 0.01032125, + "balance_loss_clip": 1.04266119, + "balance_loss_mlp": 1.02022171, + "epoch": 0.7024800841725537, + "flos": 31541407261920.0, + "grad_norm": 2.512628022880345, + "language_loss": 0.71510112, + "learning_rate": 8.587476984611976e-07, + "loss": 0.73658103, + "num_input_tokens_seen": 252101075, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11907959, + "step": 11684, + "time_per_iteration": 2.6936867237091064 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01032859, + "balance_loss_clip": 1.04091573, + "balance_loss_mlp": 1.02081859, + "epoch": 0.7025402074252217, + "flos": 28690896314880.0, + "grad_norm": 2.160079163614821, + "language_loss": 0.71680063, + "learning_rate": 8.584278902901128e-07, + "loss": 0.73827791, + "num_input_tokens_seen": 252120510, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12054443, + "step": 11685, + "time_per_iteration": 2.7531185150146484 + }, + { + "auxiliary_loss_clip": 0.0111617, + "auxiliary_loss_mlp": 0.01028345, + "balance_loss_clip": 1.0406549, + "balance_loss_mlp": 1.01725304, + "epoch": 0.7026003306778896, + "flos": 24588781813440.0, + "grad_norm": 2.4602473530250464, + "language_loss": 0.84522855, + "learning_rate": 8.581081254075582e-07, + "loss": 0.86667371, + "num_input_tokens_seen": 252137590, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11083984, + "step": 11686, + "time_per_iteration": 2.605739116668701 + }, + { + "auxiliary_loss_clip": 0.01033405, + "auxiliary_loss_mlp": 0.01001833, + "balance_loss_clip": 1.01015723, + "balance_loss_mlp": 1.00078988, + "epoch": 0.7026604539305576, + "flos": 79035962323680.0, + "grad_norm": 0.995521264392388, + "language_loss": 0.69838649, + "learning_rate": 8.577884038256566e-07, + "loss": 0.71873885, + "num_input_tokens_seen": 252199830, + "router_z_loss_clip": 0.23266602, + "router_z_loss_mlp": 0.01043701, + "step": 11687, + "time_per_iteration": 3.441798210144043 + }, + { + "auxiliary_loss_clip": 0.01115518, + "auxiliary_loss_mlp": 0.01027806, + "balance_loss_clip": 1.040622, + "balance_loss_mlp": 1.01635027, + "epoch": 0.7027205771832256, + "flos": 26687724138720.0, + "grad_norm": 1.9068228059751196, + "language_loss": 0.76864624, + "learning_rate": 8.574687255565329e-07, + "loss": 0.79007941, + "num_input_tokens_seen": 252217200, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11444092, + "step": 11688, + "time_per_iteration": 2.6305999755859375 + }, + { + "auxiliary_loss_clip": 0.011144, + "auxiliary_loss_mlp": 0.0103014, + "balance_loss_clip": 1.04021096, + "balance_loss_mlp": 1.0182966, + "epoch": 0.7027807004358936, + "flos": 28513618099200.0, + "grad_norm": 2.2236507622622343, + "language_loss": 0.68015158, + "learning_rate": 8.571490906123107e-07, + "loss": 0.70159698, + "num_input_tokens_seen": 252236105, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11846924, + "step": 11689, + "time_per_iteration": 2.65502667427063 + }, + { + "auxiliary_loss_clip": 0.01115274, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.03895462, + "balance_loss_mlp": 1.02344322, + "epoch": 0.7028408236885616, + "flos": 18673900950240.0, + "grad_norm": 2.647555082105489, + "language_loss": 0.79413462, + "learning_rate": 8.568294990051086e-07, + "loss": 0.81564701, + "num_input_tokens_seen": 252253315, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12518311, + "step": 11690, + "time_per_iteration": 2.624037981033325 + }, + { + "auxiliary_loss_clip": 0.01116144, + "auxiliary_loss_mlp": 0.01034492, + "balance_loss_clip": 1.0428226, + "balance_loss_mlp": 1.02262449, + "epoch": 0.7029009469412295, + "flos": 26866704080160.0, + "grad_norm": 1.6070160928208115, + "language_loss": 0.76001263, + "learning_rate": 8.56509950747047e-07, + "loss": 0.781519, + "num_input_tokens_seen": 252272765, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11883545, + "step": 11691, + "time_per_iteration": 2.6580593585968018 + }, + { + "auxiliary_loss_clip": 0.01115662, + "auxiliary_loss_mlp": 0.01026074, + "balance_loss_clip": 1.04274058, + "balance_loss_mlp": 1.01448059, + "epoch": 0.7029610701938975, + "flos": 26643809757600.0, + "grad_norm": 1.7925466833209565, + "language_loss": 0.82066071, + "learning_rate": 8.561904458502429e-07, + "loss": 0.84207803, + "num_input_tokens_seen": 252290510, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11584473, + "step": 11692, + "time_per_iteration": 2.636852264404297 + }, + { + "auxiliary_loss_clip": 0.01113832, + "auxiliary_loss_mlp": 0.0102814, + "balance_loss_clip": 1.04007316, + "balance_loss_mlp": 1.01595616, + "epoch": 0.7030211934465654, + "flos": 23349211891200.0, + "grad_norm": 2.126690625311167, + "language_loss": 0.76944739, + "learning_rate": 8.558709843268111e-07, + "loss": 0.79086709, + "num_input_tokens_seen": 252309365, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.1217041, + "step": 11693, + "time_per_iteration": 2.6443257331848145 + }, + { + "auxiliary_loss_clip": 0.01116179, + "auxiliary_loss_mlp": 0.01038446, + "balance_loss_clip": 1.04355907, + "balance_loss_mlp": 1.02752662, + "epoch": 0.7030813166992335, + "flos": 47035233645120.0, + "grad_norm": 2.6300529116742775, + "language_loss": 0.68366265, + "learning_rate": 8.55551566188866e-07, + "loss": 0.70520896, + "num_input_tokens_seen": 252333010, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10919189, + "step": 11694, + "time_per_iteration": 2.8171560764312744 + }, + { + "auxiliary_loss_clip": 0.01115931, + "auxiliary_loss_mlp": 0.01030584, + "balance_loss_clip": 1.04120517, + "balance_loss_mlp": 1.01917601, + "epoch": 0.7031414399519014, + "flos": 17961992395200.0, + "grad_norm": 2.3633507021221107, + "language_loss": 0.75657505, + "learning_rate": 8.552321914485203e-07, + "loss": 0.77804017, + "num_input_tokens_seen": 252351330, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11401367, + "step": 11695, + "time_per_iteration": 2.611804962158203 + }, + { + "auxiliary_loss_clip": 0.01117957, + "auxiliary_loss_mlp": 0.01038438, + "balance_loss_clip": 1.04191232, + "balance_loss_mlp": 1.02620721, + "epoch": 0.7032015632045694, + "flos": 17115585521760.0, + "grad_norm": 5.667382650004995, + "language_loss": 0.73801029, + "learning_rate": 8.549128601178852e-07, + "loss": 0.75957417, + "num_input_tokens_seen": 252369580, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12237549, + "step": 11696, + "time_per_iteration": 4.1012914180755615 + }, + { + "auxiliary_loss_clip": 0.01116069, + "auxiliary_loss_mlp": 0.0103243, + "balance_loss_clip": 1.04079235, + "balance_loss_mlp": 1.0199784, + "epoch": 0.7032616864572373, + "flos": 33723397310400.0, + "grad_norm": 1.693068038458534, + "language_loss": 0.75366229, + "learning_rate": 8.545935722090693e-07, + "loss": 0.77514732, + "num_input_tokens_seen": 252390525, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12438965, + "step": 11697, + "time_per_iteration": 2.7415361404418945 + }, + { + "auxiliary_loss_clip": 0.01119752, + "auxiliary_loss_mlp": 0.01039936, + "balance_loss_clip": 1.04300547, + "balance_loss_mlp": 1.02631009, + "epoch": 0.7033218097099053, + "flos": 21923571503520.0, + "grad_norm": 2.305850074461305, + "language_loss": 0.80673981, + "learning_rate": 8.542743277341793e-07, + "loss": 0.82833672, + "num_input_tokens_seen": 252407470, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.13616943, + "step": 11698, + "time_per_iteration": 2.655674934387207 + }, + { + "auxiliary_loss_clip": 0.01115141, + "auxiliary_loss_mlp": 0.01034841, + "balance_loss_clip": 1.03987896, + "balance_loss_mlp": 1.02304542, + "epoch": 0.7033819329625732, + "flos": 23795810881920.0, + "grad_norm": 1.4208413529667407, + "language_loss": 0.84829611, + "learning_rate": 8.539551267053222e-07, + "loss": 0.86979592, + "num_input_tokens_seen": 252427025, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11798096, + "step": 11699, + "time_per_iteration": 3.9827632904052734 + }, + { + "auxiliary_loss_clip": 0.0111703, + "auxiliary_loss_mlp": 0.01029403, + "balance_loss_clip": 1.04338276, + "balance_loss_mlp": 1.01707685, + "epoch": 0.7034420562152413, + "flos": 29270048794560.0, + "grad_norm": 2.3910135319236145, + "language_loss": 0.79244876, + "learning_rate": 8.53635969134601e-07, + "loss": 0.81391311, + "num_input_tokens_seen": 252445410, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12322998, + "step": 11700, + "time_per_iteration": 2.626553535461426 + }, + { + "auxiliary_loss_clip": 0.01115103, + "auxiliary_loss_mlp": 0.01025321, + "balance_loss_clip": 1.03959382, + "balance_loss_mlp": 1.01301813, + "epoch": 0.7035021794679092, + "flos": 42761108174400.0, + "grad_norm": 2.0780066891634905, + "language_loss": 0.7420646, + "learning_rate": 8.533168550341186e-07, + "loss": 0.76346886, + "num_input_tokens_seen": 252463905, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12310791, + "step": 11701, + "time_per_iteration": 2.7398009300231934 + }, + { + "auxiliary_loss_clip": 0.01116912, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.04084468, + "balance_loss_mlp": 1.01865077, + "epoch": 0.7035623027205772, + "flos": 13418505632160.0, + "grad_norm": 2.6071502813561795, + "language_loss": 0.84398103, + "learning_rate": 8.529977844159769e-07, + "loss": 0.865466, + "num_input_tokens_seen": 252478655, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12927246, + "step": 11702, + "time_per_iteration": 2.5723185539245605 + }, + { + "auxiliary_loss_clip": 0.01114198, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.03929543, + "balance_loss_mlp": 1.02140594, + "epoch": 0.7036224259732452, + "flos": 28825880840640.0, + "grad_norm": 2.148145178122394, + "language_loss": 0.61096156, + "learning_rate": 8.526787572922738e-07, + "loss": 0.63244021, + "num_input_tokens_seen": 252498740, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12243652, + "step": 11703, + "time_per_iteration": 2.6405029296875 + }, + { + "auxiliary_loss_clip": 0.01113961, + "auxiliary_loss_mlp": 0.01025571, + "balance_loss_clip": 1.03929472, + "balance_loss_mlp": 1.01349521, + "epoch": 0.7036825492259131, + "flos": 38664382471200.0, + "grad_norm": 2.1371509622305753, + "language_loss": 0.61250222, + "learning_rate": 8.523597736751067e-07, + "loss": 0.63389754, + "num_input_tokens_seen": 252517800, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.1206665, + "step": 11704, + "time_per_iteration": 2.709296226501465 + }, + { + "auxiliary_loss_clip": 0.01110113, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.03958607, + "balance_loss_mlp": 1.01982176, + "epoch": 0.7037426724785811, + "flos": 36841891962240.0, + "grad_norm": 1.973943025386473, + "language_loss": 0.71028721, + "learning_rate": 8.520408335765719e-07, + "loss": 0.73169184, + "num_input_tokens_seen": 252539620, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10522461, + "step": 11705, + "time_per_iteration": 2.699005365371704 + }, + { + "auxiliary_loss_clip": 0.01113371, + "auxiliary_loss_mlp": 0.01029617, + "balance_loss_clip": 1.0406065, + "balance_loss_mlp": 1.01816666, + "epoch": 0.703802795731249, + "flos": 29665683397440.0, + "grad_norm": 1.990633115573845, + "language_loss": 0.6177094, + "learning_rate": 8.517219370087645e-07, + "loss": 0.63913929, + "num_input_tokens_seen": 252557300, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11456299, + "step": 11706, + "time_per_iteration": 2.6286189556121826 + }, + { + "auxiliary_loss_clip": 0.01115635, + "auxiliary_loss_mlp": 0.010272, + "balance_loss_clip": 1.04075634, + "balance_loss_mlp": 1.0160296, + "epoch": 0.7038629189839171, + "flos": 27489973527360.0, + "grad_norm": 2.076280844027102, + "language_loss": 0.68362606, + "learning_rate": 8.514030839837756e-07, + "loss": 0.7050544, + "num_input_tokens_seen": 252576715, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11175537, + "step": 11707, + "time_per_iteration": 4.158066511154175 + }, + { + "auxiliary_loss_clip": 0.01110851, + "auxiliary_loss_mlp": 0.01026832, + "balance_loss_clip": 1.03914964, + "balance_loss_mlp": 1.01549554, + "epoch": 0.703923042236585, + "flos": 32031272357280.0, + "grad_norm": 2.579378117968914, + "language_loss": 0.76489842, + "learning_rate": 8.510842745136974e-07, + "loss": 0.78627527, + "num_input_tokens_seen": 252596190, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11322021, + "step": 11708, + "time_per_iteration": 3.9822821617126465 + }, + { + "auxiliary_loss_clip": 0.01113424, + "auxiliary_loss_mlp": 0.01027631, + "balance_loss_clip": 1.04115188, + "balance_loss_mlp": 1.01635396, + "epoch": 0.703983165489253, + "flos": 23659975493280.0, + "grad_norm": 2.193188876642257, + "language_loss": 0.72623181, + "learning_rate": 8.50765508610619e-07, + "loss": 0.7476424, + "num_input_tokens_seen": 252613410, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11273193, + "step": 11709, + "time_per_iteration": 2.6942946910858154 + }, + { + "auxiliary_loss_clip": 0.01111219, + "auxiliary_loss_mlp": 0.01026929, + "balance_loss_clip": 1.0382911, + "balance_loss_mlp": 1.01546657, + "epoch": 0.7040432887419209, + "flos": 20357760378240.0, + "grad_norm": 2.039955892100913, + "language_loss": 0.78750449, + "learning_rate": 8.504467862866267e-07, + "loss": 0.80888593, + "num_input_tokens_seen": 252629150, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11456299, + "step": 11710, + "time_per_iteration": 2.6016135215759277 + }, + { + "auxiliary_loss_clip": 0.01116259, + "auxiliary_loss_mlp": 0.01030119, + "balance_loss_clip": 1.04168439, + "balance_loss_mlp": 1.01798368, + "epoch": 0.7041034119945889, + "flos": 25796876159520.0, + "grad_norm": 1.810740697899537, + "language_loss": 0.77261829, + "learning_rate": 8.501281075538076e-07, + "loss": 0.79408211, + "num_input_tokens_seen": 252648225, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12139893, + "step": 11711, + "time_per_iteration": 2.6532106399536133 + }, + { + "auxiliary_loss_clip": 0.01111571, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.03874528, + "balance_loss_mlp": 1.01860571, + "epoch": 0.7041635352472568, + "flos": 20633644952640.0, + "grad_norm": 2.814532305709773, + "language_loss": 0.74184752, + "learning_rate": 8.498094724242457e-07, + "loss": 0.76325309, + "num_input_tokens_seen": 252665380, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.10369873, + "step": 11712, + "time_per_iteration": 2.614471197128296 + }, + { + "auxiliary_loss_clip": 0.01032382, + "auxiliary_loss_mlp": 0.01000744, + "balance_loss_clip": 1.00921357, + "balance_loss_mlp": 0.99960262, + "epoch": 0.7042236584999249, + "flos": 87465435651360.0, + "grad_norm": 0.8791017284697635, + "language_loss": 0.6460405, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66637176, + "num_input_tokens_seen": 252727950, + "router_z_loss_clip": 0.23205566, + "router_z_loss_mlp": 0.01141357, + "step": 11713, + "time_per_iteration": 3.291015625 + }, + { + "auxiliary_loss_clip": 0.01111665, + "auxiliary_loss_mlp": 0.01027252, + "balance_loss_clip": 1.03774691, + "balance_loss_mlp": 1.01654124, + "epoch": 0.7042837817525928, + "flos": 34969368962880.0, + "grad_norm": 2.066140682965144, + "language_loss": 0.72701168, + "learning_rate": 8.49172333023225e-07, + "loss": 0.74840087, + "num_input_tokens_seen": 252746770, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.1071167, + "step": 11714, + "time_per_iteration": 2.7055141925811768 + }, + { + "auxiliary_loss_clip": 0.01112122, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.03900528, + "balance_loss_mlp": 1.02668095, + "epoch": 0.7043439050052608, + "flos": 24103454653440.0, + "grad_norm": 2.019488765310995, + "language_loss": 0.79514945, + "learning_rate": 8.488538287759248e-07, + "loss": 0.81665647, + "num_input_tokens_seen": 252765610, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11901855, + "step": 11715, + "time_per_iteration": 2.6262030601501465 + }, + { + "auxiliary_loss_clip": 0.01115446, + "auxiliary_loss_mlp": 0.01032736, + "balance_loss_clip": 1.04018927, + "balance_loss_mlp": 1.02066612, + "epoch": 0.7044040282579288, + "flos": 14079368764800.0, + "grad_norm": 2.3864077328729203, + "language_loss": 0.71631944, + "learning_rate": 8.485353681802037e-07, + "loss": 0.73780131, + "num_input_tokens_seen": 252781610, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12078857, + "step": 11716, + "time_per_iteration": 2.6856820583343506 + }, + { + "auxiliary_loss_clip": 0.01117671, + "auxiliary_loss_mlp": 0.01030866, + "balance_loss_clip": 1.04151511, + "balance_loss_mlp": 1.01891565, + "epoch": 0.7044641515105967, + "flos": 41068294427520.0, + "grad_norm": 2.165238833034143, + "language_loss": 0.66700387, + "learning_rate": 8.482169512481358e-07, + "loss": 0.68848926, + "num_input_tokens_seen": 252800600, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11956787, + "step": 11717, + "time_per_iteration": 2.743525743484497 + }, + { + "auxiliary_loss_clip": 0.0111455, + "auxiliary_loss_mlp": 0.01028637, + "balance_loss_clip": 1.04068184, + "balance_loss_mlp": 1.01728201, + "epoch": 0.7045242747632647, + "flos": 32609006732160.0, + "grad_norm": 1.894937433499458, + "language_loss": 0.74155295, + "learning_rate": 8.478985779917967e-07, + "loss": 0.76298481, + "num_input_tokens_seen": 252822310, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11352539, + "step": 11718, + "time_per_iteration": 2.698697566986084 + }, + { + "auxiliary_loss_clip": 0.01113533, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.04108322, + "balance_loss_mlp": 1.01828027, + "epoch": 0.7045843980159326, + "flos": 32698982910240.0, + "grad_norm": 1.8509224707968186, + "language_loss": 0.79819357, + "learning_rate": 8.475802484232606e-07, + "loss": 0.81962037, + "num_input_tokens_seen": 252842355, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10876465, + "step": 11719, + "time_per_iteration": 2.661320924758911 + }, + { + "auxiliary_loss_clip": 0.0111509, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.04178619, + "balance_loss_mlp": 1.01967847, + "epoch": 0.7046445212686007, + "flos": 50731179050880.0, + "grad_norm": 2.490301381477452, + "language_loss": 0.6542744, + "learning_rate": 8.472619625545951e-07, + "loss": 0.67573959, + "num_input_tokens_seen": 252866785, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11730957, + "step": 11720, + "time_per_iteration": 2.854532241821289 + }, + { + "auxiliary_loss_clip": 0.01117595, + "auxiliary_loss_mlp": 0.01027057, + "balance_loss_clip": 1.04240882, + "balance_loss_mlp": 1.01529682, + "epoch": 0.7047046445212686, + "flos": 18986123174400.0, + "grad_norm": 2.254475403658223, + "language_loss": 0.80061018, + "learning_rate": 8.46943720397872e-07, + "loss": 0.82205665, + "num_input_tokens_seen": 252881870, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11773682, + "step": 11721, + "time_per_iteration": 2.5831284523010254 + }, + { + "auxiliary_loss_clip": 0.01032164, + "auxiliary_loss_mlp": 0.01001233, + "balance_loss_clip": 1.00915122, + "balance_loss_mlp": 1.00005698, + "epoch": 0.7047647677739366, + "flos": 85913967643200.0, + "grad_norm": 0.7634385848854531, + "language_loss": 0.6477657, + "learning_rate": 8.466255219651582e-07, + "loss": 0.66809964, + "num_input_tokens_seen": 252951300, + "router_z_loss_clip": 0.23022461, + "router_z_loss_mlp": 0.01176453, + "step": 11722, + "time_per_iteration": 3.340977668762207 + }, + { + "auxiliary_loss_clip": 0.01115185, + "auxiliary_loss_mlp": 0.01028526, + "balance_loss_clip": 1.04270029, + "balance_loss_mlp": 1.01786244, + "epoch": 0.7048248910266045, + "flos": 28870200394560.0, + "grad_norm": 1.9031647107887353, + "language_loss": 0.65989977, + "learning_rate": 8.463073672685211e-07, + "loss": 0.68133688, + "num_input_tokens_seen": 252971400, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10668945, + "step": 11723, + "time_per_iteration": 2.6636557579040527 + }, + { + "auxiliary_loss_clip": 0.0111692, + "auxiliary_loss_mlp": 0.01027202, + "balance_loss_clip": 1.04176128, + "balance_loss_mlp": 1.01563299, + "epoch": 0.7048850142792725, + "flos": 26108936314560.0, + "grad_norm": 1.9792249973540958, + "language_loss": 0.81248868, + "learning_rate": 8.459892563200235e-07, + "loss": 0.8339299, + "num_input_tokens_seen": 252989475, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11560059, + "step": 11724, + "time_per_iteration": 2.650911569595337 + }, + { + "auxiliary_loss_clip": 0.01114267, + "auxiliary_loss_mlp": 0.0103549, + "balance_loss_clip": 1.03941202, + "balance_loss_mlp": 1.02389657, + "epoch": 0.7049451375319404, + "flos": 26415567154080.0, + "grad_norm": 1.6518544252779395, + "language_loss": 0.73333895, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75483656, + "num_input_tokens_seen": 253007220, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.1159668, + "step": 11725, + "time_per_iteration": 2.6675586700439453 + }, + { + "auxiliary_loss_clip": 0.01114058, + "auxiliary_loss_mlp": 0.01030305, + "balance_loss_clip": 1.03917599, + "balance_loss_mlp": 1.01793718, + "epoch": 0.7050052607846085, + "flos": 18140040439200.0, + "grad_norm": 2.0798999430573306, + "language_loss": 0.78572088, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80716449, + "num_input_tokens_seen": 253025410, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12365723, + "step": 11726, + "time_per_iteration": 2.6110687255859375 + }, + { + "auxiliary_loss_clip": 0.01114174, + "auxiliary_loss_mlp": 0.01032242, + "balance_loss_clip": 1.03949177, + "balance_loss_mlp": 1.02082777, + "epoch": 0.7050653840372764, + "flos": 23479010205120.0, + "grad_norm": 1.682657806824506, + "language_loss": 0.70538998, + "learning_rate": 8.450351860839931e-07, + "loss": 0.72685415, + "num_input_tokens_seen": 253043305, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11413574, + "step": 11727, + "time_per_iteration": 2.643293857574463 + }, + { + "auxiliary_loss_clip": 0.01106192, + "auxiliary_loss_mlp": 0.01025402, + "balance_loss_clip": 1.03774095, + "balance_loss_mlp": 1.01501858, + "epoch": 0.7051255072899444, + "flos": 33898447075680.0, + "grad_norm": 1.8703797192936678, + "language_loss": 0.68600416, + "learning_rate": 8.44717250248668e-07, + "loss": 0.70732009, + "num_input_tokens_seen": 253062790, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.10388184, + "step": 11728, + "time_per_iteration": 2.678546190261841 + }, + { + "auxiliary_loss_clip": 0.01114091, + "auxiliary_loss_mlp": 0.01029854, + "balance_loss_clip": 1.04085219, + "balance_loss_mlp": 1.0181179, + "epoch": 0.7051856305426124, + "flos": 34034444533440.0, + "grad_norm": 1.8782195241307373, + "language_loss": 0.73472935, + "learning_rate": 8.443993582217803e-07, + "loss": 0.75616884, + "num_input_tokens_seen": 253082055, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11724854, + "step": 11729, + "time_per_iteration": 2.6997222900390625 + }, + { + "auxiliary_loss_clip": 0.01119972, + "auxiliary_loss_mlp": 0.01033899, + "balance_loss_clip": 1.04174137, + "balance_loss_mlp": 1.02129257, + "epoch": 0.7052457537952803, + "flos": 30558759827040.0, + "grad_norm": 1.664725590659243, + "language_loss": 0.78010672, + "learning_rate": 8.440815100153862e-07, + "loss": 0.8016454, + "num_input_tokens_seen": 253102575, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.1260376, + "step": 11730, + "time_per_iteration": 2.7440781593322754 + }, + { + "auxiliary_loss_clip": 0.01113256, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.03858209, + "balance_loss_mlp": 1.02107668, + "epoch": 0.7053058770479483, + "flos": 26688007759680.0, + "grad_norm": 2.3760216301247477, + "language_loss": 0.62680876, + "learning_rate": 8.437637056415359e-07, + "loss": 0.64826918, + "num_input_tokens_seen": 253121290, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11700439, + "step": 11731, + "time_per_iteration": 2.810312509536743 + }, + { + "auxiliary_loss_clip": 0.01115733, + "auxiliary_loss_mlp": 0.01026962, + "balance_loss_clip": 1.0402596, + "balance_loss_mlp": 1.01474905, + "epoch": 0.7053660003006162, + "flos": 20009443608000.0, + "grad_norm": 2.2840069002809145, + "language_loss": 0.74509418, + "learning_rate": 8.434459451122815e-07, + "loss": 0.7665211, + "num_input_tokens_seen": 253139720, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12219238, + "step": 11732, + "time_per_iteration": 2.622231960296631 + }, + { + "auxiliary_loss_clip": 0.01114298, + "auxiliary_loss_mlp": 0.01025921, + "balance_loss_clip": 1.0423305, + "balance_loss_mlp": 1.01500094, + "epoch": 0.7054261235532843, + "flos": 27712786815360.0, + "grad_norm": 1.6551639746272953, + "language_loss": 0.70949531, + "learning_rate": 8.431282284396735e-07, + "loss": 0.73089749, + "num_input_tokens_seen": 253160250, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10925293, + "step": 11733, + "time_per_iteration": 2.6803596019744873 + }, + { + "auxiliary_loss_clip": 0.01112307, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.03915226, + "balance_loss_mlp": 1.01833904, + "epoch": 0.7054862468059522, + "flos": 16581643976160.0, + "grad_norm": 1.9573068107227682, + "language_loss": 0.7309885, + "learning_rate": 8.428105556357583e-07, + "loss": 0.7524116, + "num_input_tokens_seen": 253178710, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11663818, + "step": 11734, + "time_per_iteration": 2.6263861656188965 + }, + { + "auxiliary_loss_clip": 0.01118624, + "auxiliary_loss_mlp": 0.0103271, + "balance_loss_clip": 1.04135704, + "balance_loss_mlp": 1.02097332, + "epoch": 0.7055463700586202, + "flos": 19377989670240.0, + "grad_norm": 2.233685247842489, + "language_loss": 0.69360393, + "learning_rate": 8.424929267125829e-07, + "loss": 0.71511722, + "num_input_tokens_seen": 253194805, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.11737061, + "step": 11735, + "time_per_iteration": 4.144705772399902 + }, + { + "auxiliary_loss_clip": 0.01116549, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.04145885, + "balance_loss_mlp": 1.02180266, + "epoch": 0.7056064933112881, + "flos": 28157724597600.0, + "grad_norm": 1.9441070091167607, + "language_loss": 0.72507465, + "learning_rate": 8.421753416821933e-07, + "loss": 0.74659014, + "num_input_tokens_seen": 253213895, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.13214111, + "step": 11736, + "time_per_iteration": 2.663769483566284 + }, + { + "auxiliary_loss_clip": 0.01114201, + "auxiliary_loss_mlp": 0.0102548, + "balance_loss_clip": 1.04205251, + "balance_loss_mlp": 1.0148046, + "epoch": 0.7056666165639561, + "flos": 29355203416320.0, + "grad_norm": 2.0450159584502656, + "language_loss": 0.69211048, + "learning_rate": 8.41857800556629e-07, + "loss": 0.71350729, + "num_input_tokens_seen": 253231620, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10675049, + "step": 11737, + "time_per_iteration": 2.6389353275299072 + }, + { + "auxiliary_loss_clip": 0.01117052, + "auxiliary_loss_mlp": 0.01037479, + "balance_loss_clip": 1.04120231, + "balance_loss_mlp": 1.02576661, + "epoch": 0.705726739816624, + "flos": 21345067300320.0, + "grad_norm": 2.551914149733846, + "language_loss": 0.68068421, + "learning_rate": 8.415403033479332e-07, + "loss": 0.7022295, + "num_input_tokens_seen": 253249590, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11706543, + "step": 11738, + "time_per_iteration": 4.021930694580078 + }, + { + "auxiliary_loss_clip": 0.0111516, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.04118907, + "balance_loss_mlp": 1.02092648, + "epoch": 0.7057868630692921, + "flos": 62660357481600.0, + "grad_norm": 2.028561810259961, + "language_loss": 0.7518332, + "learning_rate": 8.41222850068145e-07, + "loss": 0.7733115, + "num_input_tokens_seen": 253273870, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11755371, + "step": 11739, + "time_per_iteration": 2.9187803268432617 + }, + { + "auxiliary_loss_clip": 0.01111466, + "auxiliary_loss_mlp": 0.01027445, + "balance_loss_clip": 1.04055119, + "balance_loss_mlp": 1.01603627, + "epoch": 0.70584698632196, + "flos": 31851401035680.0, + "grad_norm": 2.1490729689401635, + "language_loss": 0.71067846, + "learning_rate": 8.409054407293032e-07, + "loss": 0.73206758, + "num_input_tokens_seen": 253293720, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11395264, + "step": 11740, + "time_per_iteration": 2.6515157222747803 + }, + { + "auxiliary_loss_clip": 0.01115262, + "auxiliary_loss_mlp": 0.01026579, + "balance_loss_clip": 1.04212928, + "balance_loss_mlp": 1.01607084, + "epoch": 0.705907109574628, + "flos": 26287065393120.0, + "grad_norm": 2.071351344055652, + "language_loss": 0.82060361, + "learning_rate": 8.405880753434434e-07, + "loss": 0.84202206, + "num_input_tokens_seen": 253313700, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.10510254, + "step": 11741, + "time_per_iteration": 2.700289249420166 + }, + { + "auxiliary_loss_clip": 0.01116171, + "auxiliary_loss_mlp": 0.0102736, + "balance_loss_clip": 1.04145014, + "balance_loss_mlp": 1.01561821, + "epoch": 0.705967232827296, + "flos": 27711976469760.0, + "grad_norm": 2.4690113116864816, + "language_loss": 0.77928853, + "learning_rate": 8.402707539225993e-07, + "loss": 0.80072385, + "num_input_tokens_seen": 253332425, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11743164, + "step": 11742, + "time_per_iteration": 2.6670734882354736 + }, + { + "auxiliary_loss_clip": 0.01118206, + "auxiliary_loss_mlp": 0.01033373, + "balance_loss_clip": 1.0409441, + "balance_loss_mlp": 1.02098727, + "epoch": 0.7060273560799639, + "flos": 35010001444320.0, + "grad_norm": 1.5187365051943473, + "language_loss": 0.64146167, + "learning_rate": 8.39953476478805e-07, + "loss": 0.66297746, + "num_input_tokens_seen": 253353620, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.1239624, + "step": 11743, + "time_per_iteration": 2.858849048614502 + }, + { + "auxiliary_loss_clip": 0.01115137, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.03914237, + "balance_loss_mlp": 1.01682079, + "epoch": 0.7060874793326319, + "flos": 19164292770240.0, + "grad_norm": 2.5047471038685205, + "language_loss": 0.65689111, + "learning_rate": 8.396362430240902e-07, + "loss": 0.6783334, + "num_input_tokens_seen": 253370930, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12280273, + "step": 11744, + "time_per_iteration": 2.6459386348724365 + }, + { + "auxiliary_loss_clip": 0.01114278, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.04153466, + "balance_loss_mlp": 1.0188787, + "epoch": 0.7061476025852998, + "flos": 26242664804640.0, + "grad_norm": 2.0281113550231504, + "language_loss": 0.63862735, + "learning_rate": 8.393190535704857e-07, + "loss": 0.66007251, + "num_input_tokens_seen": 253389810, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11364746, + "step": 11745, + "time_per_iteration": 2.7136590480804443 + }, + { + "auxiliary_loss_clip": 0.01115917, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.04095376, + "balance_loss_mlp": 1.02105796, + "epoch": 0.7062077258379679, + "flos": 34388271653760.0, + "grad_norm": 1.7336703698971159, + "language_loss": 0.71703041, + "learning_rate": 8.390019081300188e-07, + "loss": 0.73851037, + "num_input_tokens_seen": 253408685, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11010742, + "step": 11746, + "time_per_iteration": 4.194290399551392 + }, + { + "auxiliary_loss_clip": 0.01116888, + "auxiliary_loss_mlp": 0.01031024, + "balance_loss_clip": 1.04239678, + "balance_loss_mlp": 1.01941323, + "epoch": 0.7062678490906358, + "flos": 33987815494560.0, + "grad_norm": 1.543254509024017, + "language_loss": 0.79418886, + "learning_rate": 8.386848067147175e-07, + "loss": 0.81566793, + "num_input_tokens_seen": 253429685, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.1161499, + "step": 11747, + "time_per_iteration": 3.9574434757232666 + }, + { + "auxiliary_loss_clip": 0.01112773, + "auxiliary_loss_mlp": 0.01031241, + "balance_loss_clip": 1.04044271, + "balance_loss_mlp": 1.02042866, + "epoch": 0.7063279723433038, + "flos": 28290440155680.0, + "grad_norm": 2.0791112533298826, + "language_loss": 0.65290284, + "learning_rate": 8.383677493366031e-07, + "loss": 0.67434299, + "num_input_tokens_seen": 253448260, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.1081543, + "step": 11748, + "time_per_iteration": 2.6185169219970703 + }, + { + "auxiliary_loss_clip": 0.01115663, + "auxiliary_loss_mlp": 0.01036375, + "balance_loss_clip": 1.04110193, + "balance_loss_mlp": 1.02428746, + "epoch": 0.7063880955959717, + "flos": 24634641024000.0, + "grad_norm": 1.888837154976362, + "language_loss": 0.79242152, + "learning_rate": 8.380507360077003e-07, + "loss": 0.8139419, + "num_input_tokens_seen": 253467725, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12097168, + "step": 11749, + "time_per_iteration": 2.6368045806884766 + }, + { + "auxiliary_loss_clip": 0.01034181, + "auxiliary_loss_mlp": 0.01001306, + "balance_loss_clip": 1.01094294, + "balance_loss_mlp": 1.00011873, + "epoch": 0.7064482188486397, + "flos": 77689398965760.0, + "grad_norm": 0.78513195688264, + "language_loss": 0.5403856, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56074047, + "num_input_tokens_seen": 253526940, + "router_z_loss_clip": 0.23254395, + "router_z_loss_mlp": 0.01186371, + "step": 11750, + "time_per_iteration": 3.2008068561553955 + }, + { + "auxiliary_loss_clip": 0.0111523, + "auxiliary_loss_mlp": 0.01031634, + "balance_loss_clip": 1.04079485, + "balance_loss_mlp": 1.01939142, + "epoch": 0.7065083421013076, + "flos": 30738793217760.0, + "grad_norm": 1.6916026641999689, + "language_loss": 0.78497744, + "learning_rate": 8.37416841545612e-07, + "loss": 0.80644608, + "num_input_tokens_seen": 253546160, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12243652, + "step": 11751, + "time_per_iteration": 2.6610476970672607 + }, + { + "auxiliary_loss_clip": 0.01110845, + "auxiliary_loss_mlp": 0.01028121, + "balance_loss_clip": 1.03966928, + "balance_loss_mlp": 1.01732063, + "epoch": 0.7065684653539757, + "flos": 27935073378720.0, + "grad_norm": 2.476919137960537, + "language_loss": 0.68167788, + "learning_rate": 8.370999604364634e-07, + "loss": 0.70306754, + "num_input_tokens_seen": 253565505, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10803223, + "step": 11752, + "time_per_iteration": 2.6644091606140137 + }, + { + "auxiliary_loss_clip": 0.01113304, + "auxiliary_loss_mlp": 0.01034399, + "balance_loss_clip": 1.04055023, + "balance_loss_mlp": 1.02297294, + "epoch": 0.7066285886066436, + "flos": 28736269318080.0, + "grad_norm": 2.561337102276034, + "language_loss": 0.76559782, + "learning_rate": 8.367831234246025e-07, + "loss": 0.7870748, + "num_input_tokens_seen": 253585125, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11437988, + "step": 11753, + "time_per_iteration": 2.6718928813934326 + }, + { + "auxiliary_loss_clip": 0.01112507, + "auxiliary_loss_mlp": 0.01031262, + "balance_loss_clip": 1.04084575, + "balance_loss_mlp": 1.01978791, + "epoch": 0.7066887118593116, + "flos": 25708804293600.0, + "grad_norm": 2.485983896404353, + "language_loss": 0.71178436, + "learning_rate": 8.364663305220405e-07, + "loss": 0.73322207, + "num_input_tokens_seen": 253604815, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11474609, + "step": 11754, + "time_per_iteration": 2.6451001167297363 + }, + { + "auxiliary_loss_clip": 0.01113575, + "auxiliary_loss_mlp": 0.01031688, + "balance_loss_clip": 1.03990746, + "balance_loss_mlp": 1.01988673, + "epoch": 0.7067488351119796, + "flos": 25839615539520.0, + "grad_norm": 1.6475108360095285, + "language_loss": 0.8932085, + "learning_rate": 8.361495817407919e-07, + "loss": 0.91466117, + "num_input_tokens_seen": 253622855, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11810303, + "step": 11755, + "time_per_iteration": 2.70046067237854 + }, + { + "auxiliary_loss_clip": 0.01114905, + "auxiliary_loss_mlp": 0.01032442, + "balance_loss_clip": 1.04115796, + "balance_loss_mlp": 1.02054524, + "epoch": 0.7068089583646475, + "flos": 24952130494560.0, + "grad_norm": 1.8265257299204138, + "language_loss": 0.79214567, + "learning_rate": 8.358328770928678e-07, + "loss": 0.81361914, + "num_input_tokens_seen": 253642760, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11895752, + "step": 11756, + "time_per_iteration": 2.6041548252105713 + }, + { + "auxiliary_loss_clip": 0.010344, + "auxiliary_loss_mlp": 0.01001053, + "balance_loss_clip": 1.01118708, + "balance_loss_mlp": 0.99993336, + "epoch": 0.7068690816173155, + "flos": 72125995220640.0, + "grad_norm": 0.8241337073064273, + "language_loss": 0.60397118, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62432575, + "num_input_tokens_seen": 253695685, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 0.01119995, + "step": 11757, + "time_per_iteration": 3.1549224853515625 + }, + { + "auxiliary_loss_clip": 0.01114384, + "auxiliary_loss_mlp": 0.01033336, + "balance_loss_clip": 1.04095304, + "balance_loss_mlp": 1.02185071, + "epoch": 0.7069292048699835, + "flos": 19829977459200.0, + "grad_norm": 1.900502089268923, + "language_loss": 0.80170417, + "learning_rate": 8.351996002450307e-07, + "loss": 0.82318133, + "num_input_tokens_seen": 253713305, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11480713, + "step": 11758, + "time_per_iteration": 2.7064599990844727 + }, + { + "auxiliary_loss_clip": 0.01111797, + "auxiliary_loss_mlp": 0.01031544, + "balance_loss_clip": 1.03949261, + "balance_loss_mlp": 1.01998675, + "epoch": 0.7069893281226515, + "flos": 50240625161760.0, + "grad_norm": 3.8250712393751596, + "language_loss": 0.7754432, + "learning_rate": 8.348830280691304e-07, + "loss": 0.79687661, + "num_input_tokens_seen": 253736100, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11553955, + "step": 11759, + "time_per_iteration": 2.7957851886749268 + }, + { + "auxiliary_loss_clip": 0.01113626, + "auxiliary_loss_mlp": 0.01030188, + "balance_loss_clip": 1.0396421, + "balance_loss_mlp": 1.01806462, + "epoch": 0.7070494513753194, + "flos": 29537991982080.0, + "grad_norm": 1.607737190794342, + "language_loss": 0.67871749, + "learning_rate": 8.34566500074583e-07, + "loss": 0.70015562, + "num_input_tokens_seen": 253757350, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12115479, + "step": 11760, + "time_per_iteration": 2.7110707759857178 + }, + { + "auxiliary_loss_clip": 0.01116997, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.04195595, + "balance_loss_mlp": 1.02177143, + "epoch": 0.7071095746279874, + "flos": 24631278089760.0, + "grad_norm": 2.101377699724692, + "language_loss": 0.80198485, + "learning_rate": 8.342500162733899e-07, + "loss": 0.82348728, + "num_input_tokens_seen": 253772855, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11474609, + "step": 11761, + "time_per_iteration": 2.6361286640167236 + }, + { + "auxiliary_loss_clip": 0.01115078, + "auxiliary_loss_mlp": 0.01032109, + "balance_loss_clip": 1.04078841, + "balance_loss_mlp": 1.01982474, + "epoch": 0.7071696978806553, + "flos": 22187138824800.0, + "grad_norm": 2.5310724751414178, + "language_loss": 0.74643362, + "learning_rate": 8.33933576677553e-07, + "loss": 0.76790541, + "num_input_tokens_seen": 253790360, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12286377, + "step": 11762, + "time_per_iteration": 2.6516990661621094 + }, + { + "auxiliary_loss_clip": 0.01112504, + "auxiliary_loss_mlp": 0.01031919, + "balance_loss_clip": 1.04012811, + "balance_loss_mlp": 1.0209223, + "epoch": 0.7072298211333233, + "flos": 29446030457280.0, + "grad_norm": 1.7233827266061523, + "language_loss": 0.76818848, + "learning_rate": 8.336171812990724e-07, + "loss": 0.78963268, + "num_input_tokens_seen": 253810585, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10992432, + "step": 11763, + "time_per_iteration": 2.6679091453552246 + }, + { + "auxiliary_loss_clip": 0.01114859, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.04130566, + "balance_loss_mlp": 1.02058423, + "epoch": 0.7072899443859912, + "flos": 33188078177280.0, + "grad_norm": 2.2564354273786735, + "language_loss": 0.78946197, + "learning_rate": 8.333008301499453e-07, + "loss": 0.81093609, + "num_input_tokens_seen": 253829080, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11981201, + "step": 11764, + "time_per_iteration": 2.7312943935394287 + }, + { + "auxiliary_loss_clip": 0.01117352, + "auxiliary_loss_mlp": 0.01033779, + "balance_loss_clip": 1.04130793, + "balance_loss_mlp": 1.0215894, + "epoch": 0.7073500676386593, + "flos": 20054411438400.0, + "grad_norm": 1.6129485857863164, + "language_loss": 0.79384506, + "learning_rate": 8.32984523242167e-07, + "loss": 0.81535637, + "num_input_tokens_seen": 253846780, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12188721, + "step": 11765, + "time_per_iteration": 2.5936484336853027 + }, + { + "auxiliary_loss_clip": 0.01110291, + "auxiliary_loss_mlp": 0.01028975, + "balance_loss_clip": 1.03923607, + "balance_loss_mlp": 1.01860976, + "epoch": 0.7074101908913272, + "flos": 33768000485280.0, + "grad_norm": 1.7738054605144087, + "language_loss": 0.68453228, + "learning_rate": 8.326682605877324e-07, + "loss": 0.70592499, + "num_input_tokens_seen": 253867075, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.1036377, + "step": 11766, + "time_per_iteration": 2.6800014972686768 + }, + { + "auxiliary_loss_clip": 0.01113573, + "auxiliary_loss_mlp": 0.01037345, + "balance_loss_clip": 1.03930116, + "balance_loss_mlp": 1.02484632, + "epoch": 0.7074703141439952, + "flos": 27134930888640.0, + "grad_norm": 1.9244356894960566, + "language_loss": 0.64350426, + "learning_rate": 8.323520421986352e-07, + "loss": 0.66501343, + "num_input_tokens_seen": 253885790, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12493896, + "step": 11767, + "time_per_iteration": 2.6702258586883545 + }, + { + "auxiliary_loss_clip": 0.01113764, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.03965712, + "balance_loss_mlp": 1.0192008, + "epoch": 0.7075304373966632, + "flos": 36171385716960.0, + "grad_norm": 1.6175571480216935, + "language_loss": 0.52941626, + "learning_rate": 8.320358680868646e-07, + "loss": 0.55086184, + "num_input_tokens_seen": 253907070, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11608887, + "step": 11768, + "time_per_iteration": 2.716367244720459 + }, + { + "auxiliary_loss_clip": 0.01112823, + "auxiliary_loss_mlp": 0.0102876, + "balance_loss_clip": 1.04066586, + "balance_loss_mlp": 1.01791191, + "epoch": 0.7075905606493311, + "flos": 24105318448320.0, + "grad_norm": 1.9122343255492775, + "language_loss": 0.75826114, + "learning_rate": 8.317197382644119e-07, + "loss": 0.77967703, + "num_input_tokens_seen": 253927290, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10852051, + "step": 11769, + "time_per_iteration": 2.682695150375366 + }, + { + "auxiliary_loss_clip": 0.01034636, + "auxiliary_loss_mlp": 0.01002707, + "balance_loss_clip": 1.0114733, + "balance_loss_mlp": 1.00152552, + "epoch": 0.7076506839019991, + "flos": 80188027621920.0, + "grad_norm": 0.8922655543047326, + "language_loss": 0.6196208, + "learning_rate": 8.314036527432637e-07, + "loss": 0.6399942, + "num_input_tokens_seen": 253983440, + "router_z_loss_clip": 0.23156738, + "router_z_loss_mlp": 0.0118103, + "step": 11770, + "time_per_iteration": 3.206282615661621 + }, + { + "auxiliary_loss_clip": 0.01115757, + "auxiliary_loss_mlp": 0.01036776, + "balance_loss_clip": 1.04052055, + "balance_loss_mlp": 1.02501559, + "epoch": 0.707710807154667, + "flos": 28999026293760.0, + "grad_norm": 1.9715248295507917, + "language_loss": 0.76481503, + "learning_rate": 8.310876115354055e-07, + "loss": 0.78634036, + "num_input_tokens_seen": 254003825, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11773682, + "step": 11771, + "time_per_iteration": 2.6653950214385986 + }, + { + "auxiliary_loss_clip": 0.01111281, + "auxiliary_loss_mlp": 0.010264, + "balance_loss_clip": 1.04016089, + "balance_loss_mlp": 1.01563537, + "epoch": 0.7077709304073351, + "flos": 25931009822400.0, + "grad_norm": 1.626124976895518, + "language_loss": 0.71189058, + "learning_rate": 8.307716146528221e-07, + "loss": 0.73326743, + "num_input_tokens_seen": 254023345, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10766602, + "step": 11772, + "time_per_iteration": 2.6704444885253906 + }, + { + "auxiliary_loss_clip": 0.01117265, + "auxiliary_loss_mlp": 0.01031697, + "balance_loss_clip": 1.04091704, + "balance_loss_mlp": 1.01900685, + "epoch": 0.707831053660003, + "flos": 25307902444320.0, + "grad_norm": 2.4450576334198697, + "language_loss": 0.69511771, + "learning_rate": 8.30455662107496e-07, + "loss": 0.71660727, + "num_input_tokens_seen": 254041815, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12695312, + "step": 11773, + "time_per_iteration": 2.701190710067749 + }, + { + "auxiliary_loss_clip": 0.01116276, + "auxiliary_loss_mlp": 0.01037533, + "balance_loss_clip": 1.042184, + "balance_loss_mlp": 1.02617264, + "epoch": 0.707891176912671, + "flos": 26821371594240.0, + "grad_norm": 1.8314798281414129, + "language_loss": 0.69944972, + "learning_rate": 8.301397539114095e-07, + "loss": 0.7209878, + "num_input_tokens_seen": 254062065, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11358643, + "step": 11774, + "time_per_iteration": 2.664597272872925 + }, + { + "auxiliary_loss_clip": 0.01111706, + "auxiliary_loss_mlp": 0.01026974, + "balance_loss_clip": 1.04127681, + "balance_loss_mlp": 1.01613212, + "epoch": 0.7079513001653389, + "flos": 25708439638080.0, + "grad_norm": 1.6177355906953137, + "language_loss": 0.74445868, + "learning_rate": 8.298238900765407e-07, + "loss": 0.76584548, + "num_input_tokens_seen": 254080605, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10839844, + "step": 11775, + "time_per_iteration": 4.091461896896362 + }, + { + "auxiliary_loss_clip": 0.01115874, + "auxiliary_loss_mlp": 0.01028262, + "balance_loss_clip": 1.04087162, + "balance_loss_mlp": 1.01700234, + "epoch": 0.7080114234180069, + "flos": 22013061474240.0, + "grad_norm": 1.7539938707818201, + "language_loss": 0.86886519, + "learning_rate": 8.295080706148665e-07, + "loss": 0.89030659, + "num_input_tokens_seen": 254098710, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11254883, + "step": 11776, + "time_per_iteration": 2.7044270038604736 + }, + { + "auxiliary_loss_clip": 0.01113329, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.0402627, + "balance_loss_mlp": 1.01954806, + "epoch": 0.7080715466706748, + "flos": 18451776456000.0, + "grad_norm": 1.5149953917838146, + "language_loss": 0.74699616, + "learning_rate": 8.291922955383641e-07, + "loss": 0.76843363, + "num_input_tokens_seen": 254117200, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10870361, + "step": 11777, + "time_per_iteration": 2.6691195964813232 + }, + { + "auxiliary_loss_clip": 0.01120485, + "auxiliary_loss_mlp": 0.01030874, + "balance_loss_clip": 1.04350901, + "balance_loss_mlp": 1.01876783, + "epoch": 0.7081316699233429, + "flos": 17598805783200.0, + "grad_norm": 3.2071552803189824, + "language_loss": 0.82625765, + "learning_rate": 8.288765648590066e-07, + "loss": 0.84777117, + "num_input_tokens_seen": 254132115, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12103271, + "step": 11778, + "time_per_iteration": 3.8286709785461426 + }, + { + "auxiliary_loss_clip": 0.01111049, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.04086256, + "balance_loss_mlp": 1.02179182, + "epoch": 0.7081917931760108, + "flos": 28336137297120.0, + "grad_norm": 1.6470580147909504, + "language_loss": 0.84805632, + "learning_rate": 8.285608785887673e-07, + "loss": 0.86949146, + "num_input_tokens_seen": 254152285, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10681152, + "step": 11779, + "time_per_iteration": 2.635485887527466 + }, + { + "auxiliary_loss_clip": 0.01115854, + "auxiliary_loss_mlp": 0.01038472, + "balance_loss_clip": 1.0415374, + "balance_loss_mlp": 1.02682495, + "epoch": 0.7082519164286788, + "flos": 47965944277440.0, + "grad_norm": 2.522006176293258, + "language_loss": 0.72115922, + "learning_rate": 8.28245236739618e-07, + "loss": 0.74270248, + "num_input_tokens_seen": 254172805, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11645508, + "step": 11780, + "time_per_iteration": 2.809601306915283 + }, + { + "auxiliary_loss_clip": 0.01114208, + "auxiliary_loss_mlp": 0.0102781, + "balance_loss_clip": 1.0420754, + "balance_loss_mlp": 1.01676512, + "epoch": 0.7083120396813467, + "flos": 26419011122880.0, + "grad_norm": 2.216788602272834, + "language_loss": 0.72988796, + "learning_rate": 8.279296393235256e-07, + "loss": 0.7513082, + "num_input_tokens_seen": 254191890, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11053467, + "step": 11781, + "time_per_iteration": 2.6732382774353027 + }, + { + "auxiliary_loss_clip": 0.01113478, + "auxiliary_loss_mlp": 0.01032201, + "balance_loss_clip": 1.04098856, + "balance_loss_mlp": 1.02163315, + "epoch": 0.7083721629340147, + "flos": 21435043478400.0, + "grad_norm": 1.8052919165395396, + "language_loss": 0.77564681, + "learning_rate": 8.276140863524585e-07, + "loss": 0.79710364, + "num_input_tokens_seen": 254210150, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10565186, + "step": 11782, + "time_per_iteration": 2.604400396347046 + }, + { + "auxiliary_loss_clip": 0.01111614, + "auxiliary_loss_mlp": 0.01028323, + "balance_loss_clip": 1.03914702, + "balance_loss_mlp": 1.01805365, + "epoch": 0.7084322861866827, + "flos": 35814276696960.0, + "grad_norm": 1.531922099757239, + "language_loss": 0.69727015, + "learning_rate": 8.272985778383828e-07, + "loss": 0.71866947, + "num_input_tokens_seen": 254233015, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10272217, + "step": 11783, + "time_per_iteration": 2.724736452102661 + }, + { + "auxiliary_loss_clip": 0.0111725, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.04194283, + "balance_loss_mlp": 1.02146435, + "epoch": 0.7084924094393507, + "flos": 24641002236960.0, + "grad_norm": 1.7583832601886995, + "language_loss": 0.7907272, + "learning_rate": 8.269831137932632e-07, + "loss": 0.81222916, + "num_input_tokens_seen": 254251345, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11486816, + "step": 11784, + "time_per_iteration": 2.6359710693359375 + }, + { + "auxiliary_loss_clip": 0.01113519, + "auxiliary_loss_mlp": 0.01030367, + "balance_loss_clip": 1.0407846, + "balance_loss_mlp": 1.01918554, + "epoch": 0.7085525326920187, + "flos": 28647144002880.0, + "grad_norm": 1.6918107759120373, + "language_loss": 0.77282506, + "learning_rate": 8.266676942290609e-07, + "loss": 0.79426396, + "num_input_tokens_seen": 254269905, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11169434, + "step": 11785, + "time_per_iteration": 2.7527265548706055 + }, + { + "auxiliary_loss_clip": 0.01113915, + "auxiliary_loss_mlp": 0.01033358, + "balance_loss_clip": 1.04094648, + "balance_loss_mlp": 1.02177668, + "epoch": 0.7086126559446866, + "flos": 31675338338400.0, + "grad_norm": 2.9787800904188173, + "language_loss": 0.78023946, + "learning_rate": 8.26352319157738e-07, + "loss": 0.80171216, + "num_input_tokens_seen": 254289990, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11590576, + "step": 11786, + "time_per_iteration": 5.465143918991089 + }, + { + "auxiliary_loss_clip": 0.01115671, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.04058099, + "balance_loss_mlp": 1.0166744, + "epoch": 0.7086727791973546, + "flos": 32610060181440.0, + "grad_norm": 2.1243341539700005, + "language_loss": 0.79102898, + "learning_rate": 8.260369885912526e-07, + "loss": 0.8124665, + "num_input_tokens_seen": 254309085, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11413574, + "step": 11787, + "time_per_iteration": 2.6782915592193604 + }, + { + "auxiliary_loss_clip": 0.01114769, + "auxiliary_loss_mlp": 0.01029734, + "balance_loss_clip": 1.04192019, + "balance_loss_mlp": 1.01855874, + "epoch": 0.7087329024500225, + "flos": 26458711706880.0, + "grad_norm": 1.8905912805136451, + "language_loss": 0.76519668, + "learning_rate": 8.257217025415615e-07, + "loss": 0.78664172, + "num_input_tokens_seen": 254327045, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11169434, + "step": 11788, + "time_per_iteration": 2.6571531295776367 + }, + { + "auxiliary_loss_clip": 0.01118294, + "auxiliary_loss_mlp": 0.01029146, + "balance_loss_clip": 1.04196143, + "balance_loss_mlp": 1.01658106, + "epoch": 0.7087930257026905, + "flos": 21879332984160.0, + "grad_norm": 2.839284984313541, + "language_loss": 0.68624437, + "learning_rate": 8.254064610206212e-07, + "loss": 0.70771873, + "num_input_tokens_seen": 254344585, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12554932, + "step": 11789, + "time_per_iteration": 2.6566078662872314 + }, + { + "auxiliary_loss_clip": 0.01116136, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.04002023, + "balance_loss_mlp": 1.01956975, + "epoch": 0.7088531489553584, + "flos": 23076204043680.0, + "grad_norm": 1.6496998589509768, + "language_loss": 0.77468425, + "learning_rate": 8.250912640403858e-07, + "loss": 0.79616183, + "num_input_tokens_seen": 254362470, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12054443, + "step": 11790, + "time_per_iteration": 2.619901418685913 + }, + { + "auxiliary_loss_clip": 0.01118693, + "auxiliary_loss_mlp": 0.01032027, + "balance_loss_clip": 1.04065275, + "balance_loss_mlp": 1.01979017, + "epoch": 0.7089132722080265, + "flos": 33410729396160.0, + "grad_norm": 2.0770300488260434, + "language_loss": 0.71325892, + "learning_rate": 8.247761116128085e-07, + "loss": 0.73476613, + "num_input_tokens_seen": 254383190, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12249756, + "step": 11791, + "time_per_iteration": 2.7512974739074707 + }, + { + "auxiliary_loss_clip": 0.01116029, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.04217172, + "balance_loss_mlp": 1.01807952, + "epoch": 0.7089733954606944, + "flos": 27043415053920.0, + "grad_norm": 1.7758031947557305, + "language_loss": 0.82298803, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84444511, + "num_input_tokens_seen": 254403115, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11608887, + "step": 11792, + "time_per_iteration": 2.6667263507843018 + }, + { + "auxiliary_loss_clip": 0.01116141, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.03984189, + "balance_loss_mlp": 1.02043235, + "epoch": 0.7090335187133624, + "flos": 29804719651200.0, + "grad_norm": 7.998807471303142, + "language_loss": 0.64918894, + "learning_rate": 8.241459404634232e-07, + "loss": 0.67067271, + "num_input_tokens_seen": 254421875, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11798096, + "step": 11793, + "time_per_iteration": 2.659280300140381 + }, + { + "auxiliary_loss_clip": 0.01114451, + "auxiliary_loss_mlp": 0.01028205, + "balance_loss_clip": 1.04136646, + "balance_loss_mlp": 1.01722634, + "epoch": 0.7090936419660303, + "flos": 26643485619360.0, + "grad_norm": 2.4690308810524595, + "language_loss": 0.70478219, + "learning_rate": 8.238309217655133e-07, + "loss": 0.72620875, + "num_input_tokens_seen": 254440765, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.10980225, + "step": 11794, + "time_per_iteration": 2.634644031524658 + }, + { + "auxiliary_loss_clip": 0.01115939, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.0438354, + "balance_loss_mlp": 1.0217998, + "epoch": 0.7091537652186983, + "flos": 24504640123680.0, + "grad_norm": 2.0379988841895105, + "language_loss": 0.75766587, + "learning_rate": 8.23515947668052e-07, + "loss": 0.77914792, + "num_input_tokens_seen": 254459480, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10461426, + "step": 11795, + "time_per_iteration": 2.706136465072632 + }, + { + "auxiliary_loss_clip": 0.01115033, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.04104686, + "balance_loss_mlp": 1.02207351, + "epoch": 0.7092138884713663, + "flos": 16047013636800.0, + "grad_norm": 2.367782862794934, + "language_loss": 0.753016, + "learning_rate": 8.232010181829838e-07, + "loss": 0.77450025, + "num_input_tokens_seen": 254473985, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11315918, + "step": 11796, + "time_per_iteration": 2.7215890884399414 + }, + { + "auxiliary_loss_clip": 0.01121357, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.0433836, + "balance_loss_mlp": 1.0220716, + "epoch": 0.7092740117240343, + "flos": 26413176634560.0, + "grad_norm": 1.8931992436176643, + "language_loss": 0.74208426, + "learning_rate": 8.228861333222523e-07, + "loss": 0.76364863, + "num_input_tokens_seen": 254492135, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.13000488, + "step": 11797, + "time_per_iteration": 2.6289968490600586 + }, + { + "auxiliary_loss_clip": 0.01115555, + "auxiliary_loss_mlp": 0.01031349, + "balance_loss_clip": 1.04134774, + "balance_loss_mlp": 1.02011919, + "epoch": 0.7093341349767023, + "flos": 25663958015040.0, + "grad_norm": 1.4662134324646743, + "language_loss": 0.79349202, + "learning_rate": 8.225712930977953e-07, + "loss": 0.81496108, + "num_input_tokens_seen": 254512865, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11224365, + "step": 11798, + "time_per_iteration": 2.6454293727874756 + }, + { + "auxiliary_loss_clip": 0.01114169, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.0408287, + "balance_loss_mlp": 1.01787925, + "epoch": 0.7093942582293702, + "flos": 26866096320960.0, + "grad_norm": 1.9775685995209376, + "language_loss": 0.66796708, + "learning_rate": 8.222564975215529e-07, + "loss": 0.68940306, + "num_input_tokens_seen": 254532605, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11547852, + "step": 11799, + "time_per_iteration": 2.660961151123047 + }, + { + "auxiliary_loss_clip": 0.01115193, + "auxiliary_loss_mlp": 0.01027072, + "balance_loss_clip": 1.04139125, + "balance_loss_mlp": 1.01530623, + "epoch": 0.7094543814820382, + "flos": 33233370145920.0, + "grad_norm": 1.6222078344627613, + "language_loss": 0.81694192, + "learning_rate": 8.219417466054622e-07, + "loss": 0.8383646, + "num_input_tokens_seen": 254553780, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11767578, + "step": 11800, + "time_per_iteration": 2.6856424808502197 + }, + { + "auxiliary_loss_clip": 0.01111283, + "auxiliary_loss_mlp": 0.01030968, + "balance_loss_clip": 1.03978872, + "balance_loss_mlp": 1.02019143, + "epoch": 0.7095145047347061, + "flos": 14750968976640.0, + "grad_norm": 1.873599256850462, + "language_loss": 0.86863244, + "learning_rate": 8.21627040361459e-07, + "loss": 0.89005494, + "num_input_tokens_seen": 254567510, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10778809, + "step": 11801, + "time_per_iteration": 2.6388723850250244 + }, + { + "auxiliary_loss_clip": 0.01114098, + "auxiliary_loss_mlp": 0.01035731, + "balance_loss_clip": 1.04023302, + "balance_loss_mlp": 1.02458453, + "epoch": 0.7095746279873741, + "flos": 23652682382880.0, + "grad_norm": 12.411767732288325, + "language_loss": 0.76051319, + "learning_rate": 8.213123788014758e-07, + "loss": 0.78201145, + "num_input_tokens_seen": 254585565, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.1114502, + "step": 11802, + "time_per_iteration": 2.5960373878479004 + }, + { + "auxiliary_loss_clip": 0.01120041, + "auxiliary_loss_mlp": 0.01036847, + "balance_loss_clip": 1.04368174, + "balance_loss_mlp": 1.02521813, + "epoch": 0.709634751240042, + "flos": 26064333139680.0, + "grad_norm": 5.328564672863325, + "language_loss": 0.81182295, + "learning_rate": 8.209977619374462e-07, + "loss": 0.83339185, + "num_input_tokens_seen": 254603465, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.11633301, + "step": 11803, + "time_per_iteration": 2.647073745727539 + }, + { + "auxiliary_loss_clip": 0.01116146, + "auxiliary_loss_mlp": 0.01029661, + "balance_loss_clip": 1.04007626, + "balance_loss_mlp": 1.01713812, + "epoch": 0.7096948744927101, + "flos": 16981168237920.0, + "grad_norm": 2.349457242015332, + "language_loss": 0.67950833, + "learning_rate": 8.206831897812995e-07, + "loss": 0.70096636, + "num_input_tokens_seen": 254620500, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12524414, + "step": 11804, + "time_per_iteration": 2.603200674057007 + }, + { + "auxiliary_loss_clip": 0.01109979, + "auxiliary_loss_mlp": 0.01025201, + "balance_loss_clip": 1.04000783, + "balance_loss_mlp": 1.01499641, + "epoch": 0.709754997745378, + "flos": 36970960965120.0, + "grad_norm": 1.964567949622775, + "language_loss": 0.77867079, + "learning_rate": 8.203686623449637e-07, + "loss": 0.8000226, + "num_input_tokens_seen": 254638565, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10205078, + "step": 11805, + "time_per_iteration": 2.7347590923309326 + }, + { + "auxiliary_loss_clip": 0.01114638, + "auxiliary_loss_mlp": 0.01033462, + "balance_loss_clip": 1.04057729, + "balance_loss_mlp": 1.02163029, + "epoch": 0.709815120998046, + "flos": 22592051884800.0, + "grad_norm": 2.067780124851084, + "language_loss": 0.78789747, + "learning_rate": 8.200541796403667e-07, + "loss": 0.80937845, + "num_input_tokens_seen": 254657505, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.1184082, + "step": 11806, + "time_per_iteration": 2.6003904342651367 + }, + { + "auxiliary_loss_clip": 0.01115606, + "auxiliary_loss_mlp": 0.01038413, + "balance_loss_clip": 1.04219532, + "balance_loss_mlp": 1.02763057, + "epoch": 0.7098752442507139, + "flos": 27177224578560.0, + "grad_norm": 2.0397679957441626, + "language_loss": 0.55817413, + "learning_rate": 8.197397416794332e-07, + "loss": 0.57971436, + "num_input_tokens_seen": 254674730, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10778809, + "step": 11807, + "time_per_iteration": 2.722689151763916 + }, + { + "auxiliary_loss_clip": 0.01117245, + "auxiliary_loss_mlp": 0.01040487, + "balance_loss_clip": 1.03892612, + "balance_loss_mlp": 1.02858365, + "epoch": 0.7099353675033819, + "flos": 23520371997600.0, + "grad_norm": 2.201149452534656, + "language_loss": 0.68905413, + "learning_rate": 8.194253484740882e-07, + "loss": 0.71063143, + "num_input_tokens_seen": 254691665, + "router_z_loss_clip": 0.78271484, + "router_z_loss_mlp": 0.11901855, + "step": 11808, + "time_per_iteration": 2.633570671081543 + }, + { + "auxiliary_loss_clip": 0.01116392, + "auxiliary_loss_mlp": 0.0103152, + "balance_loss_clip": 1.04050016, + "balance_loss_mlp": 1.02014208, + "epoch": 0.70999549075605, + "flos": 26732894555520.0, + "grad_norm": 1.87273665267488, + "language_loss": 0.71561056, + "learning_rate": 8.191110000362513e-07, + "loss": 0.73708975, + "num_input_tokens_seen": 254711610, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11364746, + "step": 11809, + "time_per_iteration": 2.6064507961273193 + }, + { + "auxiliary_loss_clip": 0.01033996, + "auxiliary_loss_mlp": 0.01003174, + "balance_loss_clip": 1.01102495, + "balance_loss_mlp": 1.00201726, + "epoch": 0.7100556140087179, + "flos": 85972340342880.0, + "grad_norm": 0.7564665111022704, + "language_loss": 0.59441555, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61478722, + "num_input_tokens_seen": 254772615, + "router_z_loss_clip": 0.2298584, + "router_z_loss_mlp": 0.01157379, + "step": 11810, + "time_per_iteration": 3.330888032913208 + }, + { + "auxiliary_loss_clip": 0.01115564, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.04189062, + "balance_loss_mlp": 1.02441752, + "epoch": 0.7101157372613859, + "flos": 28113080905440.0, + "grad_norm": 1.8220917700086021, + "language_loss": 0.73921156, + "learning_rate": 8.18482437510784e-07, + "loss": 0.7607215, + "num_input_tokens_seen": 254791375, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11016846, + "step": 11811, + "time_per_iteration": 2.6354598999023438 + }, + { + "auxiliary_loss_clip": 0.01111711, + "auxiliary_loss_mlp": 0.01026531, + "balance_loss_clip": 1.04050231, + "balance_loss_mlp": 1.01544464, + "epoch": 0.7101758605140538, + "flos": 28290966880320.0, + "grad_norm": 2.0435837744406196, + "language_loss": 0.83496726, + "learning_rate": 8.181682234469882e-07, + "loss": 0.85634965, + "num_input_tokens_seen": 254809300, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11083984, + "step": 11812, + "time_per_iteration": 2.673847198486328 + }, + { + "auxiliary_loss_clip": 0.01116479, + "auxiliary_loss_mlp": 0.01028385, + "balance_loss_clip": 1.04115176, + "balance_loss_mlp": 1.01618409, + "epoch": 0.7102359837667218, + "flos": 28914276844800.0, + "grad_norm": 1.6407281081438685, + "language_loss": 0.70003247, + "learning_rate": 8.178540541983716e-07, + "loss": 0.72148108, + "num_input_tokens_seen": 254829325, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12194824, + "step": 11813, + "time_per_iteration": 2.6636781692504883 + }, + { + "auxiliary_loss_clip": 0.01110227, + "auxiliary_loss_mlp": 0.01026521, + "balance_loss_clip": 1.03907037, + "balance_loss_mlp": 1.01574993, + "epoch": 0.7102961070193897, + "flos": 23660947908000.0, + "grad_norm": 2.022289093544096, + "language_loss": 0.81624711, + "learning_rate": 8.175399297768495e-07, + "loss": 0.83761454, + "num_input_tokens_seen": 254847690, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10766602, + "step": 11814, + "time_per_iteration": 4.098404884338379 + }, + { + "auxiliary_loss_clip": 0.01115899, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.04278624, + "balance_loss_mlp": 1.01770949, + "epoch": 0.7103562302720577, + "flos": 26243515667520.0, + "grad_norm": 2.818353823764017, + "language_loss": 0.75863016, + "learning_rate": 8.172258501943301e-07, + "loss": 0.78008437, + "num_input_tokens_seen": 254865960, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11816406, + "step": 11815, + "time_per_iteration": 2.7308361530303955 + }, + { + "auxiliary_loss_clip": 0.01111914, + "auxiliary_loss_mlp": 0.01029908, + "balance_loss_clip": 1.0397985, + "balance_loss_mlp": 1.0185771, + "epoch": 0.7104163535247257, + "flos": 17735532552000.0, + "grad_norm": 2.0027313862273304, + "language_loss": 0.78304893, + "learning_rate": 8.16911815462725e-07, + "loss": 0.80446714, + "num_input_tokens_seen": 254882815, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11340332, + "step": 11816, + "time_per_iteration": 2.5934481620788574 + }, + { + "auxiliary_loss_clip": 0.01116448, + "auxiliary_loss_mlp": 0.01036716, + "balance_loss_clip": 1.04237723, + "balance_loss_mlp": 1.02570093, + "epoch": 0.7104764767773937, + "flos": 13909991418720.0, + "grad_norm": 1.9096870533027457, + "language_loss": 0.86624545, + "learning_rate": 8.165978255939426e-07, + "loss": 0.88777709, + "num_input_tokens_seen": 254898705, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11010742, + "step": 11817, + "time_per_iteration": 4.034510612487793 + }, + { + "auxiliary_loss_clip": 0.01113633, + "auxiliary_loss_mlp": 0.01028327, + "balance_loss_clip": 1.04090428, + "balance_loss_mlp": 1.01766944, + "epoch": 0.7105366000300616, + "flos": 14264183194560.0, + "grad_norm": 3.4709377053411847, + "language_loss": 0.8505528, + "learning_rate": 8.162838805998897e-07, + "loss": 0.87197244, + "num_input_tokens_seen": 254913665, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10656738, + "step": 11818, + "time_per_iteration": 2.6036479473114014 + }, + { + "auxiliary_loss_clip": 0.01112388, + "auxiliary_loss_mlp": 0.01029125, + "balance_loss_clip": 1.03800213, + "balance_loss_mlp": 1.01768064, + "epoch": 0.7105967232827296, + "flos": 23615736973920.0, + "grad_norm": 2.143386848158018, + "language_loss": 0.75467277, + "learning_rate": 8.159699804924709e-07, + "loss": 0.77608794, + "num_input_tokens_seen": 254932140, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11444092, + "step": 11819, + "time_per_iteration": 2.6282505989074707 + }, + { + "auxiliary_loss_clip": 0.01116554, + "auxiliary_loss_mlp": 0.01029447, + "balance_loss_clip": 1.04273415, + "balance_loss_mlp": 1.01653683, + "epoch": 0.7106568465353975, + "flos": 27979838622720.0, + "grad_norm": 2.4021566072403764, + "language_loss": 0.70892674, + "learning_rate": 8.156561252835883e-07, + "loss": 0.73038673, + "num_input_tokens_seen": 254951580, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12908936, + "step": 11820, + "time_per_iteration": 2.6367926597595215 + }, + { + "auxiliary_loss_clip": 0.011156, + "auxiliary_loss_mlp": 0.01026013, + "balance_loss_clip": 1.04272616, + "balance_loss_mlp": 1.01472998, + "epoch": 0.7107169697880655, + "flos": 23305256992800.0, + "grad_norm": 3.2641550104896178, + "language_loss": 0.7555232, + "learning_rate": 8.153423149851449e-07, + "loss": 0.77693939, + "num_input_tokens_seen": 254969425, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11291504, + "step": 11821, + "time_per_iteration": 2.600731372833252 + }, + { + "auxiliary_loss_clip": 0.01033457, + "auxiliary_loss_mlp": 0.01001335, + "balance_loss_clip": 1.01041102, + "balance_loss_mlp": 1.00011373, + "epoch": 0.7107770930407336, + "flos": 77652048384000.0, + "grad_norm": 0.7707743785225228, + "language_loss": 0.55082709, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57117498, + "num_input_tokens_seen": 255032680, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.01220703, + "step": 11822, + "time_per_iteration": 3.3380987644195557 + }, + { + "auxiliary_loss_clip": 0.01109961, + "auxiliary_loss_mlp": 0.01028815, + "balance_loss_clip": 1.039464, + "balance_loss_mlp": 1.01726401, + "epoch": 0.7108372162934015, + "flos": 26910983116800.0, + "grad_norm": 2.96479623221075, + "language_loss": 0.60104036, + "learning_rate": 8.147148291671688e-07, + "loss": 0.62242818, + "num_input_tokens_seen": 255054400, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.11553955, + "step": 11823, + "time_per_iteration": 2.7143261432647705 + }, + { + "auxiliary_loss_clip": 0.01114329, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.04123855, + "balance_loss_mlp": 1.0164299, + "epoch": 0.7108973395460695, + "flos": 23348644649280.0, + "grad_norm": 3.5136154792809218, + "language_loss": 0.71590257, + "learning_rate": 8.144011536714322e-07, + "loss": 0.73731959, + "num_input_tokens_seen": 255072785, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.109375, + "step": 11824, + "time_per_iteration": 2.6471426486968994 + }, + { + "auxiliary_loss_clip": 0.01108574, + "auxiliary_loss_mlp": 0.01029528, + "balance_loss_clip": 1.03837752, + "balance_loss_mlp": 1.01904941, + "epoch": 0.7109574627987374, + "flos": 21834648774720.0, + "grad_norm": 2.057646643061224, + "language_loss": 0.72706556, + "learning_rate": 8.140875231337223e-07, + "loss": 0.74844658, + "num_input_tokens_seen": 255091820, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10479736, + "step": 11825, + "time_per_iteration": 4.217491626739502 + }, + { + "auxiliary_loss_clip": 0.01115159, + "auxiliary_loss_mlp": 0.01032415, + "balance_loss_clip": 1.03964686, + "balance_loss_mlp": 1.02122712, + "epoch": 0.7110175860514054, + "flos": 35362248390720.0, + "grad_norm": 2.745637174783214, + "language_loss": 0.79557997, + "learning_rate": 8.137739375659321e-07, + "loss": 0.8170557, + "num_input_tokens_seen": 255111720, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11181641, + "step": 11826, + "time_per_iteration": 3.9807469844818115 + }, + { + "auxiliary_loss_clip": 0.01111342, + "auxiliary_loss_mlp": 0.01031301, + "balance_loss_clip": 1.03947687, + "balance_loss_mlp": 1.02049422, + "epoch": 0.7110777093040733, + "flos": 31937487554880.0, + "grad_norm": 1.5028608948808742, + "language_loss": 0.83252287, + "learning_rate": 8.134603969799527e-07, + "loss": 0.85394931, + "num_input_tokens_seen": 255133495, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10809326, + "step": 11827, + "time_per_iteration": 2.7278215885162354 + }, + { + "auxiliary_loss_clip": 0.01114172, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.04029846, + "balance_loss_mlp": 1.02269125, + "epoch": 0.7111378325567413, + "flos": 32787257362560.0, + "grad_norm": 1.529446283098282, + "language_loss": 0.62376893, + "learning_rate": 8.131469013876748e-07, + "loss": 0.64525658, + "num_input_tokens_seen": 255156880, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11907959, + "step": 11828, + "time_per_iteration": 2.695099353790283 + }, + { + "auxiliary_loss_clip": 0.01113963, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.04094172, + "balance_loss_mlp": 1.01899171, + "epoch": 0.7111979558094093, + "flos": 33277608665280.0, + "grad_norm": 3.2043531558679965, + "language_loss": 0.71874851, + "learning_rate": 8.128334508009846e-07, + "loss": 0.7401967, + "num_input_tokens_seen": 255178920, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11883545, + "step": 11829, + "time_per_iteration": 2.6588218212127686 + }, + { + "auxiliary_loss_clip": 0.01112166, + "auxiliary_loss_mlp": 0.01027654, + "balance_loss_clip": 1.03956318, + "balance_loss_mlp": 1.01675868, + "epoch": 0.7112580790620773, + "flos": 30562568451360.0, + "grad_norm": 1.9736630304831164, + "language_loss": 0.80335063, + "learning_rate": 8.125200452317697e-07, + "loss": 0.82474881, + "num_input_tokens_seen": 255198095, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10900879, + "step": 11830, + "time_per_iteration": 2.649916648864746 + }, + { + "auxiliary_loss_clip": 0.01114781, + "auxiliary_loss_mlp": 0.01033712, + "balance_loss_clip": 1.04057229, + "balance_loss_mlp": 1.02231526, + "epoch": 0.7113182023147452, + "flos": 26413905945600.0, + "grad_norm": 2.457312561894842, + "language_loss": 0.84171629, + "learning_rate": 8.122066846919138e-07, + "loss": 0.86320126, + "num_input_tokens_seen": 255215860, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11401367, + "step": 11831, + "time_per_iteration": 2.600886821746826 + }, + { + "auxiliary_loss_clip": 0.01114529, + "auxiliary_loss_mlp": 0.01027618, + "balance_loss_clip": 1.04043174, + "balance_loss_mlp": 1.01654899, + "epoch": 0.7113783255674132, + "flos": 25619719495680.0, + "grad_norm": 2.5197385572687754, + "language_loss": 0.77289987, + "learning_rate": 8.118933691932985e-07, + "loss": 0.7943213, + "num_input_tokens_seen": 255235425, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11065674, + "step": 11832, + "time_per_iteration": 2.622556209564209 + }, + { + "auxiliary_loss_clip": 0.01032973, + "auxiliary_loss_mlp": 0.0100147, + "balance_loss_clip": 1.01005125, + "balance_loss_mlp": 1.00030541, + "epoch": 0.7114384488200811, + "flos": 81475644687840.0, + "grad_norm": 0.7449283465534696, + "language_loss": 0.56592488, + "learning_rate": 8.115800987478059e-07, + "loss": 0.58626932, + "num_input_tokens_seen": 255291680, + "router_z_loss_clip": 0.22912598, + "router_z_loss_mlp": 0.01164246, + "step": 11833, + "time_per_iteration": 3.1594669818878174 + }, + { + "auxiliary_loss_clip": 0.0111127, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.03898001, + "balance_loss_mlp": 1.01936281, + "epoch": 0.7114985720727491, + "flos": 30517600620960.0, + "grad_norm": 1.9193009120066526, + "language_loss": 0.70778012, + "learning_rate": 8.11266873367315e-07, + "loss": 0.72919595, + "num_input_tokens_seen": 255313880, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10955811, + "step": 11834, + "time_per_iteration": 2.6470186710357666 + }, + { + "auxiliary_loss_clip": 0.01118007, + "auxiliary_loss_mlp": 0.01029705, + "balance_loss_clip": 1.04291511, + "balance_loss_mlp": 1.01808774, + "epoch": 0.7115586953254172, + "flos": 26197980595200.0, + "grad_norm": 1.9889346224788627, + "language_loss": 0.79649317, + "learning_rate": 8.10953693063704e-07, + "loss": 0.81797028, + "num_input_tokens_seen": 255332390, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11608887, + "step": 11835, + "time_per_iteration": 2.677119016647339 + }, + { + "auxiliary_loss_clip": 0.01110284, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.03883481, + "balance_loss_mlp": 1.01859093, + "epoch": 0.7116188185780851, + "flos": 34925859754560.0, + "grad_norm": 2.3017899509369735, + "language_loss": 0.76069069, + "learning_rate": 8.10640557848848e-07, + "loss": 0.78208852, + "num_input_tokens_seen": 255354025, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10913086, + "step": 11836, + "time_per_iteration": 2.7080399990081787 + }, + { + "auxiliary_loss_clip": 0.01111337, + "auxiliary_loss_mlp": 0.01029369, + "balance_loss_clip": 1.03839326, + "balance_loss_mlp": 1.01766849, + "epoch": 0.7116789418307531, + "flos": 30864377734560.0, + "grad_norm": 1.9013867469868753, + "language_loss": 0.70414853, + "learning_rate": 8.103274677346208e-07, + "loss": 0.72555554, + "num_input_tokens_seen": 255371400, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11694336, + "step": 11837, + "time_per_iteration": 2.6458654403686523 + }, + { + "auxiliary_loss_clip": 0.01116699, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.04080975, + "balance_loss_mlp": 1.02088428, + "epoch": 0.711739065083421, + "flos": 31185392208480.0, + "grad_norm": 2.0496434928732703, + "language_loss": 0.61801368, + "learning_rate": 8.100144227328958e-07, + "loss": 0.63951385, + "num_input_tokens_seen": 255390710, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12451172, + "step": 11838, + "time_per_iteration": 2.6673367023468018 + }, + { + "auxiliary_loss_clip": 0.01113849, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.04056954, + "balance_loss_mlp": 1.01975346, + "epoch": 0.711799188336089, + "flos": 31897665419040.0, + "grad_norm": 2.0753210408333254, + "language_loss": 0.67902547, + "learning_rate": 8.097014228555426e-07, + "loss": 0.70047396, + "num_input_tokens_seen": 255408790, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11254883, + "step": 11839, + "time_per_iteration": 2.6971065998077393 + }, + { + "auxiliary_loss_clip": 0.01113967, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.04095185, + "balance_loss_mlp": 1.02313423, + "epoch": 0.7118593115887569, + "flos": 25796025296640.0, + "grad_norm": 2.033701333240172, + "language_loss": 0.84425491, + "learning_rate": 8.093884681144305e-07, + "loss": 0.86573863, + "num_input_tokens_seen": 255426280, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.112854, + "step": 11840, + "time_per_iteration": 2.6254563331604004 + }, + { + "auxiliary_loss_clip": 0.01117919, + "auxiliary_loss_mlp": 0.0103325, + "balance_loss_clip": 1.04195547, + "balance_loss_mlp": 1.02157974, + "epoch": 0.711919434841425, + "flos": 18272958583680.0, + "grad_norm": 2.208882642666319, + "language_loss": 0.76665187, + "learning_rate": 8.090755585214277e-07, + "loss": 0.78816354, + "num_input_tokens_seen": 255442935, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11669922, + "step": 11841, + "time_per_iteration": 2.757774829864502 + }, + { + "auxiliary_loss_clip": 0.01115561, + "auxiliary_loss_mlp": 0.01032873, + "balance_loss_clip": 1.04124713, + "balance_loss_mlp": 1.02116656, + "epoch": 0.7119795580940929, + "flos": 20143941926400.0, + "grad_norm": 2.1137374354111276, + "language_loss": 0.74717653, + "learning_rate": 8.087626940883994e-07, + "loss": 0.7686609, + "num_input_tokens_seen": 255460925, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11706543, + "step": 11842, + "time_per_iteration": 2.6028571128845215 + }, + { + "auxiliary_loss_clip": 0.01033106, + "auxiliary_loss_mlp": 0.01002559, + "balance_loss_clip": 1.01007926, + "balance_loss_mlp": 1.00135827, + "epoch": 0.7120396813467609, + "flos": 81229459279680.0, + "grad_norm": 0.7825983028043052, + "language_loss": 0.616292, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63664865, + "num_input_tokens_seen": 255521360, + "router_z_loss_clip": 0.23034668, + "router_z_loss_mlp": 0.01198578, + "step": 11843, + "time_per_iteration": 3.248990297317505 + }, + { + "auxiliary_loss_clip": 0.01113227, + "auxiliary_loss_mlp": 0.01024848, + "balance_loss_clip": 1.04109931, + "balance_loss_mlp": 1.01382124, + "epoch": 0.7120998045994288, + "flos": 32253680472480.0, + "grad_norm": 2.662009199160168, + "language_loss": 0.80402601, + "learning_rate": 8.081371007497171e-07, + "loss": 0.82540679, + "num_input_tokens_seen": 255541435, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11010742, + "step": 11844, + "time_per_iteration": 2.645117998123169 + }, + { + "auxiliary_loss_clip": 0.01112642, + "auxiliary_loss_mlp": 0.01026371, + "balance_loss_clip": 1.03916705, + "balance_loss_mlp": 1.0147121, + "epoch": 0.7121599278520968, + "flos": 20045821775040.0, + "grad_norm": 2.3501167899645368, + "language_loss": 0.79203594, + "learning_rate": 8.078243718677873e-07, + "loss": 0.81342602, + "num_input_tokens_seen": 255558505, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11645508, + "step": 11845, + "time_per_iteration": 2.648322820663452 + }, + { + "auxiliary_loss_clip": 0.01112229, + "auxiliary_loss_mlp": 0.01030862, + "balance_loss_clip": 1.04106236, + "balance_loss_mlp": 1.01954353, + "epoch": 0.7122200511047647, + "flos": 35325343499040.0, + "grad_norm": 1.9714400918852808, + "language_loss": 0.7766183, + "learning_rate": 8.075116881932762e-07, + "loss": 0.79804921, + "num_input_tokens_seen": 255577815, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11309814, + "step": 11846, + "time_per_iteration": 2.680817127227783 + }, + { + "auxiliary_loss_clip": 0.01115846, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_clip": 1.04192805, + "balance_loss_mlp": 1.02064371, + "epoch": 0.7122801743574327, + "flos": 20098690475040.0, + "grad_norm": 2.0690306024553515, + "language_loss": 0.58828896, + "learning_rate": 8.071990497380421e-07, + "loss": 0.60977387, + "num_input_tokens_seen": 255595885, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11999512, + "step": 11847, + "time_per_iteration": 2.6447649002075195 + }, + { + "auxiliary_loss_clip": 0.01110243, + "auxiliary_loss_mlp": 0.01029709, + "balance_loss_clip": 1.04030943, + "balance_loss_mlp": 1.01875412, + "epoch": 0.7123402976101008, + "flos": 25174052402400.0, + "grad_norm": 1.384611039276452, + "language_loss": 0.71414042, + "learning_rate": 8.068864565139395e-07, + "loss": 0.73553991, + "num_input_tokens_seen": 255616750, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10955811, + "step": 11848, + "time_per_iteration": 2.6221046447753906 + }, + { + "auxiliary_loss_clip": 0.01033746, + "auxiliary_loss_mlp": 0.01003775, + "balance_loss_clip": 1.01072848, + "balance_loss_mlp": 1.00260842, + "epoch": 0.7124004208627687, + "flos": 76050547885440.0, + "grad_norm": 0.8339291202779809, + "language_loss": 0.62967026, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65004545, + "num_input_tokens_seen": 255677900, + "router_z_loss_clip": 0.23034668, + "router_z_loss_mlp": 0.01165771, + "step": 11849, + "time_per_iteration": 3.217552900314331 + }, + { + "auxiliary_loss_clip": 0.01115025, + "auxiliary_loss_mlp": 0.01036299, + "balance_loss_clip": 1.04043722, + "balance_loss_mlp": 1.02493238, + "epoch": 0.7124605441154367, + "flos": 48414204476640.0, + "grad_norm": 1.8789096024724583, + "language_loss": 0.63858145, + "learning_rate": 8.0626140580654e-07, + "loss": 0.66009468, + "num_input_tokens_seen": 255699140, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.1137085, + "step": 11850, + "time_per_iteration": 2.8251278400421143 + }, + { + "auxiliary_loss_clip": 0.01114228, + "auxiliary_loss_mlp": 0.01031054, + "balance_loss_clip": 1.04049337, + "balance_loss_mlp": 1.0197351, + "epoch": 0.7125206673681046, + "flos": 34388312171040.0, + "grad_norm": 1.55806943778742, + "language_loss": 0.6981051, + "learning_rate": 8.05948948346946e-07, + "loss": 0.71955788, + "num_input_tokens_seen": 255719640, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11309814, + "step": 11851, + "time_per_iteration": 2.7024471759796143 + }, + { + "auxiliary_loss_clip": 0.01113703, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.04156709, + "balance_loss_mlp": 1.01947761, + "epoch": 0.7125807906207726, + "flos": 31941579800160.0, + "grad_norm": 1.7433279195873466, + "language_loss": 0.83095717, + "learning_rate": 8.056365361658882e-07, + "loss": 0.85239756, + "num_input_tokens_seen": 255740450, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10852051, + "step": 11852, + "time_per_iteration": 2.6805951595306396 + }, + { + "auxiliary_loss_clip": 0.01116729, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.04113293, + "balance_loss_mlp": 1.02058065, + "epoch": 0.7126409138734405, + "flos": 20935859408640.0, + "grad_norm": 2.8316625772742507, + "language_loss": 0.72431993, + "learning_rate": 8.053241692752126e-07, + "loss": 0.74581766, + "num_input_tokens_seen": 255758070, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12481689, + "step": 11853, + "time_per_iteration": 2.585939645767212 + }, + { + "auxiliary_loss_clip": 0.01110162, + "auxiliary_loss_mlp": 0.01027068, + "balance_loss_clip": 1.04065776, + "balance_loss_mlp": 1.01706004, + "epoch": 0.7127010371261085, + "flos": 22903018073280.0, + "grad_norm": 1.893772533026078, + "language_loss": 0.92106545, + "learning_rate": 8.050118476867635e-07, + "loss": 0.94243771, + "num_input_tokens_seen": 255775685, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10003662, + "step": 11854, + "time_per_iteration": 4.0682373046875 + }, + { + "auxiliary_loss_clip": 0.01111512, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.03956163, + "balance_loss_mlp": 1.01878369, + "epoch": 0.7127611603787765, + "flos": 24863288800320.0, + "grad_norm": 1.9471026219225835, + "language_loss": 0.79075027, + "learning_rate": 8.046995714123856e-07, + "loss": 0.81216925, + "num_input_tokens_seen": 255794750, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11602783, + "step": 11855, + "time_per_iteration": 2.6270806789398193 + }, + { + "auxiliary_loss_clip": 0.01113412, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.04014838, + "balance_loss_mlp": 1.02059138, + "epoch": 0.7128212836314445, + "flos": 24951887390880.0, + "grad_norm": 2.1970344801028845, + "language_loss": 0.73032916, + "learning_rate": 8.043873404639192e-07, + "loss": 0.75178623, + "num_input_tokens_seen": 255813325, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11706543, + "step": 11856, + "time_per_iteration": 2.6806483268737793 + }, + { + "auxiliary_loss_clip": 0.01117396, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.04256463, + "balance_loss_mlp": 1.01780045, + "epoch": 0.7128814068841124, + "flos": 28602743414400.0, + "grad_norm": 1.7603531349991424, + "language_loss": 0.70009422, + "learning_rate": 8.040751548532046e-07, + "loss": 0.72155958, + "num_input_tokens_seen": 255832470, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11334229, + "step": 11857, + "time_per_iteration": 3.9099276065826416 + }, + { + "auxiliary_loss_clip": 0.01111234, + "auxiliary_loss_mlp": 0.01029654, + "balance_loss_clip": 1.04005456, + "balance_loss_mlp": 1.01797807, + "epoch": 0.7129415301367804, + "flos": 22235510106720.0, + "grad_norm": 2.087813482949879, + "language_loss": 0.84872884, + "learning_rate": 8.03763014592081e-07, + "loss": 0.87013769, + "num_input_tokens_seen": 255849740, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11669922, + "step": 11858, + "time_per_iteration": 2.5713717937469482 + }, + { + "auxiliary_loss_clip": 0.01118206, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.04264009, + "balance_loss_mlp": 1.01862681, + "epoch": 0.7130016533894483, + "flos": 18941844137760.0, + "grad_norm": 1.7288192137874059, + "language_loss": 0.80264705, + "learning_rate": 8.034509196923829e-07, + "loss": 0.82413274, + "num_input_tokens_seen": 255866975, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11730957, + "step": 11859, + "time_per_iteration": 2.6367650032043457 + }, + { + "auxiliary_loss_clip": 0.01112307, + "auxiliary_loss_mlp": 0.01030183, + "balance_loss_clip": 1.04029894, + "balance_loss_mlp": 1.01903129, + "epoch": 0.7130617766421163, + "flos": 69695503928640.0, + "grad_norm": 3.101549384278628, + "language_loss": 0.69037235, + "learning_rate": 8.031388701659456e-07, + "loss": 0.71179724, + "num_input_tokens_seen": 255892915, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.1114502, + "step": 11860, + "time_per_iteration": 3.018531084060669 + }, + { + "auxiliary_loss_clip": 0.01114206, + "auxiliary_loss_mlp": 0.01031524, + "balance_loss_clip": 1.04068816, + "balance_loss_mlp": 1.01921606, + "epoch": 0.7131218998947844, + "flos": 24145950929760.0, + "grad_norm": 2.079668288521193, + "language_loss": 0.64315772, + "learning_rate": 8.028268660246023e-07, + "loss": 0.66461504, + "num_input_tokens_seen": 255911480, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12304688, + "step": 11861, + "time_per_iteration": 2.6407294273376465 + }, + { + "auxiliary_loss_clip": 0.01120115, + "auxiliary_loss_mlp": 0.01027833, + "balance_loss_clip": 1.04386306, + "balance_loss_mlp": 1.01587582, + "epoch": 0.7131820231474523, + "flos": 32521096935360.0, + "grad_norm": 1.6405407953955409, + "language_loss": 0.6689961, + "learning_rate": 8.025149072801849e-07, + "loss": 0.69047558, + "num_input_tokens_seen": 255931140, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11956787, + "step": 11862, + "time_per_iteration": 2.6932785511016846 + }, + { + "auxiliary_loss_clip": 0.01111901, + "auxiliary_loss_mlp": 0.01032219, + "balance_loss_clip": 1.04091334, + "balance_loss_mlp": 1.02193141, + "epoch": 0.7132421464001203, + "flos": 35637363136800.0, + "grad_norm": 5.100309012140027, + "language_loss": 0.66773903, + "learning_rate": 8.022029939445214e-07, + "loss": 0.68918025, + "num_input_tokens_seen": 255951665, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10284424, + "step": 11863, + "time_per_iteration": 2.6892261505126953 + }, + { + "auxiliary_loss_clip": 0.01121931, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.04416239, + "balance_loss_mlp": 1.0220691, + "epoch": 0.7133022696527882, + "flos": 28154847870720.0, + "grad_norm": 2.419370616334389, + "language_loss": 0.6545676, + "learning_rate": 8.018911260294414e-07, + "loss": 0.67612636, + "num_input_tokens_seen": 255970055, + "router_z_loss_clip": 0.77783203, + "router_z_loss_mlp": 0.11883545, + "step": 11864, + "time_per_iteration": 4.111300468444824 + }, + { + "auxiliary_loss_clip": 0.01118091, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.04252005, + "balance_loss_mlp": 1.01942539, + "epoch": 0.7133623929054562, + "flos": 21300950332800.0, + "grad_norm": 2.224262516498469, + "language_loss": 0.86018747, + "learning_rate": 8.015793035467697e-07, + "loss": 0.88168025, + "num_input_tokens_seen": 255987720, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11761475, + "step": 11865, + "time_per_iteration": 3.935690402984619 + }, + { + "auxiliary_loss_clip": 0.0111556, + "auxiliary_loss_mlp": 0.01029072, + "balance_loss_clip": 1.04050434, + "balance_loss_mlp": 1.01666784, + "epoch": 0.7134225161581241, + "flos": 23838469227360.0, + "grad_norm": 2.2358417455570696, + "language_loss": 0.74788141, + "learning_rate": 8.012675265083304e-07, + "loss": 0.76932764, + "num_input_tokens_seen": 256005490, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12402344, + "step": 11866, + "time_per_iteration": 2.597783327102661 + }, + { + "auxiliary_loss_clip": 0.01118333, + "auxiliary_loss_mlp": 0.01033559, + "balance_loss_clip": 1.04427457, + "balance_loss_mlp": 1.02158475, + "epoch": 0.7134826394107922, + "flos": 32030826667200.0, + "grad_norm": 2.598476163482906, + "language_loss": 0.70338333, + "learning_rate": 8.009557949259464e-07, + "loss": 0.72490221, + "num_input_tokens_seen": 256026030, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11962891, + "step": 11867, + "time_per_iteration": 2.707462787628174 + }, + { + "auxiliary_loss_clip": 0.01113268, + "auxiliary_loss_mlp": 0.01026976, + "balance_loss_clip": 1.04138803, + "balance_loss_mlp": 1.01641977, + "epoch": 0.7135427626634601, + "flos": 19297494535680.0, + "grad_norm": 2.0679435982781897, + "language_loss": 0.71652925, + "learning_rate": 8.006441088114397e-07, + "loss": 0.73793161, + "num_input_tokens_seen": 256043680, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10546875, + "step": 11868, + "time_per_iteration": 2.7187161445617676 + }, + { + "auxiliary_loss_clip": 0.01119705, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.0441227, + "balance_loss_mlp": 1.01747191, + "epoch": 0.7136028859161281, + "flos": 22236036831360.0, + "grad_norm": 3.998736268842145, + "language_loss": 0.65991318, + "learning_rate": 8.003324681766286e-07, + "loss": 0.68141264, + "num_input_tokens_seen": 256059705, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12780762, + "step": 11869, + "time_per_iteration": 2.643880605697632 + }, + { + "auxiliary_loss_clip": 0.01113147, + "auxiliary_loss_mlp": 0.01024159, + "balance_loss_clip": 1.03942418, + "balance_loss_mlp": 1.01304281, + "epoch": 0.713663009168796, + "flos": 29669410987200.0, + "grad_norm": 1.6754269569206013, + "language_loss": 0.78172898, + "learning_rate": 8.000208730333298e-07, + "loss": 0.80310214, + "num_input_tokens_seen": 256079785, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11114502, + "step": 11870, + "time_per_iteration": 2.633171558380127 + }, + { + "auxiliary_loss_clip": 0.01116419, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.04350543, + "balance_loss_mlp": 1.02001703, + "epoch": 0.713723132421464, + "flos": 32383073613600.0, + "grad_norm": 2.772466178351954, + "language_loss": 0.80811131, + "learning_rate": 7.997093233933597e-07, + "loss": 0.82959378, + "num_input_tokens_seen": 256099000, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11810303, + "step": 11871, + "time_per_iteration": 2.717918634414673 + }, + { + "auxiliary_loss_clip": 0.01117187, + "auxiliary_loss_mlp": 0.01036355, + "balance_loss_clip": 1.04134011, + "balance_loss_mlp": 1.02417135, + "epoch": 0.7137832556741319, + "flos": 24240019353120.0, + "grad_norm": 1.6872410945509435, + "language_loss": 0.79116011, + "learning_rate": 7.993978192685331e-07, + "loss": 0.8126955, + "num_input_tokens_seen": 256117985, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12182617, + "step": 11872, + "time_per_iteration": 2.646590232849121 + }, + { + "auxiliary_loss_clip": 0.01117044, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.0414927, + "balance_loss_mlp": 1.01759243, + "epoch": 0.7138433789267999, + "flos": 26464991885280.0, + "grad_norm": 2.6391875436698, + "language_loss": 0.83946872, + "learning_rate": 7.990863606706606e-07, + "loss": 0.86093754, + "num_input_tokens_seen": 256134350, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12243652, + "step": 11873, + "time_per_iteration": 2.6759657859802246 + }, + { + "auxiliary_loss_clip": 0.01110987, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.03960896, + "balance_loss_mlp": 1.01718652, + "epoch": 0.713903502179468, + "flos": 21478674238560.0, + "grad_norm": 1.9495635283013677, + "language_loss": 0.86516649, + "learning_rate": 7.987749476115539e-07, + "loss": 0.88655001, + "num_input_tokens_seen": 256150610, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10180664, + "step": 11874, + "time_per_iteration": 2.6181836128234863 + }, + { + "auxiliary_loss_clip": 0.01113712, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.03940821, + "balance_loss_mlp": 1.02024424, + "epoch": 0.7139636254321359, + "flos": 22012696818720.0, + "grad_norm": 2.004150805550565, + "language_loss": 0.82464397, + "learning_rate": 7.984635801030228e-07, + "loss": 0.84609842, + "num_input_tokens_seen": 256168620, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11486816, + "step": 11875, + "time_per_iteration": 2.598314046859741 + }, + { + "auxiliary_loss_clip": 0.01119245, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.04058599, + "balance_loss_mlp": 1.01973987, + "epoch": 0.7140237486848039, + "flos": 28468893372480.0, + "grad_norm": 1.970369669530311, + "language_loss": 0.69565523, + "learning_rate": 7.981522581568721e-07, + "loss": 0.71717978, + "num_input_tokens_seen": 256186700, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.13470459, + "step": 11876, + "time_per_iteration": 2.6611366271972656 + }, + { + "auxiliary_loss_clip": 0.01117294, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.04174078, + "balance_loss_mlp": 1.0214715, + "epoch": 0.7140838719374718, + "flos": 20544317051040.0, + "grad_norm": 2.1386679072751646, + "language_loss": 0.77930546, + "learning_rate": 7.978409817849079e-07, + "loss": 0.80081242, + "num_input_tokens_seen": 256205390, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11932373, + "step": 11877, + "time_per_iteration": 2.6161937713623047 + }, + { + "auxiliary_loss_clip": 0.01116032, + "auxiliary_loss_mlp": 0.01033713, + "balance_loss_clip": 1.0431118, + "balance_loss_mlp": 1.02270985, + "epoch": 0.7141439951901398, + "flos": 25797564953280.0, + "grad_norm": 1.8394358616772999, + "language_loss": 0.69296646, + "learning_rate": 7.97529750998934e-07, + "loss": 0.71446395, + "num_input_tokens_seen": 256224575, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11004639, + "step": 11878, + "time_per_iteration": 2.63749361038208 + }, + { + "auxiliary_loss_clip": 0.01114213, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.04171598, + "balance_loss_mlp": 1.02356887, + "epoch": 0.7142041184428077, + "flos": 30160896773760.0, + "grad_norm": 2.643753917807507, + "language_loss": 0.67897922, + "learning_rate": 7.972185658107535e-07, + "loss": 0.7004621, + "num_input_tokens_seen": 256242130, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10498047, + "step": 11879, + "time_per_iteration": 2.65132999420166 + }, + { + "auxiliary_loss_clip": 0.01113774, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.04055905, + "balance_loss_mlp": 1.02146959, + "epoch": 0.7142642416954758, + "flos": 26732529900000.0, + "grad_norm": 1.8728672928449102, + "language_loss": 0.69138139, + "learning_rate": 7.969074262321646e-07, + "loss": 0.71285856, + "num_input_tokens_seen": 256261920, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12463379, + "step": 11880, + "time_per_iteration": 2.677673816680908 + }, + { + "auxiliary_loss_clip": 0.01115372, + "auxiliary_loss_mlp": 0.01039012, + "balance_loss_clip": 1.03965414, + "balance_loss_mlp": 1.02707887, + "epoch": 0.7143243649481437, + "flos": 25387830336960.0, + "grad_norm": 5.165522569109072, + "language_loss": 0.8047365, + "learning_rate": 7.965963322749674e-07, + "loss": 0.82628036, + "num_input_tokens_seen": 256277970, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11932373, + "step": 11881, + "time_per_iteration": 2.5926897525787354 + }, + { + "auxiliary_loss_clip": 0.01112398, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.03906107, + "balance_loss_mlp": 1.02127624, + "epoch": 0.7143844882008117, + "flos": 33232721869440.0, + "grad_norm": 1.6691521195728054, + "language_loss": 0.63635397, + "learning_rate": 7.962852839509579e-07, + "loss": 0.65779901, + "num_input_tokens_seen": 256298205, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.1083374, + "step": 11882, + "time_per_iteration": 2.817361831665039 + }, + { + "auxiliary_loss_clip": 0.01117797, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.04267323, + "balance_loss_mlp": 1.01929998, + "epoch": 0.7144446114534796, + "flos": 21877712292960.0, + "grad_norm": 1.993533124623528, + "language_loss": 0.68983603, + "learning_rate": 7.959742812719304e-07, + "loss": 0.71132189, + "num_input_tokens_seen": 256316685, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.1149292, + "step": 11883, + "time_per_iteration": 2.5954246520996094 + }, + { + "auxiliary_loss_clip": 0.01114086, + "auxiliary_loss_mlp": 0.01037736, + "balance_loss_clip": 1.04178321, + "balance_loss_mlp": 1.02583861, + "epoch": 0.7145047347061476, + "flos": 25308145548000.0, + "grad_norm": 1.7510629339738695, + "language_loss": 0.77404535, + "learning_rate": 7.956633242496788e-07, + "loss": 0.79556358, + "num_input_tokens_seen": 256334205, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11895752, + "step": 11884, + "time_per_iteration": 2.6324214935302734 + }, + { + "auxiliary_loss_clip": 0.01120548, + "auxiliary_loss_mlp": 0.01034641, + "balance_loss_clip": 1.04130507, + "balance_loss_mlp": 1.02180839, + "epoch": 0.7145648579588155, + "flos": 25842208645440.0, + "grad_norm": 2.971267111579731, + "language_loss": 0.73904109, + "learning_rate": 7.953524128959954e-07, + "loss": 0.76059294, + "num_input_tokens_seen": 256353340, + "router_z_loss_clip": 0.79296875, + "router_z_loss_mlp": 0.1282959, + "step": 11885, + "time_per_iteration": 2.63503360748291 + }, + { + "auxiliary_loss_clip": 0.01033762, + "auxiliary_loss_mlp": 0.01003969, + "balance_loss_clip": 1.01058102, + "balance_loss_mlp": 1.00282776, + "epoch": 0.7146249812114835, + "flos": 79049657164320.0, + "grad_norm": 0.8885178769983064, + "language_loss": 0.66300976, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68338704, + "num_input_tokens_seen": 256411550, + "router_z_loss_clip": 0.23181152, + "router_z_loss_mlp": 0.01141357, + "step": 11886, + "time_per_iteration": 3.251100540161133 + }, + { + "auxiliary_loss_clip": 0.01114927, + "auxiliary_loss_mlp": 0.01030407, + "balance_loss_clip": 1.0405215, + "balance_loss_mlp": 1.01835489, + "epoch": 0.7146851044641516, + "flos": 22102389375840.0, + "grad_norm": 1.9772098232814606, + "language_loss": 0.75131583, + "learning_rate": 7.947307272414874e-07, + "loss": 0.77276915, + "num_input_tokens_seen": 256430360, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12054443, + "step": 11887, + "time_per_iteration": 2.6374852657318115 + }, + { + "auxiliary_loss_clip": 0.01113185, + "auxiliary_loss_mlp": 0.01027653, + "balance_loss_clip": 1.0398761, + "balance_loss_mlp": 1.01681674, + "epoch": 0.7147452277168195, + "flos": 23481360207360.0, + "grad_norm": 2.0051114115778206, + "language_loss": 0.71464586, + "learning_rate": 7.944199529642372e-07, + "loss": 0.73605418, + "num_input_tokens_seen": 256449750, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.10839844, + "step": 11888, + "time_per_iteration": 2.625236749649048 + }, + { + "auxiliary_loss_clip": 0.01116382, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.0393914, + "balance_loss_mlp": 1.0197022, + "epoch": 0.7148053509694875, + "flos": 28998378017280.0, + "grad_norm": 1.9363415561118456, + "language_loss": 0.84100938, + "learning_rate": 7.941092244027041e-07, + "loss": 0.86249244, + "num_input_tokens_seen": 256467330, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12225342, + "step": 11889, + "time_per_iteration": 2.6892929077148438 + }, + { + "auxiliary_loss_clip": 0.0111486, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.04130888, + "balance_loss_mlp": 1.01817536, + "epoch": 0.7148654742221554, + "flos": 27437388448320.0, + "grad_norm": 1.7544150711529314, + "language_loss": 0.75543475, + "learning_rate": 7.937985415686695e-07, + "loss": 0.77688187, + "num_input_tokens_seen": 256485705, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11688232, + "step": 11890, + "time_per_iteration": 2.636502265930176 + }, + { + "auxiliary_loss_clip": 0.01113875, + "auxiliary_loss_mlp": 0.01029642, + "balance_loss_clip": 1.04119873, + "balance_loss_mlp": 1.01851964, + "epoch": 0.7149255974748234, + "flos": 30114105665760.0, + "grad_norm": 1.7629645017038964, + "language_loss": 0.73716468, + "learning_rate": 7.934879044739147e-07, + "loss": 0.75859988, + "num_input_tokens_seen": 256504755, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11114502, + "step": 11891, + "time_per_iteration": 2.696993350982666 + }, + { + "auxiliary_loss_clip": 0.01116852, + "auxiliary_loss_mlp": 0.01036172, + "balance_loss_clip": 1.04174876, + "balance_loss_mlp": 1.0242449, + "epoch": 0.7149857207274913, + "flos": 22458363912000.0, + "grad_norm": 1.8162077308067923, + "language_loss": 0.67654228, + "learning_rate": 7.931773131302211e-07, + "loss": 0.69807249, + "num_input_tokens_seen": 256523670, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.1192627, + "step": 11892, + "time_per_iteration": 2.6386661529541016 + }, + { + "auxiliary_loss_clip": 0.01117315, + "auxiliary_loss_mlp": 0.01032698, + "balance_loss_clip": 1.04074597, + "balance_loss_mlp": 1.01974607, + "epoch": 0.7150458439801594, + "flos": 30468500028000.0, + "grad_norm": 1.9577688115688971, + "language_loss": 0.73810685, + "learning_rate": 7.928667675493632e-07, + "loss": 0.75960696, + "num_input_tokens_seen": 256542225, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12957764, + "step": 11893, + "time_per_iteration": 4.137160778045654 + }, + { + "auxiliary_loss_clip": 0.01118565, + "auxiliary_loss_mlp": 0.0103099, + "balance_loss_clip": 1.0413444, + "balance_loss_mlp": 1.01868701, + "epoch": 0.7151059672328273, + "flos": 20365823316960.0, + "grad_norm": 2.612256918716728, + "language_loss": 0.66574895, + "learning_rate": 7.925562677431185e-07, + "loss": 0.68724447, + "num_input_tokens_seen": 256560730, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12304688, + "step": 11894, + "time_per_iteration": 2.605081558227539 + }, + { + "auxiliary_loss_clip": 0.01115978, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.04046082, + "balance_loss_mlp": 1.02007365, + "epoch": 0.7151660904854953, + "flos": 33277608665280.0, + "grad_norm": 2.419339785941654, + "language_loss": 0.7791152, + "learning_rate": 7.922458137232613e-07, + "loss": 0.80058855, + "num_input_tokens_seen": 256580505, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.112854, + "step": 11895, + "time_per_iteration": 2.6679928302764893 + }, + { + "auxiliary_loss_clip": 0.01117023, + "auxiliary_loss_mlp": 0.01028148, + "balance_loss_clip": 1.0413053, + "balance_loss_mlp": 1.01600087, + "epoch": 0.7152262137381632, + "flos": 22368914458560.0, + "grad_norm": 1.9646145755543531, + "language_loss": 0.69246346, + "learning_rate": 7.919354055015643e-07, + "loss": 0.71391511, + "num_input_tokens_seen": 256597330, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12145996, + "step": 11896, + "time_per_iteration": 3.926621198654175 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.01044046, + "balance_loss_clip": 1.03980792, + "balance_loss_mlp": 1.03142738, + "epoch": 0.7152863369908312, + "flos": 26599571238240.0, + "grad_norm": 1.9101723494713698, + "language_loss": 0.86981571, + "learning_rate": 7.91625043089798e-07, + "loss": 0.89141428, + "num_input_tokens_seen": 256616030, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.1262207, + "step": 11897, + "time_per_iteration": 2.6318881511688232 + }, + { + "auxiliary_loss_clip": 0.01113297, + "auxiliary_loss_mlp": 0.01030006, + "balance_loss_clip": 1.04117846, + "balance_loss_mlp": 1.01847196, + "epoch": 0.7153464602434991, + "flos": 27038350393920.0, + "grad_norm": 1.9090288791838228, + "language_loss": 0.77983403, + "learning_rate": 7.913147264997304e-07, + "loss": 0.80126709, + "num_input_tokens_seen": 256635570, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11541748, + "step": 11898, + "time_per_iteration": 2.660358428955078 + }, + { + "auxiliary_loss_clip": 0.01118744, + "auxiliary_loss_mlp": 0.01031016, + "balance_loss_clip": 1.04150438, + "balance_loss_mlp": 1.01886845, + "epoch": 0.7154065834961671, + "flos": 30159032978880.0, + "grad_norm": 1.7044460255914096, + "language_loss": 0.72981817, + "learning_rate": 7.910044557431302e-07, + "loss": 0.75131577, + "num_input_tokens_seen": 256655290, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.121521, + "step": 11899, + "time_per_iteration": 2.667266845703125 + }, + { + "auxiliary_loss_clip": 0.01112424, + "auxiliary_loss_mlp": 0.01033459, + "balance_loss_clip": 1.03923488, + "balance_loss_mlp": 1.02107334, + "epoch": 0.7154667067488351, + "flos": 27578531600640.0, + "grad_norm": 2.0901269319991966, + "language_loss": 0.76048082, + "learning_rate": 7.906942308317614e-07, + "loss": 0.78193963, + "num_input_tokens_seen": 256671605, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12390137, + "step": 11900, + "time_per_iteration": 2.640516996383667 + }, + { + "auxiliary_loss_clip": 0.01116544, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.04212856, + "balance_loss_mlp": 1.01727188, + "epoch": 0.7155268300015031, + "flos": 22904395660800.0, + "grad_norm": 1.9694617488887856, + "language_loss": 0.81124604, + "learning_rate": 7.903840517773886e-07, + "loss": 0.83269572, + "num_input_tokens_seen": 256689680, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11157227, + "step": 11901, + "time_per_iteration": 2.654085159301758 + }, + { + "auxiliary_loss_clip": 0.01119048, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.04063261, + "balance_loss_mlp": 1.02268529, + "epoch": 0.7155869532541711, + "flos": 22325040594720.0, + "grad_norm": 2.495916394366127, + "language_loss": 0.81067699, + "learning_rate": 7.900739185917744e-07, + "loss": 0.83221513, + "num_input_tokens_seen": 256707760, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12091064, + "step": 11902, + "time_per_iteration": 2.6278457641601562 + }, + { + "auxiliary_loss_clip": 0.01113154, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_clip": 1.03897822, + "balance_loss_mlp": 1.0171454, + "epoch": 0.715647076506839, + "flos": 14618658591360.0, + "grad_norm": 1.911120632639125, + "language_loss": 0.67744958, + "learning_rate": 7.897638312866785e-07, + "loss": 0.69886684, + "num_input_tokens_seen": 256724150, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11413574, + "step": 11903, + "time_per_iteration": 2.689929962158203 + }, + { + "auxiliary_loss_clip": 0.01111771, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.03939366, + "balance_loss_mlp": 1.02266598, + "epoch": 0.715707199759507, + "flos": 23124818429280.0, + "grad_norm": 1.9495538636956093, + "language_loss": 0.75832909, + "learning_rate": 7.894537898738589e-07, + "loss": 0.7797839, + "num_input_tokens_seen": 256742780, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11047363, + "step": 11904, + "time_per_iteration": 4.135778903961182 + }, + { + "auxiliary_loss_clip": 0.01116467, + "auxiliary_loss_mlp": 0.01038454, + "balance_loss_clip": 1.04229212, + "balance_loss_mlp": 1.02597833, + "epoch": 0.7157673230121749, + "flos": 18674063019360.0, + "grad_norm": 1.9783024856203058, + "language_loss": 0.72364485, + "learning_rate": 7.891437943650727e-07, + "loss": 0.74519408, + "num_input_tokens_seen": 256761355, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12457275, + "step": 11905, + "time_per_iteration": 4.094755172729492 + }, + { + "auxiliary_loss_clip": 0.01112392, + "auxiliary_loss_mlp": 0.01032487, + "balance_loss_clip": 1.03910303, + "balance_loss_mlp": 1.02123952, + "epoch": 0.715827446264843, + "flos": 28335529537920.0, + "grad_norm": 2.178984541419565, + "language_loss": 0.78264433, + "learning_rate": 7.88833844772076e-07, + "loss": 0.80409312, + "num_input_tokens_seen": 256781335, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11248779, + "step": 11906, + "time_per_iteration": 2.7016565799713135 + }, + { + "auxiliary_loss_clip": 0.01032981, + "auxiliary_loss_mlp": 0.01002687, + "balance_loss_clip": 1.00995588, + "balance_loss_mlp": 1.0015924, + "epoch": 0.7158875695175109, + "flos": 74402823520800.0, + "grad_norm": 0.746042369890607, + "language_loss": 0.55256677, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57292354, + "num_input_tokens_seen": 256838890, + "router_z_loss_clip": 0.23034668, + "router_z_loss_mlp": 0.01094818, + "step": 11907, + "time_per_iteration": 3.1708829402923584 + }, + { + "auxiliary_loss_clip": 0.01113881, + "auxiliary_loss_mlp": 0.01033492, + "balance_loss_clip": 1.03966069, + "balance_loss_mlp": 1.02111197, + "epoch": 0.7159476927701789, + "flos": 20900291587200.0, + "grad_norm": 1.923935369536796, + "language_loss": 0.69534445, + "learning_rate": 7.882140833804593e-07, + "loss": 0.71681815, + "num_input_tokens_seen": 256858145, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12384033, + "step": 11908, + "time_per_iteration": 2.6659538745880127 + }, + { + "auxiliary_loss_clip": 0.01114283, + "auxiliary_loss_mlp": 0.01030808, + "balance_loss_clip": 1.03993356, + "balance_loss_mlp": 1.01851714, + "epoch": 0.7160078160228468, + "flos": 27443344488480.0, + "grad_norm": 2.637118251100993, + "language_loss": 0.71542555, + "learning_rate": 7.879042716053415e-07, + "loss": 0.73687649, + "num_input_tokens_seen": 256878545, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1229248, + "step": 11909, + "time_per_iteration": 2.6493310928344727 + }, + { + "auxiliary_loss_clip": 0.0111647, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.04121435, + "balance_loss_mlp": 1.01927197, + "epoch": 0.7160679392755148, + "flos": 37327948433280.0, + "grad_norm": 1.6664130442825478, + "language_loss": 0.752437, + "learning_rate": 7.875945057930144e-07, + "loss": 0.77390647, + "num_input_tokens_seen": 256899920, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11199951, + "step": 11910, + "time_per_iteration": 2.7214231491088867 + }, + { + "auxiliary_loss_clip": 0.01113694, + "auxiliary_loss_mlp": 0.01035311, + "balance_loss_clip": 1.03975618, + "balance_loss_mlp": 1.02437997, + "epoch": 0.7161280625281827, + "flos": 26019284274720.0, + "grad_norm": 1.501241911007065, + "language_loss": 0.76330185, + "learning_rate": 7.872847859552251e-07, + "loss": 0.78479195, + "num_input_tokens_seen": 256918460, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.10931396, + "step": 11911, + "time_per_iteration": 2.6564033031463623 + }, + { + "auxiliary_loss_clip": 0.01115539, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.0413754, + "balance_loss_mlp": 1.01853037, + "epoch": 0.7161881857808508, + "flos": 75484881309600.0, + "grad_norm": 1.72807148155504, + "language_loss": 0.58880895, + "learning_rate": 7.869751121037192e-07, + "loss": 0.61027271, + "num_input_tokens_seen": 256942015, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12298584, + "step": 11912, + "time_per_iteration": 3.0018997192382812 + }, + { + "auxiliary_loss_clip": 0.011169, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.04293847, + "balance_loss_mlp": 1.02094793, + "epoch": 0.7162483090335187, + "flos": 25395123447360.0, + "grad_norm": 1.9290976313849653, + "language_loss": 0.7808373, + "learning_rate": 7.866654842502376e-07, + "loss": 0.80233634, + "num_input_tokens_seen": 256961065, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12054443, + "step": 11913, + "time_per_iteration": 2.6762969493865967 + }, + { + "auxiliary_loss_clip": 0.01112613, + "auxiliary_loss_mlp": 0.01027395, + "balance_loss_clip": 1.04008257, + "balance_loss_mlp": 1.01691628, + "epoch": 0.7163084322861867, + "flos": 29404101422880.0, + "grad_norm": 1.6009854335504825, + "language_loss": 0.74120688, + "learning_rate": 7.863559024065234e-07, + "loss": 0.76260698, + "num_input_tokens_seen": 256982165, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10479736, + "step": 11914, + "time_per_iteration": 2.7281723022460938 + }, + { + "auxiliary_loss_clip": 0.01111499, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.04036927, + "balance_loss_mlp": 1.0198617, + "epoch": 0.7163685555388547, + "flos": 24502978915200.0, + "grad_norm": 1.692538549088167, + "language_loss": 0.74089611, + "learning_rate": 7.860463665843143e-07, + "loss": 0.76232374, + "num_input_tokens_seen": 256999825, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11401367, + "step": 11915, + "time_per_iteration": 2.722058057785034 + }, + { + "auxiliary_loss_clip": 0.01114304, + "auxiliary_loss_mlp": 0.01028913, + "balance_loss_clip": 1.03918552, + "balance_loss_mlp": 1.01751113, + "epoch": 0.7164286787915226, + "flos": 21300788263680.0, + "grad_norm": 1.9497222446926885, + "language_loss": 0.81126916, + "learning_rate": 7.85736876795349e-07, + "loss": 0.83270133, + "num_input_tokens_seen": 257017450, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11401367, + "step": 11916, + "time_per_iteration": 2.607757091522217 + }, + { + "auxiliary_loss_clip": 0.0111521, + "auxiliary_loss_mlp": 0.01031545, + "balance_loss_clip": 1.04016173, + "balance_loss_mlp": 1.020226, + "epoch": 0.7164888020441906, + "flos": 24060998894400.0, + "grad_norm": 2.823572934922098, + "language_loss": 0.68529588, + "learning_rate": 7.854274330513626e-07, + "loss": 0.70676345, + "num_input_tokens_seen": 257035465, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11315918, + "step": 11917, + "time_per_iteration": 2.652310848236084 + }, + { + "auxiliary_loss_clip": 0.01115197, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.0411694, + "balance_loss_mlp": 1.02057636, + "epoch": 0.7165489252968585, + "flos": 26198426285280.0, + "grad_norm": 1.6783766802340774, + "language_loss": 0.7590993, + "learning_rate": 7.851180353640896e-07, + "loss": 0.78057456, + "num_input_tokens_seen": 257053750, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11755371, + "step": 11918, + "time_per_iteration": 2.6271004676818848 + }, + { + "auxiliary_loss_clip": 0.01033683, + "auxiliary_loss_mlp": 0.01001321, + "balance_loss_clip": 1.01065826, + "balance_loss_mlp": 1.00023079, + "epoch": 0.7166090485495266, + "flos": 85326266017440.0, + "grad_norm": 0.6330085537842051, + "language_loss": 0.53902471, + "learning_rate": 7.848086837452639e-07, + "loss": 0.55937475, + "num_input_tokens_seen": 257121215, + "router_z_loss_clip": 0.23034668, + "router_z_loss_mlp": 0.01091003, + "step": 11919, + "time_per_iteration": 3.3066842555999756 + }, + { + "auxiliary_loss_clip": 0.01117556, + "auxiliary_loss_mlp": 0.01032125, + "balance_loss_clip": 1.0428592, + "balance_loss_mlp": 1.02089572, + "epoch": 0.7166691718021945, + "flos": 33365640013920.0, + "grad_norm": 4.0044744390396385, + "language_loss": 0.69098866, + "learning_rate": 7.844993782066132e-07, + "loss": 0.71248555, + "num_input_tokens_seen": 257143370, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11230469, + "step": 11920, + "time_per_iteration": 2.7038326263427734 + }, + { + "auxiliary_loss_clip": 0.01116067, + "auxiliary_loss_mlp": 0.01035235, + "balance_loss_clip": 1.04121065, + "balance_loss_mlp": 1.02327824, + "epoch": 0.7167292950548625, + "flos": 37105742904480.0, + "grad_norm": 2.469909948845957, + "language_loss": 0.75204337, + "learning_rate": 7.841901187598678e-07, + "loss": 0.77355635, + "num_input_tokens_seen": 257162160, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11956787, + "step": 11921, + "time_per_iteration": 2.688366174697876 + }, + { + "auxiliary_loss_clip": 0.01120926, + "auxiliary_loss_mlp": 0.0103372, + "balance_loss_clip": 1.04251695, + "balance_loss_mlp": 1.01949239, + "epoch": 0.7167894183075304, + "flos": 17779730554080.0, + "grad_norm": 2.000896266958238, + "language_loss": 0.75424445, + "learning_rate": 7.83880905416755e-07, + "loss": 0.77579093, + "num_input_tokens_seen": 257179300, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.14221191, + "step": 11922, + "time_per_iteration": 2.6179494857788086 + }, + { + "auxiliary_loss_clip": 0.01033664, + "auxiliary_loss_mlp": 0.01001327, + "balance_loss_clip": 1.01049089, + "balance_loss_mlp": 1.00023067, + "epoch": 0.7168495415601984, + "flos": 78228931896000.0, + "grad_norm": 0.7553988571170195, + "language_loss": 0.55089855, + "learning_rate": 7.83571738189001e-07, + "loss": 0.57124841, + "num_input_tokens_seen": 257235470, + "router_z_loss_clip": 0.23168945, + "router_z_loss_mlp": 0.0109787, + "step": 11923, + "time_per_iteration": 3.0485963821411133 + }, + { + "auxiliary_loss_clip": 0.01115373, + "auxiliary_loss_mlp": 0.01037253, + "balance_loss_clip": 1.04021049, + "balance_loss_mlp": 1.02487957, + "epoch": 0.7169096648128663, + "flos": 30116617737120.0, + "grad_norm": 1.5037923730518266, + "language_loss": 0.76895988, + "learning_rate": 7.832626170883279e-07, + "loss": 0.79048622, + "num_input_tokens_seen": 257255850, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12371826, + "step": 11924, + "time_per_iteration": 2.687537670135498 + }, + { + "auxiliary_loss_clip": 0.01114081, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.04043126, + "balance_loss_mlp": 1.01979756, + "epoch": 0.7169697880655344, + "flos": 25218574542720.0, + "grad_norm": 1.8560822768545038, + "language_loss": 0.68434095, + "learning_rate": 7.829535421264588e-07, + "loss": 0.70578808, + "num_input_tokens_seen": 257275425, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.1083374, + "step": 11925, + "time_per_iteration": 2.645387649536133 + }, + { + "auxiliary_loss_clip": 0.01108665, + "auxiliary_loss_mlp": 0.01028821, + "balance_loss_clip": 1.03799152, + "balance_loss_mlp": 1.01805615, + "epoch": 0.7170299113182023, + "flos": 25664160601440.0, + "grad_norm": 1.823231159187222, + "language_loss": 0.77413034, + "learning_rate": 7.826445133151133e-07, + "loss": 0.79550517, + "num_input_tokens_seen": 257295740, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10760498, + "step": 11926, + "time_per_iteration": 2.700134754180908 + }, + { + "auxiliary_loss_clip": 0.0111877, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.04043889, + "balance_loss_mlp": 1.02029026, + "epoch": 0.7170900345708703, + "flos": 27934627688640.0, + "grad_norm": 2.044728338589401, + "language_loss": 0.77311212, + "learning_rate": 7.823355306660093e-07, + "loss": 0.79462183, + "num_input_tokens_seen": 257315970, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.11895752, + "step": 11927, + "time_per_iteration": 2.61737322807312 + }, + { + "auxiliary_loss_clip": 0.01114129, + "auxiliary_loss_mlp": 0.01030123, + "balance_loss_clip": 1.0417726, + "balance_loss_mlp": 1.01790404, + "epoch": 0.7171501578235383, + "flos": 18935969132160.0, + "grad_norm": 1.6300383375929945, + "language_loss": 0.68936783, + "learning_rate": 7.820265941908642e-07, + "loss": 0.71081042, + "num_input_tokens_seen": 257334230, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.12213135, + "step": 11928, + "time_per_iteration": 2.7693734169006348 + }, + { + "auxiliary_loss_clip": 0.01110972, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.03933835, + "balance_loss_mlp": 1.02015352, + "epoch": 0.7172102810762062, + "flos": 31852900175040.0, + "grad_norm": 1.8338484449726808, + "language_loss": 0.64857751, + "learning_rate": 7.817177039013931e-07, + "loss": 0.67000115, + "num_input_tokens_seen": 257352145, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11236572, + "step": 11929, + "time_per_iteration": 2.6593029499053955 + }, + { + "auxiliary_loss_clip": 0.01116149, + "auxiliary_loss_mlp": 0.01027556, + "balance_loss_clip": 1.03996825, + "balance_loss_mlp": 1.01591468, + "epoch": 0.7172704043288742, + "flos": 26242543252800.0, + "grad_norm": 1.9778420790698434, + "language_loss": 0.69398689, + "learning_rate": 7.81408859809308e-07, + "loss": 0.71542394, + "num_input_tokens_seen": 257371460, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11633301, + "step": 11930, + "time_per_iteration": 2.6835060119628906 + }, + { + "auxiliary_loss_clip": 0.01113237, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.03835118, + "balance_loss_mlp": 1.01764083, + "epoch": 0.7173305275815421, + "flos": 22904152557120.0, + "grad_norm": 1.9969626741977688, + "language_loss": 0.80610657, + "learning_rate": 7.811000619263219e-07, + "loss": 0.82753462, + "num_input_tokens_seen": 257390800, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11920166, + "step": 11931, + "time_per_iteration": 2.6972057819366455 + }, + { + "auxiliary_loss_clip": 0.01113059, + "auxiliary_loss_mlp": 0.01029999, + "balance_loss_clip": 1.03955626, + "balance_loss_mlp": 1.0187397, + "epoch": 0.7173906508342102, + "flos": 19742391800640.0, + "grad_norm": 1.9696891490680672, + "language_loss": 0.78237391, + "learning_rate": 7.80791310264143e-07, + "loss": 0.80380452, + "num_input_tokens_seen": 257407495, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11273193, + "step": 11932, + "time_per_iteration": 2.708848476409912 + }, + { + "auxiliary_loss_clip": 0.01113698, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.04054785, + "balance_loss_mlp": 1.01728725, + "epoch": 0.7174507740868781, + "flos": 32474832552000.0, + "grad_norm": 1.744525940684451, + "language_loss": 0.75288212, + "learning_rate": 7.804826048344803e-07, + "loss": 0.77430022, + "num_input_tokens_seen": 257429675, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10821533, + "step": 11933, + "time_per_iteration": 4.116259813308716 + }, + { + "auxiliary_loss_clip": 0.01121503, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.04274392, + "balance_loss_mlp": 1.01984882, + "epoch": 0.7175108973395461, + "flos": 22492554145920.0, + "grad_norm": 3.222201172805356, + "language_loss": 0.69509602, + "learning_rate": 7.801739456490388e-07, + "loss": 0.71664327, + "num_input_tokens_seen": 257442765, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13378906, + "step": 11934, + "time_per_iteration": 2.6893677711486816 + }, + { + "auxiliary_loss_clip": 0.0111406, + "auxiliary_loss_mlp": 0.01033243, + "balance_loss_clip": 1.03932166, + "balance_loss_mlp": 1.02146506, + "epoch": 0.717571020592214, + "flos": 29181328652160.0, + "grad_norm": 2.1453783509600375, + "language_loss": 0.86773968, + "learning_rate": 7.798653327195237e-07, + "loss": 0.88921273, + "num_input_tokens_seen": 257459310, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11779785, + "step": 11935, + "time_per_iteration": 2.691990852355957 + }, + { + "auxiliary_loss_clip": 0.01115596, + "auxiliary_loss_mlp": 0.01029995, + "balance_loss_clip": 1.04026389, + "balance_loss_mlp": 1.01800215, + "epoch": 0.717631143844882, + "flos": 46682216870400.0, + "grad_norm": 2.7293269000818747, + "language_loss": 0.7386384, + "learning_rate": 7.795567660576388e-07, + "loss": 0.76009429, + "num_input_tokens_seen": 257484750, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11981201, + "step": 11936, + "time_per_iteration": 4.299103736877441 + }, + { + "auxiliary_loss_clip": 0.0103242, + "auxiliary_loss_mlp": 0.01002386, + "balance_loss_clip": 1.00942469, + "balance_loss_mlp": 1.00125074, + "epoch": 0.7176912670975499, + "flos": 79941963765600.0, + "grad_norm": 0.7568932216017922, + "language_loss": 0.55849016, + "learning_rate": 7.79248245675082e-07, + "loss": 0.57883823, + "num_input_tokens_seen": 257543110, + "router_z_loss_clip": 0.22998047, + "router_z_loss_mlp": 0.01134491, + "step": 11937, + "time_per_iteration": 3.2756433486938477 + }, + { + "auxiliary_loss_clip": 0.01119708, + "auxiliary_loss_mlp": 0.01032125, + "balance_loss_clip": 1.04291606, + "balance_loss_mlp": 1.01937532, + "epoch": 0.717751390350218, + "flos": 38170384613280.0, + "grad_norm": 2.0744645967742894, + "language_loss": 0.54610229, + "learning_rate": 7.789397715835542e-07, + "loss": 0.56762058, + "num_input_tokens_seen": 257567410, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12744141, + "step": 11938, + "time_per_iteration": 2.783970594406128 + }, + { + "auxiliary_loss_clip": 0.01110136, + "auxiliary_loss_mlp": 0.01028554, + "balance_loss_clip": 1.0378058, + "balance_loss_mlp": 1.01752138, + "epoch": 0.7178115136028859, + "flos": 24232523656320.0, + "grad_norm": 1.6776476457184584, + "language_loss": 0.76416767, + "learning_rate": 7.786313437947527e-07, + "loss": 0.78555453, + "num_input_tokens_seen": 257586270, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.1104126, + "step": 11939, + "time_per_iteration": 2.6874592304229736 + }, + { + "auxiliary_loss_clip": 0.01032966, + "auxiliary_loss_mlp": 0.01001787, + "balance_loss_clip": 1.00974274, + "balance_loss_mlp": 1.00070691, + "epoch": 0.7178716368555539, + "flos": 78517579413600.0, + "grad_norm": 0.7599957417019527, + "language_loss": 0.6132046, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63355213, + "num_input_tokens_seen": 257647415, + "router_z_loss_clip": 0.2322998, + "router_z_loss_mlp": 0.01080322, + "step": 11940, + "time_per_iteration": 3.1917436122894287 + }, + { + "auxiliary_loss_clip": 0.01111771, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.03937924, + "balance_loss_mlp": 1.01898813, + "epoch": 0.7179317601082219, + "flos": 32656081461120.0, + "grad_norm": 1.6737915291981642, + "language_loss": 0.587767, + "learning_rate": 7.780146271721097e-07, + "loss": 0.60918999, + "num_input_tokens_seen": 257669795, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11535645, + "step": 11941, + "time_per_iteration": 2.724830389022827 + }, + { + "auxiliary_loss_clip": 0.01113575, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.04044223, + "balance_loss_mlp": 1.0179646, + "epoch": 0.7179918833608898, + "flos": 28691787695040.0, + "grad_norm": 2.111625124355109, + "language_loss": 0.7897042, + "learning_rate": 7.777063383616543e-07, + "loss": 0.81113815, + "num_input_tokens_seen": 257687415, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11834717, + "step": 11942, + "time_per_iteration": 2.678438186645508 + }, + { + "auxiliary_loss_clip": 0.01117782, + "auxiliary_loss_mlp": 0.01042846, + "balance_loss_clip": 1.04202342, + "balance_loss_mlp": 1.03060889, + "epoch": 0.7180520066135578, + "flos": 20945178383040.0, + "grad_norm": 2.167296710082043, + "language_loss": 0.66097337, + "learning_rate": 7.773980959006968e-07, + "loss": 0.6825797, + "num_input_tokens_seen": 257706215, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12231445, + "step": 11943, + "time_per_iteration": 4.06287407875061 + }, + { + "auxiliary_loss_clip": 0.01112377, + "auxiliary_loss_mlp": 0.01033249, + "balance_loss_clip": 1.03951693, + "balance_loss_mlp": 1.02089274, + "epoch": 0.7181121298662257, + "flos": 21434597788320.0, + "grad_norm": 2.0763907810853204, + "language_loss": 0.79234195, + "learning_rate": 7.770898998009254e-07, + "loss": 0.81379819, + "num_input_tokens_seen": 257724740, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12365723, + "step": 11944, + "time_per_iteration": 4.016855716705322 + }, + { + "auxiliary_loss_clip": 0.01119184, + "auxiliary_loss_mlp": 0.01041525, + "balance_loss_clip": 1.04223561, + "balance_loss_mlp": 1.02866876, + "epoch": 0.7181722531188938, + "flos": 14577904558080.0, + "grad_norm": 3.1756844142350698, + "language_loss": 0.62971491, + "learning_rate": 7.767817500740277e-07, + "loss": 0.65132201, + "num_input_tokens_seen": 257742060, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12854004, + "step": 11945, + "time_per_iteration": 2.586379051208496 + }, + { + "auxiliary_loss_clip": 0.01032992, + "auxiliary_loss_mlp": 0.01001236, + "balance_loss_clip": 1.00972188, + "balance_loss_mlp": 1.00020075, + "epoch": 0.7182323763715617, + "flos": 79928147373120.0, + "grad_norm": 0.7009028045622724, + "language_loss": 0.51054764, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53088993, + "num_input_tokens_seen": 257802250, + "router_z_loss_clip": 0.23291016, + "router_z_loss_mlp": 0.01036072, + "step": 11946, + "time_per_iteration": 3.2379913330078125 + }, + { + "auxiliary_loss_clip": 0.01119193, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.04202974, + "balance_loss_mlp": 1.02264893, + "epoch": 0.7182924996242297, + "flos": 25173930850560.0, + "grad_norm": 1.9864194980518435, + "language_loss": 0.74605024, + "learning_rate": 7.761655897855925e-07, + "loss": 0.76760411, + "num_input_tokens_seen": 257821155, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.13543701, + "step": 11947, + "time_per_iteration": 2.7517642974853516 + }, + { + "auxiliary_loss_clip": 0.01112486, + "auxiliary_loss_mlp": 0.01030917, + "balance_loss_clip": 1.03827047, + "balance_loss_mlp": 1.01899576, + "epoch": 0.7183526228768976, + "flos": 19786994975520.0, + "grad_norm": 1.8153014488783443, + "language_loss": 0.72642994, + "learning_rate": 7.758575792474187e-07, + "loss": 0.74786395, + "num_input_tokens_seen": 257839905, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11920166, + "step": 11948, + "time_per_iteration": 2.648750066757202 + }, + { + "auxiliary_loss_clip": 0.01116784, + "auxiliary_loss_mlp": 0.01038687, + "balance_loss_clip": 1.04140663, + "balance_loss_mlp": 1.02632535, + "epoch": 0.7184127461295656, + "flos": 27128812779360.0, + "grad_norm": 1.6838769669223392, + "language_loss": 0.71409786, + "learning_rate": 7.755496151288483e-07, + "loss": 0.73565263, + "num_input_tokens_seen": 257860055, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12371826, + "step": 11949, + "time_per_iteration": 2.6799094676971436 + }, + { + "auxiliary_loss_clip": 0.01113988, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.04100418, + "balance_loss_mlp": 1.01917446, + "epoch": 0.7184728693822335, + "flos": 33366328807680.0, + "grad_norm": 3.7914745096988103, + "language_loss": 0.75894165, + "learning_rate": 7.752416974415598e-07, + "loss": 0.78038692, + "num_input_tokens_seen": 257879315, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.1137085, + "step": 11950, + "time_per_iteration": 2.675962448120117 + }, + { + "auxiliary_loss_clip": 0.01119764, + "auxiliary_loss_mlp": 0.01031727, + "balance_loss_clip": 1.04367566, + "balance_loss_mlp": 1.01904333, + "epoch": 0.7185329926349016, + "flos": 20143415201760.0, + "grad_norm": 3.423527865531964, + "language_loss": 0.67826736, + "learning_rate": 7.749338261972282e-07, + "loss": 0.69978225, + "num_input_tokens_seen": 257896570, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12670898, + "step": 11951, + "time_per_iteration": 2.661174774169922 + }, + { + "auxiliary_loss_clip": 0.01120659, + "auxiliary_loss_mlp": 0.01032279, + "balance_loss_clip": 1.04316473, + "balance_loss_mlp": 1.01927364, + "epoch": 0.7185931158875695, + "flos": 29225405102400.0, + "grad_norm": 4.204041798698244, + "language_loss": 0.78258538, + "learning_rate": 7.746260014075286e-07, + "loss": 0.8041147, + "num_input_tokens_seen": 257916855, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13012695, + "step": 11952, + "time_per_iteration": 2.6400108337402344 + }, + { + "auxiliary_loss_clip": 0.01119612, + "auxiliary_loss_mlp": 0.01035866, + "balance_loss_clip": 1.04236114, + "balance_loss_mlp": 1.02280629, + "epoch": 0.7186532391402375, + "flos": 32387044307040.0, + "grad_norm": 1.9810333150463983, + "language_loss": 0.74895424, + "learning_rate": 7.743182230841352e-07, + "loss": 0.770509, + "num_input_tokens_seen": 257937140, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.13049316, + "step": 11953, + "time_per_iteration": 2.6825618743896484 + }, + { + "auxiliary_loss_clip": 0.0111623, + "auxiliary_loss_mlp": 0.01033019, + "balance_loss_clip": 1.04074764, + "balance_loss_mlp": 1.02095509, + "epoch": 0.7187133623929055, + "flos": 27311925483360.0, + "grad_norm": 2.035105769819727, + "language_loss": 0.73223698, + "learning_rate": 7.740104912387164e-07, + "loss": 0.75372946, + "num_input_tokens_seen": 257956785, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12072754, + "step": 11954, + "time_per_iteration": 2.6368837356567383 + }, + { + "auxiliary_loss_clip": 0.01117973, + "auxiliary_loss_mlp": 0.01035287, + "balance_loss_clip": 1.04289114, + "balance_loss_mlp": 1.02327728, + "epoch": 0.7187734856455734, + "flos": 19253823258240.0, + "grad_norm": 1.9710567867491335, + "language_loss": 0.74624473, + "learning_rate": 7.737028058829425e-07, + "loss": 0.76777732, + "num_input_tokens_seen": 257975455, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12017822, + "step": 11955, + "time_per_iteration": 2.6626627445220947 + }, + { + "auxiliary_loss_clip": 0.01115974, + "auxiliary_loss_mlp": 0.01036181, + "balance_loss_clip": 1.04053617, + "balance_loss_mlp": 1.02428377, + "epoch": 0.7188336088982414, + "flos": 38753953476480.0, + "grad_norm": 1.671937862136852, + "language_loss": 0.73431337, + "learning_rate": 7.733951670284817e-07, + "loss": 0.75583494, + "num_input_tokens_seen": 257996850, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11901855, + "step": 11956, + "time_per_iteration": 2.6947691440582275 + }, + { + "auxiliary_loss_clip": 0.01115925, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.03901267, + "balance_loss_mlp": 1.02033508, + "epoch": 0.7188937321509093, + "flos": 26192429727840.0, + "grad_norm": 1.7206693646518552, + "language_loss": 0.70609534, + "learning_rate": 7.730875746869987e-07, + "loss": 0.72758085, + "num_input_tokens_seen": 258016145, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.12304688, + "step": 11957, + "time_per_iteration": 2.722118854522705 + }, + { + "auxiliary_loss_clip": 0.01117839, + "auxiliary_loss_mlp": 0.01040367, + "balance_loss_clip": 1.04071665, + "balance_loss_mlp": 1.02771878, + "epoch": 0.7189538554035774, + "flos": 33277244009760.0, + "grad_norm": 1.9847737809272414, + "language_loss": 0.7338537, + "learning_rate": 7.727800288701582e-07, + "loss": 0.75543571, + "num_input_tokens_seen": 258035420, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12652588, + "step": 11958, + "time_per_iteration": 2.7299368381500244 + }, + { + "auxiliary_loss_clip": 0.0111367, + "auxiliary_loss_mlp": 0.01033959, + "balance_loss_clip": 1.04093134, + "balance_loss_mlp": 1.02206826, + "epoch": 0.7190139786562453, + "flos": 26332722017280.0, + "grad_norm": 1.7445324132125222, + "language_loss": 0.83905554, + "learning_rate": 7.724725295896215e-07, + "loss": 0.86053181, + "num_input_tokens_seen": 258053520, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11895752, + "step": 11959, + "time_per_iteration": 2.690232276916504 + }, + { + "auxiliary_loss_clip": 0.01120283, + "auxiliary_loss_mlp": 0.01032409, + "balance_loss_clip": 1.04263687, + "balance_loss_mlp": 1.01932025, + "epoch": 0.7190741019089133, + "flos": 32606048970720.0, + "grad_norm": 2.9699029860889836, + "language_loss": 0.82018697, + "learning_rate": 7.7216507685705e-07, + "loss": 0.84171391, + "num_input_tokens_seen": 258073020, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13098145, + "step": 11960, + "time_per_iteration": 2.7962512969970703 + }, + { + "auxiliary_loss_clip": 0.01115641, + "auxiliary_loss_mlp": 0.0103731, + "balance_loss_clip": 1.04227567, + "balance_loss_mlp": 1.02435791, + "epoch": 0.7191342251615812, + "flos": 31853426899680.0, + "grad_norm": 2.2940008576894333, + "language_loss": 0.77931869, + "learning_rate": 7.718576706841013e-07, + "loss": 0.80084819, + "num_input_tokens_seen": 258093155, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.1295166, + "step": 11961, + "time_per_iteration": 2.675449848175049 + }, + { + "auxiliary_loss_clip": 0.01111197, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.04010057, + "balance_loss_mlp": 1.0195334, + "epoch": 0.7191943484142492, + "flos": 28024522832160.0, + "grad_norm": 1.5242699605873244, + "language_loss": 0.75004101, + "learning_rate": 7.715503110824326e-07, + "loss": 0.77145898, + "num_input_tokens_seen": 258113905, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11071777, + "step": 11962, + "time_per_iteration": 2.741532325744629 + }, + { + "auxiliary_loss_clip": 0.01117119, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.04090929, + "balance_loss_mlp": 1.02074993, + "epoch": 0.7192544716669171, + "flos": 27534374115840.0, + "grad_norm": 1.8172236747597408, + "language_loss": 0.75442159, + "learning_rate": 7.712429980637001e-07, + "loss": 0.7759344, + "num_input_tokens_seen": 258132820, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.13409424, + "step": 11963, + "time_per_iteration": 2.681241035461426 + }, + { + "auxiliary_loss_clip": 0.01120562, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.0426476, + "balance_loss_mlp": 1.02756166, + "epoch": 0.7193145949195852, + "flos": 23162655218400.0, + "grad_norm": 4.219453479611296, + "language_loss": 0.81088829, + "learning_rate": 7.709357316395564e-07, + "loss": 0.83250666, + "num_input_tokens_seen": 258148055, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.13720703, + "step": 11964, + "time_per_iteration": 2.6499991416931152 + }, + { + "auxiliary_loss_clip": 0.01115014, + "auxiliary_loss_mlp": 0.01035124, + "balance_loss_clip": 1.04123998, + "balance_loss_mlp": 1.02295256, + "epoch": 0.7193747181722531, + "flos": 21968620368480.0, + "grad_norm": 5.748032785312797, + "language_loss": 0.74803829, + "learning_rate": 7.70628511821652e-07, + "loss": 0.76953965, + "num_input_tokens_seen": 258165995, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12176514, + "step": 11965, + "time_per_iteration": 2.647887945175171 + }, + { + "auxiliary_loss_clip": 0.01117269, + "auxiliary_loss_mlp": 0.01033564, + "balance_loss_clip": 1.04133761, + "balance_loss_mlp": 1.02096355, + "epoch": 0.7194348414249211, + "flos": 29760278545440.0, + "grad_norm": 1.580210369756297, + "language_loss": 0.77460563, + "learning_rate": 7.703213386216377e-07, + "loss": 0.79611397, + "num_input_tokens_seen": 258186165, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.1260376, + "step": 11966, + "time_per_iteration": 2.6435813903808594 + }, + { + "auxiliary_loss_clip": 0.01114902, + "auxiliary_loss_mlp": 0.0103092, + "balance_loss_clip": 1.03988957, + "balance_loss_mlp": 1.0188024, + "epoch": 0.7194949646775891, + "flos": 27044022813120.0, + "grad_norm": 1.9075267234138233, + "language_loss": 0.73047465, + "learning_rate": 7.700142120511619e-07, + "loss": 0.75193286, + "num_input_tokens_seen": 258204595, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12121582, + "step": 11967, + "time_per_iteration": 2.6516647338867188 + }, + { + "auxiliary_loss_clip": 0.01113027, + "auxiliary_loss_mlp": 0.01030889, + "balance_loss_clip": 1.04312575, + "balance_loss_mlp": 1.02025592, + "epoch": 0.719555087930257, + "flos": 24729033585600.0, + "grad_norm": 1.6555033919251592, + "language_loss": 0.81462431, + "learning_rate": 7.6970713212187e-07, + "loss": 0.83606339, + "num_input_tokens_seen": 258223110, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10644531, + "step": 11968, + "time_per_iteration": 2.705094814300537 + }, + { + "auxiliary_loss_clip": 0.01114363, + "auxiliary_loss_mlp": 0.01026514, + "balance_loss_clip": 1.041062, + "balance_loss_mlp": 1.01479602, + "epoch": 0.719615211182925, + "flos": 30159316599840.0, + "grad_norm": 1.8898360476990794, + "language_loss": 0.76657498, + "learning_rate": 7.69400098845407e-07, + "loss": 0.78798378, + "num_input_tokens_seen": 258242660, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11706543, + "step": 11969, + "time_per_iteration": 2.6823341846466064 + }, + { + "auxiliary_loss_clip": 0.01113875, + "auxiliary_loss_mlp": 0.01027806, + "balance_loss_clip": 1.03789103, + "balance_loss_mlp": 1.0152657, + "epoch": 0.719675334435593, + "flos": 24415879464000.0, + "grad_norm": 1.5530342977031502, + "language_loss": 0.71069247, + "learning_rate": 7.69093112233417e-07, + "loss": 0.73210931, + "num_input_tokens_seen": 258261850, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12554932, + "step": 11970, + "time_per_iteration": 2.6211154460906982 + }, + { + "auxiliary_loss_clip": 0.01033794, + "auxiliary_loss_mlp": 0.01000716, + "balance_loss_clip": 1.01042819, + "balance_loss_mlp": 0.99961424, + "epoch": 0.719735457688261, + "flos": 53931269154240.0, + "grad_norm": 0.91534713788276, + "language_loss": 0.60800111, + "learning_rate": 7.68786172297538e-07, + "loss": 0.6283462, + "num_input_tokens_seen": 258312570, + "router_z_loss_clip": 0.23388672, + "router_z_loss_mlp": 0.01103973, + "step": 11971, + "time_per_iteration": 3.195563316345215 + }, + { + "auxiliary_loss_clip": 0.01121619, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.04207718, + "balance_loss_mlp": 1.01719093, + "epoch": 0.7197955809409289, + "flos": 20502388016640.0, + "grad_norm": 2.7179859767837335, + "language_loss": 0.80071747, + "learning_rate": 7.684792790494105e-07, + "loss": 0.82223725, + "num_input_tokens_seen": 258331600, + "router_z_loss_clip": 0.79443359, + "router_z_loss_mlp": 0.1317749, + "step": 11972, + "time_per_iteration": 2.6027510166168213 + }, + { + "auxiliary_loss_clip": 0.01119803, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.04351544, + "balance_loss_mlp": 1.02441311, + "epoch": 0.7198557041935969, + "flos": 29938407624000.0, + "grad_norm": 6.355625142169979, + "language_loss": 0.75283736, + "learning_rate": 7.681724325006733e-07, + "loss": 0.77440298, + "num_input_tokens_seen": 258351785, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12341309, + "step": 11973, + "time_per_iteration": 4.244680166244507 + }, + { + "auxiliary_loss_clip": 0.01033931, + "auxiliary_loss_mlp": 0.01000471, + "balance_loss_clip": 1.0105716, + "balance_loss_mlp": 0.9993695, + "epoch": 0.7199158274462648, + "flos": 86283306531360.0, + "grad_norm": 0.8573828724167724, + "language_loss": 0.57076621, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59111017, + "num_input_tokens_seen": 258404035, + "router_z_loss_clip": 0.23364258, + "router_z_loss_mlp": 0.01102448, + "step": 11974, + "time_per_iteration": 3.1031723022460938 + }, + { + "auxiliary_loss_clip": 0.01115912, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.03917289, + "balance_loss_mlp": 1.01822937, + "epoch": 0.7199759506989328, + "flos": 35811440487360.0, + "grad_norm": 1.9266451979998847, + "language_loss": 0.61315078, + "learning_rate": 7.675588795479062e-07, + "loss": 0.63461947, + "num_input_tokens_seen": 258424850, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12731934, + "step": 11975, + "time_per_iteration": 4.006390333175659 + }, + { + "auxiliary_loss_clip": 0.01114827, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.04012907, + "balance_loss_mlp": 1.02047014, + "epoch": 0.7200360739516007, + "flos": 30067030936800.0, + "grad_norm": 3.2820118599102766, + "language_loss": 0.67599082, + "learning_rate": 7.672521731671425e-07, + "loss": 0.69746256, + "num_input_tokens_seen": 258445485, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11877441, + "step": 11976, + "time_per_iteration": 2.725379467010498 + }, + { + "auxiliary_loss_clip": 0.01114802, + "auxiliary_loss_mlp": 0.01027034, + "balance_loss_clip": 1.0400573, + "balance_loss_mlp": 1.01569092, + "epoch": 0.7200961972042688, + "flos": 25395974310240.0, + "grad_norm": 1.950556466683338, + "language_loss": 0.67629099, + "learning_rate": 7.669455135323004e-07, + "loss": 0.69770932, + "num_input_tokens_seen": 258464505, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11340332, + "step": 11977, + "time_per_iteration": 2.6539745330810547 + }, + { + "auxiliary_loss_clip": 0.01118189, + "auxiliary_loss_mlp": 0.01034086, + "balance_loss_clip": 1.04200053, + "balance_loss_mlp": 1.02182508, + "epoch": 0.7201563204569367, + "flos": 38127361612320.0, + "grad_norm": 1.531922347524188, + "language_loss": 0.75547397, + "learning_rate": 7.666389006550074e-07, + "loss": 0.77699673, + "num_input_tokens_seen": 258487190, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12261963, + "step": 11978, + "time_per_iteration": 2.781934976577759 + }, + { + "auxiliary_loss_clip": 0.01112596, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.03930652, + "balance_loss_mlp": 1.01880026, + "epoch": 0.7202164437096047, + "flos": 32520529693440.0, + "grad_norm": 1.8972438700406546, + "language_loss": 0.79075688, + "learning_rate": 7.663323345468908e-07, + "loss": 0.81219137, + "num_input_tokens_seen": 258503790, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12060547, + "step": 11979, + "time_per_iteration": 2.6817617416381836 + }, + { + "auxiliary_loss_clip": 0.01115631, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.04077959, + "balance_loss_mlp": 1.01957035, + "epoch": 0.7202765669622727, + "flos": 31676229718560.0, + "grad_norm": 1.9111764712703092, + "language_loss": 0.64661348, + "learning_rate": 7.660258152195767e-07, + "loss": 0.66808945, + "num_input_tokens_seen": 258527335, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.1237793, + "step": 11980, + "time_per_iteration": 2.681123971939087 + }, + { + "auxiliary_loss_clip": 0.01116659, + "auxiliary_loss_mlp": 0.01037506, + "balance_loss_clip": 1.04146183, + "balance_loss_mlp": 1.02416623, + "epoch": 0.7203366902149406, + "flos": 34791401953440.0, + "grad_norm": 2.2271911438672265, + "language_loss": 0.6721397, + "learning_rate": 7.657193426846871e-07, + "loss": 0.69368142, + "num_input_tokens_seen": 258546690, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.13342285, + "step": 11981, + "time_per_iteration": 2.71225643157959 + }, + { + "auxiliary_loss_clip": 0.01116409, + "auxiliary_loss_mlp": 0.01034135, + "balance_loss_clip": 1.04100847, + "balance_loss_mlp": 1.02163649, + "epoch": 0.7203968134676086, + "flos": 25753610054880.0, + "grad_norm": 1.8056975112860887, + "language_loss": 0.73412836, + "learning_rate": 7.65412916953843e-07, + "loss": 0.75563377, + "num_input_tokens_seen": 258566340, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12506104, + "step": 11982, + "time_per_iteration": 2.639009952545166 + }, + { + "auxiliary_loss_clip": 0.01115899, + "auxiliary_loss_mlp": 0.01041015, + "balance_loss_clip": 1.04044509, + "balance_loss_mlp": 1.02977407, + "epoch": 0.7204569367202766, + "flos": 22368792906720.0, + "grad_norm": 1.8440399912013024, + "language_loss": 0.65848154, + "learning_rate": 7.65106538038665e-07, + "loss": 0.68005067, + "num_input_tokens_seen": 258584455, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11242676, + "step": 11983, + "time_per_iteration": 4.1406073570251465 + }, + { + "auxiliary_loss_clip": 0.01118158, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.04323483, + "balance_loss_mlp": 1.02168536, + "epoch": 0.7205170599729446, + "flos": 28376405123040.0, + "grad_norm": 4.051847910210804, + "language_loss": 0.66594458, + "learning_rate": 7.648002059507715e-07, + "loss": 0.68746388, + "num_input_tokens_seen": 258604725, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12091064, + "step": 11984, + "time_per_iteration": 3.980654239654541 + }, + { + "auxiliary_loss_clip": 0.01122842, + "auxiliary_loss_mlp": 0.01030133, + "balance_loss_clip": 1.04469728, + "balance_loss_mlp": 1.01760435, + "epoch": 0.7205771832256125, + "flos": 24551471748960.0, + "grad_norm": 1.851283728733093, + "language_loss": 0.74268746, + "learning_rate": 7.644939207017771e-07, + "loss": 0.76421726, + "num_input_tokens_seen": 258622885, + "router_z_loss_clip": 0.78076172, + "router_z_loss_mlp": 0.12542725, + "step": 11985, + "time_per_iteration": 2.6075336933135986 + }, + { + "auxiliary_loss_clip": 0.01116514, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.04238534, + "balance_loss_mlp": 1.01927137, + "epoch": 0.7206373064782805, + "flos": 33804662273280.0, + "grad_norm": 1.7982305992306697, + "language_loss": 0.62814987, + "learning_rate": 7.641876823032977e-07, + "loss": 0.64962554, + "num_input_tokens_seen": 258644305, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11779785, + "step": 11986, + "time_per_iteration": 2.6594717502593994 + }, + { + "auxiliary_loss_clip": 0.01117431, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.04219604, + "balance_loss_mlp": 1.02064288, + "epoch": 0.7206974297309484, + "flos": 21924543918240.0, + "grad_norm": 1.8683583003525432, + "language_loss": 0.72632158, + "learning_rate": 7.638814907669455e-07, + "loss": 0.7478385, + "num_input_tokens_seen": 258661775, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.13623047, + "step": 11987, + "time_per_iteration": 2.600914239883423 + }, + { + "auxiliary_loss_clip": 0.01119596, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.0419395, + "balance_loss_mlp": 1.02193499, + "epoch": 0.7207575529836164, + "flos": 20722567681440.0, + "grad_norm": 2.4783561611986076, + "language_loss": 0.7879644, + "learning_rate": 7.635753461043301e-07, + "loss": 0.8095085, + "num_input_tokens_seen": 258679830, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12860107, + "step": 11988, + "time_per_iteration": 2.6131842136383057 + }, + { + "auxiliary_loss_clip": 0.01114297, + "auxiliary_loss_mlp": 0.01032095, + "balance_loss_clip": 1.04014134, + "balance_loss_mlp": 1.02014983, + "epoch": 0.7208176762362843, + "flos": 22851891616320.0, + "grad_norm": 2.241021481113825, + "language_loss": 0.7914654, + "learning_rate": 7.632692483270618e-07, + "loss": 0.81292927, + "num_input_tokens_seen": 258697415, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11950684, + "step": 11989, + "time_per_iteration": 2.6689305305480957 + }, + { + "auxiliary_loss_clip": 0.01113691, + "auxiliary_loss_mlp": 0.01030112, + "balance_loss_clip": 1.04068041, + "balance_loss_mlp": 1.01813173, + "epoch": 0.7208777994889524, + "flos": 22859670934080.0, + "grad_norm": 2.047951470552601, + "language_loss": 0.82682508, + "learning_rate": 7.629631974467481e-07, + "loss": 0.84826314, + "num_input_tokens_seen": 258716755, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11975098, + "step": 11990, + "time_per_iteration": 2.6070680618286133 + }, + { + "auxiliary_loss_clip": 0.01115827, + "auxiliary_loss_mlp": 0.01036836, + "balance_loss_clip": 1.04060352, + "balance_loss_mlp": 1.02475441, + "epoch": 0.7209379227416203, + "flos": 18050550468480.0, + "grad_norm": 3.1623575639961543, + "language_loss": 0.75940847, + "learning_rate": 7.626571934749931e-07, + "loss": 0.78093505, + "num_input_tokens_seen": 258733270, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12072754, + "step": 11991, + "time_per_iteration": 2.624448537826538 + }, + { + "auxiliary_loss_clip": 0.01113002, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.04027438, + "balance_loss_mlp": 1.02001047, + "epoch": 0.7209980459942883, + "flos": 36164214158400.0, + "grad_norm": 1.4939177277682478, + "language_loss": 0.7235769, + "learning_rate": 7.623512364234022e-07, + "loss": 0.74502087, + "num_input_tokens_seen": 258755270, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11401367, + "step": 11992, + "time_per_iteration": 2.7073471546173096 + }, + { + "auxiliary_loss_clip": 0.01116226, + "auxiliary_loss_mlp": 0.01029446, + "balance_loss_clip": 1.04048634, + "balance_loss_mlp": 1.01738763, + "epoch": 0.7210581692469563, + "flos": 28647103485600.0, + "grad_norm": 1.484638892941478, + "language_loss": 0.66335285, + "learning_rate": 7.620453263035755e-07, + "loss": 0.68480957, + "num_input_tokens_seen": 258775340, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12054443, + "step": 11993, + "time_per_iteration": 2.6733596324920654 + }, + { + "auxiliary_loss_clip": 0.01115395, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.04035616, + "balance_loss_mlp": 1.01906991, + "epoch": 0.7211182924996242, + "flos": 31847876032320.0, + "grad_norm": 2.1566530073704184, + "language_loss": 0.65707326, + "learning_rate": 7.61739463127115e-07, + "loss": 0.67853439, + "num_input_tokens_seen": 258794580, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11651611, + "step": 11994, + "time_per_iteration": 2.649245262145996 + }, + { + "auxiliary_loss_clip": 0.01119102, + "auxiliary_loss_mlp": 0.01034581, + "balance_loss_clip": 1.04236197, + "balance_loss_mlp": 1.02133656, + "epoch": 0.7211784157522922, + "flos": 21612119107680.0, + "grad_norm": 1.8601419325781217, + "language_loss": 0.66828513, + "learning_rate": 7.614336469056172e-07, + "loss": 0.68982196, + "num_input_tokens_seen": 258812330, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.13244629, + "step": 11995, + "time_per_iteration": 2.599578619003296 + }, + { + "auxiliary_loss_clip": 0.01113146, + "auxiliary_loss_mlp": 0.01029613, + "balance_loss_clip": 1.0414958, + "balance_loss_mlp": 1.01697111, + "epoch": 0.7212385390049602, + "flos": 29802248097120.0, + "grad_norm": 1.7647558667667567, + "language_loss": 0.79517066, + "learning_rate": 7.6112787765068e-07, + "loss": 0.81659824, + "num_input_tokens_seen": 258831770, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.12658691, + "step": 11996, + "time_per_iteration": 2.638716220855713 + }, + { + "auxiliary_loss_clip": 0.01117168, + "auxiliary_loss_mlp": 0.01031251, + "balance_loss_clip": 1.04210591, + "balance_loss_mlp": 1.01939559, + "epoch": 0.7212986622576282, + "flos": 34345896929280.0, + "grad_norm": 4.36862208437392, + "language_loss": 0.81609541, + "learning_rate": 7.60822155373899e-07, + "loss": 0.83757961, + "num_input_tokens_seen": 258849090, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11865234, + "step": 11997, + "time_per_iteration": 2.675868511199951 + }, + { + "auxiliary_loss_clip": 0.01117369, + "auxiliary_loss_mlp": 0.01035308, + "balance_loss_clip": 1.04010487, + "balance_loss_mlp": 1.02277327, + "epoch": 0.7213587855102961, + "flos": 26644093378560.0, + "grad_norm": 2.1879354874523886, + "language_loss": 0.66549939, + "learning_rate": 7.605164800868646e-07, + "loss": 0.68702614, + "num_input_tokens_seen": 258868230, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12542725, + "step": 11998, + "time_per_iteration": 2.599837064743042 + }, + { + "auxiliary_loss_clip": 0.01115958, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.04121637, + "balance_loss_mlp": 1.021312, + "epoch": 0.7214189087629641, + "flos": 17828547526080.0, + "grad_norm": 2.052550111627228, + "language_loss": 0.72863781, + "learning_rate": 7.602108518011696e-07, + "loss": 0.7501235, + "num_input_tokens_seen": 258885525, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11303711, + "step": 11999, + "time_per_iteration": 2.6254098415374756 + }, + { + "auxiliary_loss_clip": 0.01117118, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.04135156, + "balance_loss_mlp": 1.01524401, + "epoch": 0.721479032015632, + "flos": 23660664287040.0, + "grad_norm": 2.47854741753487, + "language_loss": 0.82785237, + "learning_rate": 7.599052705284039e-07, + "loss": 0.84930438, + "num_input_tokens_seen": 258903245, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12854004, + "step": 12000, + "time_per_iteration": 2.6747050285339355 + }, + { + "auxiliary_loss_clip": 0.01117911, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.04218328, + "balance_loss_mlp": 1.02250087, + "epoch": 0.7215391552683, + "flos": 22589499296160.0, + "grad_norm": 2.216557340072786, + "language_loss": 0.77247298, + "learning_rate": 7.59599736280154e-07, + "loss": 0.79400051, + "num_input_tokens_seen": 258921245, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12347412, + "step": 12001, + "time_per_iteration": 2.6788158416748047 + }, + { + "auxiliary_loss_clip": 0.01115924, + "auxiliary_loss_mlp": 0.01036583, + "balance_loss_clip": 1.04318047, + "balance_loss_mlp": 1.0245012, + "epoch": 0.721599278520968, + "flos": 28380497368320.0, + "grad_norm": 1.9355883288340485, + "language_loss": 0.81381804, + "learning_rate": 7.592942490680066e-07, + "loss": 0.83534312, + "num_input_tokens_seen": 258939425, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.12072754, + "step": 12002, + "time_per_iteration": 2.624561071395874 + }, + { + "auxiliary_loss_clip": 0.01119291, + "auxiliary_loss_mlp": 0.01029338, + "balance_loss_clip": 1.04269886, + "balance_loss_mlp": 1.01697552, + "epoch": 0.721659401773636, + "flos": 47832458891040.0, + "grad_norm": 4.020230405004357, + "language_loss": 0.62229216, + "learning_rate": 7.589888089035462e-07, + "loss": 0.6437785, + "num_input_tokens_seen": 258960710, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12365723, + "step": 12003, + "time_per_iteration": 2.779271125793457 + }, + { + "auxiliary_loss_clip": 0.01116387, + "auxiliary_loss_mlp": 0.0103625, + "balance_loss_clip": 1.04067612, + "balance_loss_mlp": 1.02357244, + "epoch": 0.7217195250263039, + "flos": 18229246788960.0, + "grad_norm": 2.3066966584327346, + "language_loss": 0.68725908, + "learning_rate": 7.586834157983544e-07, + "loss": 0.70878541, + "num_input_tokens_seen": 258978475, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12677002, + "step": 12004, + "time_per_iteration": 2.592327833175659 + }, + { + "auxiliary_loss_clip": 0.01033018, + "auxiliary_loss_mlp": 0.01001888, + "balance_loss_clip": 1.00981009, + "balance_loss_mlp": 1.00080132, + "epoch": 0.7217796482789719, + "flos": 86475414071520.0, + "grad_norm": 0.857649557479886, + "language_loss": 0.54049826, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56084728, + "num_input_tokens_seen": 259037520, + "router_z_loss_clip": 0.23205566, + "router_z_loss_mlp": 0.01087189, + "step": 12005, + "time_per_iteration": 3.219393253326416 + }, + { + "auxiliary_loss_clip": 0.01115348, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.04065096, + "balance_loss_mlp": 1.02256763, + "epoch": 0.7218397715316398, + "flos": 45699650470080.0, + "grad_norm": 1.8765906004289732, + "language_loss": 0.63067758, + "learning_rate": 7.580727708120962e-07, + "loss": 0.65218234, + "num_input_tokens_seen": 259061325, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12567139, + "step": 12006, + "time_per_iteration": 2.9023396968841553 + }, + { + "auxiliary_loss_clip": 0.01115932, + "auxiliary_loss_mlp": 0.01032673, + "balance_loss_clip": 1.04098511, + "balance_loss_mlp": 1.02099657, + "epoch": 0.7218998947843078, + "flos": 27711773883360.0, + "grad_norm": 2.235009529159882, + "language_loss": 0.91971529, + "learning_rate": 7.577675189541865e-07, + "loss": 0.94120133, + "num_input_tokens_seen": 259078135, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11682129, + "step": 12007, + "time_per_iteration": 2.6891283988952637 + }, + { + "auxiliary_loss_clip": 0.01116539, + "auxiliary_loss_mlp": 0.01032947, + "balance_loss_clip": 1.03958917, + "balance_loss_mlp": 1.01998305, + "epoch": 0.7219600180369758, + "flos": 14845726193760.0, + "grad_norm": 9.754716281241677, + "language_loss": 0.63980091, + "learning_rate": 7.574623142018568e-07, + "loss": 0.66129577, + "num_input_tokens_seen": 259095910, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12957764, + "step": 12008, + "time_per_iteration": 2.648037910461426 + }, + { + "auxiliary_loss_clip": 0.01117384, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.04075193, + "balance_loss_mlp": 1.02099752, + "epoch": 0.7220201412896438, + "flos": 27572980733280.0, + "grad_norm": 3.7197040064461975, + "language_loss": 0.78711206, + "learning_rate": 7.57157156566681e-07, + "loss": 0.80862105, + "num_input_tokens_seen": 259114225, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12524414, + "step": 12009, + "time_per_iteration": 2.6182425022125244 + }, + { + "auxiliary_loss_clip": 0.01117908, + "auxiliary_loss_mlp": 0.01041172, + "balance_loss_clip": 1.04086947, + "balance_loss_mlp": 1.02734375, + "epoch": 0.7220802645423118, + "flos": 32603131726560.0, + "grad_norm": 2.8495384425199193, + "language_loss": 0.63790381, + "learning_rate": 7.568520460602297e-07, + "loss": 0.65949458, + "num_input_tokens_seen": 259134660, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.13824463, + "step": 12010, + "time_per_iteration": 2.683809280395508 + }, + { + "auxiliary_loss_clip": 0.01115259, + "auxiliary_loss_mlp": 0.01033946, + "balance_loss_clip": 1.03952837, + "balance_loss_mlp": 1.02131629, + "epoch": 0.7221403877949797, + "flos": 29798763611040.0, + "grad_norm": 2.281993444459926, + "language_loss": 0.77133256, + "learning_rate": 7.565469826940742e-07, + "loss": 0.79282469, + "num_input_tokens_seen": 259153300, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12634277, + "step": 12011, + "time_per_iteration": 2.687126398086548 + }, + { + "auxiliary_loss_clip": 0.01116717, + "auxiliary_loss_mlp": 0.01034297, + "balance_loss_clip": 1.04253137, + "balance_loss_mlp": 1.02309132, + "epoch": 0.7222005110476477, + "flos": 28692435971520.0, + "grad_norm": 1.7518199640385794, + "language_loss": 0.78847146, + "learning_rate": 7.56241966479781e-07, + "loss": 0.80998158, + "num_input_tokens_seen": 259172115, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11199951, + "step": 12012, + "time_per_iteration": 2.7204208374023438 + }, + { + "auxiliary_loss_clip": 0.0111914, + "auxiliary_loss_mlp": 0.01030941, + "balance_loss_clip": 1.04215145, + "balance_loss_mlp": 1.01885939, + "epoch": 0.7222606343003156, + "flos": 28202125186080.0, + "grad_norm": 1.7603716022573073, + "language_loss": 0.75574911, + "learning_rate": 7.559369974289171e-07, + "loss": 0.77724987, + "num_input_tokens_seen": 259191345, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12084961, + "step": 12013, + "time_per_iteration": 4.076268911361694 + }, + { + "auxiliary_loss_clip": 0.01114577, + "auxiliary_loss_mlp": 0.01028802, + "balance_loss_clip": 1.04094446, + "balance_loss_mlp": 1.01729822, + "epoch": 0.7223207575529836, + "flos": 29713771058400.0, + "grad_norm": 1.5319483310760262, + "language_loss": 0.75989187, + "learning_rate": 7.556320755530484e-07, + "loss": 0.7813257, + "num_input_tokens_seen": 259211700, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11499023, + "step": 12014, + "time_per_iteration": 2.6402456760406494 + }, + { + "auxiliary_loss_clip": 0.01116365, + "auxiliary_loss_mlp": 0.01032773, + "balance_loss_clip": 1.04001379, + "balance_loss_mlp": 1.02067304, + "epoch": 0.7223808808056515, + "flos": 34569074872800.0, + "grad_norm": 1.6038287592421212, + "language_loss": 0.86561489, + "learning_rate": 7.553272008637346e-07, + "loss": 0.8871063, + "num_input_tokens_seen": 259233825, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12084961, + "step": 12015, + "time_per_iteration": 3.942471504211426 + }, + { + "auxiliary_loss_clip": 0.01114738, + "auxiliary_loss_mlp": 0.0103629, + "balance_loss_clip": 1.04116571, + "balance_loss_mlp": 1.0241785, + "epoch": 0.7224410040583196, + "flos": 25708966362720.0, + "grad_norm": 1.7891680693326435, + "language_loss": 0.78296345, + "learning_rate": 7.55022373372538e-07, + "loss": 0.80447376, + "num_input_tokens_seen": 259253055, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12121582, + "step": 12016, + "time_per_iteration": 2.628281831741333 + }, + { + "auxiliary_loss_clip": 0.0111358, + "auxiliary_loss_mlp": 0.01038678, + "balance_loss_clip": 1.0404644, + "balance_loss_mlp": 1.026685, + "epoch": 0.7225011273109875, + "flos": 32696470838880.0, + "grad_norm": 1.713220732433301, + "language_loss": 0.7780813, + "learning_rate": 7.547175930910186e-07, + "loss": 0.79960388, + "num_input_tokens_seen": 259273420, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11993408, + "step": 12017, + "time_per_iteration": 2.6821470260620117 + }, + { + "auxiliary_loss_clip": 0.01111561, + "auxiliary_loss_mlp": 0.01030531, + "balance_loss_clip": 1.03988755, + "balance_loss_mlp": 1.0193193, + "epoch": 0.7225612505636555, + "flos": 28777225937760.0, + "grad_norm": 1.8800236670392996, + "language_loss": 0.74173105, + "learning_rate": 7.54412860030732e-07, + "loss": 0.76315188, + "num_input_tokens_seen": 259291000, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11224365, + "step": 12018, + "time_per_iteration": 2.6829922199249268 + }, + { + "auxiliary_loss_clip": 0.01113967, + "auxiliary_loss_mlp": 0.01032351, + "balance_loss_clip": 1.04311144, + "balance_loss_mlp": 1.0216043, + "epoch": 0.7226213738163234, + "flos": 25352870274720.0, + "grad_norm": 1.6258413383998835, + "language_loss": 0.77552247, + "learning_rate": 7.541081742032347e-07, + "loss": 0.79698575, + "num_input_tokens_seen": 259312390, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10754395, + "step": 12019, + "time_per_iteration": 2.7869741916656494 + }, + { + "auxiliary_loss_clip": 0.01113836, + "auxiliary_loss_mlp": 0.01026096, + "balance_loss_clip": 1.04034829, + "balance_loss_mlp": 1.01408005, + "epoch": 0.7226814970689914, + "flos": 39823133120640.0, + "grad_norm": 1.9702276590033438, + "language_loss": 0.74104637, + "learning_rate": 7.53803535620081e-07, + "loss": 0.76244569, + "num_input_tokens_seen": 259332645, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12011719, + "step": 12020, + "time_per_iteration": 2.750847101211548 + }, + { + "auxiliary_loss_clip": 0.01115862, + "auxiliary_loss_mlp": 0.01031614, + "balance_loss_clip": 1.03912103, + "balance_loss_mlp": 1.02061105, + "epoch": 0.7227416203216595, + "flos": 27399105969120.0, + "grad_norm": 1.7180022741604999, + "language_loss": 0.77260607, + "learning_rate": 7.534989442928219e-07, + "loss": 0.79408085, + "num_input_tokens_seen": 259353810, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.10998535, + "step": 12021, + "time_per_iteration": 2.6430468559265137 + }, + { + "auxiliary_loss_clip": 0.01116061, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.04184604, + "balance_loss_mlp": 1.02034974, + "epoch": 0.7228017435743274, + "flos": 26420307675840.0, + "grad_norm": 1.6808433100438267, + "language_loss": 0.68232787, + "learning_rate": 7.531944002330073e-07, + "loss": 0.70380902, + "num_input_tokens_seen": 259372460, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11700439, + "step": 12022, + "time_per_iteration": 4.202226161956787 + }, + { + "auxiliary_loss_clip": 0.01115253, + "auxiliary_loss_mlp": 0.01034072, + "balance_loss_clip": 1.03983021, + "balance_loss_mlp": 1.02179384, + "epoch": 0.7228618668269954, + "flos": 36037495157760.0, + "grad_norm": 2.0183115054637573, + "language_loss": 0.69115591, + "learning_rate": 7.528899034521858e-07, + "loss": 0.71264917, + "num_input_tokens_seen": 259393275, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.1227417, + "step": 12023, + "time_per_iteration": 3.9542298316955566 + }, + { + "auxiliary_loss_clip": 0.01113384, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.03873956, + "balance_loss_mlp": 1.01727796, + "epoch": 0.7229219900796633, + "flos": 33499733159520.0, + "grad_norm": 1.8747078587537458, + "language_loss": 0.71264768, + "learning_rate": 7.525854539619052e-07, + "loss": 0.73407304, + "num_input_tokens_seen": 259416205, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11883545, + "step": 12024, + "time_per_iteration": 2.668027639389038 + }, + { + "auxiliary_loss_clip": 0.01115185, + "auxiliary_loss_mlp": 0.01032988, + "balance_loss_clip": 1.04200268, + "balance_loss_mlp": 1.0210793, + "epoch": 0.7229821133323313, + "flos": 19875998738880.0, + "grad_norm": 1.8557258108551968, + "language_loss": 0.75375879, + "learning_rate": 7.522810517737089e-07, + "loss": 0.77524054, + "num_input_tokens_seen": 259433115, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11895752, + "step": 12025, + "time_per_iteration": 2.6130049228668213 + }, + { + "auxiliary_loss_clip": 0.01113638, + "auxiliary_loss_mlp": 0.01032316, + "balance_loss_clip": 1.04070723, + "balance_loss_mlp": 1.02086616, + "epoch": 0.7230422365849992, + "flos": 24907162664160.0, + "grad_norm": 1.9963118796422994, + "language_loss": 0.76734954, + "learning_rate": 7.519766968991395e-07, + "loss": 0.78880906, + "num_input_tokens_seen": 259450475, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11444092, + "step": 12026, + "time_per_iteration": 2.6085879802703857 + }, + { + "auxiliary_loss_clip": 0.01115288, + "auxiliary_loss_mlp": 0.01042102, + "balance_loss_clip": 1.03936017, + "balance_loss_mlp": 1.03041315, + "epoch": 0.7231023598376672, + "flos": 31229590210560.0, + "grad_norm": 2.4420362628998884, + "language_loss": 0.67916548, + "learning_rate": 7.516723893497388e-07, + "loss": 0.70073938, + "num_input_tokens_seen": 259469355, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11688232, + "step": 12027, + "time_per_iteration": 2.680978536605835 + }, + { + "auxiliary_loss_clip": 0.01119454, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.0425837, + "balance_loss_mlp": 1.01902854, + "epoch": 0.7231624830903352, + "flos": 30688477106400.0, + "grad_norm": 2.2713050307585383, + "language_loss": 0.79083037, + "learning_rate": 7.513681291370469e-07, + "loss": 0.81233716, + "num_input_tokens_seen": 259486565, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.12188721, + "step": 12028, + "time_per_iteration": 2.6874725818634033 + }, + { + "auxiliary_loss_clip": 0.01113307, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.03823304, + "balance_loss_mlp": 1.01545835, + "epoch": 0.7232226063430032, + "flos": 26509878681120.0, + "grad_norm": 1.7544321866836488, + "language_loss": 0.82368964, + "learning_rate": 7.510639162726e-07, + "loss": 0.84510255, + "num_input_tokens_seen": 259505070, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12524414, + "step": 12029, + "time_per_iteration": 2.6328723430633545 + }, + { + "auxiliary_loss_clip": 0.01033118, + "auxiliary_loss_mlp": 0.01002188, + "balance_loss_clip": 1.00980639, + "balance_loss_mlp": 1.00109017, + "epoch": 0.7232827295956711, + "flos": 83506084993440.0, + "grad_norm": 0.8656390078941056, + "language_loss": 0.61818004, + "learning_rate": 7.507597507679347e-07, + "loss": 0.63853312, + "num_input_tokens_seen": 259569135, + "router_z_loss_clip": 0.2331543, + "router_z_loss_mlp": 0.01098633, + "step": 12030, + "time_per_iteration": 3.329564094543457 + }, + { + "auxiliary_loss_clip": 0.01112188, + "auxiliary_loss_mlp": 0.01027795, + "balance_loss_clip": 1.03944719, + "balance_loss_mlp": 1.0160892, + "epoch": 0.7233428528483391, + "flos": 24639138442080.0, + "grad_norm": 2.019211973070615, + "language_loss": 0.77789676, + "learning_rate": 7.504556326345859e-07, + "loss": 0.79929662, + "num_input_tokens_seen": 259587035, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11700439, + "step": 12031, + "time_per_iteration": 2.61979341506958 + }, + { + "auxiliary_loss_clip": 0.01117257, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.04053545, + "balance_loss_mlp": 1.01991987, + "epoch": 0.723402976101007, + "flos": 29223622342080.0, + "grad_norm": 2.051976208357739, + "language_loss": 0.81817353, + "learning_rate": 7.501515618840834e-07, + "loss": 0.83966386, + "num_input_tokens_seen": 259606140, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11859131, + "step": 12032, + "time_per_iteration": 2.7865965366363525 + }, + { + "auxiliary_loss_clip": 0.01118421, + "auxiliary_loss_mlp": 0.01034025, + "balance_loss_clip": 1.04121327, + "balance_loss_mlp": 1.02177596, + "epoch": 0.723463099353675, + "flos": 25396582069440.0, + "grad_norm": 2.021656449169967, + "language_loss": 0.75245857, + "learning_rate": 7.498475385279592e-07, + "loss": 0.773983, + "num_input_tokens_seen": 259624275, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12255859, + "step": 12033, + "time_per_iteration": 2.5870487689971924 + }, + { + "auxiliary_loss_clip": 0.0111172, + "auxiliary_loss_mlp": 0.01030519, + "balance_loss_clip": 1.03870416, + "balance_loss_mlp": 1.01927185, + "epoch": 0.723523222606343, + "flos": 23303393197920.0, + "grad_norm": 1.8050380750006327, + "language_loss": 0.74860871, + "learning_rate": 7.495435625777423e-07, + "loss": 0.77003109, + "num_input_tokens_seen": 259643465, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11254883, + "step": 12034, + "time_per_iteration": 2.654548406600952 + }, + { + "auxiliary_loss_clip": 0.01114523, + "auxiliary_loss_mlp": 0.01029064, + "balance_loss_clip": 1.04014468, + "balance_loss_mlp": 1.01812649, + "epoch": 0.723583345859011, + "flos": 32344183375200.0, + "grad_norm": 1.908190004448348, + "language_loss": 0.80689096, + "learning_rate": 7.492396340449578e-07, + "loss": 0.82832682, + "num_input_tokens_seen": 259662500, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.10931396, + "step": 12035, + "time_per_iteration": 2.6736278533935547 + }, + { + "auxiliary_loss_clip": 0.01118014, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.04231238, + "balance_loss_mlp": 1.02201688, + "epoch": 0.723643469111679, + "flos": 19564222204800.0, + "grad_norm": 2.590774716435767, + "language_loss": 0.61265427, + "learning_rate": 7.489357529411326e-07, + "loss": 0.63417375, + "num_input_tokens_seen": 259680140, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11907959, + "step": 12036, + "time_per_iteration": 2.6436593532562256 + }, + { + "auxiliary_loss_clip": 0.01111809, + "auxiliary_loss_mlp": 0.01032423, + "balance_loss_clip": 1.03990185, + "balance_loss_mlp": 1.02157474, + "epoch": 0.7237035923643469, + "flos": 26778024455040.0, + "grad_norm": 1.9876634312226726, + "language_loss": 0.67505515, + "learning_rate": 7.486319192777883e-07, + "loss": 0.69649744, + "num_input_tokens_seen": 259700160, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10845947, + "step": 12037, + "time_per_iteration": 2.5989081859588623 + }, + { + "auxiliary_loss_clip": 0.01115471, + "auxiliary_loss_mlp": 0.01036198, + "balance_loss_clip": 1.04136181, + "balance_loss_mlp": 1.02345419, + "epoch": 0.7237637156170149, + "flos": 28776699213120.0, + "grad_norm": 2.098020975450238, + "language_loss": 0.72047734, + "learning_rate": 7.483281330664479e-07, + "loss": 0.74199402, + "num_input_tokens_seen": 259720525, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12750244, + "step": 12038, + "time_per_iteration": 2.6742260456085205 + }, + { + "auxiliary_loss_clip": 0.01115333, + "auxiliary_loss_mlp": 0.01031839, + "balance_loss_clip": 1.04084134, + "balance_loss_mlp": 1.01877391, + "epoch": 0.7238238388696828, + "flos": 25129692331200.0, + "grad_norm": 1.7144342043125633, + "language_loss": 0.72125578, + "learning_rate": 7.480243943186293e-07, + "loss": 0.74272746, + "num_input_tokens_seen": 259738680, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.1305542, + "step": 12039, + "time_per_iteration": 2.622718334197998 + }, + { + "auxiliary_loss_clip": 0.01118042, + "auxiliary_loss_mlp": 0.0103309, + "balance_loss_clip": 1.04145646, + "balance_loss_mlp": 1.02191973, + "epoch": 0.7238839621223508, + "flos": 29537708361120.0, + "grad_norm": 1.8805853579224132, + "language_loss": 0.76390934, + "learning_rate": 7.477207030458513e-07, + "loss": 0.78542066, + "num_input_tokens_seen": 259758790, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11175537, + "step": 12040, + "time_per_iteration": 2.649088144302368 + }, + { + "auxiliary_loss_clip": 0.01114852, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.03882647, + "balance_loss_mlp": 1.02053666, + "epoch": 0.7239440853750188, + "flos": 17338277257920.0, + "grad_norm": 2.9535641572843976, + "language_loss": 0.76629734, + "learning_rate": 7.474170592596301e-07, + "loss": 0.78777707, + "num_input_tokens_seen": 259777370, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12567139, + "step": 12041, + "time_per_iteration": 2.5937259197235107 + }, + { + "auxiliary_loss_clip": 0.01115583, + "auxiliary_loss_mlp": 0.01028195, + "balance_loss_clip": 1.03944898, + "balance_loss_mlp": 1.01650071, + "epoch": 0.7240042086276868, + "flos": 26374651051680.0, + "grad_norm": 2.177177237650489, + "language_loss": 0.6331479, + "learning_rate": 7.471134629714797e-07, + "loss": 0.65458566, + "num_input_tokens_seen": 259794665, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11700439, + "step": 12042, + "time_per_iteration": 2.6332709789276123 + }, + { + "auxiliary_loss_clip": 0.01120403, + "auxiliary_loss_mlp": 0.01034319, + "balance_loss_clip": 1.04336917, + "balance_loss_mlp": 1.02118802, + "epoch": 0.7240643318803547, + "flos": 28469258028000.0, + "grad_norm": 2.3368337700324426, + "language_loss": 0.8332783, + "learning_rate": 7.468099141929116e-07, + "loss": 0.8548255, + "num_input_tokens_seen": 259811110, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.13134766, + "step": 12043, + "time_per_iteration": 2.644351005554199 + }, + { + "auxiliary_loss_clip": 0.01117226, + "auxiliary_loss_mlp": 0.01033079, + "balance_loss_clip": 1.04074717, + "balance_loss_mlp": 1.0194236, + "epoch": 0.7241244551330227, + "flos": 29315381280480.0, + "grad_norm": 1.8074832013693856, + "language_loss": 0.64282393, + "learning_rate": 7.465064129354379e-07, + "loss": 0.66432703, + "num_input_tokens_seen": 259831080, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.13653564, + "step": 12044, + "time_per_iteration": 2.7083818912506104 + }, + { + "auxiliary_loss_clip": 0.01117992, + "auxiliary_loss_mlp": 0.01036958, + "balance_loss_clip": 1.04315126, + "balance_loss_mlp": 1.02451229, + "epoch": 0.7241845783856906, + "flos": 22854970929600.0, + "grad_norm": 1.6002533588753167, + "language_loss": 0.81550872, + "learning_rate": 7.462029592105658e-07, + "loss": 0.83705819, + "num_input_tokens_seen": 259850135, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12445068, + "step": 12045, + "time_per_iteration": 2.681471586227417 + }, + { + "auxiliary_loss_clip": 0.01113081, + "auxiliary_loss_mlp": 0.01032144, + "balance_loss_clip": 1.04083109, + "balance_loss_mlp": 1.02034783, + "epoch": 0.7242447016383586, + "flos": 23792366913120.0, + "grad_norm": 2.108457674957578, + "language_loss": 0.72034633, + "learning_rate": 7.458995530298034e-07, + "loss": 0.74179858, + "num_input_tokens_seen": 259868185, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11804199, + "step": 12046, + "time_per_iteration": 2.639037847518921 + }, + { + "auxiliary_loss_clip": 0.01115051, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.03972745, + "balance_loss_mlp": 1.01868653, + "epoch": 0.7243048248910267, + "flos": 27044063330400.0, + "grad_norm": 2.415316023345079, + "language_loss": 0.71025455, + "learning_rate": 7.455961944046553e-07, + "loss": 0.73171937, + "num_input_tokens_seen": 259887055, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12750244, + "step": 12047, + "time_per_iteration": 2.650085687637329 + }, + { + "auxiliary_loss_clip": 0.01120931, + "auxiliary_loss_mlp": 0.01033413, + "balance_loss_clip": 1.04293394, + "balance_loss_mlp": 1.02087212, + "epoch": 0.7243649481436946, + "flos": 33766906518720.0, + "grad_norm": 1.6657868765680695, + "language_loss": 0.70019615, + "learning_rate": 7.45292883346627e-07, + "loss": 0.72173959, + "num_input_tokens_seen": 259908295, + "router_z_loss_clip": 0.77880859, + "router_z_loss_mlp": 0.12542725, + "step": 12048, + "time_per_iteration": 2.659029006958008 + }, + { + "auxiliary_loss_clip": 0.0103389, + "auxiliary_loss_mlp": 0.01002541, + "balance_loss_clip": 1.01042891, + "balance_loss_mlp": 1.00147128, + "epoch": 0.7244250713963626, + "flos": 77171137607520.0, + "grad_norm": 0.8341497645253895, + "language_loss": 0.53713876, + "learning_rate": 7.449896198672168e-07, + "loss": 0.5575031, + "num_input_tokens_seen": 259968475, + "router_z_loss_clip": 0.23474121, + "router_z_loss_mlp": 0.01071167, + "step": 12049, + "time_per_iteration": 3.276792526245117 + }, + { + "auxiliary_loss_clip": 0.01120281, + "auxiliary_loss_mlp": 0.01033873, + "balance_loss_clip": 1.04034877, + "balance_loss_mlp": 1.01981831, + "epoch": 0.7244851946490305, + "flos": 21923368917120.0, + "grad_norm": 2.1433692022095374, + "language_loss": 0.60467964, + "learning_rate": 7.446864039779258e-07, + "loss": 0.62622118, + "num_input_tokens_seen": 259984865, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.14068604, + "step": 12050, + "time_per_iteration": 2.6217191219329834 + }, + { + "auxiliary_loss_clip": 0.01034089, + "auxiliary_loss_mlp": 0.01002256, + "balance_loss_clip": 1.01061702, + "balance_loss_mlp": 1.0011797, + "epoch": 0.7245453179016985, + "flos": 86566119560640.0, + "grad_norm": 0.7750304369117181, + "language_loss": 0.53245139, + "learning_rate": 7.443832356902528e-07, + "loss": 0.5528149, + "num_input_tokens_seen": 260046735, + "router_z_loss_clip": 0.23486328, + "router_z_loss_mlp": 0.01077271, + "step": 12051, + "time_per_iteration": 3.270419120788574 + }, + { + "auxiliary_loss_clip": 0.01115334, + "auxiliary_loss_mlp": 0.01032427, + "balance_loss_clip": 1.04157341, + "balance_loss_mlp": 1.02113152, + "epoch": 0.7246054411543664, + "flos": 29979080622720.0, + "grad_norm": 1.6794029767960141, + "language_loss": 0.72141874, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74289644, + "num_input_tokens_seen": 260067950, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11291504, + "step": 12052, + "time_per_iteration": 4.1514928340911865 + }, + { + "auxiliary_loss_clip": 0.01114461, + "auxiliary_loss_mlp": 0.01030956, + "balance_loss_clip": 1.04055786, + "balance_loss_mlp": 1.01807535, + "epoch": 0.7246655644070344, + "flos": 39460594785120.0, + "grad_norm": 1.9433400804237384, + "language_loss": 0.74033445, + "learning_rate": 7.437770419657415e-07, + "loss": 0.76178861, + "num_input_tokens_seen": 260087730, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12878418, + "step": 12053, + "time_per_iteration": 2.7825629711151123 + }, + { + "auxiliary_loss_clip": 0.01115173, + "auxiliary_loss_mlp": 0.01033058, + "balance_loss_clip": 1.04035997, + "balance_loss_mlp": 1.02110147, + "epoch": 0.7247256876597024, + "flos": 26688858622560.0, + "grad_norm": 2.484448879959535, + "language_loss": 0.78251761, + "learning_rate": 7.434740165518898e-07, + "loss": 0.8039999, + "num_input_tokens_seen": 260107760, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.1194458, + "step": 12054, + "time_per_iteration": 4.028083324432373 + }, + { + "auxiliary_loss_clip": 0.01115537, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.04121017, + "balance_loss_mlp": 1.02096379, + "epoch": 0.7247858109123704, + "flos": 19783226868480.0, + "grad_norm": 3.1087394580584493, + "language_loss": 0.6851809, + "learning_rate": 7.431710387856301e-07, + "loss": 0.70666921, + "num_input_tokens_seen": 260123660, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12329102, + "step": 12055, + "time_per_iteration": 2.6700897216796875 + }, + { + "auxiliary_loss_clip": 0.01113624, + "auxiliary_loss_mlp": 0.01033359, + "balance_loss_clip": 1.04023623, + "balance_loss_mlp": 1.0219388, + "epoch": 0.7248459341650383, + "flos": 25442036107200.0, + "grad_norm": 2.4505723289528984, + "language_loss": 0.74327952, + "learning_rate": 7.428681086784496e-07, + "loss": 0.76474935, + "num_input_tokens_seen": 260142690, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11425781, + "step": 12056, + "time_per_iteration": 2.5948030948638916 + }, + { + "auxiliary_loss_clip": 0.01110933, + "auxiliary_loss_mlp": 0.01025391, + "balance_loss_clip": 1.0394212, + "balance_loss_mlp": 1.01383376, + "epoch": 0.7249060574177063, + "flos": 31630208438880.0, + "grad_norm": 1.5821113383626413, + "language_loss": 0.70856047, + "learning_rate": 7.425652262418368e-07, + "loss": 0.72992361, + "num_input_tokens_seen": 260162590, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11553955, + "step": 12057, + "time_per_iteration": 2.700094223022461 + }, + { + "auxiliary_loss_clip": 0.01119823, + "auxiliary_loss_mlp": 0.01034965, + "balance_loss_clip": 1.04343402, + "balance_loss_mlp": 1.022609, + "epoch": 0.7249661806703742, + "flos": 21164993392320.0, + "grad_norm": 2.372725341937415, + "language_loss": 0.62589002, + "learning_rate": 7.42262391487277e-07, + "loss": 0.64743793, + "num_input_tokens_seen": 260181065, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12353516, + "step": 12058, + "time_per_iteration": 2.604544162750244 + }, + { + "auxiliary_loss_clip": 0.01119603, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.04293573, + "balance_loss_mlp": 1.01838863, + "epoch": 0.7250263039230422, + "flos": 23884247403360.0, + "grad_norm": 2.3286518172809494, + "language_loss": 0.74709475, + "learning_rate": 7.419596044262535e-07, + "loss": 0.7686013, + "num_input_tokens_seen": 260200330, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12664795, + "step": 12059, + "time_per_iteration": 2.637331247329712 + }, + { + "auxiliary_loss_clip": 0.01112094, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.04003835, + "balance_loss_mlp": 1.01944458, + "epoch": 0.7250864271757103, + "flos": 26816469003360.0, + "grad_norm": 1.691728829446363, + "language_loss": 0.79203749, + "learning_rate": 7.416568650702472e-07, + "loss": 0.81346464, + "num_input_tokens_seen": 260219975, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11169434, + "step": 12060, + "time_per_iteration": 2.6276776790618896 + }, + { + "auxiliary_loss_clip": 0.01116298, + "auxiliary_loss_mlp": 0.01027929, + "balance_loss_clip": 1.04101086, + "balance_loss_mlp": 1.01530457, + "epoch": 0.7251465504283782, + "flos": 30520477347840.0, + "grad_norm": 1.7920237381009598, + "language_loss": 0.76562119, + "learning_rate": 7.413541734307393e-07, + "loss": 0.78706348, + "num_input_tokens_seen": 260242025, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12628174, + "step": 12061, + "time_per_iteration": 4.282742500305176 + }, + { + "auxiliary_loss_clip": 0.01113143, + "auxiliary_loss_mlp": 0.01026705, + "balance_loss_clip": 1.04129446, + "balance_loss_mlp": 1.01521945, + "epoch": 0.7252066736810462, + "flos": 20365094005920.0, + "grad_norm": 1.9027037386479877, + "language_loss": 0.81010127, + "learning_rate": 7.410515295192068e-07, + "loss": 0.83149981, + "num_input_tokens_seen": 260260015, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11499023, + "step": 12062, + "time_per_iteration": 2.595576763153076 + }, + { + "auxiliary_loss_clip": 0.0112118, + "auxiliary_loss_mlp": 0.01029465, + "balance_loss_clip": 1.04326618, + "balance_loss_mlp": 1.01631629, + "epoch": 0.7252667969337141, + "flos": 31363237666080.0, + "grad_norm": 2.826875657068777, + "language_loss": 0.69189829, + "learning_rate": 7.407489333471262e-07, + "loss": 0.71340477, + "num_input_tokens_seen": 260278635, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.13146973, + "step": 12063, + "time_per_iteration": 3.994694948196411 + }, + { + "auxiliary_loss_clip": 0.01111867, + "auxiliary_loss_mlp": 0.01031245, + "balance_loss_clip": 1.03975701, + "balance_loss_mlp": 1.01979542, + "epoch": 0.7253269201863821, + "flos": 22280599488960.0, + "grad_norm": 1.5419032862017166, + "language_loss": 0.70081937, + "learning_rate": 7.40446384925973e-07, + "loss": 0.72225052, + "num_input_tokens_seen": 260298510, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11456299, + "step": 12064, + "time_per_iteration": 2.6363542079925537 + }, + { + "auxiliary_loss_clip": 0.01117122, + "auxiliary_loss_mlp": 0.01028802, + "balance_loss_clip": 1.04306746, + "balance_loss_mlp": 1.01714921, + "epoch": 0.72538704343905, + "flos": 24906554904960.0, + "grad_norm": 3.672174820637845, + "language_loss": 0.90367639, + "learning_rate": 7.401438842672192e-07, + "loss": 0.92513555, + "num_input_tokens_seen": 260317405, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11651611, + "step": 12065, + "time_per_iteration": 2.6798839569091797 + }, + { + "auxiliary_loss_clip": 0.01033632, + "auxiliary_loss_mlp": 0.01001339, + "balance_loss_clip": 1.01031733, + "balance_loss_mlp": 1.00027156, + "epoch": 0.725447166691718, + "flos": 85598179898400.0, + "grad_norm": 0.6577339663948004, + "language_loss": 0.56063175, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58098149, + "num_input_tokens_seen": 260388085, + "router_z_loss_clip": 0.2331543, + "router_z_loss_mlp": 0.01068878, + "step": 12066, + "time_per_iteration": 3.396162271499634 + }, + { + "auxiliary_loss_clip": 0.01114033, + "auxiliary_loss_mlp": 0.01030355, + "balance_loss_clip": 1.039814, + "balance_loss_mlp": 1.01874423, + "epoch": 0.725507289944386, + "flos": 33009827546880.0, + "grad_norm": 1.7081905519088738, + "language_loss": 0.76509047, + "learning_rate": 7.395390262827897e-07, + "loss": 0.78653431, + "num_input_tokens_seen": 260406165, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11608887, + "step": 12067, + "time_per_iteration": 2.671543836593628 + }, + { + "auxiliary_loss_clip": 0.01033778, + "auxiliary_loss_mlp": 0.01000345, + "balance_loss_clip": 1.01039958, + "balance_loss_mlp": 0.99926943, + "epoch": 0.725567413197054, + "flos": 76776921109440.0, + "grad_norm": 0.7275161111677715, + "language_loss": 0.57005453, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59039575, + "num_input_tokens_seen": 260461365, + "router_z_loss_clip": 0.23388672, + "router_z_loss_mlp": 0.01076508, + "step": 12068, + "time_per_iteration": 3.1692192554473877 + }, + { + "auxiliary_loss_clip": 0.0103389, + "auxiliary_loss_mlp": 0.0100093, + "balance_loss_clip": 1.01041889, + "balance_loss_mlp": 0.99988174, + "epoch": 0.7256275364497219, + "flos": 73572420972960.0, + "grad_norm": 0.6576373735331329, + "language_loss": 0.55388194, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57423019, + "num_input_tokens_seen": 260523795, + "router_z_loss_clip": 0.23474121, + "router_z_loss_mlp": 0.01048279, + "step": 12069, + "time_per_iteration": 3.2503912448883057 + }, + { + "auxiliary_loss_clip": 0.01112788, + "auxiliary_loss_mlp": 0.01029159, + "balance_loss_clip": 1.04120612, + "balance_loss_mlp": 1.01816201, + "epoch": 0.7256876597023899, + "flos": 29893439793600.0, + "grad_norm": 1.8144161152908986, + "language_loss": 0.79885805, + "learning_rate": 7.38632097810854e-07, + "loss": 0.82027745, + "num_input_tokens_seen": 260544765, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11004639, + "step": 12070, + "time_per_iteration": 2.7027153968811035 + }, + { + "auxiliary_loss_clip": 0.01111303, + "auxiliary_loss_mlp": 0.01031509, + "balance_loss_clip": 1.04136086, + "balance_loss_mlp": 1.01978493, + "epoch": 0.7257477829550578, + "flos": 29715553818720.0, + "grad_norm": 1.9302054560714816, + "language_loss": 0.71972114, + "learning_rate": 7.383298839673197e-07, + "loss": 0.74114931, + "num_input_tokens_seen": 260564340, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.1171875, + "step": 12071, + "time_per_iteration": 2.6519546508789062 + }, + { + "auxiliary_loss_clip": 0.01113886, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.04099274, + "balance_loss_mlp": 1.02280784, + "epoch": 0.7258079062077258, + "flos": 20989862592480.0, + "grad_norm": 1.8789400989675393, + "language_loss": 0.70403028, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72551328, + "num_input_tokens_seen": 260582565, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11602783, + "step": 12072, + "time_per_iteration": 2.6619060039520264 + }, + { + "auxiliary_loss_clip": 0.01118184, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.04034877, + "balance_loss_mlp": 1.02121067, + "epoch": 0.7258680294603939, + "flos": 26332397879040.0, + "grad_norm": 2.5200397866820605, + "language_loss": 0.7876054, + "learning_rate": 7.377255998196821e-07, + "loss": 0.80912745, + "num_input_tokens_seen": 260601700, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.12817383, + "step": 12073, + "time_per_iteration": 2.6465182304382324 + }, + { + "auxiliary_loss_clip": 0.01113985, + "auxiliary_loss_mlp": 0.01024181, + "balance_loss_clip": 1.04133976, + "balance_loss_mlp": 1.01238537, + "epoch": 0.7259281527130618, + "flos": 42532176777120.0, + "grad_norm": 1.7746547588121808, + "language_loss": 0.70224071, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72362244, + "num_input_tokens_seen": 260623040, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11804199, + "step": 12074, + "time_per_iteration": 2.80375599861145 + }, + { + "auxiliary_loss_clip": 0.01117612, + "auxiliary_loss_mlp": 0.01028396, + "balance_loss_clip": 1.04169631, + "balance_loss_mlp": 1.0162487, + "epoch": 0.7259882759657298, + "flos": 30999726915840.0, + "grad_norm": 1.7471960378899158, + "language_loss": 0.74135637, + "learning_rate": 7.371215071343302e-07, + "loss": 0.76281643, + "num_input_tokens_seen": 260642735, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.121521, + "step": 12075, + "time_per_iteration": 2.6946818828582764 + }, + { + "auxiliary_loss_clip": 0.01115397, + "auxiliary_loss_mlp": 0.01035339, + "balance_loss_clip": 1.0403316, + "balance_loss_mlp": 1.02295351, + "epoch": 0.7260483992183977, + "flos": 76819411035360.0, + "grad_norm": 1.5645917334940735, + "language_loss": 0.63825631, + "learning_rate": 7.368195326186458e-07, + "loss": 0.65976369, + "num_input_tokens_seen": 260669935, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.1239624, + "step": 12076, + "time_per_iteration": 3.010287284851074 + }, + { + "auxiliary_loss_clip": 0.01113211, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.03852439, + "balance_loss_mlp": 1.01650715, + "epoch": 0.7261085224710657, + "flos": 32296298300640.0, + "grad_norm": 4.986459223981618, + "language_loss": 0.79096973, + "learning_rate": 7.365176060028912e-07, + "loss": 0.81238633, + "num_input_tokens_seen": 260689605, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.1194458, + "step": 12077, + "time_per_iteration": 2.66870379447937 + }, + { + "auxiliary_loss_clip": 0.01033077, + "auxiliary_loss_mlp": 0.0100046, + "balance_loss_clip": 1.00969613, + "balance_loss_mlp": 0.99942255, + "epoch": 0.7261686457237336, + "flos": 81477508482720.0, + "grad_norm": 0.8848365425933585, + "language_loss": 0.64953053, + "learning_rate": 7.362157272985163e-07, + "loss": 0.66986585, + "num_input_tokens_seen": 260748265, + "router_z_loss_clip": 0.23376465, + "router_z_loss_mlp": 0.01037598, + "step": 12078, + "time_per_iteration": 3.2248809337615967 + }, + { + "auxiliary_loss_clip": 0.01033316, + "auxiliary_loss_mlp": 0.01000912, + "balance_loss_clip": 1.0098933, + "balance_loss_mlp": 0.99986607, + "epoch": 0.7262287689764017, + "flos": 85413689606880.0, + "grad_norm": 0.7158794959737551, + "language_loss": 0.59272635, + "learning_rate": 7.359138965169671e-07, + "loss": 0.61306864, + "num_input_tokens_seen": 260816715, + "router_z_loss_clip": 0.23449707, + "router_z_loss_mlp": 0.01046753, + "step": 12079, + "time_per_iteration": 3.401366949081421 + }, + { + "auxiliary_loss_clip": 0.01114074, + "auxiliary_loss_mlp": 0.01034258, + "balance_loss_clip": 1.03970742, + "balance_loss_mlp": 1.021842, + "epoch": 0.7262888922290696, + "flos": 29047681196640.0, + "grad_norm": 3.3967244919771242, + "language_loss": 0.65147388, + "learning_rate": 7.356121136696895e-07, + "loss": 0.67295718, + "num_input_tokens_seen": 260836765, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12420654, + "step": 12080, + "time_per_iteration": 2.6274445056915283 + }, + { + "auxiliary_loss_clip": 0.01115067, + "auxiliary_loss_mlp": 0.01027674, + "balance_loss_clip": 1.03952479, + "balance_loss_mlp": 1.01497793, + "epoch": 0.7263490154817376, + "flos": 23794230708000.0, + "grad_norm": 2.486904470206517, + "language_loss": 0.704862, + "learning_rate": 7.35310378768128e-07, + "loss": 0.72628939, + "num_input_tokens_seen": 260854610, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12701416, + "step": 12081, + "time_per_iteration": 2.629770517349243 + }, + { + "auxiliary_loss_clip": 0.01118988, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.04294395, + "balance_loss_mlp": 1.01993155, + "epoch": 0.7264091387344055, + "flos": 19872473735520.0, + "grad_norm": 1.8417501972407826, + "language_loss": 0.81436002, + "learning_rate": 7.350086918237237e-07, + "loss": 0.83587134, + "num_input_tokens_seen": 260871620, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12231445, + "step": 12082, + "time_per_iteration": 2.613494634628296 + }, + { + "auxiliary_loss_clip": 0.01120901, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.04079938, + "balance_loss_mlp": 1.02174711, + "epoch": 0.7264692619870735, + "flos": 29714743473120.0, + "grad_norm": 1.5430477293481453, + "language_loss": 0.77235854, + "learning_rate": 7.347070528479158e-07, + "loss": 0.79391491, + "num_input_tokens_seen": 260890490, + "router_z_loss_clip": 0.80029297, + "router_z_loss_mlp": 0.12969971, + "step": 12083, + "time_per_iteration": 2.6516103744506836 + }, + { + "auxiliary_loss_clip": 0.01118212, + "auxiliary_loss_mlp": 0.01029087, + "balance_loss_clip": 1.04271114, + "balance_loss_mlp": 1.01671934, + "epoch": 0.7265293852397414, + "flos": 30651126524640.0, + "grad_norm": 7.353225851211569, + "language_loss": 0.72586656, + "learning_rate": 7.344054618521433e-07, + "loss": 0.74733961, + "num_input_tokens_seen": 260909700, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12365723, + "step": 12084, + "time_per_iteration": 2.667299509048462 + }, + { + "auxiliary_loss_clip": 0.01118257, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.04222631, + "balance_loss_mlp": 1.02105784, + "epoch": 0.7265895084924094, + "flos": 27623337361920.0, + "grad_norm": 1.7523504010776765, + "language_loss": 0.77704465, + "learning_rate": 7.34103918847843e-07, + "loss": 0.798563, + "num_input_tokens_seen": 260929090, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12524414, + "step": 12085, + "time_per_iteration": 2.6075446605682373 + }, + { + "auxiliary_loss_clip": 0.01115679, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.04039502, + "balance_loss_mlp": 1.02087188, + "epoch": 0.7266496317450775, + "flos": 28514104306560.0, + "grad_norm": 1.6194249257953615, + "language_loss": 0.7242384, + "learning_rate": 7.338024238464493e-07, + "loss": 0.74572146, + "num_input_tokens_seen": 260946615, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11749268, + "step": 12086, + "time_per_iteration": 2.7013516426086426 + }, + { + "auxiliary_loss_clip": 0.01114528, + "auxiliary_loss_mlp": 0.0103091, + "balance_loss_clip": 1.04082417, + "balance_loss_mlp": 1.01900125, + "epoch": 0.7267097549977454, + "flos": 34257217304160.0, + "grad_norm": 2.277260142081768, + "language_loss": 0.69446707, + "learning_rate": 7.335009768593938e-07, + "loss": 0.7159214, + "num_input_tokens_seen": 260968515, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11920166, + "step": 12087, + "time_per_iteration": 2.6561996936798096 + }, + { + "auxiliary_loss_clip": 0.01116687, + "auxiliary_loss_mlp": 0.01035695, + "balance_loss_clip": 1.04160118, + "balance_loss_mlp": 1.02323198, + "epoch": 0.7267698782504134, + "flos": 27083561328000.0, + "grad_norm": 1.8953880369998564, + "language_loss": 0.79150289, + "learning_rate": 7.331995778981088e-07, + "loss": 0.81302679, + "num_input_tokens_seen": 260986790, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12469482, + "step": 12088, + "time_per_iteration": 2.6481921672821045 + }, + { + "auxiliary_loss_clip": 0.01115891, + "auxiliary_loss_mlp": 0.01036833, + "balance_loss_clip": 1.03995538, + "balance_loss_mlp": 1.02487636, + "epoch": 0.7268300015030813, + "flos": 22591727746560.0, + "grad_norm": 1.8100083965504663, + "language_loss": 0.7399531, + "learning_rate": 7.328982269740221e-07, + "loss": 0.76148033, + "num_input_tokens_seen": 261004925, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11950684, + "step": 12089, + "time_per_iteration": 2.5820255279541016 + }, + { + "auxiliary_loss_clip": 0.0111766, + "auxiliary_loss_mlp": 0.01038362, + "balance_loss_clip": 1.04241955, + "balance_loss_mlp": 1.02610087, + "epoch": 0.7268901247557493, + "flos": 29268225516960.0, + "grad_norm": 2.551302422625854, + "language_loss": 0.71532619, + "learning_rate": 7.325969240985616e-07, + "loss": 0.73688644, + "num_input_tokens_seen": 261023895, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12255859, + "step": 12090, + "time_per_iteration": 2.681384325027466 + }, + { + "auxiliary_loss_clip": 0.01116621, + "auxiliary_loss_mlp": 0.01029174, + "balance_loss_clip": 1.04032028, + "balance_loss_mlp": 1.0158819, + "epoch": 0.7269502480084172, + "flos": 39154774291200.0, + "grad_norm": 1.9200747738308919, + "language_loss": 0.7757647, + "learning_rate": 7.322956692831528e-07, + "loss": 0.79722261, + "num_input_tokens_seen": 261045445, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.1328125, + "step": 12091, + "time_per_iteration": 4.164759635925293 + }, + { + "auxiliary_loss_clip": 0.01113851, + "auxiliary_loss_mlp": 0.01026084, + "balance_loss_clip": 1.03936243, + "balance_loss_mlp": 1.01394272, + "epoch": 0.7270103712610853, + "flos": 23260289162400.0, + "grad_norm": 2.1174043057206746, + "language_loss": 0.71570146, + "learning_rate": 7.319944625392205e-07, + "loss": 0.73710084, + "num_input_tokens_seen": 261064275, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12145996, + "step": 12092, + "time_per_iteration": 2.6198465824127197 + }, + { + "auxiliary_loss_clip": 0.01115338, + "auxiliary_loss_mlp": 0.01029754, + "balance_loss_clip": 1.04170489, + "balance_loss_mlp": 1.01750541, + "epoch": 0.7270704945137532, + "flos": 42138122348160.0, + "grad_norm": 4.717942949497918, + "language_loss": 0.60928702, + "learning_rate": 7.31693303878184e-07, + "loss": 0.6307379, + "num_input_tokens_seen": 261083310, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12249756, + "step": 12093, + "time_per_iteration": 2.7578163146972656 + }, + { + "auxiliary_loss_clip": 0.01116367, + "auxiliary_loss_mlp": 0.01032119, + "balance_loss_clip": 1.04272807, + "balance_loss_mlp": 1.01994824, + "epoch": 0.7271306177664212, + "flos": 26243596702080.0, + "grad_norm": 1.5876649899113695, + "language_loss": 0.75880164, + "learning_rate": 7.313921933114644e-07, + "loss": 0.78028649, + "num_input_tokens_seen": 261103460, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12164307, + "step": 12094, + "time_per_iteration": 4.040107488632202 + }, + { + "auxiliary_loss_clip": 0.01112381, + "auxiliary_loss_mlp": 0.01027955, + "balance_loss_clip": 1.03907609, + "balance_loss_mlp": 1.01694655, + "epoch": 0.7271907410190891, + "flos": 27177062509440.0, + "grad_norm": 1.9388547957937736, + "language_loss": 0.84695995, + "learning_rate": 7.310911308504808e-07, + "loss": 0.86836326, + "num_input_tokens_seen": 261121375, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11022949, + "step": 12095, + "time_per_iteration": 2.650515079498291 + }, + { + "auxiliary_loss_clip": 0.01115254, + "auxiliary_loss_mlp": 0.01033763, + "balance_loss_clip": 1.03995514, + "balance_loss_mlp": 1.02169287, + "epoch": 0.7272508642717571, + "flos": 27934425102240.0, + "grad_norm": 1.657163607247145, + "language_loss": 0.77731633, + "learning_rate": 7.307901165066479e-07, + "loss": 0.79880649, + "num_input_tokens_seen": 261141105, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12060547, + "step": 12096, + "time_per_iteration": 2.67209792137146 + }, + { + "auxiliary_loss_clip": 0.01115885, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.04220772, + "balance_loss_mlp": 1.02330768, + "epoch": 0.727310987524425, + "flos": 14221848987360.0, + "grad_norm": 1.838848510689349, + "language_loss": 0.72334689, + "learning_rate": 7.30489150291381e-07, + "loss": 0.74485379, + "num_input_tokens_seen": 261159255, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11499023, + "step": 12097, + "time_per_iteration": 2.7391698360443115 + }, + { + "auxiliary_loss_clip": 0.01117533, + "auxiliary_loss_mlp": 0.01032919, + "balance_loss_clip": 1.04193306, + "balance_loss_mlp": 1.02006269, + "epoch": 0.727371110777093, + "flos": 29938326589440.0, + "grad_norm": 2.9089562432521205, + "language_loss": 0.7731058, + "learning_rate": 7.301882322160935e-07, + "loss": 0.79461032, + "num_input_tokens_seen": 261177960, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12854004, + "step": 12098, + "time_per_iteration": 2.6998355388641357 + }, + { + "auxiliary_loss_clip": 0.01116894, + "auxiliary_loss_mlp": 0.01029412, + "balance_loss_clip": 1.03957438, + "balance_loss_mlp": 1.01712132, + "epoch": 0.7274312340297611, + "flos": 91201723567200.0, + "grad_norm": 1.817666380008194, + "language_loss": 0.67625332, + "learning_rate": 7.298873622921952e-07, + "loss": 0.69771636, + "num_input_tokens_seen": 261205660, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12286377, + "step": 12099, + "time_per_iteration": 3.0069050788879395 + }, + { + "auxiliary_loss_clip": 0.01120064, + "auxiliary_loss_mlp": 0.01038138, + "balance_loss_clip": 1.04003036, + "balance_loss_mlp": 1.02448273, + "epoch": 0.727491357282429, + "flos": 27263230063200.0, + "grad_norm": 2.4817033671064026, + "language_loss": 0.72756517, + "learning_rate": 7.29586540531095e-07, + "loss": 0.74914718, + "num_input_tokens_seen": 261225185, + "router_z_loss_clip": 0.79980469, + "router_z_loss_mlp": 0.13641357, + "step": 12100, + "time_per_iteration": 2.6285037994384766 + }, + { + "auxiliary_loss_clip": 0.01115609, + "auxiliary_loss_mlp": 0.01033156, + "balance_loss_clip": 1.0412209, + "balance_loss_mlp": 1.02175939, + "epoch": 0.727551480535097, + "flos": 28427653131840.0, + "grad_norm": 1.4844151826725822, + "language_loss": 0.74808753, + "learning_rate": 7.292857669442005e-07, + "loss": 0.76957518, + "num_input_tokens_seen": 261247965, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11407471, + "step": 12101, + "time_per_iteration": 4.129193305969238 + }, + { + "auxiliary_loss_clip": 0.01114754, + "auxiliary_loss_mlp": 0.01031509, + "balance_loss_clip": 1.04160452, + "balance_loss_mlp": 1.02064955, + "epoch": 0.7276116037877649, + "flos": 26198466802560.0, + "grad_norm": 4.442318546302226, + "language_loss": 0.82454157, + "learning_rate": 7.289850415429177e-07, + "loss": 0.84600425, + "num_input_tokens_seen": 261267585, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.10864258, + "step": 12102, + "time_per_iteration": 4.086575031280518 + }, + { + "auxiliary_loss_clip": 0.01114438, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.04066753, + "balance_loss_mlp": 1.02112412, + "epoch": 0.7276717270404329, + "flos": 26191133174880.0, + "grad_norm": 1.970556167988762, + "language_loss": 0.81701607, + "learning_rate": 7.286843643386495e-07, + "loss": 0.83848697, + "num_input_tokens_seen": 261285200, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.1151123, + "step": 12103, + "time_per_iteration": 2.6328611373901367 + }, + { + "auxiliary_loss_clip": 0.01116489, + "auxiliary_loss_mlp": 0.0102588, + "balance_loss_clip": 1.04189181, + "balance_loss_mlp": 1.0130527, + "epoch": 0.7277318502931008, + "flos": 20544479120160.0, + "grad_norm": 1.6887208096500415, + "language_loss": 0.66587389, + "learning_rate": 7.283837353427968e-07, + "loss": 0.68729758, + "num_input_tokens_seen": 261303645, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12817383, + "step": 12104, + "time_per_iteration": 2.5767579078674316 + }, + { + "auxiliary_loss_clip": 0.01113495, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.04153156, + "balance_loss_mlp": 1.01976943, + "epoch": 0.7277919735457689, + "flos": 40756436858880.0, + "grad_norm": 2.0123642848086134, + "language_loss": 0.66133589, + "learning_rate": 7.280831545667611e-07, + "loss": 0.68278503, + "num_input_tokens_seen": 261323265, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11669922, + "step": 12105, + "time_per_iteration": 2.7411324977874756 + }, + { + "auxiliary_loss_clip": 0.01117532, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.04261208, + "balance_loss_mlp": 1.0183624, + "epoch": 0.7278520967984368, + "flos": 23435865652320.0, + "grad_norm": 2.1211191084176027, + "language_loss": 0.75419903, + "learning_rate": 7.27782622021939e-07, + "loss": 0.77568203, + "num_input_tokens_seen": 261339745, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12408447, + "step": 12106, + "time_per_iteration": 2.674365282058716 + }, + { + "auxiliary_loss_clip": 0.01119054, + "auxiliary_loss_mlp": 0.01030038, + "balance_loss_clip": 1.04259658, + "balance_loss_mlp": 1.01739001, + "epoch": 0.7279122200511048, + "flos": 41603370456960.0, + "grad_norm": 3.0840846774690496, + "language_loss": 0.69773829, + "learning_rate": 7.274821377197273e-07, + "loss": 0.71922922, + "num_input_tokens_seen": 261359310, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12658691, + "step": 12107, + "time_per_iteration": 2.830261707305908 + }, + { + "auxiliary_loss_clip": 0.01114255, + "auxiliary_loss_mlp": 0.01032897, + "balance_loss_clip": 1.03990674, + "balance_loss_mlp": 1.02090502, + "epoch": 0.7279723433037727, + "flos": 66624448661280.0, + "grad_norm": 1.4461666428325846, + "language_loss": 0.75489813, + "learning_rate": 7.271817016715205e-07, + "loss": 0.77636969, + "num_input_tokens_seen": 261384640, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11993408, + "step": 12108, + "time_per_iteration": 2.9734559059143066 + }, + { + "auxiliary_loss_clip": 0.0111412, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.03890193, + "balance_loss_mlp": 1.01979089, + "epoch": 0.7280324665564407, + "flos": 44096205142080.0, + "grad_norm": 1.9259068442470053, + "language_loss": 0.67240679, + "learning_rate": 7.268813138887124e-07, + "loss": 0.69386733, + "num_input_tokens_seen": 261405290, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12139893, + "step": 12109, + "time_per_iteration": 2.751991033554077 + }, + { + "auxiliary_loss_clip": 0.01115295, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.04143691, + "balance_loss_mlp": 1.02013075, + "epoch": 0.7280925898091086, + "flos": 14176354432320.0, + "grad_norm": 4.92622244621996, + "language_loss": 0.63294232, + "learning_rate": 7.265809743826912e-07, + "loss": 0.65442443, + "num_input_tokens_seen": 261419710, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12774658, + "step": 12110, + "time_per_iteration": 2.645843267440796 + }, + { + "auxiliary_loss_clip": 0.01116185, + "auxiliary_loss_mlp": 0.01026463, + "balance_loss_clip": 1.03876185, + "balance_loss_mlp": 1.0131892, + "epoch": 0.7281527130617766, + "flos": 41825981158560.0, + "grad_norm": 2.018337356001978, + "language_loss": 0.58509898, + "learning_rate": 7.26280683164847e-07, + "loss": 0.60652554, + "num_input_tokens_seen": 261442385, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.13287354, + "step": 12111, + "time_per_iteration": 2.73893666267395 + }, + { + "auxiliary_loss_clip": 0.01117539, + "auxiliary_loss_mlp": 0.01027984, + "balance_loss_clip": 1.04171467, + "balance_loss_mlp": 1.01591384, + "epoch": 0.7282128363144446, + "flos": 16982302721760.0, + "grad_norm": 2.265867728338773, + "language_loss": 0.73841715, + "learning_rate": 7.259804402465677e-07, + "loss": 0.75987232, + "num_input_tokens_seen": 261459805, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12072754, + "step": 12112, + "time_per_iteration": 2.712661027908325 + }, + { + "auxiliary_loss_clip": 0.01111249, + "auxiliary_loss_mlp": 0.01031115, + "balance_loss_clip": 1.03799295, + "balance_loss_mlp": 1.01998651, + "epoch": 0.7282729595671126, + "flos": 25352870274720.0, + "grad_norm": 2.1581818958410146, + "language_loss": 0.67374557, + "learning_rate": 7.25680245639237e-07, + "loss": 0.69516921, + "num_input_tokens_seen": 261477175, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11126709, + "step": 12113, + "time_per_iteration": 2.6566686630249023 + }, + { + "auxiliary_loss_clip": 0.0111406, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.03801024, + "balance_loss_mlp": 1.02220654, + "epoch": 0.7283330828197806, + "flos": 19920034671840.0, + "grad_norm": 1.8934844925151266, + "language_loss": 0.73580968, + "learning_rate": 7.253800993542399e-07, + "loss": 0.75729716, + "num_input_tokens_seen": 261494990, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12487793, + "step": 12114, + "time_per_iteration": 2.6438775062561035 + }, + { + "auxiliary_loss_clip": 0.01113571, + "auxiliary_loss_mlp": 0.01030735, + "balance_loss_clip": 1.03970122, + "balance_loss_mlp": 1.018188, + "epoch": 0.7283932060724485, + "flos": 33544133748000.0, + "grad_norm": 2.9936602997130755, + "language_loss": 0.67937702, + "learning_rate": 7.250800014029564e-07, + "loss": 0.70082009, + "num_input_tokens_seen": 261514445, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12554932, + "step": 12115, + "time_per_iteration": 2.7263364791870117 + }, + { + "auxiliary_loss_clip": 0.0111593, + "auxiliary_loss_mlp": 0.01030034, + "balance_loss_clip": 1.03955257, + "balance_loss_mlp": 1.01776195, + "epoch": 0.7284533293251165, + "flos": 22412464184160.0, + "grad_norm": 3.24729659989504, + "language_loss": 0.60103643, + "learning_rate": 7.247799517967674e-07, + "loss": 0.62249613, + "num_input_tokens_seen": 261533565, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12255859, + "step": 12116, + "time_per_iteration": 2.705674171447754 + }, + { + "auxiliary_loss_clip": 0.0111494, + "auxiliary_loss_mlp": 0.01032168, + "balance_loss_clip": 1.04030299, + "balance_loss_mlp": 1.01990151, + "epoch": 0.7285134525777844, + "flos": 26509635577440.0, + "grad_norm": 1.9133470630856007, + "language_loss": 0.73030663, + "learning_rate": 7.2447995054705e-07, + "loss": 0.75177777, + "num_input_tokens_seen": 261553795, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.1227417, + "step": 12117, + "time_per_iteration": 2.7002601623535156 + }, + { + "auxiliary_loss_clip": 0.01114725, + "auxiliary_loss_mlp": 0.01030758, + "balance_loss_clip": 1.0405283, + "balance_loss_mlp": 1.01874161, + "epoch": 0.7285735758304525, + "flos": 25308591238080.0, + "grad_norm": 2.7435035577705955, + "language_loss": 0.69695067, + "learning_rate": 7.241799976651807e-07, + "loss": 0.71840549, + "num_input_tokens_seen": 261572565, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12030029, + "step": 12118, + "time_per_iteration": 2.734851598739624 + }, + { + "auxiliary_loss_clip": 0.01110901, + "auxiliary_loss_mlp": 0.0103192, + "balance_loss_clip": 1.03978813, + "balance_loss_mlp": 1.02048147, + "epoch": 0.7286336990831204, + "flos": 21122618667840.0, + "grad_norm": 1.653450932252771, + "language_loss": 0.84421009, + "learning_rate": 7.238800931625346e-07, + "loss": 0.86563832, + "num_input_tokens_seen": 261590910, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11437988, + "step": 12119, + "time_per_iteration": 2.719120502471924 + }, + { + "auxiliary_loss_clip": 0.01115347, + "auxiliary_loss_mlp": 0.01026672, + "balance_loss_clip": 1.04001451, + "balance_loss_mlp": 1.01503074, + "epoch": 0.7286938223357884, + "flos": 24144978515040.0, + "grad_norm": 2.8038600057837724, + "language_loss": 0.81893045, + "learning_rate": 7.235802370504831e-07, + "loss": 0.84035063, + "num_input_tokens_seen": 261606005, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11651611, + "step": 12120, + "time_per_iteration": 2.622408151626587 + }, + { + "auxiliary_loss_clip": 0.01117498, + "auxiliary_loss_mlp": 0.01041907, + "balance_loss_clip": 1.04198384, + "balance_loss_mlp": 1.02980673, + "epoch": 0.7287539455884563, + "flos": 18718544642400.0, + "grad_norm": 2.4962419699075946, + "language_loss": 0.78902447, + "learning_rate": 7.232804293403963e-07, + "loss": 0.81061846, + "num_input_tokens_seen": 261622305, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12109375, + "step": 12121, + "time_per_iteration": 2.7501907348632812 + }, + { + "auxiliary_loss_clip": 0.01116307, + "auxiliary_loss_mlp": 0.01032571, + "balance_loss_clip": 1.03834414, + "balance_loss_mlp": 1.02026236, + "epoch": 0.7288140688411243, + "flos": 30740211322560.0, + "grad_norm": 10.674685992323399, + "language_loss": 0.69312692, + "learning_rate": 7.229806700436441e-07, + "loss": 0.7146157, + "num_input_tokens_seen": 261642465, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.12310791, + "step": 12122, + "time_per_iteration": 2.678591728210449 + }, + { + "auxiliary_loss_clip": 0.01110203, + "auxiliary_loss_mlp": 0.01032275, + "balance_loss_clip": 1.0378499, + "balance_loss_mlp": 1.02123666, + "epoch": 0.7288741920937922, + "flos": 29266118618400.0, + "grad_norm": 1.9588992184457266, + "language_loss": 0.86793953, + "learning_rate": 7.226809591715923e-07, + "loss": 0.8893643, + "num_input_tokens_seen": 261661420, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11047363, + "step": 12123, + "time_per_iteration": 2.699941635131836 + }, + { + "auxiliary_loss_clip": 0.01112641, + "auxiliary_loss_mlp": 0.01033576, + "balance_loss_clip": 1.03894305, + "balance_loss_mlp": 1.02220368, + "epoch": 0.7289343153464602, + "flos": 27753783952320.0, + "grad_norm": 2.7897415952334845, + "language_loss": 0.82801163, + "learning_rate": 7.223812967356065e-07, + "loss": 0.84947383, + "num_input_tokens_seen": 261680865, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11376953, + "step": 12124, + "time_per_iteration": 2.630506753921509 + }, + { + "auxiliary_loss_clip": 0.011123, + "auxiliary_loss_mlp": 0.01029927, + "balance_loss_clip": 1.03996992, + "balance_loss_mlp": 1.01870322, + "epoch": 0.7289944385991282, + "flos": 30383385923520.0, + "grad_norm": 1.8295878383188164, + "language_loss": 0.66662943, + "learning_rate": 7.220816827470499e-07, + "loss": 0.6880517, + "num_input_tokens_seen": 261701455, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11224365, + "step": 12125, + "time_per_iteration": 2.7342703342437744 + }, + { + "auxiliary_loss_clip": 0.01118548, + "auxiliary_loss_mlp": 0.01040864, + "balance_loss_clip": 1.0413096, + "balance_loss_mlp": 1.02822816, + "epoch": 0.7290545618517962, + "flos": 28024563349440.0, + "grad_norm": 1.9522886435388167, + "language_loss": 0.75251722, + "learning_rate": 7.217821172172855e-07, + "loss": 0.77411139, + "num_input_tokens_seen": 261721260, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12646484, + "step": 12126, + "time_per_iteration": 2.6372315883636475 + }, + { + "auxiliary_loss_clip": 0.01033498, + "auxiliary_loss_mlp": 0.01002096, + "balance_loss_clip": 1.0103718, + "balance_loss_mlp": 1.00105762, + "epoch": 0.7291146851044642, + "flos": 75531638250720.0, + "grad_norm": 0.8359787968447359, + "language_loss": 0.58704734, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60740334, + "num_input_tokens_seen": 261779370, + "router_z_loss_clip": 0.23156738, + "router_z_loss_mlp": 0.01039124, + "step": 12127, + "time_per_iteration": 3.241529703140259 + }, + { + "auxiliary_loss_clip": 0.01113835, + "auxiliary_loss_mlp": 0.01027022, + "balance_loss_clip": 1.04153109, + "balance_loss_mlp": 1.01622725, + "epoch": 0.7291748083571321, + "flos": 28468690786080.0, + "grad_norm": 3.720623926530914, + "language_loss": 0.68476427, + "learning_rate": 7.21183131579562e-07, + "loss": 0.70617282, + "num_input_tokens_seen": 261798050, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10797119, + "step": 12128, + "time_per_iteration": 2.671797513961792 + }, + { + "auxiliary_loss_clip": 0.01115791, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.04099274, + "balance_loss_mlp": 1.02126837, + "epoch": 0.7292349316098001, + "flos": 34568872286400.0, + "grad_norm": 2.3230357704041134, + "language_loss": 0.66177088, + "learning_rate": 7.20883711494319e-07, + "loss": 0.6832599, + "num_input_tokens_seen": 261817660, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.1184082, + "step": 12129, + "time_per_iteration": 2.682814836502075 + }, + { + "auxiliary_loss_clip": 0.0111173, + "auxiliary_loss_mlp": 0.01024759, + "balance_loss_clip": 1.03964126, + "balance_loss_mlp": 1.01257586, + "epoch": 0.729295054862468, + "flos": 29446597699200.0, + "grad_norm": 3.479924408526785, + "language_loss": 0.74334586, + "learning_rate": 7.205843399132927e-07, + "loss": 0.76471078, + "num_input_tokens_seen": 261837935, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12176514, + "step": 12130, + "time_per_iteration": 2.6829750537872314 + }, + { + "auxiliary_loss_clip": 0.01113717, + "auxiliary_loss_mlp": 0.01032078, + "balance_loss_clip": 1.03990436, + "balance_loss_mlp": 1.01997793, + "epoch": 0.7293551781151361, + "flos": 27840964438080.0, + "grad_norm": 1.7142959579403434, + "language_loss": 0.69407171, + "learning_rate": 7.202850168478374e-07, + "loss": 0.71552962, + "num_input_tokens_seen": 261857575, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12097168, + "step": 12131, + "time_per_iteration": 4.067334175109863 + }, + { + "auxiliary_loss_clip": 0.01113838, + "auxiliary_loss_mlp": 0.01031544, + "balance_loss_clip": 1.04052138, + "balance_loss_mlp": 1.02030802, + "epoch": 0.729415301367804, + "flos": 26999176534560.0, + "grad_norm": 1.822667158507255, + "language_loss": 0.77323723, + "learning_rate": 7.199857423093025e-07, + "loss": 0.79469097, + "num_input_tokens_seen": 261877265, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11242676, + "step": 12132, + "time_per_iteration": 2.6979877948760986 + }, + { + "auxiliary_loss_clip": 0.01114409, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.04030323, + "balance_loss_mlp": 1.02623236, + "epoch": 0.729475424620472, + "flos": 15068336895360.0, + "grad_norm": 2.6326504482244224, + "language_loss": 0.79540706, + "learning_rate": 7.196865163090358e-07, + "loss": 0.81692678, + "num_input_tokens_seen": 261893695, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11315918, + "step": 12133, + "time_per_iteration": 3.9238638877868652 + }, + { + "auxiliary_loss_clip": 0.01113767, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.03927064, + "balance_loss_mlp": 1.01807904, + "epoch": 0.7295355478731399, + "flos": 27083318224320.0, + "grad_norm": 2.385809612149503, + "language_loss": 0.72252822, + "learning_rate": 7.193873388583846e-07, + "loss": 0.74396515, + "num_input_tokens_seen": 261911825, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11853027, + "step": 12134, + "time_per_iteration": 2.619098663330078 + }, + { + "auxiliary_loss_clip": 0.01116707, + "auxiliary_loss_mlp": 0.01038678, + "balance_loss_clip": 1.04205918, + "balance_loss_mlp": 1.02648926, + "epoch": 0.7295956711258079, + "flos": 28335732124320.0, + "grad_norm": 1.8029075074517982, + "language_loss": 0.71845728, + "learning_rate": 7.190882099686939e-07, + "loss": 0.7400111, + "num_input_tokens_seen": 261931190, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.1217041, + "step": 12135, + "time_per_iteration": 2.6871938705444336 + }, + { + "auxiliary_loss_clip": 0.01115976, + "auxiliary_loss_mlp": 0.01037827, + "balance_loss_clip": 1.04004157, + "balance_loss_mlp": 1.02597165, + "epoch": 0.7296557943784758, + "flos": 38887560414720.0, + "grad_norm": 2.06229199286962, + "language_loss": 0.62244308, + "learning_rate": 7.187891296513075e-07, + "loss": 0.6439811, + "num_input_tokens_seen": 261951240, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11853027, + "step": 12136, + "time_per_iteration": 2.7177014350891113 + }, + { + "auxiliary_loss_clip": 0.01113128, + "auxiliary_loss_mlp": 0.01038325, + "balance_loss_clip": 1.03923428, + "balance_loss_mlp": 1.02674437, + "epoch": 0.7297159176311439, + "flos": 32520894348960.0, + "grad_norm": 2.351304489860458, + "language_loss": 0.74606538, + "learning_rate": 7.184900979175654e-07, + "loss": 0.76757991, + "num_input_tokens_seen": 261971605, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11584473, + "step": 12137, + "time_per_iteration": 2.6946234703063965 + }, + { + "auxiliary_loss_clip": 0.01117603, + "auxiliary_loss_mlp": 0.01035229, + "balance_loss_clip": 1.04252648, + "balance_loss_mlp": 1.02320635, + "epoch": 0.7297760408838118, + "flos": 30200192184960.0, + "grad_norm": 2.2261584580091265, + "language_loss": 0.74610674, + "learning_rate": 7.181911147788069e-07, + "loss": 0.76763511, + "num_input_tokens_seen": 261990830, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12036133, + "step": 12138, + "time_per_iteration": 2.653618097305298 + }, + { + "auxiliary_loss_clip": 0.01112096, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.03846955, + "balance_loss_mlp": 1.01916718, + "epoch": 0.7298361641364798, + "flos": 22053774990240.0, + "grad_norm": 2.310978719641398, + "language_loss": 0.72025806, + "learning_rate": 7.178921802463702e-07, + "loss": 0.74167907, + "num_input_tokens_seen": 262008190, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.10827637, + "step": 12139, + "time_per_iteration": 2.6229746341705322 + }, + { + "auxiliary_loss_clip": 0.01110711, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_clip": 1.04013097, + "balance_loss_mlp": 1.01800919, + "epoch": 0.7298962873891478, + "flos": 36479637247680.0, + "grad_norm": 1.8236739569086036, + "language_loss": 0.73705459, + "learning_rate": 7.175932943315898e-07, + "loss": 0.75844741, + "num_input_tokens_seen": 262030460, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10565186, + "step": 12140, + "time_per_iteration": 4.166077375411987 + }, + { + "auxiliary_loss_clip": 0.01115683, + "auxiliary_loss_mlp": 0.01031596, + "balance_loss_clip": 1.04076672, + "balance_loss_mlp": 1.0191263, + "epoch": 0.7299564106418157, + "flos": 39372401367360.0, + "grad_norm": 1.9347588369616402, + "language_loss": 0.55368942, + "learning_rate": 7.172944570458003e-07, + "loss": 0.57516217, + "num_input_tokens_seen": 262050830, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12469482, + "step": 12141, + "time_per_iteration": 2.710703134536743 + }, + { + "auxiliary_loss_clip": 0.01112534, + "auxiliary_loss_mlp": 0.01025747, + "balance_loss_clip": 1.04052079, + "balance_loss_mlp": 1.01485133, + "epoch": 0.7300165338944837, + "flos": 27979717070880.0, + "grad_norm": 1.9541504195053436, + "language_loss": 0.72664618, + "learning_rate": 7.169956684003342e-07, + "loss": 0.74802899, + "num_input_tokens_seen": 262071245, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10888672, + "step": 12142, + "time_per_iteration": 4.027556896209717 + }, + { + "auxiliary_loss_clip": 0.01114036, + "auxiliary_loss_mlp": 0.01031071, + "balance_loss_clip": 1.04083633, + "balance_loss_mlp": 1.01997232, + "epoch": 0.7300766571471516, + "flos": 24195051522720.0, + "grad_norm": 2.004145515716794, + "language_loss": 0.73669696, + "learning_rate": 7.16696928406521e-07, + "loss": 0.75814801, + "num_input_tokens_seen": 262087525, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11096191, + "step": 12143, + "time_per_iteration": 2.711561679840088 + }, + { + "auxiliary_loss_clip": 0.01114317, + "auxiliary_loss_mlp": 0.01031554, + "balance_loss_clip": 1.04037082, + "balance_loss_mlp": 1.01940703, + "epoch": 0.7301367803998197, + "flos": 29710002951360.0, + "grad_norm": 3.9708145455063564, + "language_loss": 0.66654319, + "learning_rate": 7.163982370756882e-07, + "loss": 0.68800187, + "num_input_tokens_seen": 262107355, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.121521, + "step": 12144, + "time_per_iteration": 2.6553077697753906 + }, + { + "auxiliary_loss_clip": 0.01113246, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.03945422, + "balance_loss_mlp": 1.01887691, + "epoch": 0.7301969036524876, + "flos": 18674184571200.0, + "grad_norm": 1.8399984324668364, + "language_loss": 0.79357767, + "learning_rate": 7.160995944191627e-07, + "loss": 0.81502187, + "num_input_tokens_seen": 262125645, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12298584, + "step": 12145, + "time_per_iteration": 2.6382598876953125 + }, + { + "auxiliary_loss_clip": 0.01114663, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.04225683, + "balance_loss_mlp": 1.02259886, + "epoch": 0.7302570269051556, + "flos": 28684413550080.0, + "grad_norm": 2.0116537111915616, + "language_loss": 0.91338718, + "learning_rate": 7.158010004482702e-07, + "loss": 0.93487889, + "num_input_tokens_seen": 262144075, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11920166, + "step": 12146, + "time_per_iteration": 2.6574907302856445 + }, + { + "auxiliary_loss_clip": 0.01111435, + "auxiliary_loss_mlp": 0.01027405, + "balance_loss_clip": 1.040627, + "balance_loss_mlp": 1.0160085, + "epoch": 0.7303171501578235, + "flos": 25042147189920.0, + "grad_norm": 2.0789556992178073, + "language_loss": 0.62318331, + "learning_rate": 7.155024551743316e-07, + "loss": 0.64457178, + "num_input_tokens_seen": 262165940, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11407471, + "step": 12147, + "time_per_iteration": 2.735765218734741 + }, + { + "auxiliary_loss_clip": 0.01117594, + "auxiliary_loss_mlp": 0.01035166, + "balance_loss_clip": 1.04264355, + "balance_loss_mlp": 1.02349544, + "epoch": 0.7303772734104915, + "flos": 22369319631360.0, + "grad_norm": 2.3191110287605197, + "language_loss": 0.75812382, + "learning_rate": 7.152039586086693e-07, + "loss": 0.7796514, + "num_input_tokens_seen": 262184520, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11682129, + "step": 12148, + "time_per_iteration": 2.6210641860961914 + }, + { + "auxiliary_loss_clip": 0.01034184, + "auxiliary_loss_mlp": 0.01001566, + "balance_loss_clip": 1.01083386, + "balance_loss_mlp": 1.00052691, + "epoch": 0.7304373966631594, + "flos": 74010835473120.0, + "grad_norm": 0.6810265392822898, + "language_loss": 0.56682819, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58718568, + "num_input_tokens_seen": 262247070, + "router_z_loss_clip": 0.23376465, + "router_z_loss_mlp": 0.01039886, + "step": 12149, + "time_per_iteration": 3.259645462036133 + }, + { + "auxiliary_loss_clip": 0.0111454, + "auxiliary_loss_mlp": 0.01032077, + "balance_loss_clip": 1.03879404, + "balance_loss_mlp": 1.01991153, + "epoch": 0.7304975199158275, + "flos": 24194241177120.0, + "grad_norm": 1.724905449714447, + "language_loss": 0.74034345, + "learning_rate": 7.146071116474451e-07, + "loss": 0.76180971, + "num_input_tokens_seen": 262266605, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.1217041, + "step": 12150, + "time_per_iteration": 2.7516086101531982 + }, + { + "auxiliary_loss_clip": 0.01117569, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.04123998, + "balance_loss_mlp": 1.01942587, + "epoch": 0.7305576431684954, + "flos": 16135450158240.0, + "grad_norm": 50.207474302524524, + "language_loss": 0.84222716, + "learning_rate": 7.143087612745158e-07, + "loss": 0.86372042, + "num_input_tokens_seen": 262283880, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12322998, + "step": 12151, + "time_per_iteration": 2.6168594360351562 + }, + { + "auxiliary_loss_clip": 0.01116511, + "auxiliary_loss_mlp": 0.01035118, + "balance_loss_clip": 1.0413835, + "balance_loss_mlp": 1.02268493, + "epoch": 0.7306177664211634, + "flos": 29358931006080.0, + "grad_norm": 1.982714259520574, + "language_loss": 0.77659059, + "learning_rate": 7.14010459655127e-07, + "loss": 0.79810685, + "num_input_tokens_seen": 262304155, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12432861, + "step": 12152, + "time_per_iteration": 2.689246654510498 + }, + { + "auxiliary_loss_clip": 0.01115906, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.04217434, + "balance_loss_mlp": 1.01794684, + "epoch": 0.7306778896738314, + "flos": 33054673825440.0, + "grad_norm": 1.664846894458588, + "language_loss": 0.79142451, + "learning_rate": 7.137122068005919e-07, + "loss": 0.81288493, + "num_input_tokens_seen": 262325660, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12200928, + "step": 12153, + "time_per_iteration": 2.6894781589508057 + }, + { + "auxiliary_loss_clip": 0.01117079, + "auxiliary_loss_mlp": 0.01033927, + "balance_loss_clip": 1.04037333, + "balance_loss_mlp": 1.02195275, + "epoch": 0.7307380129264993, + "flos": 20365701765120.0, + "grad_norm": 1.6433367896654938, + "language_loss": 0.67545646, + "learning_rate": 7.134140027222173e-07, + "loss": 0.69696653, + "num_input_tokens_seen": 262344075, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.11968994, + "step": 12154, + "time_per_iteration": 2.7146646976470947 + }, + { + "auxiliary_loss_clip": 0.01116239, + "auxiliary_loss_mlp": 0.01032603, + "balance_loss_clip": 1.04089189, + "balance_loss_mlp": 1.0205096, + "epoch": 0.7307981361791673, + "flos": 26509878681120.0, + "grad_norm": 1.7045234495782637, + "language_loss": 0.66276717, + "learning_rate": 7.131158474313128e-07, + "loss": 0.6842556, + "num_input_tokens_seen": 262363305, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12103271, + "step": 12155, + "time_per_iteration": 2.6264491081237793 + }, + { + "auxiliary_loss_clip": 0.01110734, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.03775573, + "balance_loss_mlp": 1.01935792, + "epoch": 0.7308582594318352, + "flos": 22013061474240.0, + "grad_norm": 8.53386512625685, + "language_loss": 0.81905246, + "learning_rate": 7.128177409391851e-07, + "loss": 0.84047449, + "num_input_tokens_seen": 262380730, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12103271, + "step": 12156, + "time_per_iteration": 2.6354880332946777 + }, + { + "auxiliary_loss_clip": 0.0111256, + "auxiliary_loss_mlp": 0.01029534, + "balance_loss_clip": 1.0404067, + "balance_loss_mlp": 1.01865602, + "epoch": 0.7309183826845033, + "flos": 16893136889280.0, + "grad_norm": 2.1992414177870963, + "language_loss": 0.7498256, + "learning_rate": 7.125196832571367e-07, + "loss": 0.77124655, + "num_input_tokens_seen": 262395480, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10870361, + "step": 12157, + "time_per_iteration": 2.678236246109009 + }, + { + "auxiliary_loss_clip": 0.01109536, + "auxiliary_loss_mlp": 0.01030585, + "balance_loss_clip": 1.03894877, + "balance_loss_mlp": 1.01971889, + "epoch": 0.7309785059371712, + "flos": 20766603614400.0, + "grad_norm": 2.097168483743878, + "language_loss": 0.72691429, + "learning_rate": 7.122216743964713e-07, + "loss": 0.74831557, + "num_input_tokens_seen": 262413340, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10858154, + "step": 12158, + "time_per_iteration": 2.668728828430176 + }, + { + "auxiliary_loss_clip": 0.01116462, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.0417937, + "balance_loss_mlp": 1.02205229, + "epoch": 0.7310386291898392, + "flos": 32338997163360.0, + "grad_norm": 1.589672900794535, + "language_loss": 0.85741544, + "learning_rate": 7.119237143684896e-07, + "loss": 0.8789171, + "num_input_tokens_seen": 262433455, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11645508, + "step": 12159, + "time_per_iteration": 2.676701307296753 + }, + { + "auxiliary_loss_clip": 0.01118898, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.04127002, + "balance_loss_mlp": 1.02044463, + "epoch": 0.7310987524425071, + "flos": 20677194678240.0, + "grad_norm": 2.0843137512687, + "language_loss": 0.73765945, + "learning_rate": 7.116258031844895e-07, + "loss": 0.75917709, + "num_input_tokens_seen": 262450335, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12420654, + "step": 12160, + "time_per_iteration": 2.6827456951141357 + }, + { + "auxiliary_loss_clip": 0.01118978, + "auxiliary_loss_mlp": 0.01033943, + "balance_loss_clip": 1.04220104, + "balance_loss_mlp": 1.02159286, + "epoch": 0.7311588756951751, + "flos": 16893501544800.0, + "grad_norm": 2.0887366033974333, + "language_loss": 0.72872037, + "learning_rate": 7.113279408557675e-07, + "loss": 0.75024962, + "num_input_tokens_seen": 262468240, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12353516, + "step": 12161, + "time_per_iteration": 2.6287178993225098 + }, + { + "auxiliary_loss_clip": 0.0112102, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.04225659, + "balance_loss_mlp": 1.01683354, + "epoch": 0.731218998947843, + "flos": 35103219004800.0, + "grad_norm": 1.79904300569442, + "language_loss": 0.69432896, + "learning_rate": 7.110301273936192e-07, + "loss": 0.71583986, + "num_input_tokens_seen": 262487045, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13232422, + "step": 12162, + "time_per_iteration": 2.7399888038635254 + }, + { + "auxiliary_loss_clip": 0.01118266, + "auxiliary_loss_mlp": 0.01030529, + "balance_loss_clip": 1.04185784, + "balance_loss_mlp": 1.01820898, + "epoch": 0.7312791222005111, + "flos": 33054309169920.0, + "grad_norm": 1.7230258576154935, + "language_loss": 0.67147493, + "learning_rate": 7.107323628093382e-07, + "loss": 0.69296288, + "num_input_tokens_seen": 262504855, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12322998, + "step": 12163, + "time_per_iteration": 2.7067644596099854 + }, + { + "auxiliary_loss_clip": 0.01112984, + "auxiliary_loss_mlp": 0.01031599, + "balance_loss_clip": 1.03805518, + "balance_loss_mlp": 1.01921916, + "epoch": 0.731339245453179, + "flos": 25531485560640.0, + "grad_norm": 1.5646259149315467, + "language_loss": 0.68645376, + "learning_rate": 7.104346471142153e-07, + "loss": 0.70789957, + "num_input_tokens_seen": 262524920, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.1239624, + "step": 12164, + "time_per_iteration": 2.6564598083496094 + }, + { + "auxiliary_loss_clip": 0.01114242, + "auxiliary_loss_mlp": 0.01033198, + "balance_loss_clip": 1.0431776, + "balance_loss_mlp": 1.02200985, + "epoch": 0.731399368705847, + "flos": 28157197872960.0, + "grad_norm": 2.5698595587529196, + "language_loss": 0.73000985, + "learning_rate": 7.101369803195391e-07, + "loss": 0.75148427, + "num_input_tokens_seen": 262545725, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11181641, + "step": 12165, + "time_per_iteration": 2.7126243114471436 + }, + { + "auxiliary_loss_clip": 0.01115682, + "auxiliary_loss_mlp": 0.01034722, + "balance_loss_clip": 1.04041314, + "balance_loss_mlp": 1.02256227, + "epoch": 0.731459491958515, + "flos": 28776213005760.0, + "grad_norm": 1.9929648001007962, + "language_loss": 0.76713848, + "learning_rate": 7.098393624365988e-07, + "loss": 0.78864253, + "num_input_tokens_seen": 262565480, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12164307, + "step": 12166, + "time_per_iteration": 2.6448709964752197 + }, + { + "auxiliary_loss_clip": 0.011136, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.04088783, + "balance_loss_mlp": 1.02087617, + "epoch": 0.7315196152111829, + "flos": 27307671168960.0, + "grad_norm": 2.0048742074426436, + "language_loss": 0.79823005, + "learning_rate": 7.095417934766781e-07, + "loss": 0.81969422, + "num_input_tokens_seen": 262584145, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.1194458, + "step": 12167, + "time_per_iteration": 2.663043975830078 + }, + { + "auxiliary_loss_clip": 0.01113537, + "auxiliary_loss_mlp": 0.01039924, + "balance_loss_clip": 1.0404917, + "balance_loss_mlp": 1.02822316, + "epoch": 0.7315797384638509, + "flos": 31941701352000.0, + "grad_norm": 3.5411282111330826, + "language_loss": 0.76462531, + "learning_rate": 7.092442734510622e-07, + "loss": 0.78615987, + "num_input_tokens_seen": 262604045, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11694336, + "step": 12168, + "time_per_iteration": 2.6610910892486572 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.01038972, + "balance_loss_clip": 1.03953433, + "balance_loss_mlp": 1.02631736, + "epoch": 0.7316398617165188, + "flos": 26242178597280.0, + "grad_norm": 1.5871887798380546, + "language_loss": 0.81571352, + "learning_rate": 7.089468023710326e-07, + "loss": 0.83725262, + "num_input_tokens_seen": 262624540, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12652588, + "step": 12169, + "time_per_iteration": 2.82561993598938 + }, + { + "auxiliary_loss_clip": 0.01117335, + "auxiliary_loss_mlp": 0.0104004, + "balance_loss_clip": 1.0414772, + "balance_loss_mlp": 1.02839875, + "epoch": 0.7316999849691869, + "flos": 37195597530720.0, + "grad_norm": 2.4240704538579267, + "language_loss": 0.7019515, + "learning_rate": 7.08649380247871e-07, + "loss": 0.72352523, + "num_input_tokens_seen": 262644545, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11651611, + "step": 12170, + "time_per_iteration": 4.272628545761108 + }, + { + "auxiliary_loss_clip": 0.01112914, + "auxiliary_loss_mlp": 0.01030394, + "balance_loss_clip": 1.0392766, + "balance_loss_mlp": 1.01761448, + "epoch": 0.7317601082218548, + "flos": 26287794704160.0, + "grad_norm": 2.252311597088038, + "language_loss": 0.70044327, + "learning_rate": 7.083520070928533e-07, + "loss": 0.72187638, + "num_input_tokens_seen": 262662570, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12780762, + "step": 12171, + "time_per_iteration": 2.646238327026367 + }, + { + "auxiliary_loss_clip": 0.01115808, + "auxiliary_loss_mlp": 0.0103475, + "balance_loss_clip": 1.04184127, + "balance_loss_mlp": 1.02272797, + "epoch": 0.7318202314745228, + "flos": 40574944846080.0, + "grad_norm": 1.901543721404074, + "language_loss": 0.65656835, + "learning_rate": 7.080546829172564e-07, + "loss": 0.678074, + "num_input_tokens_seen": 262683245, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.12017822, + "step": 12172, + "time_per_iteration": 2.788597345352173 + }, + { + "auxiliary_loss_clip": 0.01116606, + "auxiliary_loss_mlp": 0.01025597, + "balance_loss_clip": 1.041628, + "balance_loss_mlp": 1.01347315, + "epoch": 0.7318803547271907, + "flos": 24595629233760.0, + "grad_norm": 2.855075603749359, + "language_loss": 0.61443901, + "learning_rate": 7.077574077323564e-07, + "loss": 0.63586098, + "num_input_tokens_seen": 262701585, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12133789, + "step": 12173, + "time_per_iteration": 3.929452896118164 + }, + { + "auxiliary_loss_clip": 0.01113543, + "auxiliary_loss_mlp": 0.01025759, + "balance_loss_clip": 1.04010582, + "balance_loss_mlp": 1.01432061, + "epoch": 0.7319404779798587, + "flos": 25085534846400.0, + "grad_norm": 3.3635375427481162, + "language_loss": 0.74234045, + "learning_rate": 7.074601815494243e-07, + "loss": 0.76373351, + "num_input_tokens_seen": 262719295, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11444092, + "step": 12174, + "time_per_iteration": 2.6958231925964355 + }, + { + "auxiliary_loss_clip": 0.01112478, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_clip": 1.04033613, + "balance_loss_mlp": 1.01563954, + "epoch": 0.7320006012325266, + "flos": 35014701448800.0, + "grad_norm": 1.6921833824532457, + "language_loss": 0.80691254, + "learning_rate": 7.071630043797317e-07, + "loss": 0.82830334, + "num_input_tokens_seen": 262739995, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10968018, + "step": 12175, + "time_per_iteration": 2.737680673599243 + }, + { + "auxiliary_loss_clip": 0.01115193, + "auxiliary_loss_mlp": 0.01027557, + "balance_loss_clip": 1.04007268, + "balance_loss_mlp": 1.01603496, + "epoch": 0.7320607244851947, + "flos": 19965448192320.0, + "grad_norm": 1.9994210449392336, + "language_loss": 0.76684821, + "learning_rate": 7.068658762345488e-07, + "loss": 0.78827572, + "num_input_tokens_seen": 262757680, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11529541, + "step": 12176, + "time_per_iteration": 2.6063826084136963 + }, + { + "auxiliary_loss_clip": 0.01114379, + "auxiliary_loss_mlp": 0.01028827, + "balance_loss_clip": 1.0418967, + "balance_loss_mlp": 1.0174427, + "epoch": 0.7321208477378626, + "flos": 25569889591680.0, + "grad_norm": 1.677682972862331, + "language_loss": 0.76664007, + "learning_rate": 7.065687971251399e-07, + "loss": 0.78807211, + "num_input_tokens_seen": 262776990, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11376953, + "step": 12177, + "time_per_iteration": 2.728106737136841 + }, + { + "auxiliary_loss_clip": 0.01111953, + "auxiliary_loss_mlp": 0.01035665, + "balance_loss_clip": 1.03811884, + "balance_loss_mlp": 1.02443528, + "epoch": 0.7321809709905306, + "flos": 16893258441120.0, + "grad_norm": 2.6070735257802204, + "language_loss": 0.74234271, + "learning_rate": 7.06271767062772e-07, + "loss": 0.76381898, + "num_input_tokens_seen": 262795440, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11230469, + "step": 12178, + "time_per_iteration": 2.587531805038452 + }, + { + "auxiliary_loss_clip": 0.01116589, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.04024243, + "balance_loss_mlp": 1.02203345, + "epoch": 0.7322410942431986, + "flos": 32921431542720.0, + "grad_norm": 2.2043473938258806, + "language_loss": 0.8204695, + "learning_rate": 7.059747860587084e-07, + "loss": 0.84197271, + "num_input_tokens_seen": 262816385, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11706543, + "step": 12179, + "time_per_iteration": 2.724454641342163 + }, + { + "auxiliary_loss_clip": 0.01109494, + "auxiliary_loss_mlp": 0.01029399, + "balance_loss_clip": 1.03983426, + "balance_loss_mlp": 1.01825285, + "epoch": 0.7323012174958665, + "flos": 21523763620800.0, + "grad_norm": 1.6853334534237898, + "language_loss": 0.74628812, + "learning_rate": 7.056778541242115e-07, + "loss": 0.76767707, + "num_input_tokens_seen": 262834955, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.1114502, + "step": 12180, + "time_per_iteration": 4.07778787612915 + }, + { + "auxiliary_loss_clip": 0.01114302, + "auxiliary_loss_mlp": 0.01030066, + "balance_loss_clip": 1.03693509, + "balance_loss_mlp": 1.01762104, + "epoch": 0.7323613407485345, + "flos": 39465618927840.0, + "grad_norm": 2.3877535320333294, + "language_loss": 0.79452395, + "learning_rate": 7.053809712705396e-07, + "loss": 0.81596762, + "num_input_tokens_seen": 262853555, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12438965, + "step": 12181, + "time_per_iteration": 4.0345776081085205 + }, + { + "auxiliary_loss_clip": 0.0111611, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.04111147, + "balance_loss_mlp": 1.0157963, + "epoch": 0.7324214640012024, + "flos": 22406710730400.0, + "grad_norm": 2.0147187931092607, + "language_loss": 0.71835625, + "learning_rate": 7.050841375089506e-07, + "loss": 0.73979402, + "num_input_tokens_seen": 262870975, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11889648, + "step": 12182, + "time_per_iteration": 2.680948257446289 + }, + { + "auxiliary_loss_clip": 0.01116833, + "auxiliary_loss_mlp": 0.01032812, + "balance_loss_clip": 1.04175615, + "balance_loss_mlp": 1.02153492, + "epoch": 0.7324815872538705, + "flos": 37597836450240.0, + "grad_norm": 1.9364029126645232, + "language_loss": 0.7108857, + "learning_rate": 7.047873528507015e-07, + "loss": 0.73238212, + "num_input_tokens_seen": 262892635, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.112854, + "step": 12183, + "time_per_iteration": 2.694080114364624 + }, + { + "auxiliary_loss_clip": 0.01118391, + "auxiliary_loss_mlp": 0.0103523, + "balance_loss_clip": 1.04234219, + "balance_loss_mlp": 1.02251053, + "epoch": 0.7325417105065384, + "flos": 26240760492480.0, + "grad_norm": 3.2207778386193797, + "language_loss": 0.72954458, + "learning_rate": 7.04490617307045e-07, + "loss": 0.75108075, + "num_input_tokens_seen": 262910725, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.1270752, + "step": 12184, + "time_per_iteration": 2.6579036712646484 + }, + { + "auxiliary_loss_clip": 0.01033186, + "auxiliary_loss_mlp": 0.01003412, + "balance_loss_clip": 1.00999141, + "balance_loss_mlp": 1.00240874, + "epoch": 0.7326018337592064, + "flos": 82068370456320.0, + "grad_norm": 0.7719391986237026, + "language_loss": 0.65137309, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67173904, + "num_input_tokens_seen": 262974150, + "router_z_loss_clip": 0.23205566, + "router_z_loss_mlp": 0.01004028, + "step": 12185, + "time_per_iteration": 3.279632806777954 + }, + { + "auxiliary_loss_clip": 0.01113809, + "auxiliary_loss_mlp": 0.01027269, + "balance_loss_clip": 1.03841829, + "balance_loss_mlp": 1.0146451, + "epoch": 0.7326619570118743, + "flos": 27890389169280.0, + "grad_norm": 2.0896837175390615, + "language_loss": 0.79885507, + "learning_rate": 7.038972936085197e-07, + "loss": 0.82026583, + "num_input_tokens_seen": 262993370, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12634277, + "step": 12186, + "time_per_iteration": 2.6238112449645996 + }, + { + "auxiliary_loss_clip": 0.01115608, + "auxiliary_loss_mlp": 0.01036072, + "balance_loss_clip": 1.03972292, + "balance_loss_mlp": 1.02286363, + "epoch": 0.7327220802645423, + "flos": 28464801127200.0, + "grad_norm": 1.7011165174581298, + "language_loss": 0.73628408, + "learning_rate": 7.036007054761508e-07, + "loss": 0.75780088, + "num_input_tokens_seen": 263012665, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.13208008, + "step": 12187, + "time_per_iteration": 2.662263870239258 + }, + { + "auxiliary_loss_clip": 0.01117652, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_clip": 1.04209208, + "balance_loss_mlp": 1.02596283, + "epoch": 0.7327822035172102, + "flos": 28285699633920.0, + "grad_norm": 1.7670126869393772, + "language_loss": 0.88635468, + "learning_rate": 7.033041665033716e-07, + "loss": 0.90790635, + "num_input_tokens_seen": 263031475, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11553955, + "step": 12188, + "time_per_iteration": 2.645766258239746 + }, + { + "auxiliary_loss_clip": 0.0111609, + "auxiliary_loss_mlp": 0.01034188, + "balance_loss_clip": 1.03959632, + "balance_loss_mlp": 1.0216887, + "epoch": 0.7328423267698783, + "flos": 25705765497600.0, + "grad_norm": 1.9597703708078698, + "language_loss": 0.74499369, + "learning_rate": 7.030076767014284e-07, + "loss": 0.76649648, + "num_input_tokens_seen": 263051445, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12506104, + "step": 12189, + "time_per_iteration": 2.7492411136627197 + }, + { + "auxiliary_loss_clip": 0.01115507, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.03997791, + "balance_loss_mlp": 1.01939774, + "epoch": 0.7329024500225462, + "flos": 26465721196320.0, + "grad_norm": 1.7694340084421691, + "language_loss": 0.822258, + "learning_rate": 7.027112360815648e-07, + "loss": 0.84373033, + "num_input_tokens_seen": 263070835, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12335205, + "step": 12190, + "time_per_iteration": 2.659656524658203 + }, + { + "auxiliary_loss_clip": 0.01117013, + "auxiliary_loss_mlp": 0.01034959, + "balance_loss_clip": 1.04175055, + "balance_loss_mlp": 1.02232289, + "epoch": 0.7329625732752142, + "flos": 29485852593120.0, + "grad_norm": 1.8652277671935351, + "language_loss": 0.71744341, + "learning_rate": 7.024148446550204e-07, + "loss": 0.73896313, + "num_input_tokens_seen": 263090070, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12652588, + "step": 12191, + "time_per_iteration": 2.6786673069000244 + }, + { + "auxiliary_loss_clip": 0.01116242, + "auxiliary_loss_mlp": 0.01032454, + "balance_loss_clip": 1.04049253, + "balance_loss_mlp": 1.02030706, + "epoch": 0.7330226965278822, + "flos": 36702126397440.0, + "grad_norm": 1.7936766674184745, + "language_loss": 0.69432342, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71581042, + "num_input_tokens_seen": 263110030, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12158203, + "step": 12192, + "time_per_iteration": 2.6792657375335693 + }, + { + "auxiliary_loss_clip": 0.01113782, + "auxiliary_loss_mlp": 0.01031685, + "balance_loss_clip": 1.04063272, + "balance_loss_mlp": 1.02012146, + "epoch": 0.7330828197805501, + "flos": 28514185341120.0, + "grad_norm": 1.9372995749578406, + "language_loss": 0.73399079, + "learning_rate": 7.01822209426848e-07, + "loss": 0.75544548, + "num_input_tokens_seen": 263129735, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11560059, + "step": 12193, + "time_per_iteration": 2.688865900039673 + }, + { + "auxiliary_loss_clip": 0.01114091, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.03836179, + "balance_loss_mlp": 1.01696444, + "epoch": 0.7331429430332181, + "flos": 25664282153280.0, + "grad_norm": 1.7293273804116505, + "language_loss": 0.77224946, + "learning_rate": 7.015259656476911e-07, + "loss": 0.79368329, + "num_input_tokens_seen": 263149100, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12316895, + "step": 12194, + "time_per_iteration": 2.7098238468170166 + }, + { + "auxiliary_loss_clip": 0.01114354, + "auxiliary_loss_mlp": 0.01030927, + "balance_loss_clip": 1.04141474, + "balance_loss_mlp": 1.0186187, + "epoch": 0.733203066285886, + "flos": 17872907597280.0, + "grad_norm": 2.2159187956122937, + "language_loss": 0.70367593, + "learning_rate": 7.012297711067998e-07, + "loss": 0.72512871, + "num_input_tokens_seen": 263166620, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12310791, + "step": 12195, + "time_per_iteration": 2.640582323074341 + }, + { + "auxiliary_loss_clip": 0.01114876, + "auxiliary_loss_mlp": 0.01040487, + "balance_loss_clip": 1.0395546, + "balance_loss_mlp": 1.02930498, + "epoch": 0.7332631895385541, + "flos": 20944854244800.0, + "grad_norm": 1.835215981697468, + "language_loss": 0.72348082, + "learning_rate": 7.009336258154057e-07, + "loss": 0.74503446, + "num_input_tokens_seen": 263184780, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11193848, + "step": 12196, + "time_per_iteration": 2.5936710834503174 + }, + { + "auxiliary_loss_clip": 0.01113682, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.04034364, + "balance_loss_mlp": 1.01847267, + "epoch": 0.733323312791222, + "flos": 34969328445600.0, + "grad_norm": 2.016582611122851, + "language_loss": 0.71557546, + "learning_rate": 7.006375297847394e-07, + "loss": 0.73701811, + "num_input_tokens_seen": 263204625, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12103271, + "step": 12197, + "time_per_iteration": 2.726616859436035 + }, + { + "auxiliary_loss_clip": 0.01118435, + "auxiliary_loss_mlp": 0.01038952, + "balance_loss_clip": 1.04086709, + "balance_loss_mlp": 1.02540374, + "epoch": 0.73338343604389, + "flos": 20277467830080.0, + "grad_norm": 3.599825974916206, + "language_loss": 0.77702475, + "learning_rate": 7.003414830260282e-07, + "loss": 0.79859859, + "num_input_tokens_seen": 263221565, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13549805, + "step": 12198, + "time_per_iteration": 2.6253533363342285 + }, + { + "auxiliary_loss_clip": 0.01113675, + "auxiliary_loss_mlp": 0.01030864, + "balance_loss_clip": 1.03982234, + "balance_loss_mlp": 1.01960516, + "epoch": 0.7334435592965579, + "flos": 25797645987840.0, + "grad_norm": 1.9166482940395149, + "language_loss": 0.74356264, + "learning_rate": 7.000454855504974e-07, + "loss": 0.76500803, + "num_input_tokens_seen": 263240620, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.1126709, + "step": 12199, + "time_per_iteration": 2.645327568054199 + }, + { + "auxiliary_loss_clip": 0.01119369, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.04292548, + "balance_loss_mlp": 1.02193332, + "epoch": 0.7335036825492259, + "flos": 20896401928320.0, + "grad_norm": 2.5282007917842417, + "language_loss": 0.76934671, + "learning_rate": 6.997495373693729e-07, + "loss": 0.79088593, + "num_input_tokens_seen": 263254365, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12615967, + "step": 12200, + "time_per_iteration": 2.6363680362701416 + }, + { + "auxiliary_loss_clip": 0.01114242, + "auxiliary_loss_mlp": 0.0103433, + "balance_loss_clip": 1.04025674, + "balance_loss_mlp": 1.02268875, + "epoch": 0.7335638058018938, + "flos": 28957907604960.0, + "grad_norm": 1.8442188255160317, + "language_loss": 0.61525857, + "learning_rate": 6.994536384938754e-07, + "loss": 0.63674426, + "num_input_tokens_seen": 263275880, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11639404, + "step": 12201, + "time_per_iteration": 2.6455609798431396 + }, + { + "auxiliary_loss_clip": 0.01113182, + "auxiliary_loss_mlp": 0.01027091, + "balance_loss_clip": 1.03987634, + "balance_loss_mlp": 1.01556337, + "epoch": 0.7336239290545619, + "flos": 42626771925120.0, + "grad_norm": 1.9642429230167169, + "language_loss": 0.51681632, + "learning_rate": 6.991577889352264e-07, + "loss": 0.53821903, + "num_input_tokens_seen": 263298315, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11529541, + "step": 12202, + "time_per_iteration": 2.7497379779815674 + }, + { + "auxiliary_loss_clip": 0.01113913, + "auxiliary_loss_mlp": 0.01027777, + "balance_loss_clip": 1.0401566, + "balance_loss_mlp": 1.01624393, + "epoch": 0.7336840523072298, + "flos": 25750247120640.0, + "grad_norm": 1.915153182711694, + "language_loss": 0.68381512, + "learning_rate": 6.98861988704645e-07, + "loss": 0.70523202, + "num_input_tokens_seen": 263318615, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11541748, + "step": 12203, + "time_per_iteration": 2.619542121887207 + }, + { + "auxiliary_loss_clip": 0.01117489, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.0392971, + "balance_loss_mlp": 1.02387238, + "epoch": 0.7337441755598978, + "flos": 29314895073120.0, + "grad_norm": 3.5966029432797275, + "language_loss": 0.66108346, + "learning_rate": 6.985662378133474e-07, + "loss": 0.68261945, + "num_input_tokens_seen": 263336705, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12231445, + "step": 12204, + "time_per_iteration": 2.7210073471069336 + }, + { + "auxiliary_loss_clip": 0.01112998, + "auxiliary_loss_mlp": 0.01033169, + "balance_loss_clip": 1.04063058, + "balance_loss_mlp": 1.0221777, + "epoch": 0.7338042988125658, + "flos": 27712503194400.0, + "grad_norm": 2.1204058046815972, + "language_loss": 0.7725234, + "learning_rate": 6.982705362725479e-07, + "loss": 0.79398501, + "num_input_tokens_seen": 263355065, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10998535, + "step": 12205, + "time_per_iteration": 2.6547582149505615 + }, + { + "auxiliary_loss_clip": 0.01114695, + "auxiliary_loss_mlp": 0.01030805, + "balance_loss_clip": 1.04212189, + "balance_loss_mlp": 1.01995087, + "epoch": 0.7338644220652337, + "flos": 26064778829760.0, + "grad_norm": 2.2239728273160964, + "language_loss": 0.79614866, + "learning_rate": 6.979748840934601e-07, + "loss": 0.81760359, + "num_input_tokens_seen": 263374460, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10852051, + "step": 12206, + "time_per_iteration": 2.665689706802368 + }, + { + "auxiliary_loss_clip": 0.01115774, + "auxiliary_loss_mlp": 0.0102811, + "balance_loss_clip": 1.04035807, + "balance_loss_mlp": 1.01593828, + "epoch": 0.7339245453179017, + "flos": 37729701145440.0, + "grad_norm": 4.96770419012985, + "language_loss": 0.71455657, + "learning_rate": 6.976792812872958e-07, + "loss": 0.73599541, + "num_input_tokens_seen": 263393610, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12164307, + "step": 12207, + "time_per_iteration": 2.704946756362915 + }, + { + "auxiliary_loss_clip": 0.01032357, + "auxiliary_loss_mlp": 0.01001437, + "balance_loss_clip": 1.00932527, + "balance_loss_mlp": 1.00039721, + "epoch": 0.7339846685705697, + "flos": 82852792241760.0, + "grad_norm": 0.7800987264554349, + "language_loss": 0.54755735, + "learning_rate": 6.97383727865263e-07, + "loss": 0.56789529, + "num_input_tokens_seen": 263450340, + "router_z_loss_clip": 0.23034668, + "router_z_loss_mlp": 0.01040649, + "step": 12208, + "time_per_iteration": 3.3294522762298584 + }, + { + "auxiliary_loss_clip": 0.01113581, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.0398047, + "balance_loss_mlp": 1.01918483, + "epoch": 0.7340447918232377, + "flos": 27133431749280.0, + "grad_norm": 1.7637538425153383, + "language_loss": 0.80422258, + "learning_rate": 6.970882238385703e-07, + "loss": 0.82565546, + "num_input_tokens_seen": 263471735, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.10516357, + "step": 12209, + "time_per_iteration": 2.6852505207061768 + }, + { + "auxiliary_loss_clip": 0.01111123, + "auxiliary_loss_mlp": 0.0102821, + "balance_loss_clip": 1.0381068, + "balance_loss_mlp": 1.01709414, + "epoch": 0.7341049150759056, + "flos": 28998459051840.0, + "grad_norm": 1.5767055228601958, + "language_loss": 0.79141784, + "learning_rate": 6.96792769218423e-07, + "loss": 0.8128112, + "num_input_tokens_seen": 263493245, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11120605, + "step": 12210, + "time_per_iteration": 4.129259824752808 + }, + { + "auxiliary_loss_clip": 0.01110661, + "auxiliary_loss_mlp": 0.01027874, + "balance_loss_clip": 1.03850126, + "balance_loss_mlp": 1.01633453, + "epoch": 0.7341650383285736, + "flos": 21031953696000.0, + "grad_norm": 1.7989345997562323, + "language_loss": 0.76131499, + "learning_rate": 6.964973640160236e-07, + "loss": 0.78270036, + "num_input_tokens_seen": 263511660, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11535645, + "step": 12211, + "time_per_iteration": 2.6417715549468994 + }, + { + "auxiliary_loss_clip": 0.01115499, + "auxiliary_loss_mlp": 0.01027421, + "balance_loss_clip": 1.04055405, + "balance_loss_mlp": 1.01548851, + "epoch": 0.7342251615812415, + "flos": 28557775584000.0, + "grad_norm": 2.0885693273529986, + "language_loss": 0.717251, + "learning_rate": 6.962020082425748e-07, + "loss": 0.73868018, + "num_input_tokens_seen": 263530875, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11938477, + "step": 12212, + "time_per_iteration": 2.6489367485046387 + }, + { + "auxiliary_loss_clip": 0.0111626, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.04191327, + "balance_loss_mlp": 1.01957631, + "epoch": 0.7342852848339095, + "flos": 27756336540960.0, + "grad_norm": 1.630424768832287, + "language_loss": 0.68913585, + "learning_rate": 6.959067019092766e-07, + "loss": 0.71061277, + "num_input_tokens_seen": 263551585, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11853027, + "step": 12213, + "time_per_iteration": 3.9819607734680176 + }, + { + "auxiliary_loss_clip": 0.01032772, + "auxiliary_loss_mlp": 0.01001016, + "balance_loss_clip": 1.0095408, + "balance_loss_mlp": 0.99994987, + "epoch": 0.7343454080865774, + "flos": 65819734068960.0, + "grad_norm": 0.7472078379217253, + "language_loss": 0.54234684, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56268471, + "num_input_tokens_seen": 263609545, + "router_z_loss_clip": 0.23254395, + "router_z_loss_mlp": 0.01067352, + "step": 12214, + "time_per_iteration": 3.1524200439453125 + }, + { + "auxiliary_loss_clip": 0.01116814, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.04039013, + "balance_loss_mlp": 1.01631212, + "epoch": 0.7344055313392455, + "flos": 14845361538240.0, + "grad_norm": 2.147188253524338, + "language_loss": 0.69736427, + "learning_rate": 6.953162376079233e-07, + "loss": 0.7188077, + "num_input_tokens_seen": 263627880, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11218262, + "step": 12215, + "time_per_iteration": 2.6141960620880127 + }, + { + "auxiliary_loss_clip": 0.01112152, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.0406599, + "balance_loss_mlp": 1.0216186, + "epoch": 0.7344656545919134, + "flos": 22634791264800.0, + "grad_norm": 1.6885145398874, + "language_loss": 0.72798097, + "learning_rate": 6.950210796622573e-07, + "loss": 0.74942982, + "num_input_tokens_seen": 263645665, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11114502, + "step": 12216, + "time_per_iteration": 2.6345295906066895 + }, + { + "auxiliary_loss_clip": 0.01119979, + "auxiliary_loss_mlp": 0.01038149, + "balance_loss_clip": 1.04095912, + "balance_loss_mlp": 1.02494049, + "epoch": 0.7345257778445814, + "flos": 28872023672160.0, + "grad_norm": 2.083829462792534, + "language_loss": 0.78568125, + "learning_rate": 6.947259712015236e-07, + "loss": 0.80726254, + "num_input_tokens_seen": 263668170, + "router_z_loss_clip": 0.79003906, + "router_z_loss_mlp": 0.13214111, + "step": 12217, + "time_per_iteration": 2.6606931686401367 + }, + { + "auxiliary_loss_clip": 0.01111845, + "auxiliary_loss_mlp": 0.01028287, + "balance_loss_clip": 1.0394913, + "balance_loss_mlp": 1.01759958, + "epoch": 0.7345859010972494, + "flos": 16848817335360.0, + "grad_norm": 2.2042157591232177, + "language_loss": 0.77343971, + "learning_rate": 6.94430912236911e-07, + "loss": 0.79484099, + "num_input_tokens_seen": 263684190, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10693359, + "step": 12218, + "time_per_iteration": 2.661656379699707 + }, + { + "auxiliary_loss_clip": 0.01111966, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.03992701, + "balance_loss_mlp": 1.0169965, + "epoch": 0.7346460243499173, + "flos": 27177386647680.0, + "grad_norm": 1.7550800900355092, + "language_loss": 0.71934074, + "learning_rate": 6.941359027796092e-07, + "loss": 0.74074817, + "num_input_tokens_seen": 263702095, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11785889, + "step": 12219, + "time_per_iteration": 4.094319105148315 + }, + { + "auxiliary_loss_clip": 0.01109823, + "auxiliary_loss_mlp": 0.01029511, + "balance_loss_clip": 1.03827512, + "balance_loss_mlp": 1.01794219, + "epoch": 0.7347061476025853, + "flos": 28376567192160.0, + "grad_norm": 2.095501873732211, + "language_loss": 0.74723792, + "learning_rate": 6.938409428408061e-07, + "loss": 0.76863128, + "num_input_tokens_seen": 263721385, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11584473, + "step": 12220, + "time_per_iteration": 2.622560501098633 + }, + { + "auxiliary_loss_clip": 0.01115441, + "auxiliary_loss_mlp": 0.0102902, + "balance_loss_clip": 1.04017282, + "balance_loss_mlp": 1.01741481, + "epoch": 0.7347662708552533, + "flos": 18629135706240.0, + "grad_norm": 1.751502907954499, + "language_loss": 0.66277778, + "learning_rate": 6.93546032431684e-07, + "loss": 0.68422246, + "num_input_tokens_seen": 263737835, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11608887, + "step": 12221, + "time_per_iteration": 3.847135066986084 + }, + { + "auxiliary_loss_clip": 0.01113277, + "auxiliary_loss_mlp": 0.01026887, + "balance_loss_clip": 1.03967619, + "balance_loss_mlp": 1.01550221, + "epoch": 0.7348263941079213, + "flos": 30334852572480.0, + "grad_norm": 1.7499004375892484, + "language_loss": 0.69307446, + "learning_rate": 6.932511715634273e-07, + "loss": 0.71447611, + "num_input_tokens_seen": 263756480, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11383057, + "step": 12222, + "time_per_iteration": 2.6951708793640137 + }, + { + "auxiliary_loss_clip": 0.01114299, + "auxiliary_loss_mlp": 0.010298, + "balance_loss_clip": 1.04072046, + "balance_loss_mlp": 1.0190587, + "epoch": 0.7348865173605892, + "flos": 29714824507680.0, + "grad_norm": 3.13490495936202, + "language_loss": 0.65741432, + "learning_rate": 6.92956360247217e-07, + "loss": 0.6788553, + "num_input_tokens_seen": 263776440, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10760498, + "step": 12223, + "time_per_iteration": 2.6599576473236084 + }, + { + "auxiliary_loss_clip": 0.01114408, + "auxiliary_loss_mlp": 0.01029002, + "balance_loss_clip": 1.04005611, + "balance_loss_mlp": 1.01749253, + "epoch": 0.7349466406132572, + "flos": 24410207044800.0, + "grad_norm": 1.8040852815323318, + "language_loss": 0.72427058, + "learning_rate": 6.926615984942332e-07, + "loss": 0.74570465, + "num_input_tokens_seen": 263793700, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11517334, + "step": 12224, + "time_per_iteration": 2.659142255783081 + }, + { + "auxiliary_loss_clip": 0.01115835, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.04115832, + "balance_loss_mlp": 1.02006865, + "epoch": 0.7350067638659251, + "flos": 36393348142080.0, + "grad_norm": 1.79473817459884, + "language_loss": 0.72708291, + "learning_rate": 6.92366886315652e-07, + "loss": 0.74855685, + "num_input_tokens_seen": 263814620, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11499023, + "step": 12225, + "time_per_iteration": 2.680140495300293 + }, + { + "auxiliary_loss_clip": 0.01116257, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.03907824, + "balance_loss_mlp": 1.01952648, + "epoch": 0.7350668871185931, + "flos": 26681727581280.0, + "grad_norm": 2.3365364784955647, + "language_loss": 0.76547092, + "learning_rate": 6.920722237226501e-07, + "loss": 0.78695726, + "num_input_tokens_seen": 263832725, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.128479, + "step": 12226, + "time_per_iteration": 2.6656124591827393 + }, + { + "auxiliary_loss_clip": 0.0111363, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.03999805, + "balance_loss_mlp": 1.01661706, + "epoch": 0.735127010371261, + "flos": 27535265496000.0, + "grad_norm": 2.0755312430060813, + "language_loss": 0.67080677, + "learning_rate": 6.917776107264008e-07, + "loss": 0.69223011, + "num_input_tokens_seen": 263853850, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12091064, + "step": 12227, + "time_per_iteration": 2.6364681720733643 + }, + { + "auxiliary_loss_clip": 0.01113945, + "auxiliary_loss_mlp": 0.01031005, + "balance_loss_clip": 1.03940666, + "balance_loss_mlp": 1.02010345, + "epoch": 0.7351871336239291, + "flos": 31584592332000.0, + "grad_norm": 2.3294018684778854, + "language_loss": 0.63794422, + "learning_rate": 6.914830473380749e-07, + "loss": 0.65939373, + "num_input_tokens_seen": 263874760, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.10888672, + "step": 12228, + "time_per_iteration": 2.714909553527832 + }, + { + "auxiliary_loss_clip": 0.01114356, + "auxiliary_loss_mlp": 0.01038907, + "balance_loss_clip": 1.04033375, + "balance_loss_mlp": 1.02799988, + "epoch": 0.735247256876597, + "flos": 21880589019840.0, + "grad_norm": 1.6415936070413055, + "language_loss": 0.63080287, + "learning_rate": 6.911885335688427e-07, + "loss": 0.65233552, + "num_input_tokens_seen": 263893390, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.10906982, + "step": 12229, + "time_per_iteration": 2.630887508392334 + }, + { + "auxiliary_loss_clip": 0.01116621, + "auxiliary_loss_mlp": 0.01040731, + "balance_loss_clip": 1.0407052, + "balance_loss_mlp": 1.02755833, + "epoch": 0.735307380129265, + "flos": 35234313871680.0, + "grad_norm": 1.6951532432769771, + "language_loss": 0.73483986, + "learning_rate": 6.908940694298726e-07, + "loss": 0.75641334, + "num_input_tokens_seen": 263911180, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.1317749, + "step": 12230, + "time_per_iteration": 2.6808862686157227 + }, + { + "auxiliary_loss_clip": 0.0111599, + "auxiliary_loss_mlp": 0.01033886, + "balance_loss_clip": 1.04100466, + "balance_loss_mlp": 1.02223945, + "epoch": 0.7353675033819329, + "flos": 16625558357280.0, + "grad_norm": 2.407252392535879, + "language_loss": 0.72523862, + "learning_rate": 6.90599654932332e-07, + "loss": 0.74673736, + "num_input_tokens_seen": 263928975, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11645508, + "step": 12231, + "time_per_iteration": 2.5968728065490723 + }, + { + "auxiliary_loss_clip": 0.01116425, + "auxiliary_loss_mlp": 0.01035831, + "balance_loss_clip": 1.04150212, + "balance_loss_mlp": 1.02256894, + "epoch": 0.7354276266346009, + "flos": 23748857704800.0, + "grad_norm": 2.254120451851413, + "language_loss": 0.63892353, + "learning_rate": 6.903052900873823e-07, + "loss": 0.66044611, + "num_input_tokens_seen": 263944495, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.13269043, + "step": 12232, + "time_per_iteration": 2.6297032833099365 + }, + { + "auxiliary_loss_clip": 0.01115129, + "auxiliary_loss_mlp": 0.01029967, + "balance_loss_clip": 1.03960502, + "balance_loss_mlp": 1.01799202, + "epoch": 0.735487749887269, + "flos": 19245112043040.0, + "grad_norm": 1.6793840781117413, + "language_loss": 0.75196886, + "learning_rate": 6.900109749061874e-07, + "loss": 0.77341974, + "num_input_tokens_seen": 263961325, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11968994, + "step": 12233, + "time_per_iteration": 2.5770418643951416 + }, + { + "auxiliary_loss_clip": 0.01114676, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.03999734, + "balance_loss_mlp": 1.01739621, + "epoch": 0.7355478731399369, + "flos": 22281409834560.0, + "grad_norm": 2.9918706478537493, + "language_loss": 0.73633456, + "learning_rate": 6.897167093999079e-07, + "loss": 0.75777221, + "num_input_tokens_seen": 263980445, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11694336, + "step": 12234, + "time_per_iteration": 2.6427078247070312 + }, + { + "auxiliary_loss_clip": 0.01115564, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.04105496, + "balance_loss_mlp": 1.01763952, + "epoch": 0.7356079963926049, + "flos": 32387084824320.0, + "grad_norm": 2.1810797652595815, + "language_loss": 0.59691453, + "learning_rate": 6.894224935797017e-07, + "loss": 0.61836505, + "num_input_tokens_seen": 263999330, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11865234, + "step": 12235, + "time_per_iteration": 2.6829850673675537 + }, + { + "auxiliary_loss_clip": 0.01114156, + "auxiliary_loss_mlp": 0.01023407, + "balance_loss_clip": 1.04153585, + "balance_loss_mlp": 1.01186109, + "epoch": 0.7356681196452728, + "flos": 13151494342080.0, + "grad_norm": 2.529289032732035, + "language_loss": 0.85980558, + "learning_rate": 6.891283274567259e-07, + "loss": 0.88118124, + "num_input_tokens_seen": 264014150, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11541748, + "step": 12236, + "time_per_iteration": 2.627626895904541 + }, + { + "auxiliary_loss_clip": 0.01114884, + "auxiliary_loss_mlp": 0.01027127, + "balance_loss_clip": 1.03961754, + "balance_loss_mlp": 1.01564085, + "epoch": 0.7357282428979408, + "flos": 24060755790720.0, + "grad_norm": 2.0578128242274505, + "language_loss": 0.69504142, + "learning_rate": 6.888342110421364e-07, + "loss": 0.71646154, + "num_input_tokens_seen": 264033140, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11486816, + "step": 12237, + "time_per_iteration": 2.6728904247283936 + }, + { + "auxiliary_loss_clip": 0.01113389, + "auxiliary_loss_mlp": 0.01027976, + "balance_loss_clip": 1.03889036, + "balance_loss_mlp": 1.0167048, + "epoch": 0.7357883661506087, + "flos": 23749911154080.0, + "grad_norm": 2.0133434378587634, + "language_loss": 0.72212327, + "learning_rate": 6.885401443470839e-07, + "loss": 0.74353695, + "num_input_tokens_seen": 264052105, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11273193, + "step": 12238, + "time_per_iteration": 2.6140952110290527 + }, + { + "auxiliary_loss_clip": 0.01116256, + "auxiliary_loss_mlp": 0.01030436, + "balance_loss_clip": 1.03746104, + "balance_loss_mlp": 1.01814508, + "epoch": 0.7358484894032767, + "flos": 33096278721600.0, + "grad_norm": 2.9078073974401, + "language_loss": 0.72753352, + "learning_rate": 6.882461273827205e-07, + "loss": 0.74900037, + "num_input_tokens_seen": 264070690, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.12298584, + "step": 12239, + "time_per_iteration": 2.665343999862671 + }, + { + "auxiliary_loss_clip": 0.01110015, + "auxiliary_loss_mlp": 0.01029512, + "balance_loss_clip": 1.03947377, + "balance_loss_mlp": 1.01840758, + "epoch": 0.7359086126559446, + "flos": 29897694108000.0, + "grad_norm": 1.4695546526478604, + "language_loss": 0.78876925, + "learning_rate": 6.879521601601954e-07, + "loss": 0.81016445, + "num_input_tokens_seen": 264094225, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11108398, + "step": 12240, + "time_per_iteration": 2.6903703212738037 + }, + { + "auxiliary_loss_clip": 0.0111533, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.04200506, + "balance_loss_mlp": 1.01973474, + "epoch": 0.7359687359086127, + "flos": 28469055441600.0, + "grad_norm": 2.3521804615063577, + "language_loss": 0.82690072, + "learning_rate": 6.876582426906565e-07, + "loss": 0.8483659, + "num_input_tokens_seen": 264113190, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11456299, + "step": 12241, + "time_per_iteration": 2.7067363262176514 + }, + { + "auxiliary_loss_clip": 0.01111434, + "auxiliary_loss_mlp": 0.01026624, + "balance_loss_clip": 1.03812778, + "balance_loss_mlp": 1.01531088, + "epoch": 0.7360288591612806, + "flos": 24639989304960.0, + "grad_norm": 2.143080263018305, + "language_loss": 0.78964794, + "learning_rate": 6.873643749852484e-07, + "loss": 0.81102848, + "num_input_tokens_seen": 264132050, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11315918, + "step": 12242, + "time_per_iteration": 2.626438856124878 + }, + { + "auxiliary_loss_clip": 0.01115053, + "auxiliary_loss_mlp": 0.01025791, + "balance_loss_clip": 1.04125595, + "balance_loss_mlp": 1.01400769, + "epoch": 0.7360889824139486, + "flos": 30472187100480.0, + "grad_norm": 1.9696211103615548, + "language_loss": 0.79045707, + "learning_rate": 6.870705570551145e-07, + "loss": 0.81186551, + "num_input_tokens_seen": 264152800, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11779785, + "step": 12243, + "time_per_iteration": 2.7160024642944336 + }, + { + "auxiliary_loss_clip": 0.011145, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.03875017, + "balance_loss_mlp": 1.01892257, + "epoch": 0.7361491056666165, + "flos": 18317642793120.0, + "grad_norm": 6.525715606795639, + "language_loss": 0.74205089, + "learning_rate": 6.867767889113969e-07, + "loss": 0.76350582, + "num_input_tokens_seen": 264169650, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.1206665, + "step": 12244, + "time_per_iteration": 2.6060972213745117 + }, + { + "auxiliary_loss_clip": 0.01112633, + "auxiliary_loss_mlp": 0.01031079, + "balance_loss_clip": 1.03816128, + "balance_loss_mlp": 1.01942039, + "epoch": 0.7362092289192845, + "flos": 27979838622720.0, + "grad_norm": 1.884161267495987, + "language_loss": 0.69754088, + "learning_rate": 6.864830705652347e-07, + "loss": 0.71897805, + "num_input_tokens_seen": 264190530, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11663818, + "step": 12245, + "time_per_iteration": 2.7832412719726562 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.01028699, + "balance_loss_clip": 1.03805518, + "balance_loss_mlp": 1.01674247, + "epoch": 0.7362693521719526, + "flos": 25261597543680.0, + "grad_norm": 1.5388542108252439, + "language_loss": 0.73334402, + "learning_rate": 6.861894020277658e-07, + "loss": 0.75471002, + "num_input_tokens_seen": 264210820, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11956787, + "step": 12246, + "time_per_iteration": 2.6136810779571533 + }, + { + "auxiliary_loss_clip": 0.01109431, + "auxiliary_loss_mlp": 0.01023557, + "balance_loss_clip": 1.03867567, + "balance_loss_mlp": 1.01240492, + "epoch": 0.7363294754246205, + "flos": 15998318216640.0, + "grad_norm": 2.6384804557183994, + "language_loss": 0.73456353, + "learning_rate": 6.858957833101266e-07, + "loss": 0.75589341, + "num_input_tokens_seen": 264227430, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11157227, + "step": 12247, + "time_per_iteration": 2.7587881088256836 + }, + { + "auxiliary_loss_clip": 0.01114029, + "auxiliary_loss_mlp": 0.0102574, + "balance_loss_clip": 1.04306853, + "balance_loss_mlp": 1.0148561, + "epoch": 0.7363895986772885, + "flos": 17115788108160.0, + "grad_norm": 1.6684867390149012, + "language_loss": 0.74163443, + "learning_rate": 6.856022144234526e-07, + "loss": 0.76303208, + "num_input_tokens_seen": 264245230, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10894775, + "step": 12248, + "time_per_iteration": 2.597343921661377 + }, + { + "auxiliary_loss_clip": 0.0111373, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.03992724, + "balance_loss_mlp": 1.02283883, + "epoch": 0.7364497219299564, + "flos": 24062214412800.0, + "grad_norm": 2.0314671238777002, + "language_loss": 0.72731519, + "learning_rate": 6.853086953788727e-07, + "loss": 0.74879932, + "num_input_tokens_seen": 264263945, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11828613, + "step": 12249, + "time_per_iteration": 4.141043186187744 + }, + { + "auxiliary_loss_clip": 0.01115476, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.04214275, + "balance_loss_mlp": 1.01869226, + "epoch": 0.7365098451826244, + "flos": 26065224519840.0, + "grad_norm": 1.9256705695440293, + "language_loss": 0.77365899, + "learning_rate": 6.850152261875189e-07, + "loss": 0.79511976, + "num_input_tokens_seen": 264281500, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11895752, + "step": 12250, + "time_per_iteration": 2.778046131134033 + }, + { + "auxiliary_loss_clip": 0.01115416, + "auxiliary_loss_mlp": 0.01026089, + "balance_loss_clip": 1.04012835, + "balance_loss_mlp": 1.01431668, + "epoch": 0.7365699684352923, + "flos": 28514104306560.0, + "grad_norm": 1.9404674516917824, + "language_loss": 0.71257329, + "learning_rate": 6.8472180686052e-07, + "loss": 0.7339884, + "num_input_tokens_seen": 264301625, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11767578, + "step": 12251, + "time_per_iteration": 2.637625217437744 + }, + { + "auxiliary_loss_clip": 0.01111777, + "auxiliary_loss_mlp": 0.01028352, + "balance_loss_clip": 1.03939724, + "balance_loss_mlp": 1.01711047, + "epoch": 0.7366300916879603, + "flos": 72635302260000.0, + "grad_norm": 1.8760010297193115, + "language_loss": 0.65804029, + "learning_rate": 6.844284374090015e-07, + "loss": 0.67944157, + "num_input_tokens_seen": 264323975, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11248779, + "step": 12252, + "time_per_iteration": 4.357292413711548 + }, + { + "auxiliary_loss_clip": 0.01118037, + "auxiliary_loss_mlp": 0.01030313, + "balance_loss_clip": 1.04280961, + "balance_loss_mlp": 1.01868403, + "epoch": 0.7366902149406283, + "flos": 25530472628640.0, + "grad_norm": 2.974060306277701, + "language_loss": 0.78944266, + "learning_rate": 6.841351178440884e-07, + "loss": 0.8109262, + "num_input_tokens_seen": 264343785, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11627197, + "step": 12253, + "time_per_iteration": 2.6631791591644287 + }, + { + "auxiliary_loss_clip": 0.01109206, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.03859591, + "balance_loss_mlp": 1.01902926, + "epoch": 0.7367503381932963, + "flos": 21167951153760.0, + "grad_norm": 2.1268815931365728, + "language_loss": 0.76151884, + "learning_rate": 6.83841848176905e-07, + "loss": 0.7829138, + "num_input_tokens_seen": 264361130, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.1126709, + "step": 12254, + "time_per_iteration": 2.5820119380950928 + }, + { + "auxiliary_loss_clip": 0.01113384, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.04026246, + "balance_loss_mlp": 1.02058578, + "epoch": 0.7368104614459642, + "flos": 21745442424960.0, + "grad_norm": 3.9766220305980515, + "language_loss": 0.69466949, + "learning_rate": 6.835486284185692e-07, + "loss": 0.71613085, + "num_input_tokens_seen": 264376965, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12158203, + "step": 12255, + "time_per_iteration": 2.6097283363342285 + }, + { + "auxiliary_loss_clip": 0.01114351, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.03971982, + "balance_loss_mlp": 1.0154345, + "epoch": 0.7368705846986322, + "flos": 30024899316000.0, + "grad_norm": 1.7519754694886955, + "language_loss": 0.76053441, + "learning_rate": 6.832554585802012e-07, + "loss": 0.78195572, + "num_input_tokens_seen": 264396310, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12341309, + "step": 12256, + "time_per_iteration": 2.6276493072509766 + }, + { + "auxiliary_loss_clip": 0.0111511, + "auxiliary_loss_mlp": 0.01027895, + "balance_loss_clip": 1.04045331, + "balance_loss_mlp": 1.01596236, + "epoch": 0.7369307079513001, + "flos": 42671739755520.0, + "grad_norm": 1.7773618290799735, + "language_loss": 0.7349987, + "learning_rate": 6.829623386729182e-07, + "loss": 0.75642878, + "num_input_tokens_seen": 264418085, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.1194458, + "step": 12257, + "time_per_iteration": 2.7370803356170654 + }, + { + "auxiliary_loss_clip": 0.01112438, + "auxiliary_loss_mlp": 0.01031374, + "balance_loss_clip": 1.03912938, + "balance_loss_mlp": 1.02010226, + "epoch": 0.7369908312039681, + "flos": 25886163543840.0, + "grad_norm": 5.055081678506933, + "language_loss": 0.77940845, + "learning_rate": 6.826692687078362e-07, + "loss": 0.80084658, + "num_input_tokens_seen": 264437595, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11273193, + "step": 12258, + "time_per_iteration": 2.642767906188965 + }, + { + "auxiliary_loss_clip": 0.01116742, + "auxiliary_loss_mlp": 0.0103369, + "balance_loss_clip": 1.04063487, + "balance_loss_mlp": 1.02170908, + "epoch": 0.7370509544566362, + "flos": 28825799806080.0, + "grad_norm": 2.1170350633249004, + "language_loss": 0.66311574, + "learning_rate": 6.823762486960674e-07, + "loss": 0.68462008, + "num_input_tokens_seen": 264457385, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11993408, + "step": 12259, + "time_per_iteration": 4.0843353271484375 + }, + { + "auxiliary_loss_clip": 0.01115286, + "auxiliary_loss_mlp": 0.01035363, + "balance_loss_clip": 1.04120278, + "balance_loss_mlp": 1.02285194, + "epoch": 0.7371110777093041, + "flos": 30295070953920.0, + "grad_norm": 1.9283166255605264, + "language_loss": 0.73621219, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75771868, + "num_input_tokens_seen": 264477205, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12506104, + "step": 12260, + "time_per_iteration": 3.9897308349609375 + }, + { + "auxiliary_loss_clip": 0.01116945, + "auxiliary_loss_mlp": 0.01028531, + "balance_loss_clip": 1.04060173, + "balance_loss_mlp": 1.01695585, + "epoch": 0.7371712009619721, + "flos": 28247174051040.0, + "grad_norm": 1.765395318180155, + "language_loss": 0.73516488, + "learning_rate": 6.817903585769125e-07, + "loss": 0.75661969, + "num_input_tokens_seen": 264497195, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11578369, + "step": 12261, + "time_per_iteration": 2.6529271602630615 + }, + { + "auxiliary_loss_clip": 0.01117668, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.04103696, + "balance_loss_mlp": 1.01977992, + "epoch": 0.73723132421464, + "flos": 28202530358880.0, + "grad_norm": 2.9539339536142513, + "language_loss": 0.66775042, + "learning_rate": 6.814974884917438e-07, + "loss": 0.68925047, + "num_input_tokens_seen": 264516950, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12567139, + "step": 12262, + "time_per_iteration": 2.632746458053589 + }, + { + "auxiliary_loss_clip": 0.01114153, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.03973889, + "balance_loss_mlp": 1.01886666, + "epoch": 0.737291447467308, + "flos": 23517981478080.0, + "grad_norm": 1.9015986917605412, + "language_loss": 0.8863821, + "learning_rate": 6.81204668404322e-07, + "loss": 0.90783763, + "num_input_tokens_seen": 264532675, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12542725, + "step": 12263, + "time_per_iteration": 2.6746163368225098 + }, + { + "auxiliary_loss_clip": 0.01108147, + "auxiliary_loss_mlp": 0.01024932, + "balance_loss_clip": 1.03934574, + "balance_loss_mlp": 1.01471615, + "epoch": 0.7373515707199759, + "flos": 30649748937120.0, + "grad_norm": 1.7155417110073563, + "language_loss": 0.67179883, + "learning_rate": 6.809118983257522e-07, + "loss": 0.6931296, + "num_input_tokens_seen": 264555635, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10205078, + "step": 12264, + "time_per_iteration": 2.6853160858154297 + }, + { + "auxiliary_loss_clip": 0.0110884, + "auxiliary_loss_mlp": 0.01025818, + "balance_loss_clip": 1.03820539, + "balance_loss_mlp": 1.01513052, + "epoch": 0.737411693972644, + "flos": 39546478717920.0, + "grad_norm": 1.8193316038864393, + "language_loss": 0.80173945, + "learning_rate": 6.806191782671356e-07, + "loss": 0.82308596, + "num_input_tokens_seen": 264573140, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10681152, + "step": 12265, + "time_per_iteration": 2.736090898513794 + }, + { + "auxiliary_loss_clip": 0.01116892, + "auxiliary_loss_mlp": 0.01029808, + "balance_loss_clip": 1.03960812, + "balance_loss_mlp": 1.01811981, + "epoch": 0.7374718172253119, + "flos": 29670504953760.0, + "grad_norm": 1.8002024007281372, + "language_loss": 0.73992348, + "learning_rate": 6.803265082395711e-07, + "loss": 0.76139045, + "num_input_tokens_seen": 264591610, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.11682129, + "step": 12266, + "time_per_iteration": 2.6404685974121094 + }, + { + "auxiliary_loss_clip": 0.01114541, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.04048276, + "balance_loss_mlp": 1.02497089, + "epoch": 0.7375319404779799, + "flos": 33136992237600.0, + "grad_norm": 1.8565194887886183, + "language_loss": 0.73205274, + "learning_rate": 6.800338882541576e-07, + "loss": 0.75357205, + "num_input_tokens_seen": 264611170, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12414551, + "step": 12267, + "time_per_iteration": 2.735450029373169 + }, + { + "auxiliary_loss_clip": 0.01112139, + "auxiliary_loss_mlp": 0.0103298, + "balance_loss_clip": 1.0388664, + "balance_loss_mlp": 1.02170849, + "epoch": 0.7375920637306478, + "flos": 23037273288000.0, + "grad_norm": 2.1871927985703765, + "language_loss": 0.8302502, + "learning_rate": 6.797413183219923e-07, + "loss": 0.85170138, + "num_input_tokens_seen": 264629365, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11279297, + "step": 12268, + "time_per_iteration": 2.6077044010162354 + }, + { + "auxiliary_loss_clip": 0.01111672, + "auxiliary_loss_mlp": 0.01036851, + "balance_loss_clip": 1.03999043, + "balance_loss_mlp": 1.02554417, + "epoch": 0.7376521869833158, + "flos": 19119851664480.0, + "grad_norm": 2.0853323322497, + "language_loss": 0.72942019, + "learning_rate": 6.794487984541677e-07, + "loss": 0.75090539, + "num_input_tokens_seen": 264647915, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11309814, + "step": 12269, + "time_per_iteration": 2.6508171558380127 + }, + { + "auxiliary_loss_clip": 0.0111624, + "auxiliary_loss_mlp": 0.01030989, + "balance_loss_clip": 1.04043567, + "balance_loss_mlp": 1.01874614, + "epoch": 0.7377123102359837, + "flos": 45114947123040.0, + "grad_norm": 2.443963777093746, + "language_loss": 0.70189643, + "learning_rate": 6.791563286617776e-07, + "loss": 0.72336864, + "num_input_tokens_seen": 264669620, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12243652, + "step": 12270, + "time_per_iteration": 2.897078514099121 + }, + { + "auxiliary_loss_clip": 0.01111852, + "auxiliary_loss_mlp": 0.01029186, + "balance_loss_clip": 1.03953028, + "balance_loss_mlp": 1.01827276, + "epoch": 0.7377724334886517, + "flos": 29890927722240.0, + "grad_norm": 1.7095564893784492, + "language_loss": 0.69388551, + "learning_rate": 6.788639089559119e-07, + "loss": 0.71529591, + "num_input_tokens_seen": 264689345, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10913086, + "step": 12271, + "time_per_iteration": 2.7251148223876953 + }, + { + "auxiliary_loss_clip": 0.01114273, + "auxiliary_loss_mlp": 0.01028633, + "balance_loss_clip": 1.0397917, + "balance_loss_mlp": 1.01646733, + "epoch": 0.7378325567413198, + "flos": 29760967339200.0, + "grad_norm": 2.593290981433742, + "language_loss": 0.67735422, + "learning_rate": 6.785715393476586e-07, + "loss": 0.69878334, + "num_input_tokens_seen": 264707625, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.1217041, + "step": 12272, + "time_per_iteration": 2.7177438735961914 + }, + { + "auxiliary_loss_clip": 0.01110573, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_clip": 1.0392375, + "balance_loss_mlp": 1.01859486, + "epoch": 0.7378926799939877, + "flos": 21251647153440.0, + "grad_norm": 2.2486609554561943, + "language_loss": 0.78031325, + "learning_rate": 6.782792198481049e-07, + "loss": 0.80171645, + "num_input_tokens_seen": 264725575, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11151123, + "step": 12273, + "time_per_iteration": 2.622852325439453 + }, + { + "auxiliary_loss_clip": 0.01110718, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.03765845, + "balance_loss_mlp": 1.01960003, + "epoch": 0.7379528032466557, + "flos": 22542505601760.0, + "grad_norm": 1.9608696763055968, + "language_loss": 0.83242559, + "learning_rate": 6.779869504683355e-07, + "loss": 0.85384405, + "num_input_tokens_seen": 264742855, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11529541, + "step": 12274, + "time_per_iteration": 2.6727676391601562 + }, + { + "auxiliary_loss_clip": 0.0111866, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.04078507, + "balance_loss_mlp": 1.0171082, + "epoch": 0.7380129264993236, + "flos": 21746212253280.0, + "grad_norm": 2.601106256831263, + "language_loss": 0.73759067, + "learning_rate": 6.776947312194341e-07, + "loss": 0.7590701, + "num_input_tokens_seen": 264761155, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12188721, + "step": 12275, + "time_per_iteration": 2.5879480838775635 + }, + { + "auxiliary_loss_clip": 0.01117417, + "auxiliary_loss_mlp": 0.01040227, + "balance_loss_clip": 1.04019129, + "balance_loss_mlp": 1.02798975, + "epoch": 0.7380730497519916, + "flos": 28062116517600.0, + "grad_norm": 1.7713591984158024, + "language_loss": 0.7351985, + "learning_rate": 6.774025621124813e-07, + "loss": 0.7567749, + "num_input_tokens_seen": 264780660, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12243652, + "step": 12276, + "time_per_iteration": 2.6858270168304443 + }, + { + "auxiliary_loss_clip": 0.01113, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.03803742, + "balance_loss_mlp": 1.01789618, + "epoch": 0.7381331730046595, + "flos": 24728912033760.0, + "grad_norm": 2.5296114290247123, + "language_loss": 0.77780569, + "learning_rate": 6.771104431585551e-07, + "loss": 0.79922795, + "num_input_tokens_seen": 264798850, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11322021, + "step": 12277, + "time_per_iteration": 2.6213889122009277 + }, + { + "auxiliary_loss_clip": 0.01111685, + "auxiliary_loss_mlp": 0.0103719, + "balance_loss_clip": 1.04004061, + "balance_loss_mlp": 1.02556133, + "epoch": 0.7381932962573275, + "flos": 24104548620000.0, + "grad_norm": 2.511905046296075, + "language_loss": 0.78688276, + "learning_rate": 6.768183743687338e-07, + "loss": 0.80837154, + "num_input_tokens_seen": 264816795, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11633301, + "step": 12278, + "time_per_iteration": 2.6396660804748535 + }, + { + "auxiliary_loss_clip": 0.01114157, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.03896165, + "balance_loss_mlp": 1.02136302, + "epoch": 0.7382534195099955, + "flos": 21116257454880.0, + "grad_norm": 2.6993538356204283, + "language_loss": 0.71797431, + "learning_rate": 6.765263557540921e-07, + "loss": 0.73944902, + "num_input_tokens_seen": 264834105, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11968994, + "step": 12279, + "time_per_iteration": 2.581317901611328 + }, + { + "auxiliary_loss_clip": 0.01114517, + "auxiliary_loss_mlp": 0.01037171, + "balance_loss_clip": 1.03840137, + "balance_loss_mlp": 1.02427244, + "epoch": 0.7383135427626635, + "flos": 22814419482720.0, + "grad_norm": 2.339692859019355, + "language_loss": 0.85499769, + "learning_rate": 6.762343873257034e-07, + "loss": 0.87651455, + "num_input_tokens_seen": 264850895, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12896729, + "step": 12280, + "time_per_iteration": 2.6141722202301025 + }, + { + "auxiliary_loss_clip": 0.01115077, + "auxiliary_loss_mlp": 0.01029296, + "balance_loss_clip": 1.04034483, + "balance_loss_mlp": 1.01717305, + "epoch": 0.7383736660153314, + "flos": 25479913413600.0, + "grad_norm": 2.4065468911724377, + "language_loss": 0.72347605, + "learning_rate": 6.759424690946408e-07, + "loss": 0.74491978, + "num_input_tokens_seen": 264869505, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12127686, + "step": 12281, + "time_per_iteration": 2.6452903747558594 + }, + { + "auxiliary_loss_clip": 0.01112644, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.03784657, + "balance_loss_mlp": 1.01918864, + "epoch": 0.7384337892679994, + "flos": 25213712469120.0, + "grad_norm": 2.239480207834748, + "language_loss": 0.60360229, + "learning_rate": 6.756506010719711e-07, + "loss": 0.62503535, + "num_input_tokens_seen": 264886915, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11468506, + "step": 12282, + "time_per_iteration": 2.6743013858795166 + }, + { + "auxiliary_loss_clip": 0.01113833, + "auxiliary_loss_mlp": 0.01029719, + "balance_loss_clip": 1.03903294, + "balance_loss_mlp": 1.01785731, + "epoch": 0.7384939125206673, + "flos": 35593408238400.0, + "grad_norm": 1.7962701366098168, + "language_loss": 0.68290836, + "learning_rate": 6.753587832687632e-07, + "loss": 0.70434391, + "num_input_tokens_seen": 264910350, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11846924, + "step": 12283, + "time_per_iteration": 2.7152421474456787 + }, + { + "auxiliary_loss_clip": 0.01116238, + "auxiliary_loss_mlp": 0.01039128, + "balance_loss_clip": 1.04281747, + "balance_loss_mlp": 1.0270108, + "epoch": 0.7385540357733353, + "flos": 44310833939520.0, + "grad_norm": 1.8989928664879026, + "language_loss": 0.75616086, + "learning_rate": 6.750670156960832e-07, + "loss": 0.77771449, + "num_input_tokens_seen": 264930705, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12127686, + "step": 12284, + "time_per_iteration": 2.751664161682129 + }, + { + "auxiliary_loss_clip": 0.01114739, + "auxiliary_loss_mlp": 0.01031856, + "balance_loss_clip": 1.03925192, + "balance_loss_mlp": 1.01907039, + "epoch": 0.7386141590260034, + "flos": 24773272104960.0, + "grad_norm": 2.474713045883182, + "language_loss": 0.69266987, + "learning_rate": 6.747752983649954e-07, + "loss": 0.71413577, + "num_input_tokens_seen": 264946975, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12799072, + "step": 12285, + "time_per_iteration": 2.610680341720581 + }, + { + "auxiliary_loss_clip": 0.01115807, + "auxiliary_loss_mlp": 0.01033712, + "balance_loss_clip": 1.03820348, + "balance_loss_mlp": 1.02128983, + "epoch": 0.7386742822786713, + "flos": 31095740168640.0, + "grad_norm": 2.1917471356630807, + "language_loss": 0.79781651, + "learning_rate": 6.744836312865602e-07, + "loss": 0.81931174, + "num_input_tokens_seen": 264967665, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12426758, + "step": 12286, + "time_per_iteration": 2.6616172790527344 + }, + { + "auxiliary_loss_clip": 0.01112588, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.03958023, + "balance_loss_mlp": 1.01875162, + "epoch": 0.7387344055313393, + "flos": 16804052091360.0, + "grad_norm": 2.881844485851332, + "language_loss": 0.65451533, + "learning_rate": 6.741920144718396e-07, + "loss": 0.67594552, + "num_input_tokens_seen": 264985480, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11688232, + "step": 12287, + "time_per_iteration": 2.6000068187713623 + }, + { + "auxiliary_loss_clip": 0.01108656, + "auxiliary_loss_mlp": 0.01027529, + "balance_loss_clip": 1.03812563, + "balance_loss_mlp": 1.0164547, + "epoch": 0.7387945287840072, + "flos": 33989517220320.0, + "grad_norm": 1.858669019954118, + "language_loss": 0.76681983, + "learning_rate": 6.739004479318903e-07, + "loss": 0.78818172, + "num_input_tokens_seen": 265004790, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11071777, + "step": 12288, + "time_per_iteration": 2.68900728225708 + }, + { + "auxiliary_loss_clip": 0.01117707, + "auxiliary_loss_mlp": 0.01033489, + "balance_loss_clip": 1.04140711, + "balance_loss_mlp": 1.02084053, + "epoch": 0.7388546520366752, + "flos": 53975420288640.0, + "grad_norm": 1.724990211509365, + "language_loss": 0.57683915, + "learning_rate": 6.736089316777684e-07, + "loss": 0.59835112, + "num_input_tokens_seen": 265028790, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12646484, + "step": 12289, + "time_per_iteration": 4.301521301269531 + }, + { + "auxiliary_loss_clip": 0.0103219, + "auxiliary_loss_mlp": 0.01004614, + "balance_loss_clip": 1.00901484, + "balance_loss_mlp": 1.00352085, + "epoch": 0.7389147752893431, + "flos": 86246158536000.0, + "grad_norm": 0.848280196091562, + "language_loss": 0.49222839, + "learning_rate": 6.733174657205287e-07, + "loss": 0.51259643, + "num_input_tokens_seen": 265096660, + "router_z_loss_clip": 0.23205566, + "router_z_loss_mlp": 0.01094055, + "step": 12290, + "time_per_iteration": 3.4107866287231445 + }, + { + "auxiliary_loss_clip": 0.01114664, + "auxiliary_loss_mlp": 0.01028374, + "balance_loss_clip": 1.04012728, + "balance_loss_mlp": 1.01592314, + "epoch": 0.7389748985420111, + "flos": 31719495823200.0, + "grad_norm": 2.5087696601685274, + "language_loss": 0.67038375, + "learning_rate": 6.730260500712237e-07, + "loss": 0.69181412, + "num_input_tokens_seen": 265116375, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12438965, + "step": 12291, + "time_per_iteration": 4.034460783004761 + }, + { + "auxiliary_loss_clip": 0.01032505, + "auxiliary_loss_mlp": 0.01003714, + "balance_loss_clip": 1.00925183, + "balance_loss_mlp": 1.00268221, + "epoch": 0.7390350217946791, + "flos": 66383854637760.0, + "grad_norm": 0.9947396197070492, + "language_loss": 0.60869247, + "learning_rate": 6.727346847409052e-07, + "loss": 0.62905467, + "num_input_tokens_seen": 265161230, + "router_z_loss_clip": 0.23278809, + "router_z_loss_mlp": 0.01032257, + "step": 12292, + "time_per_iteration": 2.918458938598633 + }, + { + "auxiliary_loss_clip": 0.01113295, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.0408442, + "balance_loss_mlp": 1.02046955, + "epoch": 0.7390951450473471, + "flos": 39287692435680.0, + "grad_norm": 2.080091391664097, + "language_loss": 0.67312455, + "learning_rate": 6.724433697406191e-07, + "loss": 0.69457078, + "num_input_tokens_seen": 265182515, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10876465, + "step": 12293, + "time_per_iteration": 2.732985734939575 + }, + { + "auxiliary_loss_clip": 0.01112573, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.0391053, + "balance_loss_mlp": 1.01946187, + "epoch": 0.739155268300015, + "flos": 20358449172000.0, + "grad_norm": 3.9859572252887485, + "language_loss": 0.83393818, + "learning_rate": 6.721521050814134e-07, + "loss": 0.85537654, + "num_input_tokens_seen": 265198160, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11816406, + "step": 12294, + "time_per_iteration": 2.629173517227173 + }, + { + "auxiliary_loss_clip": 0.01111444, + "auxiliary_loss_mlp": 0.01033485, + "balance_loss_clip": 1.03900719, + "balance_loss_mlp": 1.02123046, + "epoch": 0.739215391552683, + "flos": 38619941365440.0, + "grad_norm": 1.585643463802907, + "language_loss": 0.73359704, + "learning_rate": 6.718608907743337e-07, + "loss": 0.75504637, + "num_input_tokens_seen": 265218480, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12268066, + "step": 12295, + "time_per_iteration": 2.7367124557495117 + }, + { + "auxiliary_loss_clip": 0.0111166, + "auxiliary_loss_mlp": 0.01038449, + "balance_loss_clip": 1.04072988, + "balance_loss_mlp": 1.02718377, + "epoch": 0.7392755148053509, + "flos": 36263063620800.0, + "grad_norm": 1.8084757155511906, + "language_loss": 0.7861287, + "learning_rate": 6.715697268304215e-07, + "loss": 0.80762976, + "num_input_tokens_seen": 265240165, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.1126709, + "step": 12296, + "time_per_iteration": 2.7467329502105713 + }, + { + "auxiliary_loss_clip": 0.01115648, + "auxiliary_loss_mlp": 0.01029141, + "balance_loss_clip": 1.04141343, + "balance_loss_mlp": 1.01665401, + "epoch": 0.7393356380580189, + "flos": 45210231064800.0, + "grad_norm": 2.053648929232603, + "language_loss": 0.66302389, + "learning_rate": 6.712786132607182e-07, + "loss": 0.68447173, + "num_input_tokens_seen": 265263295, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12487793, + "step": 12297, + "time_per_iteration": 2.8149454593658447 + }, + { + "auxiliary_loss_clip": 0.01114291, + "auxiliary_loss_mlp": 0.0103424, + "balance_loss_clip": 1.04031503, + "balance_loss_mlp": 1.02200341, + "epoch": 0.739395761310687, + "flos": 24061809240000.0, + "grad_norm": 12.24306522900776, + "language_loss": 0.68683535, + "learning_rate": 6.709875500762645e-07, + "loss": 0.70832062, + "num_input_tokens_seen": 265282740, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12225342, + "step": 12298, + "time_per_iteration": 2.643031358718872 + }, + { + "auxiliary_loss_clip": 0.01112853, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.03846931, + "balance_loss_mlp": 1.01919222, + "epoch": 0.7394558845633549, + "flos": 14399572893120.0, + "grad_norm": 2.2627129926691985, + "language_loss": 0.74050665, + "learning_rate": 6.706965372880946e-07, + "loss": 0.7619462, + "num_input_tokens_seen": 265300175, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11914062, + "step": 12299, + "time_per_iteration": 4.036150693893433 + }, + { + "auxiliary_loss_clip": 0.01032394, + "auxiliary_loss_mlp": 0.01001601, + "balance_loss_clip": 1.00912678, + "balance_loss_mlp": 1.00053072, + "epoch": 0.7395160078160229, + "flos": 80772893038080.0, + "grad_norm": 0.7267101114239898, + "language_loss": 0.60841483, + "learning_rate": 6.704055749072455e-07, + "loss": 0.62875479, + "num_input_tokens_seen": 265363275, + "router_z_loss_clip": 0.23266602, + "router_z_loss_mlp": 0.01070404, + "step": 12300, + "time_per_iteration": 4.753862619400024 + }, + { + "auxiliary_loss_clip": 0.01113127, + "auxiliary_loss_mlp": 0.01029124, + "balance_loss_clip": 1.04042864, + "balance_loss_mlp": 1.01742935, + "epoch": 0.7395761310686908, + "flos": 25929146027520.0, + "grad_norm": 1.692077956409245, + "language_loss": 0.80411804, + "learning_rate": 6.7011466294475e-07, + "loss": 0.82554054, + "num_input_tokens_seen": 265382935, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11688232, + "step": 12301, + "time_per_iteration": 2.667313575744629 + }, + { + "auxiliary_loss_clip": 0.01111953, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.03929186, + "balance_loss_mlp": 1.01753354, + "epoch": 0.7396362543213588, + "flos": 31671367644960.0, + "grad_norm": 1.4744050945738758, + "language_loss": 0.73148215, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75288785, + "num_input_tokens_seen": 265403245, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11090088, + "step": 12302, + "time_per_iteration": 2.660097122192383 + }, + { + "auxiliary_loss_clip": 0.01112406, + "auxiliary_loss_mlp": 0.01036966, + "balance_loss_clip": 1.03836799, + "balance_loss_mlp": 1.0255394, + "epoch": 0.7396963775740267, + "flos": 33407406979200.0, + "grad_norm": 2.1009875036377417, + "language_loss": 0.73723143, + "learning_rate": 6.695329903189451e-07, + "loss": 0.75872517, + "num_input_tokens_seen": 265423105, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11437988, + "step": 12303, + "time_per_iteration": 2.6699256896972656 + }, + { + "auxiliary_loss_clip": 0.01109802, + "auxiliary_loss_mlp": 0.0102649, + "balance_loss_clip": 1.03780627, + "balance_loss_mlp": 1.01520133, + "epoch": 0.7397565008266948, + "flos": 31140100239840.0, + "grad_norm": 2.0849176391888737, + "language_loss": 0.54234391, + "learning_rate": 6.692422296776927e-07, + "loss": 0.56370682, + "num_input_tokens_seen": 265443445, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11291504, + "step": 12304, + "time_per_iteration": 2.6582553386688232 + }, + { + "auxiliary_loss_clip": 0.0111346, + "auxiliary_loss_mlp": 0.01030185, + "balance_loss_clip": 1.04015791, + "balance_loss_mlp": 1.01847219, + "epoch": 0.7398166240793627, + "flos": 28953329152320.0, + "grad_norm": 1.7161410796398222, + "language_loss": 0.84135979, + "learning_rate": 6.689515194989084e-07, + "loss": 0.86279625, + "num_input_tokens_seen": 265462085, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.1171875, + "step": 12305, + "time_per_iteration": 2.6572089195251465 + }, + { + "auxiliary_loss_clip": 0.0103217, + "auxiliary_loss_mlp": 0.01001001, + "balance_loss_clip": 1.00898147, + "balance_loss_mlp": 0.99992573, + "epoch": 0.7398767473320307, + "flos": 82079472191040.0, + "grad_norm": 0.8723157359433267, + "language_loss": 0.57617009, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59650183, + "num_input_tokens_seen": 265521190, + "router_z_loss_clip": 0.23193359, + "router_z_loss_mlp": 0.01075745, + "step": 12306, + "time_per_iteration": 3.2296481132507324 + }, + { + "auxiliary_loss_clip": 0.01116773, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.04177153, + "balance_loss_mlp": 1.01937175, + "epoch": 0.7399368705846986, + "flos": 26867271322080.0, + "grad_norm": 1.9913752000751173, + "language_loss": 0.81701899, + "learning_rate": 6.683702505728355e-07, + "loss": 0.83850175, + "num_input_tokens_seen": 265539705, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12133789, + "step": 12307, + "time_per_iteration": 2.6732709407806396 + }, + { + "auxiliary_loss_clip": 0.01110247, + "auxiliary_loss_mlp": 0.01028785, + "balance_loss_clip": 1.04050398, + "balance_loss_mlp": 1.01785898, + "epoch": 0.7399969938373666, + "flos": 17294443911360.0, + "grad_norm": 1.7280577491164049, + "language_loss": 0.69790202, + "learning_rate": 6.680796918475893e-07, + "loss": 0.71929234, + "num_input_tokens_seen": 265555855, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10919189, + "step": 12308, + "time_per_iteration": 2.569469690322876 + }, + { + "auxiliary_loss_clip": 0.01108633, + "auxiliary_loss_mlp": 0.01027001, + "balance_loss_clip": 1.03809738, + "balance_loss_mlp": 1.01581323, + "epoch": 0.7400571170900345, + "flos": 30873251018880.0, + "grad_norm": 1.8791611495883178, + "language_loss": 0.81353223, + "learning_rate": 6.67789183628896e-07, + "loss": 0.83488858, + "num_input_tokens_seen": 265575455, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11187744, + "step": 12309, + "time_per_iteration": 2.6930274963378906 + }, + { + "auxiliary_loss_clip": 0.0111556, + "auxiliary_loss_mlp": 0.01035981, + "balance_loss_clip": 1.03981328, + "balance_loss_mlp": 1.02375567, + "epoch": 0.7401172403427025, + "flos": 27712543711680.0, + "grad_norm": 2.640999497916875, + "language_loss": 0.72588938, + "learning_rate": 6.674987259277692e-07, + "loss": 0.74740481, + "num_input_tokens_seen": 265595250, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12225342, + "step": 12310, + "time_per_iteration": 2.6233553886413574 + }, + { + "auxiliary_loss_clip": 0.01115035, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.03981876, + "balance_loss_mlp": 1.02680016, + "epoch": 0.7401773635953706, + "flos": 22046643948960.0, + "grad_norm": 2.5118760195400442, + "language_loss": 0.88632196, + "learning_rate": 6.672083187552239e-07, + "loss": 0.90786505, + "num_input_tokens_seen": 265606945, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12475586, + "step": 12311, + "time_per_iteration": 2.611548662185669 + }, + { + "auxiliary_loss_clip": 0.0111111, + "auxiliary_loss_mlp": 0.0102638, + "balance_loss_clip": 1.03735137, + "balance_loss_mlp": 1.01570463, + "epoch": 0.7402374868480385, + "flos": 27713435091840.0, + "grad_norm": 2.0746757642725546, + "language_loss": 0.80427891, + "learning_rate": 6.669179621222738e-07, + "loss": 0.82565379, + "num_input_tokens_seen": 265626115, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.10675049, + "step": 12312, + "time_per_iteration": 2.63090181350708 + }, + { + "auxiliary_loss_clip": 0.01112205, + "auxiliary_loss_mlp": 0.01031578, + "balance_loss_clip": 1.04054511, + "balance_loss_mlp": 1.02030122, + "epoch": 0.7402976101007065, + "flos": 27889943479200.0, + "grad_norm": 1.7956801915919118, + "language_loss": 0.78456849, + "learning_rate": 6.666276560399273e-07, + "loss": 0.80600631, + "num_input_tokens_seen": 265646520, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11279297, + "step": 12313, + "time_per_iteration": 2.678830146789551 + }, + { + "auxiliary_loss_clip": 0.01114084, + "auxiliary_loss_mlp": 0.01034049, + "balance_loss_clip": 1.03824639, + "balance_loss_mlp": 1.02174652, + "epoch": 0.7403577333533744, + "flos": 15064042063680.0, + "grad_norm": 1.9803606239960079, + "language_loss": 0.78236711, + "learning_rate": 6.663374005191937e-07, + "loss": 0.80384839, + "num_input_tokens_seen": 265661875, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12310791, + "step": 12314, + "time_per_iteration": 2.600175619125366 + }, + { + "auxiliary_loss_clip": 0.01031904, + "auxiliary_loss_mlp": 0.0100107, + "balance_loss_clip": 1.00872207, + "balance_loss_mlp": 1.00005567, + "epoch": 0.7404178566060424, + "flos": 73611108624960.0, + "grad_norm": 0.8439364219384041, + "language_loss": 0.55159289, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57192266, + "num_input_tokens_seen": 265721255, + "router_z_loss_clip": 0.23181152, + "router_z_loss_mlp": 0.01014709, + "step": 12315, + "time_per_iteration": 3.2537734508514404 + }, + { + "auxiliary_loss_clip": 0.01109621, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.03950667, + "balance_loss_mlp": 1.01970863, + "epoch": 0.7404779798587103, + "flos": 39065203285920.0, + "grad_norm": 2.084901742868087, + "language_loss": 0.79588795, + "learning_rate": 6.65757041206591e-07, + "loss": 0.8172968, + "num_input_tokens_seen": 265743970, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11566162, + "step": 12316, + "time_per_iteration": 2.746474266052246 + }, + { + "auxiliary_loss_clip": 0.01110256, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.0375948, + "balance_loss_mlp": 1.0206821, + "epoch": 0.7405381031113784, + "flos": 15728065544160.0, + "grad_norm": 2.3963219340983617, + "language_loss": 0.74889147, + "learning_rate": 6.654669374367275e-07, + "loss": 0.7703135, + "num_input_tokens_seen": 265760890, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11273193, + "step": 12317, + "time_per_iteration": 2.589154005050659 + }, + { + "auxiliary_loss_clip": 0.01108618, + "auxiliary_loss_mlp": 0.0103294, + "balance_loss_clip": 1.03879797, + "balance_loss_mlp": 1.02211583, + "epoch": 0.7405982263640463, + "flos": 24683660582400.0, + "grad_norm": 2.3522505199944828, + "language_loss": 0.81636679, + "learning_rate": 6.651768842724917e-07, + "loss": 0.83778232, + "num_input_tokens_seen": 265779600, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10821533, + "step": 12318, + "time_per_iteration": 2.6579606533050537 + }, + { + "auxiliary_loss_clip": 0.01114441, + "auxiliary_loss_mlp": 0.0103095, + "balance_loss_clip": 1.03942752, + "balance_loss_mlp": 1.01903474, + "epoch": 0.7406583496167143, + "flos": 21435246064800.0, + "grad_norm": 2.0305526857365237, + "language_loss": 0.77072251, + "learning_rate": 6.648868817248827e-07, + "loss": 0.79217649, + "num_input_tokens_seen": 265797030, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11907959, + "step": 12319, + "time_per_iteration": 2.590522050857544 + }, + { + "auxiliary_loss_clip": 0.01111146, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.03924704, + "balance_loss_mlp": 1.02061617, + "epoch": 0.7407184728693822, + "flos": 22324351800960.0, + "grad_norm": 2.285878295353673, + "language_loss": 0.63607621, + "learning_rate": 6.64596929804897e-07, + "loss": 0.65750301, + "num_input_tokens_seen": 265815055, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10913086, + "step": 12320, + "time_per_iteration": 2.6554715633392334 + }, + { + "auxiliary_loss_clip": 0.01115526, + "auxiliary_loss_mlp": 0.01035493, + "balance_loss_clip": 1.0395503, + "balance_loss_mlp": 1.02379251, + "epoch": 0.7407785961220502, + "flos": 20365580213280.0, + "grad_norm": 4.523881414487681, + "language_loss": 0.82225156, + "learning_rate": 6.643070285235288e-07, + "loss": 0.84376174, + "num_input_tokens_seen": 265828480, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11694336, + "step": 12321, + "time_per_iteration": 2.5799427032470703 + }, + { + "auxiliary_loss_clip": 0.0111849, + "auxiliary_loss_mlp": 0.01043957, + "balance_loss_clip": 1.03980947, + "balance_loss_mlp": 1.03077865, + "epoch": 0.7408387193747181, + "flos": 26952547495680.0, + "grad_norm": 1.8216699259261155, + "language_loss": 0.71886635, + "learning_rate": 6.640171778917727e-07, + "loss": 0.74049091, + "num_input_tokens_seen": 265845825, + "router_z_loss_clip": 0.78662109, + "router_z_loss_mlp": 0.1317749, + "step": 12322, + "time_per_iteration": 2.662935733795166 + }, + { + "auxiliary_loss_clip": 0.01112883, + "auxiliary_loss_mlp": 0.01033857, + "balance_loss_clip": 1.04014742, + "balance_loss_mlp": 1.02265084, + "epoch": 0.7408988426273861, + "flos": 29574572735520.0, + "grad_norm": 2.3590650840033462, + "language_loss": 0.64110935, + "learning_rate": 6.637273779206183e-07, + "loss": 0.66257668, + "num_input_tokens_seen": 265866335, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11212158, + "step": 12323, + "time_per_iteration": 2.635857343673706 + }, + { + "auxiliary_loss_clip": 0.01113212, + "auxiliary_loss_mlp": 0.01029462, + "balance_loss_clip": 1.03880072, + "balance_loss_mlp": 1.01732647, + "epoch": 0.7409589658800542, + "flos": 35414549848800.0, + "grad_norm": 1.4185397978935537, + "language_loss": 0.7588985, + "learning_rate": 6.634376286210559e-07, + "loss": 0.78032517, + "num_input_tokens_seen": 265888945, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12127686, + "step": 12324, + "time_per_iteration": 2.7252190113067627 + }, + { + "auxiliary_loss_clip": 0.01110624, + "auxiliary_loss_mlp": 0.01021824, + "balance_loss_clip": 1.03744078, + "balance_loss_mlp": 1.01046336, + "epoch": 0.7410190891327221, + "flos": 23612738695200.0, + "grad_norm": 2.0562233783638, + "language_loss": 0.74816167, + "learning_rate": 6.63147930004073e-07, + "loss": 0.76948613, + "num_input_tokens_seen": 265908030, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11364746, + "step": 12325, + "time_per_iteration": 2.59126877784729 + }, + { + "auxiliary_loss_clip": 0.01116744, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.03901911, + "balance_loss_mlp": 1.02189386, + "epoch": 0.7410792123853901, + "flos": 27756863265600.0, + "grad_norm": 1.8063398107317838, + "language_loss": 0.68634969, + "learning_rate": 6.628582820806545e-07, + "loss": 0.70786154, + "num_input_tokens_seen": 265927030, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12530518, + "step": 12326, + "time_per_iteration": 2.650481939315796 + }, + { + "auxiliary_loss_clip": 0.01111468, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.03859878, + "balance_loss_mlp": 1.01616848, + "epoch": 0.741139335638058, + "flos": 30960026331840.0, + "grad_norm": 2.0949757935528526, + "language_loss": 0.89422584, + "learning_rate": 6.625686848617835e-07, + "loss": 0.91561711, + "num_input_tokens_seen": 265945490, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11486816, + "step": 12327, + "time_per_iteration": 2.7033791542053223 + }, + { + "auxiliary_loss_clip": 0.01112579, + "auxiliary_loss_mlp": 0.01028033, + "balance_loss_clip": 1.03974724, + "balance_loss_mlp": 1.01669586, + "epoch": 0.741199458890726, + "flos": 22678300473120.0, + "grad_norm": 1.673123797573636, + "language_loss": 0.85588688, + "learning_rate": 6.62279138358442e-07, + "loss": 0.87729299, + "num_input_tokens_seen": 265963265, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11340332, + "step": 12328, + "time_per_iteration": 4.190820932388306 + }, + { + "auxiliary_loss_clip": 0.01110005, + "auxiliary_loss_mlp": 0.01026879, + "balance_loss_clip": 1.03860569, + "balance_loss_mlp": 1.01470137, + "epoch": 0.7412595821433939, + "flos": 26999622224640.0, + "grad_norm": 1.9062985517259636, + "language_loss": 0.6707561, + "learning_rate": 6.619896425816103e-07, + "loss": 0.69212496, + "num_input_tokens_seen": 265982270, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.12176514, + "step": 12329, + "time_per_iteration": 2.684238910675049 + }, + { + "auxiliary_loss_clip": 0.01118067, + "auxiliary_loss_mlp": 0.01031952, + "balance_loss_clip": 1.04050672, + "balance_loss_mlp": 1.02044833, + "epoch": 0.741319705396062, + "flos": 35593489272960.0, + "grad_norm": 1.856599327757284, + "language_loss": 0.66659391, + "learning_rate": 6.617001975422647e-07, + "loss": 0.68809414, + "num_input_tokens_seen": 266003835, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.11505127, + "step": 12330, + "time_per_iteration": 2.659724712371826 + }, + { + "auxiliary_loss_clip": 0.01120098, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.04167759, + "balance_loss_mlp": 1.01850796, + "epoch": 0.7413798286487299, + "flos": 25218534025440.0, + "grad_norm": 2.7905945137698747, + "language_loss": 0.85403061, + "learning_rate": 6.614108032513823e-07, + "loss": 0.87555182, + "num_input_tokens_seen": 266021595, + "router_z_loss_clip": 0.78369141, + "router_z_loss_mlp": 0.1350708, + "step": 12331, + "time_per_iteration": 3.893826484680176 + }, + { + "auxiliary_loss_clip": 0.01114097, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.03828621, + "balance_loss_mlp": 1.01758873, + "epoch": 0.7414399519013979, + "flos": 20009646194400.0, + "grad_norm": 1.9307506940696928, + "language_loss": 0.69398952, + "learning_rate": 6.611214597199364e-07, + "loss": 0.71542645, + "num_input_tokens_seen": 266039860, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11999512, + "step": 12332, + "time_per_iteration": 2.6410484313964844 + }, + { + "auxiliary_loss_clip": 0.01114544, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.04046345, + "balance_loss_mlp": 1.02290845, + "epoch": 0.7415000751540658, + "flos": 31274395971840.0, + "grad_norm": 2.4933348685380103, + "language_loss": 0.63221675, + "learning_rate": 6.608321669588984e-07, + "loss": 0.6537168, + "num_input_tokens_seen": 266058050, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12542725, + "step": 12333, + "time_per_iteration": 2.648033857345581 + }, + { + "auxiliary_loss_clip": 0.01111572, + "auxiliary_loss_mlp": 0.01033403, + "balance_loss_clip": 1.04173601, + "balance_loss_mlp": 1.02201259, + "epoch": 0.7415601984067338, + "flos": 29894817381120.0, + "grad_norm": 1.6840493542827453, + "language_loss": 0.71036267, + "learning_rate": 6.605429249792387e-07, + "loss": 0.73181248, + "num_input_tokens_seen": 266078060, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.11395264, + "step": 12334, + "time_per_iteration": 2.7330551147460938 + }, + { + "auxiliary_loss_clip": 0.01111825, + "auxiliary_loss_mlp": 0.01024886, + "balance_loss_clip": 1.03845096, + "balance_loss_mlp": 1.01362073, + "epoch": 0.7416203216594017, + "flos": 25486517730240.0, + "grad_norm": 2.1726459730502716, + "language_loss": 0.82414615, + "learning_rate": 6.602537337919257e-07, + "loss": 0.84551328, + "num_input_tokens_seen": 266097110, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.1126709, + "step": 12335, + "time_per_iteration": 2.616164207458496 + }, + { + "auxiliary_loss_clip": 0.01113808, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.03937364, + "balance_loss_mlp": 1.01758015, + "epoch": 0.7416804449120697, + "flos": 19065159686880.0, + "grad_norm": 13.95116349790573, + "language_loss": 0.75012243, + "learning_rate": 6.599645934079259e-07, + "loss": 0.77155882, + "num_input_tokens_seen": 266110870, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12237549, + "step": 12336, + "time_per_iteration": 2.593079090118408 + }, + { + "auxiliary_loss_clip": 0.01116702, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.04136825, + "balance_loss_mlp": 1.01992834, + "epoch": 0.7417405681647377, + "flos": 20890932095520.0, + "grad_norm": 1.9871181760893897, + "language_loss": 0.73582006, + "learning_rate": 6.596755038382029e-07, + "loss": 0.75730681, + "num_input_tokens_seen": 266127845, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.1204834, + "step": 12337, + "time_per_iteration": 2.607093572616577 + }, + { + "auxiliary_loss_clip": 0.01112526, + "auxiliary_loss_mlp": 0.0103445, + "balance_loss_clip": 1.04150558, + "balance_loss_mlp": 1.02310717, + "epoch": 0.7418006914174057, + "flos": 23037637943520.0, + "grad_norm": 1.8922298297476576, + "language_loss": 0.76509297, + "learning_rate": 6.593864650937186e-07, + "loss": 0.78656274, + "num_input_tokens_seen": 266145400, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11352539, + "step": 12338, + "time_per_iteration": 4.07167387008667 + }, + { + "auxiliary_loss_clip": 0.01110579, + "auxiliary_loss_mlp": 0.01025592, + "balance_loss_clip": 1.03818631, + "balance_loss_mlp": 1.01526809, + "epoch": 0.7418608146700737, + "flos": 26332843569120.0, + "grad_norm": 1.7526146906498665, + "language_loss": 0.72800994, + "learning_rate": 6.590974771854345e-07, + "loss": 0.74937159, + "num_input_tokens_seen": 266164430, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10327148, + "step": 12339, + "time_per_iteration": 3.900883436203003 + }, + { + "auxiliary_loss_clip": 0.01114706, + "auxiliary_loss_mlp": 0.01029437, + "balance_loss_clip": 1.04086041, + "balance_loss_mlp": 1.01820147, + "epoch": 0.7419209379227416, + "flos": 27260515405440.0, + "grad_norm": 2.0497366711138163, + "language_loss": 0.80196667, + "learning_rate": 6.588085401243077e-07, + "loss": 0.82340813, + "num_input_tokens_seen": 266183855, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11236572, + "step": 12340, + "time_per_iteration": 2.7125561237335205 + }, + { + "auxiliary_loss_clip": 0.01112443, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.03831208, + "balance_loss_mlp": 1.02094686, + "epoch": 0.7419810611754096, + "flos": 20453206389120.0, + "grad_norm": 1.5692067615003162, + "language_loss": 0.75605965, + "learning_rate": 6.585196539212958e-07, + "loss": 0.77750999, + "num_input_tokens_seen": 266202085, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11627197, + "step": 12341, + "time_per_iteration": 2.7312166690826416 + }, + { + "auxiliary_loss_clip": 0.01106983, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.03993368, + "balance_loss_mlp": 1.02050149, + "epoch": 0.7420411844280775, + "flos": 31985575215840.0, + "grad_norm": 1.7492005065677398, + "language_loss": 0.80003643, + "learning_rate": 6.582308185873535e-07, + "loss": 0.82142401, + "num_input_tokens_seen": 266223445, + "router_z_loss_clip": 0.67041016, + "router_z_loss_mlp": 0.112854, + "step": 12342, + "time_per_iteration": 2.6908912658691406 + }, + { + "auxiliary_loss_clip": 0.01110785, + "auxiliary_loss_mlp": 0.01027304, + "balance_loss_clip": 1.03883195, + "balance_loss_mlp": 1.01634312, + "epoch": 0.7421013076807456, + "flos": 83623964876640.0, + "grad_norm": 1.7196331862683691, + "language_loss": 0.77569914, + "learning_rate": 6.57942034133433e-07, + "loss": 0.79708004, + "num_input_tokens_seen": 266246575, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10961914, + "step": 12343, + "time_per_iteration": 3.0017471313476562 + }, + { + "auxiliary_loss_clip": 0.01110674, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.03730452, + "balance_loss_mlp": 1.0197866, + "epoch": 0.7421614309334135, + "flos": 29804152409280.0, + "grad_norm": 1.8620229894271079, + "language_loss": 0.67532778, + "learning_rate": 6.576533005704843e-07, + "loss": 0.69674957, + "num_input_tokens_seen": 266266055, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.1171875, + "step": 12344, + "time_per_iteration": 2.742884397506714 + }, + { + "auxiliary_loss_clip": 0.01115158, + "auxiliary_loss_mlp": 0.01034522, + "balance_loss_clip": 1.04020143, + "balance_loss_mlp": 1.02186215, + "epoch": 0.7422215541860815, + "flos": 15021505270080.0, + "grad_norm": 4.2257253970542825, + "language_loss": 0.81518555, + "learning_rate": 6.573646179094572e-07, + "loss": 0.83668232, + "num_input_tokens_seen": 266282240, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12670898, + "step": 12345, + "time_per_iteration": 2.629859447479248 + }, + { + "auxiliary_loss_clip": 0.01114002, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.03965116, + "balance_loss_mlp": 1.02148557, + "epoch": 0.7422816774387494, + "flos": 23971549440960.0, + "grad_norm": 1.975679501652599, + "language_loss": 0.70505965, + "learning_rate": 6.570759861612988e-07, + "loss": 0.72653359, + "num_input_tokens_seen": 266300980, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11901855, + "step": 12346, + "time_per_iteration": 2.6528220176696777 + }, + { + "auxiliary_loss_clip": 0.01113315, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.03977573, + "balance_loss_mlp": 1.02064133, + "epoch": 0.7423418006914174, + "flos": 39066702425280.0, + "grad_norm": 1.5494757647696182, + "language_loss": 0.73041618, + "learning_rate": 6.56787405336953e-07, + "loss": 0.75187337, + "num_input_tokens_seen": 266322215, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11773682, + "step": 12347, + "time_per_iteration": 2.7118968963623047 + }, + { + "auxiliary_loss_clip": 0.01116158, + "auxiliary_loss_mlp": 0.0103269, + "balance_loss_clip": 1.03957057, + "balance_loss_mlp": 1.02110279, + "epoch": 0.7424019239440853, + "flos": 23082322152960.0, + "grad_norm": 1.8118794956442463, + "language_loss": 0.80965167, + "learning_rate": 6.564988754473642e-07, + "loss": 0.83114016, + "num_input_tokens_seen": 266341600, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.1159668, + "step": 12348, + "time_per_iteration": 2.633517265319824 + }, + { + "auxiliary_loss_clip": 0.01110981, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.03836989, + "balance_loss_mlp": 1.01745486, + "epoch": 0.7424620471967533, + "flos": 43778269981440.0, + "grad_norm": 1.672387871197137, + "language_loss": 0.72709656, + "learning_rate": 6.562103965034724e-07, + "loss": 0.74849522, + "num_input_tokens_seen": 266362895, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11425781, + "step": 12349, + "time_per_iteration": 2.735536575317383 + }, + { + "auxiliary_loss_clip": 0.01114824, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.03802156, + "balance_loss_mlp": 1.02071595, + "epoch": 0.7425221704494213, + "flos": 32966156269440.0, + "grad_norm": 2.3633111604873775, + "language_loss": 0.78654683, + "learning_rate": 6.559219685162165e-07, + "loss": 0.80803084, + "num_input_tokens_seen": 266384015, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12860107, + "step": 12350, + "time_per_iteration": 2.7049877643585205 + }, + { + "auxiliary_loss_clip": 0.01112744, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.03910255, + "balance_loss_mlp": 1.02218366, + "epoch": 0.7425822937020893, + "flos": 41691847495680.0, + "grad_norm": 1.875670595094636, + "language_loss": 0.75383425, + "learning_rate": 6.556335914965343e-07, + "loss": 0.77530003, + "num_input_tokens_seen": 266405990, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11663818, + "step": 12351, + "time_per_iteration": 2.742678165435791 + }, + { + "auxiliary_loss_clip": 0.01112216, + "auxiliary_loss_mlp": 0.01024906, + "balance_loss_clip": 1.03923023, + "balance_loss_mlp": 1.01347387, + "epoch": 0.7426424169547573, + "flos": 25970710406400.0, + "grad_norm": 10.48013080008227, + "language_loss": 0.81385338, + "learning_rate": 6.553452654553611e-07, + "loss": 0.83522463, + "num_input_tokens_seen": 266424260, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11437988, + "step": 12352, + "time_per_iteration": 2.638638734817505 + }, + { + "auxiliary_loss_clip": 0.01113676, + "auxiliary_loss_mlp": 0.01035636, + "balance_loss_clip": 1.04081082, + "balance_loss_mlp": 1.02430463, + "epoch": 0.7427025402074252, + "flos": 27392420617920.0, + "grad_norm": 1.9258593771830306, + "language_loss": 0.71414131, + "learning_rate": 6.550569904036307e-07, + "loss": 0.73563439, + "num_input_tokens_seen": 266444580, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11322021, + "step": 12353, + "time_per_iteration": 2.7179274559020996 + }, + { + "auxiliary_loss_clip": 0.01114449, + "auxiliary_loss_mlp": 0.01037019, + "balance_loss_clip": 1.04186368, + "balance_loss_mlp": 1.02618337, + "epoch": 0.7427626634600932, + "flos": 27485070936480.0, + "grad_norm": 1.8224448293520235, + "language_loss": 0.72229528, + "learning_rate": 6.547687663522739e-07, + "loss": 0.74381, + "num_input_tokens_seen": 266465640, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10839844, + "step": 12354, + "time_per_iteration": 2.676905870437622 + }, + { + "auxiliary_loss_clip": 0.01031527, + "auxiliary_loss_mlp": 0.01003723, + "balance_loss_clip": 1.00849652, + "balance_loss_mlp": 1.00262892, + "epoch": 0.7428227867127611, + "flos": 82010558648160.0, + "grad_norm": 0.7011044228770855, + "language_loss": 0.59532356, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61567605, + "num_input_tokens_seen": 266531950, + "router_z_loss_clip": 0.23034668, + "router_z_loss_mlp": 0.01094055, + "step": 12355, + "time_per_iteration": 3.3878273963928223 + }, + { + "auxiliary_loss_clip": 0.01113775, + "auxiliary_loss_mlp": 0.01026494, + "balance_loss_clip": 1.03947675, + "balance_loss_mlp": 1.01494241, + "epoch": 0.7428829099654292, + "flos": 17962357050720.0, + "grad_norm": 2.2532500278048953, + "language_loss": 0.67552245, + "learning_rate": 6.541924712943971e-07, + "loss": 0.69692516, + "num_input_tokens_seen": 266550665, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11553955, + "step": 12356, + "time_per_iteration": 2.681029796600342 + }, + { + "auxiliary_loss_clip": 0.01112244, + "auxiliary_loss_mlp": 0.01035861, + "balance_loss_clip": 1.03621268, + "balance_loss_mlp": 1.02398157, + "epoch": 0.7429430332180971, + "flos": 59361181162560.0, + "grad_norm": 1.8754939225370495, + "language_loss": 0.72011787, + "learning_rate": 6.539044003097301e-07, + "loss": 0.74159896, + "num_input_tokens_seen": 266572455, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11889648, + "step": 12357, + "time_per_iteration": 2.8584511280059814 + }, + { + "auxiliary_loss_clip": 0.01109418, + "auxiliary_loss_mlp": 0.01028645, + "balance_loss_clip": 1.04046726, + "balance_loss_mlp": 1.01818466, + "epoch": 0.7430031564707651, + "flos": 20455232253120.0, + "grad_norm": 1.7706400609166202, + "language_loss": 0.6483556, + "learning_rate": 6.53616380369143e-07, + "loss": 0.66973621, + "num_input_tokens_seen": 266590895, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.10455322, + "step": 12358, + "time_per_iteration": 2.5909464359283447 + }, + { + "auxiliary_loss_clip": 0.011162, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.04072654, + "balance_loss_mlp": 1.0223546, + "epoch": 0.743063279723433, + "flos": 29127892710240.0, + "grad_norm": 2.3634685719963295, + "language_loss": 0.81128824, + "learning_rate": 6.533284114835591e-07, + "loss": 0.83280075, + "num_input_tokens_seen": 266607660, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12701416, + "step": 12359, + "time_per_iteration": 2.661198854446411 + }, + { + "auxiliary_loss_clip": 0.01110474, + "auxiliary_loss_mlp": 0.01026363, + "balance_loss_clip": 1.03761339, + "balance_loss_mlp": 1.0148828, + "epoch": 0.743123402976101, + "flos": 17560361234880.0, + "grad_norm": 2.0488567269163203, + "language_loss": 0.68219614, + "learning_rate": 6.530404936638956e-07, + "loss": 0.70356447, + "num_input_tokens_seen": 266624260, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11486816, + "step": 12360, + "time_per_iteration": 2.600665330886841 + }, + { + "auxiliary_loss_clip": 0.01109651, + "auxiliary_loss_mlp": 0.01029949, + "balance_loss_clip": 1.03676414, + "balance_loss_mlp": 1.01816559, + "epoch": 0.7431835262287689, + "flos": 33500300401440.0, + "grad_norm": 1.68799640798161, + "language_loss": 0.7280221, + "learning_rate": 6.527526269210715e-07, + "loss": 0.74941814, + "num_input_tokens_seen": 266644210, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11773682, + "step": 12361, + "time_per_iteration": 2.694167375564575 + }, + { + "auxiliary_loss_clip": 0.01113519, + "auxiliary_loss_mlp": 0.01033114, + "balance_loss_clip": 1.03942895, + "balance_loss_mlp": 1.02171743, + "epoch": 0.743243649481437, + "flos": 25574873217120.0, + "grad_norm": 2.0809995310154425, + "language_loss": 0.55264866, + "learning_rate": 6.524648112660027e-07, + "loss": 0.57411504, + "num_input_tokens_seen": 266664230, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11401367, + "step": 12362, + "time_per_iteration": 2.63944411277771 + }, + { + "auxiliary_loss_clip": 0.01114003, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.04123783, + "balance_loss_mlp": 1.01655829, + "epoch": 0.7433037727341049, + "flos": 27800291439360.0, + "grad_norm": 2.006893718898059, + "language_loss": 0.7746731, + "learning_rate": 6.521770467096039e-07, + "loss": 0.79609823, + "num_input_tokens_seen": 266683270, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11956787, + "step": 12363, + "time_per_iteration": 2.608581304550171 + }, + { + "auxiliary_loss_clip": 0.01109755, + "auxiliary_loss_mlp": 0.0102853, + "balance_loss_clip": 1.03794706, + "balance_loss_mlp": 1.01768768, + "epoch": 0.7433638959867729, + "flos": 27084898398240.0, + "grad_norm": 2.1555709878668776, + "language_loss": 0.7791636, + "learning_rate": 6.518893332627862e-07, + "loss": 0.80054641, + "num_input_tokens_seen": 266701235, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10839844, + "step": 12364, + "time_per_iteration": 2.641832113265991 + }, + { + "auxiliary_loss_clip": 0.01109958, + "auxiliary_loss_mlp": 0.01030049, + "balance_loss_clip": 1.03691792, + "balance_loss_mlp": 1.01855147, + "epoch": 0.7434240192394409, + "flos": 28426761751680.0, + "grad_norm": 1.8933485093179812, + "language_loss": 0.79055333, + "learning_rate": 6.516016709364604e-07, + "loss": 0.81195343, + "num_input_tokens_seen": 266721495, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11499023, + "step": 12365, + "time_per_iteration": 2.634460210800171 + }, + { + "auxiliary_loss_clip": 0.0111465, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.0389384, + "balance_loss_mlp": 1.0194186, + "epoch": 0.7434841424921088, + "flos": 65905652168640.0, + "grad_norm": 2.571863329565334, + "language_loss": 0.76859528, + "learning_rate": 6.513140597415346e-07, + "loss": 0.79005527, + "num_input_tokens_seen": 266747400, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11920166, + "step": 12366, + "time_per_iteration": 2.971946954727173 + }, + { + "auxiliary_loss_clip": 0.011092, + "auxiliary_loss_mlp": 0.01026219, + "balance_loss_clip": 1.03922212, + "balance_loss_mlp": 1.01606214, + "epoch": 0.7435442657447768, + "flos": 26153823110400.0, + "grad_norm": 1.4838030458406937, + "language_loss": 0.71614063, + "learning_rate": 6.510264996889141e-07, + "loss": 0.73749483, + "num_input_tokens_seen": 266767630, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10162354, + "step": 12367, + "time_per_iteration": 2.6485466957092285 + }, + { + "auxiliary_loss_clip": 0.01115305, + "auxiliary_loss_mlp": 0.01034573, + "balance_loss_clip": 1.0403285, + "balance_loss_mlp": 1.02332532, + "epoch": 0.7436043889974447, + "flos": 29894452725600.0, + "grad_norm": 5.01238792197476, + "language_loss": 0.74410808, + "learning_rate": 6.507389907895038e-07, + "loss": 0.76560688, + "num_input_tokens_seen": 266788015, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11248779, + "step": 12368, + "time_per_iteration": 4.073485851287842 + }, + { + "auxiliary_loss_clip": 0.01109849, + "auxiliary_loss_mlp": 0.01031968, + "balance_loss_clip": 1.03939307, + "balance_loss_mlp": 1.02176929, + "epoch": 0.7436645122501128, + "flos": 49661756303040.0, + "grad_norm": 1.654310439594209, + "language_loss": 0.69285619, + "learning_rate": 6.50451533054207e-07, + "loss": 0.71427441, + "num_input_tokens_seen": 266809010, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10205078, + "step": 12369, + "time_per_iteration": 2.8119566440582275 + }, + { + "auxiliary_loss_clip": 0.01112724, + "auxiliary_loss_mlp": 0.01026093, + "balance_loss_clip": 1.03949416, + "balance_loss_mlp": 1.01460111, + "epoch": 0.7437246355027807, + "flos": 23073165247680.0, + "grad_norm": 1.8069020090812984, + "language_loss": 0.75706369, + "learning_rate": 6.501641264939233e-07, + "loss": 0.77845186, + "num_input_tokens_seen": 266825390, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11486816, + "step": 12370, + "time_per_iteration": 3.96474552154541 + }, + { + "auxiliary_loss_clip": 0.01113608, + "auxiliary_loss_mlp": 0.01033565, + "balance_loss_clip": 1.04094648, + "balance_loss_mlp": 1.02162004, + "epoch": 0.7437847587554487, + "flos": 26287835221440.0, + "grad_norm": 1.621235089636437, + "language_loss": 0.78659761, + "learning_rate": 6.498767711195503e-07, + "loss": 0.80806935, + "num_input_tokens_seen": 266844675, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11950684, + "step": 12371, + "time_per_iteration": 2.656628131866455 + }, + { + "auxiliary_loss_clip": 0.01112842, + "auxiliary_loss_mlp": 0.01024887, + "balance_loss_clip": 1.03916621, + "balance_loss_mlp": 1.01337171, + "epoch": 0.7438448820081166, + "flos": 33900756560640.0, + "grad_norm": 1.7542692109744336, + "language_loss": 0.69472611, + "learning_rate": 6.495894669419857e-07, + "loss": 0.71610337, + "num_input_tokens_seen": 266865160, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11523438, + "step": 12372, + "time_per_iteration": 2.669940233230591 + }, + { + "auxiliary_loss_clip": 0.0111155, + "auxiliary_loss_mlp": 0.01031642, + "balance_loss_clip": 1.03954577, + "balance_loss_mlp": 1.02044821, + "epoch": 0.7439050052607846, + "flos": 21924179262720.0, + "grad_norm": 2.2410136086768975, + "language_loss": 0.75064605, + "learning_rate": 6.493022139721245e-07, + "loss": 0.77207798, + "num_input_tokens_seen": 266883285, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11199951, + "step": 12373, + "time_per_iteration": 2.6031899452209473 + }, + { + "auxiliary_loss_clip": 0.01114617, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.03895807, + "balance_loss_mlp": 1.02190316, + "epoch": 0.7439651285134525, + "flos": 28015122823200.0, + "grad_norm": 1.8338098543690198, + "language_loss": 0.770509, + "learning_rate": 6.49015012220858e-07, + "loss": 0.79200065, + "num_input_tokens_seen": 266900960, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12628174, + "step": 12374, + "time_per_iteration": 2.749168634414673 + }, + { + "auxiliary_loss_clip": 0.01112931, + "auxiliary_loss_mlp": 0.01032612, + "balance_loss_clip": 1.03905678, + "balance_loss_mlp": 1.02066731, + "epoch": 0.7440252517661206, + "flos": 22947945386400.0, + "grad_norm": 2.214792930433864, + "language_loss": 0.76348174, + "learning_rate": 6.487278616990774e-07, + "loss": 0.78493714, + "num_input_tokens_seen": 266917710, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11950684, + "step": 12375, + "time_per_iteration": 2.620025157928467 + }, + { + "auxiliary_loss_clip": 0.01110661, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.03901696, + "balance_loss_mlp": 1.01978159, + "epoch": 0.7440853750187885, + "flos": 24728952551040.0, + "grad_norm": 2.001206207884676, + "language_loss": 0.77278996, + "learning_rate": 6.484407624176733e-07, + "loss": 0.79419833, + "num_input_tokens_seen": 266934220, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10394287, + "step": 12376, + "time_per_iteration": 2.652287721633911 + }, + { + "auxiliary_loss_clip": 0.01113887, + "auxiliary_loss_mlp": 0.01027801, + "balance_loss_clip": 1.03931999, + "balance_loss_mlp": 1.01580906, + "epoch": 0.7441454982714565, + "flos": 30917003330880.0, + "grad_norm": 1.7580613465834767, + "language_loss": 0.7938987, + "learning_rate": 6.481537143875296e-07, + "loss": 0.8153156, + "num_input_tokens_seen": 266955210, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11975098, + "step": 12377, + "time_per_iteration": 4.131040334701538 + }, + { + "auxiliary_loss_clip": 0.0111544, + "auxiliary_loss_mlp": 0.01030457, + "balance_loss_clip": 1.04019237, + "balance_loss_mlp": 1.01872063, + "epoch": 0.7442056215241245, + "flos": 78683587475040.0, + "grad_norm": 1.9838024112205574, + "language_loss": 0.66930109, + "learning_rate": 6.478667176195322e-07, + "loss": 0.69076008, + "num_input_tokens_seen": 266976555, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11737061, + "step": 12378, + "time_per_iteration": 3.0442211627960205 + }, + { + "auxiliary_loss_clip": 0.01115117, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.04044151, + "balance_loss_mlp": 1.0216049, + "epoch": 0.7442657447767924, + "flos": 38175246686880.0, + "grad_norm": 2.3050912290040078, + "language_loss": 0.71795452, + "learning_rate": 6.475797721245648e-07, + "loss": 0.73945105, + "num_input_tokens_seen": 266997640, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12945557, + "step": 12379, + "time_per_iteration": 4.081618309020996 + }, + { + "auxiliary_loss_clip": 0.0111137, + "auxiliary_loss_mlp": 0.0103292, + "balance_loss_clip": 1.03802896, + "balance_loss_mlp": 1.02122009, + "epoch": 0.7443258680294604, + "flos": 25390382925600.0, + "grad_norm": 2.1600294999055003, + "language_loss": 0.65362287, + "learning_rate": 6.472928779135085e-07, + "loss": 0.67506576, + "num_input_tokens_seen": 267016165, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11706543, + "step": 12380, + "time_per_iteration": 2.622978448867798 + }, + { + "auxiliary_loss_clip": 0.01114421, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.04024196, + "balance_loss_mlp": 1.0176996, + "epoch": 0.7443859912821283, + "flos": 27088423401600.0, + "grad_norm": 2.047658544264971, + "language_loss": 0.78183484, + "learning_rate": 6.470060349972411e-07, + "loss": 0.80327308, + "num_input_tokens_seen": 267034075, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11712646, + "step": 12381, + "time_per_iteration": 2.6600418090820312 + }, + { + "auxiliary_loss_clip": 0.01116621, + "auxiliary_loss_mlp": 0.0103387, + "balance_loss_clip": 1.04162717, + "balance_loss_mlp": 1.02160966, + "epoch": 0.7444461145347964, + "flos": 26866298907360.0, + "grad_norm": 2.346373615452217, + "language_loss": 0.72327089, + "learning_rate": 6.467192433866411e-07, + "loss": 0.74477583, + "num_input_tokens_seen": 267053645, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12249756, + "step": 12382, + "time_per_iteration": 2.630784749984741 + }, + { + "auxiliary_loss_clip": 0.01031871, + "auxiliary_loss_mlp": 0.01000951, + "balance_loss_clip": 1.00878119, + "balance_loss_mlp": 0.99988687, + "epoch": 0.7445062377874643, + "flos": 86095257069600.0, + "grad_norm": 0.7188310372989372, + "language_loss": 0.54646063, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56678885, + "num_input_tokens_seen": 267121830, + "router_z_loss_clip": 0.23059082, + "router_z_loss_mlp": 0.01064301, + "step": 12383, + "time_per_iteration": 3.446345567703247 + }, + { + "auxiliary_loss_clip": 0.0111212, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.03802156, + "balance_loss_mlp": 1.01893783, + "epoch": 0.7445663610401323, + "flos": 27045116779680.0, + "grad_norm": 3.6589553284385796, + "language_loss": 0.7586264, + "learning_rate": 6.461458141259395e-07, + "loss": 0.78005219, + "num_input_tokens_seen": 267141145, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11535645, + "step": 12384, + "time_per_iteration": 2.646209239959717 + }, + { + "auxiliary_loss_clip": 0.01110104, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.03754139, + "balance_loss_mlp": 1.01858735, + "epoch": 0.7446264842928002, + "flos": 29492862082560.0, + "grad_norm": 2.1398377712338865, + "language_loss": 0.79399288, + "learning_rate": 6.458591764975823e-07, + "loss": 0.81539708, + "num_input_tokens_seen": 267159280, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11724854, + "step": 12385, + "time_per_iteration": 2.6462643146514893 + }, + { + "auxiliary_loss_clip": 0.01115764, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.03973866, + "balance_loss_mlp": 1.02048147, + "epoch": 0.7446866075454682, + "flos": 29449231322400.0, + "grad_norm": 1.5945528310384407, + "language_loss": 0.81456602, + "learning_rate": 6.455725902183813e-07, + "loss": 0.8360582, + "num_input_tokens_seen": 267179390, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12963867, + "step": 12386, + "time_per_iteration": 2.674868583679199 + }, + { + "auxiliary_loss_clip": 0.01111442, + "auxiliary_loss_mlp": 0.01031358, + "balance_loss_clip": 1.0404129, + "balance_loss_mlp": 1.01997924, + "epoch": 0.7447467307981361, + "flos": 28733797764000.0, + "grad_norm": 1.7168480969521698, + "language_loss": 0.71014613, + "learning_rate": 6.452860552992037e-07, + "loss": 0.73157418, + "num_input_tokens_seen": 267198165, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11376953, + "step": 12387, + "time_per_iteration": 2.6512160301208496 + }, + { + "auxiliary_loss_clip": 0.0111346, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.03947425, + "balance_loss_mlp": 1.02063632, + "epoch": 0.7448068540508042, + "flos": 23877116362080.0, + "grad_norm": 2.149294873728005, + "language_loss": 0.70388031, + "learning_rate": 6.449995717509138e-07, + "loss": 0.72533214, + "num_input_tokens_seen": 267214520, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11083984, + "step": 12388, + "time_per_iteration": 2.7119028568267822 + }, + { + "auxiliary_loss_clip": 0.01111455, + "auxiliary_loss_mlp": 0.01027866, + "balance_loss_clip": 1.03872526, + "balance_loss_mlp": 1.01651168, + "epoch": 0.7448669773034721, + "flos": 26648550279360.0, + "grad_norm": 6.610433262064807, + "language_loss": 0.8532443, + "learning_rate": 6.447131395843761e-07, + "loss": 0.87463748, + "num_input_tokens_seen": 267236555, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11352539, + "step": 12389, + "time_per_iteration": 2.6408567428588867 + }, + { + "auxiliary_loss_clip": 0.01112454, + "auxiliary_loss_mlp": 0.01035039, + "balance_loss_clip": 1.03901148, + "balance_loss_mlp": 1.02390468, + "epoch": 0.7449271005561401, + "flos": 30695000388480.0, + "grad_norm": 2.209503799408642, + "language_loss": 0.79542476, + "learning_rate": 6.444267588104526e-07, + "loss": 0.81689972, + "num_input_tokens_seen": 267254800, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11132812, + "step": 12390, + "time_per_iteration": 2.6678388118743896 + }, + { + "auxiliary_loss_clip": 0.01115193, + "auxiliary_loss_mlp": 0.01030342, + "balance_loss_clip": 1.04076779, + "balance_loss_mlp": 1.01823592, + "epoch": 0.7449872238088081, + "flos": 27178075441440.0, + "grad_norm": 1.922900940810093, + "language_loss": 0.84532076, + "learning_rate": 6.441404294400014e-07, + "loss": 0.86677611, + "num_input_tokens_seen": 267274610, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12115479, + "step": 12391, + "time_per_iteration": 2.620473623275757 + }, + { + "auxiliary_loss_clip": 0.01111775, + "auxiliary_loss_mlp": 0.01026835, + "balance_loss_clip": 1.03937304, + "balance_loss_mlp": 1.01544476, + "epoch": 0.745047347061476, + "flos": 25129813883040.0, + "grad_norm": 1.9368125414651118, + "language_loss": 0.73799908, + "learning_rate": 6.438541514838811e-07, + "loss": 0.75938523, + "num_input_tokens_seen": 267292600, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.1138916, + "step": 12392, + "time_per_iteration": 2.648228406906128 + }, + { + "auxiliary_loss_clip": 0.01109636, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.03933001, + "balance_loss_mlp": 1.0244447, + "epoch": 0.745107470314144, + "flos": 27000148949280.0, + "grad_norm": 1.7491705584180273, + "language_loss": 0.76705074, + "learning_rate": 6.435679249529487e-07, + "loss": 0.78850448, + "num_input_tokens_seen": 267311295, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11279297, + "step": 12393, + "time_per_iteration": 2.6090569496154785 + }, + { + "auxiliary_loss_clip": 0.01115238, + "auxiliary_loss_mlp": 0.01033676, + "balance_loss_clip": 1.042068, + "balance_loss_mlp": 1.02120709, + "epoch": 0.745167593566812, + "flos": 27133148128320.0, + "grad_norm": 2.0873918820444204, + "language_loss": 0.72716379, + "learning_rate": 6.432817498580552e-07, + "loss": 0.74865294, + "num_input_tokens_seen": 267328390, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12475586, + "step": 12394, + "time_per_iteration": 2.6958000659942627 + }, + { + "auxiliary_loss_clip": 0.01115967, + "auxiliary_loss_mlp": 0.01032183, + "balance_loss_clip": 1.04180276, + "balance_loss_mlp": 1.02017879, + "epoch": 0.74522771681948, + "flos": 25217115920640.0, + "grad_norm": 1.6244463315049076, + "language_loss": 0.81270778, + "learning_rate": 6.429956262100535e-07, + "loss": 0.8341893, + "num_input_tokens_seen": 267348185, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11999512, + "step": 12395, + "time_per_iteration": 2.693329095840454 + }, + { + "auxiliary_loss_clip": 0.01115979, + "auxiliary_loss_mlp": 0.010339, + "balance_loss_clip": 1.04044485, + "balance_loss_mlp": 1.0217886, + "epoch": 0.7452878400721479, + "flos": 25754704021440.0, + "grad_norm": 2.437716617974404, + "language_loss": 0.71482646, + "learning_rate": 6.427095540197937e-07, + "loss": 0.73632526, + "num_input_tokens_seen": 267367010, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12097168, + "step": 12396, + "time_per_iteration": 2.6440176963806152 + }, + { + "auxiliary_loss_clip": 0.01116147, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_clip": 1.04037619, + "balance_loss_mlp": 1.01736403, + "epoch": 0.7453479633248159, + "flos": 32565254420160.0, + "grad_norm": 1.8591970380254554, + "language_loss": 0.68173265, + "learning_rate": 6.424235332981245e-07, + "loss": 0.70318639, + "num_input_tokens_seen": 267386605, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11859131, + "step": 12397, + "time_per_iteration": 2.6786608695983887 + }, + { + "auxiliary_loss_clip": 0.01111555, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.03949094, + "balance_loss_mlp": 1.02766633, + "epoch": 0.7454080865774838, + "flos": 20763038093760.0, + "grad_norm": 1.9304323700044181, + "language_loss": 0.77130461, + "learning_rate": 6.421375640558908e-07, + "loss": 0.79281425, + "num_input_tokens_seen": 267404135, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11743164, + "step": 12398, + "time_per_iteration": 2.6121559143066406 + }, + { + "auxiliary_loss_clip": 0.01109495, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.03802621, + "balance_loss_mlp": 1.01616287, + "epoch": 0.7454682098301518, + "flos": 26019567895680.0, + "grad_norm": 1.8884805901682167, + "language_loss": 0.78000808, + "learning_rate": 6.418516463039363e-07, + "loss": 0.8013795, + "num_input_tokens_seen": 267423120, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11499023, + "step": 12399, + "time_per_iteration": 2.6660425662994385 + }, + { + "auxiliary_loss_clip": 0.01107549, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.0383544, + "balance_loss_mlp": 1.01964092, + "epoch": 0.7455283330828197, + "flos": 21790572324480.0, + "grad_norm": 1.9660231250517222, + "language_loss": 0.74658763, + "learning_rate": 6.415657800531038e-07, + "loss": 0.76796699, + "num_input_tokens_seen": 267441250, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.10742188, + "step": 12400, + "time_per_iteration": 2.674119234085083 + }, + { + "auxiliary_loss_clip": 0.01110069, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.0386256, + "balance_loss_mlp": 1.01681185, + "epoch": 0.7455884563354878, + "flos": 37551572066880.0, + "grad_norm": 1.9077902879708886, + "language_loss": 0.82028282, + "learning_rate": 6.412799653142327e-07, + "loss": 0.84166408, + "num_input_tokens_seen": 267462820, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11260986, + "step": 12401, + "time_per_iteration": 2.6982614994049072 + }, + { + "auxiliary_loss_clip": 0.01111389, + "auxiliary_loss_mlp": 0.01029693, + "balance_loss_clip": 1.03928065, + "balance_loss_mlp": 1.01858902, + "epoch": 0.7456485795881557, + "flos": 28291209984000.0, + "grad_norm": 2.3795364385567987, + "language_loss": 0.64866853, + "learning_rate": 6.409942020981611e-07, + "loss": 0.67007941, + "num_input_tokens_seen": 267483065, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11108398, + "step": 12402, + "time_per_iteration": 2.6161210536956787 + }, + { + "auxiliary_loss_clip": 0.01110805, + "auxiliary_loss_mlp": 0.01028221, + "balance_loss_clip": 1.03862357, + "balance_loss_mlp": 1.01774812, + "epoch": 0.7457087028408237, + "flos": 47256385724640.0, + "grad_norm": 1.8363631069581263, + "language_loss": 0.73177546, + "learning_rate": 6.407084904157265e-07, + "loss": 0.75316572, + "num_input_tokens_seen": 267504825, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10467529, + "step": 12403, + "time_per_iteration": 2.825690269470215 + }, + { + "auxiliary_loss_clip": 0.01032288, + "auxiliary_loss_mlp": 0.01000816, + "balance_loss_clip": 1.00925255, + "balance_loss_mlp": 0.9998256, + "epoch": 0.7457688260934917, + "flos": 68385365605440.0, + "grad_norm": 0.9246614586288079, + "language_loss": 0.58815426, + "learning_rate": 6.404228302777621e-07, + "loss": 0.60848528, + "num_input_tokens_seen": 267559260, + "router_z_loss_clip": 0.23059082, + "router_z_loss_mlp": 0.00990295, + "step": 12404, + "time_per_iteration": 3.089275598526001 + }, + { + "auxiliary_loss_clip": 0.01108935, + "auxiliary_loss_mlp": 0.01028992, + "balance_loss_clip": 1.03649402, + "balance_loss_mlp": 1.01822758, + "epoch": 0.7458289493461596, + "flos": 24546123468000.0, + "grad_norm": 1.565853562358424, + "language_loss": 0.77675098, + "learning_rate": 6.401372216950995e-07, + "loss": 0.79813027, + "num_input_tokens_seen": 267578720, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.10766602, + "step": 12405, + "time_per_iteration": 2.6933395862579346 + }, + { + "auxiliary_loss_clip": 0.01111535, + "auxiliary_loss_mlp": 0.01035195, + "balance_loss_clip": 1.04054952, + "balance_loss_mlp": 1.023983, + "epoch": 0.7458890725988276, + "flos": 24639624649440.0, + "grad_norm": 1.6436768827506252, + "language_loss": 0.69395292, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71542025, + "num_input_tokens_seen": 267598250, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11212158, + "step": 12406, + "time_per_iteration": 2.6516106128692627 + }, + { + "auxiliary_loss_clip": 0.0111781, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.04100358, + "balance_loss_mlp": 1.02165532, + "epoch": 0.7459491958514956, + "flos": 20766076889760.0, + "grad_norm": 1.8354927264405791, + "language_loss": 0.65117788, + "learning_rate": 6.39566159239002e-07, + "loss": 0.67269903, + "num_input_tokens_seen": 267615430, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12652588, + "step": 12407, + "time_per_iteration": 4.143265962600708 + }, + { + "auxiliary_loss_clip": 0.01114722, + "auxiliary_loss_mlp": 0.01029103, + "balance_loss_clip": 1.03953028, + "balance_loss_mlp": 1.01652598, + "epoch": 0.7460093191041636, + "flos": 30601782828000.0, + "grad_norm": 1.8680944070383696, + "language_loss": 0.72172946, + "learning_rate": 6.392807053872212e-07, + "loss": 0.7431677, + "num_input_tokens_seen": 267635075, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12579346, + "step": 12408, + "time_per_iteration": 2.68198299407959 + }, + { + "auxiliary_loss_clip": 0.01116636, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.04081106, + "balance_loss_mlp": 1.02048659, + "epoch": 0.7460694423568315, + "flos": 26732894555520.0, + "grad_norm": 1.9821019962753985, + "language_loss": 0.72742486, + "learning_rate": 6.38995303134053e-07, + "loss": 0.74892497, + "num_input_tokens_seen": 267654105, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12878418, + "step": 12409, + "time_per_iteration": 2.607534646987915 + }, + { + "auxiliary_loss_clip": 0.01107385, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.03771853, + "balance_loss_mlp": 1.01892984, + "epoch": 0.7461295656094995, + "flos": 25887460096800.0, + "grad_norm": 1.6449075314760744, + "language_loss": 0.65584576, + "learning_rate": 6.38709952490319e-07, + "loss": 0.67721862, + "num_input_tokens_seen": 267673090, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.10968018, + "step": 12410, + "time_per_iteration": 3.989230155944824 + }, + { + "auxiliary_loss_clip": 0.01111562, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.04037023, + "balance_loss_mlp": 1.0212909, + "epoch": 0.7461896888621674, + "flos": 27267727481280.0, + "grad_norm": 3.2301975549210815, + "language_loss": 0.84522563, + "learning_rate": 6.384246534668396e-07, + "loss": 0.86667252, + "num_input_tokens_seen": 267690605, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11828613, + "step": 12411, + "time_per_iteration": 2.674419641494751 + }, + { + "auxiliary_loss_clip": 0.01114505, + "auxiliary_loss_mlp": 0.01028409, + "balance_loss_clip": 1.04015088, + "balance_loss_mlp": 1.01628566, + "epoch": 0.7462498121148354, + "flos": 31094767753920.0, + "grad_norm": 2.2716939650011883, + "language_loss": 0.77695137, + "learning_rate": 6.381394060744339e-07, + "loss": 0.79838049, + "num_input_tokens_seen": 267710540, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12121582, + "step": 12412, + "time_per_iteration": 2.6463406085968018 + }, + { + "auxiliary_loss_clip": 0.0111232, + "auxiliary_loss_mlp": 0.01035083, + "balance_loss_clip": 1.03852987, + "balance_loss_mlp": 1.02365661, + "epoch": 0.7463099353675033, + "flos": 41424390515520.0, + "grad_norm": 4.443098316449746, + "language_loss": 0.62468976, + "learning_rate": 6.378542103239188e-07, + "loss": 0.64616382, + "num_input_tokens_seen": 267730780, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11431885, + "step": 12413, + "time_per_iteration": 2.7716736793518066 + }, + { + "auxiliary_loss_clip": 0.01032094, + "auxiliary_loss_mlp": 0.01001112, + "balance_loss_clip": 1.00909138, + "balance_loss_mlp": 1.00006652, + "epoch": 0.7463700586201714, + "flos": 76589513573760.0, + "grad_norm": 0.723117089847505, + "language_loss": 0.54845762, + "learning_rate": 6.375690662261082e-07, + "loss": 0.5687896, + "num_input_tokens_seen": 267794240, + "router_z_loss_clip": 0.23010254, + "router_z_loss_mlp": 0.0104599, + "step": 12414, + "time_per_iteration": 3.276902675628662 + }, + { + "auxiliary_loss_clip": 0.01112592, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.03836989, + "balance_loss_mlp": 1.02027035, + "epoch": 0.7464301818728393, + "flos": 40796015891040.0, + "grad_norm": 1.8643158652032994, + "language_loss": 0.54854298, + "learning_rate": 6.372839737918154e-07, + "loss": 0.56998909, + "num_input_tokens_seen": 267817190, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11749268, + "step": 12415, + "time_per_iteration": 2.7638866901397705 + }, + { + "auxiliary_loss_clip": 0.01111086, + "auxiliary_loss_mlp": 0.01029061, + "balance_loss_clip": 1.03909612, + "balance_loss_mlp": 1.01726508, + "epoch": 0.7464903051255073, + "flos": 32787176328000.0, + "grad_norm": 2.5094888650052924, + "language_loss": 0.74710488, + "learning_rate": 6.369989330318506e-07, + "loss": 0.76850629, + "num_input_tokens_seen": 267836245, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11785889, + "step": 12416, + "time_per_iteration": 2.6970314979553223 + }, + { + "auxiliary_loss_clip": 0.01111408, + "auxiliary_loss_mlp": 0.01032397, + "balance_loss_clip": 1.03933358, + "balance_loss_mlp": 1.02126265, + "epoch": 0.7465504283781753, + "flos": 53797169658240.0, + "grad_norm": 1.6021676883384899, + "language_loss": 0.69233179, + "learning_rate": 6.367139439570233e-07, + "loss": 0.71376985, + "num_input_tokens_seen": 267858310, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11132812, + "step": 12417, + "time_per_iteration": 4.291043519973755 + }, + { + "auxiliary_loss_clip": 0.01115691, + "auxiliary_loss_mlp": 0.01033034, + "balance_loss_clip": 1.04105353, + "balance_loss_mlp": 1.02090442, + "epoch": 0.7466105516308432, + "flos": 24009345712800.0, + "grad_norm": 1.7798076426989917, + "language_loss": 0.7391203, + "learning_rate": 6.364290065781392e-07, + "loss": 0.7606076, + "num_input_tokens_seen": 267876345, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12139893, + "step": 12418, + "time_per_iteration": 3.879155397415161 + }, + { + "auxiliary_loss_clip": 0.01113094, + "auxiliary_loss_mlp": 0.01025804, + "balance_loss_clip": 1.03990078, + "balance_loss_mlp": 1.01484823, + "epoch": 0.7466706748835112, + "flos": 25039513566720.0, + "grad_norm": 1.7618500183246242, + "language_loss": 0.69057477, + "learning_rate": 6.361441209060039e-07, + "loss": 0.71196371, + "num_input_tokens_seen": 267896740, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.10974121, + "step": 12419, + "time_per_iteration": 2.8201522827148438 + }, + { + "auxiliary_loss_clip": 0.01107192, + "auxiliary_loss_mlp": 0.01029932, + "balance_loss_clip": 1.03844035, + "balance_loss_mlp": 1.0190835, + "epoch": 0.7467307981361792, + "flos": 26466491024640.0, + "grad_norm": 2.225256039578504, + "language_loss": 0.74591976, + "learning_rate": 6.358592869514216e-07, + "loss": 0.76729101, + "num_input_tokens_seen": 267914765, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.10845947, + "step": 12420, + "time_per_iteration": 2.6806302070617676 + }, + { + "auxiliary_loss_clip": 0.01116411, + "auxiliary_loss_mlp": 0.01031963, + "balance_loss_clip": 1.04277849, + "balance_loss_mlp": 1.01976752, + "epoch": 0.7467909213888472, + "flos": 23883720678720.0, + "grad_norm": 2.003379118635151, + "language_loss": 0.67387211, + "learning_rate": 6.355745047251904e-07, + "loss": 0.69535589, + "num_input_tokens_seen": 267934085, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12200928, + "step": 12421, + "time_per_iteration": 2.713651657104492 + }, + { + "auxiliary_loss_clip": 0.01116419, + "auxiliary_loss_mlp": 0.01032685, + "balance_loss_clip": 1.04014468, + "balance_loss_mlp": 1.01973271, + "epoch": 0.7468510446415151, + "flos": 28913263912800.0, + "grad_norm": 2.6840893134158286, + "language_loss": 0.72373968, + "learning_rate": 6.352897742381107e-07, + "loss": 0.74523067, + "num_input_tokens_seen": 267955170, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.1295166, + "step": 12422, + "time_per_iteration": 2.6430609226226807 + }, + { + "auxiliary_loss_clip": 0.01112584, + "auxiliary_loss_mlp": 0.01029048, + "balance_loss_clip": 1.04081964, + "balance_loss_mlp": 1.01737094, + "epoch": 0.7469111678941831, + "flos": 35771739903360.0, + "grad_norm": 2.3339716667051573, + "language_loss": 0.74785095, + "learning_rate": 6.350050955009796e-07, + "loss": 0.7692672, + "num_input_tokens_seen": 267974980, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11682129, + "step": 12423, + "time_per_iteration": 2.6959662437438965 + }, + { + "auxiliary_loss_clip": 0.01108307, + "auxiliary_loss_mlp": 0.01022416, + "balance_loss_clip": 1.03736138, + "balance_loss_mlp": 1.01197934, + "epoch": 0.746971291146851, + "flos": 26598841927200.0, + "grad_norm": 1.499282840313706, + "language_loss": 0.67393398, + "learning_rate": 6.347204685245929e-07, + "loss": 0.69524121, + "num_input_tokens_seen": 267994985, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10424805, + "step": 12424, + "time_per_iteration": 2.6673531532287598 + }, + { + "auxiliary_loss_clip": 0.01115099, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.04104614, + "balance_loss_mlp": 1.0217886, + "epoch": 0.747031414399519, + "flos": 44227664664480.0, + "grad_norm": 2.7946676233732455, + "language_loss": 0.74837959, + "learning_rate": 6.344358933197418e-07, + "loss": 0.76986432, + "num_input_tokens_seen": 268014985, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11584473, + "step": 12425, + "time_per_iteration": 2.7621006965637207 + }, + { + "auxiliary_loss_clip": 0.01111582, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.03848243, + "balance_loss_mlp": 1.01798511, + "epoch": 0.7470915376521869, + "flos": 24373342670400.0, + "grad_norm": 2.2253089345530674, + "language_loss": 0.69548273, + "learning_rate": 6.341513698972194e-07, + "loss": 0.71689367, + "num_input_tokens_seen": 268034395, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11535645, + "step": 12426, + "time_per_iteration": 2.6505980491638184 + }, + { + "auxiliary_loss_clip": 0.01109388, + "auxiliary_loss_mlp": 0.01032815, + "balance_loss_clip": 1.03883374, + "balance_loss_mlp": 1.02182364, + "epoch": 0.747151660904855, + "flos": 24506828056800.0, + "grad_norm": 2.303663770167309, + "language_loss": 0.65713811, + "learning_rate": 6.338668982678139e-07, + "loss": 0.67856014, + "num_input_tokens_seen": 268054485, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10974121, + "step": 12427, + "time_per_iteration": 2.6590919494628906 + }, + { + "auxiliary_loss_clip": 0.01112787, + "auxiliary_loss_mlp": 0.01030556, + "balance_loss_clip": 1.03993273, + "balance_loss_mlp": 1.0183965, + "epoch": 0.7472117841575229, + "flos": 19877133222720.0, + "grad_norm": 1.5644274128444462, + "language_loss": 0.74639559, + "learning_rate": 6.335824784423118e-07, + "loss": 0.767829, + "num_input_tokens_seen": 268072250, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.121521, + "step": 12428, + "time_per_iteration": 2.661649703979492 + }, + { + "auxiliary_loss_clip": 0.01116221, + "auxiliary_loss_mlp": 0.01025434, + "balance_loss_clip": 1.04011989, + "balance_loss_mlp": 1.01266074, + "epoch": 0.7472719074101909, + "flos": 26100792341280.0, + "grad_norm": 2.0444826113461563, + "language_loss": 0.58399224, + "learning_rate": 6.33298110431499e-07, + "loss": 0.60540879, + "num_input_tokens_seen": 268089840, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12774658, + "step": 12429, + "time_per_iteration": 2.589329242706299 + }, + { + "auxiliary_loss_clip": 0.01115667, + "auxiliary_loss_mlp": 0.01027527, + "balance_loss_clip": 1.03960681, + "balance_loss_mlp": 1.01554608, + "epoch": 0.7473320306628589, + "flos": 36171466751520.0, + "grad_norm": 3.387520860112491, + "language_loss": 0.61144489, + "learning_rate": 6.330137942461595e-07, + "loss": 0.63287687, + "num_input_tokens_seen": 268109360, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11987305, + "step": 12430, + "time_per_iteration": 2.7011632919311523 + }, + { + "auxiliary_loss_clip": 0.01112041, + "auxiliary_loss_mlp": 0.01031504, + "balance_loss_clip": 1.04031527, + "balance_loss_mlp": 1.01989853, + "epoch": 0.7473921539155268, + "flos": 29450973565440.0, + "grad_norm": 1.4858800397455647, + "language_loss": 0.75597501, + "learning_rate": 6.327295298970734e-07, + "loss": 0.77741045, + "num_input_tokens_seen": 268131840, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11602783, + "step": 12431, + "time_per_iteration": 2.68117618560791 + }, + { + "auxiliary_loss_clip": 0.01112174, + "auxiliary_loss_mlp": 0.01030163, + "balance_loss_clip": 1.03802979, + "balance_loss_mlp": 1.01828396, + "epoch": 0.7474522771681948, + "flos": 21339111260160.0, + "grad_norm": 2.351423315193609, + "language_loss": 0.75628954, + "learning_rate": 6.32445317395021e-07, + "loss": 0.77771294, + "num_input_tokens_seen": 268148300, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11889648, + "step": 12432, + "time_per_iteration": 2.6103084087371826 + }, + { + "auxiliary_loss_clip": 0.01116562, + "auxiliary_loss_mlp": 0.01035844, + "balance_loss_clip": 1.03967261, + "balance_loss_mlp": 1.02280831, + "epoch": 0.7475124004208628, + "flos": 20410385974560.0, + "grad_norm": 2.993987721226906, + "language_loss": 0.69936764, + "learning_rate": 6.321611567507787e-07, + "loss": 0.72089171, + "num_input_tokens_seen": 268166450, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.13043213, + "step": 12433, + "time_per_iteration": 2.590298652648926 + }, + { + "auxiliary_loss_clip": 0.01112562, + "auxiliary_loss_mlp": 0.01029674, + "balance_loss_clip": 1.03938949, + "balance_loss_mlp": 1.01750875, + "epoch": 0.7475725236735308, + "flos": 24062498033760.0, + "grad_norm": 1.79592276713947, + "language_loss": 0.67177796, + "learning_rate": 6.318770479751232e-07, + "loss": 0.69320035, + "num_input_tokens_seen": 268186165, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12176514, + "step": 12434, + "time_per_iteration": 2.700688362121582 + }, + { + "auxiliary_loss_clip": 0.01106934, + "auxiliary_loss_mlp": 0.01030219, + "balance_loss_clip": 1.03902972, + "balance_loss_mlp": 1.01972246, + "epoch": 0.7476326469261987, + "flos": 32074700531040.0, + "grad_norm": 1.4793232769809368, + "language_loss": 0.79603028, + "learning_rate": 6.315929910788263e-07, + "loss": 0.81740183, + "num_input_tokens_seen": 268208145, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.10491943, + "step": 12435, + "time_per_iteration": 2.647496461868286 + }, + { + "auxiliary_loss_clip": 0.01113232, + "auxiliary_loss_mlp": 0.01025875, + "balance_loss_clip": 1.03911579, + "balance_loss_mlp": 1.01447272, + "epoch": 0.7476927701788667, + "flos": 38842146894240.0, + "grad_norm": 2.021816211105618, + "language_loss": 0.67803347, + "learning_rate": 6.313089860726604e-07, + "loss": 0.69942456, + "num_input_tokens_seen": 268228345, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11401367, + "step": 12436, + "time_per_iteration": 2.721191883087158 + }, + { + "auxiliary_loss_clip": 0.01116395, + "auxiliary_loss_mlp": 0.01029823, + "balance_loss_clip": 1.04034948, + "balance_loss_mlp": 1.01807535, + "epoch": 0.7477528934315346, + "flos": 38797786823040.0, + "grad_norm": 1.6607780670297179, + "language_loss": 0.70491165, + "learning_rate": 6.31025032967396e-07, + "loss": 0.72637385, + "num_input_tokens_seen": 268250260, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11749268, + "step": 12437, + "time_per_iteration": 2.6936726570129395 + }, + { + "auxiliary_loss_clip": 0.01107748, + "auxiliary_loss_mlp": 0.01025366, + "balance_loss_clip": 1.03805029, + "balance_loss_mlp": 1.01484561, + "epoch": 0.7478130166842026, + "flos": 24858264657600.0, + "grad_norm": 2.709357046629343, + "language_loss": 0.67212874, + "learning_rate": 6.307411317737986e-07, + "loss": 0.69345987, + "num_input_tokens_seen": 268268440, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10522461, + "step": 12438, + "time_per_iteration": 2.741617202758789 + }, + { + "auxiliary_loss_clip": 0.01113263, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.03999925, + "balance_loss_mlp": 1.01870072, + "epoch": 0.7478731399368705, + "flos": 22145290824960.0, + "grad_norm": 1.7424461455824074, + "language_loss": 0.80963922, + "learning_rate": 6.304572825026344e-07, + "loss": 0.83107334, + "num_input_tokens_seen": 268285765, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11444092, + "step": 12439, + "time_per_iteration": 2.5773417949676514 + }, + { + "auxiliary_loss_clip": 0.01110967, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.03869796, + "balance_loss_mlp": 1.02559948, + "epoch": 0.7479332631895386, + "flos": 18629500361760.0, + "grad_norm": 1.8543470225626941, + "language_loss": 0.70537108, + "learning_rate": 6.301734851646674e-07, + "loss": 0.72684592, + "num_input_tokens_seen": 268304015, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10925293, + "step": 12440, + "time_per_iteration": 2.639575242996216 + }, + { + "auxiliary_loss_clip": 0.01110031, + "auxiliary_loss_mlp": 0.01024989, + "balance_loss_clip": 1.03989124, + "balance_loss_mlp": 1.01424801, + "epoch": 0.7479933864422065, + "flos": 25798132195200.0, + "grad_norm": 1.745070035499737, + "language_loss": 0.74401224, + "learning_rate": 6.298897397706597e-07, + "loss": 0.76536238, + "num_input_tokens_seen": 268323290, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10736084, + "step": 12441, + "time_per_iteration": 2.660123348236084 + }, + { + "auxiliary_loss_clip": 0.0111511, + "auxiliary_loss_mlp": 0.0103148, + "balance_loss_clip": 1.04016244, + "balance_loss_mlp": 1.01921356, + "epoch": 0.7480535096948745, + "flos": 17560725890400.0, + "grad_norm": 2.2312916060670718, + "language_loss": 0.82677829, + "learning_rate": 6.296060463313698e-07, + "loss": 0.84824419, + "num_input_tokens_seen": 268339490, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12261963, + "step": 12442, + "time_per_iteration": 2.6634838581085205 + }, + { + "auxiliary_loss_clip": 0.01114912, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.0400095, + "balance_loss_mlp": 1.01750302, + "epoch": 0.7481136329475425, + "flos": 33366531394080.0, + "grad_norm": 2.1196075802454186, + "language_loss": 0.6277467, + "learning_rate": 6.293224048575565e-07, + "loss": 0.64919496, + "num_input_tokens_seen": 268359865, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12408447, + "step": 12443, + "time_per_iteration": 2.7221150398254395 + }, + { + "auxiliary_loss_clip": 0.01109252, + "auxiliary_loss_mlp": 0.01027534, + "balance_loss_clip": 1.03805637, + "balance_loss_mlp": 1.0168705, + "epoch": 0.7481737562002104, + "flos": 23832715773600.0, + "grad_norm": 1.8673315888459434, + "language_loss": 0.71560097, + "learning_rate": 6.29038815359975e-07, + "loss": 0.73696887, + "num_input_tokens_seen": 268377065, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10656738, + "step": 12444, + "time_per_iteration": 2.6099116802215576 + }, + { + "auxiliary_loss_clip": 0.01111943, + "auxiliary_loss_mlp": 0.0102721, + "balance_loss_clip": 1.03925192, + "balance_loss_mlp": 1.01543856, + "epoch": 0.7482338794528784, + "flos": 26551929267360.0, + "grad_norm": 1.4413575346901333, + "language_loss": 0.68546492, + "learning_rate": 6.287552778493786e-07, + "loss": 0.70685637, + "num_input_tokens_seen": 268396935, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11779785, + "step": 12445, + "time_per_iteration": 2.6297426223754883 + }, + { + "auxiliary_loss_clip": 0.01108549, + "auxiliary_loss_mlp": 0.01023099, + "balance_loss_clip": 1.0369966, + "balance_loss_mlp": 1.01178026, + "epoch": 0.7482940027055464, + "flos": 22814419482720.0, + "grad_norm": 1.701243269817605, + "language_loss": 0.73865473, + "learning_rate": 6.28471792336519e-07, + "loss": 0.7599712, + "num_input_tokens_seen": 268414460, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11328125, + "step": 12446, + "time_per_iteration": 4.008102178573608 + }, + { + "auxiliary_loss_clip": 0.01116833, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.04084373, + "balance_loss_mlp": 1.01902032, + "epoch": 0.7483541259582144, + "flos": 19520429375520.0, + "grad_norm": 2.0542446156975456, + "language_loss": 0.73273015, + "learning_rate": 6.281883588321475e-07, + "loss": 0.75421262, + "num_input_tokens_seen": 268432225, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.1239624, + "step": 12447, + "time_per_iteration": 2.613187074661255 + }, + { + "auxiliary_loss_clip": 0.01111313, + "auxiliary_loss_mlp": 0.01028998, + "balance_loss_clip": 1.03851819, + "balance_loss_mlp": 1.01808393, + "epoch": 0.7484142492108823, + "flos": 31184257724640.0, + "grad_norm": 3.748813831700438, + "language_loss": 0.72378385, + "learning_rate": 6.279049773470109e-07, + "loss": 0.74518692, + "num_input_tokens_seen": 268449270, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.10900879, + "step": 12448, + "time_per_iteration": 2.695256233215332 + }, + { + "auxiliary_loss_clip": 0.01115108, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.04070067, + "balance_loss_mlp": 1.02246439, + "epoch": 0.7484743724635503, + "flos": 27928347510240.0, + "grad_norm": 1.8844456756300492, + "language_loss": 0.73675942, + "learning_rate": 6.276216478918543e-07, + "loss": 0.75825262, + "num_input_tokens_seen": 268467250, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11749268, + "step": 12449, + "time_per_iteration": 2.6575114727020264 + }, + { + "auxiliary_loss_clip": 0.01118664, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_clip": 1.0423373, + "balance_loss_mlp": 1.02218962, + "epoch": 0.7485344957162182, + "flos": 30871913948640.0, + "grad_norm": 1.8839437568494937, + "language_loss": 0.61435378, + "learning_rate": 6.273383704774225e-07, + "loss": 0.63588548, + "num_input_tokens_seen": 268487270, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12316895, + "step": 12450, + "time_per_iteration": 3.971733808517456 + }, + { + "auxiliary_loss_clip": 0.01104488, + "auxiliary_loss_mlp": 0.01025731, + "balance_loss_clip": 1.03578091, + "balance_loss_mlp": 1.0148654, + "epoch": 0.7485946189688862, + "flos": 33010597375200.0, + "grad_norm": 2.060405587661818, + "language_loss": 0.70714867, + "learning_rate": 6.270551451144577e-07, + "loss": 0.72845089, + "num_input_tokens_seen": 268508020, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.10858154, + "step": 12451, + "time_per_iteration": 2.644505262374878 + }, + { + "auxiliary_loss_clip": 0.01116127, + "auxiliary_loss_mlp": 0.01024562, + "balance_loss_clip": 1.03976321, + "balance_loss_mlp": 1.01334453, + "epoch": 0.7486547422215541, + "flos": 32832144158400.0, + "grad_norm": 3.6746984554642883, + "language_loss": 0.8034811, + "learning_rate": 6.267719718136988e-07, + "loss": 0.82488799, + "num_input_tokens_seen": 268527375, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11212158, + "step": 12452, + "time_per_iteration": 2.6572680473327637 + }, + { + "auxiliary_loss_clip": 0.01121158, + "auxiliary_loss_mlp": 0.0103115, + "balance_loss_clip": 1.04387951, + "balance_loss_mlp": 1.01892543, + "epoch": 0.7487148654742222, + "flos": 27267079204800.0, + "grad_norm": 5.221045446948987, + "language_loss": 0.7173841, + "learning_rate": 6.264888505858843e-07, + "loss": 0.73890722, + "num_input_tokens_seen": 268544870, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12225342, + "step": 12453, + "time_per_iteration": 2.6498067378997803 + }, + { + "auxiliary_loss_clip": 0.01116066, + "auxiliary_loss_mlp": 0.01032827, + "balance_loss_clip": 1.0423274, + "balance_loss_mlp": 1.02140093, + "epoch": 0.7487749887268901, + "flos": 28112230042560.0, + "grad_norm": 1.5777015281632412, + "language_loss": 0.73976713, + "learning_rate": 6.262057814417517e-07, + "loss": 0.7612561, + "num_input_tokens_seen": 268564580, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11437988, + "step": 12454, + "time_per_iteration": 2.647423267364502 + }, + { + "auxiliary_loss_clip": 0.01032867, + "auxiliary_loss_mlp": 0.01001226, + "balance_loss_clip": 1.01003861, + "balance_loss_mlp": 1.00018191, + "epoch": 0.7488351119795581, + "flos": 87276083286240.0, + "grad_norm": 0.748907947142116, + "language_loss": 0.59371418, + "learning_rate": 6.259227643920322e-07, + "loss": 0.6140551, + "num_input_tokens_seen": 268629550, + "router_z_loss_clip": 0.22827148, + "router_z_loss_mlp": 0.01045227, + "step": 12455, + "time_per_iteration": 3.4201602935791016 + }, + { + "auxiliary_loss_clip": 0.0111117, + "auxiliary_loss_mlp": 0.0102403, + "balance_loss_clip": 1.03956294, + "balance_loss_mlp": 1.01270545, + "epoch": 0.748895235232226, + "flos": 20983136724000.0, + "grad_norm": 1.9787150115097976, + "language_loss": 0.79669631, + "learning_rate": 6.256397994474592e-07, + "loss": 0.81804836, + "num_input_tokens_seen": 268646645, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11322021, + "step": 12456, + "time_per_iteration": 4.0152599811553955 + }, + { + "auxiliary_loss_clip": 0.01034009, + "auxiliary_loss_mlp": 0.01000231, + "balance_loss_clip": 1.01101804, + "balance_loss_mlp": 0.9991951, + "epoch": 0.748955358484894, + "flos": 71965977366240.0, + "grad_norm": 0.8317557881310571, + "language_loss": 0.61372936, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63407183, + "num_input_tokens_seen": 268702275, + "router_z_loss_clip": 0.22998047, + "router_z_loss_mlp": 0.01036835, + "step": 12457, + "time_per_iteration": 3.1932075023651123 + }, + { + "auxiliary_loss_clip": 0.01117174, + "auxiliary_loss_mlp": 0.01031275, + "balance_loss_clip": 1.04243207, + "balance_loss_mlp": 1.01982498, + "epoch": 0.749015481737562, + "flos": 13861620136800.0, + "grad_norm": 2.0540505348454956, + "language_loss": 0.67670786, + "learning_rate": 6.250740259166711e-07, + "loss": 0.6981923, + "num_input_tokens_seen": 268716265, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11431885, + "step": 12458, + "time_per_iteration": 3.9027352333068848 + }, + { + "auxiliary_loss_clip": 0.01110064, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.03879631, + "balance_loss_mlp": 1.01786935, + "epoch": 0.74907560499023, + "flos": 25754136779520.0, + "grad_norm": 1.8295055905595516, + "language_loss": 0.79662204, + "learning_rate": 6.247912173519106e-07, + "loss": 0.81801057, + "num_input_tokens_seen": 268734330, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10931396, + "step": 12459, + "time_per_iteration": 2.6108906269073486 + }, + { + "auxiliary_loss_clip": 0.01111985, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.04067612, + "balance_loss_mlp": 1.02129424, + "epoch": 0.749135728242898, + "flos": 27172078884000.0, + "grad_norm": 1.5520749635951292, + "language_loss": 0.80464602, + "learning_rate": 6.245084609352043e-07, + "loss": 0.82609463, + "num_input_tokens_seen": 268753500, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11572266, + "step": 12460, + "time_per_iteration": 2.662224054336548 + }, + { + "auxiliary_loss_clip": 0.0111286, + "auxiliary_loss_mlp": 0.01026849, + "balance_loss_clip": 1.03997898, + "balance_loss_mlp": 1.01477337, + "epoch": 0.7491958514955659, + "flos": 29355608589120.0, + "grad_norm": 2.155381168286663, + "language_loss": 0.8609457, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88234282, + "num_input_tokens_seen": 268772055, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1206665, + "step": 12461, + "time_per_iteration": 2.6452507972717285 + }, + { + "auxiliary_loss_clip": 0.01110322, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.03969038, + "balance_loss_mlp": 1.02268839, + "epoch": 0.7492559747482339, + "flos": 29886227717760.0, + "grad_norm": 1.9358432354189323, + "language_loss": 0.7003476, + "learning_rate": 6.239431045888435e-07, + "loss": 0.72178751, + "num_input_tokens_seen": 268792265, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10986328, + "step": 12462, + "time_per_iteration": 2.7029852867126465 + }, + { + "auxiliary_loss_clip": 0.01113272, + "auxiliary_loss_mlp": 0.01032474, + "balance_loss_clip": 1.04104042, + "balance_loss_mlp": 1.02079201, + "epoch": 0.7493160980009018, + "flos": 33855505109280.0, + "grad_norm": 2.0879401814970087, + "language_loss": 0.70128691, + "learning_rate": 6.236605046806267e-07, + "loss": 0.72274441, + "num_input_tokens_seen": 268812735, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11694336, + "step": 12463, + "time_per_iteration": 2.6978886127471924 + }, + { + "auxiliary_loss_clip": 0.01112305, + "auxiliary_loss_mlp": 0.01030692, + "balance_loss_clip": 1.04027009, + "balance_loss_mlp": 1.02022552, + "epoch": 0.7493762212535698, + "flos": 36882848581920.0, + "grad_norm": 1.870630534349926, + "language_loss": 0.77209485, + "learning_rate": 6.233779569633419e-07, + "loss": 0.79352486, + "num_input_tokens_seen": 268833090, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10467529, + "step": 12464, + "time_per_iteration": 2.7052557468414307 + }, + { + "auxiliary_loss_clip": 0.01109552, + "auxiliary_loss_mlp": 0.01027379, + "balance_loss_clip": 1.03717637, + "balance_loss_mlp": 1.01625037, + "epoch": 0.7494363445062378, + "flos": 26776808936640.0, + "grad_norm": 1.7435481960739805, + "language_loss": 0.78125173, + "learning_rate": 6.230954614477034e-07, + "loss": 0.80262101, + "num_input_tokens_seen": 268851880, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11132812, + "step": 12465, + "time_per_iteration": 2.633514404296875 + }, + { + "auxiliary_loss_clip": 0.01121203, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.0418036, + "balance_loss_mlp": 1.02040958, + "epoch": 0.7494964677589058, + "flos": 15240550451040.0, + "grad_norm": 2.7625691781898762, + "language_loss": 0.7379142, + "learning_rate": 6.22813018144422e-07, + "loss": 0.75946081, + "num_input_tokens_seen": 268867910, + "router_z_loss_clip": 0.79248047, + "router_z_loss_mlp": 0.13043213, + "step": 12466, + "time_per_iteration": 2.707831621170044 + }, + { + "auxiliary_loss_clip": 0.01115543, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.04152346, + "balance_loss_mlp": 1.02021003, + "epoch": 0.7495565910115737, + "flos": 26421158538720.0, + "grad_norm": 2.7859337334960323, + "language_loss": 0.66446304, + "learning_rate": 6.22530627064209e-07, + "loss": 0.68593419, + "num_input_tokens_seen": 268887260, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11352539, + "step": 12467, + "time_per_iteration": 2.653965950012207 + }, + { + "auxiliary_loss_clip": 0.01113668, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.0399518, + "balance_loss_mlp": 1.01899374, + "epoch": 0.7496167142642417, + "flos": 18630229672800.0, + "grad_norm": 2.2485799573461027, + "language_loss": 0.76581675, + "learning_rate": 6.222482882177735e-07, + "loss": 0.78726602, + "num_input_tokens_seen": 268902520, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12268066, + "step": 12468, + "time_per_iteration": 2.601613998413086 + }, + { + "auxiliary_loss_clip": 0.01112633, + "auxiliary_loss_mlp": 0.01028379, + "balance_loss_clip": 1.03986669, + "balance_loss_mlp": 1.01624298, + "epoch": 0.7496768375169096, + "flos": 26911509841440.0, + "grad_norm": 2.7306498401233745, + "language_loss": 0.6961019, + "learning_rate": 6.219660016158201e-07, + "loss": 0.71751201, + "num_input_tokens_seen": 268920970, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12133789, + "step": 12469, + "time_per_iteration": 2.6853437423706055 + }, + { + "auxiliary_loss_clip": 0.01113825, + "auxiliary_loss_mlp": 0.01032188, + "balance_loss_clip": 1.03980374, + "balance_loss_mlp": 1.02019548, + "epoch": 0.7497369607695776, + "flos": 23253806397600.0, + "grad_norm": 2.0310854760659027, + "language_loss": 0.69167334, + "learning_rate": 6.216837672690543e-07, + "loss": 0.71313351, + "num_input_tokens_seen": 268936600, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11987305, + "step": 12470, + "time_per_iteration": 2.58123779296875 + }, + { + "auxiliary_loss_clip": 0.01115728, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.03832483, + "balance_loss_mlp": 1.01915658, + "epoch": 0.7497970840222457, + "flos": 26377608813120.0, + "grad_norm": 4.302919376266916, + "language_loss": 0.75320864, + "learning_rate": 6.214015851881793e-07, + "loss": 0.77468729, + "num_input_tokens_seen": 268956560, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12969971, + "step": 12471, + "time_per_iteration": 2.6988282203674316 + }, + { + "auxiliary_loss_clip": 0.01113524, + "auxiliary_loss_mlp": 0.0103079, + "balance_loss_clip": 1.03966093, + "balance_loss_mlp": 1.01935232, + "epoch": 0.7498572072749136, + "flos": 16759651502880.0, + "grad_norm": 2.369879956198122, + "language_loss": 0.76808602, + "learning_rate": 6.211194553838929e-07, + "loss": 0.7895292, + "num_input_tokens_seen": 268973945, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11444092, + "step": 12472, + "time_per_iteration": 2.5915465354919434 + }, + { + "auxiliary_loss_clip": 0.01111711, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.03902256, + "balance_loss_mlp": 1.01981473, + "epoch": 0.7499173305275816, + "flos": 28023955590240.0, + "grad_norm": 2.089676508766548, + "language_loss": 0.8440479, + "learning_rate": 6.208373778668951e-07, + "loss": 0.8654741, + "num_input_tokens_seen": 268993245, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11108398, + "step": 12473, + "time_per_iteration": 2.659062147140503 + }, + { + "auxiliary_loss_clip": 0.01117663, + "auxiliary_loss_mlp": 0.01030697, + "balance_loss_clip": 1.04119933, + "balance_loss_mlp": 1.01825202, + "epoch": 0.7499774537802495, + "flos": 27748759809600.0, + "grad_norm": 1.9360064085875242, + "language_loss": 0.73470044, + "learning_rate": 6.205553526478829e-07, + "loss": 0.75618404, + "num_input_tokens_seen": 269012125, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12438965, + "step": 12474, + "time_per_iteration": 2.616668224334717 + }, + { + "auxiliary_loss_clip": 0.01117376, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.03981721, + "balance_loss_mlp": 1.02190983, + "epoch": 0.7500375770329175, + "flos": 22325283698400.0, + "grad_norm": 2.051581004601071, + "language_loss": 0.74589729, + "learning_rate": 6.202733797375492e-07, + "loss": 0.76740909, + "num_input_tokens_seen": 269030545, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.11889648, + "step": 12475, + "time_per_iteration": 2.6851108074188232 + }, + { + "auxiliary_loss_clip": 0.01118228, + "auxiliary_loss_mlp": 0.01037633, + "balance_loss_clip": 1.04062259, + "balance_loss_mlp": 1.025033, + "epoch": 0.7500977002855854, + "flos": 23391302994720.0, + "grad_norm": 1.822748563634483, + "language_loss": 0.79997456, + "learning_rate": 6.199914591465878e-07, + "loss": 0.8215332, + "num_input_tokens_seen": 269048180, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12609863, + "step": 12476, + "time_per_iteration": 2.576063632965088 + }, + { + "auxiliary_loss_clip": 0.0111384, + "auxiliary_loss_mlp": 0.01033725, + "balance_loss_clip": 1.04057598, + "balance_loss_mlp": 1.02195859, + "epoch": 0.7501578235382534, + "flos": 27044509020480.0, + "grad_norm": 1.9418025053621595, + "language_loss": 0.77260351, + "learning_rate": 6.19709590885688e-07, + "loss": 0.79407912, + "num_input_tokens_seen": 269068600, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11785889, + "step": 12477, + "time_per_iteration": 2.6717469692230225 + }, + { + "auxiliary_loss_clip": 0.01032132, + "auxiliary_loss_mlp": 0.01002427, + "balance_loss_clip": 1.00925553, + "balance_loss_mlp": 1.00140834, + "epoch": 0.7502179467909214, + "flos": 78662322253440.0, + "grad_norm": 0.8012392208852429, + "language_loss": 0.54380369, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56414926, + "num_input_tokens_seen": 269119045, + "router_z_loss_clip": 0.22888184, + "router_z_loss_mlp": 0.01018524, + "step": 12478, + "time_per_iteration": 3.225989580154419 + }, + { + "auxiliary_loss_clip": 0.0111107, + "auxiliary_loss_mlp": 0.01034835, + "balance_loss_clip": 1.03954005, + "balance_loss_mlp": 1.02333772, + "epoch": 0.7502780700435894, + "flos": 24988265557920.0, + "grad_norm": 1.8542000710813304, + "language_loss": 0.80183363, + "learning_rate": 6.191460113968272e-07, + "loss": 0.82329267, + "num_input_tokens_seen": 269136755, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11499023, + "step": 12479, + "time_per_iteration": 2.6036064624786377 + }, + { + "auxiliary_loss_clip": 0.01117603, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.04101706, + "balance_loss_mlp": 1.02062213, + "epoch": 0.7503381932962573, + "flos": 24947673593760.0, + "grad_norm": 2.726044470039941, + "language_loss": 0.63444006, + "learning_rate": 6.188643001902369e-07, + "loss": 0.65594697, + "num_input_tokens_seen": 269156120, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12469482, + "step": 12480, + "time_per_iteration": 2.7619426250457764 + }, + { + "auxiliary_loss_clip": 0.01110449, + "auxiliary_loss_mlp": 0.01036559, + "balance_loss_clip": 1.04021668, + "balance_loss_mlp": 1.02564561, + "epoch": 0.7503983165489253, + "flos": 27310669447680.0, + "grad_norm": 2.150065169733439, + "language_loss": 0.77957696, + "learning_rate": 6.185826413564512e-07, + "loss": 0.80104703, + "num_input_tokens_seen": 269175650, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10919189, + "step": 12481, + "time_per_iteration": 2.712937593460083 + }, + { + "auxiliary_loss_clip": 0.01113054, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.0383935, + "balance_loss_mlp": 1.02279615, + "epoch": 0.7504584398015932, + "flos": 30382737647040.0, + "grad_norm": 2.779039945801753, + "language_loss": 0.71470845, + "learning_rate": 6.183010349061501e-07, + "loss": 0.7361871, + "num_input_tokens_seen": 269197080, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12017822, + "step": 12482, + "time_per_iteration": 2.708984613418579 + }, + { + "auxiliary_loss_clip": 0.01113286, + "auxiliary_loss_mlp": 0.01033613, + "balance_loss_clip": 1.03979754, + "balance_loss_mlp": 1.02249646, + "epoch": 0.7505185630542612, + "flos": 31585605264000.0, + "grad_norm": 2.2355799063895296, + "language_loss": 0.70163232, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72310126, + "num_input_tokens_seen": 269218600, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11114502, + "step": 12483, + "time_per_iteration": 2.6654884815216064 + }, + { + "auxiliary_loss_clip": 0.01111713, + "auxiliary_loss_mlp": 0.01024289, + "balance_loss_clip": 1.03883696, + "balance_loss_mlp": 1.01387656, + "epoch": 0.7505786863069293, + "flos": 28240812838080.0, + "grad_norm": 2.3467009436304753, + "language_loss": 0.74335885, + "learning_rate": 6.177379791987131e-07, + "loss": 0.76471889, + "num_input_tokens_seen": 269239245, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10418701, + "step": 12484, + "time_per_iteration": 2.675910234451294 + }, + { + "auxiliary_loss_clip": 0.01111356, + "auxiliary_loss_mlp": 0.01028386, + "balance_loss_clip": 1.03943217, + "balance_loss_mlp": 1.01701951, + "epoch": 0.7506388095595972, + "flos": 20722810785120.0, + "grad_norm": 2.3391720572161785, + "language_loss": 0.84777415, + "learning_rate": 6.174565299629295e-07, + "loss": 0.86917162, + "num_input_tokens_seen": 269258520, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11358643, + "step": 12485, + "time_per_iteration": 2.598580837249756 + }, + { + "auxiliary_loss_clip": 0.01110466, + "auxiliary_loss_mlp": 0.01026725, + "balance_loss_clip": 1.03858662, + "balance_loss_mlp": 1.01520348, + "epoch": 0.7506989328122652, + "flos": 27265620582720.0, + "grad_norm": 1.52078274937565, + "language_loss": 0.78101552, + "learning_rate": 6.171751331533323e-07, + "loss": 0.80238748, + "num_input_tokens_seen": 269278320, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11535645, + "step": 12486, + "time_per_iteration": 4.121228456497192 + }, + { + "auxiliary_loss_clip": 0.01115316, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.04009104, + "balance_loss_mlp": 1.01847768, + "epoch": 0.7507590560649331, + "flos": 31452362981280.0, + "grad_norm": 2.9884435403026113, + "language_loss": 0.72754848, + "learning_rate": 6.168937887805932e-07, + "loss": 0.74900639, + "num_input_tokens_seen": 269298025, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11999512, + "step": 12487, + "time_per_iteration": 2.700326681137085 + }, + { + "auxiliary_loss_clip": 0.01112551, + "auxiliary_loss_mlp": 0.01026933, + "balance_loss_clip": 1.03841662, + "balance_loss_mlp": 1.01575708, + "epoch": 0.7508191793176011, + "flos": 29626104365280.0, + "grad_norm": 3.3197875640237178, + "language_loss": 0.67437136, + "learning_rate": 6.166124968553801e-07, + "loss": 0.69576621, + "num_input_tokens_seen": 269316770, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11181641, + "step": 12488, + "time_per_iteration": 2.6481595039367676 + }, + { + "auxiliary_loss_clip": 0.01113039, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.03992438, + "balance_loss_mlp": 1.01900649, + "epoch": 0.750879302570269, + "flos": 24281624249280.0, + "grad_norm": 2.334240626978646, + "language_loss": 0.76939517, + "learning_rate": 6.163312573883592e-07, + "loss": 0.79083794, + "num_input_tokens_seen": 269334755, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12231445, + "step": 12489, + "time_per_iteration": 3.930973529815674 + }, + { + "auxiliary_loss_clip": 0.0111111, + "auxiliary_loss_mlp": 0.01028799, + "balance_loss_clip": 1.03972435, + "balance_loss_mlp": 1.01815963, + "epoch": 0.750939425822937, + "flos": 35637444171360.0, + "grad_norm": 1.9384578116244215, + "language_loss": 0.75186592, + "learning_rate": 6.160500703901956e-07, + "loss": 0.773265, + "num_input_tokens_seen": 269353810, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10638428, + "step": 12490, + "time_per_iteration": 2.7162301540374756 + }, + { + "auxiliary_loss_clip": 0.01111978, + "auxiliary_loss_mlp": 0.0102904, + "balance_loss_clip": 1.03991747, + "balance_loss_mlp": 1.01781011, + "epoch": 0.750999549075605, + "flos": 25797889091520.0, + "grad_norm": 1.7833245504728528, + "language_loss": 0.78362316, + "learning_rate": 6.157689358715527e-07, + "loss": 0.80503333, + "num_input_tokens_seen": 269372910, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11242676, + "step": 12491, + "time_per_iteration": 2.6309404373168945 + }, + { + "auxiliary_loss_clip": 0.01109524, + "auxiliary_loss_mlp": 0.01028165, + "balance_loss_clip": 1.03845489, + "balance_loss_mlp": 1.01760888, + "epoch": 0.751059672328273, + "flos": 28733433108480.0, + "grad_norm": 1.6634812046393759, + "language_loss": 0.7654866, + "learning_rate": 6.154878538430899e-07, + "loss": 0.78686345, + "num_input_tokens_seen": 269391545, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10552979, + "step": 12492, + "time_per_iteration": 2.668090343475342 + }, + { + "auxiliary_loss_clip": 0.01110166, + "auxiliary_loss_mlp": 0.01030623, + "balance_loss_clip": 1.03719103, + "balance_loss_mlp": 1.0198586, + "epoch": 0.7511197955809409, + "flos": 23171163847200.0, + "grad_norm": 1.9998607075730184, + "language_loss": 0.71714532, + "learning_rate": 6.152068243154671e-07, + "loss": 0.73855323, + "num_input_tokens_seen": 269408530, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10772705, + "step": 12493, + "time_per_iteration": 2.6000490188598633 + }, + { + "auxiliary_loss_clip": 0.01114677, + "auxiliary_loss_mlp": 0.01026404, + "balance_loss_clip": 1.04053628, + "balance_loss_mlp": 1.01497746, + "epoch": 0.7511799188336089, + "flos": 26902515005280.0, + "grad_norm": 1.6456790518912214, + "language_loss": 0.80781829, + "learning_rate": 6.149258472993395e-07, + "loss": 0.82922912, + "num_input_tokens_seen": 269425930, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11425781, + "step": 12494, + "time_per_iteration": 2.6771187782287598 + }, + { + "auxiliary_loss_clip": 0.01114287, + "auxiliary_loss_mlp": 0.01026079, + "balance_loss_clip": 1.03981483, + "balance_loss_mlp": 1.01383018, + "epoch": 0.7512400420862768, + "flos": 20093342194080.0, + "grad_norm": 2.2706867937129473, + "language_loss": 0.78469312, + "learning_rate": 6.146449228053634e-07, + "loss": 0.80609679, + "num_input_tokens_seen": 269443945, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12255859, + "step": 12495, + "time_per_iteration": 2.62522292137146 + }, + { + "auxiliary_loss_clip": 0.0111356, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.03987908, + "balance_loss_mlp": 1.02226448, + "epoch": 0.7513001653389448, + "flos": 24951117562560.0, + "grad_norm": 2.7421154194967086, + "language_loss": 0.71029246, + "learning_rate": 6.143640508441898e-07, + "loss": 0.73176354, + "num_input_tokens_seen": 269463625, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.112854, + "step": 12496, + "time_per_iteration": 4.1162028312683105 + }, + { + "auxiliary_loss_clip": 0.01113608, + "auxiliary_loss_mlp": 0.01026956, + "balance_loss_clip": 1.04062152, + "balance_loss_mlp": 1.01595855, + "epoch": 0.7513602885916129, + "flos": 28646698312800.0, + "grad_norm": 2.348687321698185, + "language_loss": 0.78309101, + "learning_rate": 6.140832314264705e-07, + "loss": 0.80449665, + "num_input_tokens_seen": 269483415, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11004639, + "step": 12497, + "time_per_iteration": 3.8986871242523193 + }, + { + "auxiliary_loss_clip": 0.01114727, + "auxiliary_loss_mlp": 0.01036161, + "balance_loss_clip": 1.03944135, + "balance_loss_mlp": 1.02426422, + "epoch": 0.7514204118442808, + "flos": 32698496702880.0, + "grad_norm": 1.6013792221472054, + "language_loss": 0.76945329, + "learning_rate": 6.13802464562855e-07, + "loss": 0.79096222, + "num_input_tokens_seen": 269504635, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11895752, + "step": 12498, + "time_per_iteration": 2.6392433643341064 + }, + { + "auxiliary_loss_clip": 0.01111088, + "auxiliary_loss_mlp": 0.01030164, + "balance_loss_clip": 1.04129481, + "balance_loss_mlp": 1.01932788, + "epoch": 0.7514805350969488, + "flos": 24239978835840.0, + "grad_norm": 1.9742010788407203, + "language_loss": 0.73805237, + "learning_rate": 6.135217502639878e-07, + "loss": 0.75946486, + "num_input_tokens_seen": 269523955, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10839844, + "step": 12499, + "time_per_iteration": 2.64573335647583 + }, + { + "auxiliary_loss_clip": 0.01109379, + "auxiliary_loss_mlp": 0.0102649, + "balance_loss_clip": 1.03758359, + "balance_loss_mlp": 1.01561189, + "epoch": 0.7515406583496167, + "flos": 29982565108800.0, + "grad_norm": 1.9809897408846115, + "language_loss": 0.79526901, + "learning_rate": 6.132410885405148e-07, + "loss": 0.81662762, + "num_input_tokens_seen": 269544410, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.10876465, + "step": 12500, + "time_per_iteration": 2.6367688179016113 + }, + { + "auxiliary_loss_clip": 0.01119775, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.04119027, + "balance_loss_mlp": 1.02126741, + "epoch": 0.7516007816022847, + "flos": 24550742437920.0, + "grad_norm": 2.2633923252502894, + "language_loss": 0.73618972, + "learning_rate": 6.129604794030794e-07, + "loss": 0.75773513, + "num_input_tokens_seen": 269563315, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.13513184, + "step": 12501, + "time_per_iteration": 2.7525060176849365 + }, + { + "auxiliary_loss_clip": 0.01110675, + "auxiliary_loss_mlp": 0.01024131, + "balance_loss_clip": 1.03799903, + "balance_loss_mlp": 1.01203108, + "epoch": 0.7516609048549526, + "flos": 27801952647840.0, + "grad_norm": 1.8989919349978637, + "language_loss": 0.78397584, + "learning_rate": 6.126799228623207e-07, + "loss": 0.80532396, + "num_input_tokens_seen": 269583950, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12091064, + "step": 12502, + "time_per_iteration": 2.707223415374756 + }, + { + "auxiliary_loss_clip": 0.01113713, + "auxiliary_loss_mlp": 0.01032445, + "balance_loss_clip": 1.03974414, + "balance_loss_mlp": 1.02078652, + "epoch": 0.7517210281076206, + "flos": 12974904920160.0, + "grad_norm": 2.5889449287996116, + "language_loss": 0.70516831, + "learning_rate": 6.123994189288786e-07, + "loss": 0.72662991, + "num_input_tokens_seen": 269600120, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11657715, + "step": 12503, + "time_per_iteration": 2.6169724464416504 + }, + { + "auxiliary_loss_clip": 0.01032169, + "auxiliary_loss_mlp": 0.01001507, + "balance_loss_clip": 1.00942469, + "balance_loss_mlp": 1.00045216, + "epoch": 0.7517811513602886, + "flos": 80597438100000.0, + "grad_norm": 1.3366591811419997, + "language_loss": 0.63994765, + "learning_rate": 6.121189676133903e-07, + "loss": 0.66028446, + "num_input_tokens_seen": 269659815, + "router_z_loss_clip": 0.22729492, + "router_z_loss_mlp": 0.01055145, + "step": 12504, + "time_per_iteration": 3.168741464614868 + }, + { + "auxiliary_loss_clip": 0.0110908, + "auxiliary_loss_mlp": 0.01031015, + "balance_loss_clip": 1.03864431, + "balance_loss_mlp": 1.01995182, + "epoch": 0.7518412746129566, + "flos": 45476188905600.0, + "grad_norm": 1.476125322262781, + "language_loss": 0.68811715, + "learning_rate": 6.118385689264896e-07, + "loss": 0.70951807, + "num_input_tokens_seen": 269684565, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11065674, + "step": 12505, + "time_per_iteration": 2.7817842960357666 + }, + { + "auxiliary_loss_clip": 0.01032015, + "auxiliary_loss_mlp": 0.01002594, + "balance_loss_clip": 1.0091188, + "balance_loss_mlp": 1.00152957, + "epoch": 0.7519013978656245, + "flos": 73843727094720.0, + "grad_norm": 0.6454039399767326, + "language_loss": 0.55098575, + "learning_rate": 6.11558222878809e-07, + "loss": 0.57133186, + "num_input_tokens_seen": 269752325, + "router_z_loss_clip": 0.22875977, + "router_z_loss_mlp": 0.01065063, + "step": 12506, + "time_per_iteration": 3.3373141288757324 + }, + { + "auxiliary_loss_clip": 0.01118078, + "auxiliary_loss_mlp": 0.01031311, + "balance_loss_clip": 1.04267299, + "balance_loss_mlp": 1.01959908, + "epoch": 0.7519615211182925, + "flos": 22948066938240.0, + "grad_norm": 2.188205400614472, + "language_loss": 0.7848435, + "learning_rate": 6.112779294809796e-07, + "loss": 0.80633748, + "num_input_tokens_seen": 269770630, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11706543, + "step": 12507, + "time_per_iteration": 2.644057035446167 + }, + { + "auxiliary_loss_clip": 0.01111903, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.04172623, + "balance_loss_mlp": 1.02139246, + "epoch": 0.7520216443709604, + "flos": 17784227972160.0, + "grad_norm": 1.6978307100902006, + "language_loss": 0.71052396, + "learning_rate": 6.10997688743631e-07, + "loss": 0.73196435, + "num_input_tokens_seen": 269787280, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10742188, + "step": 12508, + "time_per_iteration": 2.656170129776001 + }, + { + "auxiliary_loss_clip": 0.01111261, + "auxiliary_loss_mlp": 0.01028254, + "balance_loss_clip": 1.03909135, + "balance_loss_mlp": 1.01710176, + "epoch": 0.7520817676236284, + "flos": 20812138686720.0, + "grad_norm": 2.8808591123681886, + "language_loss": 0.72176814, + "learning_rate": 6.107175006773885e-07, + "loss": 0.74316323, + "num_input_tokens_seen": 269805205, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.1114502, + "step": 12509, + "time_per_iteration": 2.5786609649658203 + }, + { + "auxiliary_loss_clip": 0.01117903, + "auxiliary_loss_mlp": 0.01036442, + "balance_loss_clip": 1.04079652, + "balance_loss_mlp": 1.02376449, + "epoch": 0.7521418908762965, + "flos": 31318553456640.0, + "grad_norm": 2.3424959042003386, + "language_loss": 0.62395811, + "learning_rate": 6.104373652928785e-07, + "loss": 0.64550161, + "num_input_tokens_seen": 269824820, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12677002, + "step": 12510, + "time_per_iteration": 2.7145748138427734 + }, + { + "auxiliary_loss_clip": 0.01110372, + "auxiliary_loss_mlp": 0.01027897, + "balance_loss_clip": 1.04024839, + "balance_loss_mlp": 1.01635766, + "epoch": 0.7522020141289644, + "flos": 25486072040160.0, + "grad_norm": 2.2266707232414693, + "language_loss": 0.81608665, + "learning_rate": 6.10157282600722e-07, + "loss": 0.83746934, + "num_input_tokens_seen": 269842825, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11541748, + "step": 12511, + "time_per_iteration": 2.6582696437835693 + }, + { + "auxiliary_loss_clip": 0.01116966, + "auxiliary_loss_mlp": 0.01035906, + "balance_loss_clip": 1.04029679, + "balance_loss_mlp": 1.0233655, + "epoch": 0.7522621373816324, + "flos": 15423176947680.0, + "grad_norm": 3.666177318923579, + "language_loss": 0.75762916, + "learning_rate": 6.098772526115412e-07, + "loss": 0.77915788, + "num_input_tokens_seen": 269859000, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.12548828, + "step": 12512, + "time_per_iteration": 2.652371406555176 + }, + { + "auxiliary_loss_clip": 0.011061, + "auxiliary_loss_mlp": 0.01028405, + "balance_loss_clip": 1.03766418, + "balance_loss_mlp": 1.01750922, + "epoch": 0.7523222606343003, + "flos": 31269777001920.0, + "grad_norm": 1.713647219624244, + "language_loss": 0.82167763, + "learning_rate": 6.095972753359537e-07, + "loss": 0.8430227, + "num_input_tokens_seen": 269878895, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.10888672, + "step": 12513, + "time_per_iteration": 2.740652322769165 + }, + { + "auxiliary_loss_clip": 0.01115669, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.03978658, + "balance_loss_mlp": 1.02232397, + "epoch": 0.7523823838869683, + "flos": 24951765839040.0, + "grad_norm": 1.9822145798179664, + "language_loss": 0.747594, + "learning_rate": 6.093173507845771e-07, + "loss": 0.76909518, + "num_input_tokens_seen": 269897280, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12133789, + "step": 12514, + "time_per_iteration": 2.5917248725891113 + }, + { + "auxiliary_loss_clip": 0.01107096, + "auxiliary_loss_mlp": 0.01029778, + "balance_loss_clip": 1.03760695, + "balance_loss_mlp": 1.01945412, + "epoch": 0.7524425071396362, + "flos": 17961668256960.0, + "grad_norm": 2.129174385131898, + "language_loss": 0.68961048, + "learning_rate": 6.090374789680271e-07, + "loss": 0.71097922, + "num_input_tokens_seen": 269914640, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10327148, + "step": 12515, + "time_per_iteration": 2.6015331745147705 + }, + { + "auxiliary_loss_clip": 0.01113461, + "auxiliary_loss_mlp": 0.01027264, + "balance_loss_clip": 1.03971934, + "balance_loss_mlp": 1.016011, + "epoch": 0.7525026303923043, + "flos": 37329204468960.0, + "grad_norm": 1.8166392406425231, + "language_loss": 0.70219439, + "learning_rate": 6.087576598969137e-07, + "loss": 0.72360164, + "num_input_tokens_seen": 269934960, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11248779, + "step": 12516, + "time_per_iteration": 2.663764715194702 + }, + { + "auxiliary_loss_clip": 0.01110958, + "auxiliary_loss_mlp": 0.01029751, + "balance_loss_clip": 1.04062843, + "balance_loss_mlp": 1.01845002, + "epoch": 0.7525627536449722, + "flos": 30251156572800.0, + "grad_norm": 3.433608025421189, + "language_loss": 0.89628655, + "learning_rate": 6.084778935818495e-07, + "loss": 0.91769361, + "num_input_tokens_seen": 269956655, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11297607, + "step": 12517, + "time_per_iteration": 2.73464035987854 + }, + { + "auxiliary_loss_clip": 0.01112239, + "auxiliary_loss_mlp": 0.01030846, + "balance_loss_clip": 1.0382688, + "balance_loss_mlp": 1.0198015, + "epoch": 0.7526228768976402, + "flos": 25355301311520.0, + "grad_norm": 2.1328640039404445, + "language_loss": 0.744205, + "learning_rate": 6.081981800334437e-07, + "loss": 0.76563579, + "num_input_tokens_seen": 269976835, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11047363, + "step": 12518, + "time_per_iteration": 2.7297024726867676 + }, + { + "auxiliary_loss_clip": 0.01031781, + "auxiliary_loss_mlp": 0.01000923, + "balance_loss_clip": 1.00891018, + "balance_loss_mlp": 0.99988496, + "epoch": 0.7526830001503081, + "flos": 81216493750080.0, + "grad_norm": 0.7057218465848346, + "language_loss": 0.55636561, + "learning_rate": 6.079185192623017e-07, + "loss": 0.5766927, + "num_input_tokens_seen": 270040630, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.01039124, + "step": 12519, + "time_per_iteration": 3.332195281982422 + }, + { + "auxiliary_loss_clip": 0.01110955, + "auxiliary_loss_mlp": 0.0102653, + "balance_loss_clip": 1.03824854, + "balance_loss_mlp": 1.01624274, + "epoch": 0.7527431234029761, + "flos": 28647913831200.0, + "grad_norm": 1.557954404953575, + "language_loss": 0.77821279, + "learning_rate": 6.07638911279029e-07, + "loss": 0.79958761, + "num_input_tokens_seen": 270059695, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.10290527, + "step": 12520, + "time_per_iteration": 2.6510379314422607 + }, + { + "auxiliary_loss_clip": 0.01110549, + "auxiliary_loss_mlp": 0.0103148, + "balance_loss_clip": 1.03834999, + "balance_loss_mlp": 1.02062035, + "epoch": 0.752803246655644, + "flos": 26905432249440.0, + "grad_norm": 2.073886997833781, + "language_loss": 0.73723459, + "learning_rate": 6.07359356094229e-07, + "loss": 0.75865483, + "num_input_tokens_seen": 270078420, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10864258, + "step": 12521, + "time_per_iteration": 2.6617162227630615 + }, + { + "auxiliary_loss_clip": 0.01116411, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.04015613, + "balance_loss_mlp": 1.02091825, + "epoch": 0.752863369908312, + "flos": 36794452577760.0, + "grad_norm": 2.6538319163519133, + "language_loss": 0.67117417, + "learning_rate": 6.070798537185016e-07, + "loss": 0.69266832, + "num_input_tokens_seen": 270097040, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12097168, + "step": 12522, + "time_per_iteration": 2.698054313659668 + }, + { + "auxiliary_loss_clip": 0.01115623, + "auxiliary_loss_mlp": 0.01038003, + "balance_loss_clip": 1.04024601, + "balance_loss_mlp": 1.02648759, + "epoch": 0.7529234931609801, + "flos": 29978148725280.0, + "grad_norm": 1.9542489235124447, + "language_loss": 0.78315747, + "learning_rate": 6.068004041624453e-07, + "loss": 0.80469364, + "num_input_tokens_seen": 270116365, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.1151123, + "step": 12523, + "time_per_iteration": 2.6332032680511475 + }, + { + "auxiliary_loss_clip": 0.01110064, + "auxiliary_loss_mlp": 0.01028317, + "balance_loss_clip": 1.03903913, + "balance_loss_mlp": 1.01723623, + "epoch": 0.752983616413648, + "flos": 28201963116960.0, + "grad_norm": 1.919551467768114, + "language_loss": 0.80830526, + "learning_rate": 6.065210074366571e-07, + "loss": 0.82968909, + "num_input_tokens_seen": 270135395, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11090088, + "step": 12524, + "time_per_iteration": 2.649097442626953 + }, + { + "auxiliary_loss_clip": 0.01112229, + "auxiliary_loss_mlp": 0.01029146, + "balance_loss_clip": 1.04082561, + "balance_loss_mlp": 1.01854277, + "epoch": 0.753043739666316, + "flos": 29671801506720.0, + "grad_norm": 1.652390630710296, + "language_loss": 0.73795354, + "learning_rate": 6.062416635517326e-07, + "loss": 0.75936729, + "num_input_tokens_seen": 270156425, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10614014, + "step": 12525, + "time_per_iteration": 4.145842552185059 + }, + { + "auxiliary_loss_clip": 0.01111769, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.0391686, + "balance_loss_mlp": 1.02068543, + "epoch": 0.7531038629189839, + "flos": 29581987397760.0, + "grad_norm": 1.854053481003425, + "language_loss": 0.72254658, + "learning_rate": 6.059623725182641e-07, + "loss": 0.74398303, + "num_input_tokens_seen": 270176905, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11193848, + "step": 12526, + "time_per_iteration": 2.7372562885284424 + }, + { + "auxiliary_loss_clip": 0.01109301, + "auxiliary_loss_mlp": 0.01024124, + "balance_loss_clip": 1.03785396, + "balance_loss_mlp": 1.01335335, + "epoch": 0.7531639861716519, + "flos": 36837313509600.0, + "grad_norm": 2.56391402627506, + "language_loss": 0.72105145, + "learning_rate": 6.056831343468414e-07, + "loss": 0.74238575, + "num_input_tokens_seen": 270196640, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10772705, + "step": 12527, + "time_per_iteration": 2.7241928577423096 + }, + { + "auxiliary_loss_clip": 0.01109379, + "auxiliary_loss_mlp": 0.01025131, + "balance_loss_clip": 1.03830886, + "balance_loss_mlp": 1.014498, + "epoch": 0.7532241094243198, + "flos": 22235793727680.0, + "grad_norm": 1.8406097377963193, + "language_loss": 0.81117952, + "learning_rate": 6.054039490480539e-07, + "loss": 0.83252466, + "num_input_tokens_seen": 270213905, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10632324, + "step": 12528, + "time_per_iteration": 3.871896505355835 + }, + { + "auxiliary_loss_clip": 0.01112505, + "auxiliary_loss_mlp": 0.01033641, + "balance_loss_clip": 1.03885913, + "balance_loss_mlp": 1.02141571, + "epoch": 0.7532842326769879, + "flos": 25479629792640.0, + "grad_norm": 1.9977851073471513, + "language_loss": 0.85339761, + "learning_rate": 6.051248166324892e-07, + "loss": 0.87485904, + "num_input_tokens_seen": 270231995, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12213135, + "step": 12529, + "time_per_iteration": 2.6461524963378906 + }, + { + "auxiliary_loss_clip": 0.01117189, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.04126191, + "balance_loss_mlp": 1.01886845, + "epoch": 0.7533443559296558, + "flos": 22059325857600.0, + "grad_norm": 1.8821177584481026, + "language_loss": 0.73791397, + "learning_rate": 6.048457371107303e-07, + "loss": 0.75939596, + "num_input_tokens_seen": 270251480, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12133789, + "step": 12530, + "time_per_iteration": 2.6027719974517822 + }, + { + "auxiliary_loss_clip": 0.0103173, + "auxiliary_loss_mlp": 0.01001014, + "balance_loss_clip": 1.00883484, + "balance_loss_mlp": 1.00001609, + "epoch": 0.7534044791823238, + "flos": 61320161687040.0, + "grad_norm": 0.8306764876034395, + "language_loss": 0.63634944, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65667683, + "num_input_tokens_seen": 270306480, + "router_z_loss_clip": 0.22875977, + "router_z_loss_mlp": 0.00997925, + "step": 12531, + "time_per_iteration": 3.142773389816284 + }, + { + "auxiliary_loss_clip": 0.01114914, + "auxiliary_loss_mlp": 0.01029171, + "balance_loss_clip": 1.03987813, + "balance_loss_mlp": 1.01701164, + "epoch": 0.7534646024349917, + "flos": 25441549899840.0, + "grad_norm": 2.138730684149205, + "language_loss": 0.70021415, + "learning_rate": 6.042877367909633e-07, + "loss": 0.72165495, + "num_input_tokens_seen": 270324595, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.1217041, + "step": 12532, + "time_per_iteration": 2.6709721088409424 + }, + { + "auxiliary_loss_clip": 0.01107597, + "auxiliary_loss_mlp": 0.01028051, + "balance_loss_clip": 1.03853631, + "balance_loss_mlp": 1.01806688, + "epoch": 0.7535247256876597, + "flos": 28152781489440.0, + "grad_norm": 1.5578565307966121, + "language_loss": 0.77371466, + "learning_rate": 6.040088160141132e-07, + "loss": 0.79507113, + "num_input_tokens_seen": 270344375, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.09979248, + "step": 12533, + "time_per_iteration": 2.64404296875 + }, + { + "auxiliary_loss_clip": 0.01031766, + "auxiliary_loss_mlp": 0.01001807, + "balance_loss_clip": 1.00873327, + "balance_loss_mlp": 1.00074887, + "epoch": 0.7535848489403276, + "flos": 71530925800320.0, + "grad_norm": 0.7836439343199213, + "language_loss": 0.57321298, + "learning_rate": 6.037299481733886e-07, + "loss": 0.59354872, + "num_input_tokens_seen": 270405235, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 0.0105896, + "step": 12534, + "time_per_iteration": 3.2741036415100098 + }, + { + "auxiliary_loss_clip": 0.01110404, + "auxiliary_loss_mlp": 0.01024732, + "balance_loss_clip": 1.03780174, + "balance_loss_mlp": 1.01333594, + "epoch": 0.7536449721929956, + "flos": 32430877653600.0, + "grad_norm": 1.980756934008628, + "language_loss": 0.71333694, + "learning_rate": 6.03451133279365e-07, + "loss": 0.73468834, + "num_input_tokens_seen": 270425820, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1138916, + "step": 12535, + "time_per_iteration": 4.083255767822266 + }, + { + "auxiliary_loss_clip": 0.01113862, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.03849626, + "balance_loss_mlp": 1.0179925, + "epoch": 0.7537050954456637, + "flos": 31403789112960.0, + "grad_norm": 2.0499514320769245, + "language_loss": 0.80483437, + "learning_rate": 6.031723713426135e-07, + "loss": 0.82627302, + "num_input_tokens_seen": 270447120, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12017822, + "step": 12536, + "time_per_iteration": 2.66437029838562 + }, + { + "auxiliary_loss_clip": 0.0110783, + "auxiliary_loss_mlp": 0.01028038, + "balance_loss_clip": 1.03767681, + "balance_loss_mlp": 1.01684427, + "epoch": 0.7537652186983316, + "flos": 36880863235200.0, + "grad_norm": 1.848359077809684, + "language_loss": 0.74111634, + "learning_rate": 6.028936623737067e-07, + "loss": 0.76247501, + "num_input_tokens_seen": 270468680, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11193848, + "step": 12537, + "time_per_iteration": 4.000462055206299 + }, + { + "auxiliary_loss_clip": 0.01111834, + "auxiliary_loss_mlp": 0.01030571, + "balance_loss_clip": 1.03843009, + "balance_loss_mlp": 1.0192399, + "epoch": 0.7538253419509996, + "flos": 15424230396960.0, + "grad_norm": 2.4595205924807866, + "language_loss": 0.73997545, + "learning_rate": 6.026150063832111e-07, + "loss": 0.76139945, + "num_input_tokens_seen": 270486310, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11346436, + "step": 12538, + "time_per_iteration": 2.565936326980591 + }, + { + "auxiliary_loss_clip": 0.0111387, + "auxiliary_loss_mlp": 0.01030506, + "balance_loss_clip": 1.03966284, + "balance_loss_mlp": 1.01898396, + "epoch": 0.7538854652036675, + "flos": 28291696191360.0, + "grad_norm": 1.7437352937104738, + "language_loss": 0.67278934, + "learning_rate": 6.023364033816956e-07, + "loss": 0.69423312, + "num_input_tokens_seen": 270507210, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11517334, + "step": 12539, + "time_per_iteration": 2.705341100692749 + }, + { + "auxiliary_loss_clip": 0.01111699, + "auxiliary_loss_mlp": 0.01026797, + "balance_loss_clip": 1.0402329, + "balance_loss_mlp": 1.01539505, + "epoch": 0.7539455884563355, + "flos": 28291979812320.0, + "grad_norm": 1.717540052761724, + "language_loss": 0.75076717, + "learning_rate": 6.020578533797229e-07, + "loss": 0.77215213, + "num_input_tokens_seen": 270525250, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11407471, + "step": 12540, + "time_per_iteration": 2.6166088581085205 + }, + { + "auxiliary_loss_clip": 0.01113531, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.0384903, + "balance_loss_mlp": 1.0164814, + "epoch": 0.7540057117090034, + "flos": 16083756459360.0, + "grad_norm": 3.0638136592287353, + "language_loss": 0.72955763, + "learning_rate": 6.017793563878566e-07, + "loss": 0.75097674, + "num_input_tokens_seen": 270539295, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11907959, + "step": 12541, + "time_per_iteration": 2.6075832843780518 + }, + { + "auxiliary_loss_clip": 0.01111554, + "auxiliary_loss_mlp": 0.01027992, + "balance_loss_clip": 1.0394876, + "balance_loss_mlp": 1.01655412, + "epoch": 0.7540658349616715, + "flos": 55494359271360.0, + "grad_norm": 1.6413270161918554, + "language_loss": 0.72092205, + "learning_rate": 6.015009124166576e-07, + "loss": 0.7423175, + "num_input_tokens_seen": 270562815, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11431885, + "step": 12542, + "time_per_iteration": 2.8289034366607666 + }, + { + "auxiliary_loss_clip": 0.01110794, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.03895736, + "balance_loss_mlp": 1.01419961, + "epoch": 0.7541259582143394, + "flos": 24320230866720.0, + "grad_norm": 1.9575852393971098, + "language_loss": 0.84754109, + "learning_rate": 6.012225214766844e-07, + "loss": 0.8689034, + "num_input_tokens_seen": 270579055, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11248779, + "step": 12543, + "time_per_iteration": 2.6012532711029053 + }, + { + "auxiliary_loss_clip": 0.01115218, + "auxiliary_loss_mlp": 0.01027632, + "balance_loss_clip": 1.04391861, + "balance_loss_mlp": 1.01651597, + "epoch": 0.7541860814670074, + "flos": 33187875590880.0, + "grad_norm": 2.672157076447956, + "language_loss": 0.73755419, + "learning_rate": 6.009441835784927e-07, + "loss": 0.75898266, + "num_input_tokens_seen": 270599080, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11120605, + "step": 12544, + "time_per_iteration": 2.722547769546509 + }, + { + "auxiliary_loss_clip": 0.01112366, + "auxiliary_loss_mlp": 0.01029142, + "balance_loss_clip": 1.03937304, + "balance_loss_mlp": 1.01841354, + "epoch": 0.7542462047196753, + "flos": 26019851516640.0, + "grad_norm": 2.049888052944314, + "language_loss": 0.68238676, + "learning_rate": 6.006658987326383e-07, + "loss": 0.70380187, + "num_input_tokens_seen": 270618715, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.10717773, + "step": 12545, + "time_per_iteration": 2.6237809658050537 + }, + { + "auxiliary_loss_clip": 0.01111005, + "auxiliary_loss_mlp": 0.01027963, + "balance_loss_clip": 1.0369606, + "balance_loss_mlp": 1.01651883, + "epoch": 0.7543063279723433, + "flos": 14570327826720.0, + "grad_norm": 2.0308462372365015, + "language_loss": 0.68520957, + "learning_rate": 6.003876669496728e-07, + "loss": 0.70659924, + "num_input_tokens_seen": 270635695, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11450195, + "step": 12546, + "time_per_iteration": 2.6280171871185303 + }, + { + "auxiliary_loss_clip": 0.0111282, + "auxiliary_loss_mlp": 0.01031998, + "balance_loss_clip": 1.03992808, + "balance_loss_mlp": 1.02011883, + "epoch": 0.7543664512250112, + "flos": 27845178235200.0, + "grad_norm": 5.322488727456362, + "language_loss": 0.73202181, + "learning_rate": 6.00109488240147e-07, + "loss": 0.75346994, + "num_input_tokens_seen": 270654325, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11883545, + "step": 12547, + "time_per_iteration": 2.624192953109741 + }, + { + "auxiliary_loss_clip": 0.01112456, + "auxiliary_loss_mlp": 0.01028141, + "balance_loss_clip": 1.03961825, + "balance_loss_mlp": 1.01596367, + "epoch": 0.7544265744776792, + "flos": 25531282974240.0, + "grad_norm": 1.8404812975853357, + "language_loss": 0.68075812, + "learning_rate": 5.998313626146099e-07, + "loss": 0.70216417, + "num_input_tokens_seen": 270674260, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12176514, + "step": 12548, + "time_per_iteration": 2.6482725143432617 + }, + { + "auxiliary_loss_clip": 0.01112687, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.03847241, + "balance_loss_mlp": 1.02127457, + "epoch": 0.7544866977303473, + "flos": 18541104357600.0, + "grad_norm": 3.7554788771238994, + "language_loss": 0.87170887, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89316583, + "num_input_tokens_seen": 270692200, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11730957, + "step": 12549, + "time_per_iteration": 2.5873196125030518 + }, + { + "auxiliary_loss_clip": 0.01108034, + "auxiliary_loss_mlp": 0.01032408, + "balance_loss_clip": 1.03838599, + "balance_loss_mlp": 1.02116048, + "epoch": 0.7545468209830152, + "flos": 33048555716160.0, + "grad_norm": 2.4099055859648604, + "language_loss": 0.77039373, + "learning_rate": 5.992752706576865e-07, + "loss": 0.79179811, + "num_input_tokens_seen": 270709675, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11242676, + "step": 12550, + "time_per_iteration": 2.7490546703338623 + }, + { + "auxiliary_loss_clip": 0.01111125, + "auxiliary_loss_mlp": 0.01022869, + "balance_loss_clip": 1.03766048, + "balance_loss_mlp": 1.01208019, + "epoch": 0.7546069442356832, + "flos": 32789242709280.0, + "grad_norm": 2.009556841152644, + "language_loss": 0.69641685, + "learning_rate": 5.98997304347386e-07, + "loss": 0.71775675, + "num_input_tokens_seen": 270733055, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.10784912, + "step": 12551, + "time_per_iteration": 2.670301675796509 + }, + { + "auxiliary_loss_clip": 0.01113757, + "auxiliary_loss_mlp": 0.01026177, + "balance_loss_clip": 1.041273, + "balance_loss_mlp": 1.01442909, + "epoch": 0.7546670674883511, + "flos": 19209341635200.0, + "grad_norm": 2.9933170144022756, + "language_loss": 0.86752021, + "learning_rate": 5.987193911632487e-07, + "loss": 0.88891959, + "num_input_tokens_seen": 270749275, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11749268, + "step": 12552, + "time_per_iteration": 2.664443254470825 + }, + { + "auxiliary_loss_clip": 0.01114072, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_clip": 1.04055655, + "balance_loss_mlp": 1.01676381, + "epoch": 0.7547271907410191, + "flos": 28647711244800.0, + "grad_norm": 1.7576436101352564, + "language_loss": 0.78016806, + "learning_rate": 5.98441531115812e-07, + "loss": 0.80158621, + "num_input_tokens_seen": 270768230, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10986328, + "step": 12553, + "time_per_iteration": 2.6428515911102295 + }, + { + "auxiliary_loss_clip": 0.01114239, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.04090965, + "balance_loss_mlp": 1.01937342, + "epoch": 0.754787313993687, + "flos": 38709066680640.0, + "grad_norm": 2.160597934544367, + "language_loss": 0.62754548, + "learning_rate": 5.981637242156135e-07, + "loss": 0.64899695, + "num_input_tokens_seen": 270786285, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11547852, + "step": 12554, + "time_per_iteration": 2.719479560852051 + }, + { + "auxiliary_loss_clip": 0.01111929, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.0389657, + "balance_loss_mlp": 1.01856613, + "epoch": 0.7548474372463551, + "flos": 33633056476800.0, + "grad_norm": 2.74923620030518, + "language_loss": 0.73506188, + "learning_rate": 5.978859704731864e-07, + "loss": 0.75647533, + "num_input_tokens_seen": 270805505, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.10858154, + "step": 12555, + "time_per_iteration": 2.647367477416992 + }, + { + "auxiliary_loss_clip": 0.01116337, + "auxiliary_loss_mlp": 0.01028262, + "balance_loss_clip": 1.04211211, + "balance_loss_mlp": 1.01676416, + "epoch": 0.754907560499023, + "flos": 23572389834720.0, + "grad_norm": 2.0854144491065716, + "language_loss": 0.78296369, + "learning_rate": 5.976082698990645e-07, + "loss": 0.80440962, + "num_input_tokens_seen": 270824610, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.1151123, + "step": 12556, + "time_per_iteration": 2.6545543670654297 + }, + { + "auxiliary_loss_clip": 0.01031389, + "auxiliary_loss_mlp": 0.01001487, + "balance_loss_clip": 1.00847769, + "balance_loss_mlp": 1.00046062, + "epoch": 0.754967683751691, + "flos": 85102520832000.0, + "grad_norm": 0.7017534925596687, + "language_loss": 0.50425279, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52458161, + "num_input_tokens_seen": 270886155, + "router_z_loss_clip": 0.22937012, + "router_z_loss_mlp": 0.01025391, + "step": 12557, + "time_per_iteration": 3.2987804412841797 + }, + { + "auxiliary_loss_clip": 0.01115885, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.04171133, + "balance_loss_mlp": 1.01656854, + "epoch": 0.7550278070043589, + "flos": 29800303267680.0, + "grad_norm": 1.7619910083842947, + "language_loss": 0.71567917, + "learning_rate": 5.970530282978525e-07, + "loss": 0.73712415, + "num_input_tokens_seen": 270905325, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12042236, + "step": 12558, + "time_per_iteration": 2.705265760421753 + }, + { + "auxiliary_loss_clip": 0.01112192, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.03974891, + "balance_loss_mlp": 1.01568675, + "epoch": 0.7550879302570269, + "flos": 39821755533120.0, + "grad_norm": 1.7402192782942052, + "language_loss": 0.80048835, + "learning_rate": 5.967754872918187e-07, + "loss": 0.82188404, + "num_input_tokens_seen": 270927535, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11688232, + "step": 12559, + "time_per_iteration": 2.892735242843628 + }, + { + "auxiliary_loss_clip": 0.01114463, + "auxiliary_loss_mlp": 0.01027661, + "balance_loss_clip": 1.03989029, + "balance_loss_mlp": 1.0156033, + "epoch": 0.7551480535096948, + "flos": 26595357441120.0, + "grad_norm": 1.9044567109748065, + "language_loss": 0.78527445, + "learning_rate": 5.96497999496199e-07, + "loss": 0.8066957, + "num_input_tokens_seen": 270946920, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12072754, + "step": 12560, + "time_per_iteration": 2.625378131866455 + }, + { + "auxiliary_loss_clip": 0.01110214, + "auxiliary_loss_mlp": 0.01033632, + "balance_loss_clip": 1.03933442, + "balance_loss_mlp": 1.02239609, + "epoch": 0.7552081767623628, + "flos": 22592011367520.0, + "grad_norm": 2.4171523312353544, + "language_loss": 0.70539892, + "learning_rate": 5.96220564921515e-07, + "loss": 0.7268374, + "num_input_tokens_seen": 270965705, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11224365, + "step": 12561, + "time_per_iteration": 2.686189889907837 + }, + { + "auxiliary_loss_clip": 0.01110804, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.03733265, + "balance_loss_mlp": 1.01653767, + "epoch": 0.7552683000150308, + "flos": 33720723169920.0, + "grad_norm": 2.013654874639352, + "language_loss": 0.75697911, + "learning_rate": 5.959431835782889e-07, + "loss": 0.77836949, + "num_input_tokens_seen": 270986550, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11688232, + "step": 12562, + "time_per_iteration": 2.6708264350891113 + }, + { + "auxiliary_loss_clip": 0.01112073, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.03989935, + "balance_loss_mlp": 1.01742542, + "epoch": 0.7553284232676988, + "flos": 24773636760480.0, + "grad_norm": 2.291466918670069, + "language_loss": 0.75770527, + "learning_rate": 5.956658554770371e-07, + "loss": 0.77911609, + "num_input_tokens_seen": 271006250, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11572266, + "step": 12563, + "time_per_iteration": 2.6796658039093018 + }, + { + "auxiliary_loss_clip": 0.01120233, + "auxiliary_loss_mlp": 0.01035485, + "balance_loss_clip": 1.04068267, + "balance_loss_mlp": 1.02138281, + "epoch": 0.7553885465203668, + "flos": 40579320712320.0, + "grad_norm": 2.5199288910620283, + "language_loss": 0.66960502, + "learning_rate": 5.953885806282768e-07, + "loss": 0.69116223, + "num_input_tokens_seen": 271025575, + "router_z_loss_clip": 0.79638672, + "router_z_loss_mlp": 0.14093018, + "step": 12564, + "time_per_iteration": 2.6998353004455566 + }, + { + "auxiliary_loss_clip": 0.01116665, + "auxiliary_loss_mlp": 0.01032706, + "balance_loss_clip": 1.04085422, + "balance_loss_mlp": 1.02060056, + "epoch": 0.7554486697730347, + "flos": 26376271742880.0, + "grad_norm": 3.13019552104029, + "language_loss": 0.6858542, + "learning_rate": 5.951113590425228e-07, + "loss": 0.70734787, + "num_input_tokens_seen": 271045805, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12109375, + "step": 12565, + "time_per_iteration": 4.122371435165405 + }, + { + "auxiliary_loss_clip": 0.01116128, + "auxiliary_loss_mlp": 0.01028284, + "balance_loss_clip": 1.0381968, + "balance_loss_mlp": 1.01546311, + "epoch": 0.7555087930257027, + "flos": 33718656788640.0, + "grad_norm": 1.4678965560737067, + "language_loss": 0.75151205, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77295607, + "num_input_tokens_seen": 271066065, + "router_z_loss_clip": 0.77978516, + "router_z_loss_mlp": 0.12817383, + "step": 12566, + "time_per_iteration": 2.6741578578948975 + }, + { + "auxiliary_loss_clip": 0.01117854, + "auxiliary_loss_mlp": 0.01032598, + "balance_loss_clip": 1.04141247, + "balance_loss_mlp": 1.01971722, + "epoch": 0.7555689162783706, + "flos": 28823530838400.0, + "grad_norm": 3.5014800563661854, + "language_loss": 0.74012148, + "learning_rate": 5.945570757020789e-07, + "loss": 0.76162601, + "num_input_tokens_seen": 271085870, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12884521, + "step": 12567, + "time_per_iteration": 2.6343467235565186 + }, + { + "auxiliary_loss_clip": 0.01112509, + "auxiliary_loss_mlp": 0.01031557, + "balance_loss_clip": 1.03917849, + "balance_loss_mlp": 1.02008295, + "epoch": 0.7556290395310387, + "flos": 30338337058560.0, + "grad_norm": 1.8197891868319869, + "language_loss": 0.62988043, + "learning_rate": 5.942800139684073e-07, + "loss": 0.65132105, + "num_input_tokens_seen": 271104260, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11480713, + "step": 12568, + "time_per_iteration": 4.0245583057403564 + }, + { + "auxiliary_loss_clip": 0.01111064, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.03906703, + "balance_loss_mlp": 1.01922369, + "epoch": 0.7556891627837066, + "flos": 53133956523360.0, + "grad_norm": 2.6833092871364532, + "language_loss": 0.66819108, + "learning_rate": 5.940030055397789e-07, + "loss": 0.68960857, + "num_input_tokens_seen": 271125745, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11462402, + "step": 12569, + "time_per_iteration": 2.8255207538604736 + }, + { + "auxiliary_loss_clip": 0.01118377, + "auxiliary_loss_mlp": 0.01033312, + "balance_loss_clip": 1.04201174, + "balance_loss_mlp": 1.02055669, + "epoch": 0.7557492860363746, + "flos": 32520002968800.0, + "grad_norm": 1.78782419574886, + "language_loss": 0.67718542, + "learning_rate": 5.93726050426697e-07, + "loss": 0.69870222, + "num_input_tokens_seen": 271147145, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12768555, + "step": 12570, + "time_per_iteration": 2.6672863960266113 + }, + { + "auxiliary_loss_clip": 0.01115528, + "auxiliary_loss_mlp": 0.01032251, + "balance_loss_clip": 1.0413909, + "balance_loss_mlp": 1.01980519, + "epoch": 0.7558094092890425, + "flos": 67336883940960.0, + "grad_norm": 1.8850503224052182, + "language_loss": 0.71948719, + "learning_rate": 5.934491486396647e-07, + "loss": 0.74096501, + "num_input_tokens_seen": 271170865, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12457275, + "step": 12571, + "time_per_iteration": 2.9427011013031006 + }, + { + "auxiliary_loss_clip": 0.01115672, + "auxiliary_loss_mlp": 0.01029832, + "balance_loss_clip": 1.03990984, + "balance_loss_mlp": 1.01777983, + "epoch": 0.7558695325417105, + "flos": 29270778105600.0, + "grad_norm": 1.781806828165434, + "language_loss": 0.73657179, + "learning_rate": 5.931723001891811e-07, + "loss": 0.75802678, + "num_input_tokens_seen": 271191450, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12060547, + "step": 12572, + "time_per_iteration": 2.6915907859802246 + }, + { + "auxiliary_loss_clip": 0.01115462, + "auxiliary_loss_mlp": 0.01031175, + "balance_loss_clip": 1.04037654, + "balance_loss_mlp": 1.0195874, + "epoch": 0.7559296557943784, + "flos": 17828223387840.0, + "grad_norm": 2.3527997613544063, + "language_loss": 0.7679916, + "learning_rate": 5.928955050857456e-07, + "loss": 0.78945804, + "num_input_tokens_seen": 271207335, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.1159668, + "step": 12573, + "time_per_iteration": 2.604311943054199 + }, + { + "auxiliary_loss_clip": 0.01116266, + "auxiliary_loss_mlp": 0.01031326, + "balance_loss_clip": 1.04053235, + "balance_loss_mlp": 1.01945925, + "epoch": 0.7559897790470465, + "flos": 22635277472160.0, + "grad_norm": 1.7769720995455058, + "language_loss": 0.69302905, + "learning_rate": 5.926187633398527e-07, + "loss": 0.71450496, + "num_input_tokens_seen": 271226895, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11871338, + "step": 12574, + "time_per_iteration": 2.583253860473633 + }, + { + "auxiliary_loss_clip": 0.01109062, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.03718007, + "balance_loss_mlp": 1.02095962, + "epoch": 0.7560499022997144, + "flos": 21924219780000.0, + "grad_norm": 4.786347116629093, + "language_loss": 0.71718419, + "learning_rate": 5.923420749619974e-07, + "loss": 0.73860359, + "num_input_tokens_seen": 271244375, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11914062, + "step": 12575, + "time_per_iteration": 4.076395750045776 + }, + { + "auxiliary_loss_clip": 0.01109526, + "auxiliary_loss_mlp": 0.01029323, + "balance_loss_clip": 1.03753328, + "balance_loss_mlp": 1.01852894, + "epoch": 0.7561100255523824, + "flos": 19203345077760.0, + "grad_norm": 4.097507551864991, + "language_loss": 0.72363186, + "learning_rate": 5.92065439962673e-07, + "loss": 0.74502039, + "num_input_tokens_seen": 271259530, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10797119, + "step": 12576, + "time_per_iteration": 3.9274983406066895 + }, + { + "auxiliary_loss_clip": 0.0111201, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.04007459, + "balance_loss_mlp": 1.01755595, + "epoch": 0.7561701488050504, + "flos": 19386336229920.0, + "grad_norm": 1.8682064328091632, + "language_loss": 0.67209768, + "learning_rate": 5.917888583523669e-07, + "loss": 0.6935091, + "num_input_tokens_seen": 271276835, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11578369, + "step": 12577, + "time_per_iteration": 2.592506170272827 + }, + { + "auxiliary_loss_clip": 0.0111056, + "auxiliary_loss_mlp": 0.01034045, + "balance_loss_clip": 1.03817952, + "balance_loss_mlp": 1.02251172, + "epoch": 0.7562302720577183, + "flos": 24817875279840.0, + "grad_norm": 2.9780069617205913, + "language_loss": 0.7777468, + "learning_rate": 5.915123301415685e-07, + "loss": 0.79919291, + "num_input_tokens_seen": 271296275, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11547852, + "step": 12578, + "time_per_iteration": 2.6064484119415283 + }, + { + "auxiliary_loss_clip": 0.01113587, + "auxiliary_loss_mlp": 0.01030097, + "balance_loss_clip": 1.03896546, + "balance_loss_mlp": 1.01827133, + "epoch": 0.7562903953103863, + "flos": 25395244999200.0, + "grad_norm": 1.3989634630110042, + "language_loss": 0.75524062, + "learning_rate": 5.912358553407641e-07, + "loss": 0.77667743, + "num_input_tokens_seen": 271315685, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.1182251, + "step": 12579, + "time_per_iteration": 2.657135009765625 + }, + { + "auxiliary_loss_clip": 0.01118495, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.040694, + "balance_loss_mlp": 1.01663816, + "epoch": 0.7563505185630542, + "flos": 45877212306720.0, + "grad_norm": 2.631592993377698, + "language_loss": 0.62614059, + "learning_rate": 5.90959433960437e-07, + "loss": 0.64761949, + "num_input_tokens_seen": 271336790, + "router_z_loss_clip": 0.77832031, + "router_z_loss_mlp": 0.12750244, + "step": 12580, + "time_per_iteration": 2.714872121810913 + }, + { + "auxiliary_loss_clip": 0.01112235, + "auxiliary_loss_mlp": 0.01026994, + "balance_loss_clip": 1.03952694, + "balance_loss_mlp": 1.01587152, + "epoch": 0.7564106418157223, + "flos": 24685767480960.0, + "grad_norm": 1.9723894176517272, + "language_loss": 0.75454843, + "learning_rate": 5.906830660110691e-07, + "loss": 0.77594072, + "num_input_tokens_seen": 271355470, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11132812, + "step": 12581, + "time_per_iteration": 2.6466224193573 + }, + { + "auxiliary_loss_clip": 0.01115569, + "auxiliary_loss_mlp": 0.01034015, + "balance_loss_clip": 1.04001713, + "balance_loss_mlp": 1.02219558, + "epoch": 0.7564707650683902, + "flos": 30206188742400.0, + "grad_norm": 3.9134333313645615, + "language_loss": 0.62711632, + "learning_rate": 5.904067515031412e-07, + "loss": 0.64861214, + "num_input_tokens_seen": 271375810, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11816406, + "step": 12582, + "time_per_iteration": 2.69138765335083 + }, + { + "auxiliary_loss_clip": 0.01031676, + "auxiliary_loss_mlp": 0.01001458, + "balance_loss_clip": 1.00874865, + "balance_loss_mlp": 1.00041986, + "epoch": 0.7565308883210582, + "flos": 59216397805440.0, + "grad_norm": 0.9738950421382485, + "language_loss": 0.60625416, + "learning_rate": 5.901304904471307e-07, + "loss": 0.62658554, + "num_input_tokens_seen": 271424775, + "router_z_loss_clip": 0.22912598, + "router_z_loss_mlp": 0.01039124, + "step": 12583, + "time_per_iteration": 3.0287933349609375 + }, + { + "auxiliary_loss_clip": 0.01113516, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.03946388, + "balance_loss_mlp": 1.02152574, + "epoch": 0.7565910115737261, + "flos": 15245898732000.0, + "grad_norm": 2.007769300215403, + "language_loss": 0.79269624, + "learning_rate": 5.898542828535125e-07, + "loss": 0.81416595, + "num_input_tokens_seen": 271440500, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.1192627, + "step": 12584, + "time_per_iteration": 2.59075665473938 + }, + { + "auxiliary_loss_clip": 0.01111253, + "auxiliary_loss_mlp": 0.01025555, + "balance_loss_clip": 1.04074144, + "balance_loss_mlp": 1.01440346, + "epoch": 0.7566511348263941, + "flos": 25835887949760.0, + "grad_norm": 4.662889153132058, + "language_loss": 0.77561164, + "learning_rate": 5.895781287327612e-07, + "loss": 0.79697973, + "num_input_tokens_seen": 271458180, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11151123, + "step": 12585, + "time_per_iteration": 2.667963743209839 + }, + { + "auxiliary_loss_clip": 0.01117658, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.04180336, + "balance_loss_mlp": 1.02132249, + "epoch": 0.756711258079062, + "flos": 26546054261760.0, + "grad_norm": 1.9607179950581077, + "language_loss": 0.8292253, + "learning_rate": 5.893020280953493e-07, + "loss": 0.850739, + "num_input_tokens_seen": 271475730, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.1239624, + "step": 12586, + "time_per_iteration": 2.620816946029663 + }, + { + "auxiliary_loss_clip": 0.01116329, + "auxiliary_loss_mlp": 0.01028788, + "balance_loss_clip": 1.04126716, + "balance_loss_mlp": 1.01769614, + "epoch": 0.75677138133173, + "flos": 27311763414240.0, + "grad_norm": 2.314940919809317, + "language_loss": 0.83244753, + "learning_rate": 5.890259809517459e-07, + "loss": 0.85389864, + "num_input_tokens_seen": 271495030, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11096191, + "step": 12587, + "time_per_iteration": 2.6377151012420654 + }, + { + "auxiliary_loss_clip": 0.01113643, + "auxiliary_loss_mlp": 0.01028498, + "balance_loss_clip": 1.04084754, + "balance_loss_mlp": 1.01688123, + "epoch": 0.756831504584398, + "flos": 27709869571200.0, + "grad_norm": 1.582862899862312, + "language_loss": 0.71367717, + "learning_rate": 5.88749987312418e-07, + "loss": 0.7350986, + "num_input_tokens_seen": 271515355, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.1161499, + "step": 12588, + "time_per_iteration": 2.6392159461975098 + }, + { + "auxiliary_loss_clip": 0.01115569, + "auxiliary_loss_mlp": 0.01029736, + "balance_loss_clip": 1.04023349, + "balance_loss_mlp": 1.01758265, + "epoch": 0.756891627837066, + "flos": 29405033320320.0, + "grad_norm": 1.8368972585131877, + "language_loss": 0.69057953, + "learning_rate": 5.884740471878327e-07, + "loss": 0.71203256, + "num_input_tokens_seen": 271535090, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12145996, + "step": 12589, + "time_per_iteration": 2.6606974601745605 + }, + { + "auxiliary_loss_clip": 0.01112527, + "auxiliary_loss_mlp": 0.01027653, + "balance_loss_clip": 1.03921878, + "balance_loss_mlp": 1.01626885, + "epoch": 0.756951751089734, + "flos": 24098308958880.0, + "grad_norm": 1.728814571340785, + "language_loss": 0.92042267, + "learning_rate": 5.881981605884522e-07, + "loss": 0.94182444, + "num_input_tokens_seen": 271551075, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11383057, + "step": 12590, + "time_per_iteration": 2.58821964263916 + }, + { + "auxiliary_loss_clip": 0.01109268, + "auxiliary_loss_mlp": 0.01029894, + "balance_loss_clip": 1.03783822, + "balance_loss_mlp": 1.01880181, + "epoch": 0.7570118743424019, + "flos": 42804901003680.0, + "grad_norm": 1.997070950476957, + "language_loss": 0.65496826, + "learning_rate": 5.879223275247391e-07, + "loss": 0.67635989, + "num_input_tokens_seen": 271571035, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11083984, + "step": 12591, + "time_per_iteration": 2.7800495624542236 + }, + { + "auxiliary_loss_clip": 0.01111923, + "auxiliary_loss_mlp": 0.01022356, + "balance_loss_clip": 1.04021692, + "balance_loss_mlp": 1.0114429, + "epoch": 0.7570719975950699, + "flos": 31230765211680.0, + "grad_norm": 2.4618499442528825, + "language_loss": 0.7352947, + "learning_rate": 5.876465480071528e-07, + "loss": 0.75663751, + "num_input_tokens_seen": 271592950, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10913086, + "step": 12592, + "time_per_iteration": 2.6832098960876465 + }, + { + "auxiliary_loss_clip": 0.01113347, + "auxiliary_loss_mlp": 0.01034921, + "balance_loss_clip": 1.03824484, + "balance_loss_mlp": 1.02287507, + "epoch": 0.7571321208477378, + "flos": 13197272518080.0, + "grad_norm": 2.314700000152466, + "language_loss": 0.71798134, + "learning_rate": 5.873708220461522e-07, + "loss": 0.73946404, + "num_input_tokens_seen": 271608835, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12042236, + "step": 12593, + "time_per_iteration": 2.589951276779175 + }, + { + "auxiliary_loss_clip": 0.01114198, + "auxiliary_loss_mlp": 0.0103058, + "balance_loss_clip": 1.0394727, + "balance_loss_mlp": 1.01889777, + "epoch": 0.7571922441004059, + "flos": 22280802075360.0, + "grad_norm": 2.419006238040998, + "language_loss": 0.66259944, + "learning_rate": 5.870951496521903e-07, + "loss": 0.68404722, + "num_input_tokens_seen": 271627730, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11688232, + "step": 12594, + "time_per_iteration": 2.7013864517211914 + }, + { + "auxiliary_loss_clip": 0.01116352, + "auxiliary_loss_mlp": 0.01030536, + "balance_loss_clip": 1.03988874, + "balance_loss_mlp": 1.0188539, + "epoch": 0.7572523673530738, + "flos": 27931467340800.0, + "grad_norm": 2.63669408994603, + "language_loss": 0.80805111, + "learning_rate": 5.86819530835722e-07, + "loss": 0.82951999, + "num_input_tokens_seen": 271646415, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.11669922, + "step": 12595, + "time_per_iteration": 2.6928770542144775 + }, + { + "auxiliary_loss_clip": 0.01112288, + "auxiliary_loss_mlp": 0.01029438, + "balance_loss_clip": 1.03992438, + "balance_loss_mlp": 1.0183636, + "epoch": 0.7573124906057418, + "flos": 25619760012960.0, + "grad_norm": 2.116519125142513, + "language_loss": 0.71893328, + "learning_rate": 5.865439656071993e-07, + "loss": 0.74035048, + "num_input_tokens_seen": 271666240, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11065674, + "step": 12596, + "time_per_iteration": 2.6540818214416504 + }, + { + "auxiliary_loss_clip": 0.01113261, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.0410527, + "balance_loss_mlp": 1.02005792, + "epoch": 0.7573726138584097, + "flos": 25485545315520.0, + "grad_norm": 1.767901466302539, + "language_loss": 0.80107152, + "learning_rate": 5.862684539770706e-07, + "loss": 0.82251441, + "num_input_tokens_seen": 271686370, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10968018, + "step": 12597, + "time_per_iteration": 2.6500308513641357 + }, + { + "auxiliary_loss_clip": 0.01119794, + "auxiliary_loss_mlp": 0.01031787, + "balance_loss_clip": 1.04343188, + "balance_loss_mlp": 1.01940703, + "epoch": 0.7574327371110777, + "flos": 29932897273920.0, + "grad_norm": 1.7373613939461732, + "language_loss": 0.83647311, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85798895, + "num_input_tokens_seen": 271705050, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12390137, + "step": 12598, + "time_per_iteration": 2.7491276264190674 + }, + { + "auxiliary_loss_clip": 0.01111639, + "auxiliary_loss_mlp": 0.01025107, + "balance_loss_clip": 1.04004264, + "balance_loss_mlp": 1.01464081, + "epoch": 0.7574928603637456, + "flos": 28509931026720.0, + "grad_norm": 2.038655155182345, + "language_loss": 0.62820673, + "learning_rate": 5.857175915537845e-07, + "loss": 0.64957416, + "num_input_tokens_seen": 271724915, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10455322, + "step": 12599, + "time_per_iteration": 2.6208608150482178 + }, + { + "auxiliary_loss_clip": 0.011171, + "auxiliary_loss_mlp": 0.01032466, + "balance_loss_clip": 1.04126143, + "balance_loss_mlp": 1.01927543, + "epoch": 0.7575529836164137, + "flos": 16492072970880.0, + "grad_norm": 2.8859766171816923, + "language_loss": 0.63363481, + "learning_rate": 5.854422407815161e-07, + "loss": 0.65513051, + "num_input_tokens_seen": 271742410, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.13195801, + "step": 12600, + "time_per_iteration": 2.660841464996338 + }, + { + "auxiliary_loss_clip": 0.01111343, + "auxiliary_loss_mlp": 0.01029267, + "balance_loss_clip": 1.03989029, + "balance_loss_mlp": 1.01774526, + "epoch": 0.7576131068690816, + "flos": 23972683924800.0, + "grad_norm": 2.4995668176951766, + "language_loss": 0.66386586, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68527198, + "num_input_tokens_seen": 271761425, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11517334, + "step": 12601, + "time_per_iteration": 2.6284091472625732 + }, + { + "auxiliary_loss_clip": 0.01109319, + "auxiliary_loss_mlp": 0.01030321, + "balance_loss_clip": 1.03907609, + "balance_loss_mlp": 1.01959229, + "epoch": 0.7576732301217496, + "flos": 24462792123840.0, + "grad_norm": 1.943957138313668, + "language_loss": 0.67675555, + "learning_rate": 5.848917001679335e-07, + "loss": 0.69815195, + "num_input_tokens_seen": 271780875, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.1072998, + "step": 12602, + "time_per_iteration": 2.7022511959075928 + }, + { + "auxiliary_loss_clip": 0.01114511, + "auxiliary_loss_mlp": 0.01033823, + "balance_loss_clip": 1.0406965, + "balance_loss_mlp": 1.02161598, + "epoch": 0.7577333533744176, + "flos": 18762783161760.0, + "grad_norm": 2.547781472588773, + "language_loss": 0.6716572, + "learning_rate": 5.846165103474967e-07, + "loss": 0.69314051, + "num_input_tokens_seen": 271799490, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12200928, + "step": 12603, + "time_per_iteration": 2.585252285003662 + }, + { + "auxiliary_loss_clip": 0.01109085, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.03801513, + "balance_loss_mlp": 1.02237976, + "epoch": 0.7577934766270855, + "flos": 21835134982080.0, + "grad_norm": 2.169585294759722, + "language_loss": 0.62142646, + "learning_rate": 5.843413741985439e-07, + "loss": 0.64284402, + "num_input_tokens_seen": 271817040, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10290527, + "step": 12604, + "time_per_iteration": 4.114248275756836 + }, + { + "auxiliary_loss_clip": 0.01113938, + "auxiliary_loss_mlp": 0.01032673, + "balance_loss_clip": 1.04203272, + "balance_loss_mlp": 1.02112174, + "epoch": 0.7578535998797535, + "flos": 26373476050560.0, + "grad_norm": 3.852892751038136, + "language_loss": 0.79757929, + "learning_rate": 5.840662917315076e-07, + "loss": 0.81904542, + "num_input_tokens_seen": 271835480, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11553955, + "step": 12605, + "time_per_iteration": 2.758718490600586 + }, + { + "auxiliary_loss_clip": 0.01117167, + "auxiliary_loss_mlp": 0.01029732, + "balance_loss_clip": 1.04118109, + "balance_loss_mlp": 1.0179956, + "epoch": 0.7579137231324214, + "flos": 22547205606240.0, + "grad_norm": 3.874347754865577, + "language_loss": 0.80125463, + "learning_rate": 5.837912629568198e-07, + "loss": 0.82272363, + "num_input_tokens_seen": 271849835, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11737061, + "step": 12606, + "time_per_iteration": 2.60064697265625 + }, + { + "auxiliary_loss_clip": 0.0111095, + "auxiliary_loss_mlp": 0.0102757, + "balance_loss_clip": 1.0414052, + "balance_loss_mlp": 1.0175209, + "epoch": 0.7579738463850895, + "flos": 28376324088480.0, + "grad_norm": 1.3434663054468805, + "language_loss": 0.73130113, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75268638, + "num_input_tokens_seen": 271869560, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10046387, + "step": 12607, + "time_per_iteration": 3.999542474746704 + }, + { + "auxiliary_loss_clip": 0.01115572, + "auxiliary_loss_mlp": 0.01029804, + "balance_loss_clip": 1.03915906, + "balance_loss_mlp": 1.01812148, + "epoch": 0.7580339696377574, + "flos": 17115342418080.0, + "grad_norm": 2.055218428373182, + "language_loss": 0.74538058, + "learning_rate": 5.83241366526202e-07, + "loss": 0.76683438, + "num_input_tokens_seen": 271887950, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11688232, + "step": 12608, + "time_per_iteration": 2.612781524658203 + }, + { + "auxiliary_loss_clip": 0.01110819, + "auxiliary_loss_mlp": 0.01031874, + "balance_loss_clip": 1.03945649, + "balance_loss_mlp": 1.02026308, + "epoch": 0.7580940928904254, + "flos": 30606361280640.0, + "grad_norm": 1.5854858383098789, + "language_loss": 0.71461362, + "learning_rate": 5.829664988911245e-07, + "loss": 0.73604059, + "num_input_tokens_seen": 271907700, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11602783, + "step": 12609, + "time_per_iteration": 2.6382291316986084 + }, + { + "auxiliary_loss_clip": 0.01113096, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.03897619, + "balance_loss_mlp": 1.01764214, + "epoch": 0.7581542161430933, + "flos": 29086004193120.0, + "grad_norm": 3.284149222815441, + "language_loss": 0.81557298, + "learning_rate": 5.826916849901007e-07, + "loss": 0.83700597, + "num_input_tokens_seen": 271926840, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12567139, + "step": 12610, + "time_per_iteration": 2.6416656970977783 + }, + { + "auxiliary_loss_clip": 0.0111688, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.04200888, + "balance_loss_mlp": 1.01796222, + "epoch": 0.7582143393957613, + "flos": 27133715370240.0, + "grad_norm": 1.6306384715749977, + "language_loss": 0.70473015, + "learning_rate": 5.824169248335488e-07, + "loss": 0.72619301, + "num_input_tokens_seen": 271946465, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11437988, + "step": 12611, + "time_per_iteration": 2.6143102645874023 + }, + { + "auxiliary_loss_clip": 0.01112818, + "auxiliary_loss_mlp": 0.01029031, + "balance_loss_clip": 1.0403192, + "balance_loss_mlp": 1.01780725, + "epoch": 0.7582744626484292, + "flos": 25754015227680.0, + "grad_norm": 1.5984259416105204, + "language_loss": 0.71025366, + "learning_rate": 5.821422184318893e-07, + "loss": 0.73167217, + "num_input_tokens_seen": 271967295, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11230469, + "step": 12612, + "time_per_iteration": 2.6509523391723633 + }, + { + "auxiliary_loss_clip": 0.01115442, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.04084742, + "balance_loss_mlp": 1.02492833, + "epoch": 0.7583345859010973, + "flos": 30023400176640.0, + "grad_norm": 1.4172568798239407, + "language_loss": 0.59665543, + "learning_rate": 5.818675657955397e-07, + "loss": 0.61816812, + "num_input_tokens_seen": 271987960, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.10913086, + "step": 12613, + "time_per_iteration": 2.6643218994140625 + }, + { + "auxiliary_loss_clip": 0.01112911, + "auxiliary_loss_mlp": 0.01032442, + "balance_loss_clip": 1.039536, + "balance_loss_mlp": 1.02124846, + "epoch": 0.7583947091537652, + "flos": 40934890075680.0, + "grad_norm": 1.819326393359211, + "language_loss": 0.59943813, + "learning_rate": 5.815929669349135e-07, + "loss": 0.62089169, + "num_input_tokens_seen": 272011780, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11187744, + "step": 12614, + "time_per_iteration": 4.274374008178711 + }, + { + "auxiliary_loss_clip": 0.0111433, + "auxiliary_loss_mlp": 0.01028248, + "balance_loss_clip": 1.0389148, + "balance_loss_mlp": 1.01624942, + "epoch": 0.7584548324064332, + "flos": 24551593300800.0, + "grad_norm": 1.851840754805295, + "language_loss": 0.73147976, + "learning_rate": 5.813184218604246e-07, + "loss": 0.75290555, + "num_input_tokens_seen": 272030825, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11993408, + "step": 12615, + "time_per_iteration": 2.6880061626434326 + }, + { + "auxiliary_loss_clip": 0.01031545, + "auxiliary_loss_mlp": 0.01002777, + "balance_loss_clip": 1.00867748, + "balance_loss_mlp": 1.00173271, + "epoch": 0.7585149556591012, + "flos": 85905296945280.0, + "grad_norm": 0.8017090312824167, + "language_loss": 0.67624617, + "learning_rate": 5.810439305824828e-07, + "loss": 0.69658935, + "num_input_tokens_seen": 272095825, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.01044464, + "step": 12616, + "time_per_iteration": 4.667433023452759 + }, + { + "auxiliary_loss_clip": 0.01115468, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.04012847, + "balance_loss_mlp": 1.02375281, + "epoch": 0.7585750789117691, + "flos": 19698315350400.0, + "grad_norm": 2.088305622211488, + "language_loss": 0.8471241, + "learning_rate": 5.807694931114979e-07, + "loss": 0.8686325, + "num_input_tokens_seen": 272113950, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.1161499, + "step": 12617, + "time_per_iteration": 2.7808046340942383 + }, + { + "auxiliary_loss_clip": 0.01116136, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.04242849, + "balance_loss_mlp": 1.02101374, + "epoch": 0.7586352021644371, + "flos": 21344986265760.0, + "grad_norm": 3.1336840655647946, + "language_loss": 0.75109291, + "learning_rate": 5.804951094578757e-07, + "loss": 0.77257597, + "num_input_tokens_seen": 272130315, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.1116333, + "step": 12618, + "time_per_iteration": 2.7173287868499756 + }, + { + "auxiliary_loss_clip": 0.01118777, + "auxiliary_loss_mlp": 0.01035857, + "balance_loss_clip": 1.04076958, + "balance_loss_mlp": 1.02385902, + "epoch": 0.758695325417105, + "flos": 21078906873120.0, + "grad_norm": 2.1126870783480713, + "language_loss": 0.77499437, + "learning_rate": 5.802207796320209e-07, + "loss": 0.79654074, + "num_input_tokens_seen": 272149080, + "router_z_loss_clip": 0.77929688, + "router_z_loss_mlp": 0.11993408, + "step": 12619, + "time_per_iteration": 2.6579959392547607 + }, + { + "auxiliary_loss_clip": 0.01111971, + "auxiliary_loss_mlp": 0.01031858, + "balance_loss_clip": 1.04005885, + "balance_loss_mlp": 1.02036023, + "epoch": 0.7587554486697731, + "flos": 35991473878080.0, + "grad_norm": 2.0232850886383127, + "language_loss": 0.82587361, + "learning_rate": 5.79946503644337e-07, + "loss": 0.84731191, + "num_input_tokens_seen": 272168285, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.1149292, + "step": 12620, + "time_per_iteration": 2.801661491394043 + }, + { + "auxiliary_loss_clip": 0.01117187, + "auxiliary_loss_mlp": 0.01036785, + "balance_loss_clip": 1.04095304, + "balance_loss_mlp": 1.02422667, + "epoch": 0.758815571922441, + "flos": 19646459582400.0, + "grad_norm": 2.3129065474855546, + "language_loss": 0.82709241, + "learning_rate": 5.796722815052242e-07, + "loss": 0.84863216, + "num_input_tokens_seen": 272184585, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12561035, + "step": 12621, + "time_per_iteration": 2.655409336090088 + }, + { + "auxiliary_loss_clip": 0.01114846, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.04110217, + "balance_loss_mlp": 1.02295423, + "epoch": 0.758875695175109, + "flos": 19698234315840.0, + "grad_norm": 1.9718360864984197, + "language_loss": 0.73779988, + "learning_rate": 5.7939811322508e-07, + "loss": 0.75929356, + "num_input_tokens_seen": 272200205, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11572266, + "step": 12622, + "time_per_iteration": 2.6901607513427734 + }, + { + "auxiliary_loss_clip": 0.01031818, + "auxiliary_loss_mlp": 0.01002789, + "balance_loss_clip": 1.00887346, + "balance_loss_mlp": 1.00174105, + "epoch": 0.7589358184277769, + "flos": 83537884707840.0, + "grad_norm": 0.9330189615616886, + "language_loss": 0.60854793, + "learning_rate": 5.791239988143024e-07, + "loss": 0.62889403, + "num_input_tokens_seen": 272259670, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 0.01047516, + "step": 12623, + "time_per_iteration": 3.298039674758911 + }, + { + "auxiliary_loss_clip": 0.01111528, + "auxiliary_loss_mlp": 0.01034244, + "balance_loss_clip": 1.0409565, + "balance_loss_mlp": 1.02378881, + "epoch": 0.7589959416804449, + "flos": 24461738674560.0, + "grad_norm": 2.570951770493995, + "language_loss": 0.6732949, + "learning_rate": 5.788499382832847e-07, + "loss": 0.69475257, + "num_input_tokens_seen": 272277925, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10455322, + "step": 12624, + "time_per_iteration": 2.6424918174743652 + }, + { + "auxiliary_loss_clip": 0.01110667, + "auxiliary_loss_mlp": 0.01025109, + "balance_loss_clip": 1.04003179, + "balance_loss_mlp": 1.01357555, + "epoch": 0.7590560649331128, + "flos": 22905124971840.0, + "grad_norm": 1.8162237792746292, + "language_loss": 0.75859356, + "learning_rate": 5.785759316424196e-07, + "loss": 0.77995127, + "num_input_tokens_seen": 272296010, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11541748, + "step": 12625, + "time_per_iteration": 2.6473007202148438 + }, + { + "auxiliary_loss_clip": 0.01112925, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.04176331, + "balance_loss_mlp": 1.02204609, + "epoch": 0.7591161881857809, + "flos": 36393591245760.0, + "grad_norm": 1.9082756250827215, + "language_loss": 0.63127643, + "learning_rate": 5.783019789020977e-07, + "loss": 0.65274465, + "num_input_tokens_seen": 272318330, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11846924, + "step": 12626, + "time_per_iteration": 2.6755001544952393 + }, + { + "auxiliary_loss_clip": 0.01115837, + "auxiliary_loss_mlp": 0.01039308, + "balance_loss_clip": 1.0419929, + "balance_loss_mlp": 1.02672577, + "epoch": 0.7591763114384488, + "flos": 24773393656800.0, + "grad_norm": 2.0828239014635845, + "language_loss": 0.74251974, + "learning_rate": 5.780280800727084e-07, + "loss": 0.76407117, + "num_input_tokens_seen": 272335265, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12591553, + "step": 12627, + "time_per_iteration": 2.6498494148254395 + }, + { + "auxiliary_loss_clip": 0.01116191, + "auxiliary_loss_mlp": 0.01029252, + "balance_loss_clip": 1.04193068, + "balance_loss_mlp": 1.01786792, + "epoch": 0.7592364346911168, + "flos": 25396825173120.0, + "grad_norm": 4.245669001391348, + "language_loss": 0.68696547, + "learning_rate": 5.777542351646356e-07, + "loss": 0.70841986, + "num_input_tokens_seen": 272354795, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11383057, + "step": 12628, + "time_per_iteration": 2.651604413986206 + }, + { + "auxiliary_loss_clip": 0.01122493, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.04392552, + "balance_loss_mlp": 1.01984656, + "epoch": 0.7592965579437848, + "flos": 25931171891520.0, + "grad_norm": 1.773445690180493, + "language_loss": 0.63072991, + "learning_rate": 5.774804441882648e-07, + "loss": 0.65227824, + "num_input_tokens_seen": 272372875, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.125, + "step": 12629, + "time_per_iteration": 2.6785175800323486 + }, + { + "auxiliary_loss_clip": 0.011075, + "auxiliary_loss_mlp": 0.01030355, + "balance_loss_clip": 1.03744245, + "balance_loss_mlp": 1.01930475, + "epoch": 0.7593566811964527, + "flos": 31986304526880.0, + "grad_norm": 1.919079933309666, + "language_loss": 0.77486408, + "learning_rate": 5.772067071539786e-07, + "loss": 0.79624259, + "num_input_tokens_seen": 272394715, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11047363, + "step": 12630, + "time_per_iteration": 2.6589019298553467 + }, + { + "auxiliary_loss_clip": 0.01031974, + "auxiliary_loss_mlp": 0.01001224, + "balance_loss_clip": 1.00898099, + "balance_loss_mlp": 1.00019622, + "epoch": 0.7594168044491207, + "flos": 86924241512640.0, + "grad_norm": 0.8110993523979231, + "language_loss": 0.61437452, + "learning_rate": 5.769330240721562e-07, + "loss": 0.6347065, + "num_input_tokens_seen": 272458775, + "router_z_loss_clip": 0.23022461, + "router_z_loss_mlp": 0.01027679, + "step": 12631, + "time_per_iteration": 3.348017454147339 + }, + { + "auxiliary_loss_clip": 0.01119955, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.04236531, + "balance_loss_mlp": 1.0180614, + "epoch": 0.7594769277017887, + "flos": 32474224792800.0, + "grad_norm": 2.0473090185357052, + "language_loss": 0.73939794, + "learning_rate": 5.766593949531767e-07, + "loss": 0.76091087, + "num_input_tokens_seen": 272479355, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.1328125, + "step": 12632, + "time_per_iteration": 2.686969757080078 + }, + { + "auxiliary_loss_clip": 0.01115128, + "auxiliary_loss_mlp": 0.01029994, + "balance_loss_clip": 1.04155552, + "balance_loss_mlp": 1.01843643, + "epoch": 0.7595370509544567, + "flos": 21471016472640.0, + "grad_norm": 2.6631094093506955, + "language_loss": 0.75039423, + "learning_rate": 5.763858198074154e-07, + "loss": 0.77184546, + "num_input_tokens_seen": 272493555, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11560059, + "step": 12633, + "time_per_iteration": 2.6481916904449463 + }, + { + "auxiliary_loss_clip": 0.01113861, + "auxiliary_loss_mlp": 0.01030058, + "balance_loss_clip": 1.04094028, + "balance_loss_mlp": 1.01968062, + "epoch": 0.7595971742071246, + "flos": 21967161746400.0, + "grad_norm": 1.9145151206042004, + "language_loss": 0.73432362, + "learning_rate": 5.76112298645246e-07, + "loss": 0.75576282, + "num_input_tokens_seen": 272508925, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.10369873, + "step": 12634, + "time_per_iteration": 2.613898992538452 + }, + { + "auxiliary_loss_clip": 0.01115295, + "auxiliary_loss_mlp": 0.01031077, + "balance_loss_clip": 1.04180741, + "balance_loss_mlp": 1.01977038, + "epoch": 0.7596572974597926, + "flos": 35191655526240.0, + "grad_norm": 1.8228584926753448, + "language_loss": 0.64889723, + "learning_rate": 5.758388314770408e-07, + "loss": 0.67036098, + "num_input_tokens_seen": 272528805, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11315918, + "step": 12635, + "time_per_iteration": 2.7096610069274902 + }, + { + "auxiliary_loss_clip": 0.01116524, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.04028118, + "balance_loss_mlp": 1.01645315, + "epoch": 0.7597174207124605, + "flos": 17427443090400.0, + "grad_norm": 1.887925431823469, + "language_loss": 0.68627703, + "learning_rate": 5.7556541831317e-07, + "loss": 0.70772827, + "num_input_tokens_seen": 272546655, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.121521, + "step": 12636, + "time_per_iteration": 2.615445375442505 + }, + { + "auxiliary_loss_clip": 0.01116487, + "auxiliary_loss_mlp": 0.01032117, + "balance_loss_clip": 1.04157221, + "balance_loss_mlp": 1.02039862, + "epoch": 0.7597775439651285, + "flos": 26464991885280.0, + "grad_norm": 2.032591057876996, + "language_loss": 0.8117134, + "learning_rate": 5.752920591640018e-07, + "loss": 0.83319944, + "num_input_tokens_seen": 272564010, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.1171875, + "step": 12637, + "time_per_iteration": 2.699638843536377 + }, + { + "auxiliary_loss_clip": 0.01112178, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.03822064, + "balance_loss_mlp": 1.01931262, + "epoch": 0.7598376672177964, + "flos": 44051966622720.0, + "grad_norm": 3.5238710161590725, + "language_loss": 0.66334909, + "learning_rate": 5.750187540399017e-07, + "loss": 0.68477845, + "num_input_tokens_seen": 272585840, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11450195, + "step": 12638, + "time_per_iteration": 2.764052152633667 + }, + { + "auxiliary_loss_clip": 0.01115332, + "auxiliary_loss_mlp": 0.01034897, + "balance_loss_clip": 1.04108846, + "balance_loss_mlp": 1.02210569, + "epoch": 0.7598977904704645, + "flos": 22368995493120.0, + "grad_norm": 3.3149940388881527, + "language_loss": 0.65287149, + "learning_rate": 5.747455029512323e-07, + "loss": 0.67437375, + "num_input_tokens_seen": 272602300, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12792969, + "step": 12639, + "time_per_iteration": 2.6142632961273193 + }, + { + "auxiliary_loss_clip": 0.01113135, + "auxiliary_loss_mlp": 0.0102804, + "balance_loss_clip": 1.04034352, + "balance_loss_mlp": 1.01622641, + "epoch": 0.7599579137231324, + "flos": 24639138442080.0, + "grad_norm": 2.236046006410314, + "language_loss": 0.70542741, + "learning_rate": 5.744723059083572e-07, + "loss": 0.72683907, + "num_input_tokens_seen": 272619595, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.1182251, + "step": 12640, + "time_per_iteration": 2.6552698612213135 + }, + { + "auxiliary_loss_clip": 0.01116543, + "auxiliary_loss_mlp": 0.01030604, + "balance_loss_clip": 1.04060388, + "balance_loss_mlp": 1.01867127, + "epoch": 0.7600180369758004, + "flos": 29315381280480.0, + "grad_norm": 2.0239803865138657, + "language_loss": 0.67395341, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69542491, + "num_input_tokens_seen": 272638825, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.1194458, + "step": 12641, + "time_per_iteration": 2.6625072956085205 + }, + { + "auxiliary_loss_clip": 0.01113882, + "auxiliary_loss_mlp": 0.01027468, + "balance_loss_clip": 1.038486, + "balance_loss_mlp": 1.01505816, + "epoch": 0.7600781602284684, + "flos": 23170920743520.0, + "grad_norm": 2.733012820752565, + "language_loss": 0.67099404, + "learning_rate": 5.73926074001422e-07, + "loss": 0.69240761, + "num_input_tokens_seen": 272657240, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12426758, + "step": 12642, + "time_per_iteration": 2.6389198303222656 + }, + { + "auxiliary_loss_clip": 0.0111448, + "auxiliary_loss_mlp": 0.0103122, + "balance_loss_clip": 1.0430584, + "balance_loss_mlp": 1.01962662, + "epoch": 0.7601382834811363, + "flos": 31808053896480.0, + "grad_norm": 1.8964493867959422, + "language_loss": 0.7609629, + "learning_rate": 5.736530391580765e-07, + "loss": 0.78241986, + "num_input_tokens_seen": 272677520, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11608887, + "step": 12643, + "time_per_iteration": 2.681795835494995 + }, + { + "auxiliary_loss_clip": 0.01115375, + "auxiliary_loss_mlp": 0.01035355, + "balance_loss_clip": 1.04103041, + "balance_loss_mlp": 1.02284956, + "epoch": 0.7601984067338043, + "flos": 22993561493280.0, + "grad_norm": 2.250977346441294, + "language_loss": 0.79059088, + "learning_rate": 5.733800584019508e-07, + "loss": 0.81209821, + "num_input_tokens_seen": 272696770, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12506104, + "step": 12644, + "time_per_iteration": 4.122596502304077 + }, + { + "auxiliary_loss_clip": 0.01112341, + "auxiliary_loss_mlp": 0.01028822, + "balance_loss_clip": 1.03865135, + "balance_loss_mlp": 1.0176518, + "epoch": 0.7602585299864723, + "flos": 30073716288000.0, + "grad_norm": 1.7111483041221305, + "language_loss": 0.80345011, + "learning_rate": 5.731071317433957e-07, + "loss": 0.82486171, + "num_input_tokens_seen": 272718340, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11169434, + "step": 12645, + "time_per_iteration": 2.726327657699585 + }, + { + "auxiliary_loss_clip": 0.01116702, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.041453, + "balance_loss_mlp": 1.0207144, + "epoch": 0.7603186532391403, + "flos": 29092405923360.0, + "grad_norm": 1.48704177401951, + "language_loss": 0.72979128, + "learning_rate": 5.728342591927611e-07, + "loss": 0.75128603, + "num_input_tokens_seen": 272739575, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12060547, + "step": 12646, + "time_per_iteration": 2.6381759643554688 + }, + { + "auxiliary_loss_clip": 0.01112283, + "auxiliary_loss_mlp": 0.0103346, + "balance_loss_clip": 1.03996098, + "balance_loss_mlp": 1.02227855, + "epoch": 0.7603787764918082, + "flos": 27088504436160.0, + "grad_norm": 2.6044970968569507, + "language_loss": 0.67115259, + "learning_rate": 5.725614407603949e-07, + "loss": 0.69261003, + "num_input_tokens_seen": 272758710, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11187744, + "step": 12647, + "time_per_iteration": 3.9200356006622314 + }, + { + "auxiliary_loss_clip": 0.01031228, + "auxiliary_loss_mlp": 0.01001282, + "balance_loss_clip": 1.00838304, + "balance_loss_mlp": 1.00026107, + "epoch": 0.7604388997444762, + "flos": 65996242456320.0, + "grad_norm": 0.6762499618449178, + "language_loss": 0.48942032, + "learning_rate": 5.722886764566415e-07, + "loss": 0.50974548, + "num_input_tokens_seen": 272814855, + "router_z_loss_clip": 0.22839355, + "router_z_loss_mlp": 0.01021576, + "step": 12648, + "time_per_iteration": 3.215296983718872 + }, + { + "auxiliary_loss_clip": 0.0111112, + "auxiliary_loss_mlp": 0.01031832, + "balance_loss_clip": 1.03944016, + "balance_loss_mlp": 1.02042341, + "epoch": 0.7604990229971441, + "flos": 24015909512160.0, + "grad_norm": 2.163269024018262, + "language_loss": 0.76724017, + "learning_rate": 5.720159662918451e-07, + "loss": 0.78866971, + "num_input_tokens_seen": 272834400, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11401367, + "step": 12649, + "time_per_iteration": 2.5966649055480957 + }, + { + "auxiliary_loss_clip": 0.01112373, + "auxiliary_loss_mlp": 0.01031972, + "balance_loss_clip": 1.04015696, + "balance_loss_mlp": 1.02012873, + "epoch": 0.7605591462498121, + "flos": 30783680013600.0, + "grad_norm": 1.976263617996349, + "language_loss": 0.68693215, + "learning_rate": 5.717433102763462e-07, + "loss": 0.70837557, + "num_input_tokens_seen": 272854760, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11853027, + "step": 12650, + "time_per_iteration": 2.7161953449249268 + }, + { + "auxiliary_loss_clip": 0.01031433, + "auxiliary_loss_mlp": 0.01002316, + "balance_loss_clip": 1.00844514, + "balance_loss_mlp": 1.0012815, + "epoch": 0.76061926950248, + "flos": 81489947287680.0, + "grad_norm": 0.75039842737433, + "language_loss": 0.62611806, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64645553, + "num_input_tokens_seen": 272919030, + "router_z_loss_clip": 0.22998047, + "router_z_loss_mlp": 0.01034546, + "step": 12651, + "time_per_iteration": 3.2467026710510254 + }, + { + "auxiliary_loss_clip": 0.01110781, + "auxiliary_loss_mlp": 0.01033076, + "balance_loss_clip": 1.03911901, + "balance_loss_mlp": 1.02212071, + "epoch": 0.7606793927551481, + "flos": 30918218849280.0, + "grad_norm": 1.6706924552517908, + "language_loss": 0.71304739, + "learning_rate": 5.711981607345951e-07, + "loss": 0.73448592, + "num_input_tokens_seen": 272938925, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10955811, + "step": 12652, + "time_per_iteration": 2.668881416320801 + }, + { + "auxiliary_loss_clip": 0.01114424, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.03966498, + "balance_loss_mlp": 1.02269602, + "epoch": 0.760739516007816, + "flos": 22236279935040.0, + "grad_norm": 4.576965099572525, + "language_loss": 0.80710435, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82859313, + "num_input_tokens_seen": 272954945, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11761475, + "step": 12653, + "time_per_iteration": 2.576404333114624 + }, + { + "auxiliary_loss_clip": 0.01119014, + "auxiliary_loss_mlp": 0.01030687, + "balance_loss_clip": 1.04184914, + "balance_loss_mlp": 1.01915348, + "epoch": 0.760799639260484, + "flos": 27527364626400.0, + "grad_norm": 1.6737981749431052, + "language_loss": 0.80002177, + "learning_rate": 5.706532279140785e-07, + "loss": 0.82151878, + "num_input_tokens_seen": 272972855, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.11523438, + "step": 12654, + "time_per_iteration": 4.01407790184021 + }, + { + "auxiliary_loss_clip": 0.01115466, + "auxiliary_loss_mlp": 0.01035095, + "balance_loss_clip": 1.03956389, + "balance_loss_mlp": 1.02279246, + "epoch": 0.760859762513152, + "flos": 27222030339840.0, + "grad_norm": 2.2928174533236456, + "language_loss": 0.7938984, + "learning_rate": 5.703808428001136e-07, + "loss": 0.81540406, + "num_input_tokens_seen": 272989895, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12310791, + "step": 12655, + "time_per_iteration": 2.6583168506622314 + }, + { + "auxiliary_loss_clip": 0.01110399, + "auxiliary_loss_mlp": 0.01026255, + "balance_loss_clip": 1.04006588, + "balance_loss_mlp": 1.01623547, + "epoch": 0.7609198857658199, + "flos": 30339268956000.0, + "grad_norm": 1.5260985358584322, + "language_loss": 0.68624675, + "learning_rate": 5.701085118974505e-07, + "loss": 0.70761329, + "num_input_tokens_seen": 273011695, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10015869, + "step": 12656, + "time_per_iteration": 4.0233893394470215 + }, + { + "auxiliary_loss_clip": 0.01115178, + "auxiliary_loss_mlp": 0.01027294, + "balance_loss_clip": 1.03792143, + "balance_loss_mlp": 1.01494348, + "epoch": 0.760980009018488, + "flos": 20543992912800.0, + "grad_norm": 2.354550852644787, + "language_loss": 0.73623967, + "learning_rate": 5.698362352164164e-07, + "loss": 0.75766444, + "num_input_tokens_seen": 273028815, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12359619, + "step": 12657, + "time_per_iteration": 2.6663262844085693 + }, + { + "auxiliary_loss_clip": 0.01030994, + "auxiliary_loss_mlp": 0.01001276, + "balance_loss_clip": 1.00808692, + "balance_loss_mlp": 1.0003022, + "epoch": 0.7610401322711559, + "flos": 74714235399360.0, + "grad_norm": 0.8525977543220296, + "language_loss": 0.64867681, + "learning_rate": 5.695640127673347e-07, + "loss": 0.66899949, + "num_input_tokens_seen": 273084080, + "router_z_loss_clip": 0.22900391, + "router_z_loss_mlp": 0.00972748, + "step": 12658, + "time_per_iteration": 3.2098071575164795 + }, + { + "auxiliary_loss_clip": 0.01109739, + "auxiliary_loss_mlp": 0.01031267, + "balance_loss_clip": 1.03914356, + "balance_loss_mlp": 1.02000761, + "epoch": 0.7611002555238239, + "flos": 23966079608160.0, + "grad_norm": 1.6825788858274524, + "language_loss": 0.79297435, + "learning_rate": 5.692918445605293e-07, + "loss": 0.81438446, + "num_input_tokens_seen": 273102295, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11254883, + "step": 12659, + "time_per_iteration": 2.6140058040618896 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01024343, + "balance_loss_clip": 1.03753042, + "balance_loss_mlp": 1.01282763, + "epoch": 0.7611603787764918, + "flos": 32789809951200.0, + "grad_norm": 1.5274728324750473, + "language_loss": 0.69139904, + "learning_rate": 5.690197306063209e-07, + "loss": 0.71274483, + "num_input_tokens_seen": 273123400, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.1151123, + "step": 12660, + "time_per_iteration": 2.7187178134918213 + }, + { + "auxiliary_loss_clip": 0.01114776, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.04013419, + "balance_loss_mlp": 1.01614904, + "epoch": 0.7612205020291598, + "flos": 33366045186720.0, + "grad_norm": 2.247099098258058, + "language_loss": 0.70433903, + "learning_rate": 5.687476709150281e-07, + "loss": 0.72576135, + "num_input_tokens_seen": 273145150, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11315918, + "step": 12661, + "time_per_iteration": 2.6678192615509033 + }, + { + "auxiliary_loss_clip": 0.01110942, + "auxiliary_loss_mlp": 0.01030318, + "balance_loss_clip": 1.03708661, + "balance_loss_mlp": 1.019243, + "epoch": 0.7612806252818277, + "flos": 35770402833120.0, + "grad_norm": 1.712187328762411, + "language_loss": 0.83448923, + "learning_rate": 5.68475665496966e-07, + "loss": 0.85590184, + "num_input_tokens_seen": 273165180, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11071777, + "step": 12662, + "time_per_iteration": 2.710080862045288 + }, + { + "auxiliary_loss_clip": 0.01112382, + "auxiliary_loss_mlp": 0.01040764, + "balance_loss_clip": 1.03867459, + "balance_loss_mlp": 1.02946889, + "epoch": 0.7613407485344957, + "flos": 23215483401120.0, + "grad_norm": 2.022186423525192, + "language_loss": 0.68859422, + "learning_rate": 5.682037143624505e-07, + "loss": 0.71012568, + "num_input_tokens_seen": 273184005, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11297607, + "step": 12663, + "time_per_iteration": 2.6622934341430664 + }, + { + "auxiliary_loss_clip": 0.0111029, + "auxiliary_loss_mlp": 0.01022503, + "balance_loss_clip": 1.03910542, + "balance_loss_mlp": 1.01141667, + "epoch": 0.7614008717871636, + "flos": 28377458572320.0, + "grad_norm": 1.7258470794043257, + "language_loss": 0.70007575, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72140366, + "num_input_tokens_seen": 273203565, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11083984, + "step": 12664, + "time_per_iteration": 2.702552556991577 + }, + { + "auxiliary_loss_clip": 0.01118795, + "auxiliary_loss_mlp": 0.01037409, + "balance_loss_clip": 1.04185963, + "balance_loss_mlp": 1.02499974, + "epoch": 0.7614609950398317, + "flos": 26331911671680.0, + "grad_norm": 2.535611228892817, + "language_loss": 0.79868257, + "learning_rate": 5.676599749853066e-07, + "loss": 0.82024461, + "num_input_tokens_seen": 273221645, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12414551, + "step": 12665, + "time_per_iteration": 2.627093553543091 + }, + { + "auxiliary_loss_clip": 0.01113573, + "auxiliary_loss_mlp": 0.01034511, + "balance_loss_clip": 1.04268718, + "balance_loss_mlp": 1.02351451, + "epoch": 0.7615211182924996, + "flos": 35725637589120.0, + "grad_norm": 1.8895257585717684, + "language_loss": 0.88060248, + "learning_rate": 5.673881867632959e-07, + "loss": 0.90208334, + "num_input_tokens_seen": 273242040, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10986328, + "step": 12666, + "time_per_iteration": 2.749077320098877 + }, + { + "auxiliary_loss_clip": 0.01112896, + "auxiliary_loss_mlp": 0.01031801, + "balance_loss_clip": 1.03895497, + "balance_loss_mlp": 1.0200175, + "epoch": 0.7615812415451676, + "flos": 16180701609600.0, + "grad_norm": 2.2824638089251112, + "language_loss": 0.83270717, + "learning_rate": 5.671164528660693e-07, + "loss": 0.85415411, + "num_input_tokens_seen": 273257365, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11791992, + "step": 12667, + "time_per_iteration": 2.5982913970947266 + }, + { + "auxiliary_loss_clip": 0.01111544, + "auxiliary_loss_mlp": 0.01034909, + "balance_loss_clip": 1.04065287, + "balance_loss_mlp": 1.02425742, + "epoch": 0.7616413647978356, + "flos": 22677287541120.0, + "grad_norm": 1.8267522115892143, + "language_loss": 0.78844464, + "learning_rate": 5.668447733039296e-07, + "loss": 0.80990916, + "num_input_tokens_seen": 273274710, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10650635, + "step": 12668, + "time_per_iteration": 2.5907676219940186 + }, + { + "auxiliary_loss_clip": 0.01111567, + "auxiliary_loss_mlp": 0.01027036, + "balance_loss_clip": 1.03935325, + "balance_loss_mlp": 1.01577663, + "epoch": 0.7617014880505035, + "flos": 22593226885920.0, + "grad_norm": 2.8706059856051036, + "language_loss": 0.64489621, + "learning_rate": 5.6657314808718e-07, + "loss": 0.6662823, + "num_input_tokens_seen": 273292870, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11254883, + "step": 12669, + "time_per_iteration": 2.6293318271636963 + }, + { + "auxiliary_loss_clip": 0.01114227, + "auxiliary_loss_mlp": 0.01037813, + "balance_loss_clip": 1.03929567, + "balance_loss_mlp": 1.02517116, + "epoch": 0.7617616113031715, + "flos": 30472632790560.0, + "grad_norm": 2.0666983610168104, + "language_loss": 0.66352922, + "learning_rate": 5.663015772261202e-07, + "loss": 0.68504965, + "num_input_tokens_seen": 273312375, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12646484, + "step": 12670, + "time_per_iteration": 2.6388323307037354 + }, + { + "auxiliary_loss_clip": 0.01115052, + "auxiliary_loss_mlp": 0.01032156, + "balance_loss_clip": 1.03968215, + "balance_loss_mlp": 1.02047336, + "epoch": 0.7618217345558395, + "flos": 28424816922240.0, + "grad_norm": 1.828966064899715, + "language_loss": 0.73458803, + "learning_rate": 5.660300607310493e-07, + "loss": 0.75606012, + "num_input_tokens_seen": 273332590, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11676025, + "step": 12671, + "time_per_iteration": 2.66170334815979 + }, + { + "auxiliary_loss_clip": 0.01111736, + "auxiliary_loss_mlp": 0.0102953, + "balance_loss_clip": 1.0396452, + "balance_loss_mlp": 1.01931345, + "epoch": 0.7618818578085075, + "flos": 31094727236640.0, + "grad_norm": 1.9872094212651965, + "language_loss": 0.7324512, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75386387, + "num_input_tokens_seen": 273352885, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10223389, + "step": 12672, + "time_per_iteration": 2.654049873352051 + }, + { + "auxiliary_loss_clip": 0.01031317, + "auxiliary_loss_mlp": 0.01001445, + "balance_loss_clip": 1.00847459, + "balance_loss_mlp": 1.00046694, + "epoch": 0.7619419810611754, + "flos": 74618829905760.0, + "grad_norm": 0.7869240286312584, + "language_loss": 0.56703305, + "learning_rate": 5.654871908800506e-07, + "loss": 0.58736068, + "num_input_tokens_seen": 273411730, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 0.00977325, + "step": 12673, + "time_per_iteration": 3.248552083969116 + }, + { + "auxiliary_loss_clip": 0.01115111, + "auxiliary_loss_mlp": 0.01029016, + "balance_loss_clip": 1.04047966, + "balance_loss_mlp": 1.01684499, + "epoch": 0.7620021043138434, + "flos": 28380416333760.0, + "grad_norm": 1.907088827496305, + "language_loss": 0.74652159, + "learning_rate": 5.652158375447102e-07, + "loss": 0.76796281, + "num_input_tokens_seen": 273430020, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12176514, + "step": 12674, + "time_per_iteration": 2.624074697494507 + }, + { + "auxiliary_loss_clip": 0.01111954, + "auxiliary_loss_mlp": 0.0102468, + "balance_loss_clip": 1.03955948, + "balance_loss_mlp": 1.01370132, + "epoch": 0.7620622275665113, + "flos": 30606037142400.0, + "grad_norm": 2.1200446345719697, + "language_loss": 0.72390175, + "learning_rate": 5.649445386165286e-07, + "loss": 0.74526811, + "num_input_tokens_seen": 273448690, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10974121, + "step": 12675, + "time_per_iteration": 2.674971103668213 + }, + { + "auxiliary_loss_clip": 0.01109772, + "auxiliary_loss_mlp": 0.01026215, + "balance_loss_clip": 1.03957891, + "balance_loss_mlp": 1.01539683, + "epoch": 0.7621223508191793, + "flos": 24593319748800.0, + "grad_norm": 2.2507335154076125, + "language_loss": 0.72900802, + "learning_rate": 5.646732941057936e-07, + "loss": 0.75036794, + "num_input_tokens_seen": 273465190, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.1081543, + "step": 12676, + "time_per_iteration": 2.587869644165039 + }, + { + "auxiliary_loss_clip": 0.01119569, + "auxiliary_loss_mlp": 0.01032316, + "balance_loss_clip": 1.04113364, + "balance_loss_mlp": 1.02028227, + "epoch": 0.7621824740718472, + "flos": 21964041915840.0, + "grad_norm": 2.5508425146576057, + "language_loss": 0.53577232, + "learning_rate": 5.644021040227927e-07, + "loss": 0.55729115, + "num_input_tokens_seen": 273478620, + "router_z_loss_clip": 0.78417969, + "router_z_loss_mlp": 0.12030029, + "step": 12677, + "time_per_iteration": 2.620621919631958 + }, + { + "auxiliary_loss_clip": 0.01113692, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.03992271, + "balance_loss_mlp": 1.02109253, + "epoch": 0.7622425973245153, + "flos": 26509797646560.0, + "grad_norm": 2.2509294516748213, + "language_loss": 0.78788453, + "learning_rate": 5.641309683778064e-07, + "loss": 0.80935228, + "num_input_tokens_seen": 273497635, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11993408, + "step": 12678, + "time_per_iteration": 2.60170841217041 + }, + { + "auxiliary_loss_clip": 0.01114268, + "auxiliary_loss_mlp": 0.01033638, + "balance_loss_clip": 1.0392909, + "balance_loss_mlp": 1.02193177, + "epoch": 0.7623027205771832, + "flos": 24060593721600.0, + "grad_norm": 2.6208237066295683, + "language_loss": 0.77458727, + "learning_rate": 5.638598871811175e-07, + "loss": 0.79606628, + "num_input_tokens_seen": 273513955, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11712646, + "step": 12679, + "time_per_iteration": 2.684835195541382 + }, + { + "auxiliary_loss_clip": 0.01112345, + "auxiliary_loss_mlp": 0.01026424, + "balance_loss_clip": 1.03864396, + "balance_loss_mlp": 1.01498604, + "epoch": 0.7623628438298512, + "flos": 29271385864800.0, + "grad_norm": 1.4637891980360522, + "language_loss": 0.80022359, + "learning_rate": 5.635888604430059e-07, + "loss": 0.82161129, + "num_input_tokens_seen": 273533970, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11437988, + "step": 12680, + "time_per_iteration": 2.689650774002075 + }, + { + "auxiliary_loss_clip": 0.01113735, + "auxiliary_loss_mlp": 0.01028024, + "balance_loss_clip": 1.03990972, + "balance_loss_mlp": 1.0156498, + "epoch": 0.7624229670825191, + "flos": 27266430928320.0, + "grad_norm": 1.835205778648524, + "language_loss": 0.62701309, + "learning_rate": 5.633178881737493e-07, + "loss": 0.64843065, + "num_input_tokens_seen": 273553090, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12390137, + "step": 12681, + "time_per_iteration": 2.636931896209717 + }, + { + "auxiliary_loss_clip": 0.0111111, + "auxiliary_loss_mlp": 0.01033172, + "balance_loss_clip": 1.03955293, + "balance_loss_mlp": 1.02156103, + "epoch": 0.7624830903351871, + "flos": 27713029919040.0, + "grad_norm": 2.661663762456164, + "language_loss": 0.76292396, + "learning_rate": 5.63046970383622e-07, + "loss": 0.78436685, + "num_input_tokens_seen": 273572460, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11602783, + "step": 12682, + "time_per_iteration": 2.683375120162964 + }, + { + "auxiliary_loss_clip": 0.01110717, + "auxiliary_loss_mlp": 0.01028368, + "balance_loss_clip": 1.03898406, + "balance_loss_mlp": 1.01776481, + "epoch": 0.7625432135878552, + "flos": 31229833314240.0, + "grad_norm": 1.472723687794569, + "language_loss": 0.68512356, + "learning_rate": 5.627761070828974e-07, + "loss": 0.70651436, + "num_input_tokens_seen": 273592815, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.1060791, + "step": 12683, + "time_per_iteration": 4.065381288528442 + }, + { + "auxiliary_loss_clip": 0.01113473, + "auxiliary_loss_mlp": 0.01030535, + "balance_loss_clip": 1.0400666, + "balance_loss_mlp": 1.01891208, + "epoch": 0.7626033368405231, + "flos": 29270048794560.0, + "grad_norm": 2.4664353030140065, + "language_loss": 0.83477092, + "learning_rate": 5.625052982818472e-07, + "loss": 0.85621101, + "num_input_tokens_seen": 273611790, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11627197, + "step": 12684, + "time_per_iteration": 2.6714560985565186 + }, + { + "auxiliary_loss_clip": 0.01112539, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.03889346, + "balance_loss_mlp": 1.02506471, + "epoch": 0.7626634600931911, + "flos": 15373347043680.0, + "grad_norm": 2.485451577089901, + "language_loss": 0.82926279, + "learning_rate": 5.622345439907396e-07, + "loss": 0.8507629, + "num_input_tokens_seen": 273628340, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12408447, + "step": 12685, + "time_per_iteration": 2.618928909301758 + }, + { + "auxiliary_loss_clip": 0.01115726, + "auxiliary_loss_mlp": 0.01024954, + "balance_loss_clip": 1.04065561, + "balance_loss_mlp": 1.01370633, + "epoch": 0.762723583345859, + "flos": 32119344223200.0, + "grad_norm": 2.0951335168278504, + "language_loss": 0.77596641, + "learning_rate": 5.619638442198422e-07, + "loss": 0.79737318, + "num_input_tokens_seen": 273646585, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11254883, + "step": 12686, + "time_per_iteration": 3.9215469360351562 + }, + { + "auxiliary_loss_clip": 0.01116825, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.04071057, + "balance_loss_mlp": 1.02427936, + "epoch": 0.762783706598527, + "flos": 26731233347040.0, + "grad_norm": 1.6028603776659096, + "language_loss": 0.71964955, + "learning_rate": 5.616931989794198e-07, + "loss": 0.74118626, + "num_input_tokens_seen": 273665410, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12573242, + "step": 12687, + "time_per_iteration": 2.6915924549102783 + }, + { + "auxiliary_loss_clip": 0.01113088, + "auxiliary_loss_mlp": 0.01035192, + "balance_loss_clip": 1.04000878, + "balance_loss_mlp": 1.02326536, + "epoch": 0.7628438298511949, + "flos": 18717329124000.0, + "grad_norm": 2.5146217604984242, + "language_loss": 0.64493787, + "learning_rate": 5.614226082797369e-07, + "loss": 0.66642058, + "num_input_tokens_seen": 273683035, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11920166, + "step": 12688, + "time_per_iteration": 2.58096981048584 + }, + { + "auxiliary_loss_clip": 0.01111998, + "auxiliary_loss_mlp": 0.01025951, + "balance_loss_clip": 1.04101849, + "balance_loss_mlp": 1.01493001, + "epoch": 0.7629039531038629, + "flos": 15869694903840.0, + "grad_norm": 2.199887874421513, + "language_loss": 0.71088928, + "learning_rate": 5.611520721310515e-07, + "loss": 0.73226875, + "num_input_tokens_seen": 273700130, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11022949, + "step": 12689, + "time_per_iteration": 2.6290457248687744 + }, + { + "auxiliary_loss_clip": 0.01118563, + "auxiliary_loss_mlp": 0.01035852, + "balance_loss_clip": 1.04145575, + "balance_loss_mlp": 1.02379966, + "epoch": 0.7629640763565309, + "flos": 31935623760000.0, + "grad_norm": 3.015816184202387, + "language_loss": 0.69639206, + "learning_rate": 5.608815905436238e-07, + "loss": 0.71793616, + "num_input_tokens_seen": 273720310, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12042236, + "step": 12690, + "time_per_iteration": 2.6457507610321045 + }, + { + "auxiliary_loss_clip": 0.01114336, + "auxiliary_loss_mlp": 0.01028501, + "balance_loss_clip": 1.04036975, + "balance_loss_mlp": 1.01796865, + "epoch": 0.7630241996091989, + "flos": 44897036425920.0, + "grad_norm": 1.669383931706051, + "language_loss": 0.69358265, + "learning_rate": 5.606111635277109e-07, + "loss": 0.715011, + "num_input_tokens_seen": 273744475, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.10540771, + "step": 12691, + "time_per_iteration": 2.7518484592437744 + }, + { + "auxiliary_loss_clip": 0.0111151, + "auxiliary_loss_mlp": 0.01032001, + "balance_loss_clip": 1.03961527, + "balance_loss_mlp": 1.02158237, + "epoch": 0.7630843228618668, + "flos": 26643404584800.0, + "grad_norm": 1.7047701660527852, + "language_loss": 0.82022661, + "learning_rate": 5.603407910935662e-07, + "loss": 0.84166169, + "num_input_tokens_seen": 273764635, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10412598, + "step": 12692, + "time_per_iteration": 2.609809637069702 + }, + { + "auxiliary_loss_clip": 0.01118391, + "auxiliary_loss_mlp": 0.01029644, + "balance_loss_clip": 1.04282117, + "balance_loss_mlp": 1.0187006, + "epoch": 0.7631444461145348, + "flos": 15423420051360.0, + "grad_norm": 2.7865260632377105, + "language_loss": 0.7722677, + "learning_rate": 5.600704732514438e-07, + "loss": 0.79374808, + "num_input_tokens_seen": 273780115, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.109375, + "step": 12693, + "time_per_iteration": 3.994098424911499 + }, + { + "auxiliary_loss_clip": 0.01115403, + "auxiliary_loss_mlp": 0.01030576, + "balance_loss_clip": 1.04109955, + "balance_loss_mlp": 1.01854229, + "epoch": 0.7632045693672027, + "flos": 20543790326400.0, + "grad_norm": 2.2377887298319523, + "language_loss": 0.72587115, + "learning_rate": 5.598002100115933e-07, + "loss": 0.74733102, + "num_input_tokens_seen": 273796605, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12054443, + "step": 12694, + "time_per_iteration": 2.6098926067352295 + }, + { + "auxiliary_loss_clip": 0.01109413, + "auxiliary_loss_mlp": 0.01026437, + "balance_loss_clip": 1.03738892, + "balance_loss_mlp": 1.01484346, + "epoch": 0.7632646926198707, + "flos": 26865731665440.0, + "grad_norm": 1.8696926372712661, + "language_loss": 0.70363319, + "learning_rate": 5.595300013842625e-07, + "loss": 0.7249918, + "num_input_tokens_seen": 273816515, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.1159668, + "step": 12695, + "time_per_iteration": 3.9841158390045166 + }, + { + "auxiliary_loss_clip": 0.01112117, + "auxiliary_loss_mlp": 0.01029748, + "balance_loss_clip": 1.03910267, + "balance_loss_mlp": 1.01872146, + "epoch": 0.7633248158725388, + "flos": 28203786394560.0, + "grad_norm": 1.4425505936225227, + "language_loss": 0.72000623, + "learning_rate": 5.592598473796985e-07, + "loss": 0.74142492, + "num_input_tokens_seen": 273837060, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11022949, + "step": 12696, + "time_per_iteration": 2.757798433303833 + }, + { + "auxiliary_loss_clip": 0.01112061, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.03903461, + "balance_loss_mlp": 1.01846361, + "epoch": 0.7633849391252067, + "flos": 13287046109760.0, + "grad_norm": 2.1280685120839897, + "language_loss": 0.71752435, + "learning_rate": 5.589897480081453e-07, + "loss": 0.73895049, + "num_input_tokens_seen": 273853365, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12084961, + "step": 12697, + "time_per_iteration": 2.5992729663848877 + }, + { + "auxiliary_loss_clip": 0.01112997, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.04107678, + "balance_loss_mlp": 1.01844001, + "epoch": 0.7634450623778747, + "flos": 25617328976160.0, + "grad_norm": 2.2476562662122044, + "language_loss": 0.66820109, + "learning_rate": 5.587197032798461e-07, + "loss": 0.68962812, + "num_input_tokens_seen": 273870750, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11273193, + "step": 12698, + "time_per_iteration": 2.5958073139190674 + }, + { + "auxiliary_loss_clip": 0.01113012, + "auxiliary_loss_mlp": 0.0102891, + "balance_loss_clip": 1.03917718, + "balance_loss_mlp": 1.01701331, + "epoch": 0.7635051856305426, + "flos": 23039582772960.0, + "grad_norm": 2.122740053396357, + "language_loss": 0.71991277, + "learning_rate": 5.5844971320504e-07, + "loss": 0.74133193, + "num_input_tokens_seen": 273890890, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11895752, + "step": 12699, + "time_per_iteration": 2.660196304321289 + }, + { + "auxiliary_loss_clip": 0.0111101, + "auxiliary_loss_mlp": 0.01034924, + "balance_loss_clip": 1.0394181, + "balance_loss_mlp": 1.02421331, + "epoch": 0.7635653088832106, + "flos": 42448278191040.0, + "grad_norm": 1.733534402639402, + "language_loss": 0.73276824, + "learning_rate": 5.581797777939648e-07, + "loss": 0.75422764, + "num_input_tokens_seen": 273914015, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10717773, + "step": 12700, + "time_per_iteration": 2.7181754112243652 + }, + { + "auxiliary_loss_clip": 0.0111357, + "auxiliary_loss_mlp": 0.01029905, + "balance_loss_clip": 1.03913379, + "balance_loss_mlp": 1.01884782, + "epoch": 0.7636254321358785, + "flos": 28283187562560.0, + "grad_norm": 2.641972715364762, + "language_loss": 0.69535244, + "learning_rate": 5.579098970568574e-07, + "loss": 0.71678716, + "num_input_tokens_seen": 273927415, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11053467, + "step": 12701, + "time_per_iteration": 2.701291561126709 + }, + { + "auxiliary_loss_clip": 0.01113723, + "auxiliary_loss_mlp": 0.0103083, + "balance_loss_clip": 1.03996062, + "balance_loss_mlp": 1.01970792, + "epoch": 0.7636855553885465, + "flos": 26021067035040.0, + "grad_norm": 1.8828639158641354, + "language_loss": 0.64558053, + "learning_rate": 5.576400710039508e-07, + "loss": 0.66702604, + "num_input_tokens_seen": 273946690, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11120605, + "step": 12702, + "time_per_iteration": 2.632514476776123 + }, + { + "auxiliary_loss_clip": 0.01114477, + "auxiliary_loss_mlp": 0.01028188, + "balance_loss_clip": 1.04019141, + "balance_loss_mlp": 1.01669061, + "epoch": 0.7637456786412145, + "flos": 34969247411040.0, + "grad_norm": 2.068693435475326, + "language_loss": 0.6541307, + "learning_rate": 5.57370299645477e-07, + "loss": 0.67555737, + "num_input_tokens_seen": 273966870, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1149292, + "step": 12703, + "time_per_iteration": 2.698838472366333 + }, + { + "auxiliary_loss_clip": 0.01113802, + "auxiliary_loss_mlp": 0.01027348, + "balance_loss_clip": 1.04145133, + "balance_loss_mlp": 1.01589227, + "epoch": 0.7638058018938825, + "flos": 26731476450720.0, + "grad_norm": 2.2905850011796103, + "language_loss": 0.83575296, + "learning_rate": 5.571005829916668e-07, + "loss": 0.85716444, + "num_input_tokens_seen": 273986360, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11462402, + "step": 12704, + "time_per_iteration": 2.608957529067993 + }, + { + "auxiliary_loss_clip": 0.01115929, + "auxiliary_loss_mlp": 0.01033985, + "balance_loss_clip": 1.04222989, + "balance_loss_mlp": 1.02290416, + "epoch": 0.7638659251465504, + "flos": 36172155545280.0, + "grad_norm": 1.533366088252126, + "language_loss": 0.68045199, + "learning_rate": 5.568309210527469e-07, + "loss": 0.70195109, + "num_input_tokens_seen": 274009745, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11083984, + "step": 12705, + "time_per_iteration": 2.6788315773010254 + }, + { + "auxiliary_loss_clip": 0.01111635, + "auxiliary_loss_mlp": 0.01027343, + "balance_loss_clip": 1.0402199, + "balance_loss_mlp": 1.01619089, + "epoch": 0.7639260483992184, + "flos": 31897868005440.0, + "grad_norm": 1.9388052297237581, + "language_loss": 0.74113405, + "learning_rate": 5.565613138389427e-07, + "loss": 0.76252389, + "num_input_tokens_seen": 274028775, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11151123, + "step": 12706, + "time_per_iteration": 2.661278486251831 + }, + { + "auxiliary_loss_clip": 0.01114546, + "auxiliary_loss_mlp": 0.01032236, + "balance_loss_clip": 1.04057074, + "balance_loss_mlp": 1.0204699, + "epoch": 0.7639861716518863, + "flos": 24595224060960.0, + "grad_norm": 1.8058119029369906, + "language_loss": 0.78542519, + "learning_rate": 5.562917613604781e-07, + "loss": 0.80689299, + "num_input_tokens_seen": 274047520, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11779785, + "step": 12707, + "time_per_iteration": 2.655088186264038 + }, + { + "auxiliary_loss_clip": 0.01112586, + "auxiliary_loss_mlp": 0.0102722, + "balance_loss_clip": 1.03852725, + "balance_loss_mlp": 1.01609814, + "epoch": 0.7640462949045543, + "flos": 22675464263520.0, + "grad_norm": 1.9036917968657865, + "language_loss": 0.79882324, + "learning_rate": 5.560222636275751e-07, + "loss": 0.8202213, + "num_input_tokens_seen": 274065350, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11120605, + "step": 12708, + "time_per_iteration": 2.677130699157715 + }, + { + "auxiliary_loss_clip": 0.01032134, + "auxiliary_loss_mlp": 0.01001805, + "balance_loss_clip": 1.00920939, + "balance_loss_mlp": 1.00075209, + "epoch": 0.7641064181572224, + "flos": 83366481497760.0, + "grad_norm": 0.8162387543898049, + "language_loss": 0.56442571, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58476514, + "num_input_tokens_seen": 274122315, + "router_z_loss_clip": 0.22937012, + "router_z_loss_mlp": 0.01053619, + "step": 12709, + "time_per_iteration": 3.313040256500244 + }, + { + "auxiliary_loss_clip": 0.01118012, + "auxiliary_loss_mlp": 0.01036087, + "balance_loss_clip": 1.04189432, + "balance_loss_mlp": 1.02365994, + "epoch": 0.7641665414098903, + "flos": 21925475815680.0, + "grad_norm": 2.045917666029088, + "language_loss": 0.63420713, + "learning_rate": 5.554834324393271e-07, + "loss": 0.65574813, + "num_input_tokens_seen": 274140555, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12445068, + "step": 12710, + "time_per_iteration": 2.75253963470459 + }, + { + "auxiliary_loss_clip": 0.01116379, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.04102325, + "balance_loss_mlp": 1.02115762, + "epoch": 0.7642266646625583, + "flos": 25931820168000.0, + "grad_norm": 4.823009170479911, + "language_loss": 0.64350486, + "learning_rate": 5.552140990044154e-07, + "loss": 0.66500407, + "num_input_tokens_seen": 274161125, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.1237793, + "step": 12711, + "time_per_iteration": 2.6492605209350586 + }, + { + "auxiliary_loss_clip": 0.01112006, + "auxiliary_loss_mlp": 0.01037189, + "balance_loss_clip": 1.03856015, + "balance_loss_mlp": 1.02595329, + "epoch": 0.7642867879152262, + "flos": 27757025334720.0, + "grad_norm": 1.62151600225939, + "language_loss": 0.7271086, + "learning_rate": 5.549448203559293e-07, + "loss": 0.74860048, + "num_input_tokens_seen": 274180835, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11230469, + "step": 12712, + "time_per_iteration": 2.6650633811950684 + }, + { + "auxiliary_loss_clip": 0.01111917, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.0405035, + "balance_loss_mlp": 1.01974297, + "epoch": 0.7643469111678942, + "flos": 28470027856320.0, + "grad_norm": 1.8500770725616211, + "language_loss": 0.80550742, + "learning_rate": 5.546755965040804e-07, + "loss": 0.8269341, + "num_input_tokens_seen": 274201190, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11004639, + "step": 12713, + "time_per_iteration": 2.6530182361602783 + }, + { + "auxiliary_loss_clip": 0.01115141, + "auxiliary_loss_mlp": 0.01031327, + "balance_loss_clip": 1.04021096, + "balance_loss_mlp": 1.01881015, + "epoch": 0.7644070344205621, + "flos": 24230335723200.0, + "grad_norm": 2.5878569163784286, + "language_loss": 0.83089244, + "learning_rate": 5.544064274590776e-07, + "loss": 0.85235709, + "num_input_tokens_seen": 274217595, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12518311, + "step": 12714, + "time_per_iteration": 2.6332762241363525 + }, + { + "auxiliary_loss_clip": 0.01115259, + "auxiliary_loss_mlp": 0.0103664, + "balance_loss_clip": 1.03983915, + "balance_loss_mlp": 1.02468276, + "epoch": 0.7644671576732301, + "flos": 26955140601600.0, + "grad_norm": 1.621269746452744, + "language_loss": 0.72824526, + "learning_rate": 5.541373132311287e-07, + "loss": 0.74976432, + "num_input_tokens_seen": 274237885, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11962891, + "step": 12715, + "time_per_iteration": 2.6156258583068848 + }, + { + "auxiliary_loss_clip": 0.01112475, + "auxiliary_loss_mlp": 0.01028133, + "balance_loss_clip": 1.03915882, + "balance_loss_mlp": 1.01642036, + "epoch": 0.7645272809258981, + "flos": 31093066028160.0, + "grad_norm": 2.183187806970887, + "language_loss": 0.62982154, + "learning_rate": 5.538682538304376e-07, + "loss": 0.65122759, + "num_input_tokens_seen": 274258820, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11724854, + "step": 12716, + "time_per_iteration": 2.676436424255371 + }, + { + "auxiliary_loss_clip": 0.01114945, + "auxiliary_loss_mlp": 0.01031128, + "balance_loss_clip": 1.03872442, + "balance_loss_mlp": 1.01874185, + "epoch": 0.7645874041785661, + "flos": 26285647288320.0, + "grad_norm": 2.167807399809388, + "language_loss": 0.79814374, + "learning_rate": 5.535992492672068e-07, + "loss": 0.81960446, + "num_input_tokens_seen": 274278835, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12390137, + "step": 12717, + "time_per_iteration": 2.608937978744507 + }, + { + "auxiliary_loss_clip": 0.01114758, + "auxiliary_loss_mlp": 0.01037002, + "balance_loss_clip": 1.04157948, + "balance_loss_mlp": 1.02594566, + "epoch": 0.764647527431234, + "flos": 25173890333280.0, + "grad_norm": 2.9716861913416674, + "language_loss": 0.66145551, + "learning_rate": 5.53330299551638e-07, + "loss": 0.68297315, + "num_input_tokens_seen": 274297110, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.1105957, + "step": 12718, + "time_per_iteration": 2.641721487045288 + }, + { + "auxiliary_loss_clip": 0.01108491, + "auxiliary_loss_mlp": 0.01032676, + "balance_loss_clip": 1.03807783, + "balance_loss_mlp": 1.02226949, + "epoch": 0.764707650683902, + "flos": 26153944662240.0, + "grad_norm": 2.7686375398126586, + "language_loss": 0.77716398, + "learning_rate": 5.530614046939286e-07, + "loss": 0.79857564, + "num_input_tokens_seen": 274315610, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10406494, + "step": 12719, + "time_per_iteration": 2.6236445903778076 + }, + { + "auxiliary_loss_clip": 0.01112661, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.03901565, + "balance_loss_mlp": 1.01913774, + "epoch": 0.7647677739365699, + "flos": 27712381642560.0, + "grad_norm": 2.0566232667557367, + "language_loss": 0.6987409, + "learning_rate": 5.527925647042754e-07, + "loss": 0.72017765, + "num_input_tokens_seen": 274333975, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11871338, + "step": 12720, + "time_per_iteration": 2.646484375 + }, + { + "auxiliary_loss_clip": 0.01113421, + "auxiliary_loss_mlp": 0.01036207, + "balance_loss_clip": 1.04050934, + "balance_loss_mlp": 1.02433336, + "epoch": 0.7648278971892379, + "flos": 26019810999360.0, + "grad_norm": 1.7075470866053377, + "language_loss": 0.74119407, + "learning_rate": 5.52523779592875e-07, + "loss": 0.76269037, + "num_input_tokens_seen": 274353695, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11859131, + "step": 12721, + "time_per_iteration": 2.6419966220855713 + }, + { + "auxiliary_loss_clip": 0.01112576, + "auxiliary_loss_mlp": 0.01029558, + "balance_loss_clip": 1.039083, + "balance_loss_mlp": 1.01831603, + "epoch": 0.764888020441906, + "flos": 25218371956320.0, + "grad_norm": 1.8062499390270834, + "language_loss": 0.73762542, + "learning_rate": 5.522550493699163e-07, + "loss": 0.75904679, + "num_input_tokens_seen": 274371120, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11242676, + "step": 12722, + "time_per_iteration": 2.6810495853424072 + }, + { + "auxiliary_loss_clip": 0.01113843, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.04075456, + "balance_loss_mlp": 1.02304697, + "epoch": 0.7649481436945739, + "flos": 30606077659680.0, + "grad_norm": 1.9771393204433532, + "language_loss": 0.73998773, + "learning_rate": 5.519863740455912e-07, + "loss": 0.76147157, + "num_input_tokens_seen": 274389665, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.1149292, + "step": 12723, + "time_per_iteration": 4.266615152359009 + }, + { + "auxiliary_loss_clip": 0.01113335, + "auxiliary_loss_mlp": 0.01028554, + "balance_loss_clip": 1.03790557, + "balance_loss_mlp": 1.01682365, + "epoch": 0.7650082669472419, + "flos": 30384358338240.0, + "grad_norm": 2.0337543040417727, + "language_loss": 0.73028737, + "learning_rate": 5.517177536300881e-07, + "loss": 0.7517063, + "num_input_tokens_seen": 274408750, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11737061, + "step": 12724, + "time_per_iteration": 2.7017366886138916 + }, + { + "auxiliary_loss_clip": 0.01109254, + "auxiliary_loss_mlp": 0.01026711, + "balance_loss_clip": 1.03884077, + "balance_loss_mlp": 1.01568377, + "epoch": 0.7650683901999098, + "flos": 17872340355360.0, + "grad_norm": 1.9854901211483669, + "language_loss": 0.83718765, + "learning_rate": 5.514491881335935e-07, + "loss": 0.85854733, + "num_input_tokens_seen": 274424600, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11016846, + "step": 12725, + "time_per_iteration": 2.6335031986236572 + }, + { + "auxiliary_loss_clip": 0.01114575, + "auxiliary_loss_mlp": 0.0103416, + "balance_loss_clip": 1.04063559, + "balance_loss_mlp": 1.02178574, + "epoch": 0.7651285134525778, + "flos": 32156370666720.0, + "grad_norm": 1.8316303554550404, + "language_loss": 0.77769208, + "learning_rate": 5.511806775662901e-07, + "loss": 0.79917943, + "num_input_tokens_seen": 274443075, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12353516, + "step": 12726, + "time_per_iteration": 4.0313897132873535 + }, + { + "auxiliary_loss_clip": 0.01114527, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.03982472, + "balance_loss_mlp": 1.02006721, + "epoch": 0.7651886367052457, + "flos": 32515424516160.0, + "grad_norm": 2.4873274994275274, + "language_loss": 0.70810497, + "learning_rate": 5.509122219383615e-07, + "loss": 0.72956443, + "num_input_tokens_seen": 274463240, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11364746, + "step": 12727, + "time_per_iteration": 2.673743963241577 + }, + { + "auxiliary_loss_clip": 0.01107193, + "auxiliary_loss_mlp": 0.01025104, + "balance_loss_clip": 1.03688669, + "balance_loss_mlp": 1.01414299, + "epoch": 0.7652487599579137, + "flos": 31363035079680.0, + "grad_norm": 1.7718889025437217, + "language_loss": 0.79655147, + "learning_rate": 5.506438212599864e-07, + "loss": 0.81787443, + "num_input_tokens_seen": 274482750, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10955811, + "step": 12728, + "time_per_iteration": 2.685356855392456 + }, + { + "auxiliary_loss_clip": 0.01115453, + "auxiliary_loss_mlp": 0.01032076, + "balance_loss_clip": 1.04043293, + "balance_loss_mlp": 1.02001238, + "epoch": 0.7653088832105817, + "flos": 34880729855040.0, + "grad_norm": 1.8673410521725293, + "language_loss": 0.5531584, + "learning_rate": 5.503754755413424e-07, + "loss": 0.57463372, + "num_input_tokens_seen": 274503545, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12054443, + "step": 12729, + "time_per_iteration": 2.673961639404297 + }, + { + "auxiliary_loss_clip": 0.01110576, + "auxiliary_loss_mlp": 0.01026915, + "balance_loss_clip": 1.03758287, + "balance_loss_mlp": 1.01528633, + "epoch": 0.7653690064632497, + "flos": 28512443098080.0, + "grad_norm": 1.8632772677377054, + "language_loss": 0.7776109, + "learning_rate": 5.501071847926055e-07, + "loss": 0.79898584, + "num_input_tokens_seen": 274523825, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11621094, + "step": 12730, + "time_per_iteration": 2.6944267749786377 + }, + { + "auxiliary_loss_clip": 0.01120763, + "auxiliary_loss_mlp": 0.01039562, + "balance_loss_clip": 1.04516268, + "balance_loss_mlp": 1.02770686, + "epoch": 0.7654291297159176, + "flos": 19247421528000.0, + "grad_norm": 1.7945936306053287, + "language_loss": 0.68760985, + "learning_rate": 5.498389490239495e-07, + "loss": 0.70921314, + "num_input_tokens_seen": 274541625, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11871338, + "step": 12731, + "time_per_iteration": 2.63095760345459 + }, + { + "auxiliary_loss_clip": 0.01114205, + "auxiliary_loss_mlp": 0.01031236, + "balance_loss_clip": 1.03914678, + "balance_loss_mlp": 1.01997089, + "epoch": 0.7654892529685856, + "flos": 22005322673760.0, + "grad_norm": 2.3480156251936233, + "language_loss": 0.70493889, + "learning_rate": 5.495707682455471e-07, + "loss": 0.72639328, + "num_input_tokens_seen": 274557580, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11260986, + "step": 12732, + "time_per_iteration": 2.6334047317504883 + }, + { + "auxiliary_loss_clip": 0.01115418, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.03990257, + "balance_loss_mlp": 1.01690352, + "epoch": 0.7655493762212535, + "flos": 33233856353280.0, + "grad_norm": 1.6478358497502372, + "language_loss": 0.78268248, + "learning_rate": 5.493026424675653e-07, + "loss": 0.80412853, + "num_input_tokens_seen": 274578135, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12268066, + "step": 12733, + "time_per_iteration": 4.183235168457031 + }, + { + "auxiliary_loss_clip": 0.01112336, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.04037285, + "balance_loss_mlp": 1.017537, + "epoch": 0.7656094994739215, + "flos": 25348737512160.0, + "grad_norm": 1.8446946450803061, + "language_loss": 0.77481353, + "learning_rate": 5.490345717001726e-07, + "loss": 0.79622471, + "num_input_tokens_seen": 274595655, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11236572, + "step": 12734, + "time_per_iteration": 2.654191493988037 + }, + { + "auxiliary_loss_clip": 0.01116624, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.04059732, + "balance_loss_mlp": 1.01980186, + "epoch": 0.7656696227265896, + "flos": 28112716249920.0, + "grad_norm": 5.274937941979534, + "language_loss": 0.7314055, + "learning_rate": 5.48766555953535e-07, + "loss": 0.75289494, + "num_input_tokens_seen": 274616305, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12512207, + "step": 12735, + "time_per_iteration": 3.95455265045166 + }, + { + "auxiliary_loss_clip": 0.0111365, + "auxiliary_loss_mlp": 0.01031458, + "balance_loss_clip": 1.03942084, + "balance_loss_mlp": 1.02006745, + "epoch": 0.7657297459792575, + "flos": 33587480887200.0, + "grad_norm": 1.5061300379896871, + "language_loss": 0.72772527, + "learning_rate": 5.484985952378145e-07, + "loss": 0.74917632, + "num_input_tokens_seen": 274638110, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1138916, + "step": 12736, + "time_per_iteration": 2.6725680828094482 + }, + { + "auxiliary_loss_clip": 0.01118901, + "auxiliary_loss_mlp": 0.01038944, + "balance_loss_clip": 1.04244518, + "balance_loss_mlp": 1.02591419, + "epoch": 0.7657898692319255, + "flos": 20899926931680.0, + "grad_norm": 3.1624023969769834, + "language_loss": 0.77632415, + "learning_rate": 5.482306895631728e-07, + "loss": 0.79790258, + "num_input_tokens_seen": 274656565, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.13043213, + "step": 12737, + "time_per_iteration": 2.6121129989624023 + }, + { + "auxiliary_loss_clip": 0.01114259, + "auxiliary_loss_mlp": 0.01033986, + "balance_loss_clip": 1.03993869, + "balance_loss_mlp": 1.02198792, + "epoch": 0.7658499924845934, + "flos": 26193037487040.0, + "grad_norm": 1.7064107602750178, + "language_loss": 0.76531971, + "learning_rate": 5.479628389397699e-07, + "loss": 0.78680217, + "num_input_tokens_seen": 274674215, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11993408, + "step": 12738, + "time_per_iteration": 2.6380701065063477 + }, + { + "auxiliary_loss_clip": 0.01116846, + "auxiliary_loss_mlp": 0.01032347, + "balance_loss_clip": 1.04144311, + "balance_loss_mlp": 1.0203371, + "epoch": 0.7659101157372614, + "flos": 35992608361920.0, + "grad_norm": 1.8745940668534422, + "language_loss": 0.62705886, + "learning_rate": 5.476950433777603e-07, + "loss": 0.64855081, + "num_input_tokens_seen": 274693445, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12005615, + "step": 12739, + "time_per_iteration": 2.684704065322876 + }, + { + "auxiliary_loss_clip": 0.01115857, + "auxiliary_loss_mlp": 0.01032068, + "balance_loss_clip": 1.04150844, + "balance_loss_mlp": 1.01922369, + "epoch": 0.7659702389899293, + "flos": 22636776611520.0, + "grad_norm": 1.9701881119031976, + "language_loss": 0.79299593, + "learning_rate": 5.474273028873004e-07, + "loss": 0.81447518, + "num_input_tokens_seen": 274712815, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.1282959, + "step": 12740, + "time_per_iteration": 2.64896821975708 + }, + { + "auxiliary_loss_clip": 0.01114423, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.03977728, + "balance_loss_mlp": 1.01995158, + "epoch": 0.7660303622425974, + "flos": 28735053799680.0, + "grad_norm": 2.0414223942221015, + "language_loss": 0.65491033, + "learning_rate": 5.471596174785429e-07, + "loss": 0.67637479, + "num_input_tokens_seen": 274732690, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12060547, + "step": 12741, + "time_per_iteration": 2.6464579105377197 + }, + { + "auxiliary_loss_clip": 0.01111676, + "auxiliary_loss_mlp": 0.01026757, + "balance_loss_clip": 1.03944492, + "balance_loss_mlp": 1.01488924, + "epoch": 0.7660904854952653, + "flos": 23082079049280.0, + "grad_norm": 1.6475776204663697, + "language_loss": 0.76090264, + "learning_rate": 5.468919871616386e-07, + "loss": 0.78228694, + "num_input_tokens_seen": 274752460, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11877441, + "step": 12742, + "time_per_iteration": 2.6432976722717285 + }, + { + "auxiliary_loss_clip": 0.01109824, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.03920889, + "balance_loss_mlp": 1.02005315, + "epoch": 0.7661506087479333, + "flos": 28244945600640.0, + "grad_norm": 1.3609307070490717, + "language_loss": 0.76371646, + "learning_rate": 5.46624411946736e-07, + "loss": 0.78512621, + "num_input_tokens_seen": 274773070, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11096191, + "step": 12743, + "time_per_iteration": 2.7287676334381104 + }, + { + "auxiliary_loss_clip": 0.01111242, + "auxiliary_loss_mlp": 0.01029287, + "balance_loss_clip": 1.03830576, + "balance_loss_mlp": 1.01783681, + "epoch": 0.7662107320006012, + "flos": 21433625373600.0, + "grad_norm": 2.0824059889675937, + "language_loss": 0.74825788, + "learning_rate": 5.463568918439805e-07, + "loss": 0.76966316, + "num_input_tokens_seen": 274790220, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11444092, + "step": 12744, + "time_per_iteration": 2.684110403060913 + }, + { + "auxiliary_loss_clip": 0.01115198, + "auxiliary_loss_mlp": 0.01031033, + "balance_loss_clip": 1.04084492, + "balance_loss_mlp": 1.0189929, + "epoch": 0.7662708552532692, + "flos": 27214980333120.0, + "grad_norm": 12.794851177344258, + "language_loss": 0.71132731, + "learning_rate": 5.460894268635181e-07, + "loss": 0.73278958, + "num_input_tokens_seen": 274805095, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12042236, + "step": 12745, + "time_per_iteration": 2.6655142307281494 + }, + { + "auxiliary_loss_clip": 0.01112949, + "auxiliary_loss_mlp": 0.01029011, + "balance_loss_clip": 1.03884566, + "balance_loss_mlp": 1.01744795, + "epoch": 0.7663309785059371, + "flos": 19208126116800.0, + "grad_norm": 2.4362834554172728, + "language_loss": 0.76297808, + "learning_rate": 5.458220170154896e-07, + "loss": 0.78439772, + "num_input_tokens_seen": 274821800, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11572266, + "step": 12746, + "time_per_iteration": 2.625447988510132 + }, + { + "auxiliary_loss_clip": 0.01031561, + "auxiliary_loss_mlp": 0.01000803, + "balance_loss_clip": 1.0087918, + "balance_loss_mlp": 0.99979305, + "epoch": 0.7663911017586051, + "flos": 75850505308800.0, + "grad_norm": 0.6607497692490341, + "language_loss": 0.56763673, + "learning_rate": 5.455546623100362e-07, + "loss": 0.58796036, + "num_input_tokens_seen": 274886970, + "router_z_loss_clip": 0.2277832, + "router_z_loss_mlp": 0.01010132, + "step": 12747, + "time_per_iteration": 3.3141534328460693 + }, + { + "auxiliary_loss_clip": 0.0110956, + "auxiliary_loss_mlp": 0.01030387, + "balance_loss_clip": 1.03813303, + "balance_loss_mlp": 1.02028966, + "epoch": 0.7664512250112732, + "flos": 32343089408640.0, + "grad_norm": 1.5250958481315993, + "language_loss": 0.72537452, + "learning_rate": 5.452873627572956e-07, + "loss": 0.74677402, + "num_input_tokens_seen": 274907240, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10095215, + "step": 12748, + "time_per_iteration": 2.682581663131714 + }, + { + "auxiliary_loss_clip": 0.0111201, + "auxiliary_loss_mlp": 0.01033278, + "balance_loss_clip": 1.03894031, + "balance_loss_mlp": 1.02113676, + "epoch": 0.7665113482639411, + "flos": 19827708491520.0, + "grad_norm": 3.356217986950992, + "language_loss": 0.69284403, + "learning_rate": 5.450201183674052e-07, + "loss": 0.71429694, + "num_input_tokens_seen": 274924650, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12127686, + "step": 12749, + "time_per_iteration": 2.672274351119995 + }, + { + "auxiliary_loss_clip": 0.01113608, + "auxiliary_loss_mlp": 0.01028825, + "balance_loss_clip": 1.03949273, + "balance_loss_mlp": 1.01701689, + "epoch": 0.7665714715166091, + "flos": 33187470418080.0, + "grad_norm": 1.7887169685141604, + "language_loss": 0.73698562, + "learning_rate": 5.447529291504967e-07, + "loss": 0.75840998, + "num_input_tokens_seen": 274944550, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11816406, + "step": 12750, + "time_per_iteration": 2.6449294090270996 + }, + { + "auxiliary_loss_clip": 0.01110259, + "auxiliary_loss_mlp": 0.01030384, + "balance_loss_clip": 1.03872323, + "balance_loss_mlp": 1.01947677, + "epoch": 0.766631594769277, + "flos": 25707102567840.0, + "grad_norm": 2.3343913032151455, + "language_loss": 0.75815248, + "learning_rate": 5.444857951167026e-07, + "loss": 0.77955896, + "num_input_tokens_seen": 274961330, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10913086, + "step": 12751, + "time_per_iteration": 2.65484356880188 + }, + { + "auxiliary_loss_clip": 0.01113833, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.04099989, + "balance_loss_mlp": 1.02016687, + "epoch": 0.766691718021945, + "flos": 29404587630240.0, + "grad_norm": 1.8773579058319012, + "language_loss": 0.60942042, + "learning_rate": 5.442187162761537e-07, + "loss": 0.63087797, + "num_input_tokens_seen": 274981655, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11755371, + "step": 12752, + "time_per_iteration": 2.64296817779541 + }, + { + "auxiliary_loss_clip": 0.01114155, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.03983617, + "balance_loss_mlp": 1.01926768, + "epoch": 0.7667518412746129, + "flos": 28602540828000.0, + "grad_norm": 3.9717864858934844, + "language_loss": 0.69008946, + "learning_rate": 5.439516926389767e-07, + "loss": 0.71154583, + "num_input_tokens_seen": 274999970, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12213135, + "step": 12753, + "time_per_iteration": 2.677422285079956 + }, + { + "auxiliary_loss_clip": 0.01112447, + "auxiliary_loss_mlp": 0.01032299, + "balance_loss_clip": 1.04033422, + "balance_loss_mlp": 1.02122426, + "epoch": 0.766811964527281, + "flos": 22145128755840.0, + "grad_norm": 3.8997126787992547, + "language_loss": 0.62348354, + "learning_rate": 5.436847242152971e-07, + "loss": 0.64493096, + "num_input_tokens_seen": 275015805, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11071777, + "step": 12754, + "time_per_iteration": 2.5857417583465576 + }, + { + "auxiliary_loss_clip": 0.01112919, + "auxiliary_loss_mlp": 0.01023729, + "balance_loss_clip": 1.04135251, + "balance_loss_mlp": 1.01286888, + "epoch": 0.7668720877799489, + "flos": 23837901985440.0, + "grad_norm": 2.644160931060898, + "language_loss": 0.80176485, + "learning_rate": 5.434178110152401e-07, + "loss": 0.82313132, + "num_input_tokens_seen": 275031810, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10858154, + "step": 12755, + "time_per_iteration": 2.5988152027130127 + }, + { + "auxiliary_loss_clip": 0.01110125, + "auxiliary_loss_mlp": 0.01028854, + "balance_loss_clip": 1.0385462, + "balance_loss_mlp": 1.01757097, + "epoch": 0.7669322110326169, + "flos": 27667413812160.0, + "grad_norm": 1.7715444690632731, + "language_loss": 0.70455503, + "learning_rate": 5.431509530489242e-07, + "loss": 0.72594488, + "num_input_tokens_seen": 275049325, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.112854, + "step": 12756, + "time_per_iteration": 2.6104252338409424 + }, + { + "auxiliary_loss_clip": 0.01113629, + "auxiliary_loss_mlp": 0.01038157, + "balance_loss_clip": 1.04036093, + "balance_loss_mlp": 1.02712464, + "epoch": 0.7669923342852848, + "flos": 32298405199200.0, + "grad_norm": 1.7819298427668975, + "language_loss": 0.70007306, + "learning_rate": 5.428841503264706e-07, + "loss": 0.72159088, + "num_input_tokens_seen": 275070865, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11022949, + "step": 12757, + "time_per_iteration": 2.7645175457000732 + }, + { + "auxiliary_loss_clip": 0.01116252, + "auxiliary_loss_mlp": 0.0103297, + "balance_loss_clip": 1.04222977, + "balance_loss_mlp": 1.02081621, + "epoch": 0.7670524575379528, + "flos": 27889335720000.0, + "grad_norm": 2.1697095854632558, + "language_loss": 0.75871587, + "learning_rate": 5.426174028579955e-07, + "loss": 0.78020817, + "num_input_tokens_seen": 275088015, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.1217041, + "step": 12758, + "time_per_iteration": 2.652308225631714 + }, + { + "auxiliary_loss_clip": 0.01109834, + "auxiliary_loss_mlp": 0.01034442, + "balance_loss_clip": 1.03871775, + "balance_loss_mlp": 1.02306938, + "epoch": 0.7671125807906207, + "flos": 27396593897760.0, + "grad_norm": 1.646564796170833, + "language_loss": 0.76015151, + "learning_rate": 5.423507106536156e-07, + "loss": 0.78159428, + "num_input_tokens_seen": 275106975, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11376953, + "step": 12759, + "time_per_iteration": 2.661684513092041 + }, + { + "auxiliary_loss_clip": 0.01108845, + "auxiliary_loss_mlp": 0.0102646, + "balance_loss_clip": 1.03561687, + "balance_loss_mlp": 1.01546931, + "epoch": 0.7671727040432887, + "flos": 43161523816320.0, + "grad_norm": 2.0810349273397266, + "language_loss": 0.68326628, + "learning_rate": 5.420840737234425e-07, + "loss": 0.70461941, + "num_input_tokens_seen": 275129560, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.10992432, + "step": 12760, + "time_per_iteration": 2.776658058166504 + }, + { + "auxiliary_loss_clip": 0.01112886, + "auxiliary_loss_mlp": 0.0102912, + "balance_loss_clip": 1.04010069, + "balance_loss_mlp": 1.01709795, + "epoch": 0.7672328272959568, + "flos": 27448247079360.0, + "grad_norm": 1.7919213513720544, + "language_loss": 0.79127145, + "learning_rate": 5.418174920775871e-07, + "loss": 0.81269151, + "num_input_tokens_seen": 275151180, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12030029, + "step": 12761, + "time_per_iteration": 4.133667707443237 + }, + { + "auxiliary_loss_clip": 0.0111035, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.04018688, + "balance_loss_mlp": 1.01990402, + "epoch": 0.7672929505486247, + "flos": 27840194609760.0, + "grad_norm": 1.796251758030348, + "language_loss": 0.66184187, + "learning_rate": 5.415509657261589e-07, + "loss": 0.68325758, + "num_input_tokens_seen": 275170605, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11309814, + "step": 12762, + "time_per_iteration": 2.653031826019287 + }, + { + "auxiliary_loss_clip": 0.01113436, + "auxiliary_loss_mlp": 0.01026568, + "balance_loss_clip": 1.03864944, + "balance_loss_mlp": 1.01493275, + "epoch": 0.7673530738012927, + "flos": 24817510624320.0, + "grad_norm": 1.705913495392155, + "language_loss": 0.7367357, + "learning_rate": 5.412844946792639e-07, + "loss": 0.75813568, + "num_input_tokens_seen": 275188750, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11639404, + "step": 12763, + "time_per_iteration": 2.611375093460083 + }, + { + "auxiliary_loss_clip": 0.01113221, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.04119813, + "balance_loss_mlp": 1.01936066, + "epoch": 0.7674131970539606, + "flos": 42627096063360.0, + "grad_norm": 1.5967826817722364, + "language_loss": 0.70709974, + "learning_rate": 5.410180789470067e-07, + "loss": 0.72854239, + "num_input_tokens_seen": 275211365, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11682129, + "step": 12764, + "time_per_iteration": 2.75567889213562 + }, + { + "auxiliary_loss_clip": 0.01112632, + "auxiliary_loss_mlp": 0.01029363, + "balance_loss_clip": 1.03988743, + "balance_loss_mlp": 1.01772213, + "epoch": 0.7674733203066286, + "flos": 34567575733440.0, + "grad_norm": 1.65637312966238, + "language_loss": 0.69468474, + "learning_rate": 5.40751718539491e-07, + "loss": 0.71610469, + "num_input_tokens_seen": 275231670, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11639404, + "step": 12765, + "time_per_iteration": 4.123523473739624 + }, + { + "auxiliary_loss_clip": 0.0110837, + "auxiliary_loss_mlp": 0.01026945, + "balance_loss_clip": 1.03637314, + "balance_loss_mlp": 1.01682425, + "epoch": 0.7675334435592965, + "flos": 19876403911680.0, + "grad_norm": 3.540438648661916, + "language_loss": 0.60925251, + "learning_rate": 5.404854134668162e-07, + "loss": 0.63060564, + "num_input_tokens_seen": 275249425, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10113525, + "step": 12766, + "time_per_iteration": 2.6165175437927246 + }, + { + "auxiliary_loss_clip": 0.01032132, + "auxiliary_loss_mlp": 0.01001586, + "balance_loss_clip": 1.00929499, + "balance_loss_mlp": 1.00061309, + "epoch": 0.7675935668119646, + "flos": 79100054310240.0, + "grad_norm": 0.7320365374039584, + "language_loss": 0.60803354, + "learning_rate": 5.402191637390803e-07, + "loss": 0.62837076, + "num_input_tokens_seen": 275312485, + "router_z_loss_clip": 0.22839355, + "router_z_loss_mlp": 0.00972748, + "step": 12767, + "time_per_iteration": 3.4115102291107178 + }, + { + "auxiliary_loss_clip": 0.01109231, + "auxiliary_loss_mlp": 0.01022856, + "balance_loss_clip": 1.03858232, + "balance_loss_mlp": 1.01198387, + "epoch": 0.7676536900646325, + "flos": 27668629330560.0, + "grad_norm": 1.9877270736376895, + "language_loss": 0.6939851, + "learning_rate": 5.399529693663801e-07, + "loss": 0.71530604, + "num_input_tokens_seen": 275331680, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10876465, + "step": 12768, + "time_per_iteration": 2.688309669494629 + }, + { + "auxiliary_loss_clip": 0.01118218, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.04261899, + "balance_loss_mlp": 1.02263629, + "epoch": 0.7677138133173005, + "flos": 32872655088000.0, + "grad_norm": 1.8476905467057436, + "language_loss": 0.70746994, + "learning_rate": 5.3968683035881e-07, + "loss": 0.72899413, + "num_input_tokens_seen": 275351615, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11572266, + "step": 12769, + "time_per_iteration": 2.67718243598938 + }, + { + "auxiliary_loss_clip": 0.01113833, + "auxiliary_loss_mlp": 0.01028896, + "balance_loss_clip": 1.04003978, + "balance_loss_mlp": 1.01707661, + "epoch": 0.7677739365699684, + "flos": 29041036362720.0, + "grad_norm": 1.8587036342437278, + "language_loss": 0.80451781, + "learning_rate": 5.394207467264611e-07, + "loss": 0.82594514, + "num_input_tokens_seen": 275368815, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11828613, + "step": 12770, + "time_per_iteration": 2.7240242958068848 + }, + { + "auxiliary_loss_clip": 0.01111273, + "auxiliary_loss_mlp": 0.01026692, + "balance_loss_clip": 1.03964186, + "balance_loss_mlp": 1.01598132, + "epoch": 0.7678340598226364, + "flos": 42044053924800.0, + "grad_norm": 1.9465829828575334, + "language_loss": 0.78636563, + "learning_rate": 5.391547184794245e-07, + "loss": 0.80774528, + "num_input_tokens_seen": 275389345, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.1071167, + "step": 12771, + "time_per_iteration": 2.7631006240844727 + }, + { + "auxiliary_loss_clip": 0.01111792, + "auxiliary_loss_mlp": 0.01025868, + "balance_loss_clip": 1.0386467, + "balance_loss_mlp": 1.01485324, + "epoch": 0.7678941830753043, + "flos": 29092000750560.0, + "grad_norm": 1.447806622457302, + "language_loss": 0.68279493, + "learning_rate": 5.388887456277876e-07, + "loss": 0.70417154, + "num_input_tokens_seen": 275411240, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11022949, + "step": 12772, + "time_per_iteration": 4.164801120758057 + }, + { + "auxiliary_loss_clip": 0.01109124, + "auxiliary_loss_mlp": 0.01024467, + "balance_loss_clip": 1.03927124, + "balance_loss_mlp": 1.01400065, + "epoch": 0.7679543063279723, + "flos": 31006817439840.0, + "grad_norm": 1.6246082962435537, + "language_loss": 0.73546088, + "learning_rate": 5.386228281816349e-07, + "loss": 0.75679684, + "num_input_tokens_seen": 275432010, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.10461426, + "step": 12773, + "time_per_iteration": 2.717210292816162 + }, + { + "auxiliary_loss_clip": 0.01108699, + "auxiliary_loss_mlp": 0.0102612, + "balance_loss_clip": 1.03868437, + "balance_loss_mlp": 1.01551032, + "epoch": 0.7680144295806404, + "flos": 34120531052640.0, + "grad_norm": 2.0281523947939846, + "language_loss": 0.81096315, + "learning_rate": 5.383569661510512e-07, + "loss": 0.83231133, + "num_input_tokens_seen": 275453710, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.1060791, + "step": 12774, + "time_per_iteration": 3.957510471343994 + }, + { + "auxiliary_loss_clip": 0.01112869, + "auxiliary_loss_mlp": 0.01035706, + "balance_loss_clip": 1.04087305, + "balance_loss_mlp": 1.02434552, + "epoch": 0.7680745528333083, + "flos": 24907689388800.0, + "grad_norm": 1.9014729406255344, + "language_loss": 0.70385969, + "learning_rate": 5.380911595461177e-07, + "loss": 0.72534543, + "num_input_tokens_seen": 275472915, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11352539, + "step": 12775, + "time_per_iteration": 2.7408926486968994 + }, + { + "auxiliary_loss_clip": 0.01032219, + "auxiliary_loss_mlp": 0.01000717, + "balance_loss_clip": 1.00951254, + "balance_loss_mlp": 0.9997437, + "epoch": 0.7681346760859763, + "flos": 83462454233280.0, + "grad_norm": 0.7059971054919669, + "language_loss": 0.56866217, + "learning_rate": 5.378254083769147e-07, + "loss": 0.58899152, + "num_input_tokens_seen": 275534785, + "router_z_loss_clip": 0.22741699, + "router_z_loss_mlp": 0.00971985, + "step": 12776, + "time_per_iteration": 3.3256988525390625 + }, + { + "auxiliary_loss_clip": 0.01111668, + "auxiliary_loss_mlp": 0.01032388, + "balance_loss_clip": 1.03986144, + "balance_loss_mlp": 1.02113497, + "epoch": 0.7681947993386442, + "flos": 25931131374240.0, + "grad_norm": 2.1290086558487262, + "language_loss": 0.74063116, + "learning_rate": 5.375597126535188e-07, + "loss": 0.76207173, + "num_input_tokens_seen": 275553205, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11254883, + "step": 12777, + "time_per_iteration": 2.6779167652130127 + }, + { + "auxiliary_loss_clip": 0.01112133, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.04010916, + "balance_loss_mlp": 1.01893389, + "epoch": 0.7682549225913122, + "flos": 26109544073760.0, + "grad_norm": 2.6795525880403614, + "language_loss": 0.70023489, + "learning_rate": 5.372940723860043e-07, + "loss": 0.72165161, + "num_input_tokens_seen": 275571490, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10614014, + "step": 12778, + "time_per_iteration": 2.635312557220459 + }, + { + "auxiliary_loss_clip": 0.01111182, + "auxiliary_loss_mlp": 0.01026947, + "balance_loss_clip": 1.03968549, + "balance_loss_mlp": 1.01599157, + "epoch": 0.7683150458439801, + "flos": 28112351594400.0, + "grad_norm": 1.951593519418713, + "language_loss": 0.70499063, + "learning_rate": 5.37028487584446e-07, + "loss": 0.72637188, + "num_input_tokens_seen": 275589665, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10955811, + "step": 12779, + "time_per_iteration": 2.628753185272217 + }, + { + "auxiliary_loss_clip": 0.0111429, + "auxiliary_loss_mlp": 0.01030408, + "balance_loss_clip": 1.0413754, + "balance_loss_mlp": 1.01917291, + "epoch": 0.7683751690966482, + "flos": 82162675632960.0, + "grad_norm": 1.651739450405995, + "language_loss": 0.58673251, + "learning_rate": 5.367629582589133e-07, + "loss": 0.60817951, + "num_input_tokens_seen": 275615605, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11242676, + "step": 12780, + "time_per_iteration": 3.07196044921875 + }, + { + "auxiliary_loss_clip": 0.01116245, + "auxiliary_loss_mlp": 0.01039076, + "balance_loss_clip": 1.03925931, + "balance_loss_mlp": 1.02603412, + "epoch": 0.7684352923493161, + "flos": 26599125548160.0, + "grad_norm": 1.8082792742486233, + "language_loss": 0.68633318, + "learning_rate": 5.364974844194759e-07, + "loss": 0.70788634, + "num_input_tokens_seen": 275634965, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.13037109, + "step": 12781, + "time_per_iteration": 2.6357405185699463 + }, + { + "auxiliary_loss_clip": 0.01109958, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.03805566, + "balance_loss_mlp": 1.01660609, + "epoch": 0.7684954156019841, + "flos": 31539786570720.0, + "grad_norm": 1.638849680302145, + "language_loss": 0.79194856, + "learning_rate": 5.362320660762016e-07, + "loss": 0.81332469, + "num_input_tokens_seen": 275655785, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.1104126, + "step": 12782, + "time_per_iteration": 2.716557502746582 + }, + { + "auxiliary_loss_clip": 0.01113566, + "auxiliary_loss_mlp": 0.01029876, + "balance_loss_clip": 1.03954828, + "balance_loss_mlp": 1.01828289, + "epoch": 0.768555538854652, + "flos": 31051299062880.0, + "grad_norm": 5.291471969377748, + "language_loss": 0.66427791, + "learning_rate": 5.35966703239153e-07, + "loss": 0.68571228, + "num_input_tokens_seen": 275676160, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11584473, + "step": 12783, + "time_per_iteration": 2.6457173824310303 + }, + { + "auxiliary_loss_clip": 0.01112153, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.03872383, + "balance_loss_mlp": 1.02385211, + "epoch": 0.76861566210732, + "flos": 23972440821120.0, + "grad_norm": 2.2498686089214357, + "language_loss": 0.69018328, + "learning_rate": 5.357013959183938e-07, + "loss": 0.71166277, + "num_input_tokens_seen": 275695660, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11938477, + "step": 12784, + "time_per_iteration": 2.669123411178589 + }, + { + "auxiliary_loss_clip": 0.0111104, + "auxiliary_loss_mlp": 0.01025063, + "balance_loss_clip": 1.038481, + "balance_loss_mlp": 1.01515675, + "epoch": 0.7686757853599879, + "flos": 27355880381760.0, + "grad_norm": 1.9450798542589143, + "language_loss": 0.80157602, + "learning_rate": 5.354361441239843e-07, + "loss": 0.82293701, + "num_input_tokens_seen": 275714025, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.09912109, + "step": 12785, + "time_per_iteration": 2.645899772644043 + }, + { + "auxiliary_loss_clip": 0.01114002, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.04018033, + "balance_loss_mlp": 1.01919627, + "epoch": 0.768735908612656, + "flos": 58296498936480.0, + "grad_norm": 3.663390677291424, + "language_loss": 0.77330327, + "learning_rate": 5.351709478659836e-07, + "loss": 0.79476124, + "num_input_tokens_seen": 275737300, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1260376, + "step": 12786, + "time_per_iteration": 2.9114980697631836 + }, + { + "auxiliary_loss_clip": 0.01110285, + "auxiliary_loss_mlp": 0.01031126, + "balance_loss_clip": 1.03806424, + "balance_loss_mlp": 1.01993227, + "epoch": 0.7687960318653239, + "flos": 36927937964160.0, + "grad_norm": 2.1858656706459274, + "language_loss": 0.58599573, + "learning_rate": 5.349058071544468e-07, + "loss": 0.60740983, + "num_input_tokens_seen": 275757895, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11199951, + "step": 12787, + "time_per_iteration": 2.7404181957244873 + }, + { + "auxiliary_loss_clip": 0.01108938, + "auxiliary_loss_mlp": 0.01027028, + "balance_loss_clip": 1.03720069, + "balance_loss_mlp": 1.01533914, + "epoch": 0.7688561551179919, + "flos": 23883437057760.0, + "grad_norm": 1.6916495292288207, + "language_loss": 0.76093489, + "learning_rate": 5.346407219994292e-07, + "loss": 0.78229451, + "num_input_tokens_seen": 275776745, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11688232, + "step": 12788, + "time_per_iteration": 2.687100887298584 + }, + { + "auxiliary_loss_clip": 0.01113755, + "auxiliary_loss_mlp": 0.01038287, + "balance_loss_clip": 1.03933859, + "balance_loss_mlp": 1.0264852, + "epoch": 0.7689162783706599, + "flos": 27800980233120.0, + "grad_norm": 2.1394072877564545, + "language_loss": 0.66457856, + "learning_rate": 5.343756924109821e-07, + "loss": 0.68609893, + "num_input_tokens_seen": 275797205, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11779785, + "step": 12789, + "time_per_iteration": 2.799405813217163 + }, + { + "auxiliary_loss_clip": 0.01113431, + "auxiliary_loss_mlp": 0.01034268, + "balance_loss_clip": 1.03967011, + "balance_loss_mlp": 1.02176917, + "epoch": 0.7689764016233278, + "flos": 41737017912480.0, + "grad_norm": 6.164047445171214, + "language_loss": 0.68849874, + "learning_rate": 5.341107183991553e-07, + "loss": 0.70997578, + "num_input_tokens_seen": 275817935, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.12506104, + "step": 12790, + "time_per_iteration": 2.739946126937866 + }, + { + "auxiliary_loss_clip": 0.01111725, + "auxiliary_loss_mlp": 0.01027587, + "balance_loss_clip": 1.03863454, + "balance_loss_mlp": 1.0163393, + "epoch": 0.7690365248759958, + "flos": 21078096527520.0, + "grad_norm": 1.5692464091713385, + "language_loss": 0.68772918, + "learning_rate": 5.338457999739969e-07, + "loss": 0.7091223, + "num_input_tokens_seen": 275837145, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11242676, + "step": 12791, + "time_per_iteration": 2.6971681118011475 + }, + { + "auxiliary_loss_clip": 0.0111066, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.03944588, + "balance_loss_mlp": 1.02054167, + "epoch": 0.7690966481286637, + "flos": 22235753210400.0, + "grad_norm": 1.810651828320317, + "language_loss": 0.79833138, + "learning_rate": 5.335809371455526e-07, + "loss": 0.81974757, + "num_input_tokens_seen": 275855705, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10424805, + "step": 12792, + "time_per_iteration": 2.6025218963623047 + }, + { + "auxiliary_loss_clip": 0.01119368, + "auxiliary_loss_mlp": 0.01030941, + "balance_loss_clip": 1.04264665, + "balance_loss_mlp": 1.01877546, + "epoch": 0.7691567713813318, + "flos": 26280785214720.0, + "grad_norm": 2.200488509403955, + "language_loss": 0.72542179, + "learning_rate": 5.333161299238673e-07, + "loss": 0.74692488, + "num_input_tokens_seen": 275873930, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.1217041, + "step": 12793, + "time_per_iteration": 2.662788152694702 + }, + { + "auxiliary_loss_clip": 0.01113783, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.04008555, + "balance_loss_mlp": 1.02361882, + "epoch": 0.7692168946339997, + "flos": 48053408384160.0, + "grad_norm": 1.826818344626428, + "language_loss": 0.63530415, + "learning_rate": 5.330513783189803e-07, + "loss": 0.65679288, + "num_input_tokens_seen": 275895895, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11462402, + "step": 12794, + "time_per_iteration": 2.795693874359131 + }, + { + "auxiliary_loss_clip": 0.0111625, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.0418669, + "balance_loss_mlp": 1.01977849, + "epoch": 0.7692770178866677, + "flos": 30517479069120.0, + "grad_norm": 1.5138669673607899, + "language_loss": 0.76857436, + "learning_rate": 5.327866823409319e-07, + "loss": 0.79004669, + "num_input_tokens_seen": 275917825, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11199951, + "step": 12795, + "time_per_iteration": 2.6937737464904785 + }, + { + "auxiliary_loss_clip": 0.01112905, + "auxiliary_loss_mlp": 0.01027357, + "balance_loss_clip": 1.03817153, + "balance_loss_mlp": 1.01587701, + "epoch": 0.7693371411393356, + "flos": 30159276082560.0, + "grad_norm": 1.9388809141050205, + "language_loss": 0.72093272, + "learning_rate": 5.325220419997601e-07, + "loss": 0.74233532, + "num_input_tokens_seen": 275937890, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11480713, + "step": 12796, + "time_per_iteration": 2.6916182041168213 + }, + { + "auxiliary_loss_clip": 0.01111629, + "auxiliary_loss_mlp": 0.01026593, + "balance_loss_clip": 1.03827548, + "balance_loss_mlp": 1.01556611, + "epoch": 0.7693972643920036, + "flos": 19431101473920.0, + "grad_norm": 3.1424499166381015, + "language_loss": 0.64495051, + "learning_rate": 5.32257457305499e-07, + "loss": 0.66633272, + "num_input_tokens_seen": 275954495, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11022949, + "step": 12797, + "time_per_iteration": 2.658216714859009 + }, + { + "auxiliary_loss_clip": 0.01114504, + "auxiliary_loss_mlp": 0.01033232, + "balance_loss_clip": 1.04015839, + "balance_loss_mlp": 1.02083981, + "epoch": 0.7694573876446715, + "flos": 31001145020640.0, + "grad_norm": 2.2017684075907784, + "language_loss": 0.91549319, + "learning_rate": 5.319929282681823e-07, + "loss": 0.93697053, + "num_input_tokens_seen": 275972395, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1239624, + "step": 12798, + "time_per_iteration": 2.733325242996216 + }, + { + "auxiliary_loss_clip": 0.0111078, + "auxiliary_loss_mlp": 0.01025416, + "balance_loss_clip": 1.03778648, + "balance_loss_mlp": 1.01455593, + "epoch": 0.7695175108973396, + "flos": 20321220142080.0, + "grad_norm": 1.948027255601143, + "language_loss": 0.82237637, + "learning_rate": 5.317284548978418e-07, + "loss": 0.84373832, + "num_input_tokens_seen": 275989020, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.10864258, + "step": 12799, + "time_per_iteration": 2.744368553161621 + }, + { + "auxiliary_loss_clip": 0.01114566, + "auxiliary_loss_mlp": 0.01028962, + "balance_loss_clip": 1.04025126, + "balance_loss_mlp": 1.01744652, + "epoch": 0.7695776341500075, + "flos": 16626530772000.0, + "grad_norm": 2.213691195200591, + "language_loss": 0.7782253, + "learning_rate": 5.314640372045045e-07, + "loss": 0.79966062, + "num_input_tokens_seen": 276006525, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11523438, + "step": 12800, + "time_per_iteration": 2.6626839637756348 + }, + { + "auxiliary_loss_clip": 0.0111757, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.03911233, + "balance_loss_mlp": 1.01556838, + "epoch": 0.7696377574026755, + "flos": 29623025052000.0, + "grad_norm": 9.252053163224717, + "language_loss": 0.83858937, + "learning_rate": 5.31199675198198e-07, + "loss": 0.86004704, + "num_input_tokens_seen": 276027130, + "router_z_loss_clip": 0.78466797, + "router_z_loss_mlp": 0.12640381, + "step": 12801, + "time_per_iteration": 4.081493139266968 + }, + { + "auxiliary_loss_clip": 0.01111174, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.03829789, + "balance_loss_mlp": 1.01897192, + "epoch": 0.7696978806553435, + "flos": 25530472628640.0, + "grad_norm": 2.0819176799794343, + "language_loss": 0.72065526, + "learning_rate": 5.30935368888947e-07, + "loss": 0.74206841, + "num_input_tokens_seen": 276045715, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1116333, + "step": 12802, + "time_per_iteration": 2.638603687286377 + }, + { + "auxiliary_loss_clip": 0.01110732, + "auxiliary_loss_mlp": 0.01033021, + "balance_loss_clip": 1.03900194, + "balance_loss_mlp": 1.02195835, + "epoch": 0.7697580039080114, + "flos": 27978947242560.0, + "grad_norm": 1.8835552792251784, + "language_loss": 0.7619434, + "learning_rate": 5.306711182867747e-07, + "loss": 0.78338099, + "num_input_tokens_seen": 276065375, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.1105957, + "step": 12803, + "time_per_iteration": 2.737471103668213 + }, + { + "auxiliary_loss_clip": 0.01030868, + "auxiliary_loss_mlp": 0.01002674, + "balance_loss_clip": 1.00821757, + "balance_loss_mlp": 1.00166774, + "epoch": 0.7698181271606794, + "flos": 83849823311040.0, + "grad_norm": 0.7377074060402316, + "language_loss": 0.55822849, + "learning_rate": 5.304069234017001e-07, + "loss": 0.57856393, + "num_input_tokens_seen": 276131405, + "router_z_loss_clip": 0.22644043, + "router_z_loss_mlp": 0.01005554, + "step": 12804, + "time_per_iteration": 3.286699056625366 + }, + { + "auxiliary_loss_clip": 0.01031522, + "auxiliary_loss_mlp": 0.01002298, + "balance_loss_clip": 1.00876403, + "balance_loss_mlp": 1.00127983, + "epoch": 0.7698782504133473, + "flos": 82253265920640.0, + "grad_norm": 0.7397843042359283, + "language_loss": 0.53989935, + "learning_rate": 5.301427842437429e-07, + "loss": 0.56023753, + "num_input_tokens_seen": 276200755, + "router_z_loss_clip": 0.2277832, + "router_z_loss_mlp": 0.01017761, + "step": 12805, + "time_per_iteration": 4.673955202102661 + }, + { + "auxiliary_loss_clip": 0.0111722, + "auxiliary_loss_mlp": 0.01034713, + "balance_loss_clip": 1.04345667, + "balance_loss_mlp": 1.02350712, + "epoch": 0.7699383736660154, + "flos": 27441359141760.0, + "grad_norm": 2.205978582857697, + "language_loss": 0.72933805, + "learning_rate": 5.298787008229187e-07, + "loss": 0.75085741, + "num_input_tokens_seen": 276217880, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11212158, + "step": 12806, + "time_per_iteration": 2.675597667694092 + }, + { + "auxiliary_loss_clip": 0.01112107, + "auxiliary_loss_mlp": 0.0103593, + "balance_loss_clip": 1.03857732, + "balance_loss_mlp": 1.02422965, + "epoch": 0.7699984969186833, + "flos": 26283216251520.0, + "grad_norm": 1.7789990808862004, + "language_loss": 0.75194871, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77342916, + "num_input_tokens_seen": 276234810, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11712646, + "step": 12807, + "time_per_iteration": 2.6112468242645264 + }, + { + "auxiliary_loss_clip": 0.01117029, + "auxiliary_loss_mlp": 0.01031299, + "balance_loss_clip": 1.04070854, + "balance_loss_mlp": 1.01924682, + "epoch": 0.7700586201713513, + "flos": 26502382984320.0, + "grad_norm": 2.3337368694712413, + "language_loss": 0.79935068, + "learning_rate": 5.293507012327218e-07, + "loss": 0.82083392, + "num_input_tokens_seen": 276252850, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12054443, + "step": 12808, + "time_per_iteration": 2.6610591411590576 + }, + { + "auxiliary_loss_clip": 0.0111767, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.04156756, + "balance_loss_mlp": 1.0199219, + "epoch": 0.7701187434240192, + "flos": 33990449117760.0, + "grad_norm": 3.913891676936848, + "language_loss": 0.79012346, + "learning_rate": 5.290867850833718e-07, + "loss": 0.81161761, + "num_input_tokens_seen": 276272525, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11828613, + "step": 12809, + "time_per_iteration": 2.672485828399658 + }, + { + "auxiliary_loss_clip": 0.01108715, + "auxiliary_loss_mlp": 0.01026261, + "balance_loss_clip": 1.03850508, + "balance_loss_mlp": 1.01507282, + "epoch": 0.7701788666766872, + "flos": 34924401132480.0, + "grad_norm": 1.668111644594281, + "language_loss": 0.70287526, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72422498, + "num_input_tokens_seen": 276294210, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11193848, + "step": 12810, + "time_per_iteration": 2.7525103092193604 + }, + { + "auxiliary_loss_clip": 0.01114538, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.0386312, + "balance_loss_mlp": 1.02097237, + "epoch": 0.7702389899293551, + "flos": 17383042501920.0, + "grad_norm": 3.1491945839888476, + "language_loss": 0.78530395, + "learning_rate": 5.285591201262079e-07, + "loss": 0.80678511, + "num_input_tokens_seen": 276310290, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12609863, + "step": 12811, + "time_per_iteration": 2.6045548915863037 + }, + { + "auxiliary_loss_clip": 0.01031344, + "auxiliary_loss_mlp": 0.01001821, + "balance_loss_clip": 1.00864458, + "balance_loss_mlp": 1.00078011, + "epoch": 0.7702991131820232, + "flos": 86115630911040.0, + "grad_norm": 0.8041258369036898, + "language_loss": 0.56673938, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58707106, + "num_input_tokens_seen": 276371715, + "router_z_loss_clip": 0.22692871, + "router_z_loss_mlp": 0.01041412, + "step": 12812, + "time_per_iteration": 4.852889537811279 + }, + { + "auxiliary_loss_clip": 0.01114449, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.03982568, + "balance_loss_mlp": 1.01995766, + "epoch": 0.7703592364346911, + "flos": 31091485854240.0, + "grad_norm": 1.7188781766219252, + "language_loss": 0.7175523, + "learning_rate": 5.280316783577836e-07, + "loss": 0.7390123, + "num_input_tokens_seen": 276389895, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.1159668, + "step": 12813, + "time_per_iteration": 3.8942251205444336 + }, + { + "auxiliary_loss_clip": 0.01113546, + "auxiliary_loss_mlp": 0.01029926, + "balance_loss_clip": 1.03859949, + "balance_loss_mlp": 1.01792121, + "epoch": 0.7704193596873591, + "flos": 23526530624160.0, + "grad_norm": 1.7502113928142917, + "language_loss": 0.66614777, + "learning_rate": 5.27768041194351e-07, + "loss": 0.68758249, + "num_input_tokens_seen": 276408990, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12005615, + "step": 12814, + "time_per_iteration": 2.6353495121002197 + }, + { + "auxiliary_loss_clip": 0.01112984, + "auxiliary_loss_mlp": 0.01028394, + "balance_loss_clip": 1.03972661, + "balance_loss_mlp": 1.01718283, + "epoch": 0.7704794829400271, + "flos": 28868417634240.0, + "grad_norm": 2.147288461812373, + "language_loss": 0.66189551, + "learning_rate": 5.275044598581018e-07, + "loss": 0.68330932, + "num_input_tokens_seen": 276428190, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11212158, + "step": 12815, + "time_per_iteration": 2.627912759780884 + }, + { + "auxiliary_loss_clip": 0.01113685, + "auxiliary_loss_mlp": 0.01031806, + "balance_loss_clip": 1.03953505, + "balance_loss_mlp": 1.01983714, + "epoch": 0.770539606192695, + "flos": 23170677639840.0, + "grad_norm": 2.1897437077301416, + "language_loss": 0.64835757, + "learning_rate": 5.272409343590322e-07, + "loss": 0.6698125, + "num_input_tokens_seen": 276446855, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11968994, + "step": 12816, + "time_per_iteration": 2.6576826572418213 + }, + { + "auxiliary_loss_clip": 0.01113795, + "auxiliary_loss_mlp": 0.0103384, + "balance_loss_clip": 1.04031014, + "balance_loss_mlp": 1.02223468, + "epoch": 0.770599729445363, + "flos": 14435262266400.0, + "grad_norm": 2.572541144825503, + "language_loss": 0.71839827, + "learning_rate": 5.26977464707133e-07, + "loss": 0.73987472, + "num_input_tokens_seen": 276462000, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.1159668, + "step": 12817, + "time_per_iteration": 2.585862636566162 + }, + { + "auxiliary_loss_clip": 0.011135, + "auxiliary_loss_mlp": 0.01032962, + "balance_loss_clip": 1.03963852, + "balance_loss_mlp": 1.02174449, + "epoch": 0.770659852698031, + "flos": 21746901047040.0, + "grad_norm": 2.049328754501506, + "language_loss": 0.61607611, + "learning_rate": 5.267140509123957e-07, + "loss": 0.6375407, + "num_input_tokens_seen": 276481190, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11218262, + "step": 12818, + "time_per_iteration": 2.6445236206054688 + }, + { + "auxiliary_loss_clip": 0.01112287, + "auxiliary_loss_mlp": 0.01026497, + "balance_loss_clip": 1.0409956, + "balance_loss_mlp": 1.01601815, + "epoch": 0.770719975950699, + "flos": 26688291380640.0, + "grad_norm": 1.8807717831253425, + "language_loss": 0.67210376, + "learning_rate": 5.264506929848093e-07, + "loss": 0.69349158, + "num_input_tokens_seen": 276499520, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10467529, + "step": 12819, + "time_per_iteration": 2.6027634143829346 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01028272, + "balance_loss_clip": 1.04059291, + "balance_loss_mlp": 1.01675081, + "epoch": 0.7707800992033669, + "flos": 26020864448640.0, + "grad_norm": 1.7466399338902143, + "language_loss": 0.5771969, + "learning_rate": 5.261873909343608e-07, + "loss": 0.59862828, + "num_input_tokens_seen": 276519110, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11529541, + "step": 12820, + "time_per_iteration": 2.6812326908111572 + }, + { + "auxiliary_loss_clip": 0.01112706, + "auxiliary_loss_mlp": 0.01031016, + "balance_loss_clip": 1.03853178, + "balance_loss_mlp": 1.0190537, + "epoch": 0.7708402224560349, + "flos": 34385556996000.0, + "grad_norm": 1.7643016665949354, + "language_loss": 0.81137264, + "learning_rate": 5.259241447710343e-07, + "loss": 0.83280987, + "num_input_tokens_seen": 276538805, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11968994, + "step": 12821, + "time_per_iteration": 2.645700454711914 + }, + { + "auxiliary_loss_clip": 0.0111316, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.03987348, + "balance_loss_mlp": 1.02113748, + "epoch": 0.7709003457087028, + "flos": 18762742644480.0, + "grad_norm": 2.3108475186931856, + "language_loss": 0.68824762, + "learning_rate": 5.256609545048114e-07, + "loss": 0.70970726, + "num_input_tokens_seen": 276554770, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11663818, + "step": 12822, + "time_per_iteration": 2.5922060012817383 + }, + { + "auxiliary_loss_clip": 0.0111219, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.04013097, + "balance_loss_mlp": 1.02217746, + "epoch": 0.7709604689613708, + "flos": 37366190395200.0, + "grad_norm": 1.856696462183485, + "language_loss": 0.72405314, + "learning_rate": 5.253978201456733e-07, + "loss": 0.74551201, + "num_input_tokens_seen": 276574535, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11523438, + "step": 12823, + "time_per_iteration": 2.681959867477417 + }, + { + "auxiliary_loss_clip": 0.01119945, + "auxiliary_loss_mlp": 0.01036404, + "balance_loss_clip": 1.04227495, + "balance_loss_mlp": 1.02327287, + "epoch": 0.7710205922140387, + "flos": 24771772965600.0, + "grad_norm": 1.5825410400199538, + "language_loss": 0.76305056, + "learning_rate": 5.251347417035969e-07, + "loss": 0.78461409, + "num_input_tokens_seen": 276592925, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.13134766, + "step": 12824, + "time_per_iteration": 2.645113945007324 + }, + { + "auxiliary_loss_clip": 0.01114128, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.04082453, + "balance_loss_mlp": 1.01657355, + "epoch": 0.7710807154667068, + "flos": 23970009784320.0, + "grad_norm": 1.8042482966205824, + "language_loss": 0.71768177, + "learning_rate": 5.248717191885592e-07, + "loss": 0.73910844, + "num_input_tokens_seen": 276610540, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11950684, + "step": 12825, + "time_per_iteration": 2.5911648273468018 + }, + { + "auxiliary_loss_clip": 0.01112307, + "auxiliary_loss_mlp": 0.01033989, + "balance_loss_clip": 1.04169869, + "balance_loss_mlp": 1.0236001, + "epoch": 0.7711408387193747, + "flos": 24412151874240.0, + "grad_norm": 1.7217001459365378, + "language_loss": 0.73773289, + "learning_rate": 5.246087526105343e-07, + "loss": 0.75919592, + "num_input_tokens_seen": 276629200, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10394287, + "step": 12826, + "time_per_iteration": 2.640333652496338 + }, + { + "auxiliary_loss_clip": 0.01112964, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.03782654, + "balance_loss_mlp": 1.01713371, + "epoch": 0.7712009619720427, + "flos": 30470323305600.0, + "grad_norm": 1.629186380406615, + "language_loss": 0.81165475, + "learning_rate": 5.243458419794933e-07, + "loss": 0.83307445, + "num_input_tokens_seen": 276648655, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11883545, + "step": 12827, + "time_per_iteration": 2.6250977516174316 + }, + { + "auxiliary_loss_clip": 0.01031291, + "auxiliary_loss_mlp": 0.01000838, + "balance_loss_clip": 1.00871396, + "balance_loss_mlp": 0.99985307, + "epoch": 0.7712610852247107, + "flos": 77176931578560.0, + "grad_norm": 0.8677892177245354, + "language_loss": 0.55185986, + "learning_rate": 5.240829873054051e-07, + "loss": 0.57218117, + "num_input_tokens_seen": 276716500, + "router_z_loss_clip": 0.22570801, + "router_z_loss_mlp": 0.00984192, + "step": 12828, + "time_per_iteration": 3.412346839904785 + }, + { + "auxiliary_loss_clip": 0.01111386, + "auxiliary_loss_mlp": 0.01026044, + "balance_loss_clip": 1.03987861, + "balance_loss_mlp": 1.0155772, + "epoch": 0.7713212084773786, + "flos": 22814986724640.0, + "grad_norm": 2.6036096556807986, + "language_loss": 0.69522285, + "learning_rate": 5.23820188598238e-07, + "loss": 0.71659708, + "num_input_tokens_seen": 276733535, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10455322, + "step": 12829, + "time_per_iteration": 2.6149353981018066 + }, + { + "auxiliary_loss_clip": 0.01116503, + "auxiliary_loss_mlp": 0.0103309, + "balance_loss_clip": 1.0403657, + "balance_loss_mlp": 1.02052498, + "epoch": 0.7713813317300466, + "flos": 17293876669440.0, + "grad_norm": 10.166990487473253, + "language_loss": 0.80050504, + "learning_rate": 5.235574458679579e-07, + "loss": 0.82200098, + "num_input_tokens_seen": 276749575, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12573242, + "step": 12830, + "time_per_iteration": 2.604498863220215 + }, + { + "auxiliary_loss_clip": 0.01115305, + "auxiliary_loss_mlp": 0.0103165, + "balance_loss_clip": 1.03954184, + "balance_loss_mlp": 1.01942515, + "epoch": 0.7714414549827145, + "flos": 31364291115360.0, + "grad_norm": 1.9054840063762333, + "language_loss": 0.77949464, + "learning_rate": 5.232947591245269e-07, + "loss": 0.80096424, + "num_input_tokens_seen": 276769460, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12231445, + "step": 12831, + "time_per_iteration": 2.710023880004883 + }, + { + "auxiliary_loss_clip": 0.01111627, + "auxiliary_loss_mlp": 0.01029676, + "balance_loss_clip": 1.03846836, + "balance_loss_mlp": 1.01839316, + "epoch": 0.7715015782353826, + "flos": 37284398707680.0, + "grad_norm": 1.7760140293916782, + "language_loss": 0.61164153, + "learning_rate": 5.230321283779071e-07, + "loss": 0.63305461, + "num_input_tokens_seen": 276790820, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11279297, + "step": 12832, + "time_per_iteration": 2.6838107109069824 + }, + { + "auxiliary_loss_clip": 0.01113747, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.03840387, + "balance_loss_mlp": 1.02277446, + "epoch": 0.7715617014880505, + "flos": 24683903686080.0, + "grad_norm": 1.6061260220941405, + "language_loss": 0.79707307, + "learning_rate": 5.227695536380572e-07, + "loss": 0.81855446, + "num_input_tokens_seen": 276811345, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.1161499, + "step": 12833, + "time_per_iteration": 2.7051851749420166 + }, + { + "auxiliary_loss_clip": 0.01031318, + "auxiliary_loss_mlp": 0.01000194, + "balance_loss_clip": 1.00864685, + "balance_loss_mlp": 0.99923182, + "epoch": 0.7716218247407185, + "flos": 77685549824160.0, + "grad_norm": 0.8407976458117087, + "language_loss": 0.55325639, + "learning_rate": 5.22507034914933e-07, + "loss": 0.5735715, + "num_input_tokens_seen": 276870950, + "router_z_loss_clip": 0.22680664, + "router_z_loss_mlp": 0.00961304, + "step": 12834, + "time_per_iteration": 3.2444021701812744 + }, + { + "auxiliary_loss_clip": 0.01114967, + "auxiliary_loss_mlp": 0.01029437, + "balance_loss_clip": 1.04022169, + "balance_loss_mlp": 1.01734352, + "epoch": 0.7716819479933864, + "flos": 24149962140480.0, + "grad_norm": 2.1331983117633677, + "language_loss": 0.73128366, + "learning_rate": 5.222445722184903e-07, + "loss": 0.75272769, + "num_input_tokens_seen": 276890760, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12078857, + "step": 12835, + "time_per_iteration": 2.67287278175354 + }, + { + "auxiliary_loss_clip": 0.01113553, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.03893375, + "balance_loss_mlp": 1.02132583, + "epoch": 0.7717420712460544, + "flos": 22503007604160.0, + "grad_norm": 1.9507119225661984, + "language_loss": 0.70372665, + "learning_rate": 5.219821655586814e-07, + "loss": 0.72519219, + "num_input_tokens_seen": 276909625, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11663818, + "step": 12836, + "time_per_iteration": 2.6303369998931885 + }, + { + "auxiliary_loss_clip": 0.01111102, + "auxiliary_loss_mlp": 0.01031361, + "balance_loss_clip": 1.03925037, + "balance_loss_mlp": 1.0199883, + "epoch": 0.7718021944987223, + "flos": 42939318287520.0, + "grad_norm": 1.9323228015985547, + "language_loss": 0.59297317, + "learning_rate": 5.217198149454575e-07, + "loss": 0.61439776, + "num_input_tokens_seen": 276930760, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.1137085, + "step": 12837, + "time_per_iteration": 2.737001895904541 + }, + { + "auxiliary_loss_clip": 0.01031708, + "auxiliary_loss_mlp": 0.0100103, + "balance_loss_clip": 1.00899053, + "balance_loss_mlp": 1.00000262, + "epoch": 0.7718623177513904, + "flos": 82879574163840.0, + "grad_norm": 0.8669473956394274, + "language_loss": 0.55805051, + "learning_rate": 5.214575203887666e-07, + "loss": 0.57837796, + "num_input_tokens_seen": 276989580, + "router_z_loss_clip": 0.22717285, + "router_z_loss_mlp": 0.01027679, + "step": 12838, + "time_per_iteration": 3.220468044281006 + }, + { + "auxiliary_loss_clip": 0.01112409, + "auxiliary_loss_mlp": 0.01027835, + "balance_loss_clip": 1.04040956, + "balance_loss_mlp": 1.01717186, + "epoch": 0.7719224410040583, + "flos": 22673519434080.0, + "grad_norm": 4.254048403454478, + "language_loss": 0.69391042, + "learning_rate": 5.211952818985538e-07, + "loss": 0.71531284, + "num_input_tokens_seen": 277005450, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10668945, + "step": 12839, + "time_per_iteration": 2.589308500289917 + }, + { + "auxiliary_loss_clip": 0.01111126, + "auxiliary_loss_mlp": 0.01022907, + "balance_loss_clip": 1.03989768, + "balance_loss_mlp": 1.01250589, + "epoch": 0.7719825642567263, + "flos": 28157319424800.0, + "grad_norm": 16.1090810595095, + "language_loss": 0.79905909, + "learning_rate": 5.209330994847647e-07, + "loss": 0.82039946, + "num_input_tokens_seen": 277023055, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10412598, + "step": 12840, + "time_per_iteration": 4.177715301513672 + }, + { + "auxiliary_loss_clip": 0.01112858, + "auxiliary_loss_mlp": 0.01029476, + "balance_loss_clip": 1.03951299, + "balance_loss_mlp": 1.01770377, + "epoch": 0.7720426875093943, + "flos": 24817794245280.0, + "grad_norm": 1.8820960741613375, + "language_loss": 0.80125916, + "learning_rate": 5.206709731573402e-07, + "loss": 0.82268244, + "num_input_tokens_seen": 277041150, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11773682, + "step": 12841, + "time_per_iteration": 2.6476447582244873 + }, + { + "auxiliary_loss_clip": 0.01113767, + "auxiliary_loss_mlp": 0.01031162, + "balance_loss_clip": 1.04018283, + "balance_loss_mlp": 1.01978993, + "epoch": 0.7721028107620622, + "flos": 29139318583200.0, + "grad_norm": 2.1562720070925745, + "language_loss": 0.76372844, + "learning_rate": 5.204089029262208e-07, + "loss": 0.78517777, + "num_input_tokens_seen": 277063895, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11376953, + "step": 12842, + "time_per_iteration": 2.6985859870910645 + }, + { + "auxiliary_loss_clip": 0.01118176, + "auxiliary_loss_mlp": 0.0103494, + "balance_loss_clip": 1.04235029, + "balance_loss_mlp": 1.0233109, + "epoch": 0.7721629340147302, + "flos": 32520732279840.0, + "grad_norm": 1.5407502805852906, + "language_loss": 0.68429554, + "learning_rate": 5.201468888013445e-07, + "loss": 0.70582676, + "num_input_tokens_seen": 277084045, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11627197, + "step": 12843, + "time_per_iteration": 2.712045192718506 + }, + { + "auxiliary_loss_clip": 0.01114438, + "auxiliary_loss_mlp": 0.01029582, + "balance_loss_clip": 1.03821516, + "balance_loss_mlp": 1.01837063, + "epoch": 0.7722230572673981, + "flos": 25842492266400.0, + "grad_norm": 2.0503088882645084, + "language_loss": 0.73725849, + "learning_rate": 5.198849307926465e-07, + "loss": 0.7586987, + "num_input_tokens_seen": 277102625, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.11218262, + "step": 12844, + "time_per_iteration": 3.895781993865967 + }, + { + "auxiliary_loss_clip": 0.01112711, + "auxiliary_loss_mlp": 0.01030436, + "balance_loss_clip": 1.03981662, + "balance_loss_mlp": 1.01909947, + "epoch": 0.7722831805200662, + "flos": 34123083641280.0, + "grad_norm": 2.670302026615282, + "language_loss": 0.71467, + "learning_rate": 5.196230289100596e-07, + "loss": 0.73610151, + "num_input_tokens_seen": 277123210, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11340332, + "step": 12845, + "time_per_iteration": 2.6783127784729004 + }, + { + "auxiliary_loss_clip": 0.01110633, + "auxiliary_loss_mlp": 0.0102884, + "balance_loss_clip": 1.03947854, + "balance_loss_mlp": 1.01789045, + "epoch": 0.7723433037727341, + "flos": 41336035028640.0, + "grad_norm": 1.8933102511869293, + "language_loss": 0.64629877, + "learning_rate": 5.193611831635159e-07, + "loss": 0.6676935, + "num_input_tokens_seen": 277144895, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10943604, + "step": 12846, + "time_per_iteration": 2.765526056289673 + }, + { + "auxiliary_loss_clip": 0.01032018, + "auxiliary_loss_mlp": 0.00999908, + "balance_loss_clip": 1.00932825, + "balance_loss_mlp": 0.99891061, + "epoch": 0.7724034270254021, + "flos": 76687107000480.0, + "grad_norm": 0.809776895906609, + "language_loss": 0.61697459, + "learning_rate": 5.19099393562945e-07, + "loss": 0.63729393, + "num_input_tokens_seen": 277205160, + "router_z_loss_clip": 0.22668457, + "router_z_loss_mlp": 0.00996399, + "step": 12847, + "time_per_iteration": 3.185086488723755 + }, + { + "auxiliary_loss_clip": 0.01109838, + "auxiliary_loss_mlp": 0.01024508, + "balance_loss_clip": 1.03573871, + "balance_loss_mlp": 1.01321316, + "epoch": 0.77246355027807, + "flos": 28425303129600.0, + "grad_norm": 1.6943565744446258, + "language_loss": 0.79178637, + "learning_rate": 5.188376601182732e-07, + "loss": 0.81312984, + "num_input_tokens_seen": 277223005, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11291504, + "step": 12848, + "time_per_iteration": 2.753079891204834 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.03943539, + "balance_loss_mlp": 1.01826859, + "epoch": 0.772523673530738, + "flos": 24551917439040.0, + "grad_norm": 1.9468717816370218, + "language_loss": 0.72564673, + "learning_rate": 5.185759828394261e-07, + "loss": 0.74708974, + "num_input_tokens_seen": 277241785, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11077881, + "step": 12849, + "time_per_iteration": 2.6028926372528076 + }, + { + "auxiliary_loss_clip": 0.01111842, + "auxiliary_loss_mlp": 0.01031559, + "balance_loss_clip": 1.03860223, + "balance_loss_mlp": 1.01993036, + "epoch": 0.7725837967834059, + "flos": 21744064837440.0, + "grad_norm": 1.9286225679316242, + "language_loss": 0.7839148, + "learning_rate": 5.183143617363261e-07, + "loss": 0.80534875, + "num_input_tokens_seen": 277259050, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11621094, + "step": 12850, + "time_per_iteration": 2.6792125701904297 + }, + { + "auxiliary_loss_clip": 0.0111271, + "auxiliary_loss_mlp": 0.01033297, + "balance_loss_clip": 1.03662586, + "balance_loss_mlp": 1.02171552, + "epoch": 0.772643920036074, + "flos": 33188645419200.0, + "grad_norm": 1.5180716472803644, + "language_loss": 0.80271721, + "learning_rate": 5.180527968188935e-07, + "loss": 0.82417727, + "num_input_tokens_seen": 277278235, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11590576, + "step": 12851, + "time_per_iteration": 2.668968677520752 + }, + { + "auxiliary_loss_clip": 0.01112707, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.03983235, + "balance_loss_mlp": 1.01673508, + "epoch": 0.7727040432887419, + "flos": 26331465981600.0, + "grad_norm": 1.5017428502248371, + "language_loss": 0.73681653, + "learning_rate": 5.177912880970474e-07, + "loss": 0.7582345, + "num_input_tokens_seen": 277298355, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12347412, + "step": 12852, + "time_per_iteration": 4.096977233886719 + }, + { + "auxiliary_loss_clip": 0.01109923, + "auxiliary_loss_mlp": 0.01033524, + "balance_loss_clip": 1.03686762, + "balance_loss_mlp": 1.02175224, + "epoch": 0.7727641665414099, + "flos": 27133310197440.0, + "grad_norm": 1.8575071000739478, + "language_loss": 0.82005894, + "learning_rate": 5.17529835580704e-07, + "loss": 0.84149337, + "num_input_tokens_seen": 277316095, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11767578, + "step": 12853, + "time_per_iteration": 3.92555832862854 + }, + { + "auxiliary_loss_clip": 0.01030789, + "auxiliary_loss_mlp": 0.01002026, + "balance_loss_clip": 1.00822651, + "balance_loss_mlp": 1.00101614, + "epoch": 0.7728242897940779, + "flos": 66904310279520.0, + "grad_norm": 0.807828859572879, + "language_loss": 0.54509121, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56541938, + "num_input_tokens_seen": 277380130, + "router_z_loss_clip": 0.22583008, + "router_z_loss_mlp": 0.01009369, + "step": 12854, + "time_per_iteration": 3.323899269104004 + }, + { + "auxiliary_loss_clip": 0.01116073, + "auxiliary_loss_mlp": 0.01028908, + "balance_loss_clip": 1.04016781, + "balance_loss_mlp": 1.01634967, + "epoch": 0.7728844130467458, + "flos": 42051347035200.0, + "grad_norm": 1.6693328440110695, + "language_loss": 0.71810943, + "learning_rate": 5.170070992041826e-07, + "loss": 0.73955929, + "num_input_tokens_seen": 277404015, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12567139, + "step": 12855, + "time_per_iteration": 2.737497091293335 + }, + { + "auxiliary_loss_clip": 0.01113145, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.03906333, + "balance_loss_mlp": 1.01638103, + "epoch": 0.7729445362994138, + "flos": 23081633359200.0, + "grad_norm": 3.387886441249091, + "language_loss": 0.67634225, + "learning_rate": 5.167458153638254e-07, + "loss": 0.69776392, + "num_input_tokens_seen": 277421375, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12646484, + "step": 12856, + "time_per_iteration": 2.6237964630126953 + }, + { + "auxiliary_loss_clip": 0.0111483, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.03936779, + "balance_loss_mlp": 1.02004886, + "epoch": 0.7730046595520818, + "flos": 27088666505280.0, + "grad_norm": 1.6916797954228053, + "language_loss": 0.79430401, + "learning_rate": 5.164845877686162e-07, + "loss": 0.81576943, + "num_input_tokens_seen": 277440170, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11657715, + "step": 12857, + "time_per_iteration": 2.6464364528656006 + }, + { + "auxiliary_loss_clip": 0.01113091, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.04048014, + "balance_loss_mlp": 1.02010465, + "epoch": 0.7730647828047498, + "flos": 16536108903840.0, + "grad_norm": 1.8594017010029655, + "language_loss": 0.78598583, + "learning_rate": 5.162234164284591e-07, + "loss": 0.80743659, + "num_input_tokens_seen": 277456880, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11877441, + "step": 12858, + "time_per_iteration": 2.635232925415039 + }, + { + "auxiliary_loss_clip": 0.01113173, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.03843403, + "balance_loss_mlp": 1.0177002, + "epoch": 0.7731249060574177, + "flos": 26815780209600.0, + "grad_norm": 15.526572480734607, + "language_loss": 0.77121902, + "learning_rate": 5.159623013532591e-07, + "loss": 0.79264522, + "num_input_tokens_seen": 277475365, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11730957, + "step": 12859, + "time_per_iteration": 2.624260663986206 + }, + { + "auxiliary_loss_clip": 0.01110284, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.04075539, + "balance_loss_mlp": 1.02116108, + "epoch": 0.7731850293100857, + "flos": 27578896256160.0, + "grad_norm": 1.7798654379624483, + "language_loss": 0.6775372, + "learning_rate": 5.157012425529186e-07, + "loss": 0.69895434, + "num_input_tokens_seen": 277494975, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10272217, + "step": 12860, + "time_per_iteration": 2.680062770843506 + }, + { + "auxiliary_loss_clip": 0.01114126, + "auxiliary_loss_mlp": 0.01037676, + "balance_loss_clip": 1.03752828, + "balance_loss_mlp": 1.02576661, + "epoch": 0.7732451525627536, + "flos": 17202603938400.0, + "grad_norm": 2.2646027991922764, + "language_loss": 0.74749017, + "learning_rate": 5.154402400373343e-07, + "loss": 0.76900816, + "num_input_tokens_seen": 277510520, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11914062, + "step": 12861, + "time_per_iteration": 2.5630176067352295 + }, + { + "auxiliary_loss_clip": 0.01116414, + "auxiliary_loss_mlp": 0.01027954, + "balance_loss_clip": 1.04024982, + "balance_loss_mlp": 1.01657581, + "epoch": 0.7733052758154216, + "flos": 26197859043360.0, + "grad_norm": 1.8477073161408069, + "language_loss": 0.7490989, + "learning_rate": 5.15179293816405e-07, + "loss": 0.77054262, + "num_input_tokens_seen": 277530505, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11376953, + "step": 12862, + "time_per_iteration": 2.7151927947998047 + }, + { + "auxiliary_loss_clip": 0.01110698, + "auxiliary_loss_mlp": 0.01032032, + "balance_loss_clip": 1.03865588, + "balance_loss_mlp": 1.02138102, + "epoch": 0.7733653990680895, + "flos": 26105694932160.0, + "grad_norm": 1.5242314060862387, + "language_loss": 0.8305583, + "learning_rate": 5.149184039000256e-07, + "loss": 0.85198563, + "num_input_tokens_seen": 277550810, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10662842, + "step": 12863, + "time_per_iteration": 2.6434929370880127 + }, + { + "auxiliary_loss_clip": 0.01111214, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.03860831, + "balance_loss_mlp": 1.02050209, + "epoch": 0.7734255223207576, + "flos": 21568123692000.0, + "grad_norm": 2.5507847199267104, + "language_loss": 0.73071122, + "learning_rate": 5.146575702980898e-07, + "loss": 0.75214159, + "num_input_tokens_seen": 277567680, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11322021, + "step": 12864, + "time_per_iteration": 2.6622002124786377 + }, + { + "auxiliary_loss_clip": 0.01112271, + "auxiliary_loss_mlp": 0.01030284, + "balance_loss_clip": 1.0388149, + "balance_loss_mlp": 1.01955509, + "epoch": 0.7734856455734255, + "flos": 30786840361440.0, + "grad_norm": 1.7749665300759359, + "language_loss": 0.82496595, + "learning_rate": 5.143967930204871e-07, + "loss": 0.8463915, + "num_input_tokens_seen": 277588970, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.10723877, + "step": 12865, + "time_per_iteration": 2.638113498687744 + }, + { + "auxiliary_loss_clip": 0.01119355, + "auxiliary_loss_mlp": 0.01034179, + "balance_loss_clip": 1.04197276, + "balance_loss_mlp": 1.02094054, + "epoch": 0.7735457688260935, + "flos": 28594032199200.0, + "grad_norm": 2.0178876613393495, + "language_loss": 0.71962547, + "learning_rate": 5.141360720771077e-07, + "loss": 0.74116075, + "num_input_tokens_seen": 277605450, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.13244629, + "step": 12866, + "time_per_iteration": 2.6407690048217773 + }, + { + "auxiliary_loss_clip": 0.01114829, + "auxiliary_loss_mlp": 0.01030424, + "balance_loss_clip": 1.04140139, + "balance_loss_mlp": 1.01840162, + "epoch": 0.7736058920787615, + "flos": 22853755411200.0, + "grad_norm": 2.833177585320623, + "language_loss": 0.64408362, + "learning_rate": 5.138754074778371e-07, + "loss": 0.66553617, + "num_input_tokens_seen": 277622530, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12017822, + "step": 12867, + "time_per_iteration": 2.591705083847046 + }, + { + "auxiliary_loss_clip": 0.01112268, + "auxiliary_loss_mlp": 0.01030491, + "balance_loss_clip": 1.03962862, + "balance_loss_mlp": 1.01904738, + "epoch": 0.7736660153314294, + "flos": 27934627688640.0, + "grad_norm": 1.8386813410322698, + "language_loss": 0.70959711, + "learning_rate": 5.136147992325595e-07, + "loss": 0.73102474, + "num_input_tokens_seen": 277642700, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11444092, + "step": 12868, + "time_per_iteration": 2.625385284423828 + }, + { + "auxiliary_loss_clip": 0.01115528, + "auxiliary_loss_mlp": 0.01028781, + "balance_loss_clip": 1.04096365, + "balance_loss_mlp": 1.01716387, + "epoch": 0.7737261385840974, + "flos": 16840065602880.0, + "grad_norm": 2.139995181056636, + "language_loss": 0.77932203, + "learning_rate": 5.133542473511578e-07, + "loss": 0.8007651, + "num_input_tokens_seen": 277660005, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.1161499, + "step": 12869, + "time_per_iteration": 2.6059958934783936 + }, + { + "auxiliary_loss_clip": 0.01107986, + "auxiliary_loss_mlp": 0.01027783, + "balance_loss_clip": 1.03698277, + "balance_loss_mlp": 1.01624334, + "epoch": 0.7737862618367654, + "flos": 35057927036160.0, + "grad_norm": 5.9490489268168885, + "language_loss": 0.73523474, + "learning_rate": 5.130937518435124e-07, + "loss": 0.75659239, + "num_input_tokens_seen": 277682890, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11541748, + "step": 12870, + "time_per_iteration": 2.6464712619781494 + }, + { + "auxiliary_loss_clip": 0.01114454, + "auxiliary_loss_mlp": 0.01031017, + "balance_loss_clip": 1.03999448, + "balance_loss_mlp": 1.01950693, + "epoch": 0.7738463850894334, + "flos": 20765874303360.0, + "grad_norm": 2.9242826695935755, + "language_loss": 0.75703132, + "learning_rate": 5.12833312719501e-07, + "loss": 0.77848595, + "num_input_tokens_seen": 277699330, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11505127, + "step": 12871, + "time_per_iteration": 2.6310181617736816 + }, + { + "auxiliary_loss_clip": 0.01111211, + "auxiliary_loss_mlp": 0.01031578, + "balance_loss_clip": 1.03884614, + "balance_loss_mlp": 1.02085531, + "epoch": 0.7739065083421013, + "flos": 25263096683040.0, + "grad_norm": 1.7422883090691392, + "language_loss": 0.69225764, + "learning_rate": 5.12572929988999e-07, + "loss": 0.71368557, + "num_input_tokens_seen": 277718750, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10723877, + "step": 12872, + "time_per_iteration": 2.6040635108947754 + }, + { + "auxiliary_loss_clip": 0.01112981, + "auxiliary_loss_mlp": 0.01032641, + "balance_loss_clip": 1.03912282, + "balance_loss_mlp": 1.02033222, + "epoch": 0.7739666315947693, + "flos": 25255438917120.0, + "grad_norm": 2.4633533713443163, + "language_loss": 0.85104209, + "learning_rate": 5.123126036618804e-07, + "loss": 0.87249833, + "num_input_tokens_seen": 277734645, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12298584, + "step": 12873, + "time_per_iteration": 2.622976064682007 + }, + { + "auxiliary_loss_clip": 0.01114077, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.03962398, + "balance_loss_mlp": 1.02141976, + "epoch": 0.7740267548474372, + "flos": 36082827643680.0, + "grad_norm": 2.346969667322097, + "language_loss": 0.65432698, + "learning_rate": 5.120523337480174e-07, + "loss": 0.67579818, + "num_input_tokens_seen": 277755535, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11633301, + "step": 12874, + "time_per_iteration": 2.6914353370666504 + }, + { + "auxiliary_loss_clip": 0.01112952, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.04044533, + "balance_loss_mlp": 1.0209043, + "epoch": 0.7740868781001052, + "flos": 28869146945280.0, + "grad_norm": 1.75932205587118, + "language_loss": 0.625404, + "learning_rate": 5.117921202572785e-07, + "loss": 0.64686126, + "num_input_tokens_seen": 277775585, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11859131, + "step": 12875, + "time_per_iteration": 2.6929924488067627 + }, + { + "auxiliary_loss_clip": 0.01112814, + "auxiliary_loss_mlp": 0.01028613, + "balance_loss_clip": 1.03834867, + "balance_loss_mlp": 1.01705587, + "epoch": 0.7741470013527731, + "flos": 30160451083680.0, + "grad_norm": 1.9579771900735512, + "language_loss": 0.65558606, + "learning_rate": 5.115319631995318e-07, + "loss": 0.67700034, + "num_input_tokens_seen": 277794795, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11560059, + "step": 12876, + "time_per_iteration": 2.6547207832336426 + }, + { + "auxiliary_loss_clip": 0.01110074, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.03850734, + "balance_loss_mlp": 1.02218986, + "epoch": 0.7742071246054412, + "flos": 26687926725120.0, + "grad_norm": 2.1313165414513118, + "language_loss": 0.71606171, + "learning_rate": 5.112718625846433e-07, + "loss": 0.7374981, + "num_input_tokens_seen": 277813235, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11376953, + "step": 12877, + "time_per_iteration": 2.68498158454895 + }, + { + "auxiliary_loss_clip": 0.01116845, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.04019928, + "balance_loss_mlp": 1.02030432, + "epoch": 0.7742672478581091, + "flos": 27667332777600.0, + "grad_norm": 2.1522901300895367, + "language_loss": 0.82582325, + "learning_rate": 5.110118184224736e-07, + "loss": 0.84731764, + "num_input_tokens_seen": 277832560, + "router_z_loss_clip": 0.76708984, + "router_z_loss_mlp": 0.1229248, + "step": 12878, + "time_per_iteration": 2.597754955291748 + }, + { + "auxiliary_loss_clip": 0.01114781, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.04058993, + "balance_loss_mlp": 1.01978159, + "epoch": 0.7743273711107771, + "flos": 22989793386240.0, + "grad_norm": 1.8606037563863165, + "language_loss": 0.73184693, + "learning_rate": 5.10751830722885e-07, + "loss": 0.75331479, + "num_input_tokens_seen": 277850120, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12219238, + "step": 12879, + "time_per_iteration": 2.6428120136260986 + }, + { + "auxiliary_loss_clip": 0.01107554, + "auxiliary_loss_mlp": 0.01028259, + "balance_loss_clip": 1.03755045, + "balance_loss_mlp": 1.0170598, + "epoch": 0.7743874943634451, + "flos": 35056346862240.0, + "grad_norm": 1.6984618319218918, + "language_loss": 0.79447699, + "learning_rate": 5.104918994957364e-07, + "loss": 0.81583512, + "num_input_tokens_seen": 277871020, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.11193848, + "step": 12880, + "time_per_iteration": 4.2473530769348145 + }, + { + "auxiliary_loss_clip": 0.01110969, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.03926301, + "balance_loss_mlp": 1.01993847, + "epoch": 0.774447617616113, + "flos": 26734150591200.0, + "grad_norm": 1.5678931581477649, + "language_loss": 0.70228374, + "learning_rate": 5.102320247508847e-07, + "loss": 0.72370547, + "num_input_tokens_seen": 277891525, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.1126709, + "step": 12881, + "time_per_iteration": 2.7093029022216797 + }, + { + "auxiliary_loss_clip": 0.01116616, + "auxiliary_loss_mlp": 0.01039037, + "balance_loss_clip": 1.03967428, + "balance_loss_mlp": 1.02641845, + "epoch": 0.774507740868781, + "flos": 23794595363520.0, + "grad_norm": 3.3389175998464427, + "language_loss": 0.84451967, + "learning_rate": 5.099722064981832e-07, + "loss": 0.86607623, + "num_input_tokens_seen": 277910425, + "router_z_loss_clip": 0.76953125, + "router_z_loss_mlp": 0.1262207, + "step": 12882, + "time_per_iteration": 2.6543562412261963 + }, + { + "auxiliary_loss_clip": 0.01031453, + "auxiliary_loss_mlp": 0.01001927, + "balance_loss_clip": 1.00887156, + "balance_loss_mlp": 1.00088835, + "epoch": 0.774567864121449, + "flos": 72510939612000.0, + "grad_norm": 0.7738357664730096, + "language_loss": 0.60444844, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62478226, + "num_input_tokens_seen": 277972795, + "router_z_loss_clip": 0.22607422, + "router_z_loss_mlp": 0.01038361, + "step": 12883, + "time_per_iteration": 3.2005841732025146 + }, + { + "auxiliary_loss_clip": 0.01114733, + "auxiliary_loss_mlp": 0.01033379, + "balance_loss_clip": 1.03996122, + "balance_loss_mlp": 1.02124977, + "epoch": 0.774627987374117, + "flos": 16136949297600.0, + "grad_norm": 1.8666677804275675, + "language_loss": 0.72833937, + "learning_rate": 5.094527395086416e-07, + "loss": 0.74982047, + "num_input_tokens_seen": 277990675, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12121582, + "step": 12884, + "time_per_iteration": 3.9938883781433105 + }, + { + "auxiliary_loss_clip": 0.01111453, + "auxiliary_loss_mlp": 0.01031731, + "balance_loss_clip": 1.03937459, + "balance_loss_mlp": 1.02140713, + "epoch": 0.7746881106267849, + "flos": 26105411311200.0, + "grad_norm": 1.8052511434029366, + "language_loss": 0.80886519, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83029711, + "num_input_tokens_seen": 278010050, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10314941, + "step": 12885, + "time_per_iteration": 2.7161426544189453 + }, + { + "auxiliary_loss_clip": 0.01109706, + "auxiliary_loss_mlp": 0.01030546, + "balance_loss_clip": 1.03806114, + "balance_loss_mlp": 1.02012098, + "epoch": 0.7747482338794529, + "flos": 31273383039840.0, + "grad_norm": 1.6785601787460014, + "language_loss": 0.63971162, + "learning_rate": 5.089334986059029e-07, + "loss": 0.66111416, + "num_input_tokens_seen": 278030660, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10424805, + "step": 12886, + "time_per_iteration": 2.6581509113311768 + }, + { + "auxiliary_loss_clip": 0.0111312, + "auxiliary_loss_mlp": 0.01033344, + "balance_loss_clip": 1.03756464, + "balance_loss_mlp": 1.02253723, + "epoch": 0.7748083571321208, + "flos": 14088849808320.0, + "grad_norm": 2.3894151287018586, + "language_loss": 0.69571614, + "learning_rate": 5.086739629616987e-07, + "loss": 0.71718085, + "num_input_tokens_seen": 278047645, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.10803223, + "step": 12887, + "time_per_iteration": 2.6201887130737305 + }, + { + "auxiliary_loss_clip": 0.01109895, + "auxiliary_loss_mlp": 0.01029141, + "balance_loss_clip": 1.03818583, + "balance_loss_mlp": 1.01835239, + "epoch": 0.7748684803847888, + "flos": 23259883989600.0, + "grad_norm": 1.8556550765278421, + "language_loss": 0.70554441, + "learning_rate": 5.084144838687275e-07, + "loss": 0.72693479, + "num_input_tokens_seen": 278066170, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.10784912, + "step": 12888, + "time_per_iteration": 2.5772640705108643 + }, + { + "auxiliary_loss_clip": 0.0111314, + "auxiliary_loss_mlp": 0.01029027, + "balance_loss_clip": 1.03815293, + "balance_loss_mlp": 1.01730263, + "epoch": 0.7749286036374567, + "flos": 27177872855040.0, + "grad_norm": 1.7385228690678076, + "language_loss": 0.81630707, + "learning_rate": 5.081550613368279e-07, + "loss": 0.83772886, + "num_input_tokens_seen": 278085545, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11737061, + "step": 12889, + "time_per_iteration": 2.6688735485076904 + }, + { + "auxiliary_loss_clip": 0.01113312, + "auxiliary_loss_mlp": 0.01030893, + "balance_loss_clip": 1.0408299, + "balance_loss_mlp": 1.01985991, + "epoch": 0.7749887268901248, + "flos": 24638935855680.0, + "grad_norm": 10.267878169477436, + "language_loss": 0.79401112, + "learning_rate": 5.07895695375838e-07, + "loss": 0.81545317, + "num_input_tokens_seen": 278102995, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11029053, + "step": 12890, + "time_per_iteration": 2.588258743286133 + }, + { + "auxiliary_loss_clip": 0.011172, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.0421859, + "balance_loss_mlp": 1.01726699, + "epoch": 0.7750488501427927, + "flos": 24816052002240.0, + "grad_norm": 2.072821639050862, + "language_loss": 0.66425157, + "learning_rate": 5.076363859955932e-07, + "loss": 0.68571329, + "num_input_tokens_seen": 278121460, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11712646, + "step": 12891, + "time_per_iteration": 4.083876371383667 + }, + { + "auxiliary_loss_clip": 0.01113617, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.03947282, + "balance_loss_mlp": 1.01810849, + "epoch": 0.7751089733954607, + "flos": 34610639251680.0, + "grad_norm": 1.6009812919167727, + "language_loss": 0.78415883, + "learning_rate": 5.073771332059257e-07, + "loss": 0.8055948, + "num_input_tokens_seen": 278143905, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11877441, + "step": 12892, + "time_per_iteration": 4.155648469924927 + }, + { + "auxiliary_loss_clip": 0.01116993, + "auxiliary_loss_mlp": 0.0102873, + "balance_loss_clip": 1.04158902, + "balance_loss_mlp": 1.01687455, + "epoch": 0.7751690966481286, + "flos": 20674804158720.0, + "grad_norm": 4.979076969310762, + "language_loss": 0.67651534, + "learning_rate": 5.071179370166669e-07, + "loss": 0.69797254, + "num_input_tokens_seen": 278160850, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11859131, + "step": 12893, + "time_per_iteration": 2.6142325401306152 + }, + { + "auxiliary_loss_clip": 0.01031543, + "auxiliary_loss_mlp": 0.01001097, + "balance_loss_clip": 1.00906968, + "balance_loss_mlp": 1.00002992, + "epoch": 0.7752292199007966, + "flos": 80127061816320.0, + "grad_norm": 0.7995900037936216, + "language_loss": 0.5847801, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60510653, + "num_input_tokens_seen": 278219950, + "router_z_loss_clip": 0.22473145, + "router_z_loss_mlp": 0.01066589, + "step": 12894, + "time_per_iteration": 3.3087096214294434 + }, + { + "auxiliary_loss_clip": 0.0111454, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.04006815, + "balance_loss_mlp": 1.02136064, + "epoch": 0.7752893431534646, + "flos": 25129651813920.0, + "grad_norm": 2.624245338573762, + "language_loss": 0.78373349, + "learning_rate": 5.065997144786895e-07, + "loss": 0.80521053, + "num_input_tokens_seen": 278237805, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11816406, + "step": 12895, + "time_per_iteration": 2.6736319065093994 + }, + { + "auxiliary_loss_clip": 0.01113794, + "auxiliary_loss_mlp": 0.01027605, + "balance_loss_clip": 1.04115939, + "balance_loss_mlp": 1.01567769, + "epoch": 0.7753494664061326, + "flos": 24996247462080.0, + "grad_norm": 1.8598390303156396, + "language_loss": 0.6769011, + "learning_rate": 5.063406881496209e-07, + "loss": 0.69831514, + "num_input_tokens_seen": 278257660, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11920166, + "step": 12896, + "time_per_iteration": 2.60959529876709 + }, + { + "auxiliary_loss_clip": 0.01111366, + "auxiliary_loss_mlp": 0.01035222, + "balance_loss_clip": 1.03899074, + "balance_loss_mlp": 1.02479172, + "epoch": 0.7754095896588006, + "flos": 24728506860960.0, + "grad_norm": 1.8159548567665638, + "language_loss": 0.69197643, + "learning_rate": 5.060817184602629e-07, + "loss": 0.71344233, + "num_input_tokens_seen": 278275110, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10437012, + "step": 12897, + "time_per_iteration": 2.6526060104370117 + }, + { + "auxiliary_loss_clip": 0.01115134, + "auxiliary_loss_mlp": 0.01035572, + "balance_loss_clip": 1.0411973, + "balance_loss_mlp": 1.02293563, + "epoch": 0.7754697129114685, + "flos": 28468852855200.0, + "grad_norm": 2.5822730219664702, + "language_loss": 0.74752176, + "learning_rate": 5.058228054204364e-07, + "loss": 0.76902884, + "num_input_tokens_seen": 278293035, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12634277, + "step": 12898, + "time_per_iteration": 2.6314990520477295 + }, + { + "auxiliary_loss_clip": 0.01113142, + "auxiliary_loss_mlp": 0.0102841, + "balance_loss_clip": 1.03853905, + "balance_loss_mlp": 1.01603055, + "epoch": 0.7755298361641365, + "flos": 21167343394560.0, + "grad_norm": 1.95143958292196, + "language_loss": 0.70079982, + "learning_rate": 5.055639490399588e-07, + "loss": 0.72221535, + "num_input_tokens_seen": 278311010, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12390137, + "step": 12899, + "time_per_iteration": 2.6187360286712646 + }, + { + "auxiliary_loss_clip": 0.0111228, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.03953719, + "balance_loss_mlp": 1.0217936, + "epoch": 0.7755899594168044, + "flos": 23971873579200.0, + "grad_norm": 16.000449311890304, + "language_loss": 0.74870408, + "learning_rate": 5.053051493286453e-07, + "loss": 0.77015948, + "num_input_tokens_seen": 278329900, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11468506, + "step": 12900, + "time_per_iteration": 2.6004669666290283 + }, + { + "auxiliary_loss_clip": 0.01107876, + "auxiliary_loss_mlp": 0.01033064, + "balance_loss_clip": 1.03644586, + "balance_loss_mlp": 1.0229249, + "epoch": 0.7756500826694724, + "flos": 33451969636800.0, + "grad_norm": 2.245703450262887, + "language_loss": 0.77380717, + "learning_rate": 5.050464062963113e-07, + "loss": 0.79521656, + "num_input_tokens_seen": 278349980, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10144043, + "step": 12901, + "time_per_iteration": 2.7014830112457275 + }, + { + "auxiliary_loss_clip": 0.01112467, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.04033744, + "balance_loss_mlp": 1.01729369, + "epoch": 0.7757102059221404, + "flos": 35057683932480.0, + "grad_norm": 1.4821528510349116, + "language_loss": 0.76871336, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79012597, + "num_input_tokens_seen": 278372485, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11499023, + "step": 12902, + "time_per_iteration": 2.6772589683532715 + }, + { + "auxiliary_loss_clip": 0.01111718, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.03933036, + "balance_loss_mlp": 1.01889408, + "epoch": 0.7757703291748084, + "flos": 27439049656800.0, + "grad_norm": 1.6714552600422914, + "language_loss": 0.73216051, + "learning_rate": 5.045290903078215e-07, + "loss": 0.7535814, + "num_input_tokens_seen": 278391660, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11468506, + "step": 12903, + "time_per_iteration": 2.6507067680358887 + }, + { + "auxiliary_loss_clip": 0.01112234, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.04021049, + "balance_loss_mlp": 1.01556563, + "epoch": 0.7758304524274763, + "flos": 26149528278720.0, + "grad_norm": 2.1521774137618053, + "language_loss": 0.75814164, + "learning_rate": 5.042705173712835e-07, + "loss": 0.77953315, + "num_input_tokens_seen": 278409125, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11358643, + "step": 12904, + "time_per_iteration": 2.6295480728149414 + }, + { + "auxiliary_loss_clip": 0.01107292, + "auxiliary_loss_mlp": 0.01023741, + "balance_loss_clip": 1.03772593, + "balance_loss_mlp": 1.01333404, + "epoch": 0.7758905756801443, + "flos": 28869309014400.0, + "grad_norm": 2.3896944606693413, + "language_loss": 0.68305737, + "learning_rate": 5.040120011529576e-07, + "loss": 0.7043677, + "num_input_tokens_seen": 278429450, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10394287, + "step": 12905, + "time_per_iteration": 2.64355206489563 + }, + { + "auxiliary_loss_clip": 0.0111084, + "auxiliary_loss_mlp": 0.01029419, + "balance_loss_clip": 1.04056966, + "balance_loss_mlp": 1.01789117, + "epoch": 0.7759506989328122, + "flos": 34612259942880.0, + "grad_norm": 1.646555553039813, + "language_loss": 0.66989601, + "learning_rate": 5.037535416626459e-07, + "loss": 0.6912986, + "num_input_tokens_seen": 278449925, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11529541, + "step": 12906, + "time_per_iteration": 2.728447914123535 + }, + { + "auxiliary_loss_clip": 0.0111223, + "auxiliary_loss_mlp": 0.01028845, + "balance_loss_clip": 1.03961456, + "balance_loss_mlp": 1.0177412, + "epoch": 0.7760108221854802, + "flos": 18183995337600.0, + "grad_norm": 5.04398591674407, + "language_loss": 0.81393439, + "learning_rate": 5.034951389101498e-07, + "loss": 0.83534515, + "num_input_tokens_seen": 278467255, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11102295, + "step": 12907, + "time_per_iteration": 2.5862109661102295 + }, + { + "auxiliary_loss_clip": 0.01109764, + "auxiliary_loss_mlp": 0.01032697, + "balance_loss_clip": 1.04001164, + "balance_loss_mlp": 1.02183676, + "epoch": 0.7760709454381483, + "flos": 18049497019200.0, + "grad_norm": 3.0697124177669806, + "language_loss": 0.67415559, + "learning_rate": 5.032367929052685e-07, + "loss": 0.69558024, + "num_input_tokens_seen": 278484250, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10852051, + "step": 12908, + "time_per_iteration": 2.634169101715088 + }, + { + "auxiliary_loss_clip": 0.01116752, + "auxiliary_loss_mlp": 0.01035112, + "balance_loss_clip": 1.04182875, + "balance_loss_mlp": 1.02368569, + "epoch": 0.7761310686908162, + "flos": 21207043978560.0, + "grad_norm": 1.74665699700435, + "language_loss": 0.70542854, + "learning_rate": 5.029785036577976e-07, + "loss": 0.72694719, + "num_input_tokens_seen": 278502740, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11425781, + "step": 12909, + "time_per_iteration": 2.616698741912842 + }, + { + "auxiliary_loss_clip": 0.01110739, + "auxiliary_loss_mlp": 0.010312, + "balance_loss_clip": 1.03924751, + "balance_loss_mlp": 1.02053058, + "epoch": 0.7761911919434842, + "flos": 31184541345600.0, + "grad_norm": 1.5125665438298646, + "language_loss": 0.67913127, + "learning_rate": 5.027202711775324e-07, + "loss": 0.70055068, + "num_input_tokens_seen": 278523890, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10681152, + "step": 12910, + "time_per_iteration": 2.6568541526794434 + }, + { + "auxiliary_loss_clip": 0.01113534, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.04066873, + "balance_loss_mlp": 1.02224946, + "epoch": 0.7762513151961521, + "flos": 28284686701920.0, + "grad_norm": 1.7583601772370372, + "language_loss": 0.7174437, + "learning_rate": 5.024620954742646e-07, + "loss": 0.73890549, + "num_input_tokens_seen": 278543185, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10388184, + "step": 12911, + "time_per_iteration": 2.6377217769622803 + }, + { + "auxiliary_loss_clip": 0.01116545, + "auxiliary_loss_mlp": 0.01032159, + "balance_loss_clip": 1.04207313, + "balance_loss_mlp": 1.02045846, + "epoch": 0.7763114384488201, + "flos": 26465761713600.0, + "grad_norm": 2.6557911030913406, + "language_loss": 0.63518339, + "learning_rate": 5.022039765577836e-07, + "loss": 0.65667045, + "num_input_tokens_seen": 278559220, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11700439, + "step": 12912, + "time_per_iteration": 2.6479697227478027 + }, + { + "auxiliary_loss_clip": 0.01031539, + "auxiliary_loss_mlp": 0.01001818, + "balance_loss_clip": 1.00890684, + "balance_loss_mlp": 1.00076675, + "epoch": 0.776371561701488, + "flos": 84224071140480.0, + "grad_norm": 0.7709072774658855, + "language_loss": 0.53251094, + "learning_rate": 5.019459144378779e-07, + "loss": 0.55284452, + "num_input_tokens_seen": 278618185, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 0.01051331, + "step": 12913, + "time_per_iteration": 3.3074002265930176 + }, + { + "auxiliary_loss_clip": 0.01115987, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.04196334, + "balance_loss_mlp": 1.02164173, + "epoch": 0.776431684954156, + "flos": 27935316482400.0, + "grad_norm": 1.8119067866100775, + "language_loss": 0.62080753, + "learning_rate": 5.016879091243338e-07, + "loss": 0.64229637, + "num_input_tokens_seen": 278636210, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.1126709, + "step": 12914, + "time_per_iteration": 2.6506593227386475 + }, + { + "auxiliary_loss_clip": 0.01110534, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.0383538, + "balance_loss_mlp": 1.01772881, + "epoch": 0.776491808206824, + "flos": 24723604270080.0, + "grad_norm": 2.4690664004335745, + "language_loss": 0.8237009, + "learning_rate": 5.014299606269339e-07, + "loss": 0.84510094, + "num_input_tokens_seen": 278653305, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11755371, + "step": 12915, + "time_per_iteration": 2.587794780731201 + }, + { + "auxiliary_loss_clip": 0.01114901, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.03944027, + "balance_loss_mlp": 1.02045584, + "epoch": 0.776551931459492, + "flos": 32652070250400.0, + "grad_norm": 1.9771092067166802, + "language_loss": 0.74876934, + "learning_rate": 5.011720689554603e-07, + "loss": 0.77024394, + "num_input_tokens_seen": 278671850, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12103271, + "step": 12916, + "time_per_iteration": 2.6860499382019043 + }, + { + "auxiliary_loss_clip": 0.01111912, + "auxiliary_loss_mlp": 0.01030974, + "balance_loss_clip": 1.03896892, + "balance_loss_mlp": 1.01953042, + "epoch": 0.7766120547121599, + "flos": 64264491603360.0, + "grad_norm": 1.56902090202544, + "language_loss": 0.65472615, + "learning_rate": 5.009142341196919e-07, + "loss": 0.67615497, + "num_input_tokens_seen": 278697860, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11444092, + "step": 12917, + "time_per_iteration": 2.9225316047668457 + }, + { + "auxiliary_loss_clip": 0.01111162, + "auxiliary_loss_mlp": 0.01034581, + "balance_loss_clip": 1.03781009, + "balance_loss_mlp": 1.02322662, + "epoch": 0.7766721779648279, + "flos": 30695932285920.0, + "grad_norm": 1.625964131319727, + "language_loss": 0.64615273, + "learning_rate": 5.006564561294065e-07, + "loss": 0.66761005, + "num_input_tokens_seen": 278720655, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11358643, + "step": 12918, + "time_per_iteration": 2.6559345722198486 + }, + { + "auxiliary_loss_clip": 0.01110236, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.03810287, + "balance_loss_mlp": 1.0234313, + "epoch": 0.7767323012174958, + "flos": 28993921116480.0, + "grad_norm": 2.585785345505732, + "language_loss": 0.73248303, + "learning_rate": 5.003987349943777e-07, + "loss": 0.75393033, + "num_input_tokens_seen": 278737375, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11065674, + "step": 12919, + "time_per_iteration": 4.101375102996826 + }, + { + "auxiliary_loss_clip": 0.01113928, + "auxiliary_loss_mlp": 0.01032143, + "balance_loss_clip": 1.03983593, + "balance_loss_mlp": 1.02031136, + "epoch": 0.7767924244701638, + "flos": 26950643183520.0, + "grad_norm": 2.0526711987625887, + "language_loss": 0.79124832, + "learning_rate": 5.001410707243792e-07, + "loss": 0.81270909, + "num_input_tokens_seen": 278756510, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11834717, + "step": 12920, + "time_per_iteration": 2.621029853820801 + }, + { + "auxiliary_loss_clip": 0.01113957, + "auxiliary_loss_mlp": 0.01030233, + "balance_loss_clip": 1.04024148, + "balance_loss_mlp": 1.01878881, + "epoch": 0.7768525477228319, + "flos": 26822100905280.0, + "grad_norm": 1.8021205711452246, + "language_loss": 0.7120657, + "learning_rate": 4.998834633291829e-07, + "loss": 0.73350763, + "num_input_tokens_seen": 278775410, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11456299, + "step": 12921, + "time_per_iteration": 2.652796506881714 + }, + { + "auxiliary_loss_clip": 0.01116647, + "auxiliary_loss_mlp": 0.01031016, + "balance_loss_clip": 1.04100204, + "balance_loss_mlp": 1.01874328, + "epoch": 0.7769126709754998, + "flos": 26594263474560.0, + "grad_norm": 1.646426007355759, + "language_loss": 0.75972658, + "learning_rate": 4.996259128185547e-07, + "loss": 0.78120315, + "num_input_tokens_seen": 278794260, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12286377, + "step": 12922, + "time_per_iteration": 2.632719039916992 + }, + { + "auxiliary_loss_clip": 0.01114265, + "auxiliary_loss_mlp": 0.01036675, + "balance_loss_clip": 1.04068065, + "balance_loss_mlp": 1.02532053, + "epoch": 0.7769727942281678, + "flos": 24462508502880.0, + "grad_norm": 2.0382711188742215, + "language_loss": 0.80401981, + "learning_rate": 4.993684192022625e-07, + "loss": 0.82552922, + "num_input_tokens_seen": 278813290, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11364746, + "step": 12923, + "time_per_iteration": 2.623769760131836 + }, + { + "auxiliary_loss_clip": 0.01113708, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.04117227, + "balance_loss_mlp": 1.02645659, + "epoch": 0.7770329174808357, + "flos": 26462560848480.0, + "grad_norm": 1.987380417173942, + "language_loss": 0.92281246, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94432408, + "num_input_tokens_seen": 278830610, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11010742, + "step": 12924, + "time_per_iteration": 3.9762935638427734 + }, + { + "auxiliary_loss_clip": 0.01110387, + "auxiliary_loss_mlp": 0.0102842, + "balance_loss_clip": 1.03695571, + "balance_loss_mlp": 1.01691008, + "epoch": 0.7770930407335037, + "flos": 31541650365600.0, + "grad_norm": 2.208212128463859, + "language_loss": 0.65746689, + "learning_rate": 4.988536026917401e-07, + "loss": 0.67885494, + "num_input_tokens_seen": 278849530, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11517334, + "step": 12925, + "time_per_iteration": 2.6323962211608887 + }, + { + "auxiliary_loss_clip": 0.0111478, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.04034996, + "balance_loss_mlp": 1.02348387, + "epoch": 0.7771531639861716, + "flos": 29710043468640.0, + "grad_norm": 1.693995574953328, + "language_loss": 0.71781808, + "learning_rate": 4.985962798170314e-07, + "loss": 0.73931515, + "num_input_tokens_seen": 278869005, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11444092, + "step": 12926, + "time_per_iteration": 2.7106964588165283 + }, + { + "auxiliary_loss_clip": 0.01114992, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.03999615, + "balance_loss_mlp": 1.01818871, + "epoch": 0.7772132872388396, + "flos": 31274071833600.0, + "grad_norm": 1.8788046451959297, + "language_loss": 0.6534977, + "learning_rate": 4.983390138757027e-07, + "loss": 0.67494857, + "num_input_tokens_seen": 278888790, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11895752, + "step": 12927, + "time_per_iteration": 2.6861588954925537 + }, + { + "auxiliary_loss_clip": 0.01113045, + "auxiliary_loss_mlp": 0.01036812, + "balance_loss_clip": 1.03997207, + "balance_loss_mlp": 1.02473617, + "epoch": 0.7772734104915076, + "flos": 31807932344640.0, + "grad_norm": 1.9750110135495185, + "language_loss": 0.72370529, + "learning_rate": 4.980818048775093e-07, + "loss": 0.74520385, + "num_input_tokens_seen": 278908150, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12072754, + "step": 12928, + "time_per_iteration": 2.721668243408203 + }, + { + "auxiliary_loss_clip": 0.01108607, + "auxiliary_loss_mlp": 0.0102955, + "balance_loss_clip": 1.03711915, + "balance_loss_mlp": 1.0180459, + "epoch": 0.7773335337441756, + "flos": 27973801548000.0, + "grad_norm": 1.8964907800421957, + "language_loss": 0.74213076, + "learning_rate": 4.978246528322036e-07, + "loss": 0.76351225, + "num_input_tokens_seen": 278927425, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.1149292, + "step": 12929, + "time_per_iteration": 2.641303777694702 + }, + { + "auxiliary_loss_clip": 0.01111738, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.03896856, + "balance_loss_mlp": 1.0198102, + "epoch": 0.7773936569968435, + "flos": 25351897860000.0, + "grad_norm": 5.397409317119394, + "language_loss": 0.77617288, + "learning_rate": 4.975675577495377e-07, + "loss": 0.79760063, + "num_input_tokens_seen": 278946475, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11224365, + "step": 12930, + "time_per_iteration": 2.725454568862915 + }, + { + "auxiliary_loss_clip": 0.01113651, + "auxiliary_loss_mlp": 0.01030286, + "balance_loss_clip": 1.04130816, + "balance_loss_mlp": 1.0187943, + "epoch": 0.7774537802495115, + "flos": 24858588795840.0, + "grad_norm": 1.950886409743559, + "language_loss": 0.794927, + "learning_rate": 4.973105196392613e-07, + "loss": 0.81636631, + "num_input_tokens_seen": 278964345, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11499023, + "step": 12931, + "time_per_iteration": 4.009785890579224 + }, + { + "auxiliary_loss_clip": 0.01030982, + "auxiliary_loss_mlp": 0.01002181, + "balance_loss_clip": 1.00844371, + "balance_loss_mlp": 1.00117111, + "epoch": 0.7775139035021794, + "flos": 65782221418080.0, + "grad_norm": 0.8080125688180095, + "language_loss": 0.59724605, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61757767, + "num_input_tokens_seen": 279022380, + "router_z_loss_clip": 0.22546387, + "router_z_loss_mlp": 0.01009369, + "step": 12932, + "time_per_iteration": 4.554304361343384 + }, + { + "auxiliary_loss_clip": 0.01114332, + "auxiliary_loss_mlp": 0.01029953, + "balance_loss_clip": 1.040434, + "balance_loss_mlp": 1.01930737, + "epoch": 0.7775740267548474, + "flos": 35194086563040.0, + "grad_norm": 1.5603690843036944, + "language_loss": 0.7596817, + "learning_rate": 4.967966143748595e-07, + "loss": 0.78112453, + "num_input_tokens_seen": 279044275, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.10644531, + "step": 12933, + "time_per_iteration": 2.678609848022461 + }, + { + "auxiliary_loss_clip": 0.01112432, + "auxiliary_loss_mlp": 0.01031834, + "balance_loss_clip": 1.03936636, + "balance_loss_mlp": 1.01984143, + "epoch": 0.7776341500075155, + "flos": 26688858622560.0, + "grad_norm": 2.6687635380329904, + "language_loss": 0.73453677, + "learning_rate": 4.965397472402215e-07, + "loss": 0.75597942, + "num_input_tokens_seen": 279063375, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11993408, + "step": 12934, + "time_per_iteration": 2.65728759765625 + }, + { + "auxiliary_loss_clip": 0.01112345, + "auxiliary_loss_mlp": 0.01028228, + "balance_loss_clip": 1.0387696, + "balance_loss_mlp": 1.01599741, + "epoch": 0.7776942732601834, + "flos": 24684470928000.0, + "grad_norm": 1.957309937070455, + "language_loss": 0.70373559, + "learning_rate": 4.962829371169475e-07, + "loss": 0.72514141, + "num_input_tokens_seen": 279082680, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12237549, + "step": 12935, + "time_per_iteration": 2.594506025314331 + }, + { + "auxiliary_loss_clip": 0.0111362, + "auxiliary_loss_mlp": 0.01038276, + "balance_loss_clip": 1.03943348, + "balance_loss_mlp": 1.02661157, + "epoch": 0.7777543965128514, + "flos": 27127880881920.0, + "grad_norm": 1.6107326975058678, + "language_loss": 0.83754164, + "learning_rate": 4.960261840147746e-07, + "loss": 0.85906059, + "num_input_tokens_seen": 279099805, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11682129, + "step": 12936, + "time_per_iteration": 2.620805501937866 + }, + { + "auxiliary_loss_clip": 0.01115984, + "auxiliary_loss_mlp": 0.01028829, + "balance_loss_clip": 1.03955746, + "balance_loss_mlp": 1.01788604, + "epoch": 0.7778145197655193, + "flos": 17694819036000.0, + "grad_norm": 2.3529232504281774, + "language_loss": 0.67501658, + "learning_rate": 4.957694879434397e-07, + "loss": 0.69646478, + "num_input_tokens_seen": 279117975, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.10949707, + "step": 12937, + "time_per_iteration": 2.6178464889526367 + }, + { + "auxiliary_loss_clip": 0.01113331, + "auxiliary_loss_mlp": 0.01027718, + "balance_loss_clip": 1.03896856, + "balance_loss_mlp": 1.01641095, + "epoch": 0.7778746430181873, + "flos": 25795620123840.0, + "grad_norm": 1.5200968698686164, + "language_loss": 0.8720026, + "learning_rate": 4.955128489126777e-07, + "loss": 0.89341307, + "num_input_tokens_seen": 279137255, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11303711, + "step": 12938, + "time_per_iteration": 2.6366186141967773 + }, + { + "auxiliary_loss_clip": 0.01114118, + "auxiliary_loss_mlp": 0.01028632, + "balance_loss_clip": 1.03995776, + "balance_loss_mlp": 1.01715171, + "epoch": 0.7779347662708552, + "flos": 24729033585600.0, + "grad_norm": 2.301895275772722, + "language_loss": 0.8495447, + "learning_rate": 4.95256266932218e-07, + "loss": 0.87097222, + "num_input_tokens_seen": 279154500, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11480713, + "step": 12939, + "time_per_iteration": 2.62800931930542 + }, + { + "auxiliary_loss_clip": 0.01109133, + "auxiliary_loss_mlp": 0.0103014, + "balance_loss_clip": 1.03896189, + "balance_loss_mlp": 1.0192802, + "epoch": 0.7779948895235232, + "flos": 23438701861920.0, + "grad_norm": 8.251709366530502, + "language_loss": 0.69194424, + "learning_rate": 4.949997420117915e-07, + "loss": 0.713337, + "num_input_tokens_seen": 279173635, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10858154, + "step": 12940, + "time_per_iteration": 2.694077968597412 + }, + { + "auxiliary_loss_clip": 0.01112016, + "auxiliary_loss_mlp": 0.01025518, + "balance_loss_clip": 1.03743577, + "balance_loss_mlp": 1.0146935, + "epoch": 0.7780550127761912, + "flos": 29181045031200.0, + "grad_norm": 1.718495868583865, + "language_loss": 0.78030837, + "learning_rate": 4.947432741611255e-07, + "loss": 0.80168366, + "num_input_tokens_seen": 279194430, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.10827637, + "step": 12941, + "time_per_iteration": 2.69014573097229 + }, + { + "auxiliary_loss_clip": 0.01116663, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.040223, + "balance_loss_mlp": 1.02103221, + "epoch": 0.7781151360288592, + "flos": 39554906312160.0, + "grad_norm": 2.1602799778751716, + "language_loss": 0.72852027, + "learning_rate": 4.944868633899462e-07, + "loss": 0.75002122, + "num_input_tokens_seen": 279212920, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.1239624, + "step": 12942, + "time_per_iteration": 2.6792969703674316 + }, + { + "auxiliary_loss_clip": 0.01110364, + "auxiliary_loss_mlp": 0.01033852, + "balance_loss_clip": 1.0379523, + "balance_loss_mlp": 1.02275991, + "epoch": 0.7781752592815271, + "flos": 27267200756640.0, + "grad_norm": 6.533712972797739, + "language_loss": 0.67911547, + "learning_rate": 4.942305097079751e-07, + "loss": 0.70055765, + "num_input_tokens_seen": 279232310, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11090088, + "step": 12943, + "time_per_iteration": 2.6795613765716553 + }, + { + "auxiliary_loss_clip": 0.01030979, + "auxiliary_loss_mlp": 0.01001719, + "balance_loss_clip": 1.00839925, + "balance_loss_mlp": 1.00073171, + "epoch": 0.7782353825341951, + "flos": 85976999830080.0, + "grad_norm": 0.7726639969082488, + "language_loss": 0.58492029, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60524726, + "num_input_tokens_seen": 279295375, + "router_z_loss_clip": 0.22595215, + "router_z_loss_mlp": 0.00986481, + "step": 12944, + "time_per_iteration": 3.398879051208496 + }, + { + "auxiliary_loss_clip": 0.0111576, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.04037142, + "balance_loss_mlp": 1.02152252, + "epoch": 0.778295505786863, + "flos": 23260046058720.0, + "grad_norm": 2.2210095723459577, + "language_loss": 0.67825097, + "learning_rate": 4.937179736505428e-07, + "loss": 0.69974554, + "num_input_tokens_seen": 279313660, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12182617, + "step": 12945, + "time_per_iteration": 2.6570918560028076 + }, + { + "auxiliary_loss_clip": 0.01113917, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.04053628, + "balance_loss_mlp": 1.02139795, + "epoch": 0.778355629039531, + "flos": 25623609154560.0, + "grad_norm": 2.9943069786750867, + "language_loss": 0.69567823, + "learning_rate": 4.93461791294516e-07, + "loss": 0.71714407, + "num_input_tokens_seen": 279334495, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11273193, + "step": 12946, + "time_per_iteration": 2.651252508163452 + }, + { + "auxiliary_loss_clip": 0.01114676, + "auxiliary_loss_mlp": 0.01027339, + "balance_loss_clip": 1.04086447, + "balance_loss_mlp": 1.01541805, + "epoch": 0.7784157522921991, + "flos": 26109989763840.0, + "grad_norm": 2.426160028355048, + "language_loss": 0.653777, + "learning_rate": 4.932056660665689e-07, + "loss": 0.67519718, + "num_input_tokens_seen": 279352985, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.1192627, + "step": 12947, + "time_per_iteration": 2.643571376800537 + }, + { + "auxiliary_loss_clip": 0.0111332, + "auxiliary_loss_mlp": 0.01029694, + "balance_loss_clip": 1.0402298, + "balance_loss_mlp": 1.01802886, + "epoch": 0.778475875544867, + "flos": 25397108794080.0, + "grad_norm": 1.9586582409997808, + "language_loss": 0.65091813, + "learning_rate": 4.929495979764147e-07, + "loss": 0.67234826, + "num_input_tokens_seen": 279371360, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11669922, + "step": 12948, + "time_per_iteration": 2.6901133060455322 + }, + { + "auxiliary_loss_clip": 0.01112329, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.03898931, + "balance_loss_mlp": 1.01816678, + "epoch": 0.778535998797535, + "flos": 17516001163680.0, + "grad_norm": 2.0575423673214006, + "language_loss": 0.74858063, + "learning_rate": 4.926935870337625e-07, + "loss": 0.76999962, + "num_input_tokens_seen": 279389400, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11407471, + "step": 12949, + "time_per_iteration": 2.6641433238983154 + }, + { + "auxiliary_loss_clip": 0.0111696, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.04059005, + "balance_loss_mlp": 1.01969063, + "epoch": 0.7785961220502029, + "flos": 23439309621120.0, + "grad_norm": 1.7175738636987785, + "language_loss": 0.68826103, + "learning_rate": 4.924376332483202e-07, + "loss": 0.70974517, + "num_input_tokens_seen": 279409715, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11773682, + "step": 12950, + "time_per_iteration": 2.613402843475342 + }, + { + "auxiliary_loss_clip": 0.01111189, + "auxiliary_loss_mlp": 0.01030114, + "balance_loss_clip": 1.03705847, + "balance_loss_mlp": 1.01869392, + "epoch": 0.7786562453028709, + "flos": 31406908943520.0, + "grad_norm": 2.199177105370105, + "language_loss": 0.72090977, + "learning_rate": 4.921817366297938e-07, + "loss": 0.7423228, + "num_input_tokens_seen": 279427705, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11425781, + "step": 12951, + "time_per_iteration": 2.710200071334839 + }, + { + "auxiliary_loss_clip": 0.01110936, + "auxiliary_loss_mlp": 0.01031102, + "balance_loss_clip": 1.03968906, + "balance_loss_mlp": 1.01946688, + "epoch": 0.7787163685555388, + "flos": 31407678771840.0, + "grad_norm": 1.783174540111046, + "language_loss": 0.65688276, + "learning_rate": 4.919258971878877e-07, + "loss": 0.67830312, + "num_input_tokens_seen": 279448215, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11639404, + "step": 12952, + "time_per_iteration": 2.778155565261841 + }, + { + "auxiliary_loss_clip": 0.01105646, + "auxiliary_loss_mlp": 0.01028082, + "balance_loss_clip": 1.037323, + "balance_loss_mlp": 1.01718664, + "epoch": 0.7787764918082068, + "flos": 27756944300160.0, + "grad_norm": 1.5960204673767486, + "language_loss": 0.81298196, + "learning_rate": 4.916701149323022e-07, + "loss": 0.83431923, + "num_input_tokens_seen": 279466260, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.10888672, + "step": 12953, + "time_per_iteration": 2.622770071029663 + }, + { + "auxiliary_loss_clip": 0.01119276, + "auxiliary_loss_mlp": 0.01036018, + "balance_loss_clip": 1.0430727, + "balance_loss_mlp": 1.02439499, + "epoch": 0.7788366150608748, + "flos": 18535877628480.0, + "grad_norm": 3.2401935870466905, + "language_loss": 0.76769596, + "learning_rate": 4.91414389872737e-07, + "loss": 0.78924888, + "num_input_tokens_seen": 279484520, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.1161499, + "step": 12954, + "time_per_iteration": 2.7385454177856445 + }, + { + "auxiliary_loss_clip": 0.01114087, + "auxiliary_loss_mlp": 0.01025667, + "balance_loss_clip": 1.0394156, + "balance_loss_mlp": 1.01503336, + "epoch": 0.7788967383135428, + "flos": 25881260952960.0, + "grad_norm": 1.6128138945018498, + "language_loss": 0.72855657, + "learning_rate": 4.911587220188905e-07, + "loss": 0.7499541, + "num_input_tokens_seen": 279503130, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.10638428, + "step": 12955, + "time_per_iteration": 2.633023262023926 + }, + { + "auxiliary_loss_clip": 0.01113046, + "auxiliary_loss_mlp": 0.01033303, + "balance_loss_clip": 1.0385201, + "balance_loss_mlp": 1.02182317, + "epoch": 0.7789568615662107, + "flos": 26458266016800.0, + "grad_norm": 1.4371941388125533, + "language_loss": 0.68489355, + "learning_rate": 4.909031113804551e-07, + "loss": 0.706357, + "num_input_tokens_seen": 279521930, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11480713, + "step": 12956, + "time_per_iteration": 2.6869008541107178 + }, + { + "auxiliary_loss_clip": 0.01113587, + "auxiliary_loss_mlp": 0.01032885, + "balance_loss_clip": 1.039886, + "balance_loss_mlp": 1.02148902, + "epoch": 0.7790169848188787, + "flos": 32165324985600.0, + "grad_norm": 1.71278547153499, + "language_loss": 0.75756371, + "learning_rate": 4.906475579671252e-07, + "loss": 0.77902842, + "num_input_tokens_seen": 279542375, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11395264, + "step": 12957, + "time_per_iteration": 2.640641450881958 + }, + { + "auxiliary_loss_clip": 0.0111098, + "auxiliary_loss_mlp": 0.0102864, + "balance_loss_clip": 1.03738201, + "balance_loss_mlp": 1.01754737, + "epoch": 0.7790771080715466, + "flos": 31135643339040.0, + "grad_norm": 1.6086365356839263, + "language_loss": 0.77399075, + "learning_rate": 4.903920617885917e-07, + "loss": 0.79538691, + "num_input_tokens_seen": 279561885, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11102295, + "step": 12958, + "time_per_iteration": 2.6726746559143066 + }, + { + "auxiliary_loss_clip": 0.0111598, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.04091334, + "balance_loss_mlp": 1.02501976, + "epoch": 0.7791372313242146, + "flos": 19564586860320.0, + "grad_norm": 2.1932949225171843, + "language_loss": 0.7184878, + "learning_rate": 4.901366228545418e-07, + "loss": 0.74001586, + "num_input_tokens_seen": 279579965, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.1182251, + "step": 12959, + "time_per_iteration": 4.122377634048462 + }, + { + "auxiliary_loss_clip": 0.01111959, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.039042, + "balance_loss_mlp": 1.02004457, + "epoch": 0.7791973545768827, + "flos": 29092527475200.0, + "grad_norm": 1.7163244365048078, + "language_loss": 0.7761066, + "learning_rate": 4.898812411746632e-07, + "loss": 0.79754114, + "num_input_tokens_seen": 279599030, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11437988, + "step": 12960, + "time_per_iteration": 2.6426901817321777 + }, + { + "auxiliary_loss_clip": 0.01114298, + "auxiliary_loss_mlp": 0.01036211, + "balance_loss_clip": 1.03977132, + "balance_loss_mlp": 1.02384329, + "epoch": 0.7792574778295506, + "flos": 29490755184000.0, + "grad_norm": 1.9540799536137623, + "language_loss": 0.75341856, + "learning_rate": 4.896259167586385e-07, + "loss": 0.77492356, + "num_input_tokens_seen": 279614400, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12384033, + "step": 12961, + "time_per_iteration": 2.662036418914795 + }, + { + "auxiliary_loss_clip": 0.01109607, + "auxiliary_loss_mlp": 0.01034887, + "balance_loss_clip": 1.04029942, + "balance_loss_mlp": 1.02378285, + "epoch": 0.7793176010822186, + "flos": 26192105589600.0, + "grad_norm": 2.2024630600188657, + "language_loss": 0.73851228, + "learning_rate": 4.893706496161511e-07, + "loss": 0.75995719, + "num_input_tokens_seen": 279633745, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.11102295, + "step": 12962, + "time_per_iteration": 2.6943325996398926 + }, + { + "auxiliary_loss_clip": 0.01113415, + "auxiliary_loss_mlp": 0.01026133, + "balance_loss_clip": 1.04047561, + "balance_loss_mlp": 1.01458144, + "epoch": 0.7793777243348865, + "flos": 25217642645280.0, + "grad_norm": 7.041292888937135, + "language_loss": 0.69768846, + "learning_rate": 4.891154397568795e-07, + "loss": 0.71908391, + "num_input_tokens_seen": 279651165, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11560059, + "step": 12963, + "time_per_iteration": 3.8937480449676514 + }, + { + "auxiliary_loss_clip": 0.0111258, + "auxiliary_loss_mlp": 0.01033983, + "balance_loss_clip": 1.04110289, + "balance_loss_mlp": 1.02283704, + "epoch": 0.7794378475875545, + "flos": 33099398552160.0, + "grad_norm": 1.8999950066899283, + "language_loss": 0.63457114, + "learning_rate": 4.888602871905019e-07, + "loss": 0.65603673, + "num_input_tokens_seen": 279671175, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11157227, + "step": 12964, + "time_per_iteration": 2.6965110301971436 + }, + { + "auxiliary_loss_clip": 0.01113809, + "auxiliary_loss_mlp": 0.01028441, + "balance_loss_clip": 1.03922582, + "balance_loss_mlp": 1.017241, + "epoch": 0.7794979708402224, + "flos": 34256690579520.0, + "grad_norm": 1.5612741144634121, + "language_loss": 0.76820505, + "learning_rate": 4.88605191926694e-07, + "loss": 0.78962755, + "num_input_tokens_seen": 279688675, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11199951, + "step": 12965, + "time_per_iteration": 2.6559247970581055 + }, + { + "auxiliary_loss_clip": 0.01105992, + "auxiliary_loss_mlp": 0.01029701, + "balance_loss_clip": 1.03806949, + "balance_loss_mlp": 1.01900196, + "epoch": 0.7795580940928905, + "flos": 32786811672480.0, + "grad_norm": 1.5366770089103903, + "language_loss": 0.72863936, + "learning_rate": 4.883501539751289e-07, + "loss": 0.7499963, + "num_input_tokens_seen": 279710245, + "router_z_loss_clip": 0.67822266, + "router_z_loss_mlp": 0.10693359, + "step": 12966, + "time_per_iteration": 2.741201400756836 + }, + { + "auxiliary_loss_clip": 0.01110607, + "auxiliary_loss_mlp": 0.01027617, + "balance_loss_clip": 1.0406965, + "balance_loss_mlp": 1.01761508, + "epoch": 0.7796182173455584, + "flos": 29084829192000.0, + "grad_norm": 1.5541361075354652, + "language_loss": 0.74473745, + "learning_rate": 4.880951733454768e-07, + "loss": 0.76611972, + "num_input_tokens_seen": 279729045, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10003662, + "step": 12967, + "time_per_iteration": 2.64507794380188 + }, + { + "auxiliary_loss_clip": 0.01114438, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.04101324, + "balance_loss_mlp": 1.01775217, + "epoch": 0.7796783405982264, + "flos": 24150448347840.0, + "grad_norm": 4.313838249780767, + "language_loss": 0.7222321, + "learning_rate": 4.878402500474073e-07, + "loss": 0.74367166, + "num_input_tokens_seen": 279748350, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11773682, + "step": 12968, + "time_per_iteration": 2.6547203063964844 + }, + { + "auxiliary_loss_clip": 0.01112048, + "auxiliary_loss_mlp": 0.010351, + "balance_loss_clip": 1.04016995, + "balance_loss_mlp": 1.0234834, + "epoch": 0.7797384638508943, + "flos": 18851989511520.0, + "grad_norm": 2.0292816207168194, + "language_loss": 0.60883677, + "learning_rate": 4.875853840905874e-07, + "loss": 0.63030827, + "num_input_tokens_seen": 279765620, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11602783, + "step": 12969, + "time_per_iteration": 2.5986273288726807 + }, + { + "auxiliary_loss_clip": 0.01108416, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.03858125, + "balance_loss_mlp": 1.01906991, + "epoch": 0.7797985871035623, + "flos": 25530026938560.0, + "grad_norm": 2.0720436950546977, + "language_loss": 0.70029849, + "learning_rate": 4.873305754846811e-07, + "loss": 0.72168112, + "num_input_tokens_seen": 279782485, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10784912, + "step": 12970, + "time_per_iteration": 4.164533853530884 + }, + { + "auxiliary_loss_clip": 0.01115515, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.04200745, + "balance_loss_mlp": 1.01813221, + "epoch": 0.7798587103562302, + "flos": 45072369812160.0, + "grad_norm": 1.7469677949930955, + "language_loss": 0.72186214, + "learning_rate": 4.870758242393507e-07, + "loss": 0.7433182, + "num_input_tokens_seen": 279804170, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11968994, + "step": 12971, + "time_per_iteration": 4.089584827423096 + }, + { + "auxiliary_loss_clip": 0.0111692, + "auxiliary_loss_mlp": 0.01031321, + "balance_loss_clip": 1.0407058, + "balance_loss_mlp": 1.01981115, + "epoch": 0.7799188336088982, + "flos": 27356164002720.0, + "grad_norm": 1.7096550552778453, + "language_loss": 0.74284476, + "learning_rate": 4.868211303642578e-07, + "loss": 0.76432717, + "num_input_tokens_seen": 279823730, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.1151123, + "step": 12972, + "time_per_iteration": 2.6568143367767334 + }, + { + "auxiliary_loss_clip": 0.01112481, + "auxiliary_loss_mlp": 0.01022972, + "balance_loss_clip": 1.03908563, + "balance_loss_mlp": 1.01095533, + "epoch": 0.7799789568615663, + "flos": 23037800012640.0, + "grad_norm": 1.9491205466667922, + "language_loss": 0.71238106, + "learning_rate": 4.865664938690584e-07, + "loss": 0.73373556, + "num_input_tokens_seen": 279843035, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12017822, + "step": 12973, + "time_per_iteration": 2.6424169540405273 + }, + { + "auxiliary_loss_clip": 0.01110553, + "auxiliary_loss_mlp": 0.01033457, + "balance_loss_clip": 1.03896272, + "balance_loss_mlp": 1.02281761, + "epoch": 0.7800390801142342, + "flos": 24724414615680.0, + "grad_norm": 2.019030671386791, + "language_loss": 0.77750784, + "learning_rate": 4.863119147634089e-07, + "loss": 0.79894799, + "num_input_tokens_seen": 279861450, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10644531, + "step": 12974, + "time_per_iteration": 2.6017239093780518 + }, + { + "auxiliary_loss_clip": 0.01110977, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.03900564, + "balance_loss_mlp": 1.02050781, + "epoch": 0.7800992033669022, + "flos": 20365296592320.0, + "grad_norm": 1.6436812918716108, + "language_loss": 0.69290316, + "learning_rate": 4.86057393056964e-07, + "loss": 0.71433234, + "num_input_tokens_seen": 279878660, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11425781, + "step": 12975, + "time_per_iteration": 2.597015857696533 + }, + { + "auxiliary_loss_clip": 0.0110952, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.03868127, + "balance_loss_mlp": 1.01704907, + "epoch": 0.7801593266195701, + "flos": 22677814265760.0, + "grad_norm": 2.0873339243741036, + "language_loss": 0.82002032, + "learning_rate": 4.858029287593739e-07, + "loss": 0.84139699, + "num_input_tokens_seen": 279895685, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11102295, + "step": 12976, + "time_per_iteration": 2.5670368671417236 + }, + { + "auxiliary_loss_clip": 0.01114609, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_clip": 1.03927207, + "balance_loss_mlp": 1.01512337, + "epoch": 0.7802194498722381, + "flos": 31097036721600.0, + "grad_norm": 1.4299738487399392, + "language_loss": 0.66353619, + "learning_rate": 4.85548521880289e-07, + "loss": 0.68495476, + "num_input_tokens_seen": 279917240, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12115479, + "step": 12977, + "time_per_iteration": 2.6921157836914062 + }, + { + "auxiliary_loss_clip": 0.0111176, + "auxiliary_loss_mlp": 0.01027304, + "balance_loss_clip": 1.04035795, + "balance_loss_mlp": 1.01674199, + "epoch": 0.780279573124906, + "flos": 38042125956000.0, + "grad_norm": 1.4819529704844216, + "language_loss": 0.75104773, + "learning_rate": 4.852941724293554e-07, + "loss": 0.77243835, + "num_input_tokens_seen": 279938665, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10552979, + "step": 12978, + "time_per_iteration": 2.721125364303589 + }, + { + "auxiliary_loss_clip": 0.01115895, + "auxiliary_loss_mlp": 0.01032964, + "balance_loss_clip": 1.04042339, + "balance_loss_mlp": 1.02114463, + "epoch": 0.780339696377574, + "flos": 32877557678880.0, + "grad_norm": 1.8055453470624967, + "language_loss": 0.6229443, + "learning_rate": 4.85039880416219e-07, + "loss": 0.6444329, + "num_input_tokens_seen": 279957965, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11828613, + "step": 12979, + "time_per_iteration": 2.676844835281372 + }, + { + "auxiliary_loss_clip": 0.01114879, + "auxiliary_loss_mlp": 0.01028141, + "balance_loss_clip": 1.0413456, + "balance_loss_mlp": 1.01648211, + "epoch": 0.780399819630242, + "flos": 34114939668000.0, + "grad_norm": 2.010245921504083, + "language_loss": 0.77261174, + "learning_rate": 4.847856458505217e-07, + "loss": 0.79404199, + "num_input_tokens_seen": 279977490, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11651611, + "step": 12980, + "time_per_iteration": 2.6991758346557617 + }, + { + "auxiliary_loss_clip": 0.01113034, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.03931355, + "balance_loss_mlp": 1.02187872, + "epoch": 0.78045994288291, + "flos": 27439049656800.0, + "grad_norm": 2.577849330208253, + "language_loss": 0.78161567, + "learning_rate": 4.845314687419046e-07, + "loss": 0.80307806, + "num_input_tokens_seen": 279994220, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11334229, + "step": 12981, + "time_per_iteration": 2.5965797901153564 + }, + { + "auxiliary_loss_clip": 0.01115713, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.04162836, + "balance_loss_mlp": 1.02145076, + "epoch": 0.7805200661355779, + "flos": 25442198176320.0, + "grad_norm": 2.164679795060609, + "language_loss": 0.72984904, + "learning_rate": 4.842773491000067e-07, + "loss": 0.75132906, + "num_input_tokens_seen": 280012590, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.10827637, + "step": 12982, + "time_per_iteration": 2.734076499938965 + }, + { + "auxiliary_loss_clip": 0.01112145, + "auxiliary_loss_mlp": 0.01034098, + "balance_loss_clip": 1.03791702, + "balance_loss_mlp": 1.02305293, + "epoch": 0.7805801893882459, + "flos": 31318067249280.0, + "grad_norm": 2.3118233229751417, + "language_loss": 0.73643911, + "learning_rate": 4.840232869344636e-07, + "loss": 0.75790155, + "num_input_tokens_seen": 280033700, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.1104126, + "step": 12983, + "time_per_iteration": 2.623361587524414 + }, + { + "auxiliary_loss_clip": 0.01112806, + "auxiliary_loss_mlp": 0.01027957, + "balance_loss_clip": 1.04002762, + "balance_loss_mlp": 1.01694214, + "epoch": 0.7806403126409138, + "flos": 13821473862720.0, + "grad_norm": 1.8498438110527202, + "language_loss": 0.75053924, + "learning_rate": 4.837692822549086e-07, + "loss": 0.77194679, + "num_input_tokens_seen": 280052215, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11022949, + "step": 12984, + "time_per_iteration": 2.6088707447052 + }, + { + "auxiliary_loss_clip": 0.01111702, + "auxiliary_loss_mlp": 0.01035361, + "balance_loss_clip": 1.03886628, + "balance_loss_mlp": 1.02463818, + "epoch": 0.7807004358935818, + "flos": 23883356023200.0, + "grad_norm": 1.8154165611007786, + "language_loss": 0.81524742, + "learning_rate": 4.835153350709746e-07, + "loss": 0.83671802, + "num_input_tokens_seen": 280070525, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.10717773, + "step": 12985, + "time_per_iteration": 2.590585708618164 + }, + { + "auxiliary_loss_clip": 0.01111988, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.03992093, + "balance_loss_mlp": 1.0185318, + "epoch": 0.7807605591462499, + "flos": 23349414477600.0, + "grad_norm": 1.5606656198457098, + "language_loss": 0.77030158, + "learning_rate": 4.832614453922915e-07, + "loss": 0.79172337, + "num_input_tokens_seen": 280089855, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11663818, + "step": 12986, + "time_per_iteration": 2.6627299785614014 + }, + { + "auxiliary_loss_clip": 0.01112436, + "auxiliary_loss_mlp": 0.01029205, + "balance_loss_clip": 1.03926778, + "balance_loss_mlp": 1.01791632, + "epoch": 0.7808206823989178, + "flos": 39504954856320.0, + "grad_norm": 1.755014606174741, + "language_loss": 0.74225742, + "learning_rate": 4.830076132284859e-07, + "loss": 0.76367384, + "num_input_tokens_seen": 280109960, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.112854, + "step": 12987, + "time_per_iteration": 2.7399282455444336 + }, + { + "auxiliary_loss_clip": 0.01031253, + "auxiliary_loss_mlp": 0.01002579, + "balance_loss_clip": 1.00867212, + "balance_loss_mlp": 1.00164568, + "epoch": 0.7808808056515858, + "flos": 67170430189440.0, + "grad_norm": 0.733253354274724, + "language_loss": 0.55030489, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57064331, + "num_input_tokens_seen": 280169805, + "router_z_loss_clip": 0.22595215, + "router_z_loss_mlp": 0.00933075, + "step": 12988, + "time_per_iteration": 3.26316499710083 + }, + { + "auxiliary_loss_clip": 0.01109418, + "auxiliary_loss_mlp": 0.01036555, + "balance_loss_clip": 1.03935421, + "balance_loss_mlp": 1.02552819, + "epoch": 0.7809409289042537, + "flos": 15690593410560.0, + "grad_norm": 7.210345943847525, + "language_loss": 0.80684, + "learning_rate": 4.82500121484009e-07, + "loss": 0.8282997, + "num_input_tokens_seen": 280184630, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11022949, + "step": 12989, + "time_per_iteration": 2.607759475708008 + }, + { + "auxiliary_loss_clip": 0.01109231, + "auxiliary_loss_mlp": 0.01027259, + "balance_loss_clip": 1.03827369, + "balance_loss_mlp": 1.016065, + "epoch": 0.7810010521569217, + "flos": 26463087573120.0, + "grad_norm": 1.5895613524525078, + "language_loss": 0.70281994, + "learning_rate": 4.822464619225806e-07, + "loss": 0.72418481, + "num_input_tokens_seen": 280203880, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11193848, + "step": 12990, + "time_per_iteration": 2.629946708679199 + }, + { + "auxiliary_loss_clip": 0.01113399, + "auxiliary_loss_mlp": 0.01028625, + "balance_loss_clip": 1.03999996, + "balance_loss_mlp": 1.01642942, + "epoch": 0.7810611754095896, + "flos": 20447331383520.0, + "grad_norm": 2.1876288984308756, + "language_loss": 0.77855837, + "learning_rate": 4.819928599145184e-07, + "loss": 0.79997861, + "num_input_tokens_seen": 280220460, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12207031, + "step": 12991, + "time_per_iteration": 2.7044363021850586 + }, + { + "auxiliary_loss_clip": 0.01111039, + "auxiliary_loss_mlp": 0.01035415, + "balance_loss_clip": 1.03771627, + "balance_loss_mlp": 1.02382827, + "epoch": 0.7811212986622577, + "flos": 53089636969440.0, + "grad_norm": 1.5918648089161618, + "language_loss": 0.65675443, + "learning_rate": 4.817393154694398e-07, + "loss": 0.67821896, + "num_input_tokens_seen": 280242680, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.1159668, + "step": 12992, + "time_per_iteration": 2.8083651065826416 + }, + { + "auxiliary_loss_clip": 0.01116692, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.0422188, + "balance_loss_mlp": 1.01799262, + "epoch": 0.7811814219149256, + "flos": 26549295644160.0, + "grad_norm": 9.772603748405505, + "language_loss": 0.61851335, + "learning_rate": 4.814858285969578e-07, + "loss": 0.63997489, + "num_input_tokens_seen": 280260655, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11474609, + "step": 12993, + "time_per_iteration": 2.687422275543213 + }, + { + "auxiliary_loss_clip": 0.01111451, + "auxiliary_loss_mlp": 0.01027677, + "balance_loss_clip": 1.03907943, + "balance_loss_mlp": 1.01622105, + "epoch": 0.7812415451675936, + "flos": 29360106007200.0, + "grad_norm": 1.7308231120443505, + "language_loss": 0.68680251, + "learning_rate": 4.812323993066862e-07, + "loss": 0.70819384, + "num_input_tokens_seen": 280281185, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11468506, + "step": 12994, + "time_per_iteration": 2.666429281234741 + }, + { + "auxiliary_loss_clip": 0.01111436, + "auxiliary_loss_mlp": 0.01027468, + "balance_loss_clip": 1.03954148, + "balance_loss_mlp": 1.01641154, + "epoch": 0.7813016684202615, + "flos": 23171123329920.0, + "grad_norm": 2.9547563517317634, + "language_loss": 0.68800765, + "learning_rate": 4.809790276082335e-07, + "loss": 0.70939672, + "num_input_tokens_seen": 280298255, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.1105957, + "step": 12995, + "time_per_iteration": 2.6314268112182617 + }, + { + "auxiliary_loss_clip": 0.01108081, + "auxiliary_loss_mlp": 0.01024293, + "balance_loss_clip": 1.0368973, + "balance_loss_mlp": 1.01354086, + "epoch": 0.7813617916729295, + "flos": 30823988356800.0, + "grad_norm": 2.541876188500453, + "language_loss": 0.74824941, + "learning_rate": 4.807257135112088e-07, + "loss": 0.76957321, + "num_input_tokens_seen": 280319000, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10742188, + "step": 12996, + "time_per_iteration": 2.652726173400879 + }, + { + "auxiliary_loss_clip": 0.01116277, + "auxiliary_loss_mlp": 0.01031607, + "balance_loss_clip": 1.03960907, + "balance_loss_mlp": 1.01968074, + "epoch": 0.7814219149255974, + "flos": 21922153398720.0, + "grad_norm": 2.570851074952261, + "language_loss": 0.68771553, + "learning_rate": 4.804724570252167e-07, + "loss": 0.70919436, + "num_input_tokens_seen": 280336375, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.1192627, + "step": 12997, + "time_per_iteration": 2.595629930496216 + }, + { + "auxiliary_loss_clip": 0.01117003, + "auxiliary_loss_mlp": 0.01033435, + "balance_loss_clip": 1.04003716, + "balance_loss_mlp": 1.0211091, + "epoch": 0.7814820381782654, + "flos": 31452484533120.0, + "grad_norm": 1.7267508596839598, + "language_loss": 0.82304621, + "learning_rate": 4.802192581598614e-07, + "loss": 0.84455061, + "num_input_tokens_seen": 280358760, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.12322998, + "step": 12998, + "time_per_iteration": 4.214895009994507 + }, + { + "auxiliary_loss_clip": 0.01112258, + "auxiliary_loss_mlp": 0.0103181, + "balance_loss_clip": 1.03829551, + "balance_loss_mlp": 1.0196327, + "epoch": 0.7815421614309335, + "flos": 25038217013760.0, + "grad_norm": 2.7925990901899347, + "language_loss": 0.74720442, + "learning_rate": 4.799661169247453e-07, + "loss": 0.76864511, + "num_input_tokens_seen": 280377085, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12182617, + "step": 12999, + "time_per_iteration": 2.5918469429016113 + }, + { + "auxiliary_loss_clip": 0.01114861, + "auxiliary_loss_mlp": 0.01035644, + "balance_loss_clip": 1.04012203, + "balance_loss_mlp": 1.02367544, + "epoch": 0.7816022846836014, + "flos": 25975329376320.0, + "grad_norm": 1.6742133810809559, + "language_loss": 0.84530234, + "learning_rate": 4.797130333294652e-07, + "loss": 0.86680734, + "num_input_tokens_seen": 280395465, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11975098, + "step": 13000, + "time_per_iteration": 2.6204640865325928 + }, + { + "auxiliary_loss_clip": 0.01114419, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.04022515, + "balance_loss_mlp": 1.02080297, + "epoch": 0.7816624079362694, + "flos": 23438013068160.0, + "grad_norm": 2.2345752609174827, + "language_loss": 0.65954244, + "learning_rate": 4.794600073836192e-07, + "loss": 0.68101001, + "num_input_tokens_seen": 280412775, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11553955, + "step": 13001, + "time_per_iteration": 2.5963733196258545 + }, + { + "auxiliary_loss_clip": 0.01113177, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.03952217, + "balance_loss_mlp": 1.0227859, + "epoch": 0.7817225311889373, + "flos": 31853345865120.0, + "grad_norm": 1.5689487190653257, + "language_loss": 0.66821563, + "learning_rate": 4.792070390968027e-07, + "loss": 0.6896863, + "num_input_tokens_seen": 280432905, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11108398, + "step": 13002, + "time_per_iteration": 4.130681037902832 + }, + { + "auxiliary_loss_clip": 0.01117692, + "auxiliary_loss_mlp": 0.01035811, + "balance_loss_clip": 1.04266477, + "balance_loss_mlp": 1.0229243, + "epoch": 0.7817826544416053, + "flos": 25930523615040.0, + "grad_norm": 2.8003231492422027, + "language_loss": 0.73768049, + "learning_rate": 4.78954128478607e-07, + "loss": 0.75921553, + "num_input_tokens_seen": 280450785, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12890625, + "step": 13003, + "time_per_iteration": 2.727407932281494 + }, + { + "auxiliary_loss_clip": 0.01116176, + "auxiliary_loss_mlp": 0.0103166, + "balance_loss_clip": 1.04202652, + "balance_loss_mlp": 1.02056718, + "epoch": 0.7818427776942732, + "flos": 24321770523360.0, + "grad_norm": 1.6849405387511618, + "language_loss": 0.62221026, + "learning_rate": 4.787012755386233e-07, + "loss": 0.64368868, + "num_input_tokens_seen": 280468400, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11090088, + "step": 13004, + "time_per_iteration": 2.630835771560669 + }, + { + "auxiliary_loss_clip": 0.01107781, + "auxiliary_loss_mlp": 0.01027477, + "balance_loss_clip": 1.03817987, + "balance_loss_mlp": 1.017398, + "epoch": 0.7819029009469413, + "flos": 13865388243840.0, + "grad_norm": 1.9862093909977117, + "language_loss": 0.83115304, + "learning_rate": 4.784484802864403e-07, + "loss": 0.85250562, + "num_input_tokens_seen": 280483930, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10083008, + "step": 13005, + "time_per_iteration": 2.6192171573638916 + }, + { + "auxiliary_loss_clip": 0.01109822, + "auxiliary_loss_mlp": 0.01028036, + "balance_loss_clip": 1.03780329, + "balance_loss_mlp": 1.01622248, + "epoch": 0.7819630241996092, + "flos": 29625942296160.0, + "grad_norm": 4.198400868087346, + "language_loss": 0.7226342, + "learning_rate": 4.781957427316432e-07, + "loss": 0.74401277, + "num_input_tokens_seen": 280503465, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11810303, + "step": 13006, + "time_per_iteration": 2.6411659717559814 + }, + { + "auxiliary_loss_clip": 0.01115907, + "auxiliary_loss_mlp": 0.01031406, + "balance_loss_clip": 1.04095411, + "balance_loss_mlp": 1.019503, + "epoch": 0.7820231474522772, + "flos": 27709788536640.0, + "grad_norm": 1.6423003012838773, + "language_loss": 0.71889991, + "learning_rate": 4.779430628838157e-07, + "loss": 0.74037302, + "num_input_tokens_seen": 280523375, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11907959, + "step": 13007, + "time_per_iteration": 2.636101007461548 + }, + { + "auxiliary_loss_clip": 0.01113762, + "auxiliary_loss_mlp": 0.01028822, + "balance_loss_clip": 1.03788424, + "balance_loss_mlp": 1.01668012, + "epoch": 0.7820832707049451, + "flos": 24461860226400.0, + "grad_norm": 1.9694498526317916, + "language_loss": 0.69076139, + "learning_rate": 4.776904407525397e-07, + "loss": 0.71218717, + "num_input_tokens_seen": 280542920, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12145996, + "step": 13008, + "time_per_iteration": 2.6018314361572266 + }, + { + "auxiliary_loss_clip": 0.01110189, + "auxiliary_loss_mlp": 0.01027652, + "balance_loss_clip": 1.03705394, + "balance_loss_mlp": 1.01542127, + "epoch": 0.7821433939576131, + "flos": 33144528451680.0, + "grad_norm": 1.7022150065465058, + "language_loss": 0.69694519, + "learning_rate": 4.774378763473954e-07, + "loss": 0.71832359, + "num_input_tokens_seen": 280561700, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12225342, + "step": 13009, + "time_per_iteration": 4.071582555770874 + }, + { + "auxiliary_loss_clip": 0.01110245, + "auxiliary_loss_mlp": 0.01026418, + "balance_loss_clip": 1.03752494, + "balance_loss_mlp": 1.0147835, + "epoch": 0.782203517210281, + "flos": 27579179877120.0, + "grad_norm": 2.905654371350455, + "language_loss": 0.82188714, + "learning_rate": 4.771853696779586e-07, + "loss": 0.84325379, + "num_input_tokens_seen": 280580605, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11633301, + "step": 13010, + "time_per_iteration": 2.652329206466675 + }, + { + "auxiliary_loss_clip": 0.01109137, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.03803992, + "balance_loss_mlp": 1.02234888, + "epoch": 0.782263640462949, + "flos": 35459436644640.0, + "grad_norm": 1.858077180953777, + "language_loss": 0.62159544, + "learning_rate": 4.76932920753806e-07, + "loss": 0.64301717, + "num_input_tokens_seen": 280601495, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10687256, + "step": 13011, + "time_per_iteration": 3.949662685394287 + }, + { + "auxiliary_loss_clip": 0.01111713, + "auxiliary_loss_mlp": 0.01027506, + "balance_loss_clip": 1.03997684, + "balance_loss_mlp": 1.01746845, + "epoch": 0.782323763715617, + "flos": 30871022568480.0, + "grad_norm": 2.0374863791249647, + "language_loss": 0.70331061, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72470284, + "num_input_tokens_seen": 280622760, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.10046387, + "step": 13012, + "time_per_iteration": 2.720090389251709 + }, + { + "auxiliary_loss_clip": 0.01031183, + "auxiliary_loss_mlp": 0.01000888, + "balance_loss_clip": 1.00861669, + "balance_loss_mlp": 0.99994874, + "epoch": 0.782383886968285, + "flos": 79549003303200.0, + "grad_norm": 0.7036905004308818, + "language_loss": 0.55044556, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57076633, + "num_input_tokens_seen": 280687115, + "router_z_loss_clip": 0.22546387, + "router_z_loss_mlp": 0.00937653, + "step": 13013, + "time_per_iteration": 3.353867530822754 + }, + { + "auxiliary_loss_clip": 0.01119247, + "auxiliary_loss_mlp": 0.01035541, + "balance_loss_clip": 1.04347253, + "balance_loss_mlp": 1.02424002, + "epoch": 0.782444010220953, + "flos": 22458201842880.0, + "grad_norm": 2.057603605039414, + "language_loss": 0.65841538, + "learning_rate": 4.76175920548765e-07, + "loss": 0.67996323, + "num_input_tokens_seen": 280705000, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11297607, + "step": 13014, + "time_per_iteration": 2.627163887023926 + }, + { + "auxiliary_loss_clip": 0.01031314, + "auxiliary_loss_mlp": 0.01000535, + "balance_loss_clip": 1.00865901, + "balance_loss_mlp": 0.99957669, + "epoch": 0.7825041334736209, + "flos": 78036506568000.0, + "grad_norm": 0.7154375571066592, + "language_loss": 0.58394706, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60426557, + "num_input_tokens_seen": 280773525, + "router_z_loss_clip": 0.22668457, + "router_z_loss_mlp": 0.00959015, + "step": 13015, + "time_per_iteration": 3.3251354694366455 + }, + { + "auxiliary_loss_clip": 0.01110613, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_clip": 1.03911626, + "balance_loss_mlp": 1.0201261, + "epoch": 0.7825642567262889, + "flos": 24818118383520.0, + "grad_norm": 2.6705874464347485, + "language_loss": 0.74380171, + "learning_rate": 4.756715426472666e-07, + "loss": 0.7652179, + "num_input_tokens_seen": 280791915, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10882568, + "step": 13016, + "time_per_iteration": 2.6119496822357178 + }, + { + "auxiliary_loss_clip": 0.01115733, + "auxiliary_loss_mlp": 0.01030518, + "balance_loss_clip": 1.04057097, + "balance_loss_mlp": 1.01785827, + "epoch": 0.7826243799789568, + "flos": 24725184444000.0, + "grad_norm": 1.624577284872096, + "language_loss": 0.75411963, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.77558213, + "num_input_tokens_seen": 280811460, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12658691, + "step": 13017, + "time_per_iteration": 2.6973938941955566 + }, + { + "auxiliary_loss_clip": 0.01113109, + "auxiliary_loss_mlp": 0.01032408, + "balance_loss_clip": 1.0379591, + "balance_loss_mlp": 1.02017713, + "epoch": 0.7826845032316249, + "flos": 25787314081440.0, + "grad_norm": 3.6595055519614923, + "language_loss": 0.75625718, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.77771235, + "num_input_tokens_seen": 280825415, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12237549, + "step": 13018, + "time_per_iteration": 2.610713481903076 + }, + { + "auxiliary_loss_clip": 0.01111602, + "auxiliary_loss_mlp": 0.01028341, + "balance_loss_clip": 1.0383085, + "balance_loss_mlp": 1.01659918, + "epoch": 0.7827446264842928, + "flos": 27445208283360.0, + "grad_norm": 1.4615924394340434, + "language_loss": 0.77197498, + "learning_rate": 4.749154093390708e-07, + "loss": 0.79337442, + "num_input_tokens_seen": 280845335, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11755371, + "step": 13019, + "time_per_iteration": 2.6952664852142334 + }, + { + "auxiliary_loss_clip": 0.01110812, + "auxiliary_loss_mlp": 0.01027445, + "balance_loss_clip": 1.0378592, + "balance_loss_mlp": 1.01610851, + "epoch": 0.7828047497369608, + "flos": 35192020181760.0, + "grad_norm": 1.568356867993418, + "language_loss": 0.6780678, + "learning_rate": 4.746634805529852e-07, + "loss": 0.69945037, + "num_input_tokens_seen": 280867145, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11334229, + "step": 13020, + "time_per_iteration": 2.659031629562378 + }, + { + "auxiliary_loss_clip": 0.01113674, + "auxiliary_loss_mlp": 0.01027305, + "balance_loss_clip": 1.04120255, + "balance_loss_mlp": 1.01575971, + "epoch": 0.7828648729896287, + "flos": 28379322367200.0, + "grad_norm": 2.422279089294433, + "language_loss": 0.62639844, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.64780819, + "num_input_tokens_seen": 280886185, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11547852, + "step": 13021, + "time_per_iteration": 2.6479883193969727 + }, + { + "auxiliary_loss_clip": 0.01110149, + "auxiliary_loss_mlp": 0.01033003, + "balance_loss_clip": 1.03820229, + "balance_loss_mlp": 1.02216053, + "epoch": 0.7829249962422967, + "flos": 30829498706880.0, + "grad_norm": 1.9768266115851687, + "language_loss": 0.69129866, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.71273017, + "num_input_tokens_seen": 280907665, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10845947, + "step": 13022, + "time_per_iteration": 2.64620041847229 + }, + { + "auxiliary_loss_clip": 0.01031099, + "auxiliary_loss_mlp": 0.01001265, + "balance_loss_clip": 1.00854754, + "balance_loss_mlp": 1.0003171, + "epoch": 0.7829851194949646, + "flos": 86295947922720.0, + "grad_norm": 0.641763333352116, + "language_loss": 0.56243104, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58275467, + "num_input_tokens_seen": 280971405, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 0.00946808, + "step": 13023, + "time_per_iteration": 3.4024393558502197 + }, + { + "auxiliary_loss_clip": 0.01105067, + "auxiliary_loss_mlp": 0.0102909, + "balance_loss_clip": 1.03583539, + "balance_loss_mlp": 1.01830137, + "epoch": 0.7830452427476327, + "flos": 31312232760960.0, + "grad_norm": 1.9835108700465782, + "language_loss": 0.67310476, + "learning_rate": 4.736563439132792e-07, + "loss": 0.69444638, + "num_input_tokens_seen": 280989615, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10778809, + "step": 13024, + "time_per_iteration": 2.659691333770752 + }, + { + "auxiliary_loss_clip": 0.01114361, + "auxiliary_loss_mlp": 0.01028051, + "balance_loss_clip": 1.03950262, + "balance_loss_mlp": 1.01635039, + "epoch": 0.7831053660003006, + "flos": 27801587992320.0, + "grad_norm": 6.401691339803779, + "language_loss": 0.78064066, + "learning_rate": 4.734047044272498e-07, + "loss": 0.80206484, + "num_input_tokens_seen": 281009450, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11700439, + "step": 13025, + "time_per_iteration": 2.6362802982330322 + }, + { + "auxiliary_loss_clip": 0.01112974, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.0404985, + "balance_loss_mlp": 1.02045965, + "epoch": 0.7831654892529686, + "flos": 31496560983360.0, + "grad_norm": 1.775250876804906, + "language_loss": 0.77958441, + "learning_rate": 4.731531228298673e-07, + "loss": 0.80102992, + "num_input_tokens_seen": 281028120, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11132812, + "step": 13026, + "time_per_iteration": 2.6840248107910156 + }, + { + "auxiliary_loss_clip": 0.0111254, + "auxiliary_loss_mlp": 0.01025917, + "balance_loss_clip": 1.04054213, + "balance_loss_mlp": 1.01503921, + "epoch": 0.7832256125056366, + "flos": 25346022854400.0, + "grad_norm": 1.9514063819293812, + "language_loss": 0.75455117, + "learning_rate": 4.729015991306715e-07, + "loss": 0.77593577, + "num_input_tokens_seen": 281042130, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10888672, + "step": 13027, + "time_per_iteration": 2.574057102203369 + }, + { + "auxiliary_loss_clip": 0.0111378, + "auxiliary_loss_mlp": 0.01027685, + "balance_loss_clip": 1.04045212, + "balance_loss_mlp": 1.01671731, + "epoch": 0.7832857357583045, + "flos": 26242421700960.0, + "grad_norm": 1.8420823844820928, + "language_loss": 0.70628041, + "learning_rate": 4.726501333391997e-07, + "loss": 0.72769505, + "num_input_tokens_seen": 281060945, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.10974121, + "step": 13028, + "time_per_iteration": 2.6740591526031494 + }, + { + "auxiliary_loss_clip": 0.0111597, + "auxiliary_loss_mlp": 0.01037076, + "balance_loss_clip": 1.03994608, + "balance_loss_mlp": 1.0252682, + "epoch": 0.7833458590109725, + "flos": 22058312925600.0, + "grad_norm": 2.101998844236882, + "language_loss": 0.68851519, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.7100457, + "num_input_tokens_seen": 281079270, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11804199, + "step": 13029, + "time_per_iteration": 2.5980911254882812 + }, + { + "auxiliary_loss_clip": 0.01113098, + "auxiliary_loss_mlp": 0.01028977, + "balance_loss_clip": 1.03773654, + "balance_loss_mlp": 1.01703262, + "epoch": 0.7834059822636404, + "flos": 34518961347840.0, + "grad_norm": 1.9010105506995747, + "language_loss": 0.81040919, + "learning_rate": 4.721473755175698e-07, + "loss": 0.83182991, + "num_input_tokens_seen": 281099500, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11956787, + "step": 13030, + "time_per_iteration": 2.70326828956604 + }, + { + "auxiliary_loss_clip": 0.01115535, + "auxiliary_loss_mlp": 0.01030618, + "balance_loss_clip": 1.03939223, + "balance_loss_mlp": 1.01905513, + "epoch": 0.7834661055163085, + "flos": 38664422988480.0, + "grad_norm": 1.7267828595096242, + "language_loss": 0.70791936, + "learning_rate": 4.71896083506476e-07, + "loss": 0.72938085, + "num_input_tokens_seen": 281121250, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11572266, + "step": 13031, + "time_per_iteration": 2.6770615577697754 + }, + { + "auxiliary_loss_clip": 0.01114298, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.03949261, + "balance_loss_mlp": 1.01966262, + "epoch": 0.7835262287689764, + "flos": 15780488554080.0, + "grad_norm": 1.9137836837455005, + "language_loss": 0.78840554, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.80986059, + "num_input_tokens_seen": 281138760, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11535645, + "step": 13032, + "time_per_iteration": 2.634941339492798 + }, + { + "auxiliary_loss_clip": 0.0111682, + "auxiliary_loss_mlp": 0.0103714, + "balance_loss_clip": 1.04100513, + "balance_loss_mlp": 1.0251658, + "epoch": 0.7835863520216444, + "flos": 19697667073920.0, + "grad_norm": 2.5969199649925394, + "language_loss": 0.63130057, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.65284014, + "num_input_tokens_seen": 281157420, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11975098, + "step": 13033, + "time_per_iteration": 2.6609911918640137 + }, + { + "auxiliary_loss_clip": 0.01112352, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.03946257, + "balance_loss_mlp": 1.02252948, + "epoch": 0.7836464752743123, + "flos": 14043719908800.0, + "grad_norm": 1.5125488874774333, + "language_loss": 0.71900415, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.74047136, + "num_input_tokens_seen": 281174620, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11846924, + "step": 13034, + "time_per_iteration": 2.6382651329040527 + }, + { + "auxiliary_loss_clip": 0.01115217, + "auxiliary_loss_mlp": 0.01029253, + "balance_loss_clip": 1.0396992, + "balance_loss_mlp": 1.01743376, + "epoch": 0.7837065985269803, + "flos": 22236360969600.0, + "grad_norm": 2.3151927991882597, + "language_loss": 0.72232842, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.7437731, + "num_input_tokens_seen": 281193865, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11816406, + "step": 13035, + "time_per_iteration": 2.608431577682495 + }, + { + "auxiliary_loss_clip": 0.01116798, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.04219043, + "balance_loss_mlp": 1.021281, + "epoch": 0.7837667217796482, + "flos": 30205662017760.0, + "grad_norm": 1.750621571998983, + "language_loss": 0.66291678, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.68441826, + "num_input_tokens_seen": 281212250, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12084961, + "step": 13036, + "time_per_iteration": 2.673267364501953 + }, + { + "auxiliary_loss_clip": 0.01117874, + "auxiliary_loss_mlp": 0.01037935, + "balance_loss_clip": 1.04056263, + "balance_loss_mlp": 1.02529263, + "epoch": 0.7838268450323163, + "flos": 27312006517920.0, + "grad_norm": 3.759924215808278, + "language_loss": 0.72590321, + "learning_rate": 4.703895486362031e-07, + "loss": 0.74746132, + "num_input_tokens_seen": 281230850, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12646484, + "step": 13037, + "time_per_iteration": 4.115649700164795 + }, + { + "auxiliary_loss_clip": 0.01110708, + "auxiliary_loss_mlp": 0.01034895, + "balance_loss_clip": 1.03770983, + "balance_loss_mlp": 1.0229497, + "epoch": 0.7838869682849842, + "flos": 23794190190720.0, + "grad_norm": 2.1727415304711992, + "language_loss": 0.60044581, + "learning_rate": 4.701386624460717e-07, + "loss": 0.62190187, + "num_input_tokens_seen": 281249810, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11938477, + "step": 13038, + "time_per_iteration": 2.5782134532928467 + }, + { + "auxiliary_loss_clip": 0.01111064, + "auxiliary_loss_mlp": 0.01035037, + "balance_loss_clip": 1.03895295, + "balance_loss_mlp": 1.02357483, + "epoch": 0.7839470915376522, + "flos": 40136611380480.0, + "grad_norm": 3.2332403411042074, + "language_loss": 0.6834712, + "learning_rate": 4.698878342684349e-07, + "loss": 0.70493221, + "num_input_tokens_seen": 281273730, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11462402, + "step": 13039, + "time_per_iteration": 2.741199016571045 + }, + { + "auxiliary_loss_clip": 0.0110717, + "auxiliary_loss_mlp": 0.01024823, + "balance_loss_clip": 1.03622437, + "balance_loss_mlp": 1.01464868, + "epoch": 0.7840072147903202, + "flos": 36211734577440.0, + "grad_norm": 1.8078037531156506, + "language_loss": 0.6955117, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.71683168, + "num_input_tokens_seen": 281293670, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10168457, + "step": 13040, + "time_per_iteration": 2.675666093826294 + }, + { + "auxiliary_loss_clip": 0.01114156, + "auxiliary_loss_mlp": 0.01033805, + "balance_loss_clip": 1.04028869, + "balance_loss_mlp": 1.02177691, + "epoch": 0.7840673380429881, + "flos": 22191312104640.0, + "grad_norm": 1.6792910838068253, + "language_loss": 0.67385453, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.69533414, + "num_input_tokens_seen": 281313070, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12011719, + "step": 13041, + "time_per_iteration": 2.6263651847839355 + }, + { + "auxiliary_loss_clip": 0.01031471, + "auxiliary_loss_mlp": 0.0100079, + "balance_loss_clip": 1.00899518, + "balance_loss_mlp": 0.99981046, + "epoch": 0.7841274612956561, + "flos": 80955762638400.0, + "grad_norm": 0.664498690702582, + "language_loss": 0.57415372, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59447634, + "num_input_tokens_seen": 281374880, + "router_z_loss_clip": 0.22485352, + "router_z_loss_mlp": 0.00978851, + "step": 13042, + "time_per_iteration": 4.592874050140381 + }, + { + "auxiliary_loss_clip": 0.01115345, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.04112291, + "balance_loss_mlp": 1.01791418, + "epoch": 0.784187584548324, + "flos": 32517247793760.0, + "grad_norm": 1.9118344003549679, + "language_loss": 0.8401469, + "learning_rate": 4.688851018730369e-07, + "loss": 0.8615948, + "num_input_tokens_seen": 281392620, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1151123, + "step": 13043, + "time_per_iteration": 2.659003257751465 + }, + { + "auxiliary_loss_clip": 0.01110483, + "auxiliary_loss_mlp": 0.01026538, + "balance_loss_clip": 1.03883302, + "balance_loss_mlp": 1.01544535, + "epoch": 0.7842477078009921, + "flos": 31408489117440.0, + "grad_norm": 1.4918442277889667, + "language_loss": 0.88483393, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.90620416, + "num_input_tokens_seen": 281413140, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11083984, + "step": 13044, + "time_per_iteration": 2.6759543418884277 + }, + { + "auxiliary_loss_clip": 0.01116659, + "auxiliary_loss_mlp": 0.01031028, + "balance_loss_clip": 1.03971791, + "balance_loss_mlp": 1.01934528, + "epoch": 0.78430783105366, + "flos": 26821331076960.0, + "grad_norm": 7.091156195425184, + "language_loss": 0.78909278, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.81056964, + "num_input_tokens_seen": 281430860, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11682129, + "step": 13045, + "time_per_iteration": 2.6989023685455322 + }, + { + "auxiliary_loss_clip": 0.01109404, + "auxiliary_loss_mlp": 0.01026977, + "balance_loss_clip": 1.0374043, + "balance_loss_mlp": 1.01593184, + "epoch": 0.784367954306328, + "flos": 29092608509760.0, + "grad_norm": 1.8385295483257806, + "language_loss": 0.72469628, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.74606007, + "num_input_tokens_seen": 281451385, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.1104126, + "step": 13046, + "time_per_iteration": 2.6369709968566895 + }, + { + "auxiliary_loss_clip": 0.01112095, + "auxiliary_loss_mlp": 0.01036523, + "balance_loss_clip": 1.04062414, + "balance_loss_mlp": 1.02496529, + "epoch": 0.7844280775589959, + "flos": 30293895952800.0, + "grad_norm": 1.5521019174465336, + "language_loss": 0.63388216, + "learning_rate": 4.678832984380809e-07, + "loss": 0.65536833, + "num_input_tokens_seen": 281472255, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11560059, + "step": 13047, + "time_per_iteration": 2.728156805038452 + }, + { + "auxiliary_loss_clip": 0.01110787, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.03995144, + "balance_loss_mlp": 1.01890254, + "epoch": 0.7844882008116639, + "flos": 27400564591200.0, + "grad_norm": 1.5868247773303552, + "language_loss": 0.7343123, + "learning_rate": 4.676329928006515e-07, + "loss": 0.75571656, + "num_input_tokens_seen": 281492860, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.1072998, + "step": 13048, + "time_per_iteration": 2.602668285369873 + }, + { + "auxiliary_loss_clip": 0.0111587, + "auxiliary_loss_mlp": 0.01028866, + "balance_loss_clip": 1.04089022, + "balance_loss_mlp": 1.0171237, + "epoch": 0.7845483240643318, + "flos": 31852616554080.0, + "grad_norm": 1.8453060247482824, + "language_loss": 0.74613643, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.76758385, + "num_input_tokens_seen": 281511815, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11743164, + "step": 13049, + "time_per_iteration": 4.17603325843811 + }, + { + "auxiliary_loss_clip": 0.01116958, + "auxiliary_loss_mlp": 0.01030829, + "balance_loss_clip": 1.03988051, + "balance_loss_mlp": 1.01849699, + "epoch": 0.7846084473169999, + "flos": 23616061112160.0, + "grad_norm": 2.0725076856728464, + "language_loss": 0.72620893, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.74768674, + "num_input_tokens_seen": 281530090, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.12341309, + "step": 13050, + "time_per_iteration": 2.5879414081573486 + }, + { + "auxiliary_loss_clip": 0.01111329, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.03898311, + "balance_loss_mlp": 1.01873708, + "epoch": 0.7846685705696678, + "flos": 28462896815040.0, + "grad_norm": 2.3398749843998528, + "language_loss": 0.74150687, + "learning_rate": 4.668824245713825e-07, + "loss": 0.76292169, + "num_input_tokens_seen": 281547075, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11425781, + "step": 13051, + "time_per_iteration": 3.9016034603118896 + }, + { + "auxiliary_loss_clip": 0.01116238, + "auxiliary_loss_mlp": 0.01033644, + "balance_loss_clip": 1.04132223, + "balance_loss_mlp": 1.02118063, + "epoch": 0.7847286938223358, + "flos": 43695992086560.0, + "grad_norm": 2.5792098071036285, + "language_loss": 0.72774088, + "learning_rate": 4.666323514209227e-07, + "loss": 0.74923968, + "num_input_tokens_seen": 281568080, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12463379, + "step": 13052, + "time_per_iteration": 2.7684850692749023 + }, + { + "auxiliary_loss_clip": 0.01109478, + "auxiliary_loss_mlp": 0.01033373, + "balance_loss_clip": 1.03956223, + "balance_loss_mlp": 1.02220333, + "epoch": 0.7847888170750038, + "flos": 22546962502560.0, + "grad_norm": 2.7712451067742965, + "language_loss": 0.68994194, + "learning_rate": 4.663823364159183e-07, + "loss": 0.71137047, + "num_input_tokens_seen": 281586925, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11175537, + "step": 13053, + "time_per_iteration": 2.6156132221221924 + }, + { + "auxiliary_loss_clip": 0.01111192, + "auxiliary_loss_mlp": 0.01030359, + "balance_loss_clip": 1.03856516, + "balance_loss_mlp": 1.01920712, + "epoch": 0.7848489403276717, + "flos": 30650761869120.0, + "grad_norm": 4.853517299390952, + "language_loss": 0.70391965, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.72533512, + "num_input_tokens_seen": 281603915, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11151123, + "step": 13054, + "time_per_iteration": 2.6999869346618652 + }, + { + "auxiliary_loss_clip": 0.01115015, + "auxiliary_loss_mlp": 0.01033285, + "balance_loss_clip": 1.03904676, + "balance_loss_mlp": 1.02117372, + "epoch": 0.7849090635803397, + "flos": 32339037680640.0, + "grad_norm": 1.7862540447336241, + "language_loss": 0.75702834, + "learning_rate": 4.658824808801938e-07, + "loss": 0.77851135, + "num_input_tokens_seen": 281624220, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12115479, + "step": 13055, + "time_per_iteration": 2.642085075378418 + }, + { + "auxiliary_loss_clip": 0.01118009, + "auxiliary_loss_mlp": 0.01033905, + "balance_loss_clip": 1.04101956, + "balance_loss_mlp": 1.02218127, + "epoch": 0.7849691868330076, + "flos": 25575035286240.0, + "grad_norm": 1.9571951816622783, + "language_loss": 0.75209707, + "learning_rate": 4.656326403684283e-07, + "loss": 0.77361619, + "num_input_tokens_seen": 281642325, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11730957, + "step": 13056, + "time_per_iteration": 2.7669966220855713 + }, + { + "auxiliary_loss_clip": 0.01112607, + "auxiliary_loss_mlp": 0.0102873, + "balance_loss_clip": 1.0397023, + "balance_loss_mlp": 1.01702976, + "epoch": 0.7850293100856757, + "flos": 31808013379200.0, + "grad_norm": 13.181897393245782, + "language_loss": 0.70199525, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.72340858, + "num_input_tokens_seen": 281663065, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11706543, + "step": 13057, + "time_per_iteration": 2.6669867038726807 + }, + { + "auxiliary_loss_clip": 0.01114622, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.04008782, + "balance_loss_mlp": 1.0236125, + "epoch": 0.7850894333383436, + "flos": 27444803110560.0, + "grad_norm": 2.6242995574719004, + "language_loss": 0.76639825, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.78789508, + "num_input_tokens_seen": 281681005, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11444092, + "step": 13058, + "time_per_iteration": 2.707759141921997 + }, + { + "auxiliary_loss_clip": 0.01113586, + "auxiliary_loss_mlp": 0.01032336, + "balance_loss_clip": 1.04131389, + "balance_loss_mlp": 1.02087402, + "epoch": 0.7851495565910116, + "flos": 25085656398240.0, + "grad_norm": 1.9223777386903942, + "language_loss": 0.71097153, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.73243076, + "num_input_tokens_seen": 281697965, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11456299, + "step": 13059, + "time_per_iteration": 2.632479190826416 + }, + { + "auxiliary_loss_clip": 0.01117622, + "auxiliary_loss_mlp": 0.01037724, + "balance_loss_clip": 1.04016674, + "balance_loss_mlp": 1.02513528, + "epoch": 0.7852096798436795, + "flos": 19431141991200.0, + "grad_norm": 2.067864409242498, + "language_loss": 0.76308995, + "learning_rate": 4.646338602497144e-07, + "loss": 0.78464341, + "num_input_tokens_seen": 281716035, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12585449, + "step": 13060, + "time_per_iteration": 2.604738473892212 + }, + { + "auxiliary_loss_clip": 0.01114482, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.04088283, + "balance_loss_mlp": 1.02050245, + "epoch": 0.7852698030963475, + "flos": 23259721920480.0, + "grad_norm": 2.0918361394789837, + "language_loss": 0.76935196, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79082233, + "num_input_tokens_seen": 281732815, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12042236, + "step": 13061, + "time_per_iteration": 2.6936774253845215 + }, + { + "auxiliary_loss_clip": 0.01112761, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.03839374, + "balance_loss_mlp": 1.01732111, + "epoch": 0.7853299263490154, + "flos": 30071447320320.0, + "grad_norm": 2.7135336508878756, + "language_loss": 0.74637574, + "learning_rate": 4.641348194799164e-07, + "loss": 0.76779711, + "num_input_tokens_seen": 281751980, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1206665, + "step": 13062, + "time_per_iteration": 2.640432119369507 + }, + { + "auxiliary_loss_clip": 0.01110785, + "auxiliary_loss_mlp": 0.01030371, + "balance_loss_clip": 1.03887844, + "balance_loss_mlp": 1.01933193, + "epoch": 0.7853900496016835, + "flos": 26866460976480.0, + "grad_norm": 1.513379503974854, + "language_loss": 0.68698668, + "learning_rate": 4.638853864505297e-07, + "loss": 0.70839828, + "num_input_tokens_seen": 281772670, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11035156, + "step": 13063, + "time_per_iteration": 2.6268470287323 + }, + { + "auxiliary_loss_clip": 0.01115337, + "auxiliary_loss_mlp": 0.01032137, + "balance_loss_clip": 1.04356205, + "balance_loss_mlp": 1.0206337, + "epoch": 0.7854501728543514, + "flos": 36884226169440.0, + "grad_norm": 2.7165801966874326, + "language_loss": 0.72851092, + "learning_rate": 4.636360116707625e-07, + "loss": 0.74998569, + "num_input_tokens_seen": 281792930, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11505127, + "step": 13064, + "time_per_iteration": 2.6652274131774902 + }, + { + "auxiliary_loss_clip": 0.01115347, + "auxiliary_loss_mlp": 0.01031817, + "balance_loss_clip": 1.03915453, + "balance_loss_mlp": 1.0203495, + "epoch": 0.7855102961070194, + "flos": 22992953734080.0, + "grad_norm": 2.4803751168652575, + "language_loss": 0.68296415, + "learning_rate": 4.633866951500718e-07, + "loss": 0.70443583, + "num_input_tokens_seen": 281811805, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11462402, + "step": 13065, + "time_per_iteration": 2.712784767150879 + }, + { + "auxiliary_loss_clip": 0.01115555, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.0426656, + "balance_loss_mlp": 1.02410126, + "epoch": 0.7855704193596874, + "flos": 27222759650880.0, + "grad_norm": 1.9168154712417766, + "language_loss": 0.76218367, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.78369439, + "num_input_tokens_seen": 281831885, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11431885, + "step": 13066, + "time_per_iteration": 2.621842622756958 + }, + { + "auxiliary_loss_clip": 0.01031542, + "auxiliary_loss_mlp": 0.0100118, + "balance_loss_clip": 1.00911283, + "balance_loss_mlp": 1.00024867, + "epoch": 0.7856305426123553, + "flos": 73216649023200.0, + "grad_norm": 0.7531497514154588, + "language_loss": 0.53364205, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55396926, + "num_input_tokens_seen": 281900310, + "router_z_loss_clip": 0.22412109, + "router_z_loss_mlp": 0.00930786, + "step": 13067, + "time_per_iteration": 3.3124148845672607 + }, + { + "auxiliary_loss_clip": 0.01111458, + "auxiliary_loss_mlp": 0.01031552, + "balance_loss_clip": 1.03745604, + "balance_loss_mlp": 1.0194459, + "epoch": 0.7856906658650233, + "flos": 26684482756320.0, + "grad_norm": 1.5109939116993296, + "language_loss": 0.67520362, + "learning_rate": 4.62639095236989e-07, + "loss": 0.6966337, + "num_input_tokens_seen": 281918870, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12109375, + "step": 13068, + "time_per_iteration": 2.613715648651123 + }, + { + "auxiliary_loss_clip": 0.01111176, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.03961802, + "balance_loss_mlp": 1.02142382, + "epoch": 0.7857507891176913, + "flos": 28824989460480.0, + "grad_norm": 2.2315796336052514, + "language_loss": 0.68153059, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.70296943, + "num_input_tokens_seen": 281936905, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.112854, + "step": 13069, + "time_per_iteration": 2.682565450668335 + }, + { + "auxiliary_loss_clip": 0.0111598, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.04179168, + "balance_loss_mlp": 1.02081442, + "epoch": 0.7858109123703593, + "flos": 31140019205280.0, + "grad_norm": 1.568551259690204, + "language_loss": 0.77232707, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.79381418, + "num_input_tokens_seen": 281955625, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11907959, + "step": 13070, + "time_per_iteration": 2.7202346324920654 + }, + { + "auxiliary_loss_clip": 0.01111558, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.0387969, + "balance_loss_mlp": 1.02063918, + "epoch": 0.7858710356230272, + "flos": 21301274471040.0, + "grad_norm": 2.141361317473877, + "language_loss": 0.6587379, + "learning_rate": 4.618920199958083e-07, + "loss": 0.68017137, + "num_input_tokens_seen": 281973285, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11157227, + "step": 13071, + "time_per_iteration": 2.6091716289520264 + }, + { + "auxiliary_loss_clip": 0.01112005, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.03798032, + "balance_loss_mlp": 1.02045941, + "epoch": 0.7859311588756952, + "flos": 30114632390400.0, + "grad_norm": 2.141454244729321, + "language_loss": 0.74029756, + "learning_rate": 4.616431115532442e-07, + "loss": 0.76173168, + "num_input_tokens_seen": 281991410, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.10955811, + "step": 13072, + "time_per_iteration": 2.7373011112213135 + }, + { + "auxiliary_loss_clip": 0.01114978, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.04058075, + "balance_loss_mlp": 1.02003193, + "epoch": 0.7859912821283631, + "flos": 26599409169120.0, + "grad_norm": 1.8679863333307212, + "language_loss": 0.71380067, + "learning_rate": 4.613942614453268e-07, + "loss": 0.73527193, + "num_input_tokens_seen": 282010845, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12133789, + "step": 13073, + "time_per_iteration": 2.6960952281951904 + }, + { + "auxiliary_loss_clip": 0.01114279, + "auxiliary_loss_mlp": 0.01031675, + "balance_loss_clip": 1.04057658, + "balance_loss_mlp": 1.01953328, + "epoch": 0.7860514053810311, + "flos": 25438186965600.0, + "grad_norm": 1.9183434581171064, + "language_loss": 0.76751387, + "learning_rate": 4.611454696814938e-07, + "loss": 0.78897339, + "num_input_tokens_seen": 282029635, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12158203, + "step": 13074, + "time_per_iteration": 2.664693593978882 + }, + { + "auxiliary_loss_clip": 0.01110265, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.03904819, + "balance_loss_mlp": 1.01876986, + "epoch": 0.786111528633699, + "flos": 29670383401920.0, + "grad_norm": 3.50583288773314, + "language_loss": 0.74850839, + "learning_rate": 4.608967362711782e-07, + "loss": 0.76990974, + "num_input_tokens_seen": 282050285, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11083984, + "step": 13075, + "time_per_iteration": 2.637117862701416 + }, + { + "auxiliary_loss_clip": 0.0111354, + "auxiliary_loss_mlp": 0.01026497, + "balance_loss_clip": 1.04019189, + "balance_loss_mlp": 1.01565504, + "epoch": 0.7861716518863671, + "flos": 29716040026080.0, + "grad_norm": 1.7336121187864255, + "language_loss": 0.69075131, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.71215177, + "num_input_tokens_seen": 282071040, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.10845947, + "step": 13076, + "time_per_iteration": 4.117291212081909 + }, + { + "auxiliary_loss_clip": 0.01110909, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.0394088, + "balance_loss_mlp": 1.01663589, + "epoch": 0.786231775139035, + "flos": 17110115688960.0, + "grad_norm": 2.4080771041471474, + "language_loss": 0.80089056, + "learning_rate": 4.603994445488282e-07, + "loss": 0.82228255, + "num_input_tokens_seen": 282086610, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11663818, + "step": 13077, + "time_per_iteration": 2.627786159515381 + }, + { + "auxiliary_loss_clip": 0.01113135, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.03987503, + "balance_loss_mlp": 1.02053547, + "epoch": 0.786291898391703, + "flos": 41151868875360.0, + "grad_norm": 1.597696692989252, + "language_loss": 0.70574033, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.72719431, + "num_input_tokens_seen": 282107440, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11724854, + "step": 13078, + "time_per_iteration": 2.7143473625183105 + }, + { + "auxiliary_loss_clip": 0.01111548, + "auxiliary_loss_mlp": 0.01029382, + "balance_loss_clip": 1.04007912, + "balance_loss_mlp": 1.0184679, + "epoch": 0.786352021644371, + "flos": 31495669603200.0, + "grad_norm": 1.5339687254489816, + "language_loss": 0.81552655, + "learning_rate": 4.599023863537039e-07, + "loss": 0.83693588, + "num_input_tokens_seen": 282127290, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10919189, + "step": 13079, + "time_per_iteration": 2.7893519401550293 + }, + { + "auxiliary_loss_clip": 0.01110073, + "auxiliary_loss_mlp": 0.01027855, + "balance_loss_clip": 1.0398047, + "balance_loss_mlp": 1.01653624, + "epoch": 0.7864121448970389, + "flos": 35277904114560.0, + "grad_norm": 2.053109123333001, + "language_loss": 0.68863219, + "learning_rate": 4.596539448524146e-07, + "loss": 0.71001148, + "num_input_tokens_seen": 282147505, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11309814, + "step": 13080, + "time_per_iteration": 2.68977952003479 + }, + { + "auxiliary_loss_clip": 0.01111682, + "auxiliary_loss_mlp": 0.01033475, + "balance_loss_clip": 1.0389421, + "balance_loss_mlp": 1.02160192, + "epoch": 0.7864722681497069, + "flos": 23438499275520.0, + "grad_norm": 2.0497658155832164, + "language_loss": 0.69694817, + "learning_rate": 4.594055617612016e-07, + "loss": 0.7183997, + "num_input_tokens_seen": 282166450, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11877441, + "step": 13081, + "time_per_iteration": 2.627272129058838 + }, + { + "auxiliary_loss_clip": 0.01112833, + "auxiliary_loss_mlp": 0.01036117, + "balance_loss_clip": 1.03815937, + "balance_loss_mlp": 1.02473843, + "epoch": 0.7865323914023749, + "flos": 26687845690560.0, + "grad_norm": 2.004352671619726, + "language_loss": 0.68227166, + "learning_rate": 4.591572370894838e-07, + "loss": 0.70376116, + "num_input_tokens_seen": 282186465, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11383057, + "step": 13082, + "time_per_iteration": 3.930901050567627 + }, + { + "auxiliary_loss_clip": 0.01111433, + "auxiliary_loss_mlp": 0.01031522, + "balance_loss_clip": 1.03922522, + "balance_loss_mlp": 1.02003598, + "epoch": 0.7865925146550429, + "flos": 31140302826240.0, + "grad_norm": 1.7890664000463854, + "language_loss": 0.66168988, + "learning_rate": 4.589089708466789e-07, + "loss": 0.68311942, + "num_input_tokens_seen": 282207180, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11474609, + "step": 13083, + "time_per_iteration": 2.6397364139556885 + }, + { + "auxiliary_loss_clip": 0.01117214, + "auxiliary_loss_mlp": 0.01033131, + "balance_loss_clip": 1.0405376, + "balance_loss_mlp": 1.02067411, + "epoch": 0.7866526379077108, + "flos": 23302663886880.0, + "grad_norm": 4.093711660119586, + "language_loss": 0.74757707, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.76908052, + "num_input_tokens_seen": 282225865, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12451172, + "step": 13084, + "time_per_iteration": 2.667327880859375 + }, + { + "auxiliary_loss_clip": 0.01112369, + "auxiliary_loss_mlp": 0.0103071, + "balance_loss_clip": 1.04084373, + "balance_loss_mlp": 1.02001727, + "epoch": 0.7867127611603788, + "flos": 19735503863040.0, + "grad_norm": 3.318711265324469, + "language_loss": 0.70915586, + "learning_rate": 4.584126136854591e-07, + "loss": 0.73058665, + "num_input_tokens_seen": 282242895, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10687256, + "step": 13085, + "time_per_iteration": 2.6436729431152344 + }, + { + "auxiliary_loss_clip": 0.01116487, + "auxiliary_loss_mlp": 0.01030359, + "balance_loss_clip": 1.03944397, + "balance_loss_mlp": 1.01812792, + "epoch": 0.7867728844130467, + "flos": 25348372856640.0, + "grad_norm": 2.5579208163524245, + "language_loss": 0.72507232, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.74654078, + "num_input_tokens_seen": 282260425, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12243652, + "step": 13086, + "time_per_iteration": 2.6598927974700928 + }, + { + "auxiliary_loss_clip": 0.01112531, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.03884149, + "balance_loss_mlp": 1.02023697, + "epoch": 0.7868330076657147, + "flos": 26552010301920.0, + "grad_norm": 1.8450924310106025, + "language_loss": 0.74927843, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.77071977, + "num_input_tokens_seen": 282279335, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11364746, + "step": 13087, + "time_per_iteration": 2.602893352508545 + }, + { + "auxiliary_loss_clip": 0.01110055, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.03877521, + "balance_loss_mlp": 1.02162659, + "epoch": 0.7868931309183826, + "flos": 31360604042880.0, + "grad_norm": 2.8800454135186624, + "language_loss": 0.71154088, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.73296475, + "num_input_tokens_seen": 282299905, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.1071167, + "step": 13088, + "time_per_iteration": 4.099521160125732 + }, + { + "auxiliary_loss_clip": 0.01031933, + "auxiliary_loss_mlp": 0.0100178, + "balance_loss_clip": 1.00954199, + "balance_loss_mlp": 1.00082517, + "epoch": 0.7869532541710507, + "flos": 78878659127040.0, + "grad_norm": 0.6794651208289534, + "language_loss": 0.5547992, + "learning_rate": 4.574206009240431e-07, + "loss": 0.5751363, + "num_input_tokens_seen": 282367620, + "router_z_loss_clip": 0.22399902, + "router_z_loss_mlp": 0.00953674, + "step": 13089, + "time_per_iteration": 3.3366143703460693 + }, + { + "auxiliary_loss_clip": 0.01032639, + "auxiliary_loss_mlp": 0.0100269, + "balance_loss_clip": 1.01011848, + "balance_loss_mlp": 1.00172663, + "epoch": 0.7870133774237186, + "flos": 82306134620640.0, + "grad_norm": 0.7231258631389089, + "language_loss": 0.4998813, + "learning_rate": 4.571727439470976e-07, + "loss": 0.52023458, + "num_input_tokens_seen": 282435695, + "router_z_loss_clip": 0.22509766, + "router_z_loss_mlp": 0.00962067, + "step": 13090, + "time_per_iteration": 4.605634689331055 + }, + { + "auxiliary_loss_clip": 0.0111091, + "auxiliary_loss_mlp": 0.01029037, + "balance_loss_clip": 1.03974617, + "balance_loss_mlp": 1.01799202, + "epoch": 0.7870735006763866, + "flos": 31808945276640.0, + "grad_norm": 2.598996545650432, + "language_loss": 0.83459461, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.85599411, + "num_input_tokens_seen": 282456025, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11047363, + "step": 13091, + "time_per_iteration": 2.640352249145508 + }, + { + "auxiliary_loss_clip": 0.01032641, + "auxiliary_loss_mlp": 0.01001855, + "balance_loss_clip": 1.01002562, + "balance_loss_mlp": 1.00086963, + "epoch": 0.7871336239290546, + "flos": 85767719313600.0, + "grad_norm": 0.7165726586248663, + "language_loss": 0.63952225, + "learning_rate": 4.566772055150947e-07, + "loss": 0.65986717, + "num_input_tokens_seen": 282520995, + "router_z_loss_clip": 0.22607422, + "router_z_loss_mlp": 0.00984192, + "step": 13092, + "time_per_iteration": 3.298917531967163 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01033207, + "balance_loss_clip": 1.04008436, + "balance_loss_mlp": 1.02123284, + "epoch": 0.7871937471817225, + "flos": 19252688774400.0, + "grad_norm": 2.066059349486363, + "language_loss": 0.79641104, + "learning_rate": 4.564295240788285e-07, + "loss": 0.81789041, + "num_input_tokens_seen": 282539355, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11981201, + "step": 13093, + "time_per_iteration": 2.6010594367980957 + }, + { + "auxiliary_loss_clip": 0.01112166, + "auxiliary_loss_mlp": 0.01024414, + "balance_loss_clip": 1.04016578, + "balance_loss_mlp": 1.01342273, + "epoch": 0.7872538704343905, + "flos": 24996125910240.0, + "grad_norm": 2.319115653639295, + "language_loss": 0.75766778, + "learning_rate": 4.561819011749106e-07, + "loss": 0.7790336, + "num_input_tokens_seen": 282555735, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10992432, + "step": 13094, + "time_per_iteration": 2.6401052474975586 + }, + { + "auxiliary_loss_clip": 0.01115476, + "auxiliary_loss_mlp": 0.01033295, + "balance_loss_clip": 1.04126179, + "balance_loss_mlp": 1.02187443, + "epoch": 0.7873139936870585, + "flos": 30606239728800.0, + "grad_norm": 1.968064323173587, + "language_loss": 0.79786259, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.8193503, + "num_input_tokens_seen": 282574550, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11425781, + "step": 13095, + "time_per_iteration": 2.6846072673797607 + }, + { + "auxiliary_loss_clip": 0.01113694, + "auxiliary_loss_mlp": 0.01030588, + "balance_loss_clip": 1.03870487, + "balance_loss_mlp": 1.01888788, + "epoch": 0.7873741169397265, + "flos": 37684530728640.0, + "grad_norm": 2.3440533881196703, + "language_loss": 0.68101108, + "learning_rate": 4.556868310016715e-07, + "loss": 0.70245385, + "num_input_tokens_seen": 282596520, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11706543, + "step": 13096, + "time_per_iteration": 2.73872971534729 + }, + { + "auxiliary_loss_clip": 0.01108675, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_clip": 1.03860652, + "balance_loss_mlp": 1.01671267, + "epoch": 0.7874342401923944, + "flos": 57097561495680.0, + "grad_norm": 1.6896696590829483, + "language_loss": 0.70668924, + "learning_rate": 4.55439383751125e-07, + "loss": 0.72804207, + "num_input_tokens_seen": 282620560, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.09887695, + "step": 13097, + "time_per_iteration": 2.83732271194458 + }, + { + "auxiliary_loss_clip": 0.01116086, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.04148865, + "balance_loss_mlp": 1.0213263, + "epoch": 0.7874943634450624, + "flos": 28778319904320.0, + "grad_norm": 1.659748425735176, + "language_loss": 0.80504948, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.82653707, + "num_input_tokens_seen": 282639830, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11340332, + "step": 13098, + "time_per_iteration": 2.6627447605133057 + }, + { + "auxiliary_loss_clip": 0.01110616, + "auxiliary_loss_mlp": 0.01025113, + "balance_loss_clip": 1.03931963, + "balance_loss_mlp": 1.01471746, + "epoch": 0.7875544866977303, + "flos": 24637477233600.0, + "grad_norm": 2.1669237785502773, + "language_loss": 0.74282271, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76418, + "num_input_tokens_seen": 282660130, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10406494, + "step": 13099, + "time_per_iteration": 2.652427911758423 + }, + { + "auxiliary_loss_clip": 0.01112903, + "auxiliary_loss_mlp": 0.01022336, + "balance_loss_clip": 1.04063845, + "balance_loss_mlp": 1.0111481, + "epoch": 0.7876146099503983, + "flos": 27579747119040.0, + "grad_norm": 1.5403132904761614, + "language_loss": 0.78128475, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.8026371, + "num_input_tokens_seen": 282681125, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11187744, + "step": 13100, + "time_per_iteration": 2.647735834121704 + }, + { + "auxiliary_loss_clip": 0.01120224, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.04204452, + "balance_loss_mlp": 1.01745999, + "epoch": 0.7876747332030662, + "flos": 13062652647840.0, + "grad_norm": 3.2552101683356596, + "language_loss": 0.65773189, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.67923826, + "num_input_tokens_seen": 282696690, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12945557, + "step": 13101, + "time_per_iteration": 2.689532995223999 + }, + { + "auxiliary_loss_clip": 0.01112634, + "auxiliary_loss_mlp": 0.01027058, + "balance_loss_clip": 1.0390135, + "balance_loss_mlp": 1.01607883, + "epoch": 0.7877348564557343, + "flos": 46856294220960.0, + "grad_norm": 1.5255982379355466, + "language_loss": 0.78046656, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.80186355, + "num_input_tokens_seen": 282721210, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.10980225, + "step": 13102, + "time_per_iteration": 2.7709028720855713 + }, + { + "auxiliary_loss_clip": 0.01113375, + "auxiliary_loss_mlp": 0.01035419, + "balance_loss_clip": 1.04012978, + "balance_loss_mlp": 1.0247736, + "epoch": 0.7877949797084022, + "flos": 22365835145280.0, + "grad_norm": 2.1476606089757952, + "language_loss": 0.82432663, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.84581459, + "num_input_tokens_seen": 282738505, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10638428, + "step": 13103, + "time_per_iteration": 2.615079879760742 + }, + { + "auxiliary_loss_clip": 0.01117042, + "auxiliary_loss_mlp": 0.010351, + "balance_loss_clip": 1.04212213, + "balance_loss_mlp": 1.02284491, + "epoch": 0.7878551029610702, + "flos": 31490321322240.0, + "grad_norm": 2.2694550412381633, + "language_loss": 0.80655491, + "learning_rate": 4.537088934794913e-07, + "loss": 0.8280763, + "num_input_tokens_seen": 282756895, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12249756, + "step": 13104, + "time_per_iteration": 2.622809886932373 + }, + { + "auxiliary_loss_clip": 0.01114764, + "auxiliary_loss_mlp": 0.01034376, + "balance_loss_clip": 1.04041302, + "balance_loss_mlp": 1.02323008, + "epoch": 0.7879152262137382, + "flos": 27263027476800.0, + "grad_norm": 2.4829068843497293, + "language_loss": 0.74065411, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.76214552, + "num_input_tokens_seen": 282774955, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11157227, + "step": 13105, + "time_per_iteration": 2.641725540161133 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.01035505, + "balance_loss_clip": 1.04073572, + "balance_loss_mlp": 1.02385271, + "epoch": 0.7879753494664061, + "flos": 30243903979680.0, + "grad_norm": 1.6776161030169898, + "language_loss": 0.76034153, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.78185463, + "num_input_tokens_seen": 282793165, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11657715, + "step": 13106, + "time_per_iteration": 2.6806812286376953 + }, + { + "auxiliary_loss_clip": 0.01113516, + "auxiliary_loss_mlp": 0.01031102, + "balance_loss_clip": 1.03933334, + "balance_loss_mlp": 1.01965761, + "epoch": 0.7880354727190741, + "flos": 20632267365120.0, + "grad_norm": 2.7775254684872532, + "language_loss": 0.73041433, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.7518605, + "num_input_tokens_seen": 282809820, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11437988, + "step": 13107, + "time_per_iteration": 2.676539897918701 + }, + { + "auxiliary_loss_clip": 0.01110497, + "auxiliary_loss_mlp": 0.01028295, + "balance_loss_clip": 1.03824401, + "balance_loss_mlp": 1.01669598, + "epoch": 0.7880955959717421, + "flos": 27126381742560.0, + "grad_norm": 1.6715115417850785, + "language_loss": 0.73506242, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.7564503, + "num_input_tokens_seen": 282828600, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11602783, + "step": 13108, + "time_per_iteration": 2.5976481437683105 + }, + { + "auxiliary_loss_clip": 0.01032139, + "auxiliary_loss_mlp": 0.01003472, + "balance_loss_clip": 1.00969434, + "balance_loss_mlp": 1.00250769, + "epoch": 0.7881557192244101, + "flos": 84418724918880.0, + "grad_norm": 0.8865668519801195, + "language_loss": 0.60258377, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62293988, + "num_input_tokens_seen": 282882775, + "router_z_loss_clip": 0.22436523, + "router_z_loss_mlp": 0.00963593, + "step": 13109, + "time_per_iteration": 3.201641321182251 + }, + { + "auxiliary_loss_clip": 0.01110434, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.0398581, + "balance_loss_mlp": 1.01893926, + "epoch": 0.788215842477078, + "flos": 30426692545440.0, + "grad_norm": 1.5660327936840541, + "language_loss": 0.72053766, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.74194413, + "num_input_tokens_seen": 282902680, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11273193, + "step": 13110, + "time_per_iteration": 2.6333370208740234 + }, + { + "auxiliary_loss_clip": 0.01111359, + "auxiliary_loss_mlp": 0.01026712, + "balance_loss_clip": 1.04012489, + "balance_loss_mlp": 1.01609612, + "epoch": 0.788275965729746, + "flos": 31855371729120.0, + "grad_norm": 1.6373879401878995, + "language_loss": 0.75221962, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77360034, + "num_input_tokens_seen": 282923625, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.1060791, + "step": 13111, + "time_per_iteration": 2.832881450653076 + }, + { + "auxiliary_loss_clip": 0.01112673, + "auxiliary_loss_mlp": 0.01034239, + "balance_loss_clip": 1.04014385, + "balance_loss_mlp": 1.02250862, + "epoch": 0.7883360889824139, + "flos": 25886406647520.0, + "grad_norm": 1.920925708246319, + "language_loss": 0.61308956, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.63455868, + "num_input_tokens_seen": 282941955, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11737061, + "step": 13112, + "time_per_iteration": 2.7026195526123047 + }, + { + "auxiliary_loss_clip": 0.01111782, + "auxiliary_loss_mlp": 0.01026974, + "balance_loss_clip": 1.03882098, + "balance_loss_mlp": 1.01521969, + "epoch": 0.7883962122350819, + "flos": 25798172712480.0, + "grad_norm": 1.838785341008563, + "language_loss": 0.67319626, + "learning_rate": 4.514881996216644e-07, + "loss": 0.69458377, + "num_input_tokens_seen": 282961280, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11755371, + "step": 13113, + "time_per_iteration": 2.6239495277404785 + }, + { + "auxiliary_loss_clip": 0.01111438, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.03956115, + "balance_loss_mlp": 1.02160978, + "epoch": 0.7884563354877498, + "flos": 18672361293600.0, + "grad_norm": 3.1866573713414854, + "language_loss": 0.57231891, + "learning_rate": 4.5124174933361e-07, + "loss": 0.59376144, + "num_input_tokens_seen": 282978210, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11199951, + "step": 13114, + "time_per_iteration": 2.6345291137695312 + }, + { + "auxiliary_loss_clip": 0.01113778, + "auxiliary_loss_mlp": 0.0102818, + "balance_loss_clip": 1.03995395, + "balance_loss_mlp": 1.01668274, + "epoch": 0.7885164587404179, + "flos": 29759022509760.0, + "grad_norm": 2.344860758243003, + "language_loss": 0.67202342, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.69344294, + "num_input_tokens_seen": 282998845, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11499023, + "step": 13115, + "time_per_iteration": 2.6431267261505127 + }, + { + "auxiliary_loss_clip": 0.01112879, + "auxiliary_loss_mlp": 0.01024668, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.01359987, + "epoch": 0.7885765819930858, + "flos": 17553797435520.0, + "grad_norm": 1.9172747106285057, + "language_loss": 0.88898039, + "learning_rate": 4.50749024954048e-07, + "loss": 0.91035581, + "num_input_tokens_seen": 283015200, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11071777, + "step": 13116, + "time_per_iteration": 4.041107654571533 + }, + { + "auxiliary_loss_clip": 0.01119117, + "auxiliary_loss_mlp": 0.01032993, + "balance_loss_clip": 1.03990245, + "balance_loss_mlp": 1.02049994, + "epoch": 0.7886367052457538, + "flos": 22280599488960.0, + "grad_norm": 6.254258673781552, + "language_loss": 0.72933143, + "learning_rate": 4.505027508812245e-07, + "loss": 0.75085247, + "num_input_tokens_seen": 283033680, + "router_z_loss_clip": 0.79150391, + "router_z_loss_mlp": 0.12493896, + "step": 13117, + "time_per_iteration": 2.707077741622925 + }, + { + "auxiliary_loss_clip": 0.0111242, + "auxiliary_loss_mlp": 0.0102637, + "balance_loss_clip": 1.04158807, + "balance_loss_mlp": 1.0154326, + "epoch": 0.7886968284984217, + "flos": 18674832847680.0, + "grad_norm": 2.193632029120076, + "language_loss": 0.80383897, + "learning_rate": 4.502565355654926e-07, + "loss": 0.82522684, + "num_input_tokens_seen": 283050620, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10931396, + "step": 13118, + "time_per_iteration": 2.5971808433532715 + }, + { + "auxiliary_loss_clip": 0.01112539, + "auxiliary_loss_mlp": 0.01028051, + "balance_loss_clip": 1.03992712, + "balance_loss_mlp": 1.01626122, + "epoch": 0.7887569517510897, + "flos": 25887135958560.0, + "grad_norm": 1.7342776374839879, + "language_loss": 0.7313292, + "learning_rate": 4.500103790161878e-07, + "loss": 0.75273514, + "num_input_tokens_seen": 283070215, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11785889, + "step": 13119, + "time_per_iteration": 2.700038433074951 + }, + { + "auxiliary_loss_clip": 0.01113274, + "auxiliary_loss_mlp": 0.01024003, + "balance_loss_clip": 1.0393095, + "balance_loss_mlp": 1.01270187, + "epoch": 0.7888170750037578, + "flos": 27712422159840.0, + "grad_norm": 1.6429312985441318, + "language_loss": 0.71925187, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.74062467, + "num_input_tokens_seen": 283091485, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11303711, + "step": 13120, + "time_per_iteration": 2.6633386611938477 + }, + { + "auxiliary_loss_clip": 0.01112226, + "auxiliary_loss_mlp": 0.01030193, + "balance_loss_clip": 1.03986073, + "balance_loss_mlp": 1.01866567, + "epoch": 0.7888771982564257, + "flos": 44452139160960.0, + "grad_norm": 1.7791022728173291, + "language_loss": 0.7882027, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.80962688, + "num_input_tokens_seen": 283115040, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11541748, + "step": 13121, + "time_per_iteration": 4.131669759750366 + }, + { + "auxiliary_loss_clip": 0.0111082, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.03918648, + "balance_loss_mlp": 1.01843119, + "epoch": 0.7889373215090937, + "flos": 33324035117760.0, + "grad_norm": 1.4206537256839655, + "language_loss": 0.80341518, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.82482141, + "num_input_tokens_seen": 283136925, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11376953, + "step": 13122, + "time_per_iteration": 2.7598187923431396 + }, + { + "auxiliary_loss_clip": 0.01113049, + "auxiliary_loss_mlp": 0.01024744, + "balance_loss_clip": 1.03875136, + "balance_loss_mlp": 1.01412821, + "epoch": 0.7889974447617616, + "flos": 24195294626400.0, + "grad_norm": 3.1463501418441804, + "language_loss": 0.78132856, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.80270648, + "num_input_tokens_seen": 283155725, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.1060791, + "step": 13123, + "time_per_iteration": 2.6552114486694336 + }, + { + "auxiliary_loss_clip": 0.01116636, + "auxiliary_loss_mlp": 0.01028617, + "balance_loss_clip": 1.0410099, + "balance_loss_mlp": 1.01750124, + "epoch": 0.7890575680144296, + "flos": 21074409455040.0, + "grad_norm": 2.5643412203188696, + "language_loss": 0.6673311, + "learning_rate": 4.487804780926985e-07, + "loss": 0.68878365, + "num_input_tokens_seen": 283173845, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11126709, + "step": 13124, + "time_per_iteration": 2.7462551593780518 + }, + { + "auxiliary_loss_clip": 0.01116486, + "auxiliary_loss_mlp": 0.0102479, + "balance_loss_clip": 1.04078352, + "balance_loss_mlp": 1.01359677, + "epoch": 0.7891176912670975, + "flos": 33678267410880.0, + "grad_norm": 1.863021867945412, + "language_loss": 0.72895557, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.75036836, + "num_input_tokens_seen": 283191985, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11187744, + "step": 13125, + "time_per_iteration": 2.75913405418396 + }, + { + "auxiliary_loss_clip": 0.01113017, + "auxiliary_loss_mlp": 0.01029539, + "balance_loss_clip": 1.03735089, + "balance_loss_mlp": 1.01730871, + "epoch": 0.7891778145197655, + "flos": 27712908367200.0, + "grad_norm": 2.1120577432480174, + "language_loss": 0.72589952, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.74732506, + "num_input_tokens_seen": 283210855, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12249756, + "step": 13126, + "time_per_iteration": 2.6717913150787354 + }, + { + "auxiliary_loss_clip": 0.01115505, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.04001164, + "balance_loss_mlp": 1.01735854, + "epoch": 0.7892379377724335, + "flos": 21122416081440.0, + "grad_norm": 3.441632196291802, + "language_loss": 0.77079558, + "learning_rate": 4.480432433327845e-07, + "loss": 0.79224706, + "num_input_tokens_seen": 283229665, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12280273, + "step": 13127, + "time_per_iteration": 3.993126153945923 + }, + { + "auxiliary_loss_clip": 0.01111799, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.04131532, + "balance_loss_mlp": 1.02127063, + "epoch": 0.7892980610251015, + "flos": 31451917291200.0, + "grad_norm": 2.520728705209345, + "language_loss": 0.85770828, + "learning_rate": 4.47797616101103e-07, + "loss": 0.87915146, + "num_input_tokens_seen": 283248615, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11254883, + "step": 13128, + "time_per_iteration": 2.665085554122925 + }, + { + "auxiliary_loss_clip": 0.01112296, + "auxiliary_loss_mlp": 0.0103514, + "balance_loss_clip": 1.04004669, + "balance_loss_mlp": 1.02446449, + "epoch": 0.7893581842777694, + "flos": 26331952188960.0, + "grad_norm": 2.0560852938876013, + "language_loss": 0.68986654, + "learning_rate": 4.475520477290904e-07, + "loss": 0.7113409, + "num_input_tokens_seen": 283267135, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10675049, + "step": 13129, + "time_per_iteration": 2.637544631958008 + }, + { + "auxiliary_loss_clip": 0.01031972, + "auxiliary_loss_mlp": 0.01002122, + "balance_loss_clip": 1.00952315, + "balance_loss_mlp": 1.00109279, + "epoch": 0.7894183075304374, + "flos": 84213820268640.0, + "grad_norm": 0.7139345576941314, + "language_loss": 0.61648828, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63682926, + "num_input_tokens_seen": 283328940, + "router_z_loss_clip": 0.2244873, + "router_z_loss_mlp": 0.01029205, + "step": 13130, + "time_per_iteration": 4.443767070770264 + }, + { + "auxiliary_loss_clip": 0.01115566, + "auxiliary_loss_mlp": 0.01027431, + "balance_loss_clip": 1.04143834, + "balance_loss_mlp": 1.01633847, + "epoch": 0.7894784307831053, + "flos": 29581946880480.0, + "grad_norm": 2.3865357689432476, + "language_loss": 0.73869222, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.76012218, + "num_input_tokens_seen": 283350000, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11096191, + "step": 13131, + "time_per_iteration": 2.6653826236724854 + }, + { + "auxiliary_loss_clip": 0.01121374, + "auxiliary_loss_mlp": 0.0102833, + "balance_loss_clip": 1.04001808, + "balance_loss_mlp": 1.015342, + "epoch": 0.7895385540357733, + "flos": 24729236172000.0, + "grad_norm": 2.9556064381946383, + "language_loss": 0.68987548, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.71137249, + "num_input_tokens_seen": 283368020, + "router_z_loss_clip": 0.81298828, + "router_z_loss_mlp": 0.12988281, + "step": 13132, + "time_per_iteration": 2.5853569507598877 + }, + { + "auxiliary_loss_clip": 0.01116439, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.04131484, + "balance_loss_mlp": 1.02383447, + "epoch": 0.7895986772884414, + "flos": 25620043633920.0, + "grad_norm": 2.6089989316031743, + "language_loss": 0.61795807, + "learning_rate": 4.465703630239468e-07, + "loss": 0.63947988, + "num_input_tokens_seen": 283387030, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11901855, + "step": 13133, + "time_per_iteration": 2.6519596576690674 + }, + { + "auxiliary_loss_clip": 0.01118189, + "auxiliary_loss_mlp": 0.01037461, + "balance_loss_clip": 1.04187953, + "balance_loss_mlp": 1.02487814, + "epoch": 0.7896588005411093, + "flos": 22766655960000.0, + "grad_norm": 2.156156957495619, + "language_loss": 0.79973376, + "learning_rate": 4.463250890899195e-07, + "loss": 0.82129025, + "num_input_tokens_seen": 283402090, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.12585449, + "step": 13134, + "time_per_iteration": 2.5836875438690186 + }, + { + "auxiliary_loss_clip": 0.01113796, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.03961372, + "balance_loss_mlp": 1.01807177, + "epoch": 0.7897189237937773, + "flos": 22458728567520.0, + "grad_norm": 1.9673799045723015, + "language_loss": 0.79985142, + "learning_rate": 4.460798740713998e-07, + "loss": 0.82128453, + "num_input_tokens_seen": 283421035, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11456299, + "step": 13135, + "time_per_iteration": 2.572977066040039 + }, + { + "auxiliary_loss_clip": 0.01111789, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.03932619, + "balance_loss_mlp": 1.01864862, + "epoch": 0.7897790470464452, + "flos": 28957664501280.0, + "grad_norm": 2.507277524494852, + "language_loss": 0.72513282, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.74655676, + "num_input_tokens_seen": 283441830, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11956787, + "step": 13136, + "time_per_iteration": 2.6793100833892822 + }, + { + "auxiliary_loss_clip": 0.01120324, + "auxiliary_loss_mlp": 0.01034325, + "balance_loss_clip": 1.04166913, + "balance_loss_mlp": 1.02173662, + "epoch": 0.7898391702991132, + "flos": 19424497157280.0, + "grad_norm": 2.6934778534799357, + "language_loss": 0.70984203, + "learning_rate": 4.455896208180778e-07, + "loss": 0.73138851, + "num_input_tokens_seen": 283459540, + "router_z_loss_clip": 0.78710938, + "router_z_loss_mlp": 0.12591553, + "step": 13137, + "time_per_iteration": 2.5764548778533936 + }, + { + "auxiliary_loss_clip": 0.0111245, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.04067969, + "balance_loss_mlp": 1.02089989, + "epoch": 0.7898992935517811, + "flos": 24195335143680.0, + "grad_norm": 1.731376692888144, + "language_loss": 0.73840749, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.75986522, + "num_input_tokens_seen": 283478790, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.12420654, + "step": 13138, + "time_per_iteration": 2.6269772052764893 + }, + { + "auxiliary_loss_clip": 0.01113384, + "auxiliary_loss_mlp": 0.01030166, + "balance_loss_clip": 1.04068494, + "balance_loss_mlp": 1.01885855, + "epoch": 0.7899594168044491, + "flos": 19786792389120.0, + "grad_norm": 2.17055689880243, + "language_loss": 0.68973255, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.71116805, + "num_input_tokens_seen": 283495720, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11303711, + "step": 13139, + "time_per_iteration": 2.5600790977478027 + }, + { + "auxiliary_loss_clip": 0.01032013, + "auxiliary_loss_mlp": 0.01002052, + "balance_loss_clip": 1.00952482, + "balance_loss_mlp": 1.00107527, + "epoch": 0.790019540057117, + "flos": 83376651335040.0, + "grad_norm": 0.8866486454666129, + "language_loss": 0.60151345, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62185407, + "num_input_tokens_seen": 283558795, + "router_z_loss_clip": 0.22473145, + "router_z_loss_mlp": 0.009758, + "step": 13140, + "time_per_iteration": 3.338641881942749 + }, + { + "auxiliary_loss_clip": 0.01114498, + "auxiliary_loss_mlp": 0.01031137, + "balance_loss_clip": 1.04053617, + "balance_loss_mlp": 1.01913261, + "epoch": 0.7900796633097851, + "flos": 37013173620480.0, + "grad_norm": 2.495926378717874, + "language_loss": 0.76305807, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.78451437, + "num_input_tokens_seen": 283579305, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11999512, + "step": 13141, + "time_per_iteration": 2.6673367023468018 + }, + { + "auxiliary_loss_clip": 0.01114885, + "auxiliary_loss_mlp": 0.01033903, + "balance_loss_clip": 1.03984547, + "balance_loss_mlp": 1.02233374, + "epoch": 0.790139786562453, + "flos": 26999298086400.0, + "grad_norm": 2.4996457729550126, + "language_loss": 0.68583351, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.70732141, + "num_input_tokens_seen": 283597840, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11566162, + "step": 13142, + "time_per_iteration": 2.6402509212493896 + }, + { + "auxiliary_loss_clip": 0.01032229, + "auxiliary_loss_mlp": 0.01000306, + "balance_loss_clip": 1.0096215, + "balance_loss_mlp": 0.99932516, + "epoch": 0.790199909815121, + "flos": 71024489137440.0, + "grad_norm": 0.8611702602246032, + "language_loss": 0.59957016, + "learning_rate": 4.441202759969049e-07, + "loss": 0.61989546, + "num_input_tokens_seen": 283647950, + "router_z_loss_clip": 0.22619629, + "router_z_loss_mlp": 0.00980377, + "step": 13143, + "time_per_iteration": 2.9979612827301025 + }, + { + "auxiliary_loss_clip": 0.01115253, + "auxiliary_loss_mlp": 0.01031749, + "balance_loss_clip": 1.03995383, + "balance_loss_mlp": 1.01964951, + "epoch": 0.7902600330677889, + "flos": 42138568038240.0, + "grad_norm": 2.9861860447255353, + "language_loss": 0.7419517, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.76342171, + "num_input_tokens_seen": 283670645, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12103271, + "step": 13144, + "time_per_iteration": 2.7659950256347656 + }, + { + "auxiliary_loss_clip": 0.01117588, + "auxiliary_loss_mlp": 0.01031459, + "balance_loss_clip": 1.04138875, + "balance_loss_mlp": 1.01888883, + "epoch": 0.7903201563204569, + "flos": 27267200756640.0, + "grad_norm": 1.927044376186344, + "language_loss": 0.83339286, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.85488331, + "num_input_tokens_seen": 283688830, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12585449, + "step": 13145, + "time_per_iteration": 2.654371976852417 + }, + { + "auxiliary_loss_clip": 0.01107561, + "auxiliary_loss_mlp": 0.01029689, + "balance_loss_clip": 1.03747368, + "balance_loss_mlp": 1.01925826, + "epoch": 0.790380279573125, + "flos": 26909929667520.0, + "grad_norm": 1.9400479816537612, + "language_loss": 0.73044872, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.75182128, + "num_input_tokens_seen": 283708625, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10437012, + "step": 13146, + "time_per_iteration": 2.650127410888672 + }, + { + "auxiliary_loss_clip": 0.01114755, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.03910732, + "balance_loss_mlp": 1.02123404, + "epoch": 0.7904404028257929, + "flos": 24773272104960.0, + "grad_norm": 1.9211843417171663, + "language_loss": 0.7556895, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.77716553, + "num_input_tokens_seen": 283725710, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11602783, + "step": 13147, + "time_per_iteration": 2.6363747119903564 + }, + { + "auxiliary_loss_clip": 0.01111743, + "auxiliary_loss_mlp": 0.01034741, + "balance_loss_clip": 1.03919768, + "balance_loss_mlp": 1.02266502, + "epoch": 0.7905005260784609, + "flos": 24415069118400.0, + "grad_norm": 2.115282125009981, + "language_loss": 0.72088826, + "learning_rate": 4.428974443697087e-07, + "loss": 0.74235314, + "num_input_tokens_seen": 283744150, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12078857, + "step": 13148, + "time_per_iteration": 2.655395746231079 + }, + { + "auxiliary_loss_clip": 0.01114597, + "auxiliary_loss_mlp": 0.01029108, + "balance_loss_clip": 1.0388329, + "balance_loss_mlp": 1.01753223, + "epoch": 0.7905606493311288, + "flos": 32831212260960.0, + "grad_norm": 2.1461264338594948, + "language_loss": 0.71701086, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.7384479, + "num_input_tokens_seen": 283764170, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11584473, + "step": 13149, + "time_per_iteration": 2.7167253494262695 + }, + { + "auxiliary_loss_clip": 0.01118318, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.04218006, + "balance_loss_mlp": 1.02049565, + "epoch": 0.7906207725837968, + "flos": 28914398396640.0, + "grad_norm": 1.9243067873740671, + "language_loss": 0.6566692, + "learning_rate": 4.424087249723225e-07, + "loss": 0.67818677, + "num_input_tokens_seen": 283784305, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12939453, + "step": 13150, + "time_per_iteration": 2.607102632522583 + }, + { + "auxiliary_loss_clip": 0.01110888, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.03766155, + "balance_loss_mlp": 1.02019954, + "epoch": 0.7906808958364647, + "flos": 25439929208640.0, + "grad_norm": 1.628216473085426, + "language_loss": 0.70402431, + "learning_rate": 4.421644538650231e-07, + "loss": 0.72544497, + "num_input_tokens_seen": 283804040, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.10974121, + "step": 13151, + "time_per_iteration": 2.644949197769165 + }, + { + "auxiliary_loss_clip": 0.01116105, + "auxiliary_loss_mlp": 0.01036941, + "balance_loss_clip": 1.04074788, + "balance_loss_mlp": 1.02493703, + "epoch": 0.7907410190891327, + "flos": 49706643098880.0, + "grad_norm": 1.4500705067826583, + "language_loss": 0.70073402, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.72226453, + "num_input_tokens_seen": 283827120, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11987305, + "step": 13152, + "time_per_iteration": 2.7599172592163086 + }, + { + "auxiliary_loss_clip": 0.01112638, + "auxiliary_loss_mlp": 0.0102948, + "balance_loss_clip": 1.03967333, + "balance_loss_mlp": 1.0181495, + "epoch": 0.7908011423418007, + "flos": 16180742126880.0, + "grad_norm": 2.0677358490754902, + "language_loss": 0.73281997, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.75424123, + "num_input_tokens_seen": 283844820, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11334229, + "step": 13153, + "time_per_iteration": 2.6473464965820312 + }, + { + "auxiliary_loss_clip": 0.01112132, + "auxiliary_loss_mlp": 0.01026915, + "balance_loss_clip": 1.03812528, + "balance_loss_mlp": 1.01553011, + "epoch": 0.7908612655944687, + "flos": 24104791723680.0, + "grad_norm": 1.6560228812416138, + "language_loss": 0.78869116, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.81008166, + "num_input_tokens_seen": 283862870, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11376953, + "step": 13154, + "time_per_iteration": 2.58282208442688 + }, + { + "auxiliary_loss_clip": 0.01120018, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.04043341, + "balance_loss_mlp": 1.0153271, + "epoch": 0.7909213888471366, + "flos": 25975410410880.0, + "grad_norm": 2.184019711008582, + "language_loss": 0.70434475, + "learning_rate": 4.411879602612185e-07, + "loss": 0.72582519, + "num_input_tokens_seen": 283882405, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 0.12701416, + "step": 13155, + "time_per_iteration": 2.6749258041381836 + }, + { + "auxiliary_loss_clip": 0.01112991, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.03907728, + "balance_loss_mlp": 1.01735234, + "epoch": 0.7909815120998046, + "flos": 27489892492800.0, + "grad_norm": 1.7401687363691596, + "language_loss": 0.76988977, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.79131097, + "num_input_tokens_seen": 283902070, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11785889, + "step": 13156, + "time_per_iteration": 4.052265167236328 + }, + { + "auxiliary_loss_clip": 0.0111198, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.03900099, + "balance_loss_mlp": 1.01965308, + "epoch": 0.7910416353524725, + "flos": 32613342081120.0, + "grad_norm": 2.154680328115475, + "language_loss": 0.65420365, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.67563415, + "num_input_tokens_seen": 283924100, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11419678, + "step": 13157, + "time_per_iteration": 2.654118299484253 + }, + { + "auxiliary_loss_clip": 0.01115047, + "auxiliary_loss_mlp": 0.01037281, + "balance_loss_clip": 1.03996658, + "balance_loss_mlp": 1.02486563, + "epoch": 0.7911017586051405, + "flos": 30071974044960.0, + "grad_norm": 1.8257693410964386, + "language_loss": 0.73874938, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.76027262, + "num_input_tokens_seen": 283944955, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12408447, + "step": 13158, + "time_per_iteration": 2.7525532245635986 + }, + { + "auxiliary_loss_clip": 0.01109836, + "auxiliary_loss_mlp": 0.01027626, + "balance_loss_clip": 1.03853297, + "balance_loss_mlp": 1.01705241, + "epoch": 0.7911618818578086, + "flos": 21433301235360.0, + "grad_norm": 2.943265596756494, + "language_loss": 0.67182064, + "learning_rate": 4.40212412422309e-07, + "loss": 0.69319522, + "num_input_tokens_seen": 283963125, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10571289, + "step": 13159, + "time_per_iteration": 2.59382700920105 + }, + { + "auxiliary_loss_clip": 0.01112884, + "auxiliary_loss_mlp": 0.0103023, + "balance_loss_clip": 1.04003763, + "balance_loss_mlp": 1.01883364, + "epoch": 0.7912220051104765, + "flos": 20404024761600.0, + "grad_norm": 2.0979803366193703, + "language_loss": 0.67095834, + "learning_rate": 4.399686733077206e-07, + "loss": 0.69238949, + "num_input_tokens_seen": 283982850, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11401367, + "step": 13160, + "time_per_iteration": 2.6409666538238525 + }, + { + "auxiliary_loss_clip": 0.0110609, + "auxiliary_loss_mlp": 0.010271, + "balance_loss_clip": 1.03672504, + "balance_loss_mlp": 1.01700306, + "epoch": 0.7912821283631445, + "flos": 16714724189760.0, + "grad_norm": 2.1553344181554603, + "language_loss": 0.72641861, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.74775052, + "num_input_tokens_seen": 283998275, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.10101318, + "step": 13161, + "time_per_iteration": 3.984255313873291 + }, + { + "auxiliary_loss_clip": 0.01111558, + "auxiliary_loss_mlp": 0.01029095, + "balance_loss_clip": 1.04006171, + "balance_loss_mlp": 1.01756799, + "epoch": 0.7913422516158124, + "flos": 29003240090880.0, + "grad_norm": 2.4337170068113325, + "language_loss": 0.73564076, + "learning_rate": 4.39481372557418e-07, + "loss": 0.7570473, + "num_input_tokens_seen": 284018750, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11529541, + "step": 13162, + "time_per_iteration": 2.621454954147339 + }, + { + "auxiliary_loss_clip": 0.0111585, + "auxiliary_loss_mlp": 0.01030182, + "balance_loss_clip": 1.04133272, + "balance_loss_mlp": 1.01874363, + "epoch": 0.7914023748684804, + "flos": 24328617943680.0, + "grad_norm": 1.6616835602105096, + "language_loss": 0.71432972, + "learning_rate": 4.392378109401811e-07, + "loss": 0.73579001, + "num_input_tokens_seen": 284037850, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11437988, + "step": 13163, + "time_per_iteration": 2.626164674758911 + }, + { + "auxiliary_loss_clip": 0.01113921, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.04080033, + "balance_loss_mlp": 1.02001941, + "epoch": 0.7914624981211483, + "flos": 25129813883040.0, + "grad_norm": 1.9148357446876372, + "language_loss": 0.69871867, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.7201823, + "num_input_tokens_seen": 284056380, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12420654, + "step": 13164, + "time_per_iteration": 2.578216552734375 + }, + { + "auxiliary_loss_clip": 0.01111643, + "auxiliary_loss_mlp": 0.01030462, + "balance_loss_clip": 1.03852773, + "balance_loss_mlp": 1.01888704, + "epoch": 0.7915226213738163, + "flos": 26599773824640.0, + "grad_norm": 2.232999317455009, + "language_loss": 0.66469872, + "learning_rate": 4.387508652677177e-07, + "loss": 0.68611979, + "num_input_tokens_seen": 284074945, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11566162, + "step": 13165, + "time_per_iteration": 2.651207447052002 + }, + { + "auxiliary_loss_clip": 0.01108229, + "auxiliary_loss_mlp": 0.01027388, + "balance_loss_clip": 1.03841734, + "balance_loss_mlp": 1.01681471, + "epoch": 0.7915827446264843, + "flos": 19874864255040.0, + "grad_norm": 1.869102204497613, + "language_loss": 0.72519881, + "learning_rate": 4.385074812309557e-07, + "loss": 0.74655497, + "num_input_tokens_seen": 284092070, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10565186, + "step": 13166, + "time_per_iteration": 4.0700929164886475 + }, + { + "auxiliary_loss_clip": 0.01112341, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.03907943, + "balance_loss_mlp": 1.01810563, + "epoch": 0.7916428678791523, + "flos": 31362751458720.0, + "grad_norm": 1.7852796210783692, + "language_loss": 0.77427828, + "learning_rate": 4.382641564061462e-07, + "loss": 0.79571021, + "num_input_tokens_seen": 284112255, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12750244, + "step": 13167, + "time_per_iteration": 2.6490118503570557 + }, + { + "auxiliary_loss_clip": 0.0111325, + "auxiliary_loss_mlp": 0.01028602, + "balance_loss_clip": 1.04056334, + "balance_loss_mlp": 1.0178318, + "epoch": 0.7917029911318202, + "flos": 29136847029120.0, + "grad_norm": 1.8959083638126262, + "language_loss": 0.84055245, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.86197096, + "num_input_tokens_seen": 284132330, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10778809, + "step": 13168, + "time_per_iteration": 2.7183704376220703 + }, + { + "auxiliary_loss_clip": 0.01114278, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.04046988, + "balance_loss_mlp": 1.01831317, + "epoch": 0.7917631143844882, + "flos": 26413865428320.0, + "grad_norm": 2.0769157401221445, + "language_loss": 0.72984397, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.75128239, + "num_input_tokens_seen": 284150640, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11236572, + "step": 13169, + "time_per_iteration": 3.968869209289551 + }, + { + "auxiliary_loss_clip": 0.01115249, + "auxiliary_loss_mlp": 0.01033468, + "balance_loss_clip": 1.03953803, + "balance_loss_mlp": 1.02099848, + "epoch": 0.7918232376371561, + "flos": 47435730321600.0, + "grad_norm": 1.8722591907444062, + "language_loss": 0.67016387, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.69165099, + "num_input_tokens_seen": 284171910, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12481689, + "step": 13170, + "time_per_iteration": 2.8317711353302 + }, + { + "auxiliary_loss_clip": 0.0111288, + "auxiliary_loss_mlp": 0.01025836, + "balance_loss_clip": 1.03839493, + "balance_loss_mlp": 1.01443923, + "epoch": 0.7918833608898241, + "flos": 25350520272480.0, + "grad_norm": 2.1090668851950167, + "language_loss": 0.70821565, + "learning_rate": 4.372914494109412e-07, + "loss": 0.72960281, + "num_input_tokens_seen": 284191340, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11407471, + "step": 13171, + "time_per_iteration": 2.6008360385894775 + }, + { + "auxiliary_loss_clip": 0.0111107, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.03828549, + "balance_loss_mlp": 1.01628733, + "epoch": 0.7919434841424922, + "flos": 41378369235840.0, + "grad_norm": 2.188069202288376, + "language_loss": 0.66760767, + "learning_rate": 4.370484207842553e-07, + "loss": 0.68899441, + "num_input_tokens_seen": 284212495, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11322021, + "step": 13172, + "time_per_iteration": 2.774604558944702 + }, + { + "auxiliary_loss_clip": 0.01114215, + "auxiliary_loss_mlp": 0.01032008, + "balance_loss_clip": 1.04054463, + "balance_loss_mlp": 1.02056408, + "epoch": 0.7920036073951601, + "flos": 25703577564480.0, + "grad_norm": 1.7711843896795698, + "language_loss": 0.79631317, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.81777537, + "num_input_tokens_seen": 284230825, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11456299, + "step": 13173, + "time_per_iteration": 2.6665549278259277 + }, + { + "auxiliary_loss_clip": 0.01113008, + "auxiliary_loss_mlp": 0.01027607, + "balance_loss_clip": 1.03975821, + "balance_loss_mlp": 1.01697314, + "epoch": 0.7920637306478281, + "flos": 28865621941920.0, + "grad_norm": 1.8564961009199878, + "language_loss": 0.76654732, + "learning_rate": 4.365625413419365e-07, + "loss": 0.78795344, + "num_input_tokens_seen": 284250365, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.10638428, + "step": 13174, + "time_per_iteration": 2.6376402378082275 + }, + { + "auxiliary_loss_clip": 0.01110262, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.03887248, + "balance_loss_mlp": 1.02368426, + "epoch": 0.792123853900496, + "flos": 33184715243040.0, + "grad_norm": 1.7488001894702343, + "language_loss": 0.7156747, + "learning_rate": 4.363196905447297e-07, + "loss": 0.73712111, + "num_input_tokens_seen": 284269635, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10693359, + "step": 13175, + "time_per_iteration": 2.6836178302764893 + }, + { + "auxiliary_loss_clip": 0.01112402, + "auxiliary_loss_mlp": 0.010299, + "balance_loss_clip": 1.03945601, + "balance_loss_mlp": 1.01829469, + "epoch": 0.792183977153164, + "flos": 23304122508960.0, + "grad_norm": 1.9570013717533843, + "language_loss": 0.60017157, + "learning_rate": 4.360768990424364e-07, + "loss": 0.62159455, + "num_input_tokens_seen": 284288380, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11608887, + "step": 13176, + "time_per_iteration": 2.592226266860962 + }, + { + "auxiliary_loss_clip": 0.01115023, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.04287267, + "balance_loss_mlp": 1.02004194, + "epoch": 0.7922441004058319, + "flos": 20900372621760.0, + "grad_norm": 1.8010860019034869, + "language_loss": 0.73440552, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.75586575, + "num_input_tokens_seen": 284306920, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10961914, + "step": 13177, + "time_per_iteration": 2.6554200649261475 + }, + { + "auxiliary_loss_clip": 0.01111547, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.04068875, + "balance_loss_mlp": 1.02373731, + "epoch": 0.7923042236585, + "flos": 21745807080480.0, + "grad_norm": 4.875723550107734, + "language_loss": 0.64209956, + "learning_rate": 4.355914939594174e-07, + "loss": 0.66356802, + "num_input_tokens_seen": 284324700, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11572266, + "step": 13178, + "time_per_iteration": 2.5754692554473877 + }, + { + "auxiliary_loss_clip": 0.01111279, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.03790188, + "balance_loss_mlp": 1.02172732, + "epoch": 0.7923643469111679, + "flos": 36527684391360.0, + "grad_norm": 1.3689315936795345, + "language_loss": 0.68502504, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.70645905, + "num_input_tokens_seen": 284345985, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.10400391, + "step": 13179, + "time_per_iteration": 2.7047290802001953 + }, + { + "auxiliary_loss_clip": 0.01109675, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.03762007, + "balance_loss_mlp": 1.02011907, + "epoch": 0.7924244701638359, + "flos": 27667859502240.0, + "grad_norm": 1.9966108070217892, + "language_loss": 0.74219978, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.76361156, + "num_input_tokens_seen": 284364475, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11395264, + "step": 13180, + "time_per_iteration": 2.633918523788452 + }, + { + "auxiliary_loss_clip": 0.01116316, + "auxiliary_loss_mlp": 0.01035432, + "balance_loss_clip": 1.04132533, + "balance_loss_mlp": 1.02296853, + "epoch": 0.7924845934165038, + "flos": 21924543918240.0, + "grad_norm": 3.48170498017563, + "language_loss": 0.82113183, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.84264922, + "num_input_tokens_seen": 284382125, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12481689, + "step": 13181, + "time_per_iteration": 2.675954818725586 + }, + { + "auxiliary_loss_clip": 0.0111073, + "auxiliary_loss_mlp": 0.0103179, + "balance_loss_clip": 1.0388459, + "balance_loss_mlp": 1.01982725, + "epoch": 0.7925447166691718, + "flos": 28646860381920.0, + "grad_norm": 1.8650644766925917, + "language_loss": 0.77615041, + "learning_rate": 4.346213957372895e-07, + "loss": 0.79757559, + "num_input_tokens_seen": 284401585, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11950684, + "step": 13182, + "time_per_iteration": 2.698049306869507 + }, + { + "auxiliary_loss_clip": 0.01117272, + "auxiliary_loss_mlp": 0.01032764, + "balance_loss_clip": 1.04143393, + "balance_loss_mlp": 1.02027667, + "epoch": 0.7926048399218397, + "flos": 24950266699680.0, + "grad_norm": 2.0655708380514106, + "language_loss": 0.74164796, + "learning_rate": 4.34379019557056e-07, + "loss": 0.76314831, + "num_input_tokens_seen": 284419125, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12493896, + "step": 13183, + "time_per_iteration": 2.7576162815093994 + }, + { + "auxiliary_loss_clip": 0.0111327, + "auxiliary_loss_mlp": 0.01026573, + "balance_loss_clip": 1.04056346, + "balance_loss_mlp": 1.01500952, + "epoch": 0.7926649631745077, + "flos": 45343919037600.0, + "grad_norm": 1.9021129103289265, + "language_loss": 0.68311846, + "learning_rate": 4.341367027453264e-07, + "loss": 0.70451689, + "num_input_tokens_seen": 284440445, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11572266, + "step": 13184, + "time_per_iteration": 2.7404510974884033 + }, + { + "auxiliary_loss_clip": 0.01115864, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.04032397, + "balance_loss_mlp": 1.01831388, + "epoch": 0.7927250864271758, + "flos": 20765914820640.0, + "grad_norm": 2.1524469507483444, + "language_loss": 0.70801425, + "learning_rate": 4.338944453112907e-07, + "loss": 0.72947276, + "num_input_tokens_seen": 284459370, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11663818, + "step": 13185, + "time_per_iteration": 2.624260663986206 + }, + { + "auxiliary_loss_clip": 0.01114439, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.03961396, + "balance_loss_mlp": 1.01824582, + "epoch": 0.7927852096798437, + "flos": 21657370559040.0, + "grad_norm": 2.061994979813404, + "language_loss": 0.65130228, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.67274803, + "num_input_tokens_seen": 284477525, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11895752, + "step": 13186, + "time_per_iteration": 2.578601837158203 + }, + { + "auxiliary_loss_clip": 0.01111737, + "auxiliary_loss_mlp": 0.01029588, + "balance_loss_clip": 1.03952479, + "balance_loss_mlp": 1.01876998, + "epoch": 0.7928453329325117, + "flos": 29088273160800.0, + "grad_norm": 1.8371358947939493, + "language_loss": 0.77044868, + "learning_rate": 4.334101086130408e-07, + "loss": 0.79186189, + "num_input_tokens_seen": 284496590, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.1081543, + "step": 13187, + "time_per_iteration": 2.6858131885528564 + }, + { + "auxiliary_loss_clip": 0.01112014, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.04005861, + "balance_loss_mlp": 1.02052379, + "epoch": 0.7929054561851796, + "flos": 21298478778720.0, + "grad_norm": 2.1200062644190445, + "language_loss": 0.72856772, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.75000602, + "num_input_tokens_seen": 284511470, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11297607, + "step": 13188, + "time_per_iteration": 2.6537187099456787 + }, + { + "auxiliary_loss_clip": 0.01116074, + "auxiliary_loss_mlp": 0.01040193, + "balance_loss_clip": 1.04021883, + "balance_loss_mlp": 1.02747965, + "epoch": 0.7929655794378476, + "flos": 26821695732480.0, + "grad_norm": 2.813039978270101, + "language_loss": 0.6304636, + "learning_rate": 4.329260095357725e-07, + "loss": 0.6520263, + "num_input_tokens_seen": 284531125, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12713623, + "step": 13189, + "time_per_iteration": 2.609262228012085 + }, + { + "auxiliary_loss_clip": 0.01111916, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.03900301, + "balance_loss_mlp": 1.02004504, + "epoch": 0.7930257026905155, + "flos": 21077488768320.0, + "grad_norm": 2.0945912512728992, + "language_loss": 0.72694486, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.74837255, + "num_input_tokens_seen": 284549340, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.1081543, + "step": 13190, + "time_per_iteration": 2.648456573486328 + }, + { + "auxiliary_loss_clip": 0.01110053, + "auxiliary_loss_mlp": 0.01026765, + "balance_loss_clip": 1.04100454, + "balance_loss_mlp": 1.01695943, + "epoch": 0.7930858259431836, + "flos": 33313419590400.0, + "grad_norm": 1.9364316441168008, + "language_loss": 0.73117828, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75254643, + "num_input_tokens_seen": 284567060, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.09802246, + "step": 13191, + "time_per_iteration": 2.655770778656006 + }, + { + "auxiliary_loss_clip": 0.01113188, + "auxiliary_loss_mlp": 0.01038769, + "balance_loss_clip": 1.0400877, + "balance_loss_mlp": 1.02696681, + "epoch": 0.7931459491958515, + "flos": 24237831420000.0, + "grad_norm": 2.3168854517967556, + "language_loss": 0.69334185, + "learning_rate": 4.322003066198219e-07, + "loss": 0.71486139, + "num_input_tokens_seen": 284586600, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11798096, + "step": 13192, + "time_per_iteration": 2.630629301071167 + }, + { + "auxiliary_loss_clip": 0.01112527, + "auxiliary_loss_mlp": 0.01032373, + "balance_loss_clip": 1.0390743, + "balance_loss_mlp": 1.02105999, + "epoch": 0.7932060724485195, + "flos": 28244499910560.0, + "grad_norm": 5.08578768351462, + "language_loss": 0.75199264, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.77344161, + "num_input_tokens_seen": 284605715, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11315918, + "step": 13193, + "time_per_iteration": 2.643894910812378 + }, + { + "auxiliary_loss_clip": 0.01114041, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.04090416, + "balance_loss_mlp": 1.02115786, + "epoch": 0.7932661957011874, + "flos": 36527279218560.0, + "grad_norm": 1.624652611582168, + "language_loss": 0.72501904, + "learning_rate": 4.317168019161741e-07, + "loss": 0.74650097, + "num_input_tokens_seen": 284628540, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.13012695, + "step": 13194, + "time_per_iteration": 4.187785387039185 + }, + { + "auxiliary_loss_clip": 0.01116619, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.04077029, + "balance_loss_mlp": 1.02227235, + "epoch": 0.7933263189538554, + "flos": 27527364626400.0, + "grad_norm": 2.3451471825018726, + "language_loss": 0.69999498, + "learning_rate": 4.314751387639517e-07, + "loss": 0.72150302, + "num_input_tokens_seen": 284646040, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11914062, + "step": 13195, + "time_per_iteration": 2.616849899291992 + }, + { + "auxiliary_loss_clip": 0.01113234, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.04025722, + "balance_loss_mlp": 1.01893532, + "epoch": 0.7933864422065233, + "flos": 31090391887680.0, + "grad_norm": 1.5853976223811748, + "language_loss": 0.7767784, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.79821348, + "num_input_tokens_seen": 284665110, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11334229, + "step": 13196, + "time_per_iteration": 2.678196668624878 + }, + { + "auxiliary_loss_clip": 0.01116962, + "auxiliary_loss_mlp": 0.01037023, + "balance_loss_clip": 1.04169464, + "balance_loss_mlp": 1.02529883, + "epoch": 0.7934465654591913, + "flos": 40979250146880.0, + "grad_norm": 5.808842059201876, + "language_loss": 0.69093823, + "learning_rate": 4.309919909045268e-07, + "loss": 0.71247804, + "num_input_tokens_seen": 284686515, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11737061, + "step": 13197, + "time_per_iteration": 2.78387713432312 + }, + { + "auxiliary_loss_clip": 0.01110681, + "auxiliary_loss_mlp": 0.01027956, + "balance_loss_clip": 1.03860641, + "balance_loss_mlp": 1.01691139, + "epoch": 0.7935066887118594, + "flos": 38354712835680.0, + "grad_norm": 1.9046197395845337, + "language_loss": 0.65075874, + "learning_rate": 4.30750506215646e-07, + "loss": 0.67214513, + "num_input_tokens_seen": 284707300, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11029053, + "step": 13198, + "time_per_iteration": 2.689436197280884 + }, + { + "auxiliary_loss_clip": 0.01117699, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.04181111, + "balance_loss_mlp": 1.01833701, + "epoch": 0.7935668119645273, + "flos": 17915444390880.0, + "grad_norm": 2.2341868952159865, + "language_loss": 0.72367877, + "learning_rate": 4.30509081032864e-07, + "loss": 0.74516439, + "num_input_tokens_seen": 284723545, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12530518, + "step": 13199, + "time_per_iteration": 2.7184855937957764 + }, + { + "auxiliary_loss_clip": 0.01113014, + "auxiliary_loss_mlp": 0.01027162, + "balance_loss_clip": 1.03984714, + "balance_loss_mlp": 1.01611114, + "epoch": 0.7936269352171953, + "flos": 21968620368480.0, + "grad_norm": 2.1009676387185094, + "language_loss": 0.80245805, + "learning_rate": 4.302677153653349e-07, + "loss": 0.82385981, + "num_input_tokens_seen": 284742650, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11053467, + "step": 13200, + "time_per_iteration": 3.9976649284362793 + }, + { + "auxiliary_loss_clip": 0.01112064, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.04161489, + "balance_loss_mlp": 1.01976669, + "epoch": 0.7936870584698632, + "flos": 23037516391680.0, + "grad_norm": 2.0413963174611456, + "language_loss": 0.77075398, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.79218435, + "num_input_tokens_seen": 284760955, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11212158, + "step": 13201, + "time_per_iteration": 2.608760118484497 + }, + { + "auxiliary_loss_clip": 0.01110407, + "auxiliary_loss_mlp": 0.01027213, + "balance_loss_clip": 1.03859305, + "balance_loss_mlp": 1.01654983, + "epoch": 0.7937471817225312, + "flos": 28512726719040.0, + "grad_norm": 1.67067577187919, + "language_loss": 0.67043221, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.6918084, + "num_input_tokens_seen": 284780745, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10668945, + "step": 13202, + "time_per_iteration": 2.6607747077941895 + }, + { + "auxiliary_loss_clip": 0.01114196, + "auxiliary_loss_mlp": 0.01035981, + "balance_loss_clip": 1.04066479, + "balance_loss_mlp": 1.02409601, + "epoch": 0.7938073049751991, + "flos": 27667535364000.0, + "grad_norm": 2.2151608030331724, + "language_loss": 0.74934161, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.77084339, + "num_input_tokens_seen": 284799000, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11877441, + "step": 13203, + "time_per_iteration": 2.608814239501953 + }, + { + "auxiliary_loss_clip": 0.01112662, + "auxiliary_loss_mlp": 0.0103465, + "balance_loss_clip": 1.03894866, + "balance_loss_mlp": 1.02386141, + "epoch": 0.7938674282278672, + "flos": 27884027956320.0, + "grad_norm": 1.7963959512650656, + "language_loss": 0.66313052, + "learning_rate": 4.293028480307643e-07, + "loss": 0.68460363, + "num_input_tokens_seen": 284817450, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.10797119, + "step": 13204, + "time_per_iteration": 2.7183210849761963 + }, + { + "auxiliary_loss_clip": 0.01109536, + "auxiliary_loss_mlp": 0.01027519, + "balance_loss_clip": 1.03806758, + "balance_loss_mlp": 1.0164144, + "epoch": 0.7939275514805351, + "flos": 32961537299520.0, + "grad_norm": 1.3756088362506391, + "language_loss": 0.79336375, + "learning_rate": 4.290617800767438e-07, + "loss": 0.81473428, + "num_input_tokens_seen": 284838865, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11108398, + "step": 13205, + "time_per_iteration": 2.7787845134735107 + }, + { + "auxiliary_loss_clip": 0.01108885, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.03814888, + "balance_loss_mlp": 1.01728809, + "epoch": 0.7939876747332031, + "flos": 25798496850720.0, + "grad_norm": 1.814880993426505, + "language_loss": 0.7793411, + "learning_rate": 4.28820771692858e-07, + "loss": 0.80071443, + "num_input_tokens_seen": 284857975, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.1116333, + "step": 13206, + "time_per_iteration": 4.158301591873169 + }, + { + "auxiliary_loss_clip": 0.01116436, + "auxiliary_loss_mlp": 0.01029722, + "balance_loss_clip": 1.04052114, + "balance_loss_mlp": 1.01750326, + "epoch": 0.794047797985871, + "flos": 28418091053760.0, + "grad_norm": 2.0401198324585796, + "language_loss": 0.79087424, + "learning_rate": 4.285798228882456e-07, + "loss": 0.81233585, + "num_input_tokens_seen": 284877145, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12219238, + "step": 13207, + "time_per_iteration": 2.674394369125366 + }, + { + "auxiliary_loss_clip": 0.01112324, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.03976846, + "balance_loss_mlp": 1.02148044, + "epoch": 0.794107921238539, + "flos": 30027127766400.0, + "grad_norm": 1.9338807053443643, + "language_loss": 0.84015548, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.86160392, + "num_input_tokens_seen": 284895560, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11053467, + "step": 13208, + "time_per_iteration": 4.077382564544678 + }, + { + "auxiliary_loss_clip": 0.01033132, + "auxiliary_loss_mlp": 0.01001616, + "balance_loss_clip": 1.0106765, + "balance_loss_mlp": 1.00059366, + "epoch": 0.7941680444912069, + "flos": 78206410638720.0, + "grad_norm": 0.7257985356170287, + "language_loss": 0.58402979, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60437727, + "num_input_tokens_seen": 284963135, + "router_z_loss_clip": 0.2244873, + "router_z_loss_mlp": 0.01022339, + "step": 13209, + "time_per_iteration": 3.42984938621521 + }, + { + "auxiliary_loss_clip": 0.01119002, + "auxiliary_loss_mlp": 0.01027928, + "balance_loss_clip": 1.04259312, + "balance_loss_mlp": 1.01611483, + "epoch": 0.794228167743875, + "flos": 29759630268960.0, + "grad_norm": 2.7898791286239297, + "language_loss": 0.62993592, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.65140522, + "num_input_tokens_seen": 284981755, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11810303, + "step": 13210, + "time_per_iteration": 2.6927521228790283 + }, + { + "auxiliary_loss_clip": 0.01112658, + "auxiliary_loss_mlp": 0.01029908, + "balance_loss_clip": 1.03839135, + "balance_loss_mlp": 1.01884532, + "epoch": 0.794288290996543, + "flos": 34791564022560.0, + "grad_norm": 1.7529622056893366, + "language_loss": 0.6916492, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.7130748, + "num_input_tokens_seen": 285003060, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.1105957, + "step": 13211, + "time_per_iteration": 2.6680243015289307 + }, + { + "auxiliary_loss_clip": 0.01116187, + "auxiliary_loss_mlp": 0.01038448, + "balance_loss_clip": 1.04026878, + "balance_loss_mlp": 1.02555537, + "epoch": 0.7943484142492109, + "flos": 31630573094400.0, + "grad_norm": 1.6740474939724854, + "language_loss": 0.72374624, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.7452926, + "num_input_tokens_seen": 285021640, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12890625, + "step": 13212, + "time_per_iteration": 2.7193942070007324 + }, + { + "auxiliary_loss_clip": 0.01108778, + "auxiliary_loss_mlp": 0.01027906, + "balance_loss_clip": 1.03854036, + "balance_loss_mlp": 1.01704025, + "epoch": 0.7944085375018789, + "flos": 29181693307680.0, + "grad_norm": 1.8412464391028236, + "language_loss": 0.80728555, + "learning_rate": 4.271353817368246e-07, + "loss": 0.82865238, + "num_input_tokens_seen": 285040490, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10864258, + "step": 13213, + "time_per_iteration": 2.7095463275909424 + }, + { + "auxiliary_loss_clip": 0.01118754, + "auxiliary_loss_mlp": 0.01030038, + "balance_loss_clip": 1.04234076, + "balance_loss_mlp": 1.01827836, + "epoch": 0.7944686607545468, + "flos": 24684592479840.0, + "grad_norm": 2.401988773741907, + "language_loss": 0.67927295, + "learning_rate": 4.268948502428327e-07, + "loss": 0.70076084, + "num_input_tokens_seen": 285059270, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.11761475, + "step": 13214, + "time_per_iteration": 2.7150955200195312 + }, + { + "auxiliary_loss_clip": 0.01110948, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.03946447, + "balance_loss_mlp": 1.01925552, + "epoch": 0.7945287840072148, + "flos": 26821371594240.0, + "grad_norm": 1.9318718762740141, + "language_loss": 0.72458076, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.74599314, + "num_input_tokens_seen": 285075390, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11035156, + "step": 13215, + "time_per_iteration": 2.709984302520752 + }, + { + "auxiliary_loss_clip": 0.01114373, + "auxiliary_loss_mlp": 0.01025971, + "balance_loss_clip": 1.0423317, + "balance_loss_mlp": 1.0150094, + "epoch": 0.7945889072598827, + "flos": 32210819540640.0, + "grad_norm": 1.9425263574445137, + "language_loss": 0.78777879, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.80918223, + "num_input_tokens_seen": 285096290, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10961914, + "step": 13216, + "time_per_iteration": 2.716447591781616 + }, + { + "auxiliary_loss_clip": 0.01113905, + "auxiliary_loss_mlp": 0.01032912, + "balance_loss_clip": 1.04037976, + "balance_loss_mlp": 1.02205801, + "epoch": 0.7946490305125508, + "flos": 31494292015680.0, + "grad_norm": 1.6663113021799634, + "language_loss": 0.7377131, + "learning_rate": 4.261736137111598e-07, + "loss": 0.75918132, + "num_input_tokens_seen": 285116020, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10858154, + "step": 13217, + "time_per_iteration": 2.7256524562835693 + }, + { + "auxiliary_loss_clip": 0.01110106, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.03948665, + "balance_loss_mlp": 1.02305102, + "epoch": 0.7947091537652187, + "flos": 19475258958720.0, + "grad_norm": 2.2273170488621243, + "language_loss": 0.74137968, + "learning_rate": 4.259333208810907e-07, + "loss": 0.76282442, + "num_input_tokens_seen": 285133510, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11328125, + "step": 13218, + "time_per_iteration": 2.640054941177368 + }, + { + "auxiliary_loss_clip": 0.01113676, + "auxiliary_loss_mlp": 0.01036367, + "balance_loss_clip": 1.03858757, + "balance_loss_mlp": 1.02400494, + "epoch": 0.7947692770178867, + "flos": 22680812544480.0, + "grad_norm": 2.116620639723658, + "language_loss": 0.82999742, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.85149783, + "num_input_tokens_seen": 285151690, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12347412, + "step": 13219, + "time_per_iteration": 2.580186367034912 + }, + { + "auxiliary_loss_clip": 0.0111806, + "auxiliary_loss_mlp": 0.01031528, + "balance_loss_clip": 1.04161847, + "balance_loss_mlp": 1.01889157, + "epoch": 0.7948294002705546, + "flos": 24944229624960.0, + "grad_norm": 1.8294679686315414, + "language_loss": 0.75822324, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.77971911, + "num_input_tokens_seen": 285170485, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12634277, + "step": 13220, + "time_per_iteration": 2.7029221057891846 + }, + { + "auxiliary_loss_clip": 0.0111785, + "auxiliary_loss_mlp": 0.0103636, + "balance_loss_clip": 1.04162526, + "balance_loss_mlp": 1.02491581, + "epoch": 0.7948895235232226, + "flos": 46590052759200.0, + "grad_norm": 1.8261738886488994, + "language_loss": 0.72076041, + "learning_rate": 4.252128005599176e-07, + "loss": 0.74230254, + "num_input_tokens_seen": 285191050, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.11444092, + "step": 13221, + "time_per_iteration": 2.772141933441162 + }, + { + "auxiliary_loss_clip": 0.01110993, + "auxiliary_loss_mlp": 0.01026702, + "balance_loss_clip": 1.04027402, + "balance_loss_mlp": 1.0154072, + "epoch": 0.7949496467758905, + "flos": 18985718001600.0, + "grad_norm": 2.3894163789765224, + "language_loss": 0.74466252, + "learning_rate": 4.249727465395634e-07, + "loss": 0.76603949, + "num_input_tokens_seen": 285208750, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11297607, + "step": 13222, + "time_per_iteration": 2.596259355545044 + }, + { + "auxiliary_loss_clip": 0.01031788, + "auxiliary_loss_mlp": 0.01001773, + "balance_loss_clip": 1.00941229, + "balance_loss_mlp": 1.00070596, + "epoch": 0.7950097700285585, + "flos": 86510171547360.0, + "grad_norm": 0.7669648379012256, + "language_loss": 0.67002499, + "learning_rate": 4.247327522443993e-07, + "loss": 0.69036067, + "num_input_tokens_seen": 285264605, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.01067352, + "step": 13223, + "time_per_iteration": 3.097990036010742 + }, + { + "auxiliary_loss_clip": 0.01112182, + "auxiliary_loss_mlp": 0.01030081, + "balance_loss_clip": 1.03858483, + "balance_loss_mlp": 1.01811814, + "epoch": 0.7950698932812266, + "flos": 29226053378880.0, + "grad_norm": 1.6854757058413512, + "language_loss": 0.7117461, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.73316878, + "num_input_tokens_seen": 285283940, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11962891, + "step": 13224, + "time_per_iteration": 2.6153242588043213 + }, + { + "auxiliary_loss_clip": 0.01032569, + "auxiliary_loss_mlp": 0.01001834, + "balance_loss_clip": 1.01007116, + "balance_loss_mlp": 1.00087345, + "epoch": 0.7951300165338945, + "flos": 73557308027520.0, + "grad_norm": 0.6679273074633822, + "language_loss": 0.55039907, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57074308, + "num_input_tokens_seen": 285349525, + "router_z_loss_clip": 0.22509766, + "router_z_loss_mlp": 0.00959778, + "step": 13225, + "time_per_iteration": 3.3097856044769287 + }, + { + "auxiliary_loss_clip": 0.01108935, + "auxiliary_loss_mlp": 0.01021306, + "balance_loss_clip": 1.03798413, + "balance_loss_mlp": 1.01073766, + "epoch": 0.7951901397865625, + "flos": 27844084268640.0, + "grad_norm": 2.253561964978475, + "language_loss": 0.65011346, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.67141587, + "num_input_tokens_seen": 285367355, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10571289, + "step": 13226, + "time_per_iteration": 2.618239641189575 + }, + { + "auxiliary_loss_clip": 0.01114071, + "auxiliary_loss_mlp": 0.01035791, + "balance_loss_clip": 1.03958917, + "balance_loss_mlp": 1.02437627, + "epoch": 0.7952502630392304, + "flos": 43557442040160.0, + "grad_norm": 2.8137374376123123, + "language_loss": 0.69908279, + "learning_rate": 4.237733724976349e-07, + "loss": 0.72058141, + "num_input_tokens_seen": 285386190, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11419678, + "step": 13227, + "time_per_iteration": 2.8144562244415283 + }, + { + "auxiliary_loss_clip": 0.01110698, + "auxiliary_loss_mlp": 0.01025226, + "balance_loss_clip": 1.03939748, + "balance_loss_mlp": 1.01483703, + "epoch": 0.7953103862918984, + "flos": 31273950281760.0, + "grad_norm": 1.8437540679425148, + "language_loss": 0.68896592, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.71032512, + "num_input_tokens_seen": 285406150, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10400391, + "step": 13228, + "time_per_iteration": 2.730753183364868 + }, + { + "auxiliary_loss_clip": 0.0111298, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.03909981, + "balance_loss_mlp": 1.02254868, + "epoch": 0.7953705095445663, + "flos": 49483141017120.0, + "grad_norm": 1.4816513935452054, + "language_loss": 0.71061182, + "learning_rate": 4.232940412119095e-07, + "loss": 0.73207986, + "num_input_tokens_seen": 285429900, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11260986, + "step": 13229, + "time_per_iteration": 2.81504487991333 + }, + { + "auxiliary_loss_clip": 0.01117697, + "auxiliary_loss_mlp": 0.01035302, + "balance_loss_clip": 1.04161167, + "balance_loss_mlp": 1.02336335, + "epoch": 0.7954306327972344, + "flos": 33722830068480.0, + "grad_norm": 5.288062152655741, + "language_loss": 0.72347558, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.74500561, + "num_input_tokens_seen": 285452555, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11950684, + "step": 13230, + "time_per_iteration": 2.684627056121826 + }, + { + "auxiliary_loss_clip": 0.01032038, + "auxiliary_loss_mlp": 0.01001575, + "balance_loss_clip": 1.00951803, + "balance_loss_mlp": 1.00056839, + "epoch": 0.7954907560499023, + "flos": 72605089069920.0, + "grad_norm": 0.9009975621905852, + "language_loss": 0.63502955, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65536571, + "num_input_tokens_seen": 285515700, + "router_z_loss_clip": 0.22521973, + "router_z_loss_mlp": 0.01006317, + "step": 13231, + "time_per_iteration": 3.2328619956970215 + }, + { + "auxiliary_loss_clip": 0.01111597, + "auxiliary_loss_mlp": 0.01029756, + "balance_loss_clip": 1.03934336, + "balance_loss_mlp": 1.01874077, + "epoch": 0.7955508793025703, + "flos": 24551066576160.0, + "grad_norm": 1.68303924895953, + "language_loss": 0.69861829, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.72003186, + "num_input_tokens_seen": 285533910, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11022949, + "step": 13232, + "time_per_iteration": 2.6616201400756836 + }, + { + "auxiliary_loss_clip": 0.01113247, + "auxiliary_loss_mlp": 0.01028052, + "balance_loss_clip": 1.03970528, + "balance_loss_mlp": 1.01647687, + "epoch": 0.7956110025552382, + "flos": 32342360097600.0, + "grad_norm": 2.3205332140102497, + "language_loss": 0.77719021, + "learning_rate": 4.223360961792952e-07, + "loss": 0.79860324, + "num_input_tokens_seen": 285554080, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11578369, + "step": 13233, + "time_per_iteration": 2.6626076698303223 + }, + { + "auxiliary_loss_clip": 0.01111639, + "auxiliary_loss_mlp": 0.01031313, + "balance_loss_clip": 1.03780389, + "balance_loss_mlp": 1.01999998, + "epoch": 0.7956711258079062, + "flos": 27534536184960.0, + "grad_norm": 2.0183050646045655, + "language_loss": 0.78648204, + "learning_rate": 4.220967594613769e-07, + "loss": 0.80791152, + "num_input_tokens_seen": 285572325, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11309814, + "step": 13234, + "time_per_iteration": 4.099616765975952 + }, + { + "auxiliary_loss_clip": 0.01111899, + "auxiliary_loss_mlp": 0.01025786, + "balance_loss_clip": 1.04008949, + "balance_loss_mlp": 1.01523614, + "epoch": 0.7957312490605741, + "flos": 21206112081120.0, + "grad_norm": 2.2891574178812735, + "language_loss": 0.70038307, + "learning_rate": 4.218574825777077e-07, + "loss": 0.72175992, + "num_input_tokens_seen": 285589770, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10546875, + "step": 13235, + "time_per_iteration": 2.63442063331604 + }, + { + "auxiliary_loss_clip": 0.01113595, + "auxiliary_loss_mlp": 0.01029446, + "balance_loss_clip": 1.03971267, + "balance_loss_mlp": 1.01784098, + "epoch": 0.7957913723132422, + "flos": 27444276385920.0, + "grad_norm": 1.4577848024118332, + "language_loss": 0.67691523, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.6983456, + "num_input_tokens_seen": 285610065, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11608887, + "step": 13236, + "time_per_iteration": 2.63321852684021 + }, + { + "auxiliary_loss_clip": 0.0111097, + "auxiliary_loss_mlp": 0.01025819, + "balance_loss_clip": 1.03840637, + "balance_loss_mlp": 1.01448238, + "epoch": 0.7958514955659101, + "flos": 27623175292800.0, + "grad_norm": 1.6468688024011182, + "language_loss": 0.75120962, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.77257752, + "num_input_tokens_seen": 285628480, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11340332, + "step": 13237, + "time_per_iteration": 2.6359474658966064 + }, + { + "auxiliary_loss_clip": 0.01113616, + "auxiliary_loss_mlp": 0.01035632, + "balance_loss_clip": 1.04030538, + "balance_loss_mlp": 1.02370548, + "epoch": 0.7959116188185781, + "flos": 25263339786720.0, + "grad_norm": 7.262708541313649, + "language_loss": 0.71453333, + "learning_rate": 4.211400110229175e-07, + "loss": 0.73602587, + "num_input_tokens_seen": 285647805, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11932373, + "step": 13238, + "time_per_iteration": 2.630067825317383 + }, + { + "auxiliary_loss_clip": 0.01111235, + "auxiliary_loss_mlp": 0.01029311, + "balance_loss_clip": 1.03730512, + "balance_loss_mlp": 1.01779532, + "epoch": 0.7959717420712461, + "flos": 23213943744480.0, + "grad_norm": 1.8445955599435964, + "language_loss": 0.73864365, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.7600491, + "num_input_tokens_seen": 285665505, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11505127, + "step": 13239, + "time_per_iteration": 3.8913586139678955 + }, + { + "auxiliary_loss_clip": 0.01116638, + "auxiliary_loss_mlp": 0.01032907, + "balance_loss_clip": 1.0404371, + "balance_loss_mlp": 1.02113485, + "epoch": 0.796031865323914, + "flos": 32161111188480.0, + "grad_norm": 1.9602177321001037, + "language_loss": 0.69228375, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.71377921, + "num_input_tokens_seen": 285685855, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11773682, + "step": 13240, + "time_per_iteration": 2.6837100982666016 + }, + { + "auxiliary_loss_clip": 0.0103206, + "auxiliary_loss_mlp": 0.01000297, + "balance_loss_clip": 1.00953674, + "balance_loss_mlp": 0.9992907, + "epoch": 0.796091988576582, + "flos": 75737879971200.0, + "grad_norm": 0.8883243216735769, + "language_loss": 0.5868398, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60716331, + "num_input_tokens_seen": 285735710, + "router_z_loss_clip": 0.22521973, + "router_z_loss_mlp": 0.01005554, + "step": 13241, + "time_per_iteration": 3.028165340423584 + }, + { + "auxiliary_loss_clip": 0.01111744, + "auxiliary_loss_mlp": 0.0102916, + "balance_loss_clip": 1.03962171, + "balance_loss_mlp": 1.01897371, + "epoch": 0.7961521118292499, + "flos": 47613778365600.0, + "grad_norm": 2.0850580649258883, + "language_loss": 0.64543402, + "learning_rate": 4.201842205128772e-07, + "loss": 0.666843, + "num_input_tokens_seen": 285757045, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10186768, + "step": 13242, + "time_per_iteration": 2.779789686203003 + }, + { + "auxiliary_loss_clip": 0.01114136, + "auxiliary_loss_mlp": 0.01032826, + "balance_loss_clip": 1.03992295, + "balance_loss_mlp": 1.02081537, + "epoch": 0.796212235081918, + "flos": 26555089615200.0, + "grad_norm": 2.216581511170231, + "language_loss": 0.7612657, + "learning_rate": 4.199454226296526e-07, + "loss": 0.78273529, + "num_input_tokens_seen": 285776050, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12023926, + "step": 13243, + "time_per_iteration": 2.61200213432312 + }, + { + "auxiliary_loss_clip": 0.01114349, + "auxiliary_loss_mlp": 0.0102883, + "balance_loss_clip": 1.03980076, + "balance_loss_mlp": 1.01678991, + "epoch": 0.7962723583345859, + "flos": 25842492266400.0, + "grad_norm": 1.8037629556267654, + "language_loss": 0.79168499, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.81311679, + "num_input_tokens_seen": 285796830, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12036133, + "step": 13244, + "time_per_iteration": 2.669450044631958 + }, + { + "auxiliary_loss_clip": 0.01114454, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.03857398, + "balance_loss_mlp": 1.018224, + "epoch": 0.7963324815872539, + "flos": 20900129518080.0, + "grad_norm": 2.451989255058146, + "language_loss": 0.68007827, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.70152652, + "num_input_tokens_seen": 285814755, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12158203, + "step": 13245, + "time_per_iteration": 4.059845685958862 + }, + { + "auxiliary_loss_clip": 0.01112791, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.0391345, + "balance_loss_mlp": 1.0211935, + "epoch": 0.7963926048399218, + "flos": 26109300970080.0, + "grad_norm": 1.5954737843855602, + "language_loss": 0.79003352, + "learning_rate": 4.192293885111549e-07, + "loss": 0.81148493, + "num_input_tokens_seen": 285834255, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.1114502, + "step": 13246, + "time_per_iteration": 2.638983964920044 + }, + { + "auxiliary_loss_clip": 0.01113549, + "auxiliary_loss_mlp": 0.01030112, + "balance_loss_clip": 1.03827047, + "balance_loss_mlp": 1.01827502, + "epoch": 0.7964527280925898, + "flos": 30730851830880.0, + "grad_norm": 2.151871255784176, + "language_loss": 0.66177994, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.68321651, + "num_input_tokens_seen": 285853540, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.1184082, + "step": 13247, + "time_per_iteration": 2.725010633468628 + }, + { + "auxiliary_loss_clip": 0.01109163, + "auxiliary_loss_mlp": 0.01029975, + "balance_loss_clip": 1.0375998, + "balance_loss_mlp": 1.01923978, + "epoch": 0.7965128513452577, + "flos": 32965345923840.0, + "grad_norm": 1.9468190010829947, + "language_loss": 0.71368361, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.735075, + "num_input_tokens_seen": 285872705, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10736084, + "step": 13248, + "time_per_iteration": 4.028868675231934 + }, + { + "auxiliary_loss_clip": 0.01113905, + "auxiliary_loss_mlp": 0.01028132, + "balance_loss_clip": 1.03902853, + "balance_loss_mlp": 1.01622844, + "epoch": 0.7965729745979258, + "flos": 29797669644480.0, + "grad_norm": 3.0251067940403438, + "language_loss": 0.76437902, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.78579938, + "num_input_tokens_seen": 285890290, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11895752, + "step": 13249, + "time_per_iteration": 2.706821918487549 + }, + { + "auxiliary_loss_clip": 0.01111414, + "auxiliary_loss_mlp": 0.01026602, + "balance_loss_clip": 1.03976095, + "balance_loss_mlp": 1.01527762, + "epoch": 0.7966330978505937, + "flos": 22989226144320.0, + "grad_norm": 22.504626438572526, + "language_loss": 0.61525607, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.63663626, + "num_input_tokens_seen": 285909190, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11340332, + "step": 13250, + "time_per_iteration": 2.8633105754852295 + }, + { + "auxiliary_loss_clip": 0.01112833, + "auxiliary_loss_mlp": 0.01026284, + "balance_loss_clip": 1.03980875, + "balance_loss_mlp": 1.01498318, + "epoch": 0.7966932211032617, + "flos": 16047864499680.0, + "grad_norm": 2.76613844315939, + "language_loss": 0.7209301, + "learning_rate": 4.180371972938206e-07, + "loss": 0.74232125, + "num_input_tokens_seen": 285927570, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11297607, + "step": 13251, + "time_per_iteration": 2.606351852416992 + }, + { + "auxiliary_loss_clip": 0.01116769, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.04115367, + "balance_loss_mlp": 1.01669073, + "epoch": 0.7967533443559297, + "flos": 29223946480320.0, + "grad_norm": 3.0437995837179983, + "language_loss": 0.72347653, + "learning_rate": 4.177989389787624e-07, + "loss": 0.74493498, + "num_input_tokens_seen": 285945810, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12390137, + "step": 13252, + "time_per_iteration": 2.6994969844818115 + }, + { + "auxiliary_loss_clip": 0.0111042, + "auxiliary_loss_mlp": 0.01025963, + "balance_loss_clip": 1.03978741, + "balance_loss_mlp": 1.01463771, + "epoch": 0.7968134676085976, + "flos": 37057169036160.0, + "grad_norm": 1.8030283935946296, + "language_loss": 0.66419327, + "learning_rate": 4.175607406609278e-07, + "loss": 0.68555713, + "num_input_tokens_seen": 285964235, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11315918, + "step": 13253, + "time_per_iteration": 2.6927433013916016 + }, + { + "auxiliary_loss_clip": 0.01116575, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.04178154, + "balance_loss_mlp": 1.01910162, + "epoch": 0.7968735908612656, + "flos": 28156468561920.0, + "grad_norm": 1.692960059470021, + "language_loss": 0.67803454, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.69951022, + "num_input_tokens_seen": 285983710, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11889648, + "step": 13254, + "time_per_iteration": 2.7163586616516113 + }, + { + "auxiliary_loss_clip": 0.01110619, + "auxiliary_loss_mlp": 0.01038515, + "balance_loss_clip": 1.03865218, + "balance_loss_mlp": 1.02689195, + "epoch": 0.7969337141139335, + "flos": 28775726798400.0, + "grad_norm": 2.023038838059804, + "language_loss": 0.69383401, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.71532536, + "num_input_tokens_seen": 286003425, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11621094, + "step": 13255, + "time_per_iteration": 2.6220004558563232 + }, + { + "auxiliary_loss_clip": 0.01111212, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.03859937, + "balance_loss_mlp": 1.01869011, + "epoch": 0.7969938373666016, + "flos": 24106047759360.0, + "grad_norm": 1.91573606647287, + "language_loss": 0.79068547, + "learning_rate": 4.168465057810733e-07, + "loss": 0.81209767, + "num_input_tokens_seen": 286020130, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11309814, + "step": 13256, + "time_per_iteration": 2.67149019241333 + }, + { + "auxiliary_loss_clip": 0.01113466, + "auxiliary_loss_mlp": 0.01028307, + "balance_loss_clip": 1.04003, + "balance_loss_mlp": 1.01671982, + "epoch": 0.7970539606192695, + "flos": 29448704597760.0, + "grad_norm": 1.7872190226238869, + "language_loss": 0.65828657, + "learning_rate": 4.166085475424315e-07, + "loss": 0.67970431, + "num_input_tokens_seen": 286040230, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11590576, + "step": 13257, + "time_per_iteration": 2.6192221641540527 + }, + { + "auxiliary_loss_clip": 0.01119086, + "auxiliary_loss_mlp": 0.01034653, + "balance_loss_clip": 1.04241323, + "balance_loss_mlp": 1.02310777, + "epoch": 0.7971140838719375, + "flos": 21925070642880.0, + "grad_norm": 2.9987465576297536, + "language_loss": 0.72255403, + "learning_rate": 4.163706493461523e-07, + "loss": 0.74409139, + "num_input_tokens_seen": 286059475, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11560059, + "step": 13258, + "time_per_iteration": 2.6891331672668457 + }, + { + "auxiliary_loss_clip": 0.01113041, + "auxiliary_loss_mlp": 0.01032002, + "balance_loss_clip": 1.03904271, + "balance_loss_mlp": 1.02031898, + "epoch": 0.7971742071246054, + "flos": 23390816787360.0, + "grad_norm": 1.9072499810162125, + "language_loss": 0.68789005, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.70934051, + "num_input_tokens_seen": 286077820, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11688232, + "step": 13259, + "time_per_iteration": 2.5841050148010254 + }, + { + "auxiliary_loss_clip": 0.01108923, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.03844357, + "balance_loss_mlp": 1.0179255, + "epoch": 0.7972343303772734, + "flos": 33099601138560.0, + "grad_norm": 2.3954984150795937, + "language_loss": 0.73564792, + "learning_rate": 4.158950331167641e-07, + "loss": 0.75701863, + "num_input_tokens_seen": 286097285, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.10223389, + "step": 13260, + "time_per_iteration": 2.668264150619507 + }, + { + "auxiliary_loss_clip": 0.01109556, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.037817, + "balance_loss_mlp": 1.01816487, + "epoch": 0.7972944536299413, + "flos": 25620124668480.0, + "grad_norm": 2.0398119126354124, + "language_loss": 0.78043818, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.80182475, + "num_input_tokens_seen": 286116000, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.10925293, + "step": 13261, + "time_per_iteration": 2.6558845043182373 + }, + { + "auxiliary_loss_clip": 0.01107236, + "auxiliary_loss_mlp": 0.0102856, + "balance_loss_clip": 1.03867245, + "balance_loss_mlp": 1.01820099, + "epoch": 0.7973545768826094, + "flos": 26332033223520.0, + "grad_norm": 3.500956062499082, + "language_loss": 0.75980043, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78115845, + "num_input_tokens_seen": 286135110, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.10357666, + "step": 13262, + "time_per_iteration": 2.7217442989349365 + }, + { + "auxiliary_loss_clip": 0.01118395, + "auxiliary_loss_mlp": 0.01030458, + "balance_loss_clip": 1.04156685, + "balance_loss_mlp": 1.01753616, + "epoch": 0.7974147001352773, + "flos": 25085534846400.0, + "grad_norm": 2.428140609437743, + "language_loss": 0.70520252, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.72669107, + "num_input_tokens_seen": 286152835, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.12921143, + "step": 13263, + "time_per_iteration": 2.7251672744750977 + }, + { + "auxiliary_loss_clip": 0.01118227, + "auxiliary_loss_mlp": 0.01033982, + "balance_loss_clip": 1.04076147, + "balance_loss_mlp": 1.02129841, + "epoch": 0.7974748233879453, + "flos": 25620286737600.0, + "grad_norm": 1.8752432735135898, + "language_loss": 0.71088928, + "learning_rate": 4.149445215631153e-07, + "loss": 0.73241138, + "num_input_tokens_seen": 286171785, + "router_z_loss_clip": 0.77539062, + "router_z_loss_mlp": 0.12701416, + "step": 13264, + "time_per_iteration": 2.6226940155029297 + }, + { + "auxiliary_loss_clip": 0.01111471, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.03993678, + "balance_loss_mlp": 1.01981306, + "epoch": 0.7975349466406133, + "flos": 27534212046720.0, + "grad_norm": 2.0088984210126357, + "language_loss": 0.77096629, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.79238999, + "num_input_tokens_seen": 286190420, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11083984, + "step": 13265, + "time_per_iteration": 2.629096269607544 + }, + { + "auxiliary_loss_clip": 0.01112519, + "auxiliary_loss_mlp": 0.01027504, + "balance_loss_clip": 1.03843284, + "balance_loss_mlp": 1.01643503, + "epoch": 0.7975950698932812, + "flos": 26465680679040.0, + "grad_norm": 2.0778689037449114, + "language_loss": 0.75321954, + "learning_rate": 4.144696263830285e-07, + "loss": 0.77461982, + "num_input_tokens_seen": 286210105, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11071777, + "step": 13266, + "time_per_iteration": 2.6139159202575684 + }, + { + "auxiliary_loss_clip": 0.01108823, + "auxiliary_loss_mlp": 0.01027905, + "balance_loss_clip": 1.03684163, + "balance_loss_mlp": 1.01699769, + "epoch": 0.7976551931459492, + "flos": 23921922123360.0, + "grad_norm": 1.687723134379971, + "language_loss": 0.84247828, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.86384559, + "num_input_tokens_seen": 286228180, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10913086, + "step": 13267, + "time_per_iteration": 2.6098504066467285 + }, + { + "auxiliary_loss_clip": 0.01111914, + "auxiliary_loss_mlp": 0.01031322, + "balance_loss_clip": 1.03944838, + "balance_loss_mlp": 1.01947224, + "epoch": 0.7977153163986171, + "flos": 26463492745920.0, + "grad_norm": 2.2408281026381998, + "language_loss": 0.76203388, + "learning_rate": 4.139949716968223e-07, + "loss": 0.78346622, + "num_input_tokens_seen": 286247305, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11853027, + "step": 13268, + "time_per_iteration": 2.6075634956359863 + }, + { + "auxiliary_loss_clip": 0.01111289, + "auxiliary_loss_mlp": 0.01027373, + "balance_loss_clip": 1.03864002, + "balance_loss_mlp": 1.0161078, + "epoch": 0.7977754396512852, + "flos": 28646657795520.0, + "grad_norm": 1.9980429493171148, + "language_loss": 0.77662134, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.79800797, + "num_input_tokens_seen": 286268145, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11260986, + "step": 13269, + "time_per_iteration": 2.6485698223114014 + }, + { + "auxiliary_loss_clip": 0.01106995, + "auxiliary_loss_mlp": 0.01032968, + "balance_loss_clip": 1.03680396, + "balance_loss_mlp": 1.02204883, + "epoch": 0.7978355629039531, + "flos": 27310993585920.0, + "grad_norm": 1.8657211838916983, + "language_loss": 0.82229972, + "learning_rate": 4.135205575764922e-07, + "loss": 0.84369934, + "num_input_tokens_seen": 286286775, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10925293, + "step": 13270, + "time_per_iteration": 2.6251890659332275 + }, + { + "auxiliary_loss_clip": 0.01111847, + "auxiliary_loss_mlp": 0.0103094, + "balance_loss_clip": 1.03998423, + "balance_loss_mlp": 1.01982379, + "epoch": 0.7978956861566211, + "flos": 24728993068320.0, + "grad_norm": 1.9408865894179261, + "language_loss": 0.59610444, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.61753225, + "num_input_tokens_seen": 286305590, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11102295, + "step": 13271, + "time_per_iteration": 2.6402840614318848 + }, + { + "auxiliary_loss_clip": 0.01115409, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.03959322, + "balance_loss_mlp": 1.01796436, + "epoch": 0.797955809409289, + "flos": 34303238583840.0, + "grad_norm": 1.5882580828730164, + "language_loss": 0.73247147, + "learning_rate": 4.130463840939975e-07, + "loss": 0.75391662, + "num_input_tokens_seen": 286328050, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11138916, + "step": 13272, + "time_per_iteration": 2.7010035514831543 + }, + { + "auxiliary_loss_clip": 0.01112526, + "auxiliary_loss_mlp": 0.01028088, + "balance_loss_clip": 1.04062676, + "balance_loss_mlp": 1.01676297, + "epoch": 0.798015932661957, + "flos": 18985474897920.0, + "grad_norm": 2.3053317616145974, + "language_loss": 0.71323466, + "learning_rate": 4.128093876144161e-07, + "loss": 0.73464084, + "num_input_tokens_seen": 286345265, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11322021, + "step": 13273, + "time_per_iteration": 2.637030601501465 + }, + { + "auxiliary_loss_clip": 0.011155, + "auxiliary_loss_mlp": 0.01031376, + "balance_loss_clip": 1.03964853, + "balance_loss_mlp": 1.02008116, + "epoch": 0.7980760559146249, + "flos": 29225567171520.0, + "grad_norm": 2.1908388721415006, + "language_loss": 0.75783914, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.77930784, + "num_input_tokens_seen": 286364465, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.112854, + "step": 13274, + "time_per_iteration": 4.23547887802124 + }, + { + "auxiliary_loss_clip": 0.01106836, + "auxiliary_loss_mlp": 0.01027924, + "balance_loss_clip": 1.03837299, + "balance_loss_mlp": 1.01736248, + "epoch": 0.798136179167293, + "flos": 34212654646560.0, + "grad_norm": 1.4379362379437055, + "language_loss": 0.77666557, + "learning_rate": 4.12335575223518e-07, + "loss": 0.79801315, + "num_input_tokens_seen": 286385565, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.10552979, + "step": 13275, + "time_per_iteration": 2.69378399848938 + }, + { + "auxiliary_loss_clip": 0.01113902, + "auxiliary_loss_mlp": 0.0103354, + "balance_loss_clip": 1.03881395, + "balance_loss_mlp": 1.02123761, + "epoch": 0.7981963024199609, + "flos": 43910215711200.0, + "grad_norm": 2.152837515083931, + "language_loss": 0.64549452, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.66696894, + "num_input_tokens_seen": 286403950, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12316895, + "step": 13276, + "time_per_iteration": 2.7771289348602295 + }, + { + "auxiliary_loss_clip": 0.01108173, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.03750956, + "balance_loss_mlp": 1.01934266, + "epoch": 0.7982564256726289, + "flos": 31585240608480.0, + "grad_norm": 1.6921998978524018, + "language_loss": 0.60588193, + "learning_rate": 4.118620036501945e-07, + "loss": 0.62726498, + "num_input_tokens_seen": 286426160, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10797119, + "step": 13277, + "time_per_iteration": 2.645219564437866 + }, + { + "auxiliary_loss_clip": 0.01118057, + "auxiliary_loss_mlp": 0.01029647, + "balance_loss_clip": 1.04257989, + "balance_loss_mlp": 1.01837564, + "epoch": 0.7983165489252969, + "flos": 31407354633600.0, + "grad_norm": 2.1636345897589515, + "language_loss": 0.79227602, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.81375301, + "num_input_tokens_seen": 286446610, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.112854, + "step": 13278, + "time_per_iteration": 2.687290668487549 + }, + { + "auxiliary_loss_clip": 0.01114605, + "auxiliary_loss_mlp": 0.01035819, + "balance_loss_clip": 1.03943193, + "balance_loss_mlp": 1.02350509, + "epoch": 0.7983766721779648, + "flos": 26732529900000.0, + "grad_norm": 2.064340141823918, + "language_loss": 0.63544589, + "learning_rate": 4.113886729662768e-07, + "loss": 0.65695018, + "num_input_tokens_seen": 286465460, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12304688, + "step": 13279, + "time_per_iteration": 4.018103837966919 + }, + { + "auxiliary_loss_clip": 0.01105209, + "auxiliary_loss_mlp": 0.01025873, + "balance_loss_clip": 1.03680563, + "balance_loss_mlp": 1.01563287, + "epoch": 0.7984367954306328, + "flos": 35810792210880.0, + "grad_norm": 1.7050790581681599, + "language_loss": 0.71342146, + "learning_rate": 4.111520979802825e-07, + "loss": 0.73473227, + "num_input_tokens_seen": 286485720, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.10241699, + "step": 13280, + "time_per_iteration": 2.667340040206909 + }, + { + "auxiliary_loss_clip": 0.01115976, + "auxiliary_loss_mlp": 0.01029998, + "balance_loss_clip": 1.04071891, + "balance_loss_mlp": 1.01805878, + "epoch": 0.7984969186833007, + "flos": 38263642691040.0, + "grad_norm": 2.2810657003331882, + "language_loss": 0.63249314, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.6539529, + "num_input_tokens_seen": 286507465, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.1194458, + "step": 13281, + "time_per_iteration": 2.7422616481781006 + }, + { + "auxiliary_loss_clip": 0.01114524, + "auxiliary_loss_mlp": 0.01032565, + "balance_loss_clip": 1.03800559, + "balance_loss_mlp": 1.02093053, + "epoch": 0.7985570419359688, + "flos": 29667263571360.0, + "grad_norm": 3.2211734062078126, + "language_loss": 0.80324376, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.82471466, + "num_input_tokens_seen": 286526345, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.11639404, + "step": 13282, + "time_per_iteration": 2.6967318058013916 + }, + { + "auxiliary_loss_clip": 0.01113251, + "auxiliary_loss_mlp": 0.01026913, + "balance_loss_clip": 1.03881001, + "balance_loss_mlp": 1.01551044, + "epoch": 0.7986171651886367, + "flos": 19208328703200.0, + "grad_norm": 1.8525643012113904, + "language_loss": 0.71415102, + "learning_rate": 4.10442734553802e-07, + "loss": 0.73555267, + "num_input_tokens_seen": 286544095, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11407471, + "step": 13283, + "time_per_iteration": 2.649167537689209 + }, + { + "auxiliary_loss_clip": 0.01109913, + "auxiliary_loss_mlp": 0.01027875, + "balance_loss_clip": 1.03748679, + "balance_loss_mlp": 1.01769412, + "epoch": 0.7986772884413047, + "flos": 14177326847040.0, + "grad_norm": 2.1048139628122207, + "language_loss": 0.73699486, + "learning_rate": 4.102064006186967e-07, + "loss": 0.75837266, + "num_input_tokens_seen": 286560960, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10174561, + "step": 13284, + "time_per_iteration": 2.606121778488159 + }, + { + "auxiliary_loss_clip": 0.01111825, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.03990102, + "balance_loss_mlp": 1.02132845, + "epoch": 0.7987374116939726, + "flos": 26955626808960.0, + "grad_norm": 1.7003065087679357, + "language_loss": 0.70652276, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.72795439, + "num_input_tokens_seen": 286579865, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10003662, + "step": 13285, + "time_per_iteration": 4.147035598754883 + }, + { + "auxiliary_loss_clip": 0.0111048, + "auxiliary_loss_mlp": 0.01028812, + "balance_loss_clip": 1.03800845, + "balance_loss_mlp": 1.01813066, + "epoch": 0.7987975349466406, + "flos": 21830434977600.0, + "grad_norm": 1.7526232458142956, + "language_loss": 0.73726416, + "learning_rate": 4.097339136128437e-07, + "loss": 0.7586571, + "num_input_tokens_seen": 286597295, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10687256, + "step": 13286, + "time_per_iteration": 2.7357680797576904 + }, + { + "auxiliary_loss_clip": 0.01112348, + "auxiliary_loss_mlp": 0.0103089, + "balance_loss_clip": 1.03908408, + "balance_loss_mlp": 1.01948142, + "epoch": 0.7988576581993085, + "flos": 24061363549920.0, + "grad_norm": 2.962854057688193, + "language_loss": 0.75091141, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.77234375, + "num_input_tokens_seen": 286616270, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11419678, + "step": 13287, + "time_per_iteration": 3.871556043624878 + }, + { + "auxiliary_loss_clip": 0.01110662, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.03898513, + "balance_loss_mlp": 1.01955032, + "epoch": 0.7989177814519766, + "flos": 34210709817120.0, + "grad_norm": 1.5602387150087746, + "language_loss": 0.61539817, + "learning_rate": 4.092616678191863e-07, + "loss": 0.63680971, + "num_input_tokens_seen": 286638315, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.109375, + "step": 13288, + "time_per_iteration": 2.709104299545288 + }, + { + "auxiliary_loss_clip": 0.01111766, + "auxiliary_loss_mlp": 0.01032701, + "balance_loss_clip": 1.04064262, + "balance_loss_mlp": 1.02196026, + "epoch": 0.7989779047046445, + "flos": 35229654384480.0, + "grad_norm": 2.6790934404269464, + "language_loss": 0.69825101, + "learning_rate": 4.090256353993169e-07, + "loss": 0.71969569, + "num_input_tokens_seen": 286658630, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10754395, + "step": 13289, + "time_per_iteration": 2.7106142044067383 + }, + { + "auxiliary_loss_clip": 0.01110552, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.03941262, + "balance_loss_mlp": 1.01939929, + "epoch": 0.7990380279573125, + "flos": 22191474173760.0, + "grad_norm": 3.9176463556303505, + "language_loss": 0.62570596, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.64712155, + "num_input_tokens_seen": 286676870, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11602783, + "step": 13290, + "time_per_iteration": 2.60402774810791 + }, + { + "auxiliary_loss_clip": 0.01113705, + "auxiliary_loss_mlp": 0.01028739, + "balance_loss_clip": 1.04008842, + "balance_loss_mlp": 1.01697874, + "epoch": 0.7990981512099805, + "flos": 25478373756960.0, + "grad_norm": 2.7590334365401814, + "language_loss": 0.71394324, + "learning_rate": 4.08553751558248e-07, + "loss": 0.73536766, + "num_input_tokens_seen": 286694300, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11761475, + "step": 13291, + "time_per_iteration": 2.6956751346588135 + }, + { + "auxiliary_loss_clip": 0.01109495, + "auxiliary_loss_mlp": 0.01030694, + "balance_loss_clip": 1.0379976, + "balance_loss_mlp": 1.02033496, + "epoch": 0.7991582744626484, + "flos": 31847957066880.0, + "grad_norm": 1.5412785329498315, + "language_loss": 0.63757694, + "learning_rate": 4.083179001549422e-07, + "loss": 0.65897882, + "num_input_tokens_seen": 286714545, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10357666, + "step": 13292, + "time_per_iteration": 2.6529970169067383 + }, + { + "auxiliary_loss_clip": 0.01109711, + "auxiliary_loss_mlp": 0.01034324, + "balance_loss_clip": 1.03756547, + "balance_loss_mlp": 1.02345252, + "epoch": 0.7992183977153164, + "flos": 43069400222400.0, + "grad_norm": 2.2570740120066177, + "language_loss": 0.56181794, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.58325827, + "num_input_tokens_seen": 286734525, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10876465, + "step": 13293, + "time_per_iteration": 2.811264991760254 + }, + { + "auxiliary_loss_clip": 0.01113421, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.0410105, + "balance_loss_mlp": 1.02298832, + "epoch": 0.7992785209679844, + "flos": 63274875196320.0, + "grad_norm": 2.824226595738507, + "language_loss": 0.7116726, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.73314852, + "num_input_tokens_seen": 286753430, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11175537, + "step": 13294, + "time_per_iteration": 2.9044477939605713 + }, + { + "auxiliary_loss_clip": 0.01113979, + "auxiliary_loss_mlp": 0.01033329, + "balance_loss_clip": 1.04095769, + "balance_loss_mlp": 1.02180147, + "epoch": 0.7993386442206524, + "flos": 27534131012160.0, + "grad_norm": 1.9689966920874056, + "language_loss": 0.71908665, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.7405597, + "num_input_tokens_seen": 286771915, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11523438, + "step": 13295, + "time_per_iteration": 2.690488338470459 + }, + { + "auxiliary_loss_clip": 0.01109541, + "auxiliary_loss_mlp": 0.01033721, + "balance_loss_clip": 1.03916216, + "balance_loss_mlp": 1.02322435, + "epoch": 0.7993987674733203, + "flos": 22940611758720.0, + "grad_norm": 2.6937238521578606, + "language_loss": 0.76407242, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.78550494, + "num_input_tokens_seen": 286789835, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.1048584, + "step": 13296, + "time_per_iteration": 2.6557462215423584 + }, + { + "auxiliary_loss_clip": 0.01030753, + "auxiliary_loss_mlp": 0.01001989, + "balance_loss_clip": 1.0083015, + "balance_loss_mlp": 1.00097775, + "epoch": 0.7994588907259883, + "flos": 84707980195680.0, + "grad_norm": 0.6901811952525552, + "language_loss": 0.60741025, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.62773764, + "num_input_tokens_seen": 286855580, + "router_z_loss_clip": 0.22473145, + "router_z_loss_mlp": 0.01010895, + "step": 13297, + "time_per_iteration": 3.381823778152466 + }, + { + "auxiliary_loss_clip": 0.01111453, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.039675, + "balance_loss_mlp": 1.01856077, + "epoch": 0.7995190139786562, + "flos": 16448158589760.0, + "grad_norm": 1.8652745759759035, + "language_loss": 0.7045089, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.72591543, + "num_input_tokens_seen": 286874360, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10638428, + "step": 13298, + "time_per_iteration": 2.67891263961792 + }, + { + "auxiliary_loss_clip": 0.01115458, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.03947735, + "balance_loss_mlp": 1.02189767, + "epoch": 0.7995791372313242, + "flos": 26420915435040.0, + "grad_norm": 2.183086772661862, + "language_loss": 0.75749063, + "learning_rate": 4.066686308212037e-07, + "loss": 0.77899063, + "num_input_tokens_seen": 286891950, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12640381, + "step": 13299, + "time_per_iteration": 2.6221113204956055 + }, + { + "auxiliary_loss_clip": 0.01107713, + "auxiliary_loss_mlp": 0.0102988, + "balance_loss_clip": 1.03773403, + "balance_loss_mlp": 1.01941967, + "epoch": 0.7996392604839921, + "flos": 31808742690240.0, + "grad_norm": 1.749996795676033, + "language_loss": 0.77714628, + "learning_rate": 4.064332625220828e-07, + "loss": 0.79852223, + "num_input_tokens_seen": 286911725, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10461426, + "step": 13300, + "time_per_iteration": 2.773578405380249 + }, + { + "auxiliary_loss_clip": 0.01113168, + "auxiliary_loss_mlp": 0.01029363, + "balance_loss_clip": 1.03904366, + "balance_loss_mlp": 1.0179131, + "epoch": 0.7996993837366602, + "flos": 30025790696160.0, + "grad_norm": 1.8403091949485038, + "language_loss": 0.63810778, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.65953302, + "num_input_tokens_seen": 286931400, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11444092, + "step": 13301, + "time_per_iteration": 2.6580452919006348 + }, + { + "auxiliary_loss_clip": 0.01109153, + "auxiliary_loss_mlp": 0.01036989, + "balance_loss_clip": 1.03890681, + "balance_loss_mlp": 1.02557492, + "epoch": 0.7997595069893281, + "flos": 25614938456640.0, + "grad_norm": 1.627890333510935, + "language_loss": 0.7178576, + "learning_rate": 4.059627072173928e-07, + "loss": 0.73931903, + "num_input_tokens_seen": 286949795, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11413574, + "step": 13302, + "time_per_iteration": 2.697161912918091 + }, + { + "auxiliary_loss_clip": 0.01112716, + "auxiliary_loss_mlp": 0.01028959, + "balance_loss_clip": 1.03816068, + "balance_loss_mlp": 1.0171988, + "epoch": 0.7998196302419961, + "flos": 29805489479520.0, + "grad_norm": 2.3839652280944654, + "language_loss": 0.83535445, + "learning_rate": 4.057275202296684e-07, + "loss": 0.85677123, + "num_input_tokens_seen": 286968805, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11755371, + "step": 13303, + "time_per_iteration": 2.6342337131500244 + }, + { + "auxiliary_loss_clip": 0.01108519, + "auxiliary_loss_mlp": 0.01031955, + "balance_loss_clip": 1.03827453, + "balance_loss_mlp": 1.02141118, + "epoch": 0.7998797534946641, + "flos": 36929599172640.0, + "grad_norm": 2.0068325291463602, + "language_loss": 0.58703494, + "learning_rate": 4.054923936969166e-07, + "loss": 0.60843968, + "num_input_tokens_seen": 286990235, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10540771, + "step": 13304, + "time_per_iteration": 2.779822587966919 + }, + { + "auxiliary_loss_clip": 0.01114009, + "auxiliary_loss_mlp": 0.01029602, + "balance_loss_clip": 1.03828311, + "balance_loss_mlp": 1.0181576, + "epoch": 0.799939876747332, + "flos": 28691990281440.0, + "grad_norm": 1.983056407095836, + "language_loss": 0.69177932, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.71321547, + "num_input_tokens_seen": 287011060, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11456299, + "step": 13305, + "time_per_iteration": 2.6259803771972656 + }, + { + "auxiliary_loss_clip": 0.01108135, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.03748798, + "balance_loss_mlp": 1.01959467, + "epoch": 0.8, + "flos": 23571863110080.0, + "grad_norm": 2.8401387338773523, + "language_loss": 0.69204646, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.71342754, + "num_input_tokens_seen": 287029215, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10375977, + "step": 13306, + "time_per_iteration": 2.664202928543091 + }, + { + "auxiliary_loss_clip": 0.01113789, + "auxiliary_loss_mlp": 0.0103503, + "balance_loss_clip": 1.04055715, + "balance_loss_mlp": 1.02406824, + "epoch": 0.800060123252668, + "flos": 39550935618720.0, + "grad_norm": 1.5520151205402748, + "language_loss": 0.69600058, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.71748877, + "num_input_tokens_seen": 287050855, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.10961914, + "step": 13307, + "time_per_iteration": 2.7122104167938232 + }, + { + "auxiliary_loss_clip": 0.01112177, + "auxiliary_loss_mlp": 0.01033995, + "balance_loss_clip": 1.03920662, + "balance_loss_mlp": 1.02327228, + "epoch": 0.800120246505336, + "flos": 24416973430560.0, + "grad_norm": 1.8330673504226693, + "language_loss": 0.76701069, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.78847241, + "num_input_tokens_seen": 287069915, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1072998, + "step": 13308, + "time_per_iteration": 2.677262544631958 + }, + { + "auxiliary_loss_clip": 0.01115946, + "auxiliary_loss_mlp": 0.01030676, + "balance_loss_clip": 1.03922343, + "balance_loss_mlp": 1.01810527, + "epoch": 0.8001803697580039, + "flos": 38886628517280.0, + "grad_norm": 1.8520482653507981, + "language_loss": 0.79102111, + "learning_rate": 4.0431766816972e-07, + "loss": 0.81248742, + "num_input_tokens_seen": 287091450, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12573242, + "step": 13309, + "time_per_iteration": 2.7217938899993896 + }, + { + "auxiliary_loss_clip": 0.01030538, + "auxiliary_loss_mlp": 0.01001287, + "balance_loss_clip": 1.00812387, + "balance_loss_mlp": 1.00031543, + "epoch": 0.8002404930106719, + "flos": 77351008929120.0, + "grad_norm": 0.9535623617333778, + "language_loss": 0.64671701, + "learning_rate": 4.040829045539571e-07, + "loss": 0.66703534, + "num_input_tokens_seen": 287148365, + "router_z_loss_clip": 0.22424316, + "router_z_loss_mlp": 0.00970459, + "step": 13310, + "time_per_iteration": 3.2277894020080566 + }, + { + "auxiliary_loss_clip": 0.01112058, + "auxiliary_loss_mlp": 0.01034787, + "balance_loss_clip": 1.04024446, + "balance_loss_mlp": 1.02362943, + "epoch": 0.8003006162633398, + "flos": 33989841358560.0, + "grad_norm": 2.004365425598611, + "language_loss": 0.82974678, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.85121524, + "num_input_tokens_seen": 287168280, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11169434, + "step": 13311, + "time_per_iteration": 2.693875789642334 + }, + { + "auxiliary_loss_clip": 0.01111962, + "auxiliary_loss_mlp": 0.01032143, + "balance_loss_clip": 1.03954244, + "balance_loss_mlp": 1.02073479, + "epoch": 0.8003607395160078, + "flos": 22235955796800.0, + "grad_norm": 2.4392711078429943, + "language_loss": 0.66322476, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.6846658, + "num_input_tokens_seen": 287185980, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11413574, + "step": 13312, + "time_per_iteration": 2.631955862045288 + }, + { + "auxiliary_loss_clip": 0.01116433, + "auxiliary_loss_mlp": 0.01033236, + "balance_loss_clip": 1.0418843, + "balance_loss_mlp": 1.02116609, + "epoch": 0.8004208627686757, + "flos": 25486315143840.0, + "grad_norm": 2.0418113630174695, + "language_loss": 0.7553674, + "learning_rate": 4.033789768462843e-07, + "loss": 0.77686411, + "num_input_tokens_seen": 287203875, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12078857, + "step": 13313, + "time_per_iteration": 2.763876438140869 + }, + { + "auxiliary_loss_clip": 0.01110387, + "auxiliary_loss_mlp": 0.01028371, + "balance_loss_clip": 1.03759873, + "balance_loss_mlp": 1.01687324, + "epoch": 0.8004809860213438, + "flos": 32257529614080.0, + "grad_norm": 1.6638071762451245, + "language_loss": 0.75642788, + "learning_rate": 4.031444553532575e-07, + "loss": 0.77781546, + "num_input_tokens_seen": 287226445, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11505127, + "step": 13314, + "time_per_iteration": 2.6751198768615723 + }, + { + "auxiliary_loss_clip": 0.01030371, + "auxiliary_loss_mlp": 0.01001553, + "balance_loss_clip": 1.00788391, + "balance_loss_mlp": 1.00056815, + "epoch": 0.8005411092740117, + "flos": 77663960464320.0, + "grad_norm": 0.8031660746004642, + "language_loss": 0.53767413, + "learning_rate": 4.029099944131522e-07, + "loss": 0.55799341, + "num_input_tokens_seen": 287286240, + "router_z_loss_clip": 0.22473145, + "router_z_loss_mlp": 0.00984192, + "step": 13315, + "time_per_iteration": 3.211838722229004 + }, + { + "auxiliary_loss_clip": 0.01111133, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.03918064, + "balance_loss_mlp": 1.02071857, + "epoch": 0.8006012325266797, + "flos": 44096853418560.0, + "grad_norm": 1.7793099263283285, + "language_loss": 0.71685851, + "learning_rate": 4.026755940348603e-07, + "loss": 0.73829269, + "num_input_tokens_seen": 287310265, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11566162, + "step": 13316, + "time_per_iteration": 2.7348618507385254 + }, + { + "auxiliary_loss_clip": 0.01115109, + "auxiliary_loss_mlp": 0.01031106, + "balance_loss_clip": 1.03950167, + "balance_loss_mlp": 1.02002525, + "epoch": 0.8006613557793477, + "flos": 41291350819200.0, + "grad_norm": 1.9124339159770896, + "language_loss": 0.64582658, + "learning_rate": 4.024412542272706e-07, + "loss": 0.66728878, + "num_input_tokens_seen": 287331610, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11083984, + "step": 13317, + "time_per_iteration": 2.828464984893799 + }, + { + "auxiliary_loss_clip": 0.01030169, + "auxiliary_loss_mlp": 0.01001368, + "balance_loss_clip": 1.00767493, + "balance_loss_mlp": 1.00041795, + "epoch": 0.8007214790320156, + "flos": 82177268204160.0, + "grad_norm": 0.761292891598128, + "language_loss": 0.58912456, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.60943997, + "num_input_tokens_seen": 287394795, + "router_z_loss_clip": 0.22485352, + "router_z_loss_mlp": 0.00949097, + "step": 13318, + "time_per_iteration": 4.705078601837158 + }, + { + "auxiliary_loss_clip": 0.01108663, + "auxiliary_loss_mlp": 0.01023448, + "balance_loss_clip": 1.03711629, + "balance_loss_mlp": 1.01242661, + "epoch": 0.8007816022846836, + "flos": 28291331535840.0, + "grad_norm": 1.795980408577404, + "language_loss": 0.66483068, + "learning_rate": 4.019727563597366e-07, + "loss": 0.68615174, + "num_input_tokens_seen": 287414595, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11022949, + "step": 13319, + "time_per_iteration": 2.684244155883789 + }, + { + "auxiliary_loss_clip": 0.01112638, + "auxiliary_loss_mlp": 0.01034938, + "balance_loss_clip": 1.03746438, + "balance_loss_mlp": 1.02268934, + "epoch": 0.8008417255373516, + "flos": 26821817284320.0, + "grad_norm": 8.339466771827952, + "language_loss": 0.74059737, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.76207316, + "num_input_tokens_seen": 287434395, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12255859, + "step": 13320, + "time_per_iteration": 2.67783522605896 + }, + { + "auxiliary_loss_clip": 0.01114442, + "auxiliary_loss_mlp": 0.01028148, + "balance_loss_clip": 1.03917933, + "balance_loss_mlp": 1.01673329, + "epoch": 0.8009018487900196, + "flos": 20410426491840.0, + "grad_norm": 2.9546942448564018, + "language_loss": 0.80025315, + "learning_rate": 4.015045008816138e-07, + "loss": 0.82167912, + "num_input_tokens_seen": 287450590, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11407471, + "step": 13321, + "time_per_iteration": 2.673483371734619 + }, + { + "auxiliary_loss_clip": 0.01104499, + "auxiliary_loss_mlp": 0.01026027, + "balance_loss_clip": 1.03432286, + "balance_loss_mlp": 1.01526201, + "epoch": 0.8009619720426875, + "flos": 25396825173120.0, + "grad_norm": 2.0818007225055566, + "language_loss": 0.66058052, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.68188572, + "num_input_tokens_seen": 287468455, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10754395, + "step": 13322, + "time_per_iteration": 2.616340398788452 + }, + { + "auxiliary_loss_clip": 0.01111634, + "auxiliary_loss_mlp": 0.01028166, + "balance_loss_clip": 1.03840947, + "balance_loss_mlp": 1.01706767, + "epoch": 0.8010220952953555, + "flos": 21879373501440.0, + "grad_norm": 1.8526859727365688, + "language_loss": 0.7781328, + "learning_rate": 4.010364878639265e-07, + "loss": 0.7995308, + "num_input_tokens_seen": 287486485, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11096191, + "step": 13323, + "time_per_iteration": 4.081521987915039 + }, + { + "auxiliary_loss_clip": 0.01113846, + "auxiliary_loss_mlp": 0.01027771, + "balance_loss_clip": 1.03865623, + "balance_loss_mlp": 1.01612973, + "epoch": 0.8010822185480234, + "flos": 29983334937120.0, + "grad_norm": 2.804295185821394, + "language_loss": 0.71948993, + "learning_rate": 4.00802572299932e-07, + "loss": 0.74090612, + "num_input_tokens_seen": 287503940, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11645508, + "step": 13324, + "time_per_iteration": 2.668950319290161 + }, + { + "auxiliary_loss_clip": 0.01111934, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.03717303, + "balance_loss_mlp": 1.01595306, + "epoch": 0.8011423418006914, + "flos": 26637570096480.0, + "grad_norm": 1.6713661321395854, + "language_loss": 0.76507562, + "learning_rate": 4.005687173776635e-07, + "loss": 0.78647244, + "num_input_tokens_seen": 287521660, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11785889, + "step": 13325, + "time_per_iteration": 3.9178519248962402 + }, + { + "auxiliary_loss_clip": 0.01104927, + "auxiliary_loss_mlp": 0.01027249, + "balance_loss_clip": 1.0366199, + "balance_loss_mlp": 1.01691341, + "epoch": 0.8012024650533593, + "flos": 29181571755840.0, + "grad_norm": 2.177614499298879, + "language_loss": 0.79842782, + "learning_rate": 4.003349231059898e-07, + "loss": 0.81974959, + "num_input_tokens_seen": 287541505, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.10333252, + "step": 13326, + "time_per_iteration": 2.7077114582061768 + }, + { + "auxiliary_loss_clip": 0.01108164, + "auxiliary_loss_mlp": 0.01029621, + "balance_loss_clip": 1.03817964, + "balance_loss_mlp": 1.01914239, + "epoch": 0.8012625883060274, + "flos": 28781156113920.0, + "grad_norm": 2.0815136081591716, + "language_loss": 0.6598537, + "learning_rate": 4.001011894937765e-07, + "loss": 0.68123162, + "num_input_tokens_seen": 287560015, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10479736, + "step": 13327, + "time_per_iteration": 2.9008703231811523 + }, + { + "auxiliary_loss_clip": 0.01107769, + "auxiliary_loss_mlp": 0.01032115, + "balance_loss_clip": 1.03812075, + "balance_loss_mlp": 1.02124333, + "epoch": 0.8013227115586953, + "flos": 25397189828640.0, + "grad_norm": 3.3362833060903307, + "language_loss": 0.73853451, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.75993329, + "num_input_tokens_seen": 287579150, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10876465, + "step": 13328, + "time_per_iteration": 2.657487630844116 + }, + { + "auxiliary_loss_clip": 0.01113562, + "auxiliary_loss_mlp": 0.01032405, + "balance_loss_clip": 1.03884673, + "balance_loss_mlp": 1.02029359, + "epoch": 0.8013828348113633, + "flos": 19386579333600.0, + "grad_norm": 1.8176239515592882, + "language_loss": 0.73800027, + "learning_rate": 3.996339042831798e-07, + "loss": 0.75945997, + "num_input_tokens_seen": 287597420, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12097168, + "step": 13329, + "time_per_iteration": 2.598778486251831 + }, + { + "auxiliary_loss_clip": 0.01030218, + "auxiliary_loss_mlp": 0.01001696, + "balance_loss_clip": 1.00777507, + "balance_loss_mlp": 1.00070977, + "epoch": 0.8014429580640313, + "flos": 86710828233600.0, + "grad_norm": 0.7011955058507359, + "language_loss": 0.52946341, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.54978251, + "num_input_tokens_seen": 287667280, + "router_z_loss_clip": 0.22436523, + "router_z_loss_mlp": 0.00985718, + "step": 13330, + "time_per_iteration": 3.3689897060394287 + }, + { + "auxiliary_loss_clip": 0.01115354, + "auxiliary_loss_mlp": 0.01037706, + "balance_loss_clip": 1.03919339, + "balance_loss_mlp": 1.02490306, + "epoch": 0.8015030813166992, + "flos": 28157927184000.0, + "grad_norm": 2.071750413151508, + "language_loss": 0.73115134, + "learning_rate": 3.991668618167519e-07, + "loss": 0.75268197, + "num_input_tokens_seen": 287687375, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12817383, + "step": 13331, + "time_per_iteration": 2.7153522968292236 + }, + { + "auxiliary_loss_clip": 0.0110953, + "auxiliary_loss_mlp": 0.0102712, + "balance_loss_clip": 1.03745699, + "balance_loss_mlp": 1.01714849, + "epoch": 0.8015632045693672, + "flos": 26688696553440.0, + "grad_norm": 2.4776300061571663, + "language_loss": 0.77100754, + "learning_rate": 3.989334316347401e-07, + "loss": 0.79237407, + "num_input_tokens_seen": 287707895, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.09973145, + "step": 13332, + "time_per_iteration": 2.6620023250579834 + }, + { + "auxiliary_loss_clip": 0.01111075, + "auxiliary_loss_mlp": 0.01027035, + "balance_loss_clip": 1.03802812, + "balance_loss_mlp": 1.01565635, + "epoch": 0.8016233278220352, + "flos": 28866472804800.0, + "grad_norm": 2.3890540566879364, + "language_loss": 0.83101726, + "learning_rate": 3.987000621653338e-07, + "loss": 0.8523984, + "num_input_tokens_seen": 287723990, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11364746, + "step": 13333, + "time_per_iteration": 2.698625326156616 + }, + { + "auxiliary_loss_clip": 0.01111307, + "auxiliary_loss_mlp": 0.01026712, + "balance_loss_clip": 1.03756595, + "balance_loss_mlp": 1.01529729, + "epoch": 0.8016834510747032, + "flos": 19831800736800.0, + "grad_norm": 1.6697225480772158, + "language_loss": 0.73437083, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.75575101, + "num_input_tokens_seen": 287742380, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11413574, + "step": 13334, + "time_per_iteration": 2.6823949813842773 + }, + { + "auxiliary_loss_clip": 0.01110557, + "auxiliary_loss_mlp": 0.01028118, + "balance_loss_clip": 1.03981376, + "balance_loss_mlp": 1.01684725, + "epoch": 0.8017435743273711, + "flos": 14978603820960.0, + "grad_norm": 2.0099833750524962, + "language_loss": 0.74556959, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.76695639, + "num_input_tokens_seen": 287760130, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11260986, + "step": 13335, + "time_per_iteration": 2.6616318225860596 + }, + { + "auxiliary_loss_clip": 0.0110797, + "auxiliary_loss_mlp": 0.01026406, + "balance_loss_clip": 1.03583825, + "balance_loss_mlp": 1.01456904, + "epoch": 0.8018036975800391, + "flos": 20982974654880.0, + "grad_norm": 1.9665049684127884, + "language_loss": 0.7553302, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.77667391, + "num_input_tokens_seen": 287777565, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11846924, + "step": 13336, + "time_per_iteration": 2.609161138534546 + }, + { + "auxiliary_loss_clip": 0.01118253, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.04149461, + "balance_loss_mlp": 1.02325511, + "epoch": 0.801863820832707, + "flos": 25174984299840.0, + "grad_norm": 2.301973206634782, + "language_loss": 0.7533409, + "learning_rate": 3.977671915907068e-07, + "loss": 0.77487427, + "num_input_tokens_seen": 287796310, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.11828613, + "step": 13337, + "time_per_iteration": 2.690403699874878 + }, + { + "auxiliary_loss_clip": 0.01116157, + "auxiliary_loss_mlp": 0.01036042, + "balance_loss_clip": 1.0406698, + "balance_loss_mlp": 1.02409101, + "epoch": 0.801923944085375, + "flos": 37150508148480.0, + "grad_norm": 2.1937895811721693, + "language_loss": 0.80094862, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.82247055, + "num_input_tokens_seen": 287817330, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11950684, + "step": 13338, + "time_per_iteration": 2.7403604984283447 + }, + { + "auxiliary_loss_clip": 0.01112627, + "auxiliary_loss_mlp": 0.01028604, + "balance_loss_clip": 1.03749204, + "balance_loss_mlp": 1.01684988, + "epoch": 0.801984067338043, + "flos": 24416932913280.0, + "grad_norm": 2.2229580083771077, + "language_loss": 0.74370152, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.76511383, + "num_input_tokens_seen": 287835095, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11767578, + "step": 13339, + "time_per_iteration": 2.6912972927093506 + }, + { + "auxiliary_loss_clip": 0.01109095, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.03872824, + "balance_loss_mlp": 1.01835287, + "epoch": 0.802044190590711, + "flos": 27801263854080.0, + "grad_norm": 1.6065790261937158, + "language_loss": 0.79135728, + "learning_rate": 3.970681765754775e-07, + "loss": 0.81274223, + "num_input_tokens_seen": 287854595, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.11053467, + "step": 13340, + "time_per_iteration": 2.6360666751861572 + }, + { + "auxiliary_loss_clip": 0.01111289, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.03812349, + "balance_loss_mlp": 1.02147388, + "epoch": 0.8021043138433789, + "flos": 33678226893600.0, + "grad_norm": 1.8443287243196411, + "language_loss": 0.67738044, + "learning_rate": 3.968352931252936e-07, + "loss": 0.69881833, + "num_input_tokens_seen": 287876960, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11035156, + "step": 13341, + "time_per_iteration": 2.7816929817199707 + }, + { + "auxiliary_loss_clip": 0.01030386, + "auxiliary_loss_mlp": 0.0100008, + "balance_loss_clip": 1.00797749, + "balance_loss_mlp": 0.99908125, + "epoch": 0.8021644370960469, + "flos": 75731437723680.0, + "grad_norm": 0.8099408175975875, + "language_loss": 0.61580312, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.6361078, + "num_input_tokens_seen": 287936530, + "router_z_loss_clip": 0.22399902, + "router_z_loss_mlp": 0.00997925, + "step": 13342, + "time_per_iteration": 3.189044237136841 + }, + { + "auxiliary_loss_clip": 0.01113707, + "auxiliary_loss_mlp": 0.01033366, + "balance_loss_clip": 1.04045081, + "balance_loss_mlp": 1.0214808, + "epoch": 0.8022245603487148, + "flos": 28506851713440.0, + "grad_norm": 2.1940437875305876, + "language_loss": 0.63691068, + "learning_rate": 3.963697086102522e-07, + "loss": 0.6583814, + "num_input_tokens_seen": 287954285, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11889648, + "step": 13343, + "time_per_iteration": 2.6218748092651367 + }, + { + "auxiliary_loss_clip": 0.01106464, + "auxiliary_loss_mlp": 0.01025258, + "balance_loss_clip": 1.03694296, + "balance_loss_mlp": 1.01432657, + "epoch": 0.8022846836013828, + "flos": 13242523969440.0, + "grad_norm": 2.0996619870531883, + "language_loss": 0.68680894, + "learning_rate": 3.96137007563051e-07, + "loss": 0.70812613, + "num_input_tokens_seen": 287971595, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10925293, + "step": 13344, + "time_per_iteration": 2.697038412094116 + }, + { + "auxiliary_loss_clip": 0.01113845, + "auxiliary_loss_mlp": 0.01024697, + "balance_loss_clip": 1.04042912, + "balance_loss_mlp": 1.01269817, + "epoch": 0.8023448068540509, + "flos": 35680021482240.0, + "grad_norm": 2.036605147522287, + "language_loss": 0.70069766, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.72208303, + "num_input_tokens_seen": 287992540, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11999512, + "step": 13345, + "time_per_iteration": 2.72705078125 + }, + { + "auxiliary_loss_clip": 0.01030132, + "auxiliary_loss_mlp": 0.01001328, + "balance_loss_clip": 1.00767684, + "balance_loss_mlp": 1.00034237, + "epoch": 0.8024049301067188, + "flos": 78280706629440.0, + "grad_norm": 0.8895938610495663, + "language_loss": 0.62943959, + "learning_rate": 3.956717879334059e-07, + "loss": 0.64975417, + "num_input_tokens_seen": 288052810, + "router_z_loss_clip": 0.2244873, + "router_z_loss_mlp": 0.00984955, + "step": 13346, + "time_per_iteration": 3.3398778438568115 + }, + { + "auxiliary_loss_clip": 0.01112273, + "auxiliary_loss_mlp": 0.01028922, + "balance_loss_clip": 1.04132962, + "balance_loss_mlp": 1.01763868, + "epoch": 0.8024650533593868, + "flos": 34880486751360.0, + "grad_norm": 1.7427315781030652, + "language_loss": 0.72920477, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.75061679, + "num_input_tokens_seen": 288073045, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11273193, + "step": 13347, + "time_per_iteration": 2.7052924633026123 + }, + { + "auxiliary_loss_clip": 0.01113971, + "auxiliary_loss_mlp": 0.01030822, + "balance_loss_clip": 1.0390892, + "balance_loss_mlp": 1.01899612, + "epoch": 0.8025251766120547, + "flos": 20721554749440.0, + "grad_norm": 2.174907168071665, + "language_loss": 0.73162049, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.75306845, + "num_input_tokens_seen": 288091165, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11834717, + "step": 13348, + "time_per_iteration": 2.6384527683258057 + }, + { + "auxiliary_loss_clip": 0.01113719, + "auxiliary_loss_mlp": 0.01027573, + "balance_loss_clip": 1.0406816, + "balance_loss_mlp": 1.01586092, + "epoch": 0.8025852998647227, + "flos": 27044265916800.0, + "grad_norm": 1.8393308759982723, + "language_loss": 0.75791556, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.77932847, + "num_input_tokens_seen": 288110595, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11706543, + "step": 13349, + "time_per_iteration": 2.6558852195739746 + }, + { + "auxiliary_loss_clip": 0.01112992, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.04048324, + "balance_loss_mlp": 1.02233076, + "epoch": 0.8026454231173906, + "flos": 26865367009920.0, + "grad_norm": 2.0474152806806503, + "language_loss": 0.83440375, + "learning_rate": 3.947420787800755e-07, + "loss": 0.85586214, + "num_input_tokens_seen": 288128995, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10516357, + "step": 13350, + "time_per_iteration": 2.6455352306365967 + }, + { + "auxiliary_loss_clip": 0.01115421, + "auxiliary_loss_mlp": 0.01035047, + "balance_loss_clip": 1.04186845, + "balance_loss_mlp": 1.02372217, + "epoch": 0.8027055463700586, + "flos": 27444154834080.0, + "grad_norm": 1.7095503440731317, + "language_loss": 0.71193916, + "learning_rate": 3.945098036485679e-07, + "loss": 0.7334438, + "num_input_tokens_seen": 288149265, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11322021, + "step": 13351, + "time_per_iteration": 2.7319514751434326 + }, + { + "auxiliary_loss_clip": 0.01112392, + "auxiliary_loss_mlp": 0.01024229, + "balance_loss_clip": 1.04097915, + "balance_loss_mlp": 1.01300514, + "epoch": 0.8027656696227266, + "flos": 35280254116800.0, + "grad_norm": 3.9621501387950544, + "language_loss": 0.61664331, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.63800949, + "num_input_tokens_seen": 288170745, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11230469, + "step": 13352, + "time_per_iteration": 4.1285927295684814 + }, + { + "auxiliary_loss_clip": 0.01113071, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.04064155, + "balance_loss_mlp": 1.02447724, + "epoch": 0.8028257928753946, + "flos": 22682514270240.0, + "grad_norm": 1.8403369996815762, + "language_loss": 0.76710665, + "learning_rate": 3.940454360354046e-07, + "loss": 0.78859556, + "num_input_tokens_seen": 288189415, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11346436, + "step": 13353, + "time_per_iteration": 2.6855058670043945 + }, + { + "auxiliary_loss_clip": 0.01120406, + "auxiliary_loss_mlp": 0.01029107, + "balance_loss_clip": 1.04053235, + "balance_loss_mlp": 1.01658976, + "epoch": 0.8028859161280625, + "flos": 23342850678240.0, + "grad_norm": 8.910089240260245, + "language_loss": 0.73324704, + "learning_rate": 3.938133435713582e-07, + "loss": 0.75474215, + "num_input_tokens_seen": 288206900, + "router_z_loss_clip": 0.79931641, + "router_z_loss_mlp": 0.12518311, + "step": 13354, + "time_per_iteration": 2.596222162246704 + }, + { + "auxiliary_loss_clip": 0.01113535, + "auxiliary_loss_mlp": 0.01031282, + "balance_loss_clip": 1.03860378, + "balance_loss_mlp": 1.0198319, + "epoch": 0.8029460393807305, + "flos": 24684511445280.0, + "grad_norm": 2.009560747167686, + "language_loss": 0.65700698, + "learning_rate": 3.935813120140714e-07, + "loss": 0.67845511, + "num_input_tokens_seen": 288224800, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11450195, + "step": 13355, + "time_per_iteration": 2.7033050060272217 + }, + { + "auxiliary_loss_clip": 0.0111644, + "auxiliary_loss_mlp": 0.01033145, + "balance_loss_clip": 1.03973365, + "balance_loss_mlp": 1.02086651, + "epoch": 0.8030061626333984, + "flos": 60747890794560.0, + "grad_norm": 2.102130332835948, + "language_loss": 0.68834281, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.70983863, + "num_input_tokens_seen": 288249400, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.1227417, + "step": 13356, + "time_per_iteration": 2.9052751064300537 + }, + { + "auxiliary_loss_clip": 0.01112485, + "auxiliary_loss_mlp": 0.01029351, + "balance_loss_clip": 1.0403651, + "balance_loss_mlp": 1.01785934, + "epoch": 0.8030662858860664, + "flos": 26375137259040.0, + "grad_norm": 1.6690186150077104, + "language_loss": 0.77511239, + "learning_rate": 3.931174316549666e-07, + "loss": 0.79653072, + "num_input_tokens_seen": 288268780, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11486816, + "step": 13357, + "time_per_iteration": 2.6288044452667236 + }, + { + "auxiliary_loss_clip": 0.01113318, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.03685641, + "balance_loss_mlp": 1.01753759, + "epoch": 0.8031264091387345, + "flos": 31274193385440.0, + "grad_norm": 1.523744748877044, + "language_loss": 0.77085972, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.79229361, + "num_input_tokens_seen": 288290830, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12548828, + "step": 13358, + "time_per_iteration": 4.1981096267700195 + }, + { + "auxiliary_loss_clip": 0.01110516, + "auxiliary_loss_mlp": 0.01027843, + "balance_loss_clip": 1.03857064, + "balance_loss_mlp": 1.01672125, + "epoch": 0.8031865323914024, + "flos": 23972278752000.0, + "grad_norm": 2.9366716837923312, + "language_loss": 0.84736121, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.86874473, + "num_input_tokens_seen": 288308865, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11102295, + "step": 13359, + "time_per_iteration": 2.626589059829712 + }, + { + "auxiliary_loss_clip": 0.01111294, + "auxiliary_loss_mlp": 0.01029804, + "balance_loss_clip": 1.04014432, + "balance_loss_mlp": 1.01933742, + "epoch": 0.8032466556440704, + "flos": 31932747033120.0, + "grad_norm": 2.132522514579651, + "language_loss": 0.7368716, + "learning_rate": 3.924220681368928e-07, + "loss": 0.75828254, + "num_input_tokens_seen": 288327325, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10461426, + "step": 13360, + "time_per_iteration": 2.6773154735565186 + }, + { + "auxiliary_loss_clip": 0.01111921, + "auxiliary_loss_mlp": 0.01027211, + "balance_loss_clip": 1.03805399, + "balance_loss_mlp": 1.0159874, + "epoch": 0.8033067788967383, + "flos": 31139978688000.0, + "grad_norm": 2.106818415354404, + "language_loss": 0.69533646, + "learning_rate": 3.921904022048512e-07, + "loss": 0.71672773, + "num_input_tokens_seen": 288347285, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11212158, + "step": 13361, + "time_per_iteration": 2.6723523139953613 + }, + { + "auxiliary_loss_clip": 0.01117547, + "auxiliary_loss_mlp": 0.01035322, + "balance_loss_clip": 1.04130101, + "balance_loss_mlp": 1.02328205, + "epoch": 0.8033669021494063, + "flos": 29314733004000.0, + "grad_norm": 2.374139361724282, + "language_loss": 0.70518917, + "learning_rate": 3.919587972411098e-07, + "loss": 0.72671789, + "num_input_tokens_seen": 288367785, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12054443, + "step": 13362, + "time_per_iteration": 2.708184003829956 + }, + { + "auxiliary_loss_clip": 0.01120248, + "auxiliary_loss_mlp": 0.01036944, + "balance_loss_clip": 1.04089916, + "balance_loss_mlp": 1.02381945, + "epoch": 0.8034270254020742, + "flos": 16579780181280.0, + "grad_norm": 3.3792780955799424, + "language_loss": 0.78640795, + "learning_rate": 3.91727253254452e-07, + "loss": 0.80797988, + "num_input_tokens_seen": 288384135, + "router_z_loss_clip": 0.79199219, + "router_z_loss_mlp": 0.13128662, + "step": 13363, + "time_per_iteration": 4.172184228897095 + }, + { + "auxiliary_loss_clip": 0.01111827, + "auxiliary_loss_mlp": 0.0102845, + "balance_loss_clip": 1.03729951, + "balance_loss_mlp": 1.01655316, + "epoch": 0.8034871486547422, + "flos": 33449943772800.0, + "grad_norm": 1.8167496715322895, + "language_loss": 0.75018394, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.77158678, + "num_input_tokens_seen": 288403805, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11901855, + "step": 13364, + "time_per_iteration": 2.6416187286376953 + }, + { + "auxiliary_loss_clip": 0.01115221, + "auxiliary_loss_mlp": 0.01028701, + "balance_loss_clip": 1.04227948, + "balance_loss_mlp": 1.01766205, + "epoch": 0.8035472719074102, + "flos": 39777760117440.0, + "grad_norm": 1.9540345929522698, + "language_loss": 0.60756934, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.62900853, + "num_input_tokens_seen": 288424895, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11047363, + "step": 13365, + "time_per_iteration": 4.184035062789917 + }, + { + "auxiliary_loss_clip": 0.01114319, + "auxiliary_loss_mlp": 0.01034072, + "balance_loss_clip": 1.03955626, + "balance_loss_mlp": 1.02243769, + "epoch": 0.8036073951600782, + "flos": 25975694031840.0, + "grad_norm": 2.4627769455659574, + "language_loss": 0.66037524, + "learning_rate": 3.910329872447706e-07, + "loss": 0.68185914, + "num_input_tokens_seen": 288443865, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11639404, + "step": 13366, + "time_per_iteration": 2.6822891235351562 + }, + { + "auxiliary_loss_clip": 0.01110288, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.03854656, + "balance_loss_mlp": 1.02197707, + "epoch": 0.8036675184127461, + "flos": 22102348858560.0, + "grad_norm": 2.3660978241709256, + "language_loss": 0.75343001, + "learning_rate": 3.908016872542259e-07, + "loss": 0.77486312, + "num_input_tokens_seen": 288461065, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11047363, + "step": 13367, + "time_per_iteration": 2.5955581665039062 + }, + { + "auxiliary_loss_clip": 0.01110025, + "auxiliary_loss_mlp": 0.01028644, + "balance_loss_clip": 1.0378561, + "balance_loss_mlp": 1.01773047, + "epoch": 0.8037276416654141, + "flos": 31762680893280.0, + "grad_norm": 2.7386917309813232, + "language_loss": 0.73993659, + "learning_rate": 3.905704482846428e-07, + "loss": 0.76132327, + "num_input_tokens_seen": 288481865, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10906982, + "step": 13368, + "time_per_iteration": 2.6874656677246094 + }, + { + "auxiliary_loss_clip": 0.01113579, + "auxiliary_loss_mlp": 0.01032104, + "balance_loss_clip": 1.03847671, + "balance_loss_mlp": 1.02069521, + "epoch": 0.803787764918082, + "flos": 22942759174560.0, + "grad_norm": 2.0255657118493957, + "language_loss": 0.70222032, + "learning_rate": 3.90339270344789e-07, + "loss": 0.72367716, + "num_input_tokens_seen": 288499345, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11407471, + "step": 13369, + "time_per_iteration": 2.5910427570343018 + }, + { + "auxiliary_loss_clip": 0.01108856, + "auxiliary_loss_mlp": 0.0103431, + "balance_loss_clip": 1.03715611, + "balance_loss_mlp": 1.02352774, + "epoch": 0.80384788817075, + "flos": 24679892475360.0, + "grad_norm": 1.779832456611723, + "language_loss": 0.74270785, + "learning_rate": 3.901081534434312e-07, + "loss": 0.76413953, + "num_input_tokens_seen": 288517660, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10784912, + "step": 13370, + "time_per_iteration": 2.6638643741607666 + }, + { + "auxiliary_loss_clip": 0.01116044, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.03995001, + "balance_loss_mlp": 1.01946068, + "epoch": 0.8039080114234181, + "flos": 22592213953920.0, + "grad_norm": 6.769600907653239, + "language_loss": 0.87042338, + "learning_rate": 3.898770975893342e-07, + "loss": 0.89190227, + "num_input_tokens_seen": 288534180, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12384033, + "step": 13371, + "time_per_iteration": 2.5784153938293457 + }, + { + "auxiliary_loss_clip": 0.01115686, + "auxiliary_loss_mlp": 0.01031287, + "balance_loss_clip": 1.03862417, + "balance_loss_mlp": 1.01898444, + "epoch": 0.803968134676086, + "flos": 27311317724160.0, + "grad_norm": 2.2920309047059013, + "language_loss": 0.74986517, + "learning_rate": 3.89646102791259e-07, + "loss": 0.77133489, + "num_input_tokens_seen": 288553350, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12316895, + "step": 13372, + "time_per_iteration": 2.6906542778015137 + }, + { + "auxiliary_loss_clip": 0.01111005, + "auxiliary_loss_mlp": 0.01028369, + "balance_loss_clip": 1.03854704, + "balance_loss_mlp": 1.01593518, + "epoch": 0.804028257928754, + "flos": 29092932648000.0, + "grad_norm": 2.1774628898686292, + "language_loss": 0.79208928, + "learning_rate": 3.894151690579646e-07, + "loss": 0.813483, + "num_input_tokens_seen": 288571325, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12426758, + "step": 13373, + "time_per_iteration": 2.6647627353668213 + }, + { + "auxiliary_loss_clip": 0.01109371, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.03847265, + "balance_loss_mlp": 1.02212405, + "epoch": 0.8040883811814219, + "flos": 28736796042720.0, + "grad_norm": 1.8286257319440204, + "language_loss": 0.74633539, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.76775944, + "num_input_tokens_seen": 288592100, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10906982, + "step": 13374, + "time_per_iteration": 2.6553380489349365 + }, + { + "auxiliary_loss_clip": 0.0111482, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.03843427, + "balance_loss_mlp": 1.02161562, + "epoch": 0.8041485044340899, + "flos": 23215726504800.0, + "grad_norm": 2.23947519762091, + "language_loss": 0.68657416, + "learning_rate": 3.889534848207452e-07, + "loss": 0.70805907, + "num_input_tokens_seen": 288612305, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12054443, + "step": 13375, + "time_per_iteration": 2.7245633602142334 + }, + { + "auxiliary_loss_clip": 0.01030282, + "auxiliary_loss_mlp": 0.01000635, + "balance_loss_clip": 1.00795889, + "balance_loss_mlp": 0.99961478, + "epoch": 0.8042086276867578, + "flos": 85420537027200.0, + "grad_norm": 0.7757787314146689, + "language_loss": 0.55642557, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57673466, + "num_input_tokens_seen": 288676015, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.01019287, + "step": 13376, + "time_per_iteration": 3.335188627243042 + }, + { + "auxiliary_loss_clip": 0.0111487, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.03959894, + "balance_loss_mlp": 1.01750064, + "epoch": 0.8042687509394258, + "flos": 26688818105280.0, + "grad_norm": 1.599132910680999, + "language_loss": 0.73211074, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.75354999, + "num_input_tokens_seen": 288696455, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11560059, + "step": 13377, + "time_per_iteration": 2.689409017562866 + }, + { + "auxiliary_loss_clip": 0.01111505, + "auxiliary_loss_mlp": 0.01028977, + "balance_loss_clip": 1.03656626, + "balance_loss_mlp": 1.01740181, + "epoch": 0.8043288741920938, + "flos": 32475967035840.0, + "grad_norm": 1.8821756861331576, + "language_loss": 0.70222127, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.72362608, + "num_input_tokens_seen": 288715560, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11566162, + "step": 13378, + "time_per_iteration": 2.6878952980041504 + }, + { + "auxiliary_loss_clip": 0.01114925, + "auxiliary_loss_mlp": 0.01025559, + "balance_loss_clip": 1.04009366, + "balance_loss_mlp": 1.01404357, + "epoch": 0.8043889974447618, + "flos": 40759637724000.0, + "grad_norm": 1.5797738639647003, + "language_loss": 0.69064766, + "learning_rate": 3.880308495088347e-07, + "loss": 0.71205246, + "num_input_tokens_seen": 288739485, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11529541, + "step": 13379, + "time_per_iteration": 2.731653928756714 + }, + { + "auxiliary_loss_clip": 0.01118451, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.04175258, + "balance_loss_mlp": 1.01971233, + "epoch": 0.8044491206974297, + "flos": 24862762075680.0, + "grad_norm": 6.238460521473958, + "language_loss": 0.76495469, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.78646445, + "num_input_tokens_seen": 288757420, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12823486, + "step": 13380, + "time_per_iteration": 2.6244983673095703 + }, + { + "auxiliary_loss_clip": 0.0111055, + "auxiliary_loss_mlp": 0.01025844, + "balance_loss_clip": 1.03730679, + "balance_loss_mlp": 1.01479912, + "epoch": 0.8045092439500977, + "flos": 28557613514880.0, + "grad_norm": 2.290443300299115, + "language_loss": 0.69141728, + "learning_rate": 3.875698985740887e-07, + "loss": 0.71278119, + "num_input_tokens_seen": 288775535, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.1105957, + "step": 13381, + "time_per_iteration": 2.6631178855895996 + }, + { + "auxiliary_loss_clip": 0.01117268, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.04320669, + "balance_loss_mlp": 1.0222621, + "epoch": 0.8045693672027656, + "flos": 29403777284640.0, + "grad_norm": 1.9795907919745073, + "language_loss": 0.6372534, + "learning_rate": 3.873395148176135e-07, + "loss": 0.65876842, + "num_input_tokens_seen": 288795035, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11975098, + "step": 13382, + "time_per_iteration": 2.6727988719940186 + }, + { + "auxiliary_loss_clip": 0.01113651, + "auxiliary_loss_mlp": 0.01039471, + "balance_loss_clip": 1.04077458, + "balance_loss_mlp": 1.02882528, + "epoch": 0.8046294904554336, + "flos": 33809078656800.0, + "grad_norm": 2.776116079373296, + "language_loss": 0.76346171, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.78499293, + "num_input_tokens_seen": 288816270, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.10644531, + "step": 13383, + "time_per_iteration": 2.712117910385132 + }, + { + "auxiliary_loss_clip": 0.01113041, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.03986216, + "balance_loss_mlp": 1.02222419, + "epoch": 0.8046896137081017, + "flos": 30472430204160.0, + "grad_norm": 1.756405890226702, + "language_loss": 0.70061755, + "learning_rate": 3.868789307701381e-07, + "loss": 0.7220813, + "num_input_tokens_seen": 288836050, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11108398, + "step": 13384, + "time_per_iteration": 2.778035879135132 + }, + { + "auxiliary_loss_clip": 0.01114575, + "auxiliary_loss_mlp": 0.01034352, + "balance_loss_clip": 1.03778696, + "balance_loss_mlp": 1.02205527, + "epoch": 0.8047497369607696, + "flos": 21567475415520.0, + "grad_norm": 4.6672527133711785, + "language_loss": 0.79868484, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.82017416, + "num_input_tokens_seen": 288852900, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12304688, + "step": 13385, + "time_per_iteration": 2.580108165740967 + }, + { + "auxiliary_loss_clip": 0.01113826, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.03939676, + "balance_loss_mlp": 1.02140009, + "epoch": 0.8048098602134376, + "flos": 27310831516800.0, + "grad_norm": 1.6850333242135538, + "language_loss": 0.7219739, + "learning_rate": 3.864185914015108e-07, + "loss": 0.74344742, + "num_input_tokens_seen": 288872625, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12127686, + "step": 13386, + "time_per_iteration": 2.674826145172119 + }, + { + "auxiliary_loss_clip": 0.01030759, + "auxiliary_loss_mlp": 0.0100224, + "balance_loss_clip": 1.00835741, + "balance_loss_mlp": 1.00120592, + "epoch": 0.8048699834661055, + "flos": 86879921958720.0, + "grad_norm": 0.6695485747577822, + "language_loss": 0.51205426, + "learning_rate": 3.861885134935865e-07, + "loss": 0.53238422, + "num_input_tokens_seen": 288939180, + "router_z_loss_clip": 0.22412109, + "router_z_loss_mlp": 0.01034546, + "step": 13387, + "time_per_iteration": 3.2957983016967773 + }, + { + "auxiliary_loss_clip": 0.01112952, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.03825378, + "balance_loss_mlp": 1.01883864, + "epoch": 0.8049301067187735, + "flos": 28870524532800.0, + "grad_norm": 1.7595088509153565, + "language_loss": 0.73788726, + "learning_rate": 3.859584967815559e-07, + "loss": 0.75933468, + "num_input_tokens_seen": 288958925, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.1293335, + "step": 13388, + "time_per_iteration": 2.6872317790985107 + }, + { + "auxiliary_loss_clip": 0.01112771, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.04022741, + "balance_loss_mlp": 1.01872253, + "epoch": 0.8049902299714414, + "flos": 29805286893120.0, + "grad_norm": 1.4710997372416283, + "language_loss": 0.71426505, + "learning_rate": 3.857285412741411e-07, + "loss": 0.73569465, + "num_input_tokens_seen": 288980935, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11468506, + "step": 13389, + "time_per_iteration": 2.702246904373169 + }, + { + "auxiliary_loss_clip": 0.01114004, + "auxiliary_loss_mlp": 0.01033746, + "balance_loss_clip": 1.04130125, + "balance_loss_mlp": 1.0218544, + "epoch": 0.8050503532241094, + "flos": 21344378506560.0, + "grad_norm": 2.081968292299599, + "language_loss": 0.82562423, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.84710169, + "num_input_tokens_seen": 288996780, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11895752, + "step": 13390, + "time_per_iteration": 2.663569927215576 + }, + { + "auxiliary_loss_clip": 0.01030534, + "auxiliary_loss_mlp": 0.01002261, + "balance_loss_clip": 1.00810015, + "balance_loss_mlp": 1.00125766, + "epoch": 0.8051104764767774, + "flos": 70354428582240.0, + "grad_norm": 0.7789431734541448, + "language_loss": 0.55456054, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57488853, + "num_input_tokens_seen": 289057590, + "router_z_loss_clip": 0.2244873, + "router_z_loss_mlp": 0.01003265, + "step": 13391, + "time_per_iteration": 4.695050239562988 + }, + { + "auxiliary_loss_clip": 0.01112501, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.04021764, + "balance_loss_mlp": 1.01750481, + "epoch": 0.8051705997294454, + "flos": 21967728988320.0, + "grad_norm": 1.7171660819195331, + "language_loss": 0.84927487, + "learning_rate": 3.850390420667762e-07, + "loss": 0.87069452, + "num_input_tokens_seen": 289076285, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11962891, + "step": 13392, + "time_per_iteration": 2.634415626525879 + }, + { + "auxiliary_loss_clip": 0.01112517, + "auxiliary_loss_mlp": 0.01028629, + "balance_loss_clip": 1.03838396, + "balance_loss_mlp": 1.01731575, + "epoch": 0.8052307229821133, + "flos": 32210860057920.0, + "grad_norm": 1.436551996087408, + "language_loss": 0.70652664, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.72793818, + "num_input_tokens_seen": 289097585, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11309814, + "step": 13393, + "time_per_iteration": 2.702511787414551 + }, + { + "auxiliary_loss_clip": 0.01113227, + "auxiliary_loss_mlp": 0.01032262, + "balance_loss_clip": 1.03908086, + "balance_loss_mlp": 1.01994157, + "epoch": 0.8052908462347813, + "flos": 26548525815840.0, + "grad_norm": 2.2529101479704723, + "language_loss": 0.76410818, + "learning_rate": 3.84579682111414e-07, + "loss": 0.78556311, + "num_input_tokens_seen": 289116890, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12310791, + "step": 13394, + "time_per_iteration": 2.6670961380004883 + }, + { + "auxiliary_loss_clip": 0.0111621, + "auxiliary_loss_mlp": 0.01030979, + "balance_loss_clip": 1.04181743, + "balance_loss_mlp": 1.01975584, + "epoch": 0.8053509694874492, + "flos": 31045545609120.0, + "grad_norm": 1.7978842946467606, + "language_loss": 0.65008044, + "learning_rate": 3.843500940147304e-07, + "loss": 0.6715523, + "num_input_tokens_seen": 289136670, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11230469, + "step": 13395, + "time_per_iteration": 2.7441940307617188 + }, + { + "auxiliary_loss_clip": 0.01030577, + "auxiliary_loss_mlp": 0.01001825, + "balance_loss_clip": 1.00811994, + "balance_loss_mlp": 1.00082231, + "epoch": 0.8054110927401172, + "flos": 70365813937920.0, + "grad_norm": 0.7482941785709545, + "language_loss": 0.57331878, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.59364283, + "num_input_tokens_seen": 289200150, + "router_z_loss_clip": 0.22473145, + "router_z_loss_mlp": 0.01002502, + "step": 13396, + "time_per_iteration": 3.348595142364502 + }, + { + "auxiliary_loss_clip": 0.01114063, + "auxiliary_loss_mlp": 0.01038067, + "balance_loss_clip": 1.04062915, + "balance_loss_mlp": 1.02550852, + "epoch": 0.8054712159927853, + "flos": 23521222860480.0, + "grad_norm": 1.9113788940442658, + "language_loss": 0.77572179, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.79724312, + "num_input_tokens_seen": 289218125, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.12567139, + "step": 13397, + "time_per_iteration": 4.306863784790039 + }, + { + "auxiliary_loss_clip": 0.01114597, + "auxiliary_loss_mlp": 0.01026635, + "balance_loss_clip": 1.04105043, + "balance_loss_mlp": 1.01583457, + "epoch": 0.8055313392454532, + "flos": 21923936159040.0, + "grad_norm": 2.0145482637297265, + "language_loss": 0.70368725, + "learning_rate": 3.836616973531266e-07, + "loss": 0.72509956, + "num_input_tokens_seen": 289237115, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.10797119, + "step": 13398, + "time_per_iteration": 2.621312141418457 + }, + { + "auxiliary_loss_clip": 0.01111777, + "auxiliary_loss_mlp": 0.01029384, + "balance_loss_clip": 1.03826666, + "balance_loss_mlp": 1.01866066, + "epoch": 0.8055914624981212, + "flos": 16445930139360.0, + "grad_norm": 2.3250844439972607, + "language_loss": 0.68910342, + "learning_rate": 3.834323543710805e-07, + "loss": 0.71051502, + "num_input_tokens_seen": 289253635, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.10723877, + "step": 13399, + "time_per_iteration": 2.656684160232544 + }, + { + "auxiliary_loss_clip": 0.01114014, + "auxiliary_loss_mlp": 0.01034574, + "balance_loss_clip": 1.04083717, + "balance_loss_mlp": 1.02370191, + "epoch": 0.8056515857507891, + "flos": 16136341538400.0, + "grad_norm": 2.238305008295827, + "language_loss": 0.72614062, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.74762648, + "num_input_tokens_seen": 289270085, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.10870361, + "step": 13400, + "time_per_iteration": 2.6431570053100586 + }, + { + "auxiliary_loss_clip": 0.01109723, + "auxiliary_loss_mlp": 0.01032414, + "balance_loss_clip": 1.0368551, + "balance_loss_mlp": 1.02066028, + "epoch": 0.8057117090034571, + "flos": 29136482373600.0, + "grad_norm": 1.8724075628365606, + "language_loss": 0.63899308, + "learning_rate": 3.829738523169037e-07, + "loss": 0.66041452, + "num_input_tokens_seen": 289289645, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11755371, + "step": 13401, + "time_per_iteration": 2.70993971824646 + }, + { + "auxiliary_loss_clip": 0.01115666, + "auxiliary_loss_mlp": 0.01031081, + "balance_loss_clip": 1.04000819, + "balance_loss_mlp": 1.0195179, + "epoch": 0.805771832256125, + "flos": 25885839405600.0, + "grad_norm": 3.7954661450969307, + "language_loss": 0.84260678, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.86407423, + "num_input_tokens_seen": 289306630, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11566162, + "step": 13402, + "time_per_iteration": 4.111790418624878 + }, + { + "auxiliary_loss_clip": 0.01116179, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.04061162, + "balance_loss_mlp": 1.02041435, + "epoch": 0.805831955508793, + "flos": 21434800374720.0, + "grad_norm": 3.3829147589711277, + "language_loss": 0.67814553, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.69962513, + "num_input_tokens_seen": 289324960, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.1137085, + "step": 13403, + "time_per_iteration": 2.6424219608306885 + }, + { + "auxiliary_loss_clip": 0.01113363, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.04160893, + "balance_loss_mlp": 1.02404416, + "epoch": 0.805892078761461, + "flos": 32833035538560.0, + "grad_norm": 1.7597583222395914, + "language_loss": 0.84688026, + "learning_rate": 3.822865591408084e-07, + "loss": 0.86836118, + "num_input_tokens_seen": 289344980, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10687256, + "step": 13404, + "time_per_iteration": 4.011449813842773 + }, + { + "auxiliary_loss_clip": 0.01107371, + "auxiliary_loss_mlp": 0.01032322, + "balance_loss_clip": 1.0373683, + "balance_loss_mlp": 1.02128315, + "epoch": 0.805952202014129, + "flos": 38444810565600.0, + "grad_norm": 3.062759082066099, + "language_loss": 0.70327151, + "learning_rate": 3.820575840915743e-07, + "loss": 0.72466838, + "num_input_tokens_seen": 289367500, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.1104126, + "step": 13405, + "time_per_iteration": 2.7073943614959717 + }, + { + "auxiliary_loss_clip": 0.01112484, + "auxiliary_loss_mlp": 0.01025452, + "balance_loss_clip": 1.04049647, + "balance_loss_mlp": 1.01464558, + "epoch": 0.8060123252667969, + "flos": 29760886304640.0, + "grad_norm": 2.437849377058138, + "language_loss": 0.75458121, + "learning_rate": 3.818286703948788e-07, + "loss": 0.77596056, + "num_input_tokens_seen": 289385930, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10803223, + "step": 13406, + "time_per_iteration": 2.677105188369751 + }, + { + "auxiliary_loss_clip": 0.0111449, + "auxiliary_loss_mlp": 0.01034921, + "balance_loss_clip": 1.03988075, + "balance_loss_mlp": 1.02279735, + "epoch": 0.8060724485194649, + "flos": 28650223316160.0, + "grad_norm": 1.479279481007709, + "language_loss": 0.76271713, + "learning_rate": 3.815998180594018e-07, + "loss": 0.78421122, + "num_input_tokens_seen": 289408025, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12133789, + "step": 13407, + "time_per_iteration": 2.661022186279297 + }, + { + "auxiliary_loss_clip": 0.01111053, + "auxiliary_loss_mlp": 0.0103473, + "balance_loss_clip": 1.03760684, + "balance_loss_mlp": 1.0229764, + "epoch": 0.8061325717721328, + "flos": 22725496753920.0, + "grad_norm": 1.9269464839825814, + "language_loss": 0.73910928, + "learning_rate": 3.81371027093822e-07, + "loss": 0.76056713, + "num_input_tokens_seen": 289426575, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11767578, + "step": 13408, + "time_per_iteration": 2.629380702972412 + }, + { + "auxiliary_loss_clip": 0.01111401, + "auxiliary_loss_mlp": 0.01027197, + "balance_loss_clip": 1.03822136, + "balance_loss_mlp": 1.01519275, + "epoch": 0.8061926950248008, + "flos": 28776294040320.0, + "grad_norm": 2.1436218833261584, + "language_loss": 0.70378292, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.72516894, + "num_input_tokens_seen": 289447760, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.12011719, + "step": 13409, + "time_per_iteration": 2.6323087215423584 + }, + { + "auxiliary_loss_clip": 0.01112532, + "auxiliary_loss_mlp": 0.01027983, + "balance_loss_clip": 1.0381484, + "balance_loss_mlp": 1.01629424, + "epoch": 0.8062528182774689, + "flos": 13596553676160.0, + "grad_norm": 2.4517645950117073, + "language_loss": 0.77431345, + "learning_rate": 3.809136293070545e-07, + "loss": 0.79571855, + "num_input_tokens_seen": 289463920, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11682129, + "step": 13410, + "time_per_iteration": 2.6257004737854004 + }, + { + "auxiliary_loss_clip": 0.0111217, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.03988934, + "balance_loss_mlp": 1.01815844, + "epoch": 0.8063129415301368, + "flos": 27267686964000.0, + "grad_norm": 1.8603647622178525, + "language_loss": 0.68297899, + "learning_rate": 3.806850225032117e-07, + "loss": 0.7043981, + "num_input_tokens_seen": 289482635, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11584473, + "step": 13411, + "time_per_iteration": 2.6742818355560303 + }, + { + "auxiliary_loss_clip": 0.01111079, + "auxiliary_loss_mlp": 0.01027692, + "balance_loss_clip": 1.03946948, + "balance_loss_mlp": 1.01660562, + "epoch": 0.8063730647828048, + "flos": 29270899657440.0, + "grad_norm": 1.7542407684390824, + "language_loss": 0.67963988, + "learning_rate": 3.804564771039551e-07, + "loss": 0.70102757, + "num_input_tokens_seen": 289502040, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11090088, + "step": 13412, + "time_per_iteration": 2.700004816055298 + }, + { + "auxiliary_loss_clip": 0.0111656, + "auxiliary_loss_mlp": 0.01034348, + "balance_loss_clip": 1.04082847, + "balance_loss_mlp": 1.02131212, + "epoch": 0.8064331880354727, + "flos": 26016853237920.0, + "grad_norm": 1.763425516851602, + "language_loss": 0.81177545, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.8332845, + "num_input_tokens_seen": 289520740, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.13043213, + "step": 13413, + "time_per_iteration": 2.730059862136841 + }, + { + "auxiliary_loss_clip": 0.01112728, + "auxiliary_loss_mlp": 0.01032123, + "balance_loss_clip": 1.03940773, + "balance_loss_mlp": 1.02070904, + "epoch": 0.8064933112881407, + "flos": 24017003478720.0, + "grad_norm": 2.003143859157547, + "language_loss": 0.84631634, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.86776483, + "num_input_tokens_seen": 289535840, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11413574, + "step": 13414, + "time_per_iteration": 2.6011784076690674 + }, + { + "auxiliary_loss_clip": 0.01110609, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.03801644, + "balance_loss_mlp": 1.01898432, + "epoch": 0.8065534345408086, + "flos": 23525193553920.0, + "grad_norm": 1.8806024696269819, + "language_loss": 0.67341089, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.69481725, + "num_input_tokens_seen": 289555205, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11047363, + "step": 13415, + "time_per_iteration": 2.6556735038757324 + }, + { + "auxiliary_loss_clip": 0.01109469, + "auxiliary_loss_mlp": 0.01023522, + "balance_loss_clip": 1.03884661, + "balance_loss_mlp": 1.01247692, + "epoch": 0.8066135577934767, + "flos": 24011168990400.0, + "grad_norm": 1.5879042087380835, + "language_loss": 0.76595449, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.78728437, + "num_input_tokens_seen": 289573000, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11053467, + "step": 13416, + "time_per_iteration": 2.599362373352051 + }, + { + "auxiliary_loss_clip": 0.01114508, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.03767502, + "balance_loss_mlp": 1.02059495, + "epoch": 0.8066736810461446, + "flos": 25799347713600.0, + "grad_norm": 1.623084320040291, + "language_loss": 0.65438735, + "learning_rate": 3.793146714797086e-07, + "loss": 0.67584932, + "num_input_tokens_seen": 289592625, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11102295, + "step": 13417, + "time_per_iteration": 2.677295684814453 + }, + { + "auxiliary_loss_clip": 0.01115631, + "auxiliary_loss_mlp": 0.01038926, + "balance_loss_clip": 1.04021239, + "balance_loss_mlp": 1.02755308, + "epoch": 0.8067338042988126, + "flos": 27574885045440.0, + "grad_norm": 1.6602575107278033, + "language_loss": 0.80578923, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.82733482, + "num_input_tokens_seen": 289610780, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.1137085, + "step": 13418, + "time_per_iteration": 2.629535436630249 + }, + { + "auxiliary_loss_clip": 0.01114199, + "auxiliary_loss_mlp": 0.01028787, + "balance_loss_clip": 1.0395658, + "balance_loss_mlp": 1.01719379, + "epoch": 0.8067939275514805, + "flos": 20143293649920.0, + "grad_norm": 1.6175152881662074, + "language_loss": 0.85225224, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.87368214, + "num_input_tokens_seen": 289628890, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11578369, + "step": 13419, + "time_per_iteration": 2.6395463943481445 + }, + { + "auxiliary_loss_clip": 0.01114112, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.03920197, + "balance_loss_mlp": 1.0182054, + "epoch": 0.8068540508041485, + "flos": 34828023224160.0, + "grad_norm": 1.6836143229245248, + "language_loss": 0.75866401, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.78010499, + "num_input_tokens_seen": 289647220, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11785889, + "step": 13420, + "time_per_iteration": 2.6634180545806885 + }, + { + "auxiliary_loss_clip": 0.01109669, + "auxiliary_loss_mlp": 0.01024624, + "balance_loss_clip": 1.03763163, + "balance_loss_mlp": 1.01462865, + "epoch": 0.8069141740568164, + "flos": 26420510262240.0, + "grad_norm": 1.8189141377584337, + "language_loss": 0.78553778, + "learning_rate": 3.784023331462207e-07, + "loss": 0.80688071, + "num_input_tokens_seen": 289665800, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.09997559, + "step": 13421, + "time_per_iteration": 2.6863138675689697 + }, + { + "auxiliary_loss_clip": 0.01114142, + "auxiliary_loss_mlp": 0.01022286, + "balance_loss_clip": 1.04022574, + "balance_loss_mlp": 1.01094306, + "epoch": 0.8069742973094844, + "flos": 21389143750560.0, + "grad_norm": 1.912576116943567, + "language_loss": 0.79353821, + "learning_rate": 3.78174402269098e-07, + "loss": 0.81490248, + "num_input_tokens_seen": 289682705, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11352539, + "step": 13422, + "time_per_iteration": 2.6059539318084717 + }, + { + "auxiliary_loss_clip": 0.01110801, + "auxiliary_loss_mlp": 0.01031022, + "balance_loss_clip": 1.03835177, + "balance_loss_mlp": 1.0197804, + "epoch": 0.8070344205621525, + "flos": 28512767236320.0, + "grad_norm": 1.6501090886765766, + "language_loss": 0.67710006, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.69851828, + "num_input_tokens_seen": 289702920, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11248779, + "step": 13423, + "time_per_iteration": 2.7106008529663086 + }, + { + "auxiliary_loss_clip": 0.01114163, + "auxiliary_loss_mlp": 0.01034433, + "balance_loss_clip": 1.0392549, + "balance_loss_mlp": 1.02258396, + "epoch": 0.8070945438148204, + "flos": 27979595519040.0, + "grad_norm": 1.8783296165341665, + "language_loss": 0.80318242, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.82466841, + "num_input_tokens_seen": 289723280, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11853027, + "step": 13424, + "time_per_iteration": 2.651566743850708 + }, + { + "auxiliary_loss_clip": 0.01113484, + "auxiliary_loss_mlp": 0.01026917, + "balance_loss_clip": 1.03779674, + "balance_loss_mlp": 1.01535916, + "epoch": 0.8071546670674884, + "flos": 30872359638720.0, + "grad_norm": 1.6003561276444374, + "language_loss": 0.78795344, + "learning_rate": 3.774909786710232e-07, + "loss": 0.80935746, + "num_input_tokens_seen": 289743475, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11566162, + "step": 13425, + "time_per_iteration": 2.7731926441192627 + }, + { + "auxiliary_loss_clip": 0.01110228, + "auxiliary_loss_mlp": 0.01030679, + "balance_loss_clip": 1.03815234, + "balance_loss_mlp": 1.01993823, + "epoch": 0.8072147903201563, + "flos": 22102186789440.0, + "grad_norm": 2.7724576871534623, + "language_loss": 0.75027788, + "learning_rate": 3.772632938448923e-07, + "loss": 0.77168691, + "num_input_tokens_seen": 289761400, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10748291, + "step": 13426, + "time_per_iteration": 2.6597490310668945 + }, + { + "auxiliary_loss_clip": 0.01111628, + "auxiliary_loss_mlp": 0.01025324, + "balance_loss_clip": 1.03848433, + "balance_loss_mlp": 1.01417804, + "epoch": 0.8072749135728243, + "flos": 32565375972000.0, + "grad_norm": 1.77711447686593, + "language_loss": 0.73107183, + "learning_rate": 3.770356705530997e-07, + "loss": 0.75244141, + "num_input_tokens_seen": 289781025, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11138916, + "step": 13427, + "time_per_iteration": 2.69921875 + }, + { + "auxiliary_loss_clip": 0.01112659, + "auxiliary_loss_mlp": 0.01034653, + "balance_loss_clip": 1.03902209, + "balance_loss_mlp": 1.02274442, + "epoch": 0.8073350368254922, + "flos": 23477348996640.0, + "grad_norm": 2.787723989364437, + "language_loss": 0.69908965, + "learning_rate": 3.768081088042774e-07, + "loss": 0.72056282, + "num_input_tokens_seen": 289798380, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11901855, + "step": 13428, + "time_per_iteration": 2.631514072418213 + }, + { + "auxiliary_loss_clip": 0.01113654, + "auxiliary_loss_mlp": 0.01026122, + "balance_loss_clip": 1.03971028, + "balance_loss_mlp": 1.01569712, + "epoch": 0.8073951600781603, + "flos": 16270232097600.0, + "grad_norm": 2.698485959771909, + "language_loss": 0.74868089, + "learning_rate": 3.765806086070544e-07, + "loss": 0.7700786, + "num_input_tokens_seen": 289814515, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.10418701, + "step": 13429, + "time_per_iteration": 2.581195592880249 + }, + { + "auxiliary_loss_clip": 0.01108981, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.03813481, + "balance_loss_mlp": 1.02010715, + "epoch": 0.8074552833308282, + "flos": 27886013303040.0, + "grad_norm": 1.771978551996487, + "language_loss": 0.67116326, + "learning_rate": 3.763531699700568e-07, + "loss": 0.69256395, + "num_input_tokens_seen": 289834315, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10980225, + "step": 13430, + "time_per_iteration": 2.730328321456909 + }, + { + "auxiliary_loss_clip": 0.01113627, + "auxiliary_loss_mlp": 0.01028499, + "balance_loss_clip": 1.0402534, + "balance_loss_mlp": 1.01746058, + "epoch": 0.8075154065834962, + "flos": 24818158900800.0, + "grad_norm": 1.8721676556250777, + "language_loss": 0.80174059, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.8231619, + "num_input_tokens_seen": 289853770, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11035156, + "step": 13431, + "time_per_iteration": 4.098567962646484 + }, + { + "auxiliary_loss_clip": 0.0111114, + "auxiliary_loss_mlp": 0.01026275, + "balance_loss_clip": 1.03938925, + "balance_loss_mlp": 1.01434851, + "epoch": 0.8075755298361641, + "flos": 26732367830880.0, + "grad_norm": 1.754025905286549, + "language_loss": 0.80434442, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.82571852, + "num_input_tokens_seen": 289870480, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.1192627, + "step": 13432, + "time_per_iteration": 2.6748907566070557 + }, + { + "auxiliary_loss_clip": 0.01119103, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.04213512, + "balance_loss_mlp": 1.02025127, + "epoch": 0.8076356530888321, + "flos": 19119324939840.0, + "grad_norm": 3.907312144662862, + "language_loss": 0.70192742, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.72343969, + "num_input_tokens_seen": 289888275, + "router_z_loss_clip": 0.77050781, + "router_z_loss_mlp": 0.11883545, + "step": 13433, + "time_per_iteration": 2.6356189250946045 + }, + { + "auxiliary_loss_clip": 0.0111174, + "auxiliary_loss_mlp": 0.01025427, + "balance_loss_clip": 1.03852344, + "balance_loss_mlp": 1.01447797, + "epoch": 0.8076957763415, + "flos": 46097554040640.0, + "grad_norm": 1.444205958814575, + "language_loss": 0.72521359, + "learning_rate": 3.754440311967828e-07, + "loss": 0.74658531, + "num_input_tokens_seen": 289911495, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10955811, + "step": 13434, + "time_per_iteration": 2.7578272819519043 + }, + { + "auxiliary_loss_clip": 0.01113887, + "auxiliary_loss_mlp": 0.010254, + "balance_loss_clip": 1.04153037, + "balance_loss_mlp": 1.01462924, + "epoch": 0.807755899594168, + "flos": 23928931612800.0, + "grad_norm": 2.1498598305974386, + "language_loss": 0.68281078, + "learning_rate": 3.752169004902361e-07, + "loss": 0.70420367, + "num_input_tokens_seen": 289930045, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10778809, + "step": 13435, + "time_per_iteration": 2.6973254680633545 + }, + { + "auxiliary_loss_clip": 0.01115316, + "auxiliary_loss_mlp": 0.01034528, + "balance_loss_clip": 1.04036391, + "balance_loss_mlp": 1.02130175, + "epoch": 0.8078160228468361, + "flos": 28424654853120.0, + "grad_norm": 1.7590869647047156, + "language_loss": 0.7515679, + "learning_rate": 3.749898313956279e-07, + "loss": 0.77306628, + "num_input_tokens_seen": 289950815, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.13238525, + "step": 13436, + "time_per_iteration": 4.081660032272339 + }, + { + "auxiliary_loss_clip": 0.01107486, + "auxiliary_loss_mlp": 0.0102704, + "balance_loss_clip": 1.03596473, + "balance_loss_mlp": 1.01545858, + "epoch": 0.807876146099504, + "flos": 33144366382560.0, + "grad_norm": 1.995340041009147, + "language_loss": 0.70148063, + "learning_rate": 3.747628239215674e-07, + "loss": 0.72282588, + "num_input_tokens_seen": 289971730, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11578369, + "step": 13437, + "time_per_iteration": 2.7926242351531982 + }, + { + "auxiliary_loss_clip": 0.01114189, + "auxiliary_loss_mlp": 0.01030889, + "balance_loss_clip": 1.04238701, + "balance_loss_mlp": 1.01981449, + "epoch": 0.807936269352172, + "flos": 33141408621120.0, + "grad_norm": 1.8628350499433284, + "language_loss": 0.72574568, + "learning_rate": 3.745358780766636e-07, + "loss": 0.74719644, + "num_input_tokens_seen": 289992995, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11071777, + "step": 13438, + "time_per_iteration": 2.7233147621154785 + }, + { + "auxiliary_loss_clip": 0.01111712, + "auxiliary_loss_mlp": 0.01031147, + "balance_loss_clip": 1.03958654, + "balance_loss_mlp": 1.02004302, + "epoch": 0.8079963926048399, + "flos": 25307537788800.0, + "grad_norm": 1.896049272739271, + "language_loss": 0.77100009, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.79242867, + "num_input_tokens_seen": 290009405, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11102295, + "step": 13439, + "time_per_iteration": 2.619610071182251 + }, + { + "auxiliary_loss_clip": 0.01110777, + "auxiliary_loss_mlp": 0.01030471, + "balance_loss_clip": 1.03842211, + "balance_loss_mlp": 1.01878262, + "epoch": 0.8080565158575079, + "flos": 30517154930880.0, + "grad_norm": 1.629247781493086, + "language_loss": 0.78868139, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.81009388, + "num_input_tokens_seen": 290031085, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11682129, + "step": 13440, + "time_per_iteration": 2.781329870223999 + }, + { + "auxiliary_loss_clip": 0.01113417, + "auxiliary_loss_mlp": 0.01026087, + "balance_loss_clip": 1.03843641, + "balance_loss_mlp": 1.01408291, + "epoch": 0.8081166391101758, + "flos": 22815229828320.0, + "grad_norm": 1.9162856443216933, + "language_loss": 0.59463429, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.61602938, + "num_input_tokens_seen": 290048670, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12011719, + "step": 13441, + "time_per_iteration": 2.614661693572998 + }, + { + "auxiliary_loss_clip": 0.01111198, + "auxiliary_loss_mlp": 0.0102732, + "balance_loss_clip": 1.03981018, + "balance_loss_mlp": 1.01560819, + "epoch": 0.8081767623628439, + "flos": 24194889453600.0, + "grad_norm": 2.120187327448974, + "language_loss": 0.75985742, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.78124261, + "num_input_tokens_seen": 290064085, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11712646, + "step": 13442, + "time_per_iteration": 4.072803974151611 + }, + { + "auxiliary_loss_clip": 0.01112946, + "auxiliary_loss_mlp": 0.01027759, + "balance_loss_clip": 1.03921747, + "balance_loss_mlp": 1.01623178, + "epoch": 0.8082368856155118, + "flos": 43651348394400.0, + "grad_norm": 1.941895437772194, + "language_loss": 0.70672297, + "learning_rate": 3.734020735906169e-07, + "loss": 0.72813004, + "num_input_tokens_seen": 290086255, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11535645, + "step": 13443, + "time_per_iteration": 2.946333169937134 + }, + { + "auxiliary_loss_clip": 0.01111424, + "auxiliary_loss_mlp": 0.01035299, + "balance_loss_clip": 1.03935122, + "balance_loss_mlp": 1.02408779, + "epoch": 0.8082970088681798, + "flos": 20985122070720.0, + "grad_norm": 2.259856558681174, + "language_loss": 0.82482576, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.84629297, + "num_input_tokens_seen": 290103995, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11206055, + "step": 13444, + "time_per_iteration": 4.114064455032349 + }, + { + "auxiliary_loss_clip": 0.01030192, + "auxiliary_loss_mlp": 0.01001242, + "balance_loss_clip": 1.00785708, + "balance_loss_mlp": 1.00025749, + "epoch": 0.8083571321208477, + "flos": 77551861851360.0, + "grad_norm": 0.822107651690635, + "language_loss": 0.53627419, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.55658853, + "num_input_tokens_seen": 290157245, + "router_z_loss_clip": 0.22338867, + "router_z_loss_mlp": 0.00984192, + "step": 13445, + "time_per_iteration": 3.145787239074707 + }, + { + "auxiliary_loss_clip": 0.01113349, + "auxiliary_loss_mlp": 0.01027647, + "balance_loss_clip": 1.04072046, + "balance_loss_mlp": 1.01504064, + "epoch": 0.8084172553735157, + "flos": 21878927811360.0, + "grad_norm": 1.9693343618270824, + "language_loss": 0.7168839, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.73829389, + "num_input_tokens_seen": 290174970, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12615967, + "step": 13446, + "time_per_iteration": 2.6532535552978516 + }, + { + "auxiliary_loss_clip": 0.01116406, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.04095435, + "balance_loss_mlp": 1.01884902, + "epoch": 0.8084773786261836, + "flos": 29404628147520.0, + "grad_norm": 1.8094297259510954, + "language_loss": 0.71077168, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.73224807, + "num_input_tokens_seen": 290194395, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12390137, + "step": 13447, + "time_per_iteration": 2.6708297729492188 + }, + { + "auxiliary_loss_clip": 0.01115622, + "auxiliary_loss_mlp": 0.01032337, + "balance_loss_clip": 1.0400244, + "balance_loss_mlp": 1.01978469, + "epoch": 0.8085375018788516, + "flos": 19021123753920.0, + "grad_norm": 2.0991865041218563, + "language_loss": 0.75119483, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.77267444, + "num_input_tokens_seen": 290209200, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12554932, + "step": 13448, + "time_per_iteration": 2.7053332328796387 + }, + { + "auxiliary_loss_clip": 0.01030328, + "auxiliary_loss_mlp": 0.01001626, + "balance_loss_clip": 1.0079031, + "balance_loss_mlp": 1.00068271, + "epoch": 0.8085976251315197, + "flos": 82437715694880.0, + "grad_norm": 0.7375212641695685, + "language_loss": 0.6378988, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.65821838, + "num_input_tokens_seen": 290274565, + "router_z_loss_clip": 0.22412109, + "router_z_loss_mlp": 0.00942993, + "step": 13449, + "time_per_iteration": 3.2858030796051025 + }, + { + "auxiliary_loss_clip": 0.01113565, + "auxiliary_loss_mlp": 0.01027822, + "balance_loss_clip": 1.04070067, + "balance_loss_mlp": 1.0160805, + "epoch": 0.8086577483841876, + "flos": 27529998249600.0, + "grad_norm": 1.7730152751730226, + "language_loss": 0.73984981, + "learning_rate": 3.718173381422105e-07, + "loss": 0.76126367, + "num_input_tokens_seen": 290293630, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11749268, + "step": 13450, + "time_per_iteration": 2.701637029647827 + }, + { + "auxiliary_loss_clip": 0.0111082, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.03766274, + "balance_loss_mlp": 1.01927352, + "epoch": 0.8087178716368556, + "flos": 21924462883680.0, + "grad_norm": 2.0420041763478185, + "language_loss": 0.74208015, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.76349294, + "num_input_tokens_seen": 290311450, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11187744, + "step": 13451, + "time_per_iteration": 2.625936985015869 + }, + { + "auxiliary_loss_clip": 0.01115947, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.03866601, + "balance_loss_mlp": 1.01705015, + "epoch": 0.8087779948895235, + "flos": 26502869191680.0, + "grad_norm": 1.9645037147327478, + "language_loss": 0.80041957, + "learning_rate": 3.713651121244543e-07, + "loss": 0.82188082, + "num_input_tokens_seen": 290330165, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.13104248, + "step": 13452, + "time_per_iteration": 2.729320526123047 + }, + { + "auxiliary_loss_clip": 0.01113506, + "auxiliary_loss_mlp": 0.01031678, + "balance_loss_clip": 1.03945017, + "balance_loss_mlp": 1.02032888, + "epoch": 0.8088381181421915, + "flos": 35500109643360.0, + "grad_norm": 1.6890264116337792, + "language_loss": 0.78262782, + "learning_rate": 3.711390917482875e-07, + "loss": 0.80407965, + "num_input_tokens_seen": 290350815, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11346436, + "step": 13453, + "time_per_iteration": 2.676835060119629 + }, + { + "auxiliary_loss_clip": 0.01111932, + "auxiliary_loss_mlp": 0.01031302, + "balance_loss_clip": 1.0379318, + "balance_loss_mlp": 1.01910722, + "epoch": 0.8088982413948594, + "flos": 27086843227680.0, + "grad_norm": 3.2142781014382864, + "language_loss": 0.77355886, + "learning_rate": 3.709131331386892e-07, + "loss": 0.7949912, + "num_input_tokens_seen": 290367380, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12188721, + "step": 13454, + "time_per_iteration": 2.664320707321167 + }, + { + "auxiliary_loss_clip": 0.01110883, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.03877783, + "balance_loss_mlp": 1.02038741, + "epoch": 0.8089583646475275, + "flos": 34211114989920.0, + "grad_norm": 2.3964884331416867, + "language_loss": 0.7677691, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.78920203, + "num_input_tokens_seen": 290387965, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12017822, + "step": 13455, + "time_per_iteration": 2.70823073387146 + }, + { + "auxiliary_loss_clip": 0.01111966, + "auxiliary_loss_mlp": 0.01028103, + "balance_loss_clip": 1.03845882, + "balance_loss_mlp": 1.01698661, + "epoch": 0.8090184879001954, + "flos": 20276698001760.0, + "grad_norm": 1.8545529548851547, + "language_loss": 0.78543591, + "learning_rate": 3.70461401253471e-07, + "loss": 0.80683661, + "num_input_tokens_seen": 290404150, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11132812, + "step": 13456, + "time_per_iteration": 2.6053667068481445 + }, + { + "auxiliary_loss_clip": 0.01112911, + "auxiliary_loss_mlp": 0.01037438, + "balance_loss_clip": 1.04121244, + "balance_loss_mlp": 1.02645302, + "epoch": 0.8090786111528634, + "flos": 33362155527840.0, + "grad_norm": 2.1796517305523033, + "language_loss": 0.72041512, + "learning_rate": 3.702356279949801e-07, + "loss": 0.74191862, + "num_input_tokens_seen": 290422370, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10986328, + "step": 13457, + "time_per_iteration": 2.7175447940826416 + }, + { + "auxiliary_loss_clip": 0.01111652, + "auxiliary_loss_mlp": 0.01028495, + "balance_loss_clip": 1.03941464, + "balance_loss_mlp": 1.01822567, + "epoch": 0.8091387344055313, + "flos": 25753285916640.0, + "grad_norm": 1.8997051211157558, + "language_loss": 0.72750461, + "learning_rate": 3.700099165373176e-07, + "loss": 0.74890614, + "num_input_tokens_seen": 290442645, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10266113, + "step": 13458, + "time_per_iteration": 2.7028589248657227 + }, + { + "auxiliary_loss_clip": 0.0111279, + "auxiliary_loss_mlp": 0.01035908, + "balance_loss_clip": 1.03909326, + "balance_loss_mlp": 1.02420771, + "epoch": 0.8091988576581993, + "flos": 14220917089920.0, + "grad_norm": 2.344722512132171, + "language_loss": 0.78381425, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.80530119, + "num_input_tokens_seen": 290458520, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11706543, + "step": 13459, + "time_per_iteration": 2.609318733215332 + }, + { + "auxiliary_loss_clip": 0.01113852, + "auxiliary_loss_mlp": 0.01026847, + "balance_loss_clip": 1.03886843, + "balance_loss_mlp": 1.0146997, + "epoch": 0.8092589809108672, + "flos": 28020349552320.0, + "grad_norm": 2.2898064655032666, + "language_loss": 0.79683787, + "learning_rate": 3.695586790587113e-07, + "loss": 0.81824481, + "num_input_tokens_seen": 290474465, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12158203, + "step": 13460, + "time_per_iteration": 2.620482921600342 + }, + { + "auxiliary_loss_clip": 0.01114595, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.03905761, + "balance_loss_mlp": 1.01805973, + "epoch": 0.8093191041635353, + "flos": 16180539540480.0, + "grad_norm": 1.8528540803653923, + "language_loss": 0.84524751, + "learning_rate": 3.693331530548789e-07, + "loss": 0.86669725, + "num_input_tokens_seen": 290492060, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12310791, + "step": 13461, + "time_per_iteration": 2.6429266929626465 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01034953, + "balance_loss_clip": 1.03992176, + "balance_loss_mlp": 1.0228827, + "epoch": 0.8093792274162032, + "flos": 31135440752640.0, + "grad_norm": 1.9274346270944052, + "language_loss": 0.76337659, + "learning_rate": 3.69107688886096e-07, + "loss": 0.78487718, + "num_input_tokens_seen": 290511510, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12072754, + "step": 13462, + "time_per_iteration": 2.657853841781616 + }, + { + "auxiliary_loss_clip": 0.0111416, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.04080105, + "balance_loss_mlp": 1.02101123, + "epoch": 0.8094393506688712, + "flos": 28732015003680.0, + "grad_norm": 1.6757841035645038, + "language_loss": 0.82882977, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.85030371, + "num_input_tokens_seen": 290530035, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12219238, + "step": 13463, + "time_per_iteration": 2.6896519660949707 + }, + { + "auxiliary_loss_clip": 0.01109996, + "auxiliary_loss_mlp": 0.01032329, + "balance_loss_clip": 1.03827953, + "balance_loss_mlp": 1.02165365, + "epoch": 0.8094994739215392, + "flos": 20811368858400.0, + "grad_norm": 2.064867866215535, + "language_loss": 0.62456477, + "learning_rate": 3.686569460878779e-07, + "loss": 0.64598799, + "num_input_tokens_seen": 290548245, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10681152, + "step": 13464, + "time_per_iteration": 2.6732230186462402 + }, + { + "auxiliary_loss_clip": 0.01110561, + "auxiliary_loss_mlp": 0.01027085, + "balance_loss_clip": 1.03935242, + "balance_loss_mlp": 1.01661849, + "epoch": 0.8095595971742071, + "flos": 28737363284640.0, + "grad_norm": 1.6677296185310957, + "language_loss": 0.61847311, + "learning_rate": 3.684316674755341e-07, + "loss": 0.63984954, + "num_input_tokens_seen": 290568625, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10467529, + "step": 13465, + "time_per_iteration": 2.709346294403076 + }, + { + "auxiliary_loss_clip": 0.01112626, + "auxiliary_loss_mlp": 0.01035257, + "balance_loss_clip": 1.04121113, + "balance_loss_mlp": 1.0236578, + "epoch": 0.8096197204268751, + "flos": 24859642245120.0, + "grad_norm": 1.6999050587510964, + "language_loss": 0.81462216, + "learning_rate": 3.682064507324256e-07, + "loss": 0.83610106, + "num_input_tokens_seen": 290586575, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.1159668, + "step": 13466, + "time_per_iteration": 2.628913402557373 + }, + { + "auxiliary_loss_clip": 0.01116264, + "auxiliary_loss_mlp": 0.01036364, + "balance_loss_clip": 1.04187298, + "balance_loss_mlp": 1.025069, + "epoch": 0.809679843679543, + "flos": 33945076114560.0, + "grad_norm": 9.353990825689918, + "language_loss": 0.76023757, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.78176379, + "num_input_tokens_seen": 290606790, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11291504, + "step": 13467, + "time_per_iteration": 2.6752448081970215 + }, + { + "auxiliary_loss_clip": 0.01109613, + "auxiliary_loss_mlp": 0.01026919, + "balance_loss_clip": 1.03571033, + "balance_loss_mlp": 1.01582026, + "epoch": 0.8097399669322111, + "flos": 26862085110240.0, + "grad_norm": 1.815920466151208, + "language_loss": 0.78927195, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.81063724, + "num_input_tokens_seen": 290625525, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11096191, + "step": 13468, + "time_per_iteration": 2.6164469718933105 + }, + { + "auxiliary_loss_clip": 0.01106742, + "auxiliary_loss_mlp": 0.01027722, + "balance_loss_clip": 1.03653121, + "balance_loss_mlp": 1.01679635, + "epoch": 0.809800090184879, + "flos": 23170312984320.0, + "grad_norm": 1.8417132273239387, + "language_loss": 0.67602301, + "learning_rate": 3.675311718038978e-07, + "loss": 0.69736767, + "num_input_tokens_seen": 290644935, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10925293, + "step": 13469, + "time_per_iteration": 2.6213483810424805 + }, + { + "auxiliary_loss_clip": 0.01030089, + "auxiliary_loss_mlp": 0.01000193, + "balance_loss_clip": 1.00777698, + "balance_loss_mlp": 0.99919403, + "epoch": 0.809860213437547, + "flos": 80654155941600.0, + "grad_norm": 0.6934925408159157, + "language_loss": 0.54658169, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56688452, + "num_input_tokens_seen": 290710735, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.00998688, + "step": 13470, + "time_per_iteration": 4.825993299484253 + }, + { + "auxiliary_loss_clip": 0.01111482, + "auxiliary_loss_mlp": 0.0102677, + "balance_loss_clip": 1.03870976, + "balance_loss_mlp": 1.01584411, + "epoch": 0.8099203366902149, + "flos": 25481250483840.0, + "grad_norm": 2.839805831072968, + "language_loss": 0.6939491, + "learning_rate": 3.670812953542279e-07, + "loss": 0.71533167, + "num_input_tokens_seen": 290729565, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10919189, + "step": 13471, + "time_per_iteration": 2.604015827178955 + }, + { + "auxiliary_loss_clip": 0.01111654, + "auxiliary_loss_mlp": 0.01025487, + "balance_loss_clip": 1.03887725, + "balance_loss_mlp": 1.01437032, + "epoch": 0.8099804599428829, + "flos": 31763410204320.0, + "grad_norm": 1.8168855483000392, + "language_loss": 0.79952556, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.82089698, + "num_input_tokens_seen": 290749360, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11102295, + "step": 13472, + "time_per_iteration": 2.7064547538757324 + }, + { + "auxiliary_loss_clip": 0.01030393, + "auxiliary_loss_mlp": 0.01000986, + "balance_loss_clip": 1.00800896, + "balance_loss_mlp": 0.99995548, + "epoch": 0.8100405831955508, + "flos": 84564608592960.0, + "grad_norm": 0.7448152000679772, + "language_loss": 0.5767411, + "learning_rate": 3.666316665863201e-07, + "loss": 0.5970549, + "num_input_tokens_seen": 290812145, + "router_z_loss_clip": 0.22399902, + "router_z_loss_mlp": 0.01030731, + "step": 13473, + "time_per_iteration": 3.2459561824798584 + }, + { + "auxiliary_loss_clip": 0.01113341, + "auxiliary_loss_mlp": 0.01026314, + "balance_loss_clip": 1.0392698, + "balance_loss_mlp": 1.01496506, + "epoch": 0.8101007064482189, + "flos": 18317845379520.0, + "grad_norm": 2.040246106404125, + "language_loss": 0.73763061, + "learning_rate": 3.664069451043399e-07, + "loss": 0.75902712, + "num_input_tokens_seen": 290829845, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11346436, + "step": 13474, + "time_per_iteration": 2.5861239433288574 + }, + { + "auxiliary_loss_clip": 0.01116977, + "auxiliary_loss_mlp": 0.01038083, + "balance_loss_clip": 1.04138732, + "balance_loss_mlp": 1.02641249, + "epoch": 0.8101608297008868, + "flos": 25706454291360.0, + "grad_norm": 1.6564511239045276, + "language_loss": 0.78243816, + "learning_rate": 3.661822855683723e-07, + "loss": 0.80398881, + "num_input_tokens_seen": 290848815, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11663818, + "step": 13475, + "time_per_iteration": 2.6853044033050537 + }, + { + "auxiliary_loss_clip": 0.01109813, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.03822613, + "balance_loss_mlp": 1.02585375, + "epoch": 0.8102209529535548, + "flos": 28957299845760.0, + "grad_norm": 1.7871484308864691, + "language_loss": 0.7521866, + "learning_rate": 3.659576879869364e-07, + "loss": 0.77365434, + "num_input_tokens_seen": 290868580, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11114502, + "step": 13476, + "time_per_iteration": 4.2338643074035645 + }, + { + "auxiliary_loss_clip": 0.01117311, + "auxiliary_loss_mlp": 0.01035339, + "balance_loss_clip": 1.04135323, + "balance_loss_mlp": 1.02316785, + "epoch": 0.8102810762062228, + "flos": 13368270555360.0, + "grad_norm": 4.545912707937857, + "language_loss": 0.73668742, + "learning_rate": 3.657331523685485e-07, + "loss": 0.75821388, + "num_input_tokens_seen": 290883540, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12176514, + "step": 13477, + "time_per_iteration": 2.609072685241699 + }, + { + "auxiliary_loss_clip": 0.01109808, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.03734851, + "balance_loss_mlp": 1.02082705, + "epoch": 0.8103411994588907, + "flos": 17873961046560.0, + "grad_norm": 2.106877623946606, + "language_loss": 0.69741023, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.71882832, + "num_input_tokens_seen": 290901560, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11181641, + "step": 13478, + "time_per_iteration": 2.6956100463867188 + }, + { + "auxiliary_loss_clip": 0.01030241, + "auxiliary_loss_mlp": 0.01001859, + "balance_loss_clip": 1.00788343, + "balance_loss_mlp": 1.0008285, + "epoch": 0.8104013227115587, + "flos": 72179147541600.0, + "grad_norm": 0.6854470310686616, + "language_loss": 0.52145576, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54177678, + "num_input_tokens_seen": 290959185, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.01030731, + "step": 13479, + "time_per_iteration": 3.1658687591552734 + }, + { + "auxiliary_loss_clip": 0.01111723, + "auxiliary_loss_mlp": 0.01029757, + "balance_loss_clip": 1.03906775, + "balance_loss_mlp": 1.01824093, + "epoch": 0.8104614459642266, + "flos": 24194929970880.0, + "grad_norm": 1.987274810561703, + "language_loss": 0.71400607, + "learning_rate": 3.650599173768072e-07, + "loss": 0.73542082, + "num_input_tokens_seen": 290979585, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11517334, + "step": 13480, + "time_per_iteration": 2.717559814453125 + }, + { + "auxiliary_loss_clip": 0.01112816, + "auxiliary_loss_mlp": 0.01030203, + "balance_loss_clip": 1.03947735, + "balance_loss_mlp": 1.01923585, + "epoch": 0.8105215692168947, + "flos": 30962092713120.0, + "grad_norm": 4.380356345948075, + "language_loss": 0.79707849, + "learning_rate": 3.648356296957327e-07, + "loss": 0.81850874, + "num_input_tokens_seen": 291000865, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.10961914, + "step": 13481, + "time_per_iteration": 2.648416519165039 + }, + { + "auxiliary_loss_clip": 0.01110574, + "auxiliary_loss_mlp": 0.01031753, + "balance_loss_clip": 1.03829992, + "balance_loss_mlp": 1.02047622, + "epoch": 0.8105816924695626, + "flos": 24992155216800.0, + "grad_norm": 1.9545964031679388, + "language_loss": 0.7254312, + "learning_rate": 3.646114040202548e-07, + "loss": 0.74685442, + "num_input_tokens_seen": 291018285, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11260986, + "step": 13482, + "time_per_iteration": 4.073571681976318 + }, + { + "auxiliary_loss_clip": 0.01112162, + "auxiliary_loss_mlp": 0.01026852, + "balance_loss_clip": 1.03762889, + "balance_loss_mlp": 1.01512194, + "epoch": 0.8106418157222306, + "flos": 17739624797280.0, + "grad_norm": 3.4398130256577417, + "language_loss": 0.65310413, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.67449427, + "num_input_tokens_seen": 291035745, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11730957, + "step": 13483, + "time_per_iteration": 3.8986098766326904 + }, + { + "auxiliary_loss_clip": 0.01112563, + "auxiliary_loss_mlp": 0.01025941, + "balance_loss_clip": 1.0395, + "balance_loss_mlp": 1.01419902, + "epoch": 0.8107019389748985, + "flos": 27533401701120.0, + "grad_norm": 2.3084388390596957, + "language_loss": 0.75925225, + "learning_rate": 3.641631387200992e-07, + "loss": 0.78063732, + "num_input_tokens_seen": 291053280, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11737061, + "step": 13484, + "time_per_iteration": 2.6649489402770996 + }, + { + "auxiliary_loss_clip": 0.0111649, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.03982162, + "balance_loss_mlp": 1.0204668, + "epoch": 0.8107620622275665, + "flos": 23927554025280.0, + "grad_norm": 1.7369929472080052, + "language_loss": 0.72048342, + "learning_rate": 3.639390991124183e-07, + "loss": 0.74197781, + "num_input_tokens_seen": 291072855, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12487793, + "step": 13485, + "time_per_iteration": 2.6702752113342285 + }, + { + "auxiliary_loss_clip": 0.01109156, + "auxiliary_loss_mlp": 0.01025687, + "balance_loss_clip": 1.03804815, + "balance_loss_mlp": 1.01496983, + "epoch": 0.8108221854802344, + "flos": 19697991212160.0, + "grad_norm": 1.9788064748835315, + "language_loss": 0.76005727, + "learning_rate": 3.637151215443308e-07, + "loss": 0.78140569, + "num_input_tokens_seen": 291090285, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.1071167, + "step": 13486, + "time_per_iteration": 2.6231882572174072 + }, + { + "auxiliary_loss_clip": 0.01115342, + "auxiliary_loss_mlp": 0.01029887, + "balance_loss_clip": 1.03952575, + "balance_loss_mlp": 1.01838923, + "epoch": 0.8108823087329025, + "flos": 25753812641280.0, + "grad_norm": 1.9128279744192516, + "language_loss": 0.7227881, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.7442404, + "num_input_tokens_seen": 291107675, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11486816, + "step": 13487, + "time_per_iteration": 2.6436095237731934 + }, + { + "auxiliary_loss_clip": 0.01107543, + "auxiliary_loss_mlp": 0.01033622, + "balance_loss_clip": 1.03872395, + "balance_loss_mlp": 1.02322078, + "epoch": 0.8109424319855704, + "flos": 35630637268320.0, + "grad_norm": 1.786820624882484, + "language_loss": 0.84268069, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.86409229, + "num_input_tokens_seen": 291126900, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10400391, + "step": 13488, + "time_per_iteration": 2.786419153213501 + }, + { + "auxiliary_loss_clip": 0.01114046, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.03982496, + "balance_loss_mlp": 1.01849127, + "epoch": 0.8110025552382384, + "flos": 28201355357760.0, + "grad_norm": 2.0985766776869994, + "language_loss": 0.735039, + "learning_rate": 3.630435611625502e-07, + "loss": 0.7564863, + "num_input_tokens_seen": 291145285, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12182617, + "step": 13489, + "time_per_iteration": 2.6668381690979004 + }, + { + "auxiliary_loss_clip": 0.01109378, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.03857887, + "balance_loss_mlp": 1.02029324, + "epoch": 0.8110626784909064, + "flos": 27308440997280.0, + "grad_norm": 1.7637688521027144, + "language_loss": 0.71394801, + "learning_rate": 3.628198318377453e-07, + "loss": 0.73535669, + "num_input_tokens_seen": 291163485, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11199951, + "step": 13490, + "time_per_iteration": 2.714343309402466 + }, + { + "auxiliary_loss_clip": 0.01116762, + "auxiliary_loss_mlp": 0.01035208, + "balance_loss_clip": 1.0414474, + "balance_loss_mlp": 1.02283382, + "epoch": 0.8111228017435743, + "flos": 28513901720160.0, + "grad_norm": 2.4682217212865676, + "language_loss": 0.71493298, + "learning_rate": 3.625961645949762e-07, + "loss": 0.7364527, + "num_input_tokens_seen": 291182215, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.1237793, + "step": 13491, + "time_per_iteration": 2.633681058883667 + }, + { + "auxiliary_loss_clip": 0.01111922, + "auxiliary_loss_mlp": 0.01026575, + "balance_loss_clip": 1.03779459, + "balance_loss_mlp": 1.01536345, + "epoch": 0.8111829249962423, + "flos": 25974073340640.0, + "grad_norm": 1.4814885923801588, + "language_loss": 0.67850417, + "learning_rate": 3.623725594427245e-07, + "loss": 0.69988918, + "num_input_tokens_seen": 291203145, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11206055, + "step": 13492, + "time_per_iteration": 2.7073206901550293 + }, + { + "auxiliary_loss_clip": 0.01114581, + "auxiliary_loss_mlp": 0.01033571, + "balance_loss_clip": 1.03979039, + "balance_loss_mlp": 1.02167964, + "epoch": 0.8112430482489102, + "flos": 27266268859200.0, + "grad_norm": 1.6443394659026027, + "language_loss": 0.72177219, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.74325371, + "num_input_tokens_seen": 291220600, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11889648, + "step": 13493, + "time_per_iteration": 2.6063458919525146 + }, + { + "auxiliary_loss_clip": 0.01111356, + "auxiliary_loss_mlp": 0.01037155, + "balance_loss_clip": 1.03752518, + "balance_loss_mlp": 1.02497196, + "epoch": 0.8113031715015783, + "flos": 37995902089920.0, + "grad_norm": 1.7087922741186943, + "language_loss": 0.70359194, + "learning_rate": 3.619255354436885e-07, + "loss": 0.72507703, + "num_input_tokens_seen": 291241195, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12182617, + "step": 13494, + "time_per_iteration": 2.7310950756073 + }, + { + "auxiliary_loss_clip": 0.01116608, + "auxiliary_loss_mlp": 0.0103246, + "balance_loss_clip": 1.04051948, + "balance_loss_mlp": 1.01959753, + "epoch": 0.8113632947542462, + "flos": 30914936949600.0, + "grad_norm": 5.444070747237478, + "language_loss": 0.76535875, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.78684944, + "num_input_tokens_seen": 291258715, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12866211, + "step": 13495, + "time_per_iteration": 2.6668636798858643 + }, + { + "auxiliary_loss_clip": 0.01114011, + "auxiliary_loss_mlp": 0.01033393, + "balance_loss_clip": 1.04013562, + "balance_loss_mlp": 1.02122831, + "epoch": 0.8114234180069142, + "flos": 34702114569120.0, + "grad_norm": 2.1135475757428543, + "language_loss": 0.79865718, + "learning_rate": 3.614787599084417e-07, + "loss": 0.82013124, + "num_input_tokens_seen": 291278030, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12164307, + "step": 13496, + "time_per_iteration": 2.6809189319610596 + }, + { + "auxiliary_loss_clip": 0.01110972, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.03760576, + "balance_loss_mlp": 1.02044678, + "epoch": 0.8114835412595821, + "flos": 24817551141600.0, + "grad_norm": 2.436762358442609, + "language_loss": 0.71530163, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.73673695, + "num_input_tokens_seen": 291296740, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12127686, + "step": 13497, + "time_per_iteration": 2.6072468757629395 + }, + { + "auxiliary_loss_clip": 0.01113251, + "auxiliary_loss_mlp": 0.01032722, + "balance_loss_clip": 1.03931117, + "balance_loss_mlp": 1.02212477, + "epoch": 0.8115436645122501, + "flos": 27443668626720.0, + "grad_norm": 1.7837617949480942, + "language_loss": 0.76770878, + "learning_rate": 3.610322329047508e-07, + "loss": 0.78916848, + "num_input_tokens_seen": 291318730, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1060791, + "step": 13498, + "time_per_iteration": 2.671670913696289 + }, + { + "auxiliary_loss_clip": 0.01112654, + "auxiliary_loss_mlp": 0.01032689, + "balance_loss_clip": 1.03908753, + "balance_loss_mlp": 1.02098858, + "epoch": 0.811603787764918, + "flos": 16893501544800.0, + "grad_norm": 3.233005171219618, + "language_loss": 0.83417076, + "learning_rate": 3.608090626234055e-07, + "loss": 0.8556242, + "num_input_tokens_seen": 291336755, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11712646, + "step": 13499, + "time_per_iteration": 2.5934388637542725 + }, + { + "auxiliary_loss_clip": 0.01112204, + "auxiliary_loss_mlp": 0.01029497, + "balance_loss_clip": 1.04005909, + "balance_loss_mlp": 1.01739144, + "epoch": 0.8116639110175861, + "flos": 26374448465280.0, + "grad_norm": 1.4850108447282209, + "language_loss": 0.76224393, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.78366101, + "num_input_tokens_seen": 291356795, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.12097168, + "step": 13500, + "time_per_iteration": 2.6929032802581787 + }, + { + "auxiliary_loss_clip": 0.01030389, + "auxiliary_loss_mlp": 0.01001674, + "balance_loss_clip": 1.00800776, + "balance_loss_mlp": 1.00067997, + "epoch": 0.811724034270254, + "flos": 78653050146720.0, + "grad_norm": 0.8071505132581243, + "language_loss": 0.59900838, + "learning_rate": 3.603629085440303e-07, + "loss": 0.61932898, + "num_input_tokens_seen": 291416005, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.0099411, + "step": 13501, + "time_per_iteration": 3.3770923614501953 + }, + { + "auxiliary_loss_clip": 0.01109527, + "auxiliary_loss_mlp": 0.01028362, + "balance_loss_clip": 1.03870487, + "balance_loss_mlp": 1.01716268, + "epoch": 0.811784157522922, + "flos": 30205337879520.0, + "grad_norm": 2.0351606195753216, + "language_loss": 0.78777546, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.80915439, + "num_input_tokens_seen": 291434870, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11206055, + "step": 13502, + "time_per_iteration": 2.7192087173461914 + }, + { + "auxiliary_loss_clip": 0.01112517, + "auxiliary_loss_mlp": 0.01033288, + "balance_loss_clip": 1.03971732, + "balance_loss_mlp": 1.02220738, + "epoch": 0.81184428077559, + "flos": 14845685676480.0, + "grad_norm": 1.9296759698511738, + "language_loss": 0.71016765, + "learning_rate": 3.599170031654635e-07, + "loss": 0.73162568, + "num_input_tokens_seen": 291452230, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11090088, + "step": 13503, + "time_per_iteration": 2.701317310333252 + }, + { + "auxiliary_loss_clip": 0.01112742, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.03865337, + "balance_loss_mlp": 1.0181576, + "epoch": 0.8119044040282579, + "flos": 54205445652480.0, + "grad_norm": 1.765243198368364, + "language_loss": 0.67731643, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.69875216, + "num_input_tokens_seen": 291477425, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12677002, + "step": 13504, + "time_per_iteration": 2.839848041534424 + }, + { + "auxiliary_loss_clip": 0.01111445, + "auxiliary_loss_mlp": 0.0102836, + "balance_loss_clip": 1.03630638, + "balance_loss_mlp": 1.01605189, + "epoch": 0.8119645272809259, + "flos": 63642559226400.0, + "grad_norm": 2.0009648446849697, + "language_loss": 0.74586338, + "learning_rate": 3.594713465553403e-07, + "loss": 0.76726139, + "num_input_tokens_seen": 291501070, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12310791, + "step": 13505, + "time_per_iteration": 2.903775453567505 + }, + { + "auxiliary_loss_clip": 0.01113868, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.03957796, + "balance_loss_mlp": 1.01718175, + "epoch": 0.8120246505335939, + "flos": 36890506347840.0, + "grad_norm": 3.11041578958548, + "language_loss": 0.72554505, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.74698138, + "num_input_tokens_seen": 291524945, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12585449, + "step": 13506, + "time_per_iteration": 2.724463701248169 + }, + { + "auxiliary_loss_clip": 0.01117819, + "auxiliary_loss_mlp": 0.01028678, + "balance_loss_clip": 1.04011703, + "balance_loss_mlp": 1.01685786, + "epoch": 0.8120847737862619, + "flos": 27000270501120.0, + "grad_norm": 3.1666015444080418, + "language_loss": 0.76368463, + "learning_rate": 3.590259387812593e-07, + "loss": 0.78514957, + "num_input_tokens_seen": 291544605, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.11810303, + "step": 13507, + "time_per_iteration": 2.6904802322387695 + }, + { + "auxiliary_loss_clip": 0.01112654, + "auxiliary_loss_mlp": 0.01028491, + "balance_loss_clip": 1.03661191, + "balance_loss_mlp": 1.01705861, + "epoch": 0.8121448970389298, + "flos": 28424978991360.0, + "grad_norm": 1.7377184231392147, + "language_loss": 0.70322907, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.72464055, + "num_input_tokens_seen": 291563850, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11425781, + "step": 13508, + "time_per_iteration": 2.668117046356201 + }, + { + "auxiliary_loss_clip": 0.01111965, + "auxiliary_loss_mlp": 0.01032411, + "balance_loss_clip": 1.038872, + "balance_loss_mlp": 1.02109861, + "epoch": 0.8122050202915978, + "flos": 27044792641440.0, + "grad_norm": 1.7252571013294236, + "language_loss": 0.76318687, + "learning_rate": 3.585807799107785e-07, + "loss": 0.78463066, + "num_input_tokens_seen": 291581730, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11322021, + "step": 13509, + "time_per_iteration": 2.6780264377593994 + }, + { + "auxiliary_loss_clip": 0.01114465, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.03923535, + "balance_loss_mlp": 1.01906109, + "epoch": 0.8122651435442657, + "flos": 28380740472000.0, + "grad_norm": 1.8216381174490308, + "language_loss": 0.77182269, + "learning_rate": 3.58358293835491e-07, + "loss": 0.79327488, + "num_input_tokens_seen": 291601225, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11688232, + "step": 13510, + "time_per_iteration": 4.23008131980896 + }, + { + "auxiliary_loss_clip": 0.01114428, + "auxiliary_loss_mlp": 0.01032997, + "balance_loss_clip": 1.03927565, + "balance_loss_mlp": 1.02095675, + "epoch": 0.8123252667969337, + "flos": 19694061036000.0, + "grad_norm": 1.832351034630195, + "language_loss": 0.69581997, + "learning_rate": 3.581358700114212e-07, + "loss": 0.71729422, + "num_input_tokens_seen": 291616995, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12036133, + "step": 13511, + "time_per_iteration": 2.615341901779175 + }, + { + "auxiliary_loss_clip": 0.0111519, + "auxiliary_loss_mlp": 0.01035057, + "balance_loss_clip": 1.04001713, + "balance_loss_mlp": 1.02352953, + "epoch": 0.8123853900496016, + "flos": 25924283953920.0, + "grad_norm": 1.656337775648834, + "language_loss": 0.79602641, + "learning_rate": 3.57913508447004e-07, + "loss": 0.8175289, + "num_input_tokens_seen": 291636145, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11523438, + "step": 13512, + "time_per_iteration": 2.68088960647583 + }, + { + "auxiliary_loss_clip": 0.01111366, + "auxiliary_loss_mlp": 0.01030306, + "balance_loss_clip": 1.03825521, + "balance_loss_mlp": 1.01911855, + "epoch": 0.8124455133022697, + "flos": 78556949508960.0, + "grad_norm": 2.4243500160313736, + "language_loss": 0.63688076, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.65829748, + "num_input_tokens_seen": 291662440, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11181641, + "step": 13513, + "time_per_iteration": 3.076122283935547 + }, + { + "auxiliary_loss_clip": 0.0111519, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.03990722, + "balance_loss_mlp": 1.0214777, + "epoch": 0.8125056365549376, + "flos": 29092527475200.0, + "grad_norm": 1.6864793819099677, + "language_loss": 0.71506226, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.73654532, + "num_input_tokens_seen": 291680950, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11645508, + "step": 13514, + "time_per_iteration": 2.61781907081604 + }, + { + "auxiliary_loss_clip": 0.01108882, + "auxiliary_loss_mlp": 0.01028663, + "balance_loss_clip": 1.03728998, + "balance_loss_mlp": 1.01773727, + "epoch": 0.8125657598076056, + "flos": 28736350352640.0, + "grad_norm": 1.7102610915687841, + "language_loss": 0.6266728, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.64804828, + "num_input_tokens_seen": 291702395, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10919189, + "step": 13515, + "time_per_iteration": 4.248893737792969 + }, + { + "auxiliary_loss_clip": 0.01105376, + "auxiliary_loss_mlp": 0.01029813, + "balance_loss_clip": 1.03689551, + "balance_loss_mlp": 1.01826787, + "epoch": 0.8126258830602736, + "flos": 25263339786720.0, + "grad_norm": 1.5964555535489313, + "language_loss": 0.75166821, + "learning_rate": 3.570246849544616e-07, + "loss": 0.77302015, + "num_input_tokens_seen": 291721135, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.11553955, + "step": 13516, + "time_per_iteration": 2.6008827686309814 + }, + { + "auxiliary_loss_clip": 0.0111411, + "auxiliary_loss_mlp": 0.01032698, + "balance_loss_clip": 1.03901398, + "balance_loss_mlp": 1.02139139, + "epoch": 0.8126860063129415, + "flos": 28819600662240.0, + "grad_norm": 1.6636652469955093, + "language_loss": 0.91517997, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.93664813, + "num_input_tokens_seen": 291741235, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11322021, + "step": 13517, + "time_per_iteration": 2.6959803104400635 + }, + { + "auxiliary_loss_clip": 0.01115306, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.04093587, + "balance_loss_mlp": 1.01938057, + "epoch": 0.8127461295656095, + "flos": 30514318721280.0, + "grad_norm": 1.5011994305908638, + "language_loss": 0.7863906, + "learning_rate": 3.565806469852244e-07, + "loss": 0.80784518, + "num_input_tokens_seen": 291761430, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.10772705, + "step": 13518, + "time_per_iteration": 2.6960926055908203 + }, + { + "auxiliary_loss_clip": 0.01114493, + "auxiliary_loss_mlp": 0.01030038, + "balance_loss_clip": 1.04196072, + "balance_loss_mlp": 1.02012551, + "epoch": 0.8128062528182775, + "flos": 33363695184480.0, + "grad_norm": 2.321796819867503, + "language_loss": 0.7941618, + "learning_rate": 3.56358721474336e-07, + "loss": 0.81560707, + "num_input_tokens_seen": 291781755, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.09906006, + "step": 13519, + "time_per_iteration": 2.663409471511841 + }, + { + "auxiliary_loss_clip": 0.01113559, + "auxiliary_loss_mlp": 0.01035589, + "balance_loss_clip": 1.03894377, + "balance_loss_mlp": 1.02425778, + "epoch": 0.8128663760709455, + "flos": 32343413546880.0, + "grad_norm": 2.0994389002891567, + "language_loss": 0.70718223, + "learning_rate": 3.561368582904905e-07, + "loss": 0.7286737, + "num_input_tokens_seen": 291804410, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11328125, + "step": 13520, + "time_per_iteration": 2.7266592979431152 + }, + { + "auxiliary_loss_clip": 0.01115208, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_clip": 1.04021287, + "balance_loss_mlp": 1.02105165, + "epoch": 0.8129264993236134, + "flos": 21879535570560.0, + "grad_norm": 1.6490691922694225, + "language_loss": 0.72607255, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.74754977, + "num_input_tokens_seen": 291823285, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11450195, + "step": 13521, + "time_per_iteration": 4.134609699249268 + }, + { + "auxiliary_loss_clip": 0.01114005, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.03893673, + "balance_loss_mlp": 1.01655817, + "epoch": 0.8129866225762814, + "flos": 31942430663040.0, + "grad_norm": 1.842135046380155, + "language_loss": 0.70164704, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.72307265, + "num_input_tokens_seen": 291845305, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11993408, + "step": 13522, + "time_per_iteration": 2.8418161869049072 + }, + { + "auxiliary_loss_clip": 0.01107808, + "auxiliary_loss_mlp": 0.01035439, + "balance_loss_clip": 1.03772068, + "balance_loss_mlp": 1.02463317, + "epoch": 0.8130467458289493, + "flos": 25663593359520.0, + "grad_norm": 1.6521339510996185, + "language_loss": 0.70991766, + "learning_rate": 3.554716427853233e-07, + "loss": 0.73135018, + "num_input_tokens_seen": 291863715, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10803223, + "step": 13523, + "time_per_iteration": 3.922600269317627 + }, + { + "auxiliary_loss_clip": 0.01111119, + "auxiliary_loss_mlp": 0.01027869, + "balance_loss_clip": 1.03778243, + "balance_loss_mlp": 1.01622808, + "epoch": 0.8131068690816173, + "flos": 18896916824640.0, + "grad_norm": 2.21745409090034, + "language_loss": 0.70612454, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.72751439, + "num_input_tokens_seen": 291880735, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11639404, + "step": 13524, + "time_per_iteration": 2.579721689224243 + }, + { + "auxiliary_loss_clip": 0.01112494, + "auxiliary_loss_mlp": 0.01031174, + "balance_loss_clip": 1.03880072, + "balance_loss_mlp": 1.0200758, + "epoch": 0.8131669923342852, + "flos": 35815087042560.0, + "grad_norm": 1.8535808668078149, + "language_loss": 0.6246357, + "learning_rate": 3.550284775712653e-07, + "loss": 0.64607239, + "num_input_tokens_seen": 291900535, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11090088, + "step": 13525, + "time_per_iteration": 2.7372703552246094 + }, + { + "auxiliary_loss_clip": 0.0111205, + "auxiliary_loss_mlp": 0.01033228, + "balance_loss_clip": 1.03860652, + "balance_loss_mlp": 1.02181983, + "epoch": 0.8132271155869533, + "flos": 43021272044160.0, + "grad_norm": 1.5211857441730028, + "language_loss": 0.65451682, + "learning_rate": 3.548069885262628e-07, + "loss": 0.67596954, + "num_input_tokens_seen": 291919760, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11419678, + "step": 13526, + "time_per_iteration": 2.7548811435699463 + }, + { + "auxiliary_loss_clip": 0.01110942, + "auxiliary_loss_mlp": 0.01028451, + "balance_loss_clip": 1.0383091, + "balance_loss_mlp": 1.01794839, + "epoch": 0.8132872388396212, + "flos": 33900108284160.0, + "grad_norm": 1.7364470771738882, + "language_loss": 0.75327182, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.77466571, + "num_input_tokens_seen": 291938915, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.1050415, + "step": 13527, + "time_per_iteration": 2.7361154556274414 + }, + { + "auxiliary_loss_clip": 0.01110964, + "auxiliary_loss_mlp": 0.01027085, + "balance_loss_clip": 1.03783083, + "balance_loss_mlp": 1.01567101, + "epoch": 0.8133473620922892, + "flos": 33946008012000.0, + "grad_norm": 1.8068067364761307, + "language_loss": 0.70762861, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.72900903, + "num_input_tokens_seen": 291958145, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11407471, + "step": 13528, + "time_per_iteration": 2.7815802097320557 + }, + { + "auxiliary_loss_clip": 0.01113104, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.03859806, + "balance_loss_mlp": 1.02069783, + "epoch": 0.8134074853449572, + "flos": 23170920743520.0, + "grad_norm": 2.5606069447603765, + "language_loss": 0.68898106, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.71043205, + "num_input_tokens_seen": 291976860, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11303711, + "step": 13529, + "time_per_iteration": 2.6551151275634766 + }, + { + "auxiliary_loss_clip": 0.01109207, + "auxiliary_loss_mlp": 0.01028268, + "balance_loss_clip": 1.03746963, + "balance_loss_mlp": 1.01742578, + "epoch": 0.8134676085976251, + "flos": 29581501190400.0, + "grad_norm": 1.664543351809743, + "language_loss": 0.77452123, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.79589593, + "num_input_tokens_seen": 291998085, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.1083374, + "step": 13530, + "time_per_iteration": 2.6756224632263184 + }, + { + "auxiliary_loss_clip": 0.01110468, + "auxiliary_loss_mlp": 0.01030087, + "balance_loss_clip": 1.03823471, + "balance_loss_mlp": 1.01849365, + "epoch": 0.8135277318502931, + "flos": 23259843472320.0, + "grad_norm": 3.770776234381739, + "language_loss": 0.82138568, + "learning_rate": 3.537004792574052e-07, + "loss": 0.84279126, + "num_input_tokens_seen": 292016585, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11590576, + "step": 13531, + "time_per_iteration": 2.6116623878479004 + }, + { + "auxiliary_loss_clip": 0.01113449, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.0389688, + "balance_loss_mlp": 1.01771247, + "epoch": 0.813587855102961, + "flos": 21072748246560.0, + "grad_norm": 1.9487600646593497, + "language_loss": 0.715397, + "learning_rate": 3.534793646536065e-07, + "loss": 0.73682678, + "num_input_tokens_seen": 292033255, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11810303, + "step": 13532, + "time_per_iteration": 2.637979745864868 + }, + { + "auxiliary_loss_clip": 0.01109902, + "auxiliary_loss_mlp": 0.01028782, + "balance_loss_clip": 1.03773427, + "balance_loss_mlp": 1.01773143, + "epoch": 0.8136479783556291, + "flos": 24596236992960.0, + "grad_norm": 2.1457921079727917, + "language_loss": 0.76351893, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.78490579, + "num_input_tokens_seen": 292051800, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11047363, + "step": 13533, + "time_per_iteration": 2.6581029891967773 + }, + { + "auxiliary_loss_clip": 0.01116691, + "auxiliary_loss_mlp": 0.01041598, + "balance_loss_clip": 1.03953278, + "balance_loss_mlp": 1.02909827, + "epoch": 0.813708101608297, + "flos": 26908592597280.0, + "grad_norm": 2.0522453192017798, + "language_loss": 0.76885337, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.79043627, + "num_input_tokens_seen": 292072215, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12506104, + "step": 13534, + "time_per_iteration": 2.660195827484131 + }, + { + "auxiliary_loss_clip": 0.01110927, + "auxiliary_loss_mlp": 0.01028326, + "balance_loss_clip": 1.03904247, + "balance_loss_mlp": 1.01803851, + "epoch": 0.813768224860965, + "flos": 19735098690240.0, + "grad_norm": 2.0861864055820787, + "language_loss": 0.93004167, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.9514342, + "num_input_tokens_seen": 292088830, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10284424, + "step": 13535, + "time_per_iteration": 2.6515557765960693 + }, + { + "auxiliary_loss_clip": 0.01110751, + "auxiliary_loss_mlp": 0.01026701, + "balance_loss_clip": 1.04060793, + "balance_loss_mlp": 1.01553154, + "epoch": 0.8138283481136329, + "flos": 29715067611360.0, + "grad_norm": 1.7530968119456591, + "language_loss": 0.70376134, + "learning_rate": 3.52595530684499e-07, + "loss": 0.72513586, + "num_input_tokens_seen": 292109225, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11181641, + "step": 13536, + "time_per_iteration": 2.6711270809173584 + }, + { + "auxiliary_loss_clip": 0.01112432, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.03972578, + "balance_loss_mlp": 1.0182929, + "epoch": 0.8138884713663009, + "flos": 31135845925440.0, + "grad_norm": 1.6609163468251031, + "language_loss": 0.75285733, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.77427959, + "num_input_tokens_seen": 292129660, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11505127, + "step": 13537, + "time_per_iteration": 2.6488776206970215 + }, + { + "auxiliary_loss_clip": 0.01110405, + "auxiliary_loss_mlp": 0.01031339, + "balance_loss_clip": 1.03920269, + "balance_loss_mlp": 1.01998413, + "epoch": 0.8139485946189688, + "flos": 27399105969120.0, + "grad_norm": 1.534328055404464, + "language_loss": 0.76138192, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.78279936, + "num_input_tokens_seen": 292149090, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11358643, + "step": 13538, + "time_per_iteration": 2.6829888820648193 + }, + { + "auxiliary_loss_clip": 0.01110743, + "auxiliary_loss_mlp": 0.01028388, + "balance_loss_clip": 1.03724456, + "balance_loss_mlp": 1.01733744, + "epoch": 0.8140087178716369, + "flos": 25929915855840.0, + "grad_norm": 1.7737384304898687, + "language_loss": 0.77909958, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.80049098, + "num_input_tokens_seen": 292169260, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11053467, + "step": 13539, + "time_per_iteration": 2.6019985675811768 + }, + { + "auxiliary_loss_clip": 0.01111308, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.04020119, + "balance_loss_mlp": 1.02297473, + "epoch": 0.8140688411243048, + "flos": 48097079661600.0, + "grad_norm": 3.2440104995246974, + "language_loss": 0.6605767, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.68202782, + "num_input_tokens_seen": 292188145, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10821533, + "step": 13540, + "time_per_iteration": 2.753910779953003 + }, + { + "auxiliary_loss_clip": 0.0111385, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.0414356, + "balance_loss_mlp": 1.02205658, + "epoch": 0.8141289643769728, + "flos": 31012408824480.0, + "grad_norm": 1.674956693328079, + "language_loss": 0.67620969, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.69767249, + "num_input_tokens_seen": 292212135, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10369873, + "step": 13541, + "time_per_iteration": 2.680150032043457 + }, + { + "auxiliary_loss_clip": 0.01109317, + "auxiliary_loss_mlp": 0.01034112, + "balance_loss_clip": 1.03744984, + "balance_loss_mlp": 1.02247763, + "epoch": 0.8141890876296408, + "flos": 15334943012640.0, + "grad_norm": 1.8854662401861524, + "language_loss": 0.68733233, + "learning_rate": 3.512716539904355e-07, + "loss": 0.70876658, + "num_input_tokens_seen": 292230645, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11621094, + "step": 13542, + "time_per_iteration": 2.645026922225952 + }, + { + "auxiliary_loss_clip": 0.0111553, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.03858614, + "balance_loss_mlp": 1.01835847, + "epoch": 0.8142492108823087, + "flos": 18263355988320.0, + "grad_norm": 2.817059313954091, + "language_loss": 0.7963202, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.81778049, + "num_input_tokens_seen": 292243540, + "router_z_loss_clip": 0.77001953, + "router_z_loss_mlp": 0.12139893, + "step": 13543, + "time_per_iteration": 2.5637691020965576 + }, + { + "auxiliary_loss_clip": 0.01116333, + "auxiliary_loss_mlp": 0.01038717, + "balance_loss_clip": 1.03993082, + "balance_loss_mlp": 1.02629006, + "epoch": 0.8143093341349767, + "flos": 15156732899520.0, + "grad_norm": 5.49346295541602, + "language_loss": 0.77713835, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.79868889, + "num_input_tokens_seen": 292261715, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12420654, + "step": 13544, + "time_per_iteration": 2.6005756855010986 + }, + { + "auxiliary_loss_clip": 0.01119467, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.04061472, + "balance_loss_mlp": 1.01972628, + "epoch": 0.8143694573876447, + "flos": 14532693624000.0, + "grad_norm": 2.939954852073019, + "language_loss": 0.73832941, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.75985336, + "num_input_tokens_seen": 292275080, + "router_z_loss_clip": 0.78808594, + "router_z_loss_mlp": 0.13201904, + "step": 13545, + "time_per_iteration": 2.6906349658966064 + }, + { + "auxiliary_loss_clip": 0.01109201, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.03851843, + "balance_loss_mlp": 1.01781452, + "epoch": 0.8144295806403127, + "flos": 25884259231680.0, + "grad_norm": 2.9663081605435306, + "language_loss": 0.7635147, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.78489536, + "num_input_tokens_seen": 292294635, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11035156, + "step": 13546, + "time_per_iteration": 2.6066105365753174 + }, + { + "auxiliary_loss_clip": 0.01113899, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.04015756, + "balance_loss_mlp": 1.01883554, + "epoch": 0.8144897038929806, + "flos": 24239573663040.0, + "grad_norm": 2.1772825199799395, + "language_loss": 0.70070636, + "learning_rate": 3.501701426337178e-07, + "loss": 0.72213936, + "num_input_tokens_seen": 292312695, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.10565186, + "step": 13547, + "time_per_iteration": 2.6596710681915283 + }, + { + "auxiliary_loss_clip": 0.01115702, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.03999984, + "balance_loss_mlp": 1.02065372, + "epoch": 0.8145498271456486, + "flos": 29982767695200.0, + "grad_norm": 1.9150230241195296, + "language_loss": 0.70733929, + "learning_rate": 3.49950028014111e-07, + "loss": 0.72882831, + "num_input_tokens_seen": 292332005, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12548828, + "step": 13548, + "time_per_iteration": 2.6317389011383057 + }, + { + "auxiliary_loss_clip": 0.01117553, + "auxiliary_loss_mlp": 0.01032306, + "balance_loss_clip": 1.04241383, + "balance_loss_mlp": 1.01994443, + "epoch": 0.8146099503983165, + "flos": 24639584132160.0, + "grad_norm": 1.9512077486894204, + "language_loss": 0.76460034, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.78609896, + "num_input_tokens_seen": 292348365, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12365723, + "step": 13549, + "time_per_iteration": 4.080698013305664 + }, + { + "auxiliary_loss_clip": 0.01113232, + "auxiliary_loss_mlp": 0.01030505, + "balance_loss_clip": 1.04002094, + "balance_loss_mlp": 1.01918042, + "epoch": 0.8146700736509845, + "flos": 23838914917440.0, + "grad_norm": 2.9412616779869007, + "language_loss": 0.71285325, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.7342906, + "num_input_tokens_seen": 292368050, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11315918, + "step": 13550, + "time_per_iteration": 2.6361072063446045 + }, + { + "auxiliary_loss_clip": 0.01107252, + "auxiliary_loss_mlp": 0.01025244, + "balance_loss_clip": 1.03736579, + "balance_loss_mlp": 1.01382351, + "epoch": 0.8147301969036524, + "flos": 22013831302560.0, + "grad_norm": 2.2314025922981404, + "language_loss": 0.72448951, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.74581444, + "num_input_tokens_seen": 292385315, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.11401367, + "step": 13551, + "time_per_iteration": 2.6498730182647705 + }, + { + "auxiliary_loss_clip": 0.0111729, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.04094148, + "balance_loss_mlp": 1.02138519, + "epoch": 0.8147903201563205, + "flos": 21969147093120.0, + "grad_norm": 2.181181273473216, + "language_loss": 0.68408811, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.70559895, + "num_input_tokens_seen": 292403375, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12408447, + "step": 13552, + "time_per_iteration": 2.6699752807617188 + }, + { + "auxiliary_loss_clip": 0.01111688, + "auxiliary_loss_mlp": 0.01038972, + "balance_loss_clip": 1.03832185, + "balance_loss_mlp": 1.02779603, + "epoch": 0.8148504434089884, + "flos": 24721092198720.0, + "grad_norm": 1.745209246394531, + "language_loss": 0.81855416, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.84006083, + "num_input_tokens_seen": 292419260, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11175537, + "step": 13553, + "time_per_iteration": 2.618359088897705 + }, + { + "auxiliary_loss_clip": 0.01111159, + "auxiliary_loss_mlp": 0.01025972, + "balance_loss_clip": 1.0380199, + "balance_loss_mlp": 1.01458108, + "epoch": 0.8149105666616564, + "flos": 15245493559200.0, + "grad_norm": 2.046152729692779, + "language_loss": 0.67867249, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.7000438, + "num_input_tokens_seen": 292436095, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11395264, + "step": 13554, + "time_per_iteration": 2.71071720123291 + }, + { + "auxiliary_loss_clip": 0.01115552, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.04188585, + "balance_loss_mlp": 1.01842403, + "epoch": 0.8149706899143244, + "flos": 39686527903680.0, + "grad_norm": 2.3450761239692364, + "language_loss": 0.66540372, + "learning_rate": 3.484109781056723e-07, + "loss": 0.6868614, + "num_input_tokens_seen": 292457190, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11791992, + "step": 13555, + "time_per_iteration": 4.156916618347168 + }, + { + "auxiliary_loss_clip": 0.0111561, + "auxiliary_loss_mlp": 0.01035554, + "balance_loss_clip": 1.03941333, + "balance_loss_mlp": 1.0236392, + "epoch": 0.8150308131669923, + "flos": 23655275488800.0, + "grad_norm": 2.0146189249301125, + "language_loss": 0.73091161, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.75242317, + "num_input_tokens_seen": 292474300, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11920166, + "step": 13556, + "time_per_iteration": 2.7101902961730957 + }, + { + "auxiliary_loss_clip": 0.01112506, + "auxiliary_loss_mlp": 0.01027893, + "balance_loss_clip": 1.04097581, + "balance_loss_mlp": 1.0173552, + "epoch": 0.8150909364196604, + "flos": 21257279055360.0, + "grad_norm": 1.6952806165402703, + "language_loss": 0.80413866, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.82554263, + "num_input_tokens_seen": 292492420, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10540771, + "step": 13557, + "time_per_iteration": 2.7073593139648438 + }, + { + "auxiliary_loss_clip": 0.01117335, + "auxiliary_loss_mlp": 0.01031475, + "balance_loss_clip": 1.0412302, + "balance_loss_mlp": 1.01977479, + "epoch": 0.8151510596723283, + "flos": 33144528451680.0, + "grad_norm": 2.128643844101582, + "language_loss": 0.66044152, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.68192971, + "num_input_tokens_seen": 292512895, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11688232, + "step": 13558, + "time_per_iteration": 2.723261833190918 + }, + { + "auxiliary_loss_clip": 0.01030353, + "auxiliary_loss_mlp": 0.01000976, + "balance_loss_clip": 1.00802183, + "balance_loss_mlp": 0.99993998, + "epoch": 0.8152111829249963, + "flos": 78361282798560.0, + "grad_norm": 0.8437607312710881, + "language_loss": 0.56972289, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.59003621, + "num_input_tokens_seen": 292566580, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.01035309, + "step": 13559, + "time_per_iteration": 3.1635406017303467 + }, + { + "auxiliary_loss_clip": 0.01030822, + "auxiliary_loss_mlp": 0.0100092, + "balance_loss_clip": 1.0083462, + "balance_loss_mlp": 0.99992585, + "epoch": 0.8152713061776642, + "flos": 81842032164960.0, + "grad_norm": 0.6759567477563712, + "language_loss": 0.55199564, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57231307, + "num_input_tokens_seen": 292621490, + "router_z_loss_clip": 0.22473145, + "router_z_loss_mlp": 0.0099411, + "step": 13560, + "time_per_iteration": 4.5673747062683105 + }, + { + "auxiliary_loss_clip": 0.01109276, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.03795576, + "balance_loss_mlp": 1.02060914, + "epoch": 0.8153314294303322, + "flos": 17559712958400.0, + "grad_norm": 2.0142388176204498, + "language_loss": 0.67603612, + "learning_rate": 3.470942348696948e-07, + "loss": 0.69744021, + "num_input_tokens_seen": 292638660, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10528564, + "step": 13561, + "time_per_iteration": 2.607813596725464 + }, + { + "auxiliary_loss_clip": 0.0111534, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.03928518, + "balance_loss_mlp": 1.02000237, + "epoch": 0.8153915526830001, + "flos": 31266819240480.0, + "grad_norm": 2.534149008373844, + "language_loss": 0.81575, + "learning_rate": 3.468749969894085e-07, + "loss": 0.83721685, + "num_input_tokens_seen": 292658545, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11352539, + "step": 13562, + "time_per_iteration": 2.683882236480713 + }, + { + "auxiliary_loss_clip": 0.01111358, + "auxiliary_loss_mlp": 0.01030679, + "balance_loss_clip": 1.03851891, + "balance_loss_mlp": 1.01961088, + "epoch": 0.8154516759356681, + "flos": 28515157755840.0, + "grad_norm": 2.3634492889579715, + "language_loss": 0.71720332, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.73862368, + "num_input_tokens_seen": 292678460, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11065674, + "step": 13563, + "time_per_iteration": 4.102932691574097 + }, + { + "auxiliary_loss_clip": 0.01111246, + "auxiliary_loss_mlp": 0.01030751, + "balance_loss_clip": 1.03781927, + "balance_loss_mlp": 1.01839483, + "epoch": 0.815511799188336, + "flos": 34348084862400.0, + "grad_norm": 1.5708628321716427, + "language_loss": 0.69878781, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.72020775, + "num_input_tokens_seen": 292699815, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12365723, + "step": 13564, + "time_per_iteration": 2.701326370239258 + }, + { + "auxiliary_loss_clip": 0.01111597, + "auxiliary_loss_mlp": 0.0102775, + "balance_loss_clip": 1.03849733, + "balance_loss_mlp": 1.01625204, + "epoch": 0.8155719224410041, + "flos": 20722851302400.0, + "grad_norm": 2.067595477463569, + "language_loss": 0.70451891, + "learning_rate": 3.462176595017854e-07, + "loss": 0.72591233, + "num_input_tokens_seen": 292717370, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.1151123, + "step": 13565, + "time_per_iteration": 2.6707959175109863 + }, + { + "auxiliary_loss_clip": 0.01110059, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.03836238, + "balance_loss_mlp": 1.0217191, + "epoch": 0.815632045693672, + "flos": 30116941875360.0, + "grad_norm": 2.1427284899621926, + "language_loss": 0.78825122, + "learning_rate": 3.459986724180188e-07, + "loss": 0.80968022, + "num_input_tokens_seen": 292737110, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11120605, + "step": 13566, + "time_per_iteration": 2.6842150688171387 + }, + { + "auxiliary_loss_clip": 0.01111032, + "auxiliary_loss_mlp": 0.01030626, + "balance_loss_clip": 1.03993678, + "balance_loss_mlp": 1.01989126, + "epoch": 0.81569216894634, + "flos": 24328698978240.0, + "grad_norm": 2.1384137610449665, + "language_loss": 0.82316542, + "learning_rate": 3.457797480541491e-07, + "loss": 0.84458196, + "num_input_tokens_seen": 292756510, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10736084, + "step": 13567, + "time_per_iteration": 2.732025146484375 + }, + { + "auxiliary_loss_clip": 0.01109174, + "auxiliary_loss_mlp": 0.0102658, + "balance_loss_clip": 1.03818572, + "balance_loss_mlp": 1.01680481, + "epoch": 0.8157522921990079, + "flos": 26599692790080.0, + "grad_norm": 1.9376928844959103, + "language_loss": 0.79427731, + "learning_rate": 3.455608864184771e-07, + "loss": 0.81563485, + "num_input_tokens_seen": 292776710, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.09777832, + "step": 13568, + "time_per_iteration": 2.7089409828186035 + }, + { + "auxiliary_loss_clip": 0.01107719, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.03769469, + "balance_loss_mlp": 1.01975167, + "epoch": 0.8158124154516759, + "flos": 22584353601600.0, + "grad_norm": 1.9759971274839871, + "language_loss": 0.76937675, + "learning_rate": 3.453420875193016e-07, + "loss": 0.79075998, + "num_input_tokens_seen": 292794350, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10852051, + "step": 13569, + "time_per_iteration": 2.681324005126953 + }, + { + "auxiliary_loss_clip": 0.01109447, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.03812301, + "balance_loss_mlp": 1.02225912, + "epoch": 0.815872538704344, + "flos": 32739736943520.0, + "grad_norm": 2.596518113009443, + "language_loss": 0.57771641, + "learning_rate": 3.451233513649199e-07, + "loss": 0.59913713, + "num_input_tokens_seen": 292814005, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.1038208, + "step": 13570, + "time_per_iteration": 2.654594898223877 + }, + { + "auxiliary_loss_clip": 0.01114145, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.0387665, + "balance_loss_mlp": 1.02101445, + "epoch": 0.8159326619570119, + "flos": 26509838163840.0, + "grad_norm": 1.7864099164419764, + "language_loss": 0.82316285, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.84463441, + "num_input_tokens_seen": 292833485, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11999512, + "step": 13571, + "time_per_iteration": 2.668119430541992 + }, + { + "auxiliary_loss_clip": 0.01112453, + "auxiliary_loss_mlp": 0.01040511, + "balance_loss_clip": 1.0401938, + "balance_loss_mlp": 1.02873921, + "epoch": 0.8159927852096799, + "flos": 16888355850240.0, + "grad_norm": 3.682969985180381, + "language_loss": 0.7885139, + "learning_rate": 3.446860673237142e-07, + "loss": 0.81004351, + "num_input_tokens_seen": 292848045, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11779785, + "step": 13572, + "time_per_iteration": 2.567620038986206 + }, + { + "auxiliary_loss_clip": 0.01112689, + "auxiliary_loss_mlp": 0.0103296, + "balance_loss_clip": 1.03837395, + "balance_loss_mlp": 1.02173078, + "epoch": 0.8160529084623478, + "flos": 29894574277440.0, + "grad_norm": 1.5472925936434094, + "language_loss": 0.64746559, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.66892207, + "num_input_tokens_seen": 292869965, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11236572, + "step": 13573, + "time_per_iteration": 2.7249512672424316 + }, + { + "auxiliary_loss_clip": 0.01111638, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.03944683, + "balance_loss_mlp": 1.02128029, + "epoch": 0.8161130317150158, + "flos": 30293531297280.0, + "grad_norm": 1.9496539473668886, + "language_loss": 0.75292814, + "learning_rate": 3.442490343611868e-07, + "loss": 0.77436203, + "num_input_tokens_seen": 292889680, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10467529, + "step": 13574, + "time_per_iteration": 2.6529111862182617 + }, + { + "auxiliary_loss_clip": 0.01113498, + "auxiliary_loss_mlp": 0.01031054, + "balance_loss_clip": 1.03930116, + "balance_loss_mlp": 1.01953852, + "epoch": 0.8161731549676837, + "flos": 37774020699360.0, + "grad_norm": 3.8453967511816343, + "language_loss": 0.59745026, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.61889577, + "num_input_tokens_seen": 292912360, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11517334, + "step": 13575, + "time_per_iteration": 2.7213494777679443 + }, + { + "auxiliary_loss_clip": 0.0111044, + "auxiliary_loss_mlp": 0.01029735, + "balance_loss_clip": 1.03803957, + "balance_loss_mlp": 1.01833296, + "epoch": 0.8162332782203517, + "flos": 22636979197920.0, + "grad_norm": 1.904889184970324, + "language_loss": 0.74824584, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.7696476, + "num_input_tokens_seen": 292928325, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11401367, + "step": 13576, + "time_per_iteration": 2.5789377689361572 + }, + { + "auxiliary_loss_clip": 0.01030157, + "auxiliary_loss_mlp": 0.01001325, + "balance_loss_clip": 1.00777185, + "balance_loss_mlp": 1.00034034, + "epoch": 0.8162934014730197, + "flos": 85887550376640.0, + "grad_norm": 0.8441056703068331, + "language_loss": 0.58693278, + "learning_rate": 3.435939558349155e-07, + "loss": 0.60724759, + "num_input_tokens_seen": 292992795, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.00983429, + "step": 13577, + "time_per_iteration": 3.3256289958953857 + }, + { + "auxiliary_loss_clip": 0.01109145, + "auxiliary_loss_mlp": 0.01025882, + "balance_loss_clip": 1.03907132, + "balance_loss_mlp": 1.01501036, + "epoch": 0.8163535247256877, + "flos": 25886447164800.0, + "grad_norm": 1.707604934397055, + "language_loss": 0.71174443, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.73309469, + "num_input_tokens_seen": 293011950, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10870361, + "step": 13578, + "time_per_iteration": 2.6530346870422363 + }, + { + "auxiliary_loss_clip": 0.01111329, + "auxiliary_loss_mlp": 0.01027541, + "balance_loss_clip": 1.03936327, + "balance_loss_mlp": 1.01677072, + "epoch": 0.8164136479783556, + "flos": 25745506598880.0, + "grad_norm": 1.6998936419013686, + "language_loss": 0.73597896, + "learning_rate": 3.431575508590172e-07, + "loss": 0.75736767, + "num_input_tokens_seen": 293030175, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10766602, + "step": 13579, + "time_per_iteration": 2.617152690887451 + }, + { + "auxiliary_loss_clip": 0.0111189, + "auxiliary_loss_mlp": 0.01026414, + "balance_loss_clip": 1.03759873, + "balance_loss_mlp": 1.01532793, + "epoch": 0.8164737712310236, + "flos": 26504125227360.0, + "grad_norm": 2.261123553168408, + "language_loss": 0.79725057, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.81863362, + "num_input_tokens_seen": 293047980, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11102295, + "step": 13580, + "time_per_iteration": 2.6443469524383545 + }, + { + "auxiliary_loss_clip": 0.01109521, + "auxiliary_loss_mlp": 0.01030953, + "balance_loss_clip": 1.03875375, + "balance_loss_mlp": 1.01928234, + "epoch": 0.8165338944836915, + "flos": 23838752848320.0, + "grad_norm": 1.8638653116492734, + "language_loss": 0.69055611, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.71196091, + "num_input_tokens_seen": 293067030, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11669922, + "step": 13581, + "time_per_iteration": 2.786526918411255 + }, + { + "auxiliary_loss_clip": 0.01109855, + "auxiliary_loss_mlp": 0.01027875, + "balance_loss_clip": 1.03735232, + "balance_loss_mlp": 1.01699138, + "epoch": 0.8165940177363595, + "flos": 27977772241440.0, + "grad_norm": 2.161093359650979, + "language_loss": 0.59624976, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.61762708, + "num_input_tokens_seen": 293085575, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10888672, + "step": 13582, + "time_per_iteration": 2.675022840499878 + }, + { + "auxiliary_loss_clip": 0.01108639, + "auxiliary_loss_mlp": 0.01031588, + "balance_loss_clip": 1.03930461, + "balance_loss_mlp": 1.02050126, + "epoch": 0.8166541409890276, + "flos": 28516535343360.0, + "grad_norm": 1.6275884461732004, + "language_loss": 0.82078665, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.84218889, + "num_input_tokens_seen": 293108200, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11083984, + "step": 13583, + "time_per_iteration": 2.649608850479126 + }, + { + "auxiliary_loss_clip": 0.01111921, + "auxiliary_loss_mlp": 0.01027943, + "balance_loss_clip": 1.039276, + "balance_loss_mlp": 1.01735687, + "epoch": 0.8167142642416955, + "flos": 22502805017760.0, + "grad_norm": 4.318009099849917, + "language_loss": 0.74244583, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.76384449, + "num_input_tokens_seen": 293126020, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10583496, + "step": 13584, + "time_per_iteration": 2.644023895263672 + }, + { + "auxiliary_loss_clip": 0.0111364, + "auxiliary_loss_mlp": 0.01026674, + "balance_loss_clip": 1.04140592, + "balance_loss_mlp": 1.01510465, + "epoch": 0.8167743874943635, + "flos": 25886609233920.0, + "grad_norm": 1.6577662858514624, + "language_loss": 0.7446214, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.76602447, + "num_input_tokens_seen": 293144620, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11566162, + "step": 13585, + "time_per_iteration": 2.6057991981506348 + }, + { + "auxiliary_loss_clip": 0.01112073, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.03943014, + "balance_loss_mlp": 1.01963282, + "epoch": 0.8168345107470314, + "flos": 22814784138240.0, + "grad_norm": 2.274870210031167, + "language_loss": 0.69633859, + "learning_rate": 3.416321129478068e-07, + "loss": 0.71777189, + "num_input_tokens_seen": 293162850, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11627197, + "step": 13586, + "time_per_iteration": 2.6206841468811035 + }, + { + "auxiliary_loss_clip": 0.01111754, + "auxiliary_loss_mlp": 0.01029241, + "balance_loss_clip": 1.0400821, + "balance_loss_mlp": 1.0191623, + "epoch": 0.8168946339996994, + "flos": 19831719702240.0, + "grad_norm": 1.6611341684632177, + "language_loss": 0.60924917, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.6306591, + "num_input_tokens_seen": 293181620, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10076904, + "step": 13587, + "time_per_iteration": 2.5752644538879395 + }, + { + "auxiliary_loss_clip": 0.01114469, + "auxiliary_loss_mlp": 0.01031632, + "balance_loss_clip": 1.03846192, + "balance_loss_mlp": 1.02040839, + "epoch": 0.8169547572523673, + "flos": 32874802503840.0, + "grad_norm": 2.576034415434588, + "language_loss": 0.69657421, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.71803522, + "num_input_tokens_seen": 293200270, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11230469, + "step": 13588, + "time_per_iteration": 2.710508346557617 + }, + { + "auxiliary_loss_clip": 0.01114579, + "auxiliary_loss_mlp": 0.01032695, + "balance_loss_clip": 1.04016185, + "balance_loss_mlp": 1.02048779, + "epoch": 0.8170148805050353, + "flos": 23125588257600.0, + "grad_norm": 1.737059468438326, + "language_loss": 0.73036122, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.75183392, + "num_input_tokens_seen": 293218960, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.12207031, + "step": 13589, + "time_per_iteration": 4.196516752243042 + }, + { + "auxiliary_loss_clip": 0.01109574, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.03793705, + "balance_loss_mlp": 1.01944041, + "epoch": 0.8170750037577033, + "flos": 26643120963840.0, + "grad_norm": 2.978528681998397, + "language_loss": 0.73141378, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.75282043, + "num_input_tokens_seen": 293236450, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11651611, + "step": 13590, + "time_per_iteration": 2.612600326538086 + }, + { + "auxiliary_loss_clip": 0.01116385, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.04087853, + "balance_loss_mlp": 1.02181149, + "epoch": 0.8171351270103713, + "flos": 40886802414720.0, + "grad_norm": 2.2240007083292164, + "language_loss": 0.65638572, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.67788994, + "num_input_tokens_seen": 293256480, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12225342, + "step": 13591, + "time_per_iteration": 2.761716842651367 + }, + { + "auxiliary_loss_clip": 0.01113731, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.03831065, + "balance_loss_mlp": 1.02124512, + "epoch": 0.8171952502630392, + "flos": 27708978191040.0, + "grad_norm": 2.4066926515790144, + "language_loss": 0.68075746, + "learning_rate": 3.403270471641373e-07, + "loss": 0.70222867, + "num_input_tokens_seen": 293274960, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12133789, + "step": 13592, + "time_per_iteration": 2.5997352600097656 + }, + { + "auxiliary_loss_clip": 0.01112181, + "auxiliary_loss_mlp": 0.01027063, + "balance_loss_clip": 1.03871679, + "balance_loss_mlp": 1.01561308, + "epoch": 0.8172553735157072, + "flos": 32608844663040.0, + "grad_norm": 3.1507867180950604, + "language_loss": 0.66175616, + "learning_rate": 3.401097564244759e-07, + "loss": 0.68314862, + "num_input_tokens_seen": 293295945, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11450195, + "step": 13593, + "time_per_iteration": 2.6884524822235107 + }, + { + "auxiliary_loss_clip": 0.0110968, + "auxiliary_loss_mlp": 0.01027595, + "balance_loss_clip": 1.0370394, + "balance_loss_mlp": 1.01675916, + "epoch": 0.8173154967683751, + "flos": 19475380510560.0, + "grad_norm": 2.2836443962402644, + "language_loss": 0.69825923, + "learning_rate": 3.398925286280188e-07, + "loss": 0.71963203, + "num_input_tokens_seen": 293313300, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1083374, + "step": 13594, + "time_per_iteration": 4.0050013065338135 + }, + { + "auxiliary_loss_clip": 0.01112759, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.03763807, + "balance_loss_mlp": 1.01927996, + "epoch": 0.8173756200210431, + "flos": 31710176848800.0, + "grad_norm": 2.7768579122516917, + "language_loss": 0.65853488, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.679968, + "num_input_tokens_seen": 293333085, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.1126709, + "step": 13595, + "time_per_iteration": 2.642031192779541 + }, + { + "auxiliary_loss_clip": 0.0111531, + "auxiliary_loss_mlp": 0.01026207, + "balance_loss_clip": 1.03914213, + "balance_loss_mlp": 1.0145539, + "epoch": 0.8174357432737112, + "flos": 31316325006240.0, + "grad_norm": 1.8007999993045853, + "language_loss": 0.78808296, + "learning_rate": 3.394582618976658e-07, + "loss": 0.80949807, + "num_input_tokens_seen": 293351895, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11651611, + "step": 13596, + "time_per_iteration": 2.725835084915161 + }, + { + "auxiliary_loss_clip": 0.01109353, + "auxiliary_loss_mlp": 0.01023859, + "balance_loss_clip": 1.03756356, + "balance_loss_mlp": 1.01262331, + "epoch": 0.8174958665263791, + "flos": 26643323550240.0, + "grad_norm": 2.4816943928543505, + "language_loss": 0.58846539, + "learning_rate": 3.392412229802362e-07, + "loss": 0.60979748, + "num_input_tokens_seen": 293371165, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11242676, + "step": 13597, + "time_per_iteration": 2.6193156242370605 + }, + { + "auxiliary_loss_clip": 0.01109223, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.0383718, + "balance_loss_mlp": 1.01877904, + "epoch": 0.8175559897790471, + "flos": 27400402522080.0, + "grad_norm": 1.5642704460293229, + "language_loss": 0.82715774, + "learning_rate": 3.390242470389462e-07, + "loss": 0.84854907, + "num_input_tokens_seen": 293391150, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11120605, + "step": 13598, + "time_per_iteration": 2.640319585800171 + }, + { + "auxiliary_loss_clip": 0.01113295, + "auxiliary_loss_mlp": 0.01029445, + "balance_loss_clip": 1.03905797, + "balance_loss_mlp": 1.0184418, + "epoch": 0.817616113031715, + "flos": 28816926521760.0, + "grad_norm": 1.798906926982899, + "language_loss": 0.82539576, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.8468231, + "num_input_tokens_seen": 293409440, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11004639, + "step": 13599, + "time_per_iteration": 2.6477770805358887 + }, + { + "auxiliary_loss_clip": 0.01110628, + "auxiliary_loss_mlp": 0.01030341, + "balance_loss_clip": 1.03882241, + "balance_loss_mlp": 1.0193435, + "epoch": 0.817676236284383, + "flos": 33766339276800.0, + "grad_norm": 2.0487202938993954, + "language_loss": 0.83487034, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.85627997, + "num_input_tokens_seen": 293428995, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10992432, + "step": 13600, + "time_per_iteration": 4.110528230667114 + }, + { + "auxiliary_loss_clip": 0.01111879, + "auxiliary_loss_mlp": 0.01032366, + "balance_loss_clip": 1.03794241, + "balance_loss_mlp": 1.02086198, + "epoch": 0.8177363595370509, + "flos": 30116131529760.0, + "grad_norm": 1.8959673691022971, + "language_loss": 0.73601788, + "learning_rate": 3.383736971541766e-07, + "loss": 0.75746036, + "num_input_tokens_seen": 293449155, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11499023, + "step": 13601, + "time_per_iteration": 2.700562000274658 + }, + { + "auxiliary_loss_clip": 0.01115549, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.03970182, + "balance_loss_mlp": 1.01978517, + "epoch": 0.817796482789719, + "flos": 21165884772480.0, + "grad_norm": 3.535741529507688, + "language_loss": 0.68244135, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.70390803, + "num_input_tokens_seen": 293466125, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.11328125, + "step": 13602, + "time_per_iteration": 2.586066722869873 + }, + { + "auxiliary_loss_clip": 0.01109545, + "auxiliary_loss_mlp": 0.01028805, + "balance_loss_clip": 1.03823745, + "balance_loss_mlp": 1.01788521, + "epoch": 0.8178566060423869, + "flos": 21693748726080.0, + "grad_norm": 2.2357370080240964, + "language_loss": 0.83438945, + "learning_rate": 3.379403122624718e-07, + "loss": 0.85577297, + "num_input_tokens_seen": 293481345, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10913086, + "step": 13603, + "time_per_iteration": 3.8983633518218994 + }, + { + "auxiliary_loss_clip": 0.01111258, + "auxiliary_loss_mlp": 0.0102293, + "balance_loss_clip": 1.03803444, + "balance_loss_mlp": 1.01221943, + "epoch": 0.8179167292950549, + "flos": 30472713825120.0, + "grad_norm": 1.8885159772375144, + "language_loss": 0.69100654, + "learning_rate": 3.377237143507159e-07, + "loss": 0.71234834, + "num_input_tokens_seen": 293502330, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.1071167, + "step": 13604, + "time_per_iteration": 2.7036633491516113 + }, + { + "auxiliary_loss_clip": 0.01113579, + "auxiliary_loss_mlp": 0.01033972, + "balance_loss_clip": 1.04129958, + "balance_loss_mlp": 1.02210498, + "epoch": 0.8179768525477228, + "flos": 27890065031040.0, + "grad_norm": 1.9901961441221898, + "language_loss": 0.74295962, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.76443517, + "num_input_tokens_seen": 293521415, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11859131, + "step": 13605, + "time_per_iteration": 2.609038829803467 + }, + { + "auxiliary_loss_clip": 0.01111006, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.04090953, + "balance_loss_mlp": 1.02634335, + "epoch": 0.8180369758003908, + "flos": 22592051884800.0, + "grad_norm": 2.078188554795061, + "language_loss": 0.74111116, + "learning_rate": 3.372907076364666e-07, + "loss": 0.7626031, + "num_input_tokens_seen": 293539245, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11853027, + "step": 13606, + "time_per_iteration": 2.6268203258514404 + }, + { + "auxiliary_loss_clip": 0.01112425, + "auxiliary_loss_mlp": 0.01027849, + "balance_loss_clip": 1.04015446, + "balance_loss_mlp": 1.017102, + "epoch": 0.8180970990530587, + "flos": 40489830741600.0, + "grad_norm": 2.0853941925575676, + "language_loss": 0.65607584, + "learning_rate": 3.370742988503916e-07, + "loss": 0.67747855, + "num_input_tokens_seen": 293560640, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10748291, + "step": 13607, + "time_per_iteration": 2.705888271331787 + }, + { + "auxiliary_loss_clip": 0.01112511, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.03930116, + "balance_loss_mlp": 1.01703763, + "epoch": 0.8181572223057267, + "flos": 30517560103680.0, + "grad_norm": 1.6819660222013553, + "language_loss": 0.70371991, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.72512984, + "num_input_tokens_seen": 293579465, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11437988, + "step": 13608, + "time_per_iteration": 2.688660144805908 + }, + { + "auxiliary_loss_clip": 0.01109519, + "auxiliary_loss_mlp": 0.01031297, + "balance_loss_clip": 1.03739858, + "balance_loss_mlp": 1.02062142, + "epoch": 0.8182173455583948, + "flos": 34835802541920.0, + "grad_norm": 1.676866015687556, + "language_loss": 0.79522896, + "learning_rate": 3.366416704613735e-07, + "loss": 0.81663716, + "num_input_tokens_seen": 293600540, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10668945, + "step": 13609, + "time_per_iteration": 2.692563533782959 + }, + { + "auxiliary_loss_clip": 0.01030563, + "auxiliary_loss_mlp": 0.01000395, + "balance_loss_clip": 1.00815654, + "balance_loss_mlp": 0.99945587, + "epoch": 0.8182774688110627, + "flos": 87890317380000.0, + "grad_norm": 0.7673252819873015, + "language_loss": 0.55903494, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.57934457, + "num_input_tokens_seen": 293665160, + "router_z_loss_clip": 0.22399902, + "router_z_loss_mlp": 0.00938416, + "step": 13610, + "time_per_iteration": 3.336230754852295 + }, + { + "auxiliary_loss_clip": 0.01109151, + "auxiliary_loss_mlp": 0.01026546, + "balance_loss_clip": 1.03876877, + "balance_loss_mlp": 1.01553082, + "epoch": 0.8183375920637307, + "flos": 24105480517440.0, + "grad_norm": 2.5035652229828016, + "language_loss": 0.77565372, + "learning_rate": 3.362092943712107e-07, + "loss": 0.79701066, + "num_input_tokens_seen": 293683995, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.11016846, + "step": 13611, + "time_per_iteration": 2.606642723083496 + }, + { + "auxiliary_loss_clip": 0.01115119, + "auxiliary_loss_mlp": 0.01034769, + "balance_loss_clip": 1.03743672, + "balance_loss_mlp": 1.02223408, + "epoch": 0.8183977153163986, + "flos": 27262136096640.0, + "grad_norm": 2.1481558158840244, + "language_loss": 0.77125788, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.79275674, + "num_input_tokens_seen": 293704115, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12548828, + "step": 13612, + "time_per_iteration": 2.6830337047576904 + }, + { + "auxiliary_loss_clip": 0.01107242, + "auxiliary_loss_mlp": 0.01024699, + "balance_loss_clip": 1.03686714, + "balance_loss_mlp": 1.01398194, + "epoch": 0.8184578385690666, + "flos": 21612564797760.0, + "grad_norm": 2.3664060015372916, + "language_loss": 0.85833549, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.87965488, + "num_input_tokens_seen": 293722225, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10717773, + "step": 13613, + "time_per_iteration": 2.681352376937866 + }, + { + "auxiliary_loss_clip": 0.0111043, + "auxiliary_loss_mlp": 0.01040549, + "balance_loss_clip": 1.03970361, + "balance_loss_mlp": 1.02993357, + "epoch": 0.8185179618217345, + "flos": 31361130767520.0, + "grad_norm": 1.8465622832158128, + "language_loss": 0.72978693, + "learning_rate": 3.355612034397746e-07, + "loss": 0.7512967, + "num_input_tokens_seen": 293743995, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10620117, + "step": 13614, + "time_per_iteration": 2.679230213165283 + }, + { + "auxiliary_loss_clip": 0.01113402, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.03983092, + "balance_loss_mlp": 1.01927638, + "epoch": 0.8185780850744026, + "flos": 31677040064160.0, + "grad_norm": 1.8704447420853878, + "language_loss": 0.80871886, + "learning_rate": 3.353452993497479e-07, + "loss": 0.83015978, + "num_input_tokens_seen": 293764935, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11413574, + "step": 13615, + "time_per_iteration": 2.6839754581451416 + }, + { + "auxiliary_loss_clip": 0.01111134, + "auxiliary_loss_mlp": 0.01029823, + "balance_loss_clip": 1.03826618, + "balance_loss_mlp": 1.01817071, + "epoch": 0.8186382083270705, + "flos": 30784125703680.0, + "grad_norm": 1.9987067472011812, + "language_loss": 0.76012778, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.78153741, + "num_input_tokens_seen": 293784035, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11651611, + "step": 13616, + "time_per_iteration": 2.627075433731079 + }, + { + "auxiliary_loss_clip": 0.01107983, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.03708887, + "balance_loss_mlp": 1.01694727, + "epoch": 0.8186983315797385, + "flos": 27351342446400.0, + "grad_norm": 1.6910838702938935, + "language_loss": 0.752002, + "learning_rate": 3.349136805494979e-07, + "loss": 0.77336949, + "num_input_tokens_seen": 293803360, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11798096, + "step": 13617, + "time_per_iteration": 2.674891233444214 + }, + { + "auxiliary_loss_clip": 0.01105938, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.03497052, + "balance_loss_mlp": 1.01989865, + "epoch": 0.8187584548324064, + "flos": 26866501493760.0, + "grad_norm": 1.8250868603657118, + "language_loss": 0.68520206, + "learning_rate": 3.346979658556415e-07, + "loss": 0.70656669, + "num_input_tokens_seen": 293821325, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10620117, + "step": 13618, + "time_per_iteration": 2.610532760620117 + }, + { + "auxiliary_loss_clip": 0.01115934, + "auxiliary_loss_mlp": 0.01029465, + "balance_loss_clip": 1.03975022, + "balance_loss_mlp": 1.01749027, + "epoch": 0.8188185780850744, + "flos": 35681399069760.0, + "grad_norm": 1.6934556538599936, + "language_loss": 0.69603771, + "learning_rate": 3.344823143102058e-07, + "loss": 0.71749169, + "num_input_tokens_seen": 293840315, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11975098, + "step": 13619, + "time_per_iteration": 2.7468202114105225 + }, + { + "auxiliary_loss_clip": 0.01115502, + "auxiliary_loss_mlp": 0.01029522, + "balance_loss_clip": 1.04139519, + "balance_loss_mlp": 1.0182687, + "epoch": 0.8188787013377423, + "flos": 25254628571520.0, + "grad_norm": 1.9391219095408823, + "language_loss": 0.74350804, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.76495832, + "num_input_tokens_seen": 293855685, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11248779, + "step": 13620, + "time_per_iteration": 2.596811294555664 + }, + { + "auxiliary_loss_clip": 0.01109053, + "auxiliary_loss_mlp": 0.01027762, + "balance_loss_clip": 1.03767753, + "balance_loss_mlp": 1.01676464, + "epoch": 0.8189388245904103, + "flos": 28959204157920.0, + "grad_norm": 1.5338012005068626, + "language_loss": 0.76062846, + "learning_rate": 3.340512006973011e-07, + "loss": 0.78199661, + "num_input_tokens_seen": 293875540, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10992432, + "step": 13621, + "time_per_iteration": 2.678274154663086 + }, + { + "auxiliary_loss_clip": 0.01110246, + "auxiliary_loss_mlp": 0.01026915, + "balance_loss_clip": 1.03827643, + "balance_loss_mlp": 1.01535797, + "epoch": 0.8189989478430784, + "flos": 34477640072640.0, + "grad_norm": 2.9143249310040953, + "language_loss": 0.65856737, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.67993897, + "num_input_tokens_seen": 293896570, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11553955, + "step": 13622, + "time_per_iteration": 2.6466948986053467 + }, + { + "auxiliary_loss_clip": 0.01114297, + "auxiliary_loss_mlp": 0.01030045, + "balance_loss_clip": 1.04153812, + "balance_loss_mlp": 1.01792097, + "epoch": 0.8190590710957463, + "flos": 26109544073760.0, + "grad_norm": 2.256447267691727, + "language_loss": 0.74960327, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.7710467, + "num_input_tokens_seen": 293914680, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.12115479, + "step": 13623, + "time_per_iteration": 2.6576647758483887 + }, + { + "auxiliary_loss_clip": 0.01113023, + "auxiliary_loss_mlp": 0.01036578, + "balance_loss_clip": 1.03930855, + "balance_loss_mlp": 1.02460933, + "epoch": 0.8191191943484143, + "flos": 47341297242720.0, + "grad_norm": 2.3445752359822394, + "language_loss": 0.63227248, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.65376842, + "num_input_tokens_seen": 293936480, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11962891, + "step": 13624, + "time_per_iteration": 2.7394371032714844 + }, + { + "auxiliary_loss_clip": 0.01108693, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.03818989, + "balance_loss_mlp": 1.02010834, + "epoch": 0.8191793176010822, + "flos": 31050326648160.0, + "grad_norm": 1.7506045197061784, + "language_loss": 0.78381974, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.80521905, + "num_input_tokens_seen": 293957815, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11132812, + "step": 13625, + "time_per_iteration": 2.6781885623931885 + }, + { + "auxiliary_loss_clip": 0.01116597, + "auxiliary_loss_mlp": 0.01031299, + "balance_loss_clip": 1.03889251, + "balance_loss_mlp": 1.01912737, + "epoch": 0.8192394408537502, + "flos": 30606644901600.0, + "grad_norm": 2.7629207912447415, + "language_loss": 0.75556362, + "learning_rate": 3.329745223345244e-07, + "loss": 0.77704263, + "num_input_tokens_seen": 293975440, + "router_z_loss_clip": 0.77734375, + "router_z_loss_mlp": 0.1217041, + "step": 13626, + "time_per_iteration": 2.634197473526001 + }, + { + "auxiliary_loss_clip": 0.01112402, + "auxiliary_loss_mlp": 0.01032632, + "balance_loss_clip": 1.04089689, + "balance_loss_mlp": 1.02232087, + "epoch": 0.8192995641064181, + "flos": 33768000485280.0, + "grad_norm": 1.8691668902518348, + "language_loss": 0.73462844, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.75607884, + "num_input_tokens_seen": 293997540, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10308838, + "step": 13627, + "time_per_iteration": 2.729349374771118 + }, + { + "auxiliary_loss_clip": 0.01111001, + "auxiliary_loss_mlp": 0.0103226, + "balance_loss_clip": 1.03766036, + "balance_loss_mlp": 1.02092314, + "epoch": 0.8193596873590862, + "flos": 25976261273760.0, + "grad_norm": 1.8839865975880787, + "language_loss": 0.68938512, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.71081769, + "num_input_tokens_seen": 294017030, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11328125, + "step": 13628, + "time_per_iteration": 4.148350954055786 + }, + { + "auxiliary_loss_clip": 0.0111718, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.04082108, + "balance_loss_mlp": 1.01895308, + "epoch": 0.8194198106117541, + "flos": 21344378506560.0, + "grad_norm": 1.7746423365043618, + "language_loss": 0.85071421, + "learning_rate": 3.323292738168171e-07, + "loss": 0.8721965, + "num_input_tokens_seen": 294035700, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12103271, + "step": 13629, + "time_per_iteration": 2.58231258392334 + }, + { + "auxiliary_loss_clip": 0.01110815, + "auxiliary_loss_mlp": 0.01024778, + "balance_loss_clip": 1.03903699, + "balance_loss_mlp": 1.01346517, + "epoch": 0.8194799338644221, + "flos": 18629540879040.0, + "grad_norm": 2.2149403410757187, + "language_loss": 0.74107516, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.76243114, + "num_input_tokens_seen": 294049730, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11315918, + "step": 13630, + "time_per_iteration": 2.663907527923584 + }, + { + "auxiliary_loss_clip": 0.0111381, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.04014945, + "balance_loss_mlp": 1.01953197, + "epoch": 0.81954005711709, + "flos": 17959642392960.0, + "grad_norm": 1.9818074269320534, + "language_loss": 0.72609985, + "learning_rate": 3.31899424315957e-07, + "loss": 0.74754268, + "num_input_tokens_seen": 294066545, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10931396, + "step": 13631, + "time_per_iteration": 2.6155641078948975 + }, + { + "auxiliary_loss_clip": 0.01112282, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.03895032, + "balance_loss_mlp": 1.02045035, + "epoch": 0.819600180369758, + "flos": 28155334078080.0, + "grad_norm": 1.6690474188703206, + "language_loss": 0.76595473, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.78738928, + "num_input_tokens_seen": 294087455, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.10717773, + "step": 13632, + "time_per_iteration": 2.7001349925994873 + }, + { + "auxiliary_loss_clip": 0.01108759, + "auxiliary_loss_mlp": 0.01027843, + "balance_loss_clip": 1.03747237, + "balance_loss_mlp": 1.0178889, + "epoch": 0.8196603036224259, + "flos": 33677862238080.0, + "grad_norm": 2.4368780503832532, + "language_loss": 0.65683949, + "learning_rate": 3.314698278332588e-07, + "loss": 0.67820549, + "num_input_tokens_seen": 294107480, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.09954834, + "step": 13633, + "time_per_iteration": 2.691728353500366 + }, + { + "auxiliary_loss_clip": 0.01108416, + "auxiliary_loss_mlp": 0.01031255, + "balance_loss_clip": 1.03835261, + "balance_loss_mlp": 1.02094913, + "epoch": 0.8197204268750939, + "flos": 34876718644320.0, + "grad_norm": 1.5553948986032804, + "language_loss": 0.75565374, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.7770505, + "num_input_tokens_seen": 294130115, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10302734, + "step": 13634, + "time_per_iteration": 4.223001718521118 + }, + { + "auxiliary_loss_clip": 0.0110791, + "auxiliary_loss_mlp": 0.01029462, + "balance_loss_clip": 1.03749955, + "balance_loss_mlp": 1.01887059, + "epoch": 0.819780550127762, + "flos": 28380011160960.0, + "grad_norm": 5.390774920184147, + "language_loss": 0.81849658, + "learning_rate": 3.310404844338841e-07, + "loss": 0.83987033, + "num_input_tokens_seen": 294148495, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10601807, + "step": 13635, + "time_per_iteration": 2.6583213806152344 + }, + { + "auxiliary_loss_clip": 0.01110367, + "auxiliary_loss_mlp": 0.01029383, + "balance_loss_clip": 1.03784513, + "balance_loss_mlp": 1.01764059, + "epoch": 0.8198406733804299, + "flos": 32562094072320.0, + "grad_norm": 1.6653655824344011, + "language_loss": 0.75769448, + "learning_rate": 3.308259076607949e-07, + "loss": 0.77909195, + "num_input_tokens_seen": 294169595, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11737061, + "step": 13636, + "time_per_iteration": 2.6477108001708984 + }, + { + "auxiliary_loss_clip": 0.01107557, + "auxiliary_loss_mlp": 0.01032533, + "balance_loss_clip": 1.03639221, + "balance_loss_mlp": 1.02099979, + "epoch": 0.8199007966330979, + "flos": 24506584953120.0, + "grad_norm": 2.194087529115797, + "language_loss": 0.80841458, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.82981545, + "num_input_tokens_seen": 294183885, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11529541, + "step": 13637, + "time_per_iteration": 2.647953748703003 + }, + { + "auxiliary_loss_clip": 0.01111328, + "auxiliary_loss_mlp": 0.01031444, + "balance_loss_clip": 1.04002428, + "balance_loss_mlp": 1.02010095, + "epoch": 0.8199609198857658, + "flos": 38928922207200.0, + "grad_norm": 2.259854599213321, + "language_loss": 0.71267104, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.73409879, + "num_input_tokens_seen": 294200150, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11340332, + "step": 13638, + "time_per_iteration": 2.6988096237182617 + }, + { + "auxiliary_loss_clip": 0.01115864, + "auxiliary_loss_mlp": 0.01031938, + "balance_loss_clip": 1.03985238, + "balance_loss_mlp": 1.01915312, + "epoch": 0.8200210431384338, + "flos": 32299256062080.0, + "grad_norm": 3.94082079383715, + "language_loss": 0.79238427, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.81386226, + "num_input_tokens_seen": 294220385, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12792969, + "step": 13639, + "time_per_iteration": 4.121222257614136 + }, + { + "auxiliary_loss_clip": 0.01111285, + "auxiliary_loss_mlp": 0.01027011, + "balance_loss_clip": 1.03906453, + "balance_loss_mlp": 1.01646078, + "epoch": 0.8200811663911017, + "flos": 26956072499040.0, + "grad_norm": 1.9599667460504346, + "language_loss": 0.78855628, + "learning_rate": 3.299682336022589e-07, + "loss": 0.80993927, + "num_input_tokens_seen": 294239355, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10559082, + "step": 13640, + "time_per_iteration": 2.6603176593780518 + }, + { + "auxiliary_loss_clip": 0.01115858, + "auxiliary_loss_mlp": 0.01029559, + "balance_loss_clip": 1.03979039, + "balance_loss_mlp": 1.01804972, + "epoch": 0.8201412896437698, + "flos": 45873444199680.0, + "grad_norm": 1.8952872401134055, + "language_loss": 0.6333338, + "learning_rate": 3.297539733867336e-07, + "loss": 0.65478802, + "num_input_tokens_seen": 294259395, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.1149292, + "step": 13641, + "time_per_iteration": 2.743889331817627 + }, + { + "auxiliary_loss_clip": 0.01110777, + "auxiliary_loss_mlp": 0.01027173, + "balance_loss_clip": 1.03846288, + "balance_loss_mlp": 1.01519847, + "epoch": 0.8202014128964377, + "flos": 23972400303840.0, + "grad_norm": 2.786588908193049, + "language_loss": 0.73742604, + "learning_rate": 3.295397765071055e-07, + "loss": 0.75880557, + "num_input_tokens_seen": 294277365, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11987305, + "step": 13642, + "time_per_iteration": 3.9407901763916016 + }, + { + "auxiliary_loss_clip": 0.01111738, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.04017425, + "balance_loss_mlp": 1.02158713, + "epoch": 0.8202615361491057, + "flos": 38397330663840.0, + "grad_norm": 2.5155244988937415, + "language_loss": 0.70559508, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.72704035, + "num_input_tokens_seen": 294297555, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11199951, + "step": 13643, + "time_per_iteration": 2.698875665664673 + }, + { + "auxiliary_loss_clip": 0.01111554, + "auxiliary_loss_mlp": 0.01032019, + "balance_loss_clip": 1.03984451, + "balance_loss_mlp": 1.02106416, + "epoch": 0.8203216594017736, + "flos": 30158465736960.0, + "grad_norm": 2.8580767995713625, + "language_loss": 0.65838426, + "learning_rate": 3.291115727880256e-07, + "loss": 0.67982, + "num_input_tokens_seen": 294317600, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10955811, + "step": 13644, + "time_per_iteration": 2.628170967102051 + }, + { + "auxiliary_loss_clip": 0.01112758, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.0380342, + "balance_loss_mlp": 1.02355576, + "epoch": 0.8203817826544416, + "flos": 31764423136320.0, + "grad_norm": 1.5971598203517865, + "language_loss": 0.70722604, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.72870415, + "num_input_tokens_seen": 294340215, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11499023, + "step": 13645, + "time_per_iteration": 2.6914565563201904 + }, + { + "auxiliary_loss_clip": 0.01109131, + "auxiliary_loss_mlp": 0.01029583, + "balance_loss_clip": 1.03835475, + "balance_loss_mlp": 1.01838338, + "epoch": 0.8204419059071095, + "flos": 31670516782080.0, + "grad_norm": 1.8411966848948516, + "language_loss": 0.71373206, + "learning_rate": 3.286836225099707e-07, + "loss": 0.73511916, + "num_input_tokens_seen": 294358590, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11199951, + "step": 13646, + "time_per_iteration": 2.646608829498291 + }, + { + "auxiliary_loss_clip": 0.01114378, + "auxiliary_loss_mlp": 0.01030371, + "balance_loss_clip": 1.04044211, + "balance_loss_mlp": 1.01862931, + "epoch": 0.8205020291597775, + "flos": 28773255244320.0, + "grad_norm": 2.1627864648484687, + "language_loss": 0.78733027, + "learning_rate": 3.284697424316132e-07, + "loss": 0.80877775, + "num_input_tokens_seen": 294375825, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11737061, + "step": 13647, + "time_per_iteration": 2.6740996837615967 + }, + { + "auxiliary_loss_clip": 0.01108367, + "auxiliary_loss_mlp": 0.01032952, + "balance_loss_clip": 1.03944933, + "balance_loss_mlp": 1.02196109, + "epoch": 0.8205621524124456, + "flos": 32700684636000.0, + "grad_norm": 1.3002058073788785, + "language_loss": 0.67689085, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.69830406, + "num_input_tokens_seen": 294398500, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10998535, + "step": 13648, + "time_per_iteration": 2.646651029586792 + }, + { + "auxiliary_loss_clip": 0.0111057, + "auxiliary_loss_mlp": 0.01026197, + "balance_loss_clip": 1.03667998, + "balance_loss_mlp": 1.01444888, + "epoch": 0.8206222756651135, + "flos": 33589992958560.0, + "grad_norm": 2.2903739459641685, + "language_loss": 0.80664825, + "learning_rate": 3.28042172436791e-07, + "loss": 0.82801592, + "num_input_tokens_seen": 294418840, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11749268, + "step": 13649, + "time_per_iteration": 2.6718180179595947 + }, + { + "auxiliary_loss_clip": 0.01113703, + "auxiliary_loss_mlp": 0.01033858, + "balance_loss_clip": 1.04066694, + "balance_loss_mlp": 1.02082264, + "epoch": 0.8206823989177815, + "flos": 25842208645440.0, + "grad_norm": 1.7181972352449966, + "language_loss": 0.69073784, + "learning_rate": 3.278284825365396e-07, + "loss": 0.71221352, + "num_input_tokens_seen": 294438215, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.13049316, + "step": 13650, + "time_per_iteration": 2.591001272201538 + }, + { + "auxiliary_loss_clip": 0.01115764, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.04109001, + "balance_loss_mlp": 1.01708424, + "epoch": 0.8207425221704494, + "flos": 14043395770560.0, + "grad_norm": 3.3841506998914603, + "language_loss": 0.60915351, + "learning_rate": 3.276148560452001e-07, + "loss": 0.63059938, + "num_input_tokens_seen": 294455260, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11737061, + "step": 13651, + "time_per_iteration": 2.6486258506774902 + }, + { + "auxiliary_loss_clip": 0.01115943, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.04145014, + "balance_loss_mlp": 1.01954985, + "epoch": 0.8208026454231174, + "flos": 24150164726880.0, + "grad_norm": 1.93312767257408, + "language_loss": 0.72620511, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.74767566, + "num_input_tokens_seen": 294473205, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11553955, + "step": 13652, + "time_per_iteration": 2.6246237754821777 + }, + { + "auxiliary_loss_clip": 0.01107293, + "auxiliary_loss_mlp": 0.01028405, + "balance_loss_clip": 1.03859711, + "balance_loss_mlp": 1.0184927, + "epoch": 0.8208627686757853, + "flos": 19117339593120.0, + "grad_norm": 3.0922995002617872, + "language_loss": 0.7278372, + "learning_rate": 3.271877933216558e-07, + "loss": 0.7491942, + "num_input_tokens_seen": 294490645, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.09918213, + "step": 13653, + "time_per_iteration": 2.6476995944976807 + }, + { + "auxiliary_loss_clip": 0.01117397, + "auxiliary_loss_mlp": 0.01036104, + "balance_loss_clip": 1.04092276, + "balance_loss_mlp": 1.02331877, + "epoch": 0.8209228919284534, + "flos": 45738135535680.0, + "grad_norm": 2.5368251597434472, + "language_loss": 0.62855589, + "learning_rate": 3.269743571056451e-07, + "loss": 0.65009081, + "num_input_tokens_seen": 294513500, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12786865, + "step": 13654, + "time_per_iteration": 2.8441035747528076 + }, + { + "auxiliary_loss_clip": 0.01110915, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.03639793, + "balance_loss_mlp": 1.01819158, + "epoch": 0.8209830151811213, + "flos": 28202611393440.0, + "grad_norm": 1.515307053488176, + "language_loss": 0.69864357, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.72005033, + "num_input_tokens_seen": 294535710, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11578369, + "step": 13655, + "time_per_iteration": 2.6376893520355225 + }, + { + "auxiliary_loss_clip": 0.01110645, + "auxiliary_loss_mlp": 0.01031008, + "balance_loss_clip": 1.03901458, + "balance_loss_mlp": 1.01977813, + "epoch": 0.8210431384337893, + "flos": 25975896618240.0, + "grad_norm": 2.1264699256055666, + "language_loss": 0.82314217, + "learning_rate": 3.265476750056162e-07, + "loss": 0.84455872, + "num_input_tokens_seen": 294554055, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11236572, + "step": 13656, + "time_per_iteration": 2.689833641052246 + }, + { + "auxiliary_loss_clip": 0.01109648, + "auxiliary_loss_mlp": 0.01031985, + "balance_loss_clip": 1.04018998, + "balance_loss_mlp": 1.02042127, + "epoch": 0.8211032616864572, + "flos": 14035251797280.0, + "grad_norm": 4.181496118392547, + "language_loss": 0.73828423, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.75970066, + "num_input_tokens_seen": 294570390, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11560059, + "step": 13657, + "time_per_iteration": 2.589568614959717 + }, + { + "auxiliary_loss_clip": 0.01110657, + "auxiliary_loss_mlp": 0.0102826, + "balance_loss_clip": 1.03814483, + "balance_loss_mlp": 1.01708424, + "epoch": 0.8211633849391252, + "flos": 36389296414080.0, + "grad_norm": 1.684641362512704, + "language_loss": 0.55411184, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.57550102, + "num_input_tokens_seen": 294593050, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11169434, + "step": 13658, + "time_per_iteration": 2.716475486755371 + }, + { + "auxiliary_loss_clip": 0.01111207, + "auxiliary_loss_mlp": 0.010302, + "balance_loss_clip": 1.03811729, + "balance_loss_mlp": 1.01889896, + "epoch": 0.8212235081917931, + "flos": 16002896669280.0, + "grad_norm": 2.2375093256274736, + "language_loss": 0.79272294, + "learning_rate": 3.259081278068805e-07, + "loss": 0.81413698, + "num_input_tokens_seen": 294608550, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11303711, + "step": 13659, + "time_per_iteration": 2.580845355987549 + }, + { + "auxiliary_loss_clip": 0.01105723, + "auxiliary_loss_mlp": 0.01024493, + "balance_loss_clip": 1.03644955, + "balance_loss_mlp": 1.01432478, + "epoch": 0.8212836314444611, + "flos": 49438456807680.0, + "grad_norm": 1.6217936026403386, + "language_loss": 0.59573066, + "learning_rate": 3.256950723599887e-07, + "loss": 0.61703283, + "num_input_tokens_seen": 294630380, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10180664, + "step": 13660, + "time_per_iteration": 2.8855245113372803 + }, + { + "auxiliary_loss_clip": 0.01112911, + "auxiliary_loss_mlp": 0.01031394, + "balance_loss_clip": 1.03953004, + "balance_loss_mlp": 1.01940763, + "epoch": 0.8213437546971292, + "flos": 22903706867040.0, + "grad_norm": 2.2654642508778933, + "language_loss": 0.7218098, + "learning_rate": 3.254820804029075e-07, + "loss": 0.74325287, + "num_input_tokens_seen": 294648655, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11968994, + "step": 13661, + "time_per_iteration": 2.590212106704712 + }, + { + "auxiliary_loss_clip": 0.0111276, + "auxiliary_loss_mlp": 0.01029372, + "balance_loss_clip": 1.03763831, + "balance_loss_mlp": 1.01781464, + "epoch": 0.8214038779497971, + "flos": 24016314684960.0, + "grad_norm": 2.8741472241522072, + "language_loss": 0.74681091, + "learning_rate": 3.252691519437143e-07, + "loss": 0.76823229, + "num_input_tokens_seen": 294666915, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11547852, + "step": 13662, + "time_per_iteration": 2.651498794555664 + }, + { + "auxiliary_loss_clip": 0.01031539, + "auxiliary_loss_mlp": 0.0100002, + "balance_loss_clip": 1.00908196, + "balance_loss_mlp": 0.9990561, + "epoch": 0.8214640012024651, + "flos": 87370435330560.0, + "grad_norm": 0.7487470331159396, + "language_loss": 0.54037213, + "learning_rate": 3.250562869904825e-07, + "loss": 0.56068772, + "num_input_tokens_seen": 294731545, + "router_z_loss_clip": 0.22485352, + "router_z_loss_mlp": 0.0096283, + "step": 13663, + "time_per_iteration": 3.4191863536834717 + }, + { + "auxiliary_loss_clip": 0.0110949, + "auxiliary_loss_mlp": 0.01029721, + "balance_loss_clip": 1.03675032, + "balance_loss_mlp": 1.01858699, + "epoch": 0.821524124455133, + "flos": 18006311949120.0, + "grad_norm": 2.586087863963657, + "language_loss": 0.65564722, + "learning_rate": 3.248434855512838e-07, + "loss": 0.67703938, + "num_input_tokens_seen": 294748745, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11132812, + "step": 13664, + "time_per_iteration": 2.6967318058013916 + }, + { + "auxiliary_loss_clip": 0.01110742, + "auxiliary_loss_mlp": 0.01026415, + "balance_loss_clip": 1.03992546, + "balance_loss_mlp": 1.01553702, + "epoch": 0.821584247707801, + "flos": 30605794038720.0, + "grad_norm": 1.4721370299760665, + "language_loss": 0.75181723, + "learning_rate": 3.246307476341881e-07, + "loss": 0.77318877, + "num_input_tokens_seen": 294768955, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10882568, + "step": 13665, + "time_per_iteration": 2.7030036449432373 + }, + { + "auxiliary_loss_clip": 0.01113042, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.03994644, + "balance_loss_mlp": 1.01791537, + "epoch": 0.8216443709604689, + "flos": 44942125808160.0, + "grad_norm": 2.4994561321474515, + "language_loss": 0.65919018, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.68060929, + "num_input_tokens_seen": 294789250, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10961914, + "step": 13666, + "time_per_iteration": 2.7485289573669434 + }, + { + "auxiliary_loss_clip": 0.01109755, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.03819823, + "balance_loss_mlp": 1.01843762, + "epoch": 0.821704494213137, + "flos": 30605267314080.0, + "grad_norm": 1.8465787587039069, + "language_loss": 0.77096975, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.79236412, + "num_input_tokens_seen": 294809760, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11254883, + "step": 13667, + "time_per_iteration": 2.6820013523101807 + }, + { + "auxiliary_loss_clip": 0.01113727, + "auxiliary_loss_mlp": 0.01031995, + "balance_loss_clip": 1.03939188, + "balance_loss_mlp": 1.02045584, + "epoch": 0.8217646174658049, + "flos": 17516852026560.0, + "grad_norm": 1.8755653190902892, + "language_loss": 0.7737962, + "learning_rate": 3.239929150961773e-07, + "loss": 0.7952534, + "num_input_tokens_seen": 294826495, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11547852, + "step": 13668, + "time_per_iteration": 4.090028524398804 + }, + { + "auxiliary_loss_clip": 0.01108884, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.03702164, + "balance_loss_mlp": 1.01567173, + "epoch": 0.8218247407184729, + "flos": 26955059567040.0, + "grad_norm": 2.431031311977607, + "language_loss": 0.73687434, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.75822979, + "num_input_tokens_seen": 294845370, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10980225, + "step": 13669, + "time_per_iteration": 2.6548471450805664 + }, + { + "auxiliary_loss_clip": 0.01110105, + "auxiliary_loss_mlp": 0.01024224, + "balance_loss_clip": 1.03768992, + "balance_loss_mlp": 1.01326871, + "epoch": 0.8218848639711408, + "flos": 20455191735840.0, + "grad_norm": 1.8165595678624655, + "language_loss": 0.78507811, + "learning_rate": 3.235680111625161e-07, + "loss": 0.8064214, + "num_input_tokens_seen": 294863740, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10949707, + "step": 13670, + "time_per_iteration": 2.628328561782837 + }, + { + "auxiliary_loss_clip": 0.01116946, + "auxiliary_loss_mlp": 0.01034929, + "balance_loss_clip": 1.04110181, + "balance_loss_mlp": 1.0228591, + "epoch": 0.8219449872238088, + "flos": 31719293236800.0, + "grad_norm": 2.381341303801159, + "language_loss": 0.74775028, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.76926899, + "num_input_tokens_seen": 294882815, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12060547, + "step": 13671, + "time_per_iteration": 2.6317992210388184 + }, + { + "auxiliary_loss_clip": 0.01115575, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.03876567, + "balance_loss_mlp": 1.01680505, + "epoch": 0.8220051104764767, + "flos": 25353396999360.0, + "grad_norm": 1.8727116849841892, + "language_loss": 0.76438916, + "learning_rate": 3.23143361510728e-07, + "loss": 0.78582937, + "num_input_tokens_seen": 294901985, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.11645508, + "step": 13672, + "time_per_iteration": 2.6493282318115234 + }, + { + "auxiliary_loss_clip": 0.01111679, + "auxiliary_loss_mlp": 0.01032878, + "balance_loss_clip": 1.03863907, + "balance_loss_mlp": 1.02066505, + "epoch": 0.8220652337291448, + "flos": 17783782282080.0, + "grad_norm": 2.349764431015346, + "language_loss": 0.74549234, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.76693791, + "num_input_tokens_seen": 294919705, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.12219238, + "step": 13673, + "time_per_iteration": 2.5734853744506836 + }, + { + "auxiliary_loss_clip": 0.01113729, + "auxiliary_loss_mlp": 0.01028323, + "balance_loss_clip": 1.03942108, + "balance_loss_mlp": 1.01670575, + "epoch": 0.8221253569818128, + "flos": 29048167404000.0, + "grad_norm": 1.9230566845921686, + "language_loss": 0.79743958, + "learning_rate": 3.227189662052254e-07, + "loss": 0.81886005, + "num_input_tokens_seen": 294939900, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11627197, + "step": 13674, + "time_per_iteration": 4.018197536468506 + }, + { + "auxiliary_loss_clip": 0.01110798, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.03801167, + "balance_loss_mlp": 1.02186584, + "epoch": 0.8221854802344807, + "flos": 25975977652800.0, + "grad_norm": 1.9893496771226418, + "language_loss": 0.70385826, + "learning_rate": 3.225068639524484e-07, + "loss": 0.72529447, + "num_input_tokens_seen": 294959110, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10961914, + "step": 13675, + "time_per_iteration": 2.678896903991699 + }, + { + "auxiliary_loss_clip": 0.011092, + "auxiliary_loss_mlp": 0.01031687, + "balance_loss_clip": 1.03855598, + "balance_loss_mlp": 1.02060628, + "epoch": 0.8222456034871487, + "flos": 25571793903840.0, + "grad_norm": 1.5824437845077126, + "language_loss": 0.74329495, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.76470387, + "num_input_tokens_seen": 294978660, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11077881, + "step": 13676, + "time_per_iteration": 2.657405138015747 + }, + { + "auxiliary_loss_clip": 0.01111033, + "auxiliary_loss_mlp": 0.01029825, + "balance_loss_clip": 1.03846431, + "balance_loss_mlp": 1.01891148, + "epoch": 0.8223057267398166, + "flos": 26109341487360.0, + "grad_norm": 2.070437285177976, + "language_loss": 0.80270755, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.82411617, + "num_input_tokens_seen": 294998075, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10919189, + "step": 13677, + "time_per_iteration": 2.6872260570526123 + }, + { + "auxiliary_loss_clip": 0.01112545, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.03876209, + "balance_loss_mlp": 1.0208261, + "epoch": 0.8223658499924846, + "flos": 18629865017280.0, + "grad_norm": 2.2791288092346482, + "language_loss": 0.70148915, + "learning_rate": 3.218709388905245e-07, + "loss": 0.72293514, + "num_input_tokens_seen": 295015950, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11236572, + "step": 13678, + "time_per_iteration": 2.622231960296631 + }, + { + "auxiliary_loss_clip": 0.0111046, + "auxiliary_loss_mlp": 0.01034146, + "balance_loss_clip": 1.03822148, + "balance_loss_mlp": 1.02269614, + "epoch": 0.8224259732451525, + "flos": 38131453857600.0, + "grad_norm": 1.5753693728366491, + "language_loss": 0.71516538, + "learning_rate": 3.216590911288133e-07, + "loss": 0.73661149, + "num_input_tokens_seen": 295036800, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11450195, + "step": 13679, + "time_per_iteration": 4.119655132293701 + }, + { + "auxiliary_loss_clip": 0.01108826, + "auxiliary_loss_mlp": 0.01027665, + "balance_loss_clip": 1.03731704, + "balance_loss_mlp": 1.01657224, + "epoch": 0.8224860964978206, + "flos": 26325955631520.0, + "grad_norm": 2.0438058075717684, + "language_loss": 0.69804323, + "learning_rate": 3.214473070099564e-07, + "loss": 0.71940815, + "num_input_tokens_seen": 295055300, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11102295, + "step": 13680, + "time_per_iteration": 2.6163601875305176 + }, + { + "auxiliary_loss_clip": 0.01111742, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.03965259, + "balance_loss_mlp": 1.02147293, + "epoch": 0.8225462197504885, + "flos": 31095091892160.0, + "grad_norm": 1.7407144956162388, + "language_loss": 0.59825885, + "learning_rate": 3.21235586541986e-07, + "loss": 0.61970091, + "num_input_tokens_seen": 295076420, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10980225, + "step": 13681, + "time_per_iteration": 2.727355718612671 + }, + { + "auxiliary_loss_clip": 0.01113336, + "auxiliary_loss_mlp": 0.01031234, + "balance_loss_clip": 1.03849816, + "balance_loss_mlp": 1.01996326, + "epoch": 0.8226063430031565, + "flos": 48059567010720.0, + "grad_norm": 1.7710019480230101, + "language_loss": 0.6955663, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.71701205, + "num_input_tokens_seen": 295100540, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.1126709, + "step": 13682, + "time_per_iteration": 4.079323053359985 + }, + { + "auxiliary_loss_clip": 0.01113383, + "auxiliary_loss_mlp": 0.01032289, + "balance_loss_clip": 1.03861368, + "balance_loss_mlp": 1.01992726, + "epoch": 0.8226664662558244, + "flos": 27839870471520.0, + "grad_norm": 3.483090230739884, + "language_loss": 0.79297709, + "learning_rate": 3.20812336590816e-07, + "loss": 0.81443381, + "num_input_tokens_seen": 295120180, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.1237793, + "step": 13683, + "time_per_iteration": 2.623992919921875 + }, + { + "auxiliary_loss_clip": 0.01107143, + "auxiliary_loss_mlp": 0.01031763, + "balance_loss_clip": 1.03778684, + "balance_loss_mlp": 1.02178502, + "epoch": 0.8227265895084924, + "flos": 31317662076480.0, + "grad_norm": 2.2129238287111037, + "language_loss": 0.86504686, + "learning_rate": 3.206008071236661e-07, + "loss": 0.88643593, + "num_input_tokens_seen": 295138530, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.09985352, + "step": 13684, + "time_per_iteration": 2.6936328411102295 + }, + { + "auxiliary_loss_clip": 0.01108828, + "auxiliary_loss_mlp": 0.01029092, + "balance_loss_clip": 1.03891897, + "balance_loss_mlp": 1.0183394, + "epoch": 0.8227867127611603, + "flos": 31944213423360.0, + "grad_norm": 1.6859992549450384, + "language_loss": 0.79877645, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.82015562, + "num_input_tokens_seen": 295160260, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10754395, + "step": 13685, + "time_per_iteration": 2.677473545074463 + }, + { + "auxiliary_loss_clip": 0.01110304, + "auxiliary_loss_mlp": 0.01028496, + "balance_loss_clip": 1.03847075, + "balance_loss_mlp": 1.01746345, + "epoch": 0.8228468360138284, + "flos": 26866542011040.0, + "grad_norm": 1.7311128620128728, + "language_loss": 0.68848604, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.70987403, + "num_input_tokens_seen": 295177055, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11029053, + "step": 13686, + "time_per_iteration": 2.697110176086426 + }, + { + "auxiliary_loss_clip": 0.01111425, + "auxiliary_loss_mlp": 0.0103256, + "balance_loss_clip": 1.0375402, + "balance_loss_mlp": 1.02097881, + "epoch": 0.8229069592664963, + "flos": 18184643614080.0, + "grad_norm": 2.1855498182054074, + "language_loss": 0.78381622, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.80525613, + "num_input_tokens_seen": 295193870, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11578369, + "step": 13687, + "time_per_iteration": 2.611351251602173 + }, + { + "auxiliary_loss_clip": 0.01110794, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.03791714, + "balance_loss_mlp": 1.02096891, + "epoch": 0.8229670825191643, + "flos": 19119527526240.0, + "grad_norm": 2.4841827988220064, + "language_loss": 0.72617054, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.74760491, + "num_input_tokens_seen": 295211040, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11676025, + "step": 13688, + "time_per_iteration": 2.65327787399292 + }, + { + "auxiliary_loss_clip": 0.01112733, + "auxiliary_loss_mlp": 0.01030625, + "balance_loss_clip": 1.04024673, + "balance_loss_mlp": 1.0193243, + "epoch": 0.8230272057718323, + "flos": 28288900499040.0, + "grad_norm": 1.5470308603971263, + "language_loss": 0.73102653, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.75246012, + "num_input_tokens_seen": 295231300, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11309814, + "step": 13689, + "time_per_iteration": 2.666729211807251 + }, + { + "auxiliary_loss_clip": 0.01113675, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.03943598, + "balance_loss_mlp": 1.01955426, + "epoch": 0.8230873290245002, + "flos": 25663836463200.0, + "grad_norm": 2.3425404237100205, + "language_loss": 0.6933924, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.71484011, + "num_input_tokens_seen": 295251045, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11553955, + "step": 13690, + "time_per_iteration": 2.6258158683776855 + }, + { + "auxiliary_loss_clip": 0.01111356, + "auxiliary_loss_mlp": 0.01032937, + "balance_loss_clip": 1.03853989, + "balance_loss_mlp": 1.02157032, + "epoch": 0.8231474522771682, + "flos": 25929632234880.0, + "grad_norm": 2.4621469947953307, + "language_loss": 0.85523617, + "learning_rate": 3.191218844260988e-07, + "loss": 0.87667912, + "num_input_tokens_seen": 295270225, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11364746, + "step": 13691, + "time_per_iteration": 2.6306118965148926 + }, + { + "auxiliary_loss_clip": 0.01112252, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.0390234, + "balance_loss_mlp": 1.01860595, + "epoch": 0.8232075755298361, + "flos": 29092324888800.0, + "grad_norm": 3.384078305025693, + "language_loss": 0.77116722, + "learning_rate": 3.189108646472252e-07, + "loss": 0.79258567, + "num_input_tokens_seen": 295288950, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10992432, + "step": 13692, + "time_per_iteration": 2.617809534072876 + }, + { + "auxiliary_loss_clip": 0.01110963, + "auxiliary_loss_mlp": 0.01026688, + "balance_loss_clip": 1.03862071, + "balance_loss_mlp": 1.01532722, + "epoch": 0.8232676987825042, + "flos": 26422171470720.0, + "grad_norm": 1.8345312978351165, + "language_loss": 0.71711385, + "learning_rate": 3.186999086154205e-07, + "loss": 0.7384904, + "num_input_tokens_seen": 295309405, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11364746, + "step": 13693, + "time_per_iteration": 2.655670166015625 + }, + { + "auxiliary_loss_clip": 0.01107469, + "auxiliary_loss_mlp": 0.01030937, + "balance_loss_clip": 1.03717089, + "balance_loss_mlp": 1.02085197, + "epoch": 0.8233278220351721, + "flos": 32119465775040.0, + "grad_norm": 1.3409497937273211, + "language_loss": 0.83915377, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.86053789, + "num_input_tokens_seen": 295331115, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10076904, + "step": 13694, + "time_per_iteration": 2.654869556427002 + }, + { + "auxiliary_loss_clip": 0.01110627, + "auxiliary_loss_mlp": 0.01030473, + "balance_loss_clip": 1.03754568, + "balance_loss_mlp": 1.01880264, + "epoch": 0.8233879452878401, + "flos": 26509959715680.0, + "grad_norm": 1.623476504363525, + "language_loss": 0.76974738, + "learning_rate": 3.182781878250118e-07, + "loss": 0.79115832, + "num_input_tokens_seen": 295350495, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11676025, + "step": 13695, + "time_per_iteration": 2.6741080284118652 + }, + { + "auxiliary_loss_clip": 0.01112614, + "auxiliary_loss_mlp": 0.01031057, + "balance_loss_clip": 1.0406034, + "balance_loss_mlp": 1.02012515, + "epoch": 0.823448068540508, + "flos": 25084643466240.0, + "grad_norm": 2.058258165939562, + "language_loss": 0.8100509, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.8314876, + "num_input_tokens_seen": 295368225, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10925293, + "step": 13696, + "time_per_iteration": 2.613088846206665 + }, + { + "auxiliary_loss_clip": 0.01031047, + "auxiliary_loss_mlp": 0.01000082, + "balance_loss_clip": 1.00858331, + "balance_loss_mlp": 0.99908626, + "epoch": 0.823508191793176, + "flos": 82086360128640.0, + "grad_norm": 0.7434156596023306, + "language_loss": 0.6383822, + "learning_rate": 3.178567221188393e-07, + "loss": 0.65869343, + "num_input_tokens_seen": 295430035, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.00996399, + "step": 13697, + "time_per_iteration": 3.3860886096954346 + }, + { + "auxiliary_loss_clip": 0.01104868, + "auxiliary_loss_mlp": 0.01025843, + "balance_loss_clip": 1.03652596, + "balance_loss_mlp": 1.01582968, + "epoch": 0.8235683150458439, + "flos": 21876334705440.0, + "grad_norm": 1.602529648805965, + "language_loss": 0.72746241, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.74876952, + "num_input_tokens_seen": 295447765, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.10015869, + "step": 13698, + "time_per_iteration": 2.601902961730957 + }, + { + "auxiliary_loss_clip": 0.0111208, + "auxiliary_loss_mlp": 0.01026184, + "balance_loss_clip": 1.03947723, + "balance_loss_mlp": 1.01459694, + "epoch": 0.823628438298512, + "flos": 23081349738240.0, + "grad_norm": 4.345460726234925, + "language_loss": 0.71753979, + "learning_rate": 3.174355115608305e-07, + "loss": 0.73892242, + "num_input_tokens_seen": 295464810, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11590576, + "step": 13699, + "time_per_iteration": 2.641423225402832 + }, + { + "auxiliary_loss_clip": 0.0110988, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.03844965, + "balance_loss_mlp": 1.01655459, + "epoch": 0.8236885615511799, + "flos": 22813406550720.0, + "grad_norm": 2.4015497422465013, + "language_loss": 0.8197577, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.84113312, + "num_input_tokens_seen": 295482605, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11108398, + "step": 13700, + "time_per_iteration": 2.7255725860595703 + }, + { + "auxiliary_loss_clip": 0.0111051, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.03718972, + "balance_loss_mlp": 1.02025914, + "epoch": 0.8237486848038479, + "flos": 28913020809120.0, + "grad_norm": 1.6695790607536065, + "language_loss": 0.72758377, + "learning_rate": 3.170145562148763e-07, + "loss": 0.74899703, + "num_input_tokens_seen": 295503780, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.10559082, + "step": 13701, + "time_per_iteration": 2.6616578102111816 + }, + { + "auxiliary_loss_clip": 0.01111207, + "auxiliary_loss_mlp": 0.01033232, + "balance_loss_clip": 1.03675127, + "balance_loss_mlp": 1.02168655, + "epoch": 0.8238088080565159, + "flos": 28603270139040.0, + "grad_norm": 2.365190725072471, + "language_loss": 0.69426858, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.71571296, + "num_input_tokens_seen": 295522035, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11547852, + "step": 13702, + "time_per_iteration": 2.685882329940796 + }, + { + "auxiliary_loss_clip": 0.01111475, + "auxiliary_loss_mlp": 0.01030596, + "balance_loss_clip": 1.03907228, + "balance_loss_mlp": 1.01943803, + "epoch": 0.8238689313091838, + "flos": 27755607229920.0, + "grad_norm": 2.347768281500002, + "language_loss": 0.74557471, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.76699543, + "num_input_tokens_seen": 295541190, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11157227, + "step": 13703, + "time_per_iteration": 2.6340341567993164 + }, + { + "auxiliary_loss_clip": 0.01113102, + "auxiliary_loss_mlp": 0.01035185, + "balance_loss_clip": 1.0369035, + "balance_loss_mlp": 1.02325821, + "epoch": 0.8239290545618518, + "flos": 31274233902720.0, + "grad_norm": 2.0391758973626413, + "language_loss": 0.70074737, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.72223026, + "num_input_tokens_seen": 295558860, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.1192627, + "step": 13704, + "time_per_iteration": 2.7139229774475098 + }, + { + "auxiliary_loss_clip": 0.01108588, + "auxiliary_loss_mlp": 0.01029061, + "balance_loss_clip": 1.03698087, + "balance_loss_mlp": 1.01819539, + "epoch": 0.8239891778145197, + "flos": 31758386061600.0, + "grad_norm": 1.9393452106405642, + "language_loss": 0.64436001, + "learning_rate": 3.161734114144916e-07, + "loss": 0.6657365, + "num_input_tokens_seen": 295578155, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10876465, + "step": 13705, + "time_per_iteration": 2.6365482807159424 + }, + { + "auxiliary_loss_clip": 0.01112493, + "auxiliary_loss_mlp": 0.0103273, + "balance_loss_clip": 1.0380578, + "balance_loss_mlp": 1.02077973, + "epoch": 0.8240493010671878, + "flos": 26641257168960.0, + "grad_norm": 1.650560897555403, + "language_loss": 0.69238484, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.71383709, + "num_input_tokens_seen": 295599170, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11950684, + "step": 13706, + "time_per_iteration": 2.797938585281372 + }, + { + "auxiliary_loss_clip": 0.01113851, + "auxiliary_loss_mlp": 0.01032644, + "balance_loss_clip": 1.04104877, + "balance_loss_mlp": 1.02079415, + "epoch": 0.8241094243198557, + "flos": 22636898163360.0, + "grad_norm": 2.0474182034798845, + "language_loss": 0.69623017, + "learning_rate": 3.157532220876475e-07, + "loss": 0.71769518, + "num_input_tokens_seen": 295617465, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11865234, + "step": 13707, + "time_per_iteration": 4.228539705276489 + }, + { + "auxiliary_loss_clip": 0.01111387, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.03794038, + "balance_loss_mlp": 1.01737618, + "epoch": 0.8241695475725237, + "flos": 31051055959200.0, + "grad_norm": 1.9223210010392981, + "language_loss": 0.79038942, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.81179047, + "num_input_tokens_seen": 295634960, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11340332, + "step": 13708, + "time_per_iteration": 2.695913791656494 + }, + { + "auxiliary_loss_clip": 0.01110863, + "auxiliary_loss_mlp": 0.0102708, + "balance_loss_clip": 1.03751969, + "balance_loss_mlp": 1.01564813, + "epoch": 0.8242296708251916, + "flos": 23171082812640.0, + "grad_norm": 2.580901169048564, + "language_loss": 0.68132687, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.70270634, + "num_input_tokens_seen": 295652725, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11444092, + "step": 13709, + "time_per_iteration": 2.6593575477600098 + }, + { + "auxiliary_loss_clip": 0.01110078, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.03720939, + "balance_loss_mlp": 1.0203774, + "epoch": 0.8242897940778596, + "flos": 27577842806880.0, + "grad_norm": 2.0211315447076075, + "language_loss": 0.82068992, + "learning_rate": 3.151234171183319e-07, + "loss": 0.84210026, + "num_input_tokens_seen": 295671195, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10565186, + "step": 13710, + "time_per_iteration": 2.6218783855438232 + }, + { + "auxiliary_loss_clip": 0.01111229, + "auxiliary_loss_mlp": 0.01030244, + "balance_loss_clip": 1.03817391, + "balance_loss_mlp": 1.01811409, + "epoch": 0.8243499173305275, + "flos": 26195752144800.0, + "grad_norm": 2.8018492726640165, + "language_loss": 0.78499711, + "learning_rate": 3.149136098993257e-07, + "loss": 0.8064118, + "num_input_tokens_seen": 295689130, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12127686, + "step": 13711, + "time_per_iteration": 2.625394821166992 + }, + { + "auxiliary_loss_clip": 0.01109243, + "auxiliary_loss_mlp": 0.01026906, + "balance_loss_clip": 1.03742826, + "balance_loss_mlp": 1.01555181, + "epoch": 0.8244100405831956, + "flos": 24416689809600.0, + "grad_norm": 2.0981852251270965, + "language_loss": 0.6553632, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.67672467, + "num_input_tokens_seen": 295706385, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11358643, + "step": 13712, + "time_per_iteration": 2.6334400177001953 + }, + { + "auxiliary_loss_clip": 0.01112049, + "auxiliary_loss_mlp": 0.01030086, + "balance_loss_clip": 1.03962183, + "balance_loss_mlp": 1.01905298, + "epoch": 0.8244701638358635, + "flos": 32251370987520.0, + "grad_norm": 1.8143540225019288, + "language_loss": 0.74308819, + "learning_rate": 3.14494187165202e-07, + "loss": 0.76450956, + "num_input_tokens_seen": 295727925, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11022949, + "step": 13713, + "time_per_iteration": 2.699369192123413 + }, + { + "auxiliary_loss_clip": 0.01110225, + "auxiliary_loss_mlp": 0.01024668, + "balance_loss_clip": 1.03684783, + "balance_loss_mlp": 1.01353979, + "epoch": 0.8245302870885315, + "flos": 21524087759040.0, + "grad_norm": 2.054305454928377, + "language_loss": 0.81049263, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.83184159, + "num_input_tokens_seen": 295744420, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11132812, + "step": 13714, + "time_per_iteration": 4.045989036560059 + }, + { + "auxiliary_loss_clip": 0.01111899, + "auxiliary_loss_mlp": 0.01030062, + "balance_loss_clip": 1.04029298, + "balance_loss_mlp": 1.01825476, + "epoch": 0.8245904103411995, + "flos": 31981685556960.0, + "grad_norm": 2.234692009244262, + "language_loss": 0.66474533, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.68616492, + "num_input_tokens_seen": 295765105, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11798096, + "step": 13715, + "time_per_iteration": 2.676697015762329 + }, + { + "auxiliary_loss_clip": 0.01112616, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.03864455, + "balance_loss_mlp": 1.01840591, + "epoch": 0.8246505335938674, + "flos": 29535520428000.0, + "grad_norm": 2.2345906098100796, + "language_loss": 0.74821317, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.76964056, + "num_input_tokens_seen": 295784200, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11712646, + "step": 13716, + "time_per_iteration": 2.6738932132720947 + }, + { + "auxiliary_loss_clip": 0.01030564, + "auxiliary_loss_mlp": 0.01000232, + "balance_loss_clip": 1.00813389, + "balance_loss_mlp": 0.99925911, + "epoch": 0.8247106568465354, + "flos": 81864802876320.0, + "grad_norm": 0.7155574531932206, + "language_loss": 0.58936763, + "learning_rate": 3.136561087351175e-07, + "loss": 0.60967565, + "num_input_tokens_seen": 295846555, + "router_z_loss_clip": 0.22424316, + "router_z_loss_mlp": 0.00971985, + "step": 13717, + "time_per_iteration": 3.356104612350464 + }, + { + "auxiliary_loss_clip": 0.01110844, + "auxiliary_loss_mlp": 0.0102509, + "balance_loss_clip": 1.03902435, + "balance_loss_mlp": 1.01514196, + "epoch": 0.8247707800992033, + "flos": 15335145599040.0, + "grad_norm": 2.1725477864222658, + "language_loss": 0.79241091, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.81377023, + "num_input_tokens_seen": 295863425, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.0994873, + "step": 13718, + "time_per_iteration": 4.043992042541504 + }, + { + "auxiliary_loss_clip": 0.0110782, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.03788376, + "balance_loss_mlp": 1.01997042, + "epoch": 0.8248309033518714, + "flos": 19429764403680.0, + "grad_norm": 1.9101670269342423, + "language_loss": 0.68262112, + "learning_rate": 3.132374531662778e-07, + "loss": 0.70400643, + "num_input_tokens_seen": 295880925, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10736084, + "step": 13719, + "time_per_iteration": 2.7173757553100586 + }, + { + "auxiliary_loss_clip": 0.01110902, + "auxiliary_loss_mlp": 0.01027963, + "balance_loss_clip": 1.03694284, + "balance_loss_mlp": 1.0163337, + "epoch": 0.8248910266045393, + "flos": 21432450372480.0, + "grad_norm": 3.159288935386271, + "language_loss": 0.70261019, + "learning_rate": 3.13028221321197e-07, + "loss": 0.72399884, + "num_input_tokens_seen": 295898205, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11639404, + "step": 13720, + "time_per_iteration": 2.659968376159668 + }, + { + "auxiliary_loss_clip": 0.01112629, + "auxiliary_loss_mlp": 0.01027806, + "balance_loss_clip": 1.03861272, + "balance_loss_mlp": 1.01629639, + "epoch": 0.8249511498572073, + "flos": 34922537337600.0, + "grad_norm": 1.8863879063497426, + "language_loss": 0.75812113, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.77952552, + "num_input_tokens_seen": 295918130, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11505127, + "step": 13721, + "time_per_iteration": 2.7394447326660156 + }, + { + "auxiliary_loss_clip": 0.01108875, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.03738761, + "balance_loss_mlp": 1.01636863, + "epoch": 0.8250112731098752, + "flos": 31184581862880.0, + "grad_norm": 2.6928081923758973, + "language_loss": 0.77875912, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.800116, + "num_input_tokens_seen": 295937760, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10443115, + "step": 13722, + "time_per_iteration": 3.9647936820983887 + }, + { + "auxiliary_loss_clip": 0.01108177, + "auxiliary_loss_mlp": 0.01029644, + "balance_loss_clip": 1.03811133, + "balance_loss_mlp": 1.01905847, + "epoch": 0.8250713963625432, + "flos": 33856437006720.0, + "grad_norm": 5.3050723036796255, + "language_loss": 0.62512195, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.64650011, + "num_input_tokens_seen": 295957585, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10583496, + "step": 13723, + "time_per_iteration": 2.715764284133911 + }, + { + "auxiliary_loss_clip": 0.01111693, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.03867972, + "balance_loss_mlp": 1.01958752, + "epoch": 0.8251315196152111, + "flos": 26369910529920.0, + "grad_norm": 1.6438683100406035, + "language_loss": 0.74273241, + "learning_rate": 3.121919337215666e-07, + "loss": 0.76415884, + "num_input_tokens_seen": 295977135, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11358643, + "step": 13724, + "time_per_iteration": 2.68689227104187 + }, + { + "auxiliary_loss_clip": 0.01112644, + "auxiliary_loss_mlp": 0.01032963, + "balance_loss_clip": 1.03931034, + "balance_loss_mlp": 1.02092242, + "epoch": 0.8251916428678792, + "flos": 34873720365600.0, + "grad_norm": 2.0584300979821997, + "language_loss": 0.64612079, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.66757685, + "num_input_tokens_seen": 295996265, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12042236, + "step": 13725, + "time_per_iteration": 2.6530959606170654 + }, + { + "auxiliary_loss_clip": 0.01108685, + "auxiliary_loss_mlp": 0.01027682, + "balance_loss_clip": 1.03730273, + "balance_loss_mlp": 1.01674473, + "epoch": 0.8252517661205471, + "flos": 28156630631040.0, + "grad_norm": 1.9220608296573012, + "language_loss": 0.81746721, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.83883095, + "num_input_tokens_seen": 296014745, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.109375, + "step": 13726, + "time_per_iteration": 2.678630828857422 + }, + { + "auxiliary_loss_clip": 0.01103911, + "auxiliary_loss_mlp": 0.01033661, + "balance_loss_clip": 1.03424406, + "balance_loss_mlp": 1.02293849, + "epoch": 0.8253118893732151, + "flos": 38753912959200.0, + "grad_norm": 1.7057196456581663, + "language_loss": 0.70008487, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.72146058, + "num_input_tokens_seen": 296036960, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10717773, + "step": 13727, + "time_per_iteration": 2.7000675201416016 + }, + { + "auxiliary_loss_clip": 0.0111311, + "auxiliary_loss_mlp": 0.01030091, + "balance_loss_clip": 1.03965402, + "balance_loss_mlp": 1.01800931, + "epoch": 0.8253720126258831, + "flos": 22325121629280.0, + "grad_norm": 6.494304051900525, + "language_loss": 0.62771785, + "learning_rate": 3.113566701515036e-07, + "loss": 0.64914978, + "num_input_tokens_seen": 296056540, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12078857, + "step": 13728, + "time_per_iteration": 2.6923890113830566 + }, + { + "auxiliary_loss_clip": 0.01116531, + "auxiliary_loss_mlp": 0.01030511, + "balance_loss_clip": 1.04091752, + "balance_loss_mlp": 1.01890016, + "epoch": 0.825432135878551, + "flos": 32698618254720.0, + "grad_norm": 1.7585683333449122, + "language_loss": 0.7139563, + "learning_rate": 3.111480143230092e-07, + "loss": 0.73542672, + "num_input_tokens_seen": 296077950, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.1161499, + "step": 13729, + "time_per_iteration": 2.6631200313568115 + }, + { + "auxiliary_loss_clip": 0.0103057, + "auxiliary_loss_mlp": 0.01000583, + "balance_loss_clip": 1.00812459, + "balance_loss_mlp": 0.99958271, + "epoch": 0.825492259131219, + "flos": 66158853416640.0, + "grad_norm": 0.870738348193881, + "language_loss": 0.62657553, + "learning_rate": 3.109394225359514e-07, + "loss": 0.646887, + "num_input_tokens_seen": 296127060, + "router_z_loss_clip": 0.2244873, + "router_z_loss_mlp": 0.01000214, + "step": 13730, + "time_per_iteration": 3.0610294342041016 + }, + { + "auxiliary_loss_clip": 0.01111327, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.03981662, + "balance_loss_mlp": 1.01986849, + "epoch": 0.825552382383887, + "flos": 53393715220320.0, + "grad_norm": 2.173909054165015, + "language_loss": 0.63261938, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.65404332, + "num_input_tokens_seen": 296147775, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11187744, + "step": 13731, + "time_per_iteration": 2.8431875705718994 + }, + { + "auxiliary_loss_clip": 0.01114652, + "auxiliary_loss_mlp": 0.01030258, + "balance_loss_clip": 1.03814054, + "balance_loss_mlp": 1.01890314, + "epoch": 0.825612505636555, + "flos": 15377520323520.0, + "grad_norm": 2.4327700286953164, + "language_loss": 0.69842494, + "learning_rate": 3.105224311177812e-07, + "loss": 0.71987396, + "num_input_tokens_seen": 296163560, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11358643, + "step": 13732, + "time_per_iteration": 2.6311399936676025 + }, + { + "auxiliary_loss_clip": 0.01113881, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.03837872, + "balance_loss_mlp": 1.02416825, + "epoch": 0.8256726288892229, + "flos": 21470651817120.0, + "grad_norm": 2.4156783620664712, + "language_loss": 0.7068997, + "learning_rate": 3.103140315024817e-07, + "loss": 0.72839552, + "num_input_tokens_seen": 296178730, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11541748, + "step": 13733, + "time_per_iteration": 2.5988285541534424 + }, + { + "auxiliary_loss_clip": 0.01107933, + "auxiliary_loss_mlp": 0.01027475, + "balance_loss_clip": 1.03674412, + "balance_loss_mlp": 1.0159893, + "epoch": 0.8257327521418909, + "flos": 29048086369440.0, + "grad_norm": 2.9898444012131007, + "language_loss": 0.82580435, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.84715843, + "num_input_tokens_seen": 296200175, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.1149292, + "step": 13734, + "time_per_iteration": 2.6541035175323486 + }, + { + "auxiliary_loss_clip": 0.01107754, + "auxiliary_loss_mlp": 0.010307, + "balance_loss_clip": 1.03727913, + "balance_loss_mlp": 1.0190711, + "epoch": 0.8257928753945588, + "flos": 23526773727840.0, + "grad_norm": 2.0606627434570646, + "language_loss": 0.82867891, + "learning_rate": 3.098974244989676e-07, + "loss": 0.85006344, + "num_input_tokens_seen": 296219305, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11621094, + "step": 13735, + "time_per_iteration": 2.655601978302002 + }, + { + "auxiliary_loss_clip": 0.01114122, + "auxiliary_loss_mlp": 0.01027287, + "balance_loss_clip": 1.04103446, + "balance_loss_mlp": 1.01685584, + "epoch": 0.8258529986472268, + "flos": 22547732330880.0, + "grad_norm": 1.8526456539158056, + "language_loss": 0.71128988, + "learning_rate": 3.096892171265497e-07, + "loss": 0.73270398, + "num_input_tokens_seen": 296236945, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.10437012, + "step": 13736, + "time_per_iteration": 2.618891477584839 + }, + { + "auxiliary_loss_clip": 0.01030174, + "auxiliary_loss_mlp": 0.01000691, + "balance_loss_clip": 1.00774419, + "balance_loss_mlp": 0.99971461, + "epoch": 0.8259131218998947, + "flos": 75821900109120.0, + "grad_norm": 0.857289773492398, + "language_loss": 0.67877293, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.6990816, + "num_input_tokens_seen": 296294685, + "router_z_loss_clip": 0.2244873, + "router_z_loss_mlp": 0.009758, + "step": 13737, + "time_per_iteration": 3.260988235473633 + }, + { + "auxiliary_loss_clip": 0.01112481, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.03901219, + "balance_loss_mlp": 1.02121603, + "epoch": 0.8259732451525628, + "flos": 27039039187680.0, + "grad_norm": 1.813140848282544, + "language_loss": 0.69375235, + "learning_rate": 3.0927299467987e-07, + "loss": 0.71519738, + "num_input_tokens_seen": 296314790, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.1081543, + "step": 13738, + "time_per_iteration": 2.6187331676483154 + }, + { + "auxiliary_loss_clip": 0.01115157, + "auxiliary_loss_mlp": 0.0102856, + "balance_loss_clip": 1.04108536, + "balance_loss_mlp": 1.01556599, + "epoch": 0.8260333684052307, + "flos": 46810435010400.0, + "grad_norm": 1.8975216490591469, + "language_loss": 0.62442565, + "learning_rate": 3.090649796213911e-07, + "loss": 0.64586282, + "num_input_tokens_seen": 296335355, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12988281, + "step": 13739, + "time_per_iteration": 2.7862095832824707 + }, + { + "auxiliary_loss_clip": 0.01031013, + "auxiliary_loss_mlp": 0.01000633, + "balance_loss_clip": 1.00850773, + "balance_loss_mlp": 0.99972808, + "epoch": 0.8260934916578987, + "flos": 75879023123520.0, + "grad_norm": 0.8264738506698278, + "language_loss": 0.5936265, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61394298, + "num_input_tokens_seen": 296399885, + "router_z_loss_clip": 0.22509766, + "router_z_loss_mlp": 0.00904846, + "step": 13740, + "time_per_iteration": 3.323737144470215 + }, + { + "auxiliary_loss_clip": 0.01114963, + "auxiliary_loss_mlp": 0.01031386, + "balance_loss_clip": 1.03927553, + "balance_loss_mlp": 1.0196321, + "epoch": 0.8261536149105667, + "flos": 27534536184960.0, + "grad_norm": 1.87919072397754, + "language_loss": 0.75020051, + "learning_rate": 3.086491418735959e-07, + "loss": 0.77166402, + "num_input_tokens_seen": 296417660, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11743164, + "step": 13741, + "time_per_iteration": 2.6344807147979736 + }, + { + "auxiliary_loss_clip": 0.01111875, + "auxiliary_loss_mlp": 0.01032217, + "balance_loss_clip": 1.03904486, + "balance_loss_mlp": 1.02092195, + "epoch": 0.8262137381632346, + "flos": 39688553767680.0, + "grad_norm": 2.510283639034287, + "language_loss": 0.62518376, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.64662468, + "num_input_tokens_seen": 296438255, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11297607, + "step": 13742, + "time_per_iteration": 2.7723283767700195 + }, + { + "auxiliary_loss_clip": 0.01116903, + "auxiliary_loss_mlp": 0.01033001, + "balance_loss_clip": 1.03983617, + "balance_loss_mlp": 1.02029335, + "epoch": 0.8262738614159026, + "flos": 17248868321760.0, + "grad_norm": 3.2857800717233636, + "language_loss": 0.65659666, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.67809576, + "num_input_tokens_seen": 296454485, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12713623, + "step": 13743, + "time_per_iteration": 2.5783398151397705 + }, + { + "auxiliary_loss_clip": 0.01114031, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.0409224, + "balance_loss_mlp": 1.02106833, + "epoch": 0.8263339846685706, + "flos": 24191161863840.0, + "grad_norm": 1.8901815447056756, + "language_loss": 0.66700268, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.6884675, + "num_input_tokens_seen": 296473740, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11383057, + "step": 13744, + "time_per_iteration": 2.665882110595703 + }, + { + "auxiliary_loss_clip": 0.01112003, + "auxiliary_loss_mlp": 0.01029035, + "balance_loss_clip": 1.03978074, + "balance_loss_mlp": 1.01801991, + "epoch": 0.8263941079212386, + "flos": 27754958953440.0, + "grad_norm": 2.5358786414127032, + "language_loss": 0.75476944, + "learning_rate": 3.078182360753612e-07, + "loss": 0.77617985, + "num_input_tokens_seen": 296493355, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11010742, + "step": 13745, + "time_per_iteration": 2.6452784538269043 + }, + { + "auxiliary_loss_clip": 0.01107489, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.03773785, + "balance_loss_mlp": 1.02067637, + "epoch": 0.8264542311739065, + "flos": 24551188128000.0, + "grad_norm": 1.932184809355741, + "language_loss": 0.79064035, + "learning_rate": 3.076106700253709e-07, + "loss": 0.81202072, + "num_input_tokens_seen": 296510520, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.09869385, + "step": 13746, + "time_per_iteration": 2.696479082107544 + }, + { + "auxiliary_loss_clip": 0.01118158, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.04237771, + "balance_loss_mlp": 1.02190936, + "epoch": 0.8265143544265745, + "flos": 20544398085600.0, + "grad_norm": 4.691524326289309, + "language_loss": 0.68096197, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.70248348, + "num_input_tokens_seen": 296528265, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12084961, + "step": 13747, + "time_per_iteration": 4.08221435546875 + }, + { + "auxiliary_loss_clip": 0.01112851, + "auxiliary_loss_mlp": 0.01029894, + "balance_loss_clip": 1.03889549, + "balance_loss_mlp": 1.01816344, + "epoch": 0.8265744776792424, + "flos": 26867149770240.0, + "grad_norm": 2.0750499617933618, + "language_loss": 0.75206399, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.77349144, + "num_input_tokens_seen": 296547810, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.1171875, + "step": 13748, + "time_per_iteration": 2.62347674369812 + }, + { + "auxiliary_loss_clip": 0.01108658, + "auxiliary_loss_mlp": 0.01030315, + "balance_loss_clip": 1.03924191, + "balance_loss_mlp": 1.020051, + "epoch": 0.8266346009319104, + "flos": 23482494691200.0, + "grad_norm": 3.00710939886836, + "language_loss": 0.63893306, + "learning_rate": 3.069883569603102e-07, + "loss": 0.66032279, + "num_input_tokens_seen": 296565940, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.10266113, + "step": 13749, + "time_per_iteration": 2.6719424724578857 + }, + { + "auxiliary_loss_clip": 0.01107435, + "auxiliary_loss_mlp": 0.01026286, + "balance_loss_clip": 1.03581977, + "balance_loss_mlp": 1.01527083, + "epoch": 0.8266947241845783, + "flos": 29488931906400.0, + "grad_norm": 2.0773405339523063, + "language_loss": 0.73789477, + "learning_rate": 3.067810476598132e-07, + "loss": 0.75923193, + "num_input_tokens_seen": 296585090, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11016846, + "step": 13750, + "time_per_iteration": 2.639920711517334 + }, + { + "auxiliary_loss_clip": 0.01112278, + "auxiliary_loss_mlp": 0.01032572, + "balance_loss_clip": 1.03895354, + "balance_loss_mlp": 1.02128863, + "epoch": 0.8267548474372464, + "flos": 25753366951200.0, + "grad_norm": 3.1717117032461375, + "language_loss": 0.66054082, + "learning_rate": 3.065738025663496e-07, + "loss": 0.68198931, + "num_input_tokens_seen": 296604950, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.112854, + "step": 13751, + "time_per_iteration": 2.68855619430542 + }, + { + "auxiliary_loss_clip": 0.01108473, + "auxiliary_loss_mlp": 0.01026488, + "balance_loss_clip": 1.03785062, + "balance_loss_mlp": 1.01601517, + "epoch": 0.8268149706899143, + "flos": 48770867806560.0, + "grad_norm": 2.1503375806432277, + "language_loss": 0.6042996, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.62564921, + "num_input_tokens_seen": 296627780, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10473633, + "step": 13752, + "time_per_iteration": 2.846989154815674 + }, + { + "auxiliary_loss_clip": 0.01030387, + "auxiliary_loss_mlp": 0.01000236, + "balance_loss_clip": 1.00801361, + "balance_loss_mlp": 0.99929786, + "epoch": 0.8268750939425823, + "flos": 80269535688480.0, + "grad_norm": 0.7776215212826445, + "language_loss": 0.57439679, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.59470296, + "num_input_tokens_seen": 296683850, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.0093689, + "step": 13753, + "time_per_iteration": 4.600157737731934 + }, + { + "auxiliary_loss_clip": 0.01030453, + "auxiliary_loss_mlp": 0.01002245, + "balance_loss_clip": 1.00799477, + "balance_loss_mlp": 1.00131226, + "epoch": 0.8269352171952503, + "flos": 64648301510880.0, + "grad_norm": 0.6956477018053203, + "language_loss": 0.54930007, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.56962705, + "num_input_tokens_seen": 296741420, + "router_z_loss_clip": 0.22485352, + "router_z_loss_mlp": 0.00933075, + "step": 13754, + "time_per_iteration": 3.375615119934082 + }, + { + "auxiliary_loss_clip": 0.01107643, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.03699803, + "balance_loss_mlp": 1.02313638, + "epoch": 0.8269953404479182, + "flos": 28157684080320.0, + "grad_norm": 2.498329067249389, + "language_loss": 0.6940397, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.71544313, + "num_input_tokens_seen": 296759620, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.09576416, + "step": 13755, + "time_per_iteration": 2.6323063373565674 + }, + { + "auxiliary_loss_clip": 0.01109105, + "auxiliary_loss_mlp": 0.01032602, + "balance_loss_clip": 1.03863966, + "balance_loss_mlp": 1.02239203, + "epoch": 0.8270554637005862, + "flos": 17338560878880.0, + "grad_norm": 2.3175425122204927, + "language_loss": 0.70013374, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.72155082, + "num_input_tokens_seen": 296777275, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10205078, + "step": 13756, + "time_per_iteration": 2.6454358100891113 + }, + { + "auxiliary_loss_clip": 0.01113523, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.04147768, + "balance_loss_mlp": 1.02238548, + "epoch": 0.8271155869532542, + "flos": 26554198235040.0, + "grad_norm": 2.019046031050044, + "language_loss": 0.72271937, + "learning_rate": 3.053316807931623e-07, + "loss": 0.74418986, + "num_input_tokens_seen": 296796655, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.1114502, + "step": 13757, + "time_per_iteration": 2.6590216159820557 + }, + { + "auxiliary_loss_clip": 0.01114183, + "auxiliary_loss_mlp": 0.01032217, + "balance_loss_clip": 1.0389545, + "balance_loss_mlp": 1.01928294, + "epoch": 0.8271757102059222, + "flos": 18449750592000.0, + "grad_norm": 2.422850401407608, + "language_loss": 0.68968272, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.71114671, + "num_input_tokens_seen": 296813705, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.1293335, + "step": 13758, + "time_per_iteration": 4.110937118530273 + }, + { + "auxiliary_loss_clip": 0.01105781, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.03609705, + "balance_loss_mlp": 1.01813984, + "epoch": 0.8272358334585901, + "flos": 29448299424960.0, + "grad_norm": 1.636172968198405, + "language_loss": 0.69874084, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.72008473, + "num_input_tokens_seen": 296833985, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10467529, + "step": 13759, + "time_per_iteration": 2.7091429233551025 + }, + { + "auxiliary_loss_clip": 0.01108379, + "auxiliary_loss_mlp": 0.01029468, + "balance_loss_clip": 1.03757322, + "balance_loss_mlp": 1.0179882, + "epoch": 0.8272959567112581, + "flos": 23172257813760.0, + "grad_norm": 1.725534386486864, + "language_loss": 0.71072042, + "learning_rate": 3.047114873375161e-07, + "loss": 0.73209888, + "num_input_tokens_seen": 296850150, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11486816, + "step": 13760, + "time_per_iteration": 2.6063923835754395 + }, + { + "auxiliary_loss_clip": 0.0110947, + "auxiliary_loss_mlp": 0.01026199, + "balance_loss_clip": 1.03971398, + "balance_loss_mlp": 1.01550615, + "epoch": 0.827356079963926, + "flos": 25174700678880.0, + "grad_norm": 1.9517565860291868, + "language_loss": 0.77639186, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.79774857, + "num_input_tokens_seen": 296869585, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10693359, + "step": 13761, + "time_per_iteration": 4.063249111175537 + }, + { + "auxiliary_loss_clip": 0.01107686, + "auxiliary_loss_mlp": 0.01027666, + "balance_loss_clip": 1.03815961, + "balance_loss_mlp": 1.01730657, + "epoch": 0.827416203216594, + "flos": 27353165724000.0, + "grad_norm": 1.6724698042229236, + "language_loss": 0.69893324, + "learning_rate": 3.042983464482387e-07, + "loss": 0.72028679, + "num_input_tokens_seen": 296887710, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10357666, + "step": 13762, + "time_per_iteration": 2.650110960006714 + }, + { + "auxiliary_loss_clip": 0.01108247, + "auxiliary_loss_mlp": 0.0102477, + "balance_loss_clip": 1.03674889, + "balance_loss_mlp": 1.01435733, + "epoch": 0.827476326469262, + "flos": 23215645470240.0, + "grad_norm": 1.9444197471600901, + "language_loss": 0.70253009, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.7238602, + "num_input_tokens_seen": 296906265, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10424805, + "step": 13763, + "time_per_iteration": 2.5839438438415527 + }, + { + "auxiliary_loss_clip": 0.01030145, + "auxiliary_loss_mlp": 0.01001892, + "balance_loss_clip": 1.00779986, + "balance_loss_mlp": 1.00094628, + "epoch": 0.82753644972193, + "flos": 83584756850400.0, + "grad_norm": 0.834853150880662, + "language_loss": 0.65052879, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67084914, + "num_input_tokens_seen": 296971290, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00945282, + "step": 13764, + "time_per_iteration": 3.4291012287139893 + }, + { + "auxiliary_loss_clip": 0.01113379, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.03965271, + "balance_loss_mlp": 1.02029347, + "epoch": 0.8275965729745979, + "flos": 22458161325600.0, + "grad_norm": 2.2083535814466173, + "language_loss": 0.78009719, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.80155009, + "num_input_tokens_seen": 296989060, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1161499, + "step": 13765, + "time_per_iteration": 2.6483962535858154 + }, + { + "auxiliary_loss_clip": 0.01113387, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.03860927, + "balance_loss_mlp": 1.01768708, + "epoch": 0.8276566962272659, + "flos": 34791077815200.0, + "grad_norm": 1.5718463664683004, + "language_loss": 0.62058371, + "learning_rate": 3.034728363464214e-07, + "loss": 0.64201194, + "num_input_tokens_seen": 297011300, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11755371, + "step": 13766, + "time_per_iteration": 2.692810297012329 + }, + { + "auxiliary_loss_clip": 0.01110843, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.03840446, + "balance_loss_mlp": 1.01810503, + "epoch": 0.8277168194799339, + "flos": 24684916618080.0, + "grad_norm": 1.7305328602305614, + "language_loss": 0.82578504, + "learning_rate": 3.03266619632609e-07, + "loss": 0.8471899, + "num_input_tokens_seen": 297030350, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11535645, + "step": 13767, + "time_per_iteration": 2.6624932289123535 + }, + { + "auxiliary_loss_clip": 0.01114, + "auxiliary_loss_mlp": 0.01025504, + "balance_loss_clip": 1.04065084, + "balance_loss_mlp": 1.01413691, + "epoch": 0.8277769427326018, + "flos": 34747730676000.0, + "grad_norm": 1.688530624482271, + "language_loss": 0.69118935, + "learning_rate": 3.030604672590964e-07, + "loss": 0.71258438, + "num_input_tokens_seen": 297049710, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11358643, + "step": 13768, + "time_per_iteration": 2.762364149093628 + }, + { + "auxiliary_loss_clip": 0.01107847, + "auxiliary_loss_mlp": 0.01027828, + "balance_loss_clip": 1.037076, + "balance_loss_mlp": 1.01732612, + "epoch": 0.8278370659852698, + "flos": 33187186797120.0, + "grad_norm": 2.044248987317444, + "language_loss": 0.74041033, + "learning_rate": 3.028543792337006e-07, + "loss": 0.76176709, + "num_input_tokens_seen": 297070510, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.1050415, + "step": 13769, + "time_per_iteration": 2.77325439453125 + }, + { + "auxiliary_loss_clip": 0.0111104, + "auxiliary_loss_mlp": 0.01026228, + "balance_loss_clip": 1.03856492, + "balance_loss_mlp": 1.01518893, + "epoch": 0.8278971892379378, + "flos": 46144588252320.0, + "grad_norm": 2.3877408679608227, + "language_loss": 0.74147379, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.76284653, + "num_input_tokens_seen": 297092585, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11047363, + "step": 13770, + "time_per_iteration": 2.7963287830352783 + }, + { + "auxiliary_loss_clip": 0.01112174, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.0389266, + "balance_loss_mlp": 1.01884055, + "epoch": 0.8279573124906058, + "flos": 27528539627520.0, + "grad_norm": 2.2177459109977633, + "language_loss": 0.75859082, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.78001398, + "num_input_tokens_seen": 297110055, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11309814, + "step": 13771, + "time_per_iteration": 2.7046456336975098 + }, + { + "auxiliary_loss_clip": 0.01110218, + "auxiliary_loss_mlp": 0.0103062, + "balance_loss_clip": 1.03773308, + "balance_loss_mlp": 1.01951003, + "epoch": 0.8280174357432737, + "flos": 44007282413280.0, + "grad_norm": 1.531507243124842, + "language_loss": 0.72829914, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.74970752, + "num_input_tokens_seen": 297132170, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11114502, + "step": 13772, + "time_per_iteration": 2.7664451599121094 + }, + { + "auxiliary_loss_clip": 0.01108801, + "auxiliary_loss_mlp": 0.01026347, + "balance_loss_clip": 1.03805733, + "balance_loss_mlp": 1.01487291, + "epoch": 0.8280775589959417, + "flos": 28017796963680.0, + "grad_norm": 2.869009180923734, + "language_loss": 0.74763536, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.76898682, + "num_input_tokens_seen": 297149515, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11480713, + "step": 13773, + "time_per_iteration": 2.7414002418518066 + }, + { + "auxiliary_loss_clip": 0.01109837, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.03935623, + "balance_loss_mlp": 1.02001739, + "epoch": 0.8281376822486096, + "flos": 31803272857440.0, + "grad_norm": 1.9713865864333395, + "language_loss": 0.75519627, + "learning_rate": 3.01824904601915e-07, + "loss": 0.77660334, + "num_input_tokens_seen": 297170320, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10864258, + "step": 13774, + "time_per_iteration": 2.7212166786193848 + }, + { + "auxiliary_loss_clip": 0.01115897, + "auxiliary_loss_mlp": 0.01025139, + "balance_loss_clip": 1.04070354, + "balance_loss_mlp": 1.01425552, + "epoch": 0.8281978055012776, + "flos": 25218858163680.0, + "grad_norm": 1.6805074754331102, + "language_loss": 0.7533958, + "learning_rate": 3.01619202829249e-07, + "loss": 0.7748062, + "num_input_tokens_seen": 297189935, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.10876465, + "step": 13775, + "time_per_iteration": 2.6821906566619873 + }, + { + "auxiliary_loss_clip": 0.01114054, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.03812969, + "balance_loss_mlp": 1.01812339, + "epoch": 0.8282579287539455, + "flos": 35771172661440.0, + "grad_norm": 17.14186038961389, + "language_loss": 0.73812032, + "learning_rate": 3.01413565459353e-07, + "loss": 0.75956357, + "num_input_tokens_seen": 297210885, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12145996, + "step": 13776, + "time_per_iteration": 2.6755146980285645 + }, + { + "auxiliary_loss_clip": 0.01110178, + "auxiliary_loss_mlp": 0.01024636, + "balance_loss_clip": 1.03650522, + "balance_loss_mlp": 1.01414585, + "epoch": 0.8283180520066136, + "flos": 19164292770240.0, + "grad_norm": 2.320492358467925, + "language_loss": 0.77862877, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.79997689, + "num_input_tokens_seen": 297228500, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.10491943, + "step": 13777, + "time_per_iteration": 2.666497230529785 + }, + { + "auxiliary_loss_clip": 0.01107921, + "auxiliary_loss_mlp": 0.01027189, + "balance_loss_clip": 1.03904617, + "balance_loss_mlp": 1.0169431, + "epoch": 0.8283781752592815, + "flos": 30250832434560.0, + "grad_norm": 1.5795616052913066, + "language_loss": 0.82589722, + "learning_rate": 3.010024839590604e-07, + "loss": 0.84724832, + "num_input_tokens_seen": 297249470, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10241699, + "step": 13778, + "time_per_iteration": 2.6425704956054688 + }, + { + "auxiliary_loss_clip": 0.0110689, + "auxiliary_loss_mlp": 0.01024286, + "balance_loss_clip": 1.03721881, + "balance_loss_mlp": 1.01343155, + "epoch": 0.8284382985119495, + "flos": 23163344012160.0, + "grad_norm": 2.401118572621773, + "language_loss": 0.7429527, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.7642644, + "num_input_tokens_seen": 297265970, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10858154, + "step": 13779, + "time_per_iteration": 2.68996000289917 + }, + { + "auxiliary_loss_clip": 0.01030236, + "auxiliary_loss_mlp": 0.01001064, + "balance_loss_clip": 1.0079217, + "balance_loss_mlp": 1.00010014, + "epoch": 0.8284984217646175, + "flos": 74472790512960.0, + "grad_norm": 0.764719020500758, + "language_loss": 0.56727904, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.58759207, + "num_input_tokens_seen": 297325525, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.0096283, + "step": 13780, + "time_per_iteration": 3.2956202030181885 + }, + { + "auxiliary_loss_clip": 0.0111111, + "auxiliary_loss_mlp": 0.01023824, + "balance_loss_clip": 1.03811395, + "balance_loss_mlp": 1.01215339, + "epoch": 0.8285585450172854, + "flos": 24056177338080.0, + "grad_norm": 1.9363389235214232, + "language_loss": 0.79638207, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.81773138, + "num_input_tokens_seen": 297345025, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11676025, + "step": 13781, + "time_per_iteration": 2.6300697326660156 + }, + { + "auxiliary_loss_clip": 0.01114704, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.04146814, + "balance_loss_mlp": 1.01486635, + "epoch": 0.8286186682699535, + "flos": 26464951368000.0, + "grad_norm": 2.1836012315801443, + "language_loss": 0.75536811, + "learning_rate": 3.001810941346543e-07, + "loss": 0.77678651, + "num_input_tokens_seen": 297363570, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.1227417, + "step": 13782, + "time_per_iteration": 2.6664891242980957 + }, + { + "auxiliary_loss_clip": 0.01109367, + "auxiliary_loss_mlp": 0.01028082, + "balance_loss_clip": 1.03620493, + "balance_loss_mlp": 1.01672137, + "epoch": 0.8286787915226214, + "flos": 31451795739360.0, + "grad_norm": 1.5486348386939666, + "language_loss": 0.76513076, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.78650528, + "num_input_tokens_seen": 297385385, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11364746, + "step": 13783, + "time_per_iteration": 2.7277987003326416 + }, + { + "auxiliary_loss_clip": 0.01109481, + "auxiliary_loss_mlp": 0.0102748, + "balance_loss_clip": 1.03683364, + "balance_loss_mlp": 1.01633406, + "epoch": 0.8287389147752894, + "flos": 25976099204640.0, + "grad_norm": 6.103291899383868, + "language_loss": 0.73477399, + "learning_rate": 2.997707859351304e-07, + "loss": 0.75614357, + "num_input_tokens_seen": 297403950, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.1114502, + "step": 13784, + "time_per_iteration": 2.719515562057495 + }, + { + "auxiliary_loss_clip": 0.01113374, + "auxiliary_loss_mlp": 0.01035307, + "balance_loss_clip": 1.03731656, + "balance_loss_mlp": 1.02295685, + "epoch": 0.8287990380279573, + "flos": 40932702142560.0, + "grad_norm": 1.6152522403527005, + "language_loss": 0.70219743, + "learning_rate": 2.99565728540772e-07, + "loss": 0.72368419, + "num_input_tokens_seen": 297424565, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12347412, + "step": 13785, + "time_per_iteration": 2.715181350708008 + }, + { + "auxiliary_loss_clip": 0.01113418, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.04091203, + "balance_loss_mlp": 1.02006173, + "epoch": 0.8288591612806253, + "flos": 28023753003840.0, + "grad_norm": 1.6071595014285391, + "language_loss": 0.68675822, + "learning_rate": 2.993607356270516e-07, + "loss": 0.7082029, + "num_input_tokens_seen": 297445180, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10986328, + "step": 13786, + "time_per_iteration": 4.100902557373047 + }, + { + "auxiliary_loss_clip": 0.01115198, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.03910327, + "balance_loss_mlp": 1.01967692, + "epoch": 0.8289192845332932, + "flos": 22682068580160.0, + "grad_norm": 1.81681307503694, + "language_loss": 0.77479339, + "learning_rate": 2.991558072017426e-07, + "loss": 0.7962566, + "num_input_tokens_seen": 297463790, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11456299, + "step": 13787, + "time_per_iteration": 2.659329891204834 + }, + { + "auxiliary_loss_clip": 0.01110952, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.0399121, + "balance_loss_mlp": 1.02221417, + "epoch": 0.8289794077859612, + "flos": 18852232615200.0, + "grad_norm": 2.684687823559268, + "language_loss": 0.80915177, + "learning_rate": 2.989509432726163e-07, + "loss": 0.83058953, + "num_input_tokens_seen": 297480100, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10614014, + "step": 13788, + "time_per_iteration": 2.6110341548919678 + }, + { + "auxiliary_loss_clip": 0.01110613, + "auxiliary_loss_mlp": 0.01031318, + "balance_loss_clip": 1.03877068, + "balance_loss_mlp": 1.02042842, + "epoch": 0.8290395310386292, + "flos": 35237352667680.0, + "grad_norm": 1.5321863124823814, + "language_loss": 0.71149909, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.73291838, + "num_input_tokens_seen": 297499890, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10900879, + "step": 13789, + "time_per_iteration": 2.7579071521759033 + }, + { + "auxiliary_loss_clip": 0.01111004, + "auxiliary_loss_mlp": 0.01030045, + "balance_loss_clip": 1.03616881, + "balance_loss_mlp": 1.01839209, + "epoch": 0.8290996542912972, + "flos": 44631564792480.0, + "grad_norm": 1.9672996230937567, + "language_loss": 0.68121099, + "learning_rate": 2.985414089339813e-07, + "loss": 0.70262146, + "num_input_tokens_seen": 297521440, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11651611, + "step": 13790, + "time_per_iteration": 2.741288185119629 + }, + { + "auxiliary_loss_clip": 0.0111116, + "auxiliary_loss_mlp": 0.01033844, + "balance_loss_clip": 1.03655338, + "balance_loss_mlp": 1.02126777, + "epoch": 0.8291597775439651, + "flos": 28825070495040.0, + "grad_norm": 4.180699681255427, + "language_loss": 0.77219743, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.79364753, + "num_input_tokens_seen": 297539920, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12573242, + "step": 13791, + "time_per_iteration": 2.648721694946289 + }, + { + "auxiliary_loss_clip": 0.01109408, + "auxiliary_loss_mlp": 0.01026652, + "balance_loss_clip": 1.03899622, + "balance_loss_mlp": 1.01490974, + "epoch": 0.8292199007966331, + "flos": 26109300970080.0, + "grad_norm": 2.6137932288326957, + "language_loss": 0.69834524, + "learning_rate": 2.981321326732651e-07, + "loss": 0.71970582, + "num_input_tokens_seen": 297560000, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11737061, + "step": 13792, + "time_per_iteration": 4.108422756195068 + }, + { + "auxiliary_loss_clip": 0.01111342, + "auxiliary_loss_mlp": 0.01029034, + "balance_loss_clip": 1.03694844, + "balance_loss_mlp": 1.01758385, + "epoch": 0.829280024049301, + "flos": 35102894866560.0, + "grad_norm": 1.6271800904127662, + "language_loss": 0.64961398, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.67101777, + "num_input_tokens_seen": 297579300, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11456299, + "step": 13793, + "time_per_iteration": 2.711312770843506 + }, + { + "auxiliary_loss_clip": 0.01113503, + "auxiliary_loss_mlp": 0.01026963, + "balance_loss_clip": 1.03817976, + "balance_loss_mlp": 1.01539946, + "epoch": 0.829340147301969, + "flos": 24328415357280.0, + "grad_norm": 1.9367581567820218, + "language_loss": 0.66756958, + "learning_rate": 2.977231145525461e-07, + "loss": 0.68897426, + "num_input_tokens_seen": 297598095, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11553955, + "step": 13794, + "time_per_iteration": 2.6499392986297607 + }, + { + "auxiliary_loss_clip": 0.01109029, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.03667939, + "balance_loss_mlp": 1.02036929, + "epoch": 0.829400270554637, + "flos": 30784287772800.0, + "grad_norm": 1.9790113565547816, + "language_loss": 0.66396666, + "learning_rate": 2.975187023140757e-07, + "loss": 0.68537211, + "num_input_tokens_seen": 297615955, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11151123, + "step": 13795, + "time_per_iteration": 2.6619651317596436 + }, + { + "auxiliary_loss_clip": 0.01106496, + "auxiliary_loss_mlp": 0.01029728, + "balance_loss_clip": 1.03848803, + "balance_loss_mlp": 1.01884985, + "epoch": 0.829460393807305, + "flos": 29850659896320.0, + "grad_norm": 1.7484841327479832, + "language_loss": 0.65867716, + "learning_rate": 2.973143546338661e-07, + "loss": 0.68003941, + "num_input_tokens_seen": 297636285, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.10882568, + "step": 13796, + "time_per_iteration": 2.7284116744995117 + }, + { + "auxiliary_loss_clip": 0.01109517, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.03832769, + "balance_loss_mlp": 1.01868677, + "epoch": 0.829520517059973, + "flos": 18452019559680.0, + "grad_norm": 1.7200977374858613, + "language_loss": 0.71724743, + "learning_rate": 2.971100715196666e-07, + "loss": 0.73863959, + "num_input_tokens_seen": 297653315, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11022949, + "step": 13797, + "time_per_iteration": 4.119750738143921 + }, + { + "auxiliary_loss_clip": 0.01112291, + "auxiliary_loss_mlp": 0.01029145, + "balance_loss_clip": 1.03923428, + "balance_loss_mlp": 1.01875639, + "epoch": 0.8295806403126409, + "flos": 26331992706240.0, + "grad_norm": 2.006100931162229, + "language_loss": 0.72227782, + "learning_rate": 2.969058529792243e-07, + "loss": 0.74369216, + "num_input_tokens_seen": 297673480, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.10394287, + "step": 13798, + "time_per_iteration": 2.6271846294403076 + }, + { + "auxiliary_loss_clip": 0.01104921, + "auxiliary_loss_mlp": 0.01028869, + "balance_loss_clip": 1.03601289, + "balance_loss_mlp": 1.01810455, + "epoch": 0.8296407635653089, + "flos": 26510810578560.0, + "grad_norm": 2.023466987944689, + "language_loss": 0.7625438, + "learning_rate": 2.967016990202822e-07, + "loss": 0.78388166, + "num_input_tokens_seen": 297693250, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10760498, + "step": 13799, + "time_per_iteration": 2.657778024673462 + }, + { + "auxiliary_loss_clip": 0.01110192, + "auxiliary_loss_mlp": 0.01030262, + "balance_loss_clip": 1.03874922, + "balance_loss_mlp": 1.01918125, + "epoch": 0.8297008868179768, + "flos": 13642574955840.0, + "grad_norm": 1.9058072249640419, + "language_loss": 0.67405802, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.69546258, + "num_input_tokens_seen": 297710975, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11083984, + "step": 13800, + "time_per_iteration": 2.5943210124969482 + }, + { + "auxiliary_loss_clip": 0.01116502, + "auxiliary_loss_mlp": 0.01034708, + "balance_loss_clip": 1.04119778, + "balance_loss_mlp": 1.02204168, + "epoch": 0.8297610100706448, + "flos": 25214644366560.0, + "grad_norm": 2.894896250517594, + "language_loss": 0.74558508, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.76709718, + "num_input_tokens_seen": 297730860, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12664795, + "step": 13801, + "time_per_iteration": 4.01429295539856 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01027368, + "balance_loss_clip": 1.03807855, + "balance_loss_mlp": 1.01686597, + "epoch": 0.8298211333233128, + "flos": 24863167248480.0, + "grad_norm": 1.4670025334388372, + "language_loss": 0.73340893, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.7547974, + "num_input_tokens_seen": 297749765, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.10498047, + "step": 13802, + "time_per_iteration": 2.692329168319702 + }, + { + "auxiliary_loss_clip": 0.01110858, + "auxiliary_loss_mlp": 0.01029524, + "balance_loss_clip": 1.03778613, + "balance_loss_mlp": 1.01867568, + "epoch": 0.8298812565759808, + "flos": 26244852737760.0, + "grad_norm": 1.6013689972876504, + "language_loss": 0.74860668, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.77001047, + "num_input_tokens_seen": 297770380, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10852051, + "step": 13803, + "time_per_iteration": 2.626924991607666 + }, + { + "auxiliary_loss_clip": 0.01112247, + "auxiliary_loss_mlp": 0.01029802, + "balance_loss_clip": 1.04018927, + "balance_loss_mlp": 1.01881123, + "epoch": 0.8299413798286487, + "flos": 27843719613120.0, + "grad_norm": 1.9872920730586419, + "language_loss": 0.79213476, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.8135553, + "num_input_tokens_seen": 297789440, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10998535, + "step": 13804, + "time_per_iteration": 2.676398992538452 + }, + { + "auxiliary_loss_clip": 0.01109715, + "auxiliary_loss_mlp": 0.01027501, + "balance_loss_clip": 1.03835607, + "balance_loss_mlp": 1.01723731, + "epoch": 0.8300015030813167, + "flos": 36215340615360.0, + "grad_norm": 1.7546687060716786, + "language_loss": 0.73410928, + "learning_rate": 2.954781319115016e-07, + "loss": 0.75548148, + "num_input_tokens_seen": 297810425, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.1026001, + "step": 13805, + "time_per_iteration": 2.678941249847412 + }, + { + "auxiliary_loss_clip": 0.01114029, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.03986287, + "balance_loss_mlp": 1.01903439, + "epoch": 0.8300616263339846, + "flos": 24061647170880.0, + "grad_norm": 2.1319764998940114, + "language_loss": 0.77530372, + "learning_rate": 2.952744302396906e-07, + "loss": 0.79674608, + "num_input_tokens_seen": 297827680, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11175537, + "step": 13806, + "time_per_iteration": 2.6772117614746094 + }, + { + "auxiliary_loss_clip": 0.0111646, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.04155374, + "balance_loss_mlp": 1.01899767, + "epoch": 0.8301217495866526, + "flos": 24284298389760.0, + "grad_norm": 1.9349062868442342, + "language_loss": 0.62940967, + "learning_rate": 2.950707932112444e-07, + "loss": 0.65088606, + "num_input_tokens_seen": 297848005, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12188721, + "step": 13807, + "time_per_iteration": 2.6308650970458984 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.0102593, + "balance_loss_clip": 1.04127979, + "balance_loss_mlp": 1.01479006, + "epoch": 0.8301818728393207, + "flos": 24057473891040.0, + "grad_norm": 4.589544392477137, + "language_loss": 0.73282218, + "learning_rate": 2.948672208338847e-07, + "loss": 0.75421107, + "num_input_tokens_seen": 297866730, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11132812, + "step": 13808, + "time_per_iteration": 2.7120516300201416 + }, + { + "auxiliary_loss_clip": 0.01118209, + "auxiliary_loss_mlp": 0.01040664, + "balance_loss_clip": 1.04224205, + "balance_loss_mlp": 1.02858174, + "epoch": 0.8302419960919886, + "flos": 34524512215200.0, + "grad_norm": 3.514215536065313, + "language_loss": 0.66430461, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.68589336, + "num_input_tokens_seen": 297886390, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12078857, + "step": 13809, + "time_per_iteration": 2.6868069171905518 + }, + { + "auxiliary_loss_clip": 0.01111128, + "auxiliary_loss_mlp": 0.01022206, + "balance_loss_clip": 1.03768694, + "balance_loss_mlp": 1.01153064, + "epoch": 0.8303021193446566, + "flos": 22236077348640.0, + "grad_norm": 5.625847223156463, + "language_loss": 0.74267733, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.76401067, + "num_input_tokens_seen": 297905110, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10681152, + "step": 13810, + "time_per_iteration": 2.6376287937164307 + }, + { + "auxiliary_loss_clip": 0.01109895, + "auxiliary_loss_mlp": 0.01036008, + "balance_loss_clip": 1.03992915, + "balance_loss_mlp": 1.02596998, + "epoch": 0.8303622425973245, + "flos": 28200869150400.0, + "grad_norm": 1.5701443797669294, + "language_loss": 0.81069404, + "learning_rate": 2.94256891685505e-07, + "loss": 0.83215302, + "num_input_tokens_seen": 297925460, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.10040283, + "step": 13811, + "time_per_iteration": 2.745973587036133 + }, + { + "auxiliary_loss_clip": 0.01114487, + "auxiliary_loss_mlp": 0.01036626, + "balance_loss_clip": 1.04095435, + "balance_loss_mlp": 1.02614164, + "epoch": 0.8304223658499925, + "flos": 23882788781280.0, + "grad_norm": 2.730806342355887, + "language_loss": 0.73387039, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.75538146, + "num_input_tokens_seen": 297941760, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10479736, + "step": 13812, + "time_per_iteration": 2.652724504470825 + }, + { + "auxiliary_loss_clip": 0.01108806, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.03896451, + "balance_loss_mlp": 1.02009892, + "epoch": 0.8304824891026604, + "flos": 29805043789440.0, + "grad_norm": 1.6387291125484968, + "language_loss": 0.78343534, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.80483556, + "num_input_tokens_seen": 297959745, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.11114502, + "step": 13813, + "time_per_iteration": 2.664296865463257 + }, + { + "auxiliary_loss_clip": 0.01112392, + "auxiliary_loss_mlp": 0.01025011, + "balance_loss_clip": 1.03814387, + "balance_loss_mlp": 1.01339996, + "epoch": 0.8305426123553284, + "flos": 27310507378560.0, + "grad_norm": 2.016516224175614, + "language_loss": 0.71147168, + "learning_rate": 2.93647144674658e-07, + "loss": 0.73284572, + "num_input_tokens_seen": 297977665, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.1161499, + "step": 13814, + "time_per_iteration": 2.672018051147461 + }, + { + "auxiliary_loss_clip": 0.01118616, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.03997445, + "balance_loss_mlp": 1.02194095, + "epoch": 0.8306027356079964, + "flos": 18184886717760.0, + "grad_norm": 2.1132984932059142, + "language_loss": 0.67922109, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.70075941, + "num_input_tokens_seen": 297993525, + "router_z_loss_clip": 0.78613281, + "router_z_loss_mlp": 0.13275146, + "step": 13815, + "time_per_iteration": 2.6515443325042725 + }, + { + "auxiliary_loss_clip": 0.01113085, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.0398072, + "balance_loss_mlp": 1.01956761, + "epoch": 0.8306628588606644, + "flos": 23970414957120.0, + "grad_norm": 3.4190523489055336, + "language_loss": 0.75687712, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.77831972, + "num_input_tokens_seen": 298012920, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.1161499, + "step": 13816, + "time_per_iteration": 2.651398181915283 + }, + { + "auxiliary_loss_clip": 0.01109581, + "auxiliary_loss_mlp": 0.01028968, + "balance_loss_clip": 1.03782606, + "balance_loss_mlp": 1.01856112, + "epoch": 0.8307229821133323, + "flos": 29760643200960.0, + "grad_norm": 1.749283188222412, + "language_loss": 0.81472856, + "learning_rate": 2.930379800094371e-07, + "loss": 0.83611405, + "num_input_tokens_seen": 298033310, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.10412598, + "step": 13817, + "time_per_iteration": 2.7000906467437744 + }, + { + "auxiliary_loss_clip": 0.01114499, + "auxiliary_loss_mlp": 0.01035162, + "balance_loss_clip": 1.04028368, + "balance_loss_mlp": 1.02311039, + "epoch": 0.8307831053660003, + "flos": 25620448806720.0, + "grad_norm": 1.635303249640784, + "language_loss": 0.78084099, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.80233765, + "num_input_tokens_seen": 298053530, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12042236, + "step": 13818, + "time_per_iteration": 2.6000587940216064 + }, + { + "auxiliary_loss_clip": 0.01113256, + "auxiliary_loss_mlp": 0.01032435, + "balance_loss_clip": 1.03994882, + "balance_loss_mlp": 1.02121699, + "epoch": 0.8308432286186682, + "flos": 26109098383680.0, + "grad_norm": 1.8011460827758043, + "language_loss": 0.82567966, + "learning_rate": 2.926321938606453e-07, + "loss": 0.84713656, + "num_input_tokens_seen": 298069305, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11230469, + "step": 13819, + "time_per_iteration": 2.674687385559082 + }, + { + "auxiliary_loss_clip": 0.0102971, + "auxiliary_loss_mlp": 0.01001916, + "balance_loss_clip": 1.00735617, + "balance_loss_mlp": 1.00094926, + "epoch": 0.8309033518713362, + "flos": 76305282439680.0, + "grad_norm": 0.7652966338498169, + "language_loss": 0.56254953, + "learning_rate": 2.924293978977399e-07, + "loss": 0.58286577, + "num_input_tokens_seen": 298125830, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00965118, + "step": 13820, + "time_per_iteration": 3.2485759258270264 + }, + { + "auxiliary_loss_clip": 0.01106071, + "auxiliary_loss_mlp": 0.01021431, + "balance_loss_clip": 1.036713, + "balance_loss_mlp": 1.01035666, + "epoch": 0.8309634751240043, + "flos": 20717989228800.0, + "grad_norm": 1.9388156651015176, + "language_loss": 0.6829316, + "learning_rate": 2.922266666860831e-07, + "loss": 0.70420665, + "num_input_tokens_seen": 298142320, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11071777, + "step": 13821, + "time_per_iteration": 2.662085771560669 + }, + { + "auxiliary_loss_clip": 0.01112775, + "auxiliary_loss_mlp": 0.01031722, + "balance_loss_clip": 1.03812695, + "balance_loss_mlp": 1.02020645, + "epoch": 0.8310235983766722, + "flos": 27667575881280.0, + "grad_norm": 1.9633497399366922, + "language_loss": 0.69139618, + "learning_rate": 2.920240002333625e-07, + "loss": 0.71284115, + "num_input_tokens_seen": 298161845, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11517334, + "step": 13822, + "time_per_iteration": 2.6859731674194336 + }, + { + "auxiliary_loss_clip": 0.011087, + "auxiliary_loss_mlp": 0.01031776, + "balance_loss_clip": 1.03819656, + "balance_loss_mlp": 1.02130318, + "epoch": 0.8310837216293402, + "flos": 37596904552800.0, + "grad_norm": 2.2151918734680622, + "language_loss": 0.62449068, + "learning_rate": 2.918213985472631e-07, + "loss": 0.64589548, + "num_input_tokens_seen": 298184165, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.10473633, + "step": 13823, + "time_per_iteration": 2.6864399909973145 + }, + { + "auxiliary_loss_clip": 0.01029873, + "auxiliary_loss_mlp": 0.01001631, + "balance_loss_clip": 1.00754654, + "balance_loss_mlp": 1.00070238, + "epoch": 0.8311438448820081, + "flos": 86971397276160.0, + "grad_norm": 1.111655586002279, + "language_loss": 0.61878026, + "learning_rate": 2.916188616354669e-07, + "loss": 0.63909531, + "num_input_tokens_seen": 298251720, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00928497, + "step": 13824, + "time_per_iteration": 3.333073854446411 + }, + { + "auxiliary_loss_clip": 0.01111145, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.03902256, + "balance_loss_mlp": 1.01902688, + "epoch": 0.8312039681346761, + "flos": 25486436695680.0, + "grad_norm": 1.9024649216533676, + "language_loss": 0.74161673, + "learning_rate": 2.914163895056552e-07, + "loss": 0.76302671, + "num_input_tokens_seen": 298271910, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.1083374, + "step": 13825, + "time_per_iteration": 2.5985217094421387 + }, + { + "auxiliary_loss_clip": 0.01112601, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.03867531, + "balance_loss_mlp": 1.02238476, + "epoch": 0.831264091387344, + "flos": 20764699302240.0, + "grad_norm": 1.8605689651029673, + "language_loss": 0.80005699, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.82152194, + "num_input_tokens_seen": 298288105, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.1151123, + "step": 13826, + "time_per_iteration": 4.09131121635437 + }, + { + "auxiliary_loss_clip": 0.01109244, + "auxiliary_loss_mlp": 0.01026695, + "balance_loss_clip": 1.03731036, + "balance_loss_mlp": 1.01534581, + "epoch": 0.831324214640012, + "flos": 29797669644480.0, + "grad_norm": 1.7017570257961971, + "language_loss": 0.68196762, + "learning_rate": 2.910116396226914e-07, + "loss": 0.70332694, + "num_input_tokens_seen": 298307600, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11364746, + "step": 13827, + "time_per_iteration": 2.6385960578918457 + }, + { + "auxiliary_loss_clip": 0.01109704, + "auxiliary_loss_mlp": 0.0102954, + "balance_loss_clip": 1.03739047, + "balance_loss_mlp": 1.01900792, + "epoch": 0.83138433789268, + "flos": 16528491655200.0, + "grad_norm": 2.165207461888946, + "language_loss": 0.73691392, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.75830632, + "num_input_tokens_seen": 298323055, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10540771, + "step": 13828, + "time_per_iteration": 2.6736366748809814 + }, + { + "auxiliary_loss_clip": 0.01111457, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.0371201, + "balance_loss_mlp": 1.02101219, + "epoch": 0.831444461145348, + "flos": 54291532171680.0, + "grad_norm": 3.071239481540392, + "language_loss": 0.67292833, + "learning_rate": 2.906071489597657e-07, + "loss": 0.69436657, + "num_input_tokens_seen": 298346950, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11358643, + "step": 13829, + "time_per_iteration": 2.8453221321105957 + }, + { + "auxiliary_loss_clip": 0.01113493, + "auxiliary_loss_mlp": 0.01029605, + "balance_loss_clip": 1.03775263, + "balance_loss_mlp": 1.01802397, + "epoch": 0.8315045843980159, + "flos": 27705412670400.0, + "grad_norm": 1.7972934383626271, + "language_loss": 0.82718766, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.84861869, + "num_input_tokens_seen": 298366315, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11578369, + "step": 13830, + "time_per_iteration": 2.6721138954162598 + }, + { + "auxiliary_loss_clip": 0.01110521, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.03823566, + "balance_loss_mlp": 1.01836097, + "epoch": 0.8315647076506839, + "flos": 20588758156800.0, + "grad_norm": 2.3319268050559545, + "language_loss": 0.74197376, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.76337266, + "num_input_tokens_seen": 298385185, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11016846, + "step": 13831, + "time_per_iteration": 2.6564245223999023 + }, + { + "auxiliary_loss_clip": 0.01112197, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.0394969, + "balance_loss_mlp": 1.01979923, + "epoch": 0.8316248309033518, + "flos": 16848898369920.0, + "grad_norm": 1.9840886519747605, + "language_loss": 0.71154356, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.732979, + "num_input_tokens_seen": 298402335, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11553955, + "step": 13832, + "time_per_iteration": 3.8588755130767822 + }, + { + "auxiliary_loss_clip": 0.01109102, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.03765225, + "balance_loss_mlp": 1.01884341, + "epoch": 0.8316849541560198, + "flos": 28690329072960.0, + "grad_norm": 1.8356788328645917, + "language_loss": 0.84754717, + "learning_rate": 2.897989455393979e-07, + "loss": 0.8689366, + "num_input_tokens_seen": 298423370, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11010742, + "step": 13833, + "time_per_iteration": 2.6470131874084473 + }, + { + "auxiliary_loss_clip": 0.01113417, + "auxiliary_loss_mlp": 0.01033179, + "balance_loss_clip": 1.03864789, + "balance_loss_mlp": 1.02194941, + "epoch": 0.8317450774086879, + "flos": 29005873714080.0, + "grad_norm": 1.54200557461306, + "language_loss": 0.76207918, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.78354514, + "num_input_tokens_seen": 298444835, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11224365, + "step": 13834, + "time_per_iteration": 2.698336362838745 + }, + { + "auxiliary_loss_clip": 0.01106936, + "auxiliary_loss_mlp": 0.01028565, + "balance_loss_clip": 1.03704238, + "balance_loss_mlp": 1.01745462, + "epoch": 0.8318052006613558, + "flos": 19785982043520.0, + "grad_norm": 1.9381438592090285, + "language_loss": 0.79264271, + "learning_rate": 2.893952329045459e-07, + "loss": 0.81399775, + "num_input_tokens_seen": 298461845, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11102295, + "step": 13835, + "time_per_iteration": 2.66288423538208 + }, + { + "auxiliary_loss_clip": 0.01117017, + "auxiliary_loss_mlp": 0.01035033, + "balance_loss_clip": 1.04154539, + "balance_loss_mlp": 1.02213466, + "epoch": 0.8318653239140238, + "flos": 24372937497600.0, + "grad_norm": 2.4322290541451483, + "language_loss": 0.80547601, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.82699651, + "num_input_tokens_seen": 298479095, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12908936, + "step": 13836, + "time_per_iteration": 2.6492176055908203 + }, + { + "auxiliary_loss_clip": 0.01110718, + "auxiliary_loss_mlp": 0.01028225, + "balance_loss_clip": 1.03905988, + "balance_loss_mlp": 1.01718616, + "epoch": 0.8319254471666917, + "flos": 21603813065280.0, + "grad_norm": 2.5589657414895792, + "language_loss": 0.77549195, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.79688144, + "num_input_tokens_seen": 298494475, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.1104126, + "step": 13837, + "time_per_iteration": 4.14945912361145 + }, + { + "auxiliary_loss_clip": 0.01114451, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.03824353, + "balance_loss_mlp": 1.01664245, + "epoch": 0.8319855704193597, + "flos": 23838469227360.0, + "grad_norm": 2.822614590210657, + "language_loss": 0.83362365, + "learning_rate": 2.887901504686685e-07, + "loss": 0.85505605, + "num_input_tokens_seen": 298513185, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12121582, + "step": 13838, + "time_per_iteration": 2.633733034133911 + }, + { + "auxiliary_loss_clip": 0.01110818, + "auxiliary_loss_mlp": 0.01030534, + "balance_loss_clip": 1.0392611, + "balance_loss_mlp": 1.01899457, + "epoch": 0.8320456936720276, + "flos": 25841600886240.0, + "grad_norm": 2.014711880357002, + "language_loss": 0.7430774, + "learning_rate": 2.885885860916795e-07, + "loss": 0.7644909, + "num_input_tokens_seen": 298531885, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11535645, + "step": 13839, + "time_per_iteration": 2.619185209274292 + }, + { + "auxiliary_loss_clip": 0.01113649, + "auxiliary_loss_mlp": 0.0103164, + "balance_loss_clip": 1.04054523, + "balance_loss_mlp": 1.02014792, + "epoch": 0.8321058169246957, + "flos": 40574134500480.0, + "grad_norm": 1.5363231822603471, + "language_loss": 0.67636448, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.69781739, + "num_input_tokens_seen": 298554905, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11486816, + "step": 13840, + "time_per_iteration": 4.086419343948364 + }, + { + "auxiliary_loss_clip": 0.01110966, + "auxiliary_loss_mlp": 0.01027452, + "balance_loss_clip": 1.03752613, + "balance_loss_mlp": 1.01610351, + "epoch": 0.8321659401773636, + "flos": 17336453980320.0, + "grad_norm": 2.0445560063548305, + "language_loss": 0.79657793, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.81796217, + "num_input_tokens_seen": 298571185, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11340332, + "step": 13841, + "time_per_iteration": 2.639117479324341 + }, + { + "auxiliary_loss_clip": 0.01109407, + "auxiliary_loss_mlp": 0.01027758, + "balance_loss_clip": 1.03833604, + "balance_loss_mlp": 1.01633227, + "epoch": 0.8322260634300316, + "flos": 18495731354400.0, + "grad_norm": 1.764040368318447, + "language_loss": 0.68211424, + "learning_rate": 2.879842823726262e-07, + "loss": 0.70348591, + "num_input_tokens_seen": 298588505, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11425781, + "step": 13842, + "time_per_iteration": 2.5792505741119385 + }, + { + "auxiliary_loss_clip": 0.01110716, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.03875804, + "balance_loss_mlp": 1.01980281, + "epoch": 0.8322861866826995, + "flos": 30872521707840.0, + "grad_norm": 2.1262408908391337, + "language_loss": 0.73194993, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.75337851, + "num_input_tokens_seen": 298609295, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12335205, + "step": 13843, + "time_per_iteration": 2.760488271713257 + }, + { + "auxiliary_loss_clip": 0.011104, + "auxiliary_loss_mlp": 0.01029043, + "balance_loss_clip": 1.03992891, + "balance_loss_mlp": 1.01783764, + "epoch": 0.8323463099353675, + "flos": 20766765683520.0, + "grad_norm": 1.7396899963695516, + "language_loss": 0.7736274, + "learning_rate": 2.875817378128975e-07, + "loss": 0.79502177, + "num_input_tokens_seen": 298625765, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11212158, + "step": 13844, + "time_per_iteration": 2.5924649238586426 + }, + { + "auxiliary_loss_clip": 0.01029692, + "auxiliary_loss_mlp": 0.01000501, + "balance_loss_clip": 1.0073266, + "balance_loss_mlp": 0.99954188, + "epoch": 0.8324064331880354, + "flos": 67853328372000.0, + "grad_norm": 0.7787455486448183, + "language_loss": 0.55217767, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.57247961, + "num_input_tokens_seen": 298683005, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00958252, + "step": 13845, + "time_per_iteration": 3.1628904342651367 + }, + { + "auxiliary_loss_clip": 0.01112883, + "auxiliary_loss_mlp": 0.01047776, + "balance_loss_clip": 1.0387969, + "balance_loss_mlp": 1.03609991, + "epoch": 0.8324665564407034, + "flos": 31894991278560.0, + "grad_norm": 4.270115091159958, + "language_loss": 0.75425434, + "learning_rate": 2.871794529934555e-07, + "loss": 0.77586102, + "num_input_tokens_seen": 298703060, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11682129, + "step": 13846, + "time_per_iteration": 2.6768527030944824 + }, + { + "auxiliary_loss_clip": 0.01112166, + "auxiliary_loss_mlp": 0.01027009, + "balance_loss_clip": 1.0367496, + "balance_loss_mlp": 1.01506448, + "epoch": 0.8325266796933715, + "flos": 26905027076640.0, + "grad_norm": 1.9365595764396506, + "language_loss": 0.78557014, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.80696189, + "num_input_tokens_seen": 298721765, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11956787, + "step": 13847, + "time_per_iteration": 2.6957273483276367 + }, + { + "auxiliary_loss_clip": 0.0110999, + "auxiliary_loss_mlp": 0.01025364, + "balance_loss_clip": 1.03841388, + "balance_loss_mlp": 1.0149157, + "epoch": 0.8325868029460394, + "flos": 27841410128160.0, + "grad_norm": 2.8808443211495187, + "language_loss": 0.74675333, + "learning_rate": 2.867774279753175e-07, + "loss": 0.76810694, + "num_input_tokens_seen": 298740825, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10455322, + "step": 13848, + "time_per_iteration": 2.678004503250122 + }, + { + "auxiliary_loss_clip": 0.01111234, + "auxiliary_loss_mlp": 0.01028201, + "balance_loss_clip": 1.03908217, + "balance_loss_mlp": 1.0166254, + "epoch": 0.8326469261987074, + "flos": 18006595570080.0, + "grad_norm": 5.093123085114421, + "language_loss": 0.62829989, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.6496942, + "num_input_tokens_seen": 298758515, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11578369, + "step": 13849, + "time_per_iteration": 2.603991985321045 + }, + { + "auxiliary_loss_clip": 0.01112775, + "auxiliary_loss_mlp": 0.01030353, + "balance_loss_clip": 1.03794992, + "balance_loss_mlp": 1.01896846, + "epoch": 0.8327070494513753, + "flos": 27975017066400.0, + "grad_norm": 3.452719877568559, + "language_loss": 0.80045426, + "learning_rate": 2.863756628194638e-07, + "loss": 0.82188553, + "num_input_tokens_seen": 298776375, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11383057, + "step": 13850, + "time_per_iteration": 2.647986888885498 + }, + { + "auxiliary_loss_clip": 0.01107151, + "auxiliary_loss_mlp": 0.0103018, + "balance_loss_clip": 1.03774118, + "balance_loss_mlp": 1.01999998, + "epoch": 0.8327671727040433, + "flos": 25216102988640.0, + "grad_norm": 1.679283724846671, + "language_loss": 0.7856015, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.80697483, + "num_input_tokens_seen": 298795135, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10180664, + "step": 13851, + "time_per_iteration": 2.606471538543701 + }, + { + "auxiliary_loss_clip": 0.01029616, + "auxiliary_loss_mlp": 0.01000532, + "balance_loss_clip": 1.0071559, + "balance_loss_mlp": 0.99956799, + "epoch": 0.8328272959567112, + "flos": 68404570796160.0, + "grad_norm": 0.7582550449116863, + "language_loss": 0.55712676, + "learning_rate": 2.859741575868344e-07, + "loss": 0.57742822, + "num_input_tokens_seen": 298855475, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.0096283, + "step": 13852, + "time_per_iteration": 3.287698268890381 + }, + { + "auxiliary_loss_clip": 0.01109432, + "auxiliary_loss_mlp": 0.01026242, + "balance_loss_clip": 1.03871989, + "balance_loss_mlp": 1.01460135, + "epoch": 0.8328874192093793, + "flos": 39417490749600.0, + "grad_norm": 1.6833004392578723, + "language_loss": 0.67098081, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.69233757, + "num_input_tokens_seen": 298875875, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11639404, + "step": 13853, + "time_per_iteration": 2.7023322582244873 + }, + { + "auxiliary_loss_clip": 0.01111892, + "auxiliary_loss_mlp": 0.01031512, + "balance_loss_clip": 1.03974319, + "balance_loss_mlp": 1.02075958, + "epoch": 0.8329475424620472, + "flos": 28690531659360.0, + "grad_norm": 1.6827116429960347, + "language_loss": 0.78516537, + "learning_rate": 2.855729123383286e-07, + "loss": 0.80659944, + "num_input_tokens_seen": 298895950, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10754395, + "step": 13854, + "time_per_iteration": 2.6491549015045166 + }, + { + "auxiliary_loss_clip": 0.01029846, + "auxiliary_loss_mlp": 0.01000326, + "balance_loss_clip": 1.00740659, + "balance_loss_mlp": 0.99937499, + "epoch": 0.8330076657147152, + "flos": 82779792804000.0, + "grad_norm": 0.7559737322563768, + "language_loss": 0.5865792, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60688096, + "num_input_tokens_seen": 298955770, + "router_z_loss_clip": 0.2244873, + "router_z_loss_mlp": 0.00949097, + "step": 13855, + "time_per_iteration": 3.1866471767425537 + }, + { + "auxiliary_loss_clip": 0.01111262, + "auxiliary_loss_mlp": 0.01027049, + "balance_loss_clip": 1.03949642, + "balance_loss_mlp": 1.0156939, + "epoch": 0.8330677889673831, + "flos": 27934060446720.0, + "grad_norm": 1.9832785584813686, + "language_loss": 0.71311128, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.73449439, + "num_input_tokens_seen": 298976545, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11352539, + "step": 13856, + "time_per_iteration": 2.654203176498413 + }, + { + "auxiliary_loss_clip": 0.01111256, + "auxiliary_loss_mlp": 0.01031504, + "balance_loss_clip": 1.03937912, + "balance_loss_mlp": 1.02064371, + "epoch": 0.8331279122200511, + "flos": 33366936566880.0, + "grad_norm": 1.9180012169457943, + "language_loss": 0.75705308, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.77848065, + "num_input_tokens_seen": 298996750, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10852051, + "step": 13857, + "time_per_iteration": 2.7898001670837402 + }, + { + "auxiliary_loss_clip": 0.0110832, + "auxiliary_loss_mlp": 0.01024444, + "balance_loss_clip": 1.03987622, + "balance_loss_mlp": 1.01437724, + "epoch": 0.833188035472719, + "flos": 24328536909120.0, + "grad_norm": 1.6272340905897447, + "language_loss": 0.73388094, + "learning_rate": 2.847712020370958e-07, + "loss": 0.75520861, + "num_input_tokens_seen": 299014895, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.10064697, + "step": 13858, + "time_per_iteration": 2.6737265586853027 + }, + { + "auxiliary_loss_clip": 0.01114321, + "auxiliary_loss_mlp": 0.01037107, + "balance_loss_clip": 1.03790092, + "balance_loss_mlp": 1.02507305, + "epoch": 0.833248158725387, + "flos": 18586355808960.0, + "grad_norm": 1.8275875148759815, + "language_loss": 0.7312876, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.7528019, + "num_input_tokens_seen": 299032855, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12030029, + "step": 13859, + "time_per_iteration": 2.6453826427459717 + }, + { + "auxiliary_loss_clip": 0.01106792, + "auxiliary_loss_mlp": 0.01024471, + "balance_loss_clip": 1.03827858, + "balance_loss_mlp": 1.01378369, + "epoch": 0.8333082819780551, + "flos": 29938569693120.0, + "grad_norm": 1.8401953554161579, + "language_loss": 0.79351491, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.8148275, + "num_input_tokens_seen": 299052055, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.10687256, + "step": 13860, + "time_per_iteration": 2.69392991065979 + }, + { + "auxiliary_loss_clip": 0.01108056, + "auxiliary_loss_mlp": 0.01030521, + "balance_loss_clip": 1.0373503, + "balance_loss_mlp": 1.01908922, + "epoch": 0.833368405230723, + "flos": 38397411698400.0, + "grad_norm": 1.5734480325090678, + "language_loss": 0.82304901, + "learning_rate": 2.841706022218644e-07, + "loss": 0.8444348, + "num_input_tokens_seen": 299075285, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11431885, + "step": 13861, + "time_per_iteration": 2.7878808975219727 + }, + { + "auxiliary_loss_clip": 0.01113176, + "auxiliary_loss_mlp": 0.01030853, + "balance_loss_clip": 1.04000592, + "balance_loss_mlp": 1.01961124, + "epoch": 0.833428528483391, + "flos": 18183914303040.0, + "grad_norm": 1.723434570870199, + "language_loss": 0.79482937, + "learning_rate": 2.839705324021806e-07, + "loss": 0.81626964, + "num_input_tokens_seen": 299092520, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11248779, + "step": 13862, + "time_per_iteration": 2.6044585704803467 + }, + { + "auxiliary_loss_clip": 0.01111005, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.03711629, + "balance_loss_mlp": 1.0195787, + "epoch": 0.8334886517360589, + "flos": 27088423401600.0, + "grad_norm": 2.120752532153646, + "language_loss": 0.75889659, + "learning_rate": 2.83770527654505e-07, + "loss": 0.78031307, + "num_input_tokens_seen": 299109450, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1105957, + "step": 13863, + "time_per_iteration": 2.649477005004883 + }, + { + "auxiliary_loss_clip": 0.01108413, + "auxiliary_loss_mlp": 0.01029274, + "balance_loss_clip": 1.03926837, + "balance_loss_mlp": 1.01900375, + "epoch": 0.8335487749887269, + "flos": 37061423350560.0, + "grad_norm": 3.457678139820656, + "language_loss": 0.75469297, + "learning_rate": 2.835705879864232e-07, + "loss": 0.77606988, + "num_input_tokens_seen": 299129540, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10266113, + "step": 13864, + "time_per_iteration": 2.7137112617492676 + }, + { + "auxiliary_loss_clip": 0.01111371, + "auxiliary_loss_mlp": 0.01035043, + "balance_loss_clip": 1.03821898, + "balance_loss_mlp": 1.02258539, + "epoch": 0.8336088982413948, + "flos": 30116293598880.0, + "grad_norm": 1.8113197582985967, + "language_loss": 0.69149649, + "learning_rate": 2.833707134055168e-07, + "loss": 0.7129606, + "num_input_tokens_seen": 299148670, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.12463379, + "step": 13865, + "time_per_iteration": 4.266476392745972 + }, + { + "auxiliary_loss_clip": 0.01112034, + "auxiliary_loss_mlp": 0.01036198, + "balance_loss_clip": 1.03931248, + "balance_loss_mlp": 1.02471805, + "epoch": 0.8336690214940629, + "flos": 46585555341120.0, + "grad_norm": 1.750730157340283, + "language_loss": 0.75340879, + "learning_rate": 2.831709039193653e-07, + "loss": 0.77489114, + "num_input_tokens_seen": 299169330, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11474609, + "step": 13866, + "time_per_iteration": 2.831618070602417 + }, + { + "auxiliary_loss_clip": 0.01029864, + "auxiliary_loss_mlp": 0.01002427, + "balance_loss_clip": 1.00749433, + "balance_loss_mlp": 1.00146127, + "epoch": 0.8337291447467308, + "flos": 67801715707680.0, + "grad_norm": 0.8679364444985072, + "language_loss": 0.63053244, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.6508553, + "num_input_tokens_seen": 299220980, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.00964355, + "step": 13867, + "time_per_iteration": 3.133863687515259 + }, + { + "auxiliary_loss_clip": 0.01108446, + "auxiliary_loss_mlp": 0.0102875, + "balance_loss_clip": 1.03817773, + "balance_loss_mlp": 1.01872444, + "epoch": 0.8337892679993988, + "flos": 29448096838560.0, + "grad_norm": 3.824038341168519, + "language_loss": 0.72010911, + "learning_rate": 2.827714802616301e-07, + "loss": 0.74148101, + "num_input_tokens_seen": 299240130, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10028076, + "step": 13868, + "time_per_iteration": 2.6701250076293945 + }, + { + "auxiliary_loss_clip": 0.01115222, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.04267192, + "balance_loss_mlp": 1.01984072, + "epoch": 0.8338493912520667, + "flos": 34390621656000.0, + "grad_norm": 1.593703068013912, + "language_loss": 0.80487478, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.82633799, + "num_input_tokens_seen": 299260705, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11254883, + "step": 13869, + "time_per_iteration": 2.6931722164154053 + }, + { + "auxiliary_loss_clip": 0.01112331, + "auxiliary_loss_mlp": 0.01032221, + "balance_loss_clip": 1.03984308, + "balance_loss_mlp": 1.02122426, + "epoch": 0.8339095145047347, + "flos": 27038269359360.0, + "grad_norm": 1.6569026525554156, + "language_loss": 0.8257674, + "learning_rate": 2.823723170738028e-07, + "loss": 0.84721291, + "num_input_tokens_seen": 299278925, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11004639, + "step": 13870, + "time_per_iteration": 2.6489877700805664 + }, + { + "auxiliary_loss_clip": 0.01109801, + "auxiliary_loss_mlp": 0.01027629, + "balance_loss_clip": 1.0356698, + "balance_loss_mlp": 1.01591706, + "epoch": 0.8339696377574026, + "flos": 21118283318880.0, + "grad_norm": 3.500745263917983, + "language_loss": 0.70537233, + "learning_rate": 2.821728331750264e-07, + "loss": 0.72674662, + "num_input_tokens_seen": 299291580, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11724854, + "step": 13871, + "time_per_iteration": 4.071155548095703 + }, + { + "auxiliary_loss_clip": 0.01110411, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.03944123, + "balance_loss_mlp": 1.02254987, + "epoch": 0.8340297610100706, + "flos": 25219506440160.0, + "grad_norm": 2.0119710945266, + "language_loss": 0.69214326, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.71358222, + "num_input_tokens_seen": 299310385, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10949707, + "step": 13872, + "time_per_iteration": 2.6304357051849365 + }, + { + "auxiliary_loss_clip": 0.01110256, + "auxiliary_loss_mlp": 0.01024333, + "balance_loss_clip": 1.03765988, + "balance_loss_mlp": 1.01359224, + "epoch": 0.8340898842627387, + "flos": 25033557526560.0, + "grad_norm": 1.9908599261432245, + "language_loss": 0.73478818, + "learning_rate": 2.817740608055712e-07, + "loss": 0.75613409, + "num_input_tokens_seen": 299327660, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10742188, + "step": 13873, + "time_per_iteration": 2.646515130996704 + }, + { + "auxiliary_loss_clip": 0.01114124, + "auxiliary_loss_mlp": 0.01038169, + "balance_loss_clip": 1.03891623, + "balance_loss_mlp": 1.02520466, + "epoch": 0.8341500075154066, + "flos": 26144666205120.0, + "grad_norm": 2.3812204361634732, + "language_loss": 0.75221556, + "learning_rate": 2.81574772350013e-07, + "loss": 0.77373844, + "num_input_tokens_seen": 299343685, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12963867, + "step": 13874, + "time_per_iteration": 2.589185953140259 + }, + { + "auxiliary_loss_clip": 0.01109179, + "auxiliary_loss_mlp": 0.01027445, + "balance_loss_clip": 1.03851974, + "balance_loss_mlp": 1.01677525, + "epoch": 0.8342101307680746, + "flos": 26955545774400.0, + "grad_norm": 1.9544696173909664, + "language_loss": 0.65759391, + "learning_rate": 2.813755490573118e-07, + "loss": 0.6789602, + "num_input_tokens_seen": 299363305, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10662842, + "step": 13875, + "time_per_iteration": 2.6625261306762695 + }, + { + "auxiliary_loss_clip": 0.01112469, + "auxiliary_loss_mlp": 0.01035261, + "balance_loss_clip": 1.04093468, + "balance_loss_mlp": 1.02418661, + "epoch": 0.8342702540207425, + "flos": 26687724138720.0, + "grad_norm": 1.857448855285122, + "language_loss": 0.79673791, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.81821519, + "num_input_tokens_seen": 299382630, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11065674, + "step": 13876, + "time_per_iteration": 4.093953609466553 + }, + { + "auxiliary_loss_clip": 0.0111031, + "auxiliary_loss_mlp": 0.01031667, + "balance_loss_clip": 1.03902161, + "balance_loss_mlp": 1.02059269, + "epoch": 0.8343303772734105, + "flos": 27489446802720.0, + "grad_norm": 2.2060623327583837, + "language_loss": 0.87313092, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.89455068, + "num_input_tokens_seen": 299402385, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11071777, + "step": 13877, + "time_per_iteration": 2.6174609661102295 + }, + { + "auxiliary_loss_clip": 0.01110673, + "auxiliary_loss_mlp": 0.01026338, + "balance_loss_clip": 1.0391798, + "balance_loss_mlp": 1.01592565, + "epoch": 0.8343905005260784, + "flos": 18229084719840.0, + "grad_norm": 3.7731697013041634, + "language_loss": 0.69363487, + "learning_rate": 2.807782702318828e-07, + "loss": 0.71500498, + "num_input_tokens_seen": 299419820, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10418701, + "step": 13878, + "time_per_iteration": 2.6527364253997803 + }, + { + "auxiliary_loss_clip": 0.01109714, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.03850257, + "balance_loss_mlp": 1.01905894, + "epoch": 0.8344506237787465, + "flos": 18317804862240.0, + "grad_norm": 1.8688660417985734, + "language_loss": 0.79494119, + "learning_rate": 2.805793076661309e-07, + "loss": 0.81633592, + "num_input_tokens_seen": 299436265, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10705566, + "step": 13879, + "time_per_iteration": 2.620917320251465 + }, + { + "auxiliary_loss_clip": 0.01110081, + "auxiliary_loss_mlp": 0.01028809, + "balance_loss_clip": 1.0387311, + "balance_loss_mlp": 1.01884282, + "epoch": 0.8345107470314144, + "flos": 21427547781600.0, + "grad_norm": 2.296434758945783, + "language_loss": 0.83516532, + "learning_rate": 2.803804103009828e-07, + "loss": 0.85655421, + "num_input_tokens_seen": 299451660, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.09967041, + "step": 13880, + "time_per_iteration": 3.9584102630615234 + }, + { + "auxiliary_loss_clip": 0.01113491, + "auxiliary_loss_mlp": 0.0102952, + "balance_loss_clip": 1.0394721, + "balance_loss_mlp": 1.01869035, + "epoch": 0.8345708702840824, + "flos": 30734336316960.0, + "grad_norm": 1.8692222898715931, + "language_loss": 0.78356516, + "learning_rate": 2.80181578143982e-07, + "loss": 0.8049953, + "num_input_tokens_seen": 299472070, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.10827637, + "step": 13881, + "time_per_iteration": 2.68235182762146 + }, + { + "auxiliary_loss_clip": 0.01105005, + "auxiliary_loss_mlp": 0.01024384, + "balance_loss_clip": 1.03789783, + "balance_loss_mlp": 1.01471663, + "epoch": 0.8346309935367503, + "flos": 18404742244320.0, + "grad_norm": 2.477049713088093, + "language_loss": 0.78235096, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.8036449, + "num_input_tokens_seen": 299486725, + "router_z_loss_clip": 0.67089844, + "router_z_loss_mlp": 0.09674072, + "step": 13882, + "time_per_iteration": 2.603484869003296 + }, + { + "auxiliary_loss_clip": 0.01111154, + "auxiliary_loss_mlp": 0.01036229, + "balance_loss_clip": 1.03957045, + "balance_loss_mlp": 1.02525556, + "epoch": 0.8346911167894183, + "flos": 27980081726400.0, + "grad_norm": 1.828431391176864, + "language_loss": 0.80491418, + "learning_rate": 2.79784109484579e-07, + "loss": 0.826388, + "num_input_tokens_seen": 299505435, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10980225, + "step": 13883, + "time_per_iteration": 2.6806414127349854 + }, + { + "auxiliary_loss_clip": 0.01111093, + "auxiliary_loss_mlp": 0.0103157, + "balance_loss_clip": 1.03688407, + "balance_loss_mlp": 1.0200485, + "epoch": 0.8347512400420862, + "flos": 24639584132160.0, + "grad_norm": 2.0891447135712293, + "language_loss": 0.74124336, + "learning_rate": 2.795854729972482e-07, + "loss": 0.76267004, + "num_input_tokens_seen": 299523555, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11529541, + "step": 13884, + "time_per_iteration": 2.6033897399902344 + }, + { + "auxiliary_loss_clip": 0.01119016, + "auxiliary_loss_mlp": 0.01035732, + "balance_loss_clip": 1.04103339, + "balance_loss_mlp": 1.02301264, + "epoch": 0.8348113632947542, + "flos": 31670840920320.0, + "grad_norm": 1.8253234423818419, + "language_loss": 0.70548463, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.72703207, + "num_input_tokens_seen": 299541660, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.1272583, + "step": 13885, + "time_per_iteration": 2.7184033393859863 + }, + { + "auxiliary_loss_clip": 0.01111356, + "auxiliary_loss_mlp": 0.010307, + "balance_loss_clip": 1.03808522, + "balance_loss_mlp": 1.01941061, + "epoch": 0.8348714865474223, + "flos": 41737423085280.0, + "grad_norm": 1.811409341926321, + "language_loss": 0.69945377, + "learning_rate": 2.791883957449912e-07, + "loss": 0.72087431, + "num_input_tokens_seen": 299562465, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11279297, + "step": 13886, + "time_per_iteration": 2.7795801162719727 + }, + { + "auxiliary_loss_clip": 0.01109071, + "auxiliary_loss_mlp": 0.01025935, + "balance_loss_clip": 1.03754401, + "balance_loss_mlp": 1.01465797, + "epoch": 0.8349316098000902, + "flos": 29761088891040.0, + "grad_norm": 1.5602601998049486, + "language_loss": 0.79048032, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.8118304, + "num_input_tokens_seen": 299582700, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11279297, + "step": 13887, + "time_per_iteration": 2.6854429244995117 + }, + { + "auxiliary_loss_clip": 0.0111665, + "auxiliary_loss_mlp": 0.01032141, + "balance_loss_clip": 1.04072809, + "balance_loss_mlp": 1.01994586, + "epoch": 0.8349917330527582, + "flos": 28105382622240.0, + "grad_norm": 2.590691103400904, + "language_loss": 0.64229882, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.66378677, + "num_input_tokens_seen": 299600310, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12194824, + "step": 13888, + "time_per_iteration": 2.595675230026245 + }, + { + "auxiliary_loss_clip": 0.01112444, + "auxiliary_loss_mlp": 0.01026806, + "balance_loss_clip": 1.03748178, + "balance_loss_mlp": 1.01562977, + "epoch": 0.8350518563054261, + "flos": 16625801460960.0, + "grad_norm": 2.344978102084442, + "language_loss": 0.67226601, + "learning_rate": 2.785932692855244e-07, + "loss": 0.69365847, + "num_input_tokens_seen": 299617025, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11181641, + "step": 13889, + "time_per_iteration": 2.7599809169769287 + }, + { + "auxiliary_loss_clip": 0.0110763, + "auxiliary_loss_mlp": 0.01026768, + "balance_loss_clip": 1.03650713, + "balance_loss_mlp": 1.01618814, + "epoch": 0.8351119795580941, + "flos": 26331790119840.0, + "grad_norm": 2.378066756881037, + "language_loss": 0.68959737, + "learning_rate": 2.783950243408399e-07, + "loss": 0.71094131, + "num_input_tokens_seen": 299633050, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10583496, + "step": 13890, + "time_per_iteration": 2.615985870361328 + }, + { + "auxiliary_loss_clip": 0.01112745, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.03973556, + "balance_loss_mlp": 1.02484024, + "epoch": 0.835172102810762, + "flos": 24451568837280.0, + "grad_norm": 2.26965953825954, + "language_loss": 0.59102273, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.6125142, + "num_input_tokens_seen": 299646445, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11560059, + "step": 13891, + "time_per_iteration": 2.601421356201172 + }, + { + "auxiliary_loss_clip": 0.01112242, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.03907299, + "balance_loss_mlp": 1.0185461, + "epoch": 0.8352322260634301, + "flos": 30642212723040.0, + "grad_norm": 1.6538255196368918, + "language_loss": 0.71841574, + "learning_rate": 2.779987303092846e-07, + "loss": 0.73982811, + "num_input_tokens_seen": 299662665, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10449219, + "step": 13892, + "time_per_iteration": 2.6605277061462402 + }, + { + "auxiliary_loss_clip": 0.01108046, + "auxiliary_loss_mlp": 0.01026835, + "balance_loss_clip": 1.0375421, + "balance_loss_mlp": 1.01604033, + "epoch": 0.835292349316098, + "flos": 30339106886880.0, + "grad_norm": 1.869821732601669, + "language_loss": 0.66020977, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.68155861, + "num_input_tokens_seen": 299683585, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10791016, + "step": 13893, + "time_per_iteration": 2.6574249267578125 + }, + { + "auxiliary_loss_clip": 0.011069, + "auxiliary_loss_mlp": 0.01027319, + "balance_loss_clip": 1.03485453, + "balance_loss_mlp": 1.01641166, + "epoch": 0.835352472568766, + "flos": 24239897801280.0, + "grad_norm": 2.03548487804698, + "language_loss": 0.78364432, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.8049866, + "num_input_tokens_seen": 299702680, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10913086, + "step": 13894, + "time_per_iteration": 2.650840997695923 + }, + { + "auxiliary_loss_clip": 0.01108458, + "auxiliary_loss_mlp": 0.01026676, + "balance_loss_clip": 1.04003859, + "balance_loss_mlp": 1.01595926, + "epoch": 0.8354125958214339, + "flos": 26911307255040.0, + "grad_norm": 1.6896915461613695, + "language_loss": 0.72614491, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.74749625, + "num_input_tokens_seen": 299721050, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.10717773, + "step": 13895, + "time_per_iteration": 2.647510051727295 + }, + { + "auxiliary_loss_clip": 0.01111536, + "auxiliary_loss_mlp": 0.01040443, + "balance_loss_clip": 1.03806376, + "balance_loss_mlp": 1.02805102, + "epoch": 0.8354727190741019, + "flos": 26109665625600.0, + "grad_norm": 12.286712437618075, + "language_loss": 0.7268914, + "learning_rate": 2.772069258877667e-07, + "loss": 0.74841124, + "num_input_tokens_seen": 299738255, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.12384033, + "step": 13896, + "time_per_iteration": 2.6356797218322754 + }, + { + "auxiliary_loss_clip": 0.01111044, + "auxiliary_loss_mlp": 0.0102701, + "balance_loss_clip": 1.03988838, + "balance_loss_mlp": 1.01614428, + "epoch": 0.8355328423267698, + "flos": 62038587173760.0, + "grad_norm": 2.671856029393716, + "language_loss": 0.58772039, + "learning_rate": 2.770091380848423e-07, + "loss": 0.60910094, + "num_input_tokens_seen": 299761315, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10870361, + "step": 13897, + "time_per_iteration": 2.8816208839416504 + }, + { + "auxiliary_loss_clip": 0.01029754, + "auxiliary_loss_mlp": 0.01000129, + "balance_loss_clip": 1.00745893, + "balance_loss_mlp": 0.9992106, + "epoch": 0.8355929655794379, + "flos": 79984986766560.0, + "grad_norm": 0.6970866696436272, + "language_loss": 0.57622826, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.5965271, + "num_input_tokens_seen": 299828735, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.00917053, + "step": 13898, + "time_per_iteration": 3.33327579498291 + }, + { + "auxiliary_loss_clip": 0.0111396, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.03971279, + "balance_loss_mlp": 1.02233994, + "epoch": 0.8356530888321058, + "flos": 23391910753920.0, + "grad_norm": 2.084661885299713, + "language_loss": 0.80348706, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.8249706, + "num_input_tokens_seen": 299848395, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12042236, + "step": 13899, + "time_per_iteration": 2.617579698562622 + }, + { + "auxiliary_loss_clip": 0.01111904, + "auxiliary_loss_mlp": 0.01029314, + "balance_loss_clip": 1.03855848, + "balance_loss_mlp": 1.01871002, + "epoch": 0.8357132120847738, + "flos": 53845338353760.0, + "grad_norm": 3.586552410183914, + "language_loss": 0.68490434, + "learning_rate": 2.764161667219749e-07, + "loss": 0.70631659, + "num_input_tokens_seen": 299871665, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.1060791, + "step": 13900, + "time_per_iteration": 2.8573033809661865 + }, + { + "auxiliary_loss_clip": 0.01112178, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.04071641, + "balance_loss_mlp": 1.02185297, + "epoch": 0.8357733353374418, + "flos": 29760602683680.0, + "grad_norm": 1.5453709971671654, + "language_loss": 0.71587741, + "learning_rate": 2.762186403079716e-07, + "loss": 0.73732519, + "num_input_tokens_seen": 299891960, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10760498, + "step": 13901, + "time_per_iteration": 2.68471622467041 + }, + { + "auxiliary_loss_clip": 0.01114172, + "auxiliary_loss_mlp": 0.0103339, + "balance_loss_clip": 1.03852117, + "balance_loss_mlp": 1.02192855, + "epoch": 0.8358334585901097, + "flos": 25522774345440.0, + "grad_norm": 2.2658895409041016, + "language_loss": 0.80333006, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.82480574, + "num_input_tokens_seen": 299905070, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11462402, + "step": 13902, + "time_per_iteration": 2.623915910720825 + }, + { + "auxiliary_loss_clip": 0.01109634, + "auxiliary_loss_mlp": 0.01030325, + "balance_loss_clip": 1.03969216, + "balance_loss_mlp": 1.01962638, + "epoch": 0.8358935818427777, + "flos": 23481724862880.0, + "grad_norm": 1.5946866614944764, + "language_loss": 0.62680101, + "learning_rate": 2.758237835853379e-07, + "loss": 0.64820051, + "num_input_tokens_seen": 299925130, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10693359, + "step": 13903, + "time_per_iteration": 2.710314989089966 + }, + { + "auxiliary_loss_clip": 0.01110628, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.03842497, + "balance_loss_mlp": 1.01727057, + "epoch": 0.8359537050954456, + "flos": 29448664080480.0, + "grad_norm": 1.9155344124259073, + "language_loss": 0.74025315, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.76164472, + "num_input_tokens_seen": 299943845, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.1126709, + "step": 13904, + "time_per_iteration": 2.6691415309906006 + }, + { + "auxiliary_loss_clip": 0.01107223, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.03726828, + "balance_loss_mlp": 1.01877856, + "epoch": 0.8360138283481137, + "flos": 19742999559840.0, + "grad_norm": 1.800701202680676, + "language_loss": 0.7278446, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.74921441, + "num_input_tokens_seen": 299961620, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10968018, + "step": 13905, + "time_per_iteration": 4.07892107963562 + }, + { + "auxiliary_loss_clip": 0.01110764, + "auxiliary_loss_mlp": 0.01035878, + "balance_loss_clip": 1.04086053, + "balance_loss_mlp": 1.02574527, + "epoch": 0.8360739516007816, + "flos": 27087369952320.0, + "grad_norm": 1.588360062390748, + "language_loss": 0.66438997, + "learning_rate": 2.752319888771e-07, + "loss": 0.68585634, + "num_input_tokens_seen": 299982170, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10137939, + "step": 13906, + "time_per_iteration": 2.7582225799560547 + }, + { + "auxiliary_loss_clip": 0.01108743, + "auxiliary_loss_mlp": 0.01024997, + "balance_loss_clip": 1.03678489, + "balance_loss_mlp": 1.01413715, + "epoch": 0.8361340748534496, + "flos": 25530472628640.0, + "grad_norm": 1.559589964071097, + "language_loss": 0.74241269, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.76375008, + "num_input_tokens_seen": 300001330, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10858154, + "step": 13907, + "time_per_iteration": 2.623682737350464 + }, + { + "auxiliary_loss_clip": 0.01110241, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.03603292, + "balance_loss_mlp": 1.02081466, + "epoch": 0.8361941981061175, + "flos": 31937811693120.0, + "grad_norm": 1.722623787024501, + "language_loss": 0.75266016, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.77408582, + "num_input_tokens_seen": 300020645, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11523438, + "step": 13908, + "time_per_iteration": 2.700209617614746 + }, + { + "auxiliary_loss_clip": 0.01112581, + "auxiliary_loss_mlp": 0.01025065, + "balance_loss_clip": 1.03805828, + "balance_loss_mlp": 1.01272714, + "epoch": 0.8362543213587855, + "flos": 29797426540800.0, + "grad_norm": 2.511286770784032, + "language_loss": 0.71377969, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.73515618, + "num_input_tokens_seen": 300039945, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12335205, + "step": 13909, + "time_per_iteration": 2.6295242309570312 + }, + { + "auxiliary_loss_clip": 0.01113599, + "auxiliary_loss_mlp": 0.01032956, + "balance_loss_clip": 1.03815794, + "balance_loss_mlp": 1.021703, + "epoch": 0.8363144446114534, + "flos": 20990065178880.0, + "grad_norm": 1.9174578273560228, + "language_loss": 0.73581421, + "learning_rate": 2.744438449482338e-07, + "loss": 0.75727981, + "num_input_tokens_seen": 300058260, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11260986, + "step": 13910, + "time_per_iteration": 3.8831324577331543 + }, + { + "auxiliary_loss_clip": 0.01111724, + "auxiliary_loss_mlp": 0.01028364, + "balance_loss_clip": 1.03917241, + "balance_loss_mlp": 1.01767087, + "epoch": 0.8363745678641215, + "flos": 23524828898400.0, + "grad_norm": 2.1971377630703537, + "language_loss": 0.73253971, + "learning_rate": 2.742469725305001e-07, + "loss": 0.75394058, + "num_input_tokens_seen": 300076720, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10699463, + "step": 13911, + "time_per_iteration": 2.58455228805542 + }, + { + "auxiliary_loss_clip": 0.011126, + "auxiliary_loss_mlp": 0.010376, + "balance_loss_clip": 1.03847218, + "balance_loss_mlp": 1.02632236, + "epoch": 0.8364346911167894, + "flos": 14489062863840.0, + "grad_norm": 2.066609890887844, + "language_loss": 0.79006439, + "learning_rate": 2.740501655534946e-07, + "loss": 0.81156635, + "num_input_tokens_seen": 300092950, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11279297, + "step": 13912, + "time_per_iteration": 2.649817705154419 + }, + { + "auxiliary_loss_clip": 0.01111951, + "auxiliary_loss_mlp": 0.0102798, + "balance_loss_clip": 1.03956223, + "balance_loss_mlp": 1.01743627, + "epoch": 0.8364948143694574, + "flos": 24679244198880.0, + "grad_norm": 1.9358283018215525, + "language_loss": 0.78729653, + "learning_rate": 2.738534240246797e-07, + "loss": 0.80869591, + "num_input_tokens_seen": 300110950, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10540771, + "step": 13913, + "time_per_iteration": 2.583845615386963 + }, + { + "auxiliary_loss_clip": 0.01110865, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.03778434, + "balance_loss_mlp": 1.0184021, + "epoch": 0.8365549376221254, + "flos": 26372503635840.0, + "grad_norm": 2.3790498306813466, + "language_loss": 0.73470509, + "learning_rate": 2.736567479515153e-07, + "loss": 0.75611448, + "num_input_tokens_seen": 300128705, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11682129, + "step": 13914, + "time_per_iteration": 2.663757801055908 + }, + { + "auxiliary_loss_clip": 0.01111398, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.03875995, + "balance_loss_mlp": 1.02096272, + "epoch": 0.8366150608747933, + "flos": 28423885024800.0, + "grad_norm": 1.7889393926365655, + "language_loss": 0.7145136, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.73595548, + "num_input_tokens_seen": 300148635, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1182251, + "step": 13915, + "time_per_iteration": 2.641813039779663 + }, + { + "auxiliary_loss_clip": 0.01111892, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.03838408, + "balance_loss_mlp": 1.0194881, + "epoch": 0.8366751841274613, + "flos": 18629257258080.0, + "grad_norm": 1.9357614875196745, + "language_loss": 0.7277447, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.74916828, + "num_input_tokens_seen": 300165490, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.10974121, + "step": 13916, + "time_per_iteration": 4.063259601593018 + }, + { + "auxiliary_loss_clip": 0.01111598, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.0385499, + "balance_loss_mlp": 1.01584125, + "epoch": 0.8367353073801292, + "flos": 16136503607520.0, + "grad_norm": 1.8904519857739595, + "language_loss": 0.74894029, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.7703287, + "num_input_tokens_seen": 300182130, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11413574, + "step": 13917, + "time_per_iteration": 2.6205899715423584 + }, + { + "auxiliary_loss_clip": 0.01107592, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.03917527, + "balance_loss_mlp": 1.01955652, + "epoch": 0.8367954306327973, + "flos": 29535520428000.0, + "grad_norm": 1.495126196539709, + "language_loss": 0.78996223, + "learning_rate": 2.728706983644933e-07, + "loss": 0.81133902, + "num_input_tokens_seen": 300203050, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.10534668, + "step": 13918, + "time_per_iteration": 2.6452107429504395 + }, + { + "auxiliary_loss_clip": 0.01113151, + "auxiliary_loss_mlp": 0.01034688, + "balance_loss_clip": 1.04017162, + "balance_loss_mlp": 1.02310681, + "epoch": 0.8368555538854652, + "flos": 29937678312960.0, + "grad_norm": 1.5986564268774843, + "language_loss": 0.67912114, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.70059955, + "num_input_tokens_seen": 300224380, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11578369, + "step": 13919, + "time_per_iteration": 2.6637516021728516 + }, + { + "auxiliary_loss_clip": 0.01108039, + "auxiliary_loss_mlp": 0.010289, + "balance_loss_clip": 1.03685856, + "balance_loss_mlp": 1.01799822, + "epoch": 0.8369156771381332, + "flos": 24721132716000.0, + "grad_norm": 2.8252966034550786, + "language_loss": 0.73684335, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.75821269, + "num_input_tokens_seen": 300242915, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10900879, + "step": 13920, + "time_per_iteration": 4.018870830535889 + }, + { + "auxiliary_loss_clip": 0.01111652, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.03905749, + "balance_loss_mlp": 1.02108741, + "epoch": 0.8369758003908011, + "flos": 26643850274880.0, + "grad_norm": 3.875534547025897, + "language_loss": 0.6878252, + "learning_rate": 2.722818488237566e-07, + "loss": 0.70926577, + "num_input_tokens_seen": 300261905, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11309814, + "step": 13921, + "time_per_iteration": 2.6273908615112305 + }, + { + "auxiliary_loss_clip": 0.01114361, + "auxiliary_loss_mlp": 0.0103058, + "balance_loss_clip": 1.04014587, + "balance_loss_mlp": 1.01966059, + "epoch": 0.8370359236434691, + "flos": 26502626088000.0, + "grad_norm": 2.058194391694605, + "language_loss": 0.85593331, + "learning_rate": 2.720856966640801e-07, + "loss": 0.87738264, + "num_input_tokens_seen": 300281145, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.10919189, + "step": 13922, + "time_per_iteration": 2.6730570793151855 + }, + { + "auxiliary_loss_clip": 0.01106241, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.03648829, + "balance_loss_mlp": 1.02000868, + "epoch": 0.837096046896137, + "flos": 28246485257280.0, + "grad_norm": 1.859843250406023, + "language_loss": 0.72013366, + "learning_rate": 2.71889610027088e-07, + "loss": 0.74150026, + "num_input_tokens_seen": 300301610, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10412598, + "step": 13923, + "time_per_iteration": 2.6522462368011475 + }, + { + "auxiliary_loss_clip": 0.01109858, + "auxiliary_loss_mlp": 0.01028345, + "balance_loss_clip": 1.03934264, + "balance_loss_mlp": 1.01665616, + "epoch": 0.8371561701488051, + "flos": 29887078580640.0, + "grad_norm": 1.8901061997225506, + "language_loss": 0.76112282, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.78250486, + "num_input_tokens_seen": 300319420, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11694336, + "step": 13924, + "time_per_iteration": 2.692094326019287 + }, + { + "auxiliary_loss_clip": 0.01108852, + "auxiliary_loss_mlp": 0.01025669, + "balance_loss_clip": 1.03723979, + "balance_loss_mlp": 1.01471317, + "epoch": 0.837216293401473, + "flos": 35638619172480.0, + "grad_norm": 1.6694711620489229, + "language_loss": 0.64306498, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.66441023, + "num_input_tokens_seen": 300341325, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10961914, + "step": 13925, + "time_per_iteration": 2.701604127883911 + }, + { + "auxiliary_loss_clip": 0.0111222, + "auxiliary_loss_mlp": 0.01035237, + "balance_loss_clip": 1.038625, + "balance_loss_mlp": 1.02375126, + "epoch": 0.837276416654141, + "flos": 30829053016800.0, + "grad_norm": 1.6067030134890423, + "language_loss": 0.74198008, + "learning_rate": 2.713017433265543e-07, + "loss": 0.76345468, + "num_input_tokens_seen": 300361620, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.1149292, + "step": 13926, + "time_per_iteration": 2.6455490589141846 + }, + { + "auxiliary_loss_clip": 0.01112376, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.04018188, + "balance_loss_mlp": 1.0217905, + "epoch": 0.837336539906809, + "flos": 16938307306080.0, + "grad_norm": 2.018318845261419, + "language_loss": 0.70926845, + "learning_rate": 2.711059188546274e-07, + "loss": 0.73072791, + "num_input_tokens_seen": 300378675, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11773682, + "step": 13927, + "time_per_iteration": 2.625192403793335 + }, + { + "auxiliary_loss_clip": 0.01029357, + "auxiliary_loss_mlp": 0.01001523, + "balance_loss_clip": 1.00697041, + "balance_loss_mlp": 1.00056326, + "epoch": 0.8373966631594769, + "flos": 84035812741920.0, + "grad_norm": 0.702340761160636, + "language_loss": 0.58772206, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.60803092, + "num_input_tokens_seen": 300449740, + "router_z_loss_clip": 0.22399902, + "router_z_loss_mlp": 0.00958252, + "step": 13928, + "time_per_iteration": 3.3987178802490234 + }, + { + "auxiliary_loss_clip": 0.01114775, + "auxiliary_loss_mlp": 0.01033816, + "balance_loss_clip": 1.04223907, + "balance_loss_mlp": 1.02212214, + "epoch": 0.8374567864121449, + "flos": 24951846873600.0, + "grad_norm": 1.8170795996107467, + "language_loss": 0.69673681, + "learning_rate": 2.707144665977068e-07, + "loss": 0.71822274, + "num_input_tokens_seen": 300470000, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11694336, + "step": 13929, + "time_per_iteration": 2.747154951095581 + }, + { + "auxiliary_loss_clip": 0.01114245, + "auxiliary_loss_mlp": 0.01027498, + "balance_loss_clip": 1.03899193, + "balance_loss_mlp": 1.01563656, + "epoch": 0.8375169096648128, + "flos": 51133012797600.0, + "grad_norm": 1.9725372497618625, + "language_loss": 0.66702831, + "learning_rate": 2.705188388275574e-07, + "loss": 0.68844581, + "num_input_tokens_seen": 300494975, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11865234, + "step": 13930, + "time_per_iteration": 2.8110151290893555 + }, + { + "auxiliary_loss_clip": 0.01112222, + "auxiliary_loss_mlp": 0.01026727, + "balance_loss_clip": 1.04113102, + "balance_loss_mlp": 1.01576543, + "epoch": 0.8375770329174809, + "flos": 24415555325760.0, + "grad_norm": 2.4748827182452686, + "language_loss": 0.71440828, + "learning_rate": 2.703232766395067e-07, + "loss": 0.73579776, + "num_input_tokens_seen": 300513175, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10955811, + "step": 13931, + "time_per_iteration": 2.6956093311309814 + }, + { + "auxiliary_loss_clip": 0.0110912, + "auxiliary_loss_mlp": 0.01030119, + "balance_loss_clip": 1.0383389, + "balance_loss_mlp": 1.01900244, + "epoch": 0.8376371561701488, + "flos": 27800777646720.0, + "grad_norm": 1.649102460764559, + "language_loss": 0.71354103, + "learning_rate": 2.701277800409705e-07, + "loss": 0.73493332, + "num_input_tokens_seen": 300533770, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11120605, + "step": 13932, + "time_per_iteration": 2.6321685314178467 + }, + { + "auxiliary_loss_clip": 0.01109884, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.03845704, + "balance_loss_mlp": 1.02277076, + "epoch": 0.8376972794228168, + "flos": 29180761410240.0, + "grad_norm": 2.0966485485829027, + "language_loss": 0.66741502, + "learning_rate": 2.699323490393628e-07, + "loss": 0.68884528, + "num_input_tokens_seen": 300552995, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10375977, + "step": 13933, + "time_per_iteration": 2.67759108543396 + }, + { + "auxiliary_loss_clip": 0.01108822, + "auxiliary_loss_mlp": 0.01037371, + "balance_loss_clip": 1.03927493, + "balance_loss_mlp": 1.02623081, + "epoch": 0.8377574026754847, + "flos": 16758679088160.0, + "grad_norm": 2.047291842803438, + "language_loss": 0.76347202, + "learning_rate": 2.697369836420933e-07, + "loss": 0.78493392, + "num_input_tokens_seen": 300570275, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11157227, + "step": 13934, + "time_per_iteration": 2.6091296672821045 + }, + { + "auxiliary_loss_clip": 0.01113625, + "auxiliary_loss_mlp": 0.0102684, + "balance_loss_clip": 1.04263091, + "balance_loss_mlp": 1.01597977, + "epoch": 0.8378175259281527, + "flos": 26419213709280.0, + "grad_norm": 1.6346656038813843, + "language_loss": 0.77598703, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.79739165, + "num_input_tokens_seen": 300590875, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10870361, + "step": 13935, + "time_per_iteration": 2.7231318950653076 + }, + { + "auxiliary_loss_clip": 0.01110145, + "auxiliary_loss_mlp": 0.01029769, + "balance_loss_clip": 1.03725529, + "balance_loss_mlp": 1.0184505, + "epoch": 0.8378776491808206, + "flos": 18850409337600.0, + "grad_norm": 3.4031377401671628, + "language_loss": 0.55750698, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.57890612, + "num_input_tokens_seen": 300607490, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11315918, + "step": 13936, + "time_per_iteration": 2.649688959121704 + }, + { + "auxiliary_loss_clip": 0.01109409, + "auxiliary_loss_mlp": 0.01029448, + "balance_loss_clip": 1.03766894, + "balance_loss_mlp": 1.01905894, + "epoch": 0.8379377724334887, + "flos": 17961425153280.0, + "grad_norm": 2.5199080395453493, + "language_loss": 0.89253855, + "learning_rate": 2.691512811503882e-07, + "loss": 0.91392708, + "num_input_tokens_seen": 300623635, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10394287, + "step": 13937, + "time_per_iteration": 2.6515793800354004 + }, + { + "auxiliary_loss_clip": 0.01112551, + "auxiliary_loss_mlp": 0.01027747, + "balance_loss_clip": 1.04001939, + "balance_loss_mlp": 1.01675606, + "epoch": 0.8379978956861566, + "flos": 29938245554880.0, + "grad_norm": 1.6933931573522283, + "language_loss": 0.8180964, + "learning_rate": 2.689561782445313e-07, + "loss": 0.83949935, + "num_input_tokens_seen": 300643835, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10992432, + "step": 13938, + "time_per_iteration": 2.6408536434173584 + }, + { + "auxiliary_loss_clip": 0.01112143, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.0379113, + "balance_loss_mlp": 1.0195322, + "epoch": 0.8380580189388246, + "flos": 23125912395840.0, + "grad_norm": 1.7703040090096174, + "language_loss": 0.70787048, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.72930419, + "num_input_tokens_seen": 300662500, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11694336, + "step": 13939, + "time_per_iteration": 2.679422616958618 + }, + { + "auxiliary_loss_clip": 0.01113841, + "auxiliary_loss_mlp": 0.01035084, + "balance_loss_clip": 1.04041886, + "balance_loss_mlp": 1.02366412, + "epoch": 0.8381181421914926, + "flos": 32385099477600.0, + "grad_norm": 2.242862657990804, + "language_loss": 0.76348913, + "learning_rate": 2.6856616936428e-07, + "loss": 0.78497839, + "num_input_tokens_seen": 300681480, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11413574, + "step": 13940, + "time_per_iteration": 2.6967902183532715 + }, + { + "auxiliary_loss_clip": 0.01109775, + "auxiliary_loss_mlp": 0.01029821, + "balance_loss_clip": 1.039101, + "balance_loss_mlp": 1.0189079, + "epoch": 0.8381782654441605, + "flos": 28420805711520.0, + "grad_norm": 1.951270199117212, + "language_loss": 0.76255488, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.78395087, + "num_input_tokens_seen": 300699165, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10906982, + "step": 13941, + "time_per_iteration": 2.6355936527252197 + }, + { + "auxiliary_loss_clip": 0.01114053, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.03775096, + "balance_loss_mlp": 1.01760674, + "epoch": 0.8382383886968285, + "flos": 32651786629440.0, + "grad_norm": 2.3498542138780394, + "language_loss": 0.73726881, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.75870389, + "num_input_tokens_seen": 300714615, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.1184082, + "step": 13942, + "time_per_iteration": 2.6735641956329346 + }, + { + "auxiliary_loss_clip": 0.01118307, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.04067874, + "balance_loss_mlp": 1.02112412, + "epoch": 0.8382985119494964, + "flos": 31853183796000.0, + "grad_norm": 1.5001696524229313, + "language_loss": 0.79685283, + "learning_rate": 2.679816484834554e-07, + "loss": 0.81836993, + "num_input_tokens_seen": 300734860, + "router_z_loss_clip": 0.77636719, + "router_z_loss_mlp": 0.12286377, + "step": 13943, + "time_per_iteration": 2.6388354301452637 + }, + { + "auxiliary_loss_clip": 0.01110375, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.03802025, + "balance_loss_mlp": 1.01675296, + "epoch": 0.8383586352021645, + "flos": 20053884713760.0, + "grad_norm": 1.9726049471042593, + "language_loss": 0.8539983, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.8753773, + "num_input_tokens_seen": 300752735, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10766602, + "step": 13944, + "time_per_iteration": 2.6440610885620117 + }, + { + "auxiliary_loss_clip": 0.01029136, + "auxiliary_loss_mlp": 0.01000967, + "balance_loss_clip": 1.006814, + "balance_loss_mlp": 1.00001967, + "epoch": 0.8384187584548324, + "flos": 78331387396320.0, + "grad_norm": 0.6136563116114787, + "language_loss": 0.50249445, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.52279544, + "num_input_tokens_seen": 300820760, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.00946045, + "step": 13945, + "time_per_iteration": 4.835330486297607 + }, + { + "auxiliary_loss_clip": 0.01108685, + "auxiliary_loss_mlp": 0.01028306, + "balance_loss_clip": 1.03782272, + "balance_loss_mlp": 1.01798272, + "epoch": 0.8384788817075004, + "flos": 27311763414240.0, + "grad_norm": 2.12473443562681, + "language_loss": 0.64717585, + "learning_rate": 2.673977187074017e-07, + "loss": 0.66854584, + "num_input_tokens_seen": 300840025, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10314941, + "step": 13946, + "time_per_iteration": 2.615015745162964 + }, + { + "auxiliary_loss_clip": 0.01110788, + "auxiliary_loss_mlp": 0.01028627, + "balance_loss_clip": 1.03770542, + "balance_loss_mlp": 1.01718926, + "epoch": 0.8385390049601683, + "flos": 35992810948320.0, + "grad_norm": 3.675616805577105, + "language_loss": 0.67868698, + "learning_rate": 2.672032068397829e-07, + "loss": 0.70008111, + "num_input_tokens_seen": 300860380, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11450195, + "step": 13947, + "time_per_iteration": 2.6987836360931396 + }, + { + "auxiliary_loss_clip": 0.01113379, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_clip": 1.03974998, + "balance_loss_mlp": 1.01721239, + "epoch": 0.8385991282128363, + "flos": 39911042917440.0, + "grad_norm": 2.1073781821670003, + "language_loss": 0.70125127, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.72267592, + "num_input_tokens_seen": 300881895, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11871338, + "step": 13948, + "time_per_iteration": 2.7581424713134766 + }, + { + "auxiliary_loss_clip": 0.01107905, + "auxiliary_loss_mlp": 0.01029977, + "balance_loss_clip": 1.03753471, + "balance_loss_mlp": 1.01972461, + "epoch": 0.8386592514655042, + "flos": 31044127504320.0, + "grad_norm": 2.0474811105537483, + "language_loss": 0.85170972, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.87308848, + "num_input_tokens_seen": 300901575, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10253906, + "step": 13949, + "time_per_iteration": 2.792999744415283 + }, + { + "auxiliary_loss_clip": 0.01109311, + "auxiliary_loss_mlp": 0.01026815, + "balance_loss_clip": 1.03874791, + "balance_loss_mlp": 1.01551437, + "epoch": 0.8387193747181723, + "flos": 26864110974240.0, + "grad_norm": 1.9199017803580896, + "language_loss": 0.70909607, + "learning_rate": 2.66620065513385e-07, + "loss": 0.73045731, + "num_input_tokens_seen": 300919735, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11291504, + "step": 13950, + "time_per_iteration": 4.176244497299194 + }, + { + "auxiliary_loss_clip": 0.01110214, + "auxiliary_loss_mlp": 0.01026441, + "balance_loss_clip": 1.03836298, + "balance_loss_mlp": 1.0154084, + "epoch": 0.8387794979708402, + "flos": 22146749447040.0, + "grad_norm": 5.759206370808267, + "language_loss": 0.64516509, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.66653168, + "num_input_tokens_seen": 300939150, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.1104126, + "step": 13951, + "time_per_iteration": 2.5789599418640137 + }, + { + "auxiliary_loss_clip": 0.01111078, + "auxiliary_loss_mlp": 0.01031233, + "balance_loss_clip": 1.038849, + "balance_loss_mlp": 1.02065325, + "epoch": 0.8388396212235082, + "flos": 31007141578080.0, + "grad_norm": 1.4696686460549806, + "language_loss": 0.70087069, + "learning_rate": 2.662316332665393e-07, + "loss": 0.72229385, + "num_input_tokens_seen": 300959730, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10583496, + "step": 13952, + "time_per_iteration": 2.727975368499756 + }, + { + "auxiliary_loss_clip": 0.01108687, + "auxiliary_loss_mlp": 0.010297, + "balance_loss_clip": 1.03801417, + "balance_loss_mlp": 1.0186671, + "epoch": 0.8388997444761762, + "flos": 27177751303200.0, + "grad_norm": 2.0766701792763294, + "language_loss": 0.7297951, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.75117898, + "num_input_tokens_seen": 300976120, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11035156, + "step": 13953, + "time_per_iteration": 2.593668222427368 + }, + { + "auxiliary_loss_clip": 0.01107948, + "auxiliary_loss_mlp": 0.01028548, + "balance_loss_clip": 1.037503, + "balance_loss_mlp": 1.01729441, + "epoch": 0.8389598677288441, + "flos": 23883193954080.0, + "grad_norm": 1.9226191586565018, + "language_loss": 0.68340355, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.70476854, + "num_input_tokens_seen": 300995080, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.1126709, + "step": 13954, + "time_per_iteration": 2.6409783363342285 + }, + { + "auxiliary_loss_clip": 0.01112665, + "auxiliary_loss_mlp": 0.01030716, + "balance_loss_clip": 1.04021215, + "balance_loss_mlp": 1.02002287, + "epoch": 0.8390199909815121, + "flos": 21212554328640.0, + "grad_norm": 1.7907381651070895, + "language_loss": 0.73372352, + "learning_rate": 2.656494779996932e-07, + "loss": 0.75515735, + "num_input_tokens_seen": 301012920, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10693359, + "step": 13955, + "time_per_iteration": 2.5673460960388184 + }, + { + "auxiliary_loss_clip": 0.01110743, + "auxiliary_loss_mlp": 0.01027223, + "balance_loss_clip": 1.03821898, + "balance_loss_mlp": 1.01581478, + "epoch": 0.83908011423418, + "flos": 30066099039360.0, + "grad_norm": 3.415817109952967, + "language_loss": 0.66740346, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.68878305, + "num_input_tokens_seen": 301028875, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11407471, + "step": 13956, + "time_per_iteration": 4.049537658691406 + }, + { + "auxiliary_loss_clip": 0.01113309, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.03823793, + "balance_loss_mlp": 1.0205934, + "epoch": 0.8391402374868481, + "flos": 30161626084800.0, + "grad_norm": 1.7880306650855216, + "language_loss": 0.7966671, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.81812024, + "num_input_tokens_seen": 301050115, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11413574, + "step": 13957, + "time_per_iteration": 2.6870999336242676 + }, + { + "auxiliary_loss_clip": 0.01029311, + "auxiliary_loss_mlp": 0.01001745, + "balance_loss_clip": 1.00688004, + "balance_loss_mlp": 1.00077868, + "epoch": 0.839200360739516, + "flos": 69393376857600.0, + "grad_norm": 0.752596275540581, + "language_loss": 0.53341281, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.55372339, + "num_input_tokens_seen": 301114155, + "router_z_loss_clip": 0.2244873, + "router_z_loss_mlp": 0.00965118, + "step": 13958, + "time_per_iteration": 3.326493263244629 + }, + { + "auxiliary_loss_clip": 0.01110348, + "auxiliary_loss_mlp": 0.01031388, + "balance_loss_clip": 1.03844774, + "balance_loss_mlp": 1.01944327, + "epoch": 0.839260483992184, + "flos": 22369522217760.0, + "grad_norm": 2.4852982460418858, + "language_loss": 0.73403168, + "learning_rate": 2.648741917459574e-07, + "loss": 0.755449, + "num_input_tokens_seen": 301133150, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11950684, + "step": 13959, + "time_per_iteration": 3.8986315727233887 + }, + { + "auxiliary_loss_clip": 0.01108567, + "auxiliary_loss_mlp": 0.01025841, + "balance_loss_clip": 1.0388937, + "balance_loss_mlp": 1.01548767, + "epoch": 0.8393206072448519, + "flos": 33053336755200.0, + "grad_norm": 1.8087437252280243, + "language_loss": 0.55829763, + "learning_rate": 2.646805346545169e-07, + "loss": 0.5796417, + "num_input_tokens_seen": 301153600, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10351562, + "step": 13960, + "time_per_iteration": 2.7283291816711426 + }, + { + "auxiliary_loss_clip": 0.01028991, + "auxiliary_loss_mlp": 0.01002439, + "balance_loss_clip": 1.00660634, + "balance_loss_mlp": 1.0015192, + "epoch": 0.8393807304975199, + "flos": 75068953899840.0, + "grad_norm": 1.0232107349226023, + "language_loss": 0.60805362, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.6283679, + "num_input_tokens_seen": 301214335, + "router_z_loss_clip": 0.22399902, + "router_z_loss_mlp": 0.00919342, + "step": 13961, + "time_per_iteration": 3.3265209197998047 + }, + { + "auxiliary_loss_clip": 0.01108041, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.03592539, + "balance_loss_mlp": 1.02141643, + "epoch": 0.8394408537501878, + "flos": 18175041018720.0, + "grad_norm": 4.108548206225412, + "language_loss": 0.67896652, + "learning_rate": 2.642934178894405e-07, + "loss": 0.70037079, + "num_input_tokens_seen": 301228960, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10961914, + "step": 13962, + "time_per_iteration": 2.6189303398132324 + }, + { + "auxiliary_loss_clip": 0.01110905, + "auxiliary_loss_mlp": 0.01028584, + "balance_loss_clip": 1.03714347, + "balance_loss_mlp": 1.01763475, + "epoch": 0.8395009770028559, + "flos": 21246906631680.0, + "grad_norm": 2.385649311419026, + "language_loss": 0.72932655, + "learning_rate": 2.640999582304841e-07, + "loss": 0.75072145, + "num_input_tokens_seen": 301245875, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.10949707, + "step": 13963, + "time_per_iteration": 2.5865423679351807 + }, + { + "auxiliary_loss_clip": 0.01110921, + "auxiliary_loss_mlp": 0.01032677, + "balance_loss_clip": 1.03782225, + "balance_loss_mlp": 1.02173352, + "epoch": 0.8395611002555238, + "flos": 34074712359360.0, + "grad_norm": 1.764097933726831, + "language_loss": 0.76367021, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.78510618, + "num_input_tokens_seen": 301265550, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.10943604, + "step": 13964, + "time_per_iteration": 2.6709766387939453 + }, + { + "auxiliary_loss_clip": 0.01114128, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.03919959, + "balance_loss_mlp": 1.02245545, + "epoch": 0.8396212235081918, + "flos": 13545346184640.0, + "grad_norm": 2.270410483928265, + "language_loss": 0.78148687, + "learning_rate": 2.637132363964161e-07, + "loss": 0.80297589, + "num_input_tokens_seen": 301282035, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12329102, + "step": 13965, + "time_per_iteration": 2.577099561691284 + }, + { + "auxiliary_loss_clip": 0.01108385, + "auxiliary_loss_mlp": 0.0103023, + "balance_loss_clip": 1.03734815, + "balance_loss_mlp": 1.0196327, + "epoch": 0.8396813467608598, + "flos": 43605894356640.0, + "grad_norm": 1.7466147586675145, + "language_loss": 0.65562713, + "learning_rate": 2.635199742359684e-07, + "loss": 0.67701328, + "num_input_tokens_seen": 301305210, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10614014, + "step": 13966, + "time_per_iteration": 2.7655725479125977 + }, + { + "auxiliary_loss_clip": 0.0110952, + "auxiliary_loss_mlp": 0.0103253, + "balance_loss_clip": 1.03822088, + "balance_loss_mlp": 1.02122903, + "epoch": 0.8397414700135277, + "flos": 31941336696480.0, + "grad_norm": 1.7574775712449215, + "language_loss": 0.74279475, + "learning_rate": 2.633267779230177e-07, + "loss": 0.76421517, + "num_input_tokens_seen": 301324885, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11291504, + "step": 13967, + "time_per_iteration": 2.6428604125976562 + }, + { + "auxiliary_loss_clip": 0.0111025, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.03992188, + "balance_loss_mlp": 1.01871252, + "epoch": 0.8398015932661957, + "flos": 22369886873280.0, + "grad_norm": 1.9686553841852654, + "language_loss": 0.82725668, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.8486551, + "num_input_tokens_seen": 301343070, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10888672, + "step": 13968, + "time_per_iteration": 2.6358540058135986 + }, + { + "auxiliary_loss_clip": 0.01114153, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.04062939, + "balance_loss_mlp": 1.01958942, + "epoch": 0.8398617165188637, + "flos": 21206679323040.0, + "grad_norm": 1.9787236289483718, + "language_loss": 0.77717823, + "learning_rate": 2.629405828689075e-07, + "loss": 0.79862648, + "num_input_tokens_seen": 301359280, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11108398, + "step": 13969, + "time_per_iteration": 2.6282453536987305 + }, + { + "auxiliary_loss_clip": 0.01112025, + "auxiliary_loss_mlp": 0.01027978, + "balance_loss_clip": 1.03741384, + "balance_loss_mlp": 1.01615274, + "epoch": 0.8399218397715317, + "flos": 27979028277120.0, + "grad_norm": 2.2834173179948483, + "language_loss": 0.77349257, + "learning_rate": 2.627475841423923e-07, + "loss": 0.79489261, + "num_input_tokens_seen": 301376465, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.1182251, + "step": 13970, + "time_per_iteration": 2.6605069637298584 + }, + { + "auxiliary_loss_clip": 0.01110447, + "auxiliary_loss_mlp": 0.01037921, + "balance_loss_clip": 1.03799784, + "balance_loss_mlp": 1.0270251, + "epoch": 0.8399819630241996, + "flos": 28247255085600.0, + "grad_norm": 2.075459594032789, + "language_loss": 0.72386658, + "learning_rate": 2.625546512926633e-07, + "loss": 0.7453503, + "num_input_tokens_seen": 301396000, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10900879, + "step": 13971, + "time_per_iteration": 2.660108804702759 + }, + { + "auxiliary_loss_clip": 0.01110055, + "auxiliary_loss_mlp": 0.01029347, + "balance_loss_clip": 1.03760076, + "balance_loss_mlp": 1.0175575, + "epoch": 0.8400420862768676, + "flos": 20007984985920.0, + "grad_norm": 2.271368354235555, + "language_loss": 0.77483946, + "learning_rate": 2.623617843270358e-07, + "loss": 0.79623348, + "num_input_tokens_seen": 301413160, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11791992, + "step": 13972, + "time_per_iteration": 2.558894395828247 + }, + { + "auxiliary_loss_clip": 0.01109072, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.03864634, + "balance_loss_mlp": 1.01922703, + "epoch": 0.8401022095295355, + "flos": 25975126789920.0, + "grad_norm": 1.52459315053275, + "language_loss": 0.6812315, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.70261991, + "num_input_tokens_seen": 301433325, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10546875, + "step": 13973, + "time_per_iteration": 2.7150535583496094 + }, + { + "auxiliary_loss_clip": 0.01112648, + "auxiliary_loss_mlp": 0.01027219, + "balance_loss_clip": 1.0390377, + "balance_loss_mlp": 1.01597762, + "epoch": 0.8401623327822035, + "flos": 21123023840640.0, + "grad_norm": 2.20703861558825, + "language_loss": 0.78102916, + "learning_rate": 2.619762480773382e-07, + "loss": 0.80242789, + "num_input_tokens_seen": 301450265, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11248779, + "step": 13974, + "time_per_iteration": 2.590033531188965 + }, + { + "auxiliary_loss_clip": 0.01110067, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.03743351, + "balance_loss_mlp": 1.01855624, + "epoch": 0.8402224560348714, + "flos": 27133350714720.0, + "grad_norm": 1.6444527293010887, + "language_loss": 0.7264744, + "learning_rate": 2.617835788078868e-07, + "loss": 0.74786735, + "num_input_tokens_seen": 301470760, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10668945, + "step": 13975, + "time_per_iteration": 2.6611030101776123 + }, + { + "auxiliary_loss_clip": 0.01109518, + "auxiliary_loss_mlp": 0.01026362, + "balance_loss_clip": 1.0384891, + "balance_loss_mlp": 1.01485193, + "epoch": 0.8402825792875395, + "flos": 24684592479840.0, + "grad_norm": 1.7564042094383767, + "language_loss": 0.72405428, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.74541306, + "num_input_tokens_seen": 301489425, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.1151123, + "step": 13976, + "time_per_iteration": 2.6070425510406494 + }, + { + "auxiliary_loss_clip": 0.01108084, + "auxiliary_loss_mlp": 0.01023027, + "balance_loss_clip": 1.03705227, + "balance_loss_mlp": 1.0127393, + "epoch": 0.8403427025402074, + "flos": 28419103985760.0, + "grad_norm": 2.464011715757884, + "language_loss": 0.71690917, + "learning_rate": 2.61398438016311e-07, + "loss": 0.73822033, + "num_input_tokens_seen": 301508885, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10284424, + "step": 13977, + "time_per_iteration": 2.6534361839294434 + }, + { + "auxiliary_loss_clip": 0.01109445, + "auxiliary_loss_mlp": 0.01027161, + "balance_loss_clip": 1.03558278, + "balance_loss_mlp": 1.01615763, + "epoch": 0.8404028257928754, + "flos": 39871342333440.0, + "grad_norm": 1.8985373646140862, + "language_loss": 0.68751639, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.70888245, + "num_input_tokens_seen": 301533780, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11010742, + "step": 13978, + "time_per_iteration": 2.7094616889953613 + }, + { + "auxiliary_loss_clip": 0.01107194, + "auxiliary_loss_mlp": 0.01030124, + "balance_loss_clip": 1.03823423, + "balance_loss_mlp": 1.01927006, + "epoch": 0.8404629490455434, + "flos": 19743121111680.0, + "grad_norm": 1.83827693448045, + "language_loss": 0.77870911, + "learning_rate": 2.610135609365145e-07, + "loss": 0.80008233, + "num_input_tokens_seen": 301551775, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.10839844, + "step": 13979, + "time_per_iteration": 2.6199333667755127 + }, + { + "auxiliary_loss_clip": 0.01110915, + "auxiliary_loss_mlp": 0.01024117, + "balance_loss_clip": 1.03940797, + "balance_loss_mlp": 1.01300681, + "epoch": 0.8405230722982113, + "flos": 18539929356480.0, + "grad_norm": 2.003324857625556, + "language_loss": 0.78380567, + "learning_rate": 2.60821221306778e-07, + "loss": 0.80515599, + "num_input_tokens_seen": 301570495, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11114502, + "step": 13980, + "time_per_iteration": 2.5640275478363037 + }, + { + "auxiliary_loss_clip": 0.01110374, + "auxiliary_loss_mlp": 0.01027854, + "balance_loss_clip": 1.04008007, + "balance_loss_mlp": 1.01753092, + "epoch": 0.8405831955508793, + "flos": 33937904556000.0, + "grad_norm": 45.93644185450037, + "language_loss": 0.86672473, + "learning_rate": 2.606289476268757e-07, + "loss": 0.88810694, + "num_input_tokens_seen": 301591705, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10327148, + "step": 13981, + "time_per_iteration": 2.682274341583252 + }, + { + "auxiliary_loss_clip": 0.01111092, + "auxiliary_loss_mlp": 0.01029305, + "balance_loss_clip": 1.04011416, + "balance_loss_mlp": 1.01842105, + "epoch": 0.8406433188035473, + "flos": 29003726298240.0, + "grad_norm": 2.3491424638228433, + "language_loss": 0.67933917, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.7007432, + "num_input_tokens_seen": 301611670, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10888672, + "step": 13982, + "time_per_iteration": 2.701859951019287 + }, + { + "auxiliary_loss_clip": 0.0111227, + "auxiliary_loss_mlp": 0.01033605, + "balance_loss_clip": 1.03962755, + "balance_loss_mlp": 1.02149284, + "epoch": 0.8407034420562153, + "flos": 35637444171360.0, + "grad_norm": 2.3512698563489334, + "language_loss": 0.68642342, + "learning_rate": 2.602445981457324e-07, + "loss": 0.70788217, + "num_input_tokens_seen": 301632540, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12109375, + "step": 13983, + "time_per_iteration": 2.695791482925415 + }, + { + "auxiliary_loss_clip": 0.01110556, + "auxiliary_loss_mlp": 0.01029069, + "balance_loss_clip": 1.03636098, + "balance_loss_mlp": 1.01764894, + "epoch": 0.8407635653088832, + "flos": 32165000847360.0, + "grad_norm": 1.9998727828568625, + "language_loss": 0.78507698, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.80647326, + "num_input_tokens_seen": 301651480, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11431885, + "step": 13984, + "time_per_iteration": 4.112321615219116 + }, + { + "auxiliary_loss_clip": 0.01106211, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.03491831, + "balance_loss_mlp": 1.01919341, + "epoch": 0.8408236885615512, + "flos": 26196481455840.0, + "grad_norm": 2.5371916772191465, + "language_loss": 0.60344797, + "learning_rate": 2.598605125513842e-07, + "loss": 0.62481058, + "num_input_tokens_seen": 301670010, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10852051, + "step": 13985, + "time_per_iteration": 2.6240174770355225 + }, + { + "auxiliary_loss_clip": 0.01112097, + "auxiliary_loss_mlp": 0.01030249, + "balance_loss_clip": 1.03823543, + "balance_loss_mlp": 1.01857269, + "epoch": 0.8408838118142191, + "flos": 28020795242400.0, + "grad_norm": 1.7750233024891893, + "language_loss": 0.81699443, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.83841789, + "num_input_tokens_seen": 301689785, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11682129, + "step": 13986, + "time_per_iteration": 2.683000326156616 + }, + { + "auxiliary_loss_clip": 0.01111263, + "auxiliary_loss_mlp": 0.01029636, + "balance_loss_clip": 1.03949177, + "balance_loss_mlp": 1.01846051, + "epoch": 0.8409439350668871, + "flos": 32701657050720.0, + "grad_norm": 1.541989895836011, + "language_loss": 0.65627956, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.67768854, + "num_input_tokens_seen": 301712225, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11187744, + "step": 13987, + "time_per_iteration": 2.724870443344116 + }, + { + "auxiliary_loss_clip": 0.01110807, + "auxiliary_loss_mlp": 0.01036411, + "balance_loss_clip": 1.03889763, + "balance_loss_mlp": 1.02544975, + "epoch": 0.841004058319555, + "flos": 32431769033760.0, + "grad_norm": 2.0136144845004598, + "language_loss": 0.67796308, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.69943523, + "num_input_tokens_seen": 301730955, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10955811, + "step": 13988, + "time_per_iteration": 2.7483694553375244 + }, + { + "auxiliary_loss_clip": 0.0111591, + "auxiliary_loss_mlp": 0.01035166, + "balance_loss_clip": 1.04065299, + "balance_loss_mlp": 1.02303076, + "epoch": 0.8410641815722231, + "flos": 17694738001440.0, + "grad_norm": 2.245622920592306, + "language_loss": 0.80842423, + "learning_rate": 2.590931332560622e-07, + "loss": 0.82993501, + "num_input_tokens_seen": 301746930, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12139893, + "step": 13989, + "time_per_iteration": 3.9049389362335205 + }, + { + "auxiliary_loss_clip": 0.01110125, + "auxiliary_loss_mlp": 0.01026586, + "balance_loss_clip": 1.03692973, + "balance_loss_mlp": 1.01525545, + "epoch": 0.841124304824891, + "flos": 35590085821440.0, + "grad_norm": 1.9449130604143339, + "language_loss": 0.74893194, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.77029902, + "num_input_tokens_seen": 301766945, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11334229, + "step": 13990, + "time_per_iteration": 2.6561930179595947 + }, + { + "auxiliary_loss_clip": 0.01104003, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.0355742, + "balance_loss_mlp": 1.02207375, + "epoch": 0.841184428077559, + "flos": 27347979512160.0, + "grad_norm": 1.768424235681518, + "language_loss": 0.80817449, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.82954365, + "num_input_tokens_seen": 301785460, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.10839844, + "step": 13991, + "time_per_iteration": 2.6539907455444336 + }, + { + "auxiliary_loss_clip": 0.011087, + "auxiliary_loss_mlp": 0.01031917, + "balance_loss_clip": 1.03759527, + "balance_loss_mlp": 1.02124751, + "epoch": 0.841244551330227, + "flos": 28019741793120.0, + "grad_norm": 2.1772795920258856, + "language_loss": 0.70490599, + "learning_rate": 2.585182919204105e-07, + "loss": 0.72631216, + "num_input_tokens_seen": 301804180, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10675049, + "step": 13992, + "time_per_iteration": 2.596940517425537 + }, + { + "auxiliary_loss_clip": 0.0111135, + "auxiliary_loss_mlp": 0.01024368, + "balance_loss_clip": 1.03817999, + "balance_loss_mlp": 1.01363873, + "epoch": 0.8413046745828949, + "flos": 25664201118720.0, + "grad_norm": 1.6524397322474946, + "language_loss": 0.76449478, + "learning_rate": 2.583268102064959e-07, + "loss": 0.78585196, + "num_input_tokens_seen": 301823670, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.1072998, + "step": 13993, + "time_per_iteration": 2.6582226753234863 + }, + { + "auxiliary_loss_clip": 0.01115883, + "auxiliary_loss_mlp": 0.01035646, + "balance_loss_clip": 1.03849196, + "balance_loss_mlp": 1.02331996, + "epoch": 0.841364797835563, + "flos": 33009543925920.0, + "grad_norm": 1.8658067629828694, + "language_loss": 0.74139154, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.76290685, + "num_input_tokens_seen": 301845890, + "router_z_loss_clip": 0.77392578, + "router_z_loss_mlp": 0.12322998, + "step": 13994, + "time_per_iteration": 2.6445062160491943 + }, + { + "auxiliary_loss_clip": 0.01109442, + "auxiliary_loss_mlp": 0.01027412, + "balance_loss_clip": 1.03924632, + "balance_loss_mlp": 1.01692772, + "epoch": 0.8414249210882309, + "flos": 21836066879520.0, + "grad_norm": 1.6608008573850859, + "language_loss": 0.59426248, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.61563098, + "num_input_tokens_seen": 301863985, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.1048584, + "step": 13995, + "time_per_iteration": 4.144723176956177 + }, + { + "auxiliary_loss_clip": 0.01111482, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.03997302, + "balance_loss_mlp": 1.01583815, + "epoch": 0.8414850443408989, + "flos": 31044330090720.0, + "grad_norm": 1.6267935060646328, + "language_loss": 0.71348369, + "learning_rate": 2.577527613603163e-07, + "loss": 0.73487449, + "num_input_tokens_seen": 301882765, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11755371, + "step": 13996, + "time_per_iteration": 2.6736340522766113 + }, + { + "auxiliary_loss_clip": 0.0111028, + "auxiliary_loss_mlp": 0.0102781, + "balance_loss_clip": 1.03759503, + "balance_loss_mlp": 1.01735568, + "epoch": 0.8415451675935668, + "flos": 28333503673920.0, + "grad_norm": 1.8265265683217704, + "language_loss": 0.64308572, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.66446662, + "num_input_tokens_seen": 301902720, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10449219, + "step": 13997, + "time_per_iteration": 2.602491855621338 + }, + { + "auxiliary_loss_clip": 0.01113769, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.03845203, + "balance_loss_mlp": 1.01932931, + "epoch": 0.8416052908462348, + "flos": 22630780054080.0, + "grad_norm": 2.2218403213506974, + "language_loss": 0.81981957, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.84127069, + "num_input_tokens_seen": 301921245, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12017822, + "step": 13998, + "time_per_iteration": 2.6366801261901855 + }, + { + "auxiliary_loss_clip": 0.0111246, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.03935337, + "balance_loss_mlp": 1.01792288, + "epoch": 0.8416654140989027, + "flos": 31853913107040.0, + "grad_norm": 1.6780637242612377, + "language_loss": 0.80325747, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.82467645, + "num_input_tokens_seen": 301942320, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.1151123, + "step": 13999, + "time_per_iteration": 3.991389751434326 + }, + { + "auxiliary_loss_clip": 0.01113961, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.03966737, + "balance_loss_mlp": 1.01887274, + "epoch": 0.8417255373515707, + "flos": 32254288231680.0, + "grad_norm": 2.456629314356513, + "language_loss": 0.6713227, + "learning_rate": 2.569882878592096e-07, + "loss": 0.69276798, + "num_input_tokens_seen": 301963110, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11700439, + "step": 14000, + "time_per_iteration": 2.64436936378479 + }, + { + "auxiliary_loss_clip": 0.01115483, + "auxiliary_loss_mlp": 0.0102715, + "balance_loss_clip": 1.04051638, + "balance_loss_mlp": 1.01558089, + "epoch": 0.8417856606042387, + "flos": 30160937291040.0, + "grad_norm": 1.453778932628544, + "language_loss": 0.79807854, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.81950486, + "num_input_tokens_seen": 301984915, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11572266, + "step": 14001, + "time_per_iteration": 2.6663658618927 + }, + { + "auxiliary_loss_clip": 0.01108823, + "auxiliary_loss_mlp": 0.01031319, + "balance_loss_clip": 1.03706622, + "balance_loss_mlp": 1.02061439, + "epoch": 0.8418457838569067, + "flos": 25442198176320.0, + "grad_norm": 1.9314693761479451, + "language_loss": 0.78791606, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.80931753, + "num_input_tokens_seen": 302004095, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10705566, + "step": 14002, + "time_per_iteration": 2.606431484222412 + }, + { + "auxiliary_loss_clip": 0.01109206, + "auxiliary_loss_mlp": 0.0102535, + "balance_loss_clip": 1.03700066, + "balance_loss_mlp": 1.01412618, + "epoch": 0.8419059071095746, + "flos": 34972124137920.0, + "grad_norm": 1.7231589113669583, + "language_loss": 0.78184235, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.80318797, + "num_input_tokens_seen": 302027250, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11224365, + "step": 14003, + "time_per_iteration": 2.7385127544403076 + }, + { + "auxiliary_loss_clip": 0.01110192, + "auxiliary_loss_mlp": 0.01028872, + "balance_loss_clip": 1.0380826, + "balance_loss_mlp": 1.01766682, + "epoch": 0.8419660303622426, + "flos": 26421766297920.0, + "grad_norm": 1.7290727224844336, + "language_loss": 0.65836942, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.6797601, + "num_input_tokens_seen": 302046950, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11218262, + "step": 14004, + "time_per_iteration": 2.634471893310547 + }, + { + "auxiliary_loss_clip": 0.01113045, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.03927648, + "balance_loss_mlp": 1.0218339, + "epoch": 0.8420261536149106, + "flos": 30872845846080.0, + "grad_norm": 2.6456673536617163, + "language_loss": 0.76279402, + "learning_rate": 2.560341831785724e-07, + "loss": 0.78425753, + "num_input_tokens_seen": 302065470, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11480713, + "step": 14005, + "time_per_iteration": 2.7001841068267822 + }, + { + "auxiliary_loss_clip": 0.01112617, + "auxiliary_loss_mlp": 0.01028422, + "balance_loss_clip": 1.03819478, + "balance_loss_mlp": 1.01650095, + "epoch": 0.8420862768675785, + "flos": 22895643928320.0, + "grad_norm": 2.154358527704287, + "language_loss": 0.77496445, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.79637486, + "num_input_tokens_seen": 302083190, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11932373, + "step": 14006, + "time_per_iteration": 2.5986781120300293 + }, + { + "auxiliary_loss_clip": 0.01111448, + "auxiliary_loss_mlp": 0.01031617, + "balance_loss_clip": 1.03902841, + "balance_loss_mlp": 1.02040577, + "epoch": 0.8421464001202466, + "flos": 22364173936800.0, + "grad_norm": 2.2797089225821336, + "language_loss": 0.76977003, + "learning_rate": 2.556530041751932e-07, + "loss": 0.79120064, + "num_input_tokens_seen": 302098820, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11206055, + "step": 14007, + "time_per_iteration": 2.6182968616485596 + }, + { + "auxiliary_loss_clip": 0.0111096, + "auxiliary_loss_mlp": 0.01028245, + "balance_loss_clip": 1.03735662, + "balance_loss_mlp": 1.01662183, + "epoch": 0.8422065233729145, + "flos": 38482849941120.0, + "grad_norm": 1.7880798560611186, + "language_loss": 0.66000861, + "learning_rate": 2.554625138886102e-07, + "loss": 0.68140066, + "num_input_tokens_seen": 302117075, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11627197, + "step": 14008, + "time_per_iteration": 2.6714770793914795 + }, + { + "auxiliary_loss_clip": 0.01028812, + "auxiliary_loss_mlp": 0.01001246, + "balance_loss_clip": 1.00650752, + "balance_loss_mlp": 1.00037098, + "epoch": 0.8422666466255825, + "flos": 78457215016800.0, + "grad_norm": 0.7388828364572374, + "language_loss": 0.56910837, + "learning_rate": 2.552720897550631e-07, + "loss": 0.58940899, + "num_input_tokens_seen": 302179735, + "router_z_loss_clip": 0.22290039, + "router_z_loss_mlp": 0.00875092, + "step": 14009, + "time_per_iteration": 3.318358898162842 + }, + { + "auxiliary_loss_clip": 0.01107514, + "auxiliary_loss_mlp": 0.01027013, + "balance_loss_clip": 1.03710508, + "balance_loss_mlp": 1.01661777, + "epoch": 0.8423267698782504, + "flos": 29671558403040.0, + "grad_norm": 1.3204095424928892, + "language_loss": 0.77992088, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.80126619, + "num_input_tokens_seen": 302202055, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10394287, + "step": 14010, + "time_per_iteration": 2.750277042388916 + }, + { + "auxiliary_loss_clip": 0.0111498, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.04070628, + "balance_loss_mlp": 1.0261147, + "epoch": 0.8423868931309184, + "flos": 22324756973760.0, + "grad_norm": 2.1898548331329994, + "language_loss": 0.72447097, + "learning_rate": 2.548914399759592e-07, + "loss": 0.74600148, + "num_input_tokens_seen": 302221360, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11956787, + "step": 14011, + "time_per_iteration": 2.6132352352142334 + }, + { + "auxiliary_loss_clip": 0.01109912, + "auxiliary_loss_mlp": 0.01037421, + "balance_loss_clip": 1.03674686, + "balance_loss_mlp": 1.02671552, + "epoch": 0.8424470163835863, + "flos": 28736228800800.0, + "grad_norm": 1.9328425731626435, + "language_loss": 0.84290338, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.86437666, + "num_input_tokens_seen": 302240715, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.10705566, + "step": 14012, + "time_per_iteration": 2.6714954376220703 + }, + { + "auxiliary_loss_clip": 0.01102586, + "auxiliary_loss_mlp": 0.0102691, + "balance_loss_clip": 1.03575802, + "balance_loss_mlp": 1.01733136, + "epoch": 0.8425071396362543, + "flos": 29003766815520.0, + "grad_norm": 2.0763852492688724, + "language_loss": 0.68102711, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.70232207, + "num_input_tokens_seen": 302260950, + "router_z_loss_clip": 0.66845703, + "router_z_loss_mlp": 0.09576416, + "step": 14013, + "time_per_iteration": 2.658087730407715 + }, + { + "auxiliary_loss_clip": 0.01116118, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.04001999, + "balance_loss_mlp": 1.01700473, + "epoch": 0.8425672628889223, + "flos": 19741297834080.0, + "grad_norm": 2.323807469430081, + "language_loss": 0.78687078, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.80831957, + "num_input_tokens_seen": 302277500, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11767578, + "step": 14014, + "time_per_iteration": 2.642962694168091 + }, + { + "auxiliary_loss_clip": 0.01110552, + "auxiliary_loss_mlp": 0.01028444, + "balance_loss_clip": 1.03852534, + "balance_loss_mlp": 1.01739299, + "epoch": 0.8426273861415903, + "flos": 28246768878240.0, + "grad_norm": 1.8435901141692916, + "language_loss": 0.67158377, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.69297373, + "num_input_tokens_seen": 302297930, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11047363, + "step": 14015, + "time_per_iteration": 2.632817268371582 + }, + { + "auxiliary_loss_clip": 0.01110647, + "auxiliary_loss_mlp": 0.01027202, + "balance_loss_clip": 1.03916836, + "balance_loss_mlp": 1.01521587, + "epoch": 0.8426875093942582, + "flos": 21301679643840.0, + "grad_norm": 3.6261886968975867, + "language_loss": 0.76007515, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.78145361, + "num_input_tokens_seen": 302315735, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11993408, + "step": 14016, + "time_per_iteration": 2.624955415725708 + }, + { + "auxiliary_loss_clip": 0.01110826, + "auxiliary_loss_mlp": 0.01030636, + "balance_loss_clip": 1.03898382, + "balance_loss_mlp": 1.01926398, + "epoch": 0.8427476326469262, + "flos": 23965877021760.0, + "grad_norm": 2.155366070899287, + "language_loss": 0.79359043, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.815005, + "num_input_tokens_seen": 302332790, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.1137085, + "step": 14017, + "time_per_iteration": 2.5997161865234375 + }, + { + "auxiliary_loss_clip": 0.01109112, + "auxiliary_loss_mlp": 0.0102988, + "balance_loss_clip": 1.03757119, + "balance_loss_mlp": 1.01874614, + "epoch": 0.8428077558995941, + "flos": 14570570930400.0, + "grad_norm": 2.2064081215335034, + "language_loss": 0.62617552, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.64756542, + "num_input_tokens_seen": 302346490, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11126709, + "step": 14018, + "time_per_iteration": 2.5950989723205566 + }, + { + "auxiliary_loss_clip": 0.0111069, + "auxiliary_loss_mlp": 0.01028414, + "balance_loss_clip": 1.03735638, + "balance_loss_mlp": 1.0178045, + "epoch": 0.8428678791522621, + "flos": 12752456287680.0, + "grad_norm": 2.5261595070639284, + "language_loss": 0.79467201, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.81606305, + "num_input_tokens_seen": 302363235, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.1060791, + "step": 14019, + "time_per_iteration": 2.572671413421631 + }, + { + "auxiliary_loss_clip": 0.0111272, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.03855753, + "balance_loss_mlp": 1.01794171, + "epoch": 0.8429280024049302, + "flos": 35102489693760.0, + "grad_norm": 1.852051330474172, + "language_loss": 0.78526795, + "learning_rate": 2.531817924498265e-07, + "loss": 0.8066889, + "num_input_tokens_seen": 302383270, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11425781, + "step": 14020, + "time_per_iteration": 2.6906027793884277 + }, + { + "auxiliary_loss_clip": 0.01111177, + "auxiliary_loss_mlp": 0.01025637, + "balance_loss_clip": 1.03861833, + "balance_loss_mlp": 1.01494408, + "epoch": 0.8429881256575981, + "flos": 23839076986560.0, + "grad_norm": 1.8444180351137047, + "language_loss": 0.71406043, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.73542857, + "num_input_tokens_seen": 302401355, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10699463, + "step": 14021, + "time_per_iteration": 2.6162145137786865 + }, + { + "auxiliary_loss_clip": 0.0111171, + "auxiliary_loss_mlp": 0.01035551, + "balance_loss_clip": 1.03824973, + "balance_loss_mlp": 1.02417266, + "epoch": 0.8430482489102661, + "flos": 30250224675360.0, + "grad_norm": 2.2904549415910576, + "language_loss": 0.69543827, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.71691084, + "num_input_tokens_seen": 302419515, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.1138916, + "step": 14022, + "time_per_iteration": 2.649404287338257 + }, + { + "auxiliary_loss_clip": 0.01115543, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.04022074, + "balance_loss_mlp": 1.02226186, + "epoch": 0.843108372162934, + "flos": 26288321428800.0, + "grad_norm": 2.2922808969333053, + "language_loss": 0.72346026, + "learning_rate": 2.526131019933553e-07, + "loss": 0.74495828, + "num_input_tokens_seen": 302438280, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11999512, + "step": 14023, + "time_per_iteration": 4.071164131164551 + }, + { + "auxiliary_loss_clip": 0.01110984, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.03912079, + "balance_loss_mlp": 1.02136612, + "epoch": 0.843168495415602, + "flos": 30027938112000.0, + "grad_norm": 1.5048165140976923, + "language_loss": 0.66940814, + "learning_rate": 2.524236710204559e-07, + "loss": 0.69084895, + "num_input_tokens_seen": 302460860, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11737061, + "step": 14024, + "time_per_iteration": 2.694650888442993 + }, + { + "auxiliary_loss_clip": 0.01108321, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.03737211, + "balance_loss_mlp": 1.01952457, + "epoch": 0.8432286186682699, + "flos": 18452303180640.0, + "grad_norm": 2.121561506298458, + "language_loss": 0.81217408, + "learning_rate": 2.522343063158261e-07, + "loss": 0.83356631, + "num_input_tokens_seen": 302476980, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11376953, + "step": 14025, + "time_per_iteration": 2.615870952606201 + }, + { + "auxiliary_loss_clip": 0.01105663, + "auxiliary_loss_mlp": 0.01027175, + "balance_loss_clip": 1.03623235, + "balance_loss_mlp": 1.01751959, + "epoch": 0.843288741920938, + "flos": 24772461759360.0, + "grad_norm": 1.5986509925628372, + "language_loss": 0.77704269, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.79837108, + "num_input_tokens_seen": 302496380, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.09649658, + "step": 14026, + "time_per_iteration": 2.5955986976623535 + }, + { + "auxiliary_loss_clip": 0.01111452, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.04037309, + "balance_loss_mlp": 1.02255559, + "epoch": 0.8433488651736059, + "flos": 28469987339040.0, + "grad_norm": 1.469023451362538, + "language_loss": 0.82673818, + "learning_rate": 2.518557757400945e-07, + "loss": 0.84819204, + "num_input_tokens_seen": 302516845, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11376953, + "step": 14027, + "time_per_iteration": 2.6757330894470215 + }, + { + "auxiliary_loss_clip": 0.01109232, + "auxiliary_loss_mlp": 0.01029815, + "balance_loss_clip": 1.03820825, + "balance_loss_mlp": 1.0192349, + "epoch": 0.8434089884262739, + "flos": 48148246635840.0, + "grad_norm": 1.5641379637081578, + "language_loss": 0.56824273, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.58963323, + "num_input_tokens_seen": 302538865, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10577393, + "step": 14028, + "time_per_iteration": 2.809243679046631 + }, + { + "auxiliary_loss_clip": 0.01109531, + "auxiliary_loss_mlp": 0.01024089, + "balance_loss_clip": 1.03808141, + "balance_loss_mlp": 1.01388466, + "epoch": 0.8434691116789418, + "flos": 29003037504480.0, + "grad_norm": 2.0010078955672133, + "language_loss": 0.63680828, + "learning_rate": 2.51477510323578e-07, + "loss": 0.65814447, + "num_input_tokens_seen": 302557970, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10192871, + "step": 14029, + "time_per_iteration": 4.057652950286865 + }, + { + "auxiliary_loss_clip": 0.01106766, + "auxiliary_loss_mlp": 0.01028421, + "balance_loss_clip": 1.03816116, + "balance_loss_mlp": 1.01794899, + "epoch": 0.8435292349316098, + "flos": 27664131912480.0, + "grad_norm": 1.6004195913181076, + "language_loss": 0.75127888, + "learning_rate": 2.51288477067956e-07, + "loss": 0.77263069, + "num_input_tokens_seen": 302578915, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.10473633, + "step": 14030, + "time_per_iteration": 2.664581537246704 + }, + { + "auxiliary_loss_clip": 0.01109942, + "auxiliary_loss_mlp": 0.01029916, + "balance_loss_clip": 1.03896713, + "balance_loss_mlp": 1.01857376, + "epoch": 0.8435893581842777, + "flos": 22992872699520.0, + "grad_norm": 1.821458140617079, + "language_loss": 0.83363712, + "learning_rate": 2.510995101236502e-07, + "loss": 0.85503566, + "num_input_tokens_seen": 302596300, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11340332, + "step": 14031, + "time_per_iteration": 2.600985527038574 + }, + { + "auxiliary_loss_clip": 0.01107864, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.03730273, + "balance_loss_mlp": 1.0184294, + "epoch": 0.8436494814369457, + "flos": 25263339786720.0, + "grad_norm": 2.2012243604570774, + "language_loss": 0.80625659, + "learning_rate": 2.509106094978266e-07, + "loss": 0.82762349, + "num_input_tokens_seen": 302614975, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10388184, + "step": 14032, + "time_per_iteration": 2.6748199462890625 + }, + { + "auxiliary_loss_clip": 0.01110476, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.03709555, + "balance_loss_mlp": 1.01770616, + "epoch": 0.8437096046896138, + "flos": 27667251743040.0, + "grad_norm": 1.5813072240967287, + "language_loss": 0.75600189, + "learning_rate": 2.507217751976478e-07, + "loss": 0.77741194, + "num_input_tokens_seen": 302636415, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.1282959, + "step": 14033, + "time_per_iteration": 2.6344449520111084 + }, + { + "auxiliary_loss_clip": 0.01109121, + "auxiliary_loss_mlp": 0.01032602, + "balance_loss_clip": 1.03669882, + "balance_loss_mlp": 1.02201056, + "epoch": 0.8437697279422817, + "flos": 19742675421600.0, + "grad_norm": 1.7633715103777958, + "language_loss": 0.83226466, + "learning_rate": 2.505330072302743e-07, + "loss": 0.85368192, + "num_input_tokens_seen": 302653605, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10601807, + "step": 14034, + "time_per_iteration": 2.676079750061035 + }, + { + "auxiliary_loss_clip": 0.01112218, + "auxiliary_loss_mlp": 0.01025192, + "balance_loss_clip": 1.03952837, + "balance_loss_mlp": 1.01409936, + "epoch": 0.8438298511949497, + "flos": 35100909519840.0, + "grad_norm": 1.69566414855202, + "language_loss": 0.78389227, + "learning_rate": 2.503443056028656e-07, + "loss": 0.80526632, + "num_input_tokens_seen": 302673965, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11102295, + "step": 14035, + "time_per_iteration": 4.13451623916626 + }, + { + "auxiliary_loss_clip": 0.01109828, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.03825712, + "balance_loss_mlp": 1.02138484, + "epoch": 0.8438899744476176, + "flos": 41151909392640.0, + "grad_norm": 1.3771065924052621, + "language_loss": 0.72084785, + "learning_rate": 2.501556703225751e-07, + "loss": 0.7422719, + "num_input_tokens_seen": 302695560, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11187744, + "step": 14036, + "time_per_iteration": 2.7045650482177734 + }, + { + "auxiliary_loss_clip": 0.01105045, + "auxiliary_loss_mlp": 0.01023974, + "balance_loss_clip": 1.03751087, + "balance_loss_mlp": 1.01447928, + "epoch": 0.8439500977002856, + "flos": 30641321342880.0, + "grad_norm": 1.6769345169763752, + "language_loss": 0.69586271, + "learning_rate": 2.49967101396557e-07, + "loss": 0.71715295, + "num_input_tokens_seen": 302713480, + "router_z_loss_clip": 0.67431641, + "router_z_loss_mlp": 0.09503174, + "step": 14037, + "time_per_iteration": 2.69575572013855 + }, + { + "auxiliary_loss_clip": 0.01107404, + "auxiliary_loss_mlp": 0.01023478, + "balance_loss_clip": 1.03624153, + "balance_loss_mlp": 1.01276731, + "epoch": 0.8440102209529535, + "flos": 40085322854400.0, + "grad_norm": 1.9009458921834923, + "language_loss": 0.68706995, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.70837879, + "num_input_tokens_seen": 302736860, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10717773, + "step": 14038, + "time_per_iteration": 4.03134822845459 + }, + { + "auxiliary_loss_clip": 0.01111502, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.03880584, + "balance_loss_mlp": 1.02123725, + "epoch": 0.8440703442056215, + "flos": 28956084327360.0, + "grad_norm": 1.5467287591136372, + "language_loss": 0.76300663, + "learning_rate": 2.49590162635938e-07, + "loss": 0.78444946, + "num_input_tokens_seen": 302757745, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11560059, + "step": 14039, + "time_per_iteration": 2.6403653621673584 + }, + { + "auxiliary_loss_clip": 0.01115842, + "auxiliary_loss_mlp": 0.01028485, + "balance_loss_clip": 1.0399195, + "balance_loss_mlp": 1.01704085, + "epoch": 0.8441304674582895, + "flos": 24640353960480.0, + "grad_norm": 2.373720288415507, + "language_loss": 0.7933771, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.81482035, + "num_input_tokens_seen": 302774885, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11437988, + "step": 14040, + "time_per_iteration": 2.643810272216797 + }, + { + "auxiliary_loss_clip": 0.01113365, + "auxiliary_loss_mlp": 0.01034554, + "balance_loss_clip": 1.04094386, + "balance_loss_mlp": 1.02267528, + "epoch": 0.8441905907109575, + "flos": 24673247641440.0, + "grad_norm": 2.2388993243533832, + "language_loss": 0.69327879, + "learning_rate": 2.492134893781821e-07, + "loss": 0.71475804, + "num_input_tokens_seen": 302791035, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11877441, + "step": 14041, + "time_per_iteration": 2.7414255142211914 + }, + { + "auxiliary_loss_clip": 0.01114773, + "auxiliary_loss_mlp": 0.01033999, + "balance_loss_clip": 1.04013288, + "balance_loss_mlp": 1.02314496, + "epoch": 0.8442507139636254, + "flos": 16492275557280.0, + "grad_norm": 2.153046630835867, + "language_loss": 0.69010735, + "learning_rate": 2.490252523307341e-07, + "loss": 0.71159506, + "num_input_tokens_seen": 302808650, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.10858154, + "step": 14042, + "time_per_iteration": 2.7178871631622314 + }, + { + "auxiliary_loss_clip": 0.01107217, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.03772414, + "balance_loss_mlp": 1.01992154, + "epoch": 0.8443108372162934, + "flos": 22232430793440.0, + "grad_norm": 1.7609763005011059, + "language_loss": 0.75014848, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.77152634, + "num_input_tokens_seen": 302824605, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10656738, + "step": 14043, + "time_per_iteration": 2.595189332962036 + }, + { + "auxiliary_loss_clip": 0.01108881, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.03768456, + "balance_loss_mlp": 1.0177238, + "epoch": 0.8443709604689613, + "flos": 19651321656000.0, + "grad_norm": 2.1849463025630267, + "language_loss": 0.71796882, + "learning_rate": 2.486489774343865e-07, + "loss": 0.7393443, + "num_input_tokens_seen": 302840170, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.109375, + "step": 14044, + "time_per_iteration": 2.6287312507629395 + }, + { + "auxiliary_loss_clip": 0.01107916, + "auxiliary_loss_mlp": 0.01027647, + "balance_loss_clip": 1.03707039, + "balance_loss_mlp": 1.01650119, + "epoch": 0.8444310837216293, + "flos": 22588648433280.0, + "grad_norm": 1.633102916417435, + "language_loss": 0.75009185, + "learning_rate": 2.484609395997559e-07, + "loss": 0.77144754, + "num_input_tokens_seen": 302858320, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.1114502, + "step": 14045, + "time_per_iteration": 2.6067392826080322 + }, + { + "auxiliary_loss_clip": 0.01108343, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.03647518, + "balance_loss_mlp": 1.02017808, + "epoch": 0.8444912069742974, + "flos": 18229165754400.0, + "grad_norm": 2.961186749528706, + "language_loss": 0.78653729, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.80793214, + "num_input_tokens_seen": 302875255, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10955811, + "step": 14046, + "time_per_iteration": 2.6171061992645264 + }, + { + "auxiliary_loss_clip": 0.01112499, + "auxiliary_loss_mlp": 0.01026484, + "balance_loss_clip": 1.03841329, + "balance_loss_mlp": 1.01517057, + "epoch": 0.8445513302269653, + "flos": 24550823472480.0, + "grad_norm": 2.9622034202313516, + "language_loss": 0.78036362, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.80175346, + "num_input_tokens_seen": 302894690, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11315918, + "step": 14047, + "time_per_iteration": 2.5994338989257812 + }, + { + "auxiliary_loss_clip": 0.01112039, + "auxiliary_loss_mlp": 0.01029585, + "balance_loss_clip": 1.04035997, + "balance_loss_mlp": 1.01841521, + "epoch": 0.8446114534796333, + "flos": 38036048364000.0, + "grad_norm": 1.998129901425585, + "language_loss": 0.72064996, + "learning_rate": 2.478972246355935e-07, + "loss": 0.74206626, + "num_input_tokens_seen": 302912405, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11169434, + "step": 14048, + "time_per_iteration": 2.7120251655578613 + }, + { + "auxiliary_loss_clip": 0.01110258, + "auxiliary_loss_mlp": 0.01034882, + "balance_loss_clip": 1.03850508, + "balance_loss_mlp": 1.0235157, + "epoch": 0.8446715767323012, + "flos": 29222325789120.0, + "grad_norm": 2.1512935252446717, + "language_loss": 0.73370337, + "learning_rate": 2.477094525178667e-07, + "loss": 0.75515479, + "num_input_tokens_seen": 302932525, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.1137085, + "step": 14049, + "time_per_iteration": 2.6214118003845215 + }, + { + "auxiliary_loss_clip": 0.01029081, + "auxiliary_loss_mlp": 0.01001697, + "balance_loss_clip": 1.00677323, + "balance_loss_mlp": 1.00078654, + "epoch": 0.8447316999849692, + "flos": 82955936535840.0, + "grad_norm": 0.8020985510764764, + "language_loss": 0.60675919, + "learning_rate": 2.475217468471729e-07, + "loss": 0.62706697, + "num_input_tokens_seen": 302991285, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.00909424, + "step": 14050, + "time_per_iteration": 3.2212488651275635 + }, + { + "auxiliary_loss_clip": 0.01109988, + "auxiliary_loss_mlp": 0.01027144, + "balance_loss_clip": 1.0376482, + "balance_loss_mlp": 1.0154314, + "epoch": 0.8447918232376371, + "flos": 27355758829920.0, + "grad_norm": 2.374989059252388, + "language_loss": 0.72655225, + "learning_rate": 2.473341076306303e-07, + "loss": 0.74792361, + "num_input_tokens_seen": 303009515, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11712646, + "step": 14051, + "time_per_iteration": 2.694182872772217 + }, + { + "auxiliary_loss_clip": 0.01107943, + "auxiliary_loss_mlp": 0.01026866, + "balance_loss_clip": 1.03760266, + "balance_loss_mlp": 1.01591706, + "epoch": 0.8448519464903052, + "flos": 28912048394400.0, + "grad_norm": 2.9104786141827326, + "language_loss": 0.74913228, + "learning_rate": 2.471465348753547e-07, + "loss": 0.77048039, + "num_input_tokens_seen": 303026905, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10955811, + "step": 14052, + "time_per_iteration": 2.65813946723938 + }, + { + "auxiliary_loss_clip": 0.01104065, + "auxiliary_loss_mlp": 0.0102487, + "balance_loss_clip": 1.03728497, + "balance_loss_mlp": 1.0153873, + "epoch": 0.8449120697429731, + "flos": 16760016158400.0, + "grad_norm": 2.107536131277452, + "language_loss": 0.73697692, + "learning_rate": 2.469590285884575e-07, + "loss": 0.75826627, + "num_input_tokens_seen": 303045245, + "router_z_loss_clip": 0.66748047, + "router_z_loss_mlp": 0.09484863, + "step": 14053, + "time_per_iteration": 2.6678247451782227 + }, + { + "auxiliary_loss_clip": 0.01109084, + "auxiliary_loss_mlp": 0.01025805, + "balance_loss_clip": 1.03888953, + "balance_loss_mlp": 1.01517129, + "epoch": 0.8449721929956411, + "flos": 25486112557440.0, + "grad_norm": 2.7520143777431594, + "language_loss": 0.74155003, + "learning_rate": 2.467715887770494e-07, + "loss": 0.76289892, + "num_input_tokens_seen": 303065205, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10620117, + "step": 14054, + "time_per_iteration": 2.653524398803711 + }, + { + "auxiliary_loss_clip": 0.01114928, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.03942466, + "balance_loss_mlp": 1.01975393, + "epoch": 0.845032316248309, + "flos": 40532489087040.0, + "grad_norm": 1.4805229574271188, + "language_loss": 0.78193372, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.80339497, + "num_input_tokens_seen": 303088250, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11437988, + "step": 14055, + "time_per_iteration": 2.764617919921875 + }, + { + "auxiliary_loss_clip": 0.01109822, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.03917551, + "balance_loss_mlp": 1.0196749, + "epoch": 0.845092439500977, + "flos": 28779089732640.0, + "grad_norm": 1.6763911719188263, + "language_loss": 0.73014951, + "learning_rate": 2.463969086091302e-07, + "loss": 0.75155032, + "num_input_tokens_seen": 303109280, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10583496, + "step": 14056, + "time_per_iteration": 2.642648935317993 + }, + { + "auxiliary_loss_clip": 0.01116196, + "auxiliary_loss_mlp": 0.01035646, + "balance_loss_clip": 1.04088855, + "balance_loss_mlp": 1.02438068, + "epoch": 0.8451525627536449, + "flos": 16269543303840.0, + "grad_norm": 2.386214077423933, + "language_loss": 0.67198515, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.69350356, + "num_input_tokens_seen": 303126075, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11260986, + "step": 14057, + "time_per_iteration": 2.688678026199341 + }, + { + "auxiliary_loss_clip": 0.01112124, + "auxiliary_loss_mlp": 0.01030331, + "balance_loss_clip": 1.03882074, + "balance_loss_mlp": 1.01894021, + "epoch": 0.8452126860063129, + "flos": 33944184734400.0, + "grad_norm": 2.8325428855918307, + "language_loss": 0.77746868, + "learning_rate": 2.460224944284284e-07, + "loss": 0.79889321, + "num_input_tokens_seen": 303146920, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.1138916, + "step": 14058, + "time_per_iteration": 2.653958559036255 + }, + { + "auxiliary_loss_clip": 0.01112328, + "auxiliary_loss_mlp": 0.0103157, + "balance_loss_clip": 1.03896368, + "balance_loss_mlp": 1.02046585, + "epoch": 0.845272809258981, + "flos": 33098183033760.0, + "grad_norm": 1.7798014391678385, + "language_loss": 0.69872069, + "learning_rate": 2.45835387101033e-07, + "loss": 0.72015965, + "num_input_tokens_seen": 303167885, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11096191, + "step": 14059, + "time_per_iteration": 2.6956729888916016 + }, + { + "auxiliary_loss_clip": 0.01113808, + "auxiliary_loss_mlp": 0.0103517, + "balance_loss_clip": 1.03850174, + "balance_loss_mlp": 1.02290988, + "epoch": 0.8453329325116489, + "flos": 22369279114080.0, + "grad_norm": 3.121351373065302, + "language_loss": 0.57295644, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.59444624, + "num_input_tokens_seen": 303185000, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12261963, + "step": 14060, + "time_per_iteration": 2.58916974067688 + }, + { + "auxiliary_loss_clip": 0.01113824, + "auxiliary_loss_mlp": 0.01032471, + "balance_loss_clip": 1.03831148, + "balance_loss_mlp": 1.02018023, + "epoch": 0.8453930557643169, + "flos": 27668467261440.0, + "grad_norm": 9.285065327465023, + "language_loss": 0.7571454, + "learning_rate": 2.454613720076277e-07, + "loss": 0.77860838, + "num_input_tokens_seen": 303205210, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.1229248, + "step": 14061, + "time_per_iteration": 2.661144495010376 + }, + { + "auxiliary_loss_clip": 0.01113592, + "auxiliary_loss_mlp": 0.01026339, + "balance_loss_clip": 1.03908563, + "balance_loss_mlp": 1.01481152, + "epoch": 0.8454531790169848, + "flos": 27439576381440.0, + "grad_norm": 2.110264031138489, + "language_loss": 0.71098256, + "learning_rate": 2.452744642558013e-07, + "loss": 0.73238188, + "num_input_tokens_seen": 303224655, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11523438, + "step": 14062, + "time_per_iteration": 2.6247048377990723 + }, + { + "auxiliary_loss_clip": 0.01028947, + "auxiliary_loss_mlp": 0.01001216, + "balance_loss_clip": 1.00659406, + "balance_loss_mlp": 1.00029266, + "epoch": 0.8455133022696528, + "flos": 71110818760320.0, + "grad_norm": 0.6371484867378732, + "language_loss": 0.52618051, + "learning_rate": 2.450876230433432e-07, + "loss": 0.54648209, + "num_input_tokens_seen": 303289645, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00922394, + "step": 14063, + "time_per_iteration": 4.797551870346069 + }, + { + "auxiliary_loss_clip": 0.01107423, + "auxiliary_loss_mlp": 0.01025046, + "balance_loss_clip": 1.03891921, + "balance_loss_mlp": 1.01537836, + "epoch": 0.8455734255223207, + "flos": 26065102968000.0, + "grad_norm": 2.1771107832362695, + "language_loss": 0.81590134, + "learning_rate": 2.449008483773378e-07, + "loss": 0.83722603, + "num_input_tokens_seen": 303308350, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.09667969, + "step": 14064, + "time_per_iteration": 2.737874984741211 + }, + { + "auxiliary_loss_clip": 0.01114921, + "auxiliary_loss_mlp": 0.01034668, + "balance_loss_clip": 1.04050016, + "balance_loss_mlp": 1.02278304, + "epoch": 0.8456335487749888, + "flos": 24952171011840.0, + "grad_norm": 2.392155819378926, + "language_loss": 0.72624004, + "learning_rate": 2.447141402648685e-07, + "loss": 0.74773598, + "num_input_tokens_seen": 303325230, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11895752, + "step": 14065, + "time_per_iteration": 2.6190524101257324 + }, + { + "auxiliary_loss_clip": 0.01109508, + "auxiliary_loss_mlp": 0.01027376, + "balance_loss_clip": 1.03951335, + "balance_loss_mlp": 1.01650393, + "epoch": 0.8456936720276567, + "flos": 35192303802720.0, + "grad_norm": 1.9784964980075364, + "language_loss": 0.77569211, + "learning_rate": 2.445274987130146e-07, + "loss": 0.79706091, + "num_input_tokens_seen": 303345810, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10870361, + "step": 14066, + "time_per_iteration": 2.7227210998535156 + }, + { + "auxiliary_loss_clip": 0.01113161, + "auxiliary_loss_mlp": 0.01029803, + "balance_loss_clip": 1.04071808, + "balance_loss_mlp": 1.01807916, + "epoch": 0.8457537952803247, + "flos": 27665793120960.0, + "grad_norm": 1.830960508578076, + "language_loss": 0.70043463, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.72186428, + "num_input_tokens_seen": 303365140, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11724854, + "step": 14067, + "time_per_iteration": 2.6408417224884033 + }, + { + "auxiliary_loss_clip": 0.01107992, + "auxiliary_loss_mlp": 0.01026829, + "balance_loss_clip": 1.03636813, + "balance_loss_mlp": 1.01599264, + "epoch": 0.8458139185329926, + "flos": 41247355403520.0, + "grad_norm": 2.122713948848722, + "language_loss": 0.71174884, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.73309696, + "num_input_tokens_seen": 303386150, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10839844, + "step": 14068, + "time_per_iteration": 2.719106674194336 + }, + { + "auxiliary_loss_clip": 0.01029541, + "auxiliary_loss_mlp": 0.01002754, + "balance_loss_clip": 1.00707531, + "balance_loss_mlp": 1.00176692, + "epoch": 0.8458740417856606, + "flos": 85774890872160.0, + "grad_norm": 0.7008105127303229, + "language_loss": 0.60440648, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.62472939, + "num_input_tokens_seen": 303453770, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.00986481, + "step": 14069, + "time_per_iteration": 4.674065351486206 + }, + { + "auxiliary_loss_clip": 0.01112309, + "auxiliary_loss_mlp": 0.01030495, + "balance_loss_clip": 1.03890324, + "balance_loss_mlp": 1.0199275, + "epoch": 0.8459341650383285, + "flos": 29493550876320.0, + "grad_norm": 2.1462041943230172, + "language_loss": 0.74645108, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.76787913, + "num_input_tokens_seen": 303474520, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.10565186, + "step": 14070, + "time_per_iteration": 2.6839754581451416 + }, + { + "auxiliary_loss_clip": 0.01110493, + "auxiliary_loss_mlp": 0.01028279, + "balance_loss_clip": 1.03871584, + "balance_loss_mlp": 1.01727629, + "epoch": 0.8459942882909965, + "flos": 46590012241920.0, + "grad_norm": 1.7846435007951944, + "language_loss": 0.67468548, + "learning_rate": 2.435952896106039e-07, + "loss": 0.69607329, + "num_input_tokens_seen": 303497345, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10998535, + "step": 14071, + "time_per_iteration": 2.7565829753875732 + }, + { + "auxiliary_loss_clip": 0.01029126, + "auxiliary_loss_mlp": 0.01003598, + "balance_loss_clip": 1.00671184, + "balance_loss_mlp": 1.00267851, + "epoch": 0.8460544115436646, + "flos": 78237481042080.0, + "grad_norm": 0.7367924230799522, + "language_loss": 0.60993218, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.63025945, + "num_input_tokens_seen": 303554890, + "router_z_loss_clip": 0.22412109, + "router_z_loss_mlp": 0.00919342, + "step": 14072, + "time_per_iteration": 3.101332187652588 + }, + { + "auxiliary_loss_clip": 0.01113635, + "auxiliary_loss_mlp": 0.01033498, + "balance_loss_clip": 1.03874457, + "balance_loss_mlp": 1.02113581, + "epoch": 0.8461145347963325, + "flos": 29493510359040.0, + "grad_norm": 2.159171212074166, + "language_loss": 0.72761673, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.74908805, + "num_input_tokens_seen": 303574380, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12359619, + "step": 14073, + "time_per_iteration": 2.657846212387085 + }, + { + "auxiliary_loss_clip": 0.01116979, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.03968072, + "balance_loss_mlp": 1.02228427, + "epoch": 0.8461746580490005, + "flos": 42578846333280.0, + "grad_norm": 2.100169576909764, + "language_loss": 0.77872074, + "learning_rate": 2.430367633291155e-07, + "loss": 0.80023497, + "num_input_tokens_seen": 303594910, + "router_z_loss_clip": 0.77294922, + "router_z_loss_mlp": 0.12176514, + "step": 14074, + "time_per_iteration": 4.246251106262207 + }, + { + "auxiliary_loss_clip": 0.0111257, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.0403254, + "balance_loss_mlp": 1.01993585, + "epoch": 0.8462347813016684, + "flos": 31185068070240.0, + "grad_norm": 3.2179485592004564, + "language_loss": 0.75198412, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.77342129, + "num_input_tokens_seen": 303613520, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11206055, + "step": 14075, + "time_per_iteration": 2.7089591026306152 + }, + { + "auxiliary_loss_clip": 0.01109134, + "auxiliary_loss_mlp": 0.01027661, + "balance_loss_clip": 1.03783107, + "balance_loss_mlp": 1.01558518, + "epoch": 0.8462949045543364, + "flos": 26021188586880.0, + "grad_norm": 2.3305607106373634, + "language_loss": 0.72854018, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.74990821, + "num_input_tokens_seen": 303631225, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.12072754, + "step": 14076, + "time_per_iteration": 2.641143560409546 + }, + { + "auxiliary_loss_clip": 0.01114098, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.03928554, + "balance_loss_mlp": 1.02093816, + "epoch": 0.8463550278070043, + "flos": 27622445981760.0, + "grad_norm": 1.8978212786814945, + "language_loss": 0.77009654, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.79155958, + "num_input_tokens_seen": 303649175, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.1126709, + "step": 14077, + "time_per_iteration": 3.9072985649108887 + }, + { + "auxiliary_loss_clip": 0.01115891, + "auxiliary_loss_mlp": 0.01031297, + "balance_loss_clip": 1.04024279, + "balance_loss_mlp": 1.0201571, + "epoch": 0.8464151510596724, + "flos": 15868884558240.0, + "grad_norm": 2.41102708603961, + "language_loss": 0.7539863, + "learning_rate": 2.422929943924643e-07, + "loss": 0.77545822, + "num_input_tokens_seen": 303665915, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11138916, + "step": 14078, + "time_per_iteration": 2.5979666709899902 + }, + { + "auxiliary_loss_clip": 0.01108549, + "auxiliary_loss_mlp": 0.01027113, + "balance_loss_clip": 1.03760934, + "balance_loss_mlp": 1.01540661, + "epoch": 0.8464752743123403, + "flos": 19162874665440.0, + "grad_norm": 2.4529779481643357, + "language_loss": 0.84846199, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.86981857, + "num_input_tokens_seen": 303679985, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11706543, + "step": 14079, + "time_per_iteration": 2.619906187057495 + }, + { + "auxiliary_loss_clip": 0.01119737, + "auxiliary_loss_mlp": 0.01036551, + "balance_loss_clip": 1.04086447, + "balance_loss_mlp": 1.0242784, + "epoch": 0.8465353975650083, + "flos": 26422171470720.0, + "grad_norm": 3.213202998062622, + "language_loss": 0.58304572, + "learning_rate": 2.419215098104965e-07, + "loss": 0.60460865, + "num_input_tokens_seen": 303698470, + "router_z_loss_clip": 0.78857422, + "router_z_loss_mlp": 0.1227417, + "step": 14080, + "time_per_iteration": 2.6629374027252197 + }, + { + "auxiliary_loss_clip": 0.01113456, + "auxiliary_loss_mlp": 0.01030937, + "balance_loss_clip": 1.03802848, + "balance_loss_mlp": 1.01930809, + "epoch": 0.8465955208176762, + "flos": 22592619126720.0, + "grad_norm": 2.238477722873299, + "language_loss": 0.66304016, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.684484, + "num_input_tokens_seen": 303716415, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11627197, + "step": 14081, + "time_per_iteration": 2.5863068103790283 + }, + { + "auxiliary_loss_clip": 0.01113443, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.0392549, + "balance_loss_mlp": 1.01899409, + "epoch": 0.8466556440703442, + "flos": 29530415250720.0, + "grad_norm": 1.8660972853591091, + "language_loss": 0.73283511, + "learning_rate": 2.41550291894576e-07, + "loss": 0.75426096, + "num_input_tokens_seen": 303734490, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.10144043, + "step": 14082, + "time_per_iteration": 2.6476359367370605 + }, + { + "auxiliary_loss_clip": 0.01111387, + "auxiliary_loss_mlp": 0.01025422, + "balance_loss_clip": 1.03716779, + "balance_loss_mlp": 1.01458621, + "epoch": 0.8467157673230121, + "flos": 24862681041120.0, + "grad_norm": 1.9334009198441453, + "language_loss": 0.75576735, + "learning_rate": 2.413647829539809e-07, + "loss": 0.77713543, + "num_input_tokens_seen": 303752310, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1083374, + "step": 14083, + "time_per_iteration": 2.596195697784424 + }, + { + "auxiliary_loss_clip": 0.01114765, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.038239, + "balance_loss_mlp": 1.01613343, + "epoch": 0.8467758905756801, + "flos": 34744367741760.0, + "grad_norm": 1.8212328169308334, + "language_loss": 0.66457242, + "learning_rate": 2.411793407010092e-07, + "loss": 0.68600297, + "num_input_tokens_seen": 303776065, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12164307, + "step": 14084, + "time_per_iteration": 2.6989099979400635 + }, + { + "auxiliary_loss_clip": 0.01113299, + "auxiliary_loss_mlp": 0.01028253, + "balance_loss_clip": 1.04073727, + "balance_loss_mlp": 1.01726222, + "epoch": 0.8468360138283482, + "flos": 14266573714080.0, + "grad_norm": 2.181056621064566, + "language_loss": 0.69718099, + "learning_rate": 2.409939651426938e-07, + "loss": 0.71859646, + "num_input_tokens_seen": 303793500, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10992432, + "step": 14085, + "time_per_iteration": 2.63900089263916 + }, + { + "auxiliary_loss_clip": 0.01109856, + "auxiliary_loss_mlp": 0.01028374, + "balance_loss_clip": 1.03748298, + "balance_loss_mlp": 1.01796138, + "epoch": 0.8468961370810161, + "flos": 30026884662720.0, + "grad_norm": 1.7872560839955103, + "language_loss": 0.71104264, + "learning_rate": 2.408086562860634e-07, + "loss": 0.73242497, + "num_input_tokens_seen": 303814835, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10412598, + "step": 14086, + "time_per_iteration": 2.765483856201172 + }, + { + "auxiliary_loss_clip": 0.01109681, + "auxiliary_loss_mlp": 0.01031844, + "balance_loss_clip": 1.03800917, + "balance_loss_mlp": 1.02047741, + "epoch": 0.8469562603336841, + "flos": 23927837646240.0, + "grad_norm": 1.827092846163465, + "language_loss": 0.74880099, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.77021629, + "num_input_tokens_seen": 303834505, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.1137085, + "step": 14087, + "time_per_iteration": 2.6557798385620117 + }, + { + "auxiliary_loss_clip": 0.01111231, + "auxiliary_loss_mlp": 0.0102277, + "balance_loss_clip": 1.03930569, + "balance_loss_mlp": 1.01153433, + "epoch": 0.847016383586352, + "flos": 27624147707520.0, + "grad_norm": 3.920650957439455, + "language_loss": 0.73939431, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.76073432, + "num_input_tokens_seen": 303855050, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11242676, + "step": 14088, + "time_per_iteration": 2.715855836868286 + }, + { + "auxiliary_loss_clip": 0.0111282, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.03871238, + "balance_loss_mlp": 1.02294612, + "epoch": 0.84707650683902, + "flos": 25575602528160.0, + "grad_norm": 2.4438235463693854, + "language_loss": 0.72226232, + "learning_rate": 2.402531299965387e-07, + "loss": 0.74373454, + "num_input_tokens_seen": 303875635, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11456299, + "step": 14089, + "time_per_iteration": 2.601073741912842 + }, + { + "auxiliary_loss_clip": 0.01108301, + "auxiliary_loss_mlp": 0.01026861, + "balance_loss_clip": 1.03890896, + "balance_loss_mlp": 1.01592338, + "epoch": 0.8471366300916879, + "flos": 29398996245600.0, + "grad_norm": 1.5447748932646792, + "language_loss": 0.79042983, + "learning_rate": 2.400680880168928e-07, + "loss": 0.81178141, + "num_input_tokens_seen": 303896750, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10943604, + "step": 14090, + "time_per_iteration": 2.718358278274536 + }, + { + "auxiliary_loss_clip": 0.01113046, + "auxiliary_loss_mlp": 0.01036683, + "balance_loss_clip": 1.03862834, + "balance_loss_mlp": 1.02396894, + "epoch": 0.847196753344356, + "flos": 22369643769600.0, + "grad_norm": 2.2434481413169185, + "language_loss": 0.77076226, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.79225957, + "num_input_tokens_seen": 303915435, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.12719727, + "step": 14091, + "time_per_iteration": 2.6003947257995605 + }, + { + "auxiliary_loss_clip": 0.01028854, + "auxiliary_loss_mlp": 0.01001614, + "balance_loss_clip": 1.00656986, + "balance_loss_mlp": 1.00068605, + "epoch": 0.8472568765970239, + "flos": 60480926372160.0, + "grad_norm": 0.8433354028269914, + "language_loss": 0.59387195, + "learning_rate": 2.396982042749982e-07, + "loss": 0.61417663, + "num_input_tokens_seen": 303977245, + "router_z_loss_clip": 0.22290039, + "router_z_loss_mlp": 0.00926971, + "step": 14092, + "time_per_iteration": 3.3174211978912354 + }, + { + "auxiliary_loss_clip": 0.01110061, + "auxiliary_loss_mlp": 0.01031406, + "balance_loss_clip": 1.03696764, + "balance_loss_mlp": 1.01950264, + "epoch": 0.8473169998496919, + "flos": 23524464242880.0, + "grad_norm": 2.023905332199627, + "language_loss": 0.70114726, + "learning_rate": 2.395133625267756e-07, + "loss": 0.72256196, + "num_input_tokens_seen": 303996055, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11901855, + "step": 14093, + "time_per_iteration": 2.6240642070770264 + }, + { + "auxiliary_loss_clip": 0.01106846, + "auxiliary_loss_mlp": 0.0102488, + "balance_loss_clip": 1.03641665, + "balance_loss_mlp": 1.01430595, + "epoch": 0.8473771231023598, + "flos": 21567556450080.0, + "grad_norm": 2.1432658759347842, + "language_loss": 0.83464444, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.85596168, + "num_input_tokens_seen": 304012205, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10577393, + "step": 14094, + "time_per_iteration": 2.5854947566986084 + }, + { + "auxiliary_loss_clip": 0.01107205, + "auxiliary_loss_mlp": 0.01033941, + "balance_loss_clip": 1.03825641, + "balance_loss_mlp": 1.02278924, + "epoch": 0.8474372463550278, + "flos": 32164636191840.0, + "grad_norm": 1.726625889534479, + "language_loss": 0.71327555, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.73468709, + "num_input_tokens_seen": 304033475, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.11157227, + "step": 14095, + "time_per_iteration": 2.6898746490478516 + }, + { + "auxiliary_loss_clip": 0.0110825, + "auxiliary_loss_mlp": 0.01033098, + "balance_loss_clip": 1.03762484, + "balance_loss_mlp": 1.02222657, + "epoch": 0.8474973696076957, + "flos": 28555628168160.0, + "grad_norm": 2.0696950559161125, + "language_loss": 0.80942571, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.83083922, + "num_input_tokens_seen": 304051845, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10864258, + "step": 14096, + "time_per_iteration": 2.6133670806884766 + }, + { + "auxiliary_loss_clip": 0.011127, + "auxiliary_loss_mlp": 0.01028107, + "balance_loss_clip": 1.03762984, + "balance_loss_mlp": 1.01629329, + "epoch": 0.8475574928603637, + "flos": 30598338859200.0, + "grad_norm": 1.7874083083486711, + "language_loss": 0.77152669, + "learning_rate": 2.387746631822374e-07, + "loss": 0.79293478, + "num_input_tokens_seen": 304069965, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.1182251, + "step": 14097, + "time_per_iteration": 2.7258365154266357 + }, + { + "auxiliary_loss_clip": 0.01111516, + "auxiliary_loss_mlp": 0.01025131, + "balance_loss_clip": 1.04013801, + "balance_loss_mlp": 1.01452172, + "epoch": 0.8476176161130318, + "flos": 24364064213280.0, + "grad_norm": 1.7640443043424936, + "language_loss": 0.80691838, + "learning_rate": 2.385901552932048e-07, + "loss": 0.82828486, + "num_input_tokens_seen": 304086805, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10614014, + "step": 14098, + "time_per_iteration": 2.5958714485168457 + }, + { + "auxiliary_loss_clip": 0.01109975, + "auxiliary_loss_mlp": 0.01030108, + "balance_loss_clip": 1.0385623, + "balance_loss_mlp": 1.01846147, + "epoch": 0.8476777393656997, + "flos": 25973262995040.0, + "grad_norm": 2.2840531348290534, + "language_loss": 0.71903831, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.74043918, + "num_input_tokens_seen": 304105865, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11639404, + "step": 14099, + "time_per_iteration": 2.634856700897217 + }, + { + "auxiliary_loss_clip": 0.01109512, + "auxiliary_loss_mlp": 0.01028155, + "balance_loss_clip": 1.03736544, + "balance_loss_mlp": 1.01572728, + "epoch": 0.8477378626183677, + "flos": 36571963428000.0, + "grad_norm": 1.9433394431388333, + "language_loss": 0.63276476, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.65414143, + "num_input_tokens_seen": 304128300, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12426758, + "step": 14100, + "time_per_iteration": 2.681422472000122 + }, + { + "auxiliary_loss_clip": 0.0111267, + "auxiliary_loss_mlp": 0.01032709, + "balance_loss_clip": 1.03801668, + "balance_loss_mlp": 1.02097249, + "epoch": 0.8477979858710356, + "flos": 29575707219360.0, + "grad_norm": 2.682454241958148, + "language_loss": 0.73594713, + "learning_rate": 2.380370324111085e-07, + "loss": 0.75740099, + "num_input_tokens_seen": 304143695, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11743164, + "step": 14101, + "time_per_iteration": 2.6699507236480713 + }, + { + "auxiliary_loss_clip": 0.01111552, + "auxiliary_loss_mlp": 0.01029535, + "balance_loss_clip": 1.03840148, + "balance_loss_mlp": 1.01856804, + "epoch": 0.8478581091237036, + "flos": 31229792796960.0, + "grad_norm": 1.6731125876646034, + "language_loss": 0.71426177, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.73567265, + "num_input_tokens_seen": 304165800, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.10955811, + "step": 14102, + "time_per_iteration": 4.242453575134277 + }, + { + "auxiliary_loss_clip": 0.01113643, + "auxiliary_loss_mlp": 0.01029621, + "balance_loss_clip": 1.0385623, + "balance_loss_mlp": 1.01810586, + "epoch": 0.8479182323763715, + "flos": 14711470979040.0, + "grad_norm": 2.179123264753785, + "language_loss": 0.82126296, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.84269559, + "num_input_tokens_seen": 304182910, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.1151123, + "step": 14103, + "time_per_iteration": 2.6227524280548096 + }, + { + "auxiliary_loss_clip": 0.01110999, + "auxiliary_loss_mlp": 0.0102957, + "balance_loss_clip": 1.04021525, + "balance_loss_mlp": 1.01881194, + "epoch": 0.8479783556290396, + "flos": 26153174833920.0, + "grad_norm": 2.681794071645072, + "language_loss": 0.78542912, + "learning_rate": 2.374845108533079e-07, + "loss": 0.80683482, + "num_input_tokens_seen": 304200175, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10742188, + "step": 14104, + "time_per_iteration": 2.636315107345581 + }, + { + "auxiliary_loss_clip": 0.01113752, + "auxiliary_loss_mlp": 0.01033026, + "balance_loss_clip": 1.0399785, + "balance_loss_mlp": 1.02151608, + "epoch": 0.8480384788817075, + "flos": 23968227024000.0, + "grad_norm": 2.196598995670802, + "language_loss": 0.78846502, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.80993283, + "num_input_tokens_seen": 304217775, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1151123, + "step": 14105, + "time_per_iteration": 2.5868518352508545 + }, + { + "auxiliary_loss_clip": 0.01116945, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.04030967, + "balance_loss_mlp": 1.01641226, + "epoch": 0.8480986021343755, + "flos": 27441764314560.0, + "grad_norm": 2.21206345871489, + "language_loss": 0.50560117, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.5270524, + "num_input_tokens_seen": 304235760, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11773682, + "step": 14106, + "time_per_iteration": 2.6807796955108643 + }, + { + "auxiliary_loss_clip": 0.01111147, + "auxiliary_loss_mlp": 0.01029068, + "balance_loss_clip": 1.03881097, + "balance_loss_mlp": 1.01797569, + "epoch": 0.8481587253870434, + "flos": 26955059567040.0, + "grad_norm": 3.126398799755093, + "language_loss": 0.7561214, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.77752364, + "num_input_tokens_seen": 304253985, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11090088, + "step": 14107, + "time_per_iteration": 2.6069016456604004 + }, + { + "auxiliary_loss_clip": 0.01110654, + "auxiliary_loss_mlp": 0.0102764, + "balance_loss_clip": 1.03835964, + "balance_loss_mlp": 1.01673222, + "epoch": 0.8482188486397114, + "flos": 40978723422240.0, + "grad_norm": 2.5036967261349767, + "language_loss": 0.73552883, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.75691175, + "num_input_tokens_seen": 304276785, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10906982, + "step": 14108, + "time_per_iteration": 4.138775587081909 + }, + { + "auxiliary_loss_clip": 0.01107669, + "auxiliary_loss_mlp": 0.01024626, + "balance_loss_clip": 1.03782189, + "balance_loss_mlp": 1.0122695, + "epoch": 0.8482789718923793, + "flos": 25526258831520.0, + "grad_norm": 1.8569008477286646, + "language_loss": 0.72607809, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.74740106, + "num_input_tokens_seen": 304296310, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.12365723, + "step": 14109, + "time_per_iteration": 2.6525816917419434 + }, + { + "auxiliary_loss_clip": 0.01107004, + "auxiliary_loss_mlp": 0.0103291, + "balance_loss_clip": 1.03709865, + "balance_loss_mlp": 1.02179956, + "epoch": 0.8483390951450474, + "flos": 15735399171840.0, + "grad_norm": 2.1735009991072616, + "language_loss": 0.73782086, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.75922, + "num_input_tokens_seen": 304311715, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11108398, + "step": 14110, + "time_per_iteration": 2.6382975578308105 + }, + { + "auxiliary_loss_clip": 0.01111801, + "auxiliary_loss_mlp": 0.01030985, + "balance_loss_clip": 1.03988874, + "balance_loss_mlp": 1.0202446, + "epoch": 0.8483992183977154, + "flos": 30605064727680.0, + "grad_norm": 1.7982360333451364, + "language_loss": 0.76076138, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.78218925, + "num_input_tokens_seen": 304331910, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10748291, + "step": 14111, + "time_per_iteration": 2.6784985065460205 + }, + { + "auxiliary_loss_clip": 0.01109618, + "auxiliary_loss_mlp": 0.01024396, + "balance_loss_clip": 1.03853905, + "balance_loss_mlp": 1.01434708, + "epoch": 0.8484593416503833, + "flos": 31185554277600.0, + "grad_norm": 1.678755333369068, + "language_loss": 0.67249727, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.6938374, + "num_input_tokens_seen": 304351405, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.1005249, + "step": 14112, + "time_per_iteration": 2.640460252761841 + }, + { + "auxiliary_loss_clip": 0.01112178, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.03863502, + "balance_loss_mlp": 1.01915109, + "epoch": 0.8485194649030513, + "flos": 33188118694560.0, + "grad_norm": 2.897520756493806, + "language_loss": 0.73470736, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.75612736, + "num_input_tokens_seen": 304372935, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10668945, + "step": 14113, + "time_per_iteration": 4.152140140533447 + }, + { + "auxiliary_loss_clip": 0.01110211, + "auxiliary_loss_mlp": 0.01027155, + "balance_loss_clip": 1.03843474, + "balance_loss_mlp": 1.01643777, + "epoch": 0.8485795881557192, + "flos": 29536452325440.0, + "grad_norm": 2.4180134818242864, + "language_loss": 0.66629779, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.68767136, + "num_input_tokens_seen": 304393070, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10723877, + "step": 14114, + "time_per_iteration": 2.683347702026367 + }, + { + "auxiliary_loss_clip": 0.01112648, + "auxiliary_loss_mlp": 0.01029358, + "balance_loss_clip": 1.03889155, + "balance_loss_mlp": 1.01797307, + "epoch": 0.8486397114083872, + "flos": 25797159780480.0, + "grad_norm": 1.553704143646122, + "language_loss": 0.78814507, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.80956519, + "num_input_tokens_seen": 304411195, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11376953, + "step": 14115, + "time_per_iteration": 2.6369879245758057 + }, + { + "auxiliary_loss_clip": 0.01110707, + "auxiliary_loss_mlp": 0.01029454, + "balance_loss_clip": 1.03806996, + "balance_loss_mlp": 1.01870763, + "epoch": 0.8486998346610551, + "flos": 24373180601280.0, + "grad_norm": 1.855469137438758, + "language_loss": 0.79077029, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.81217194, + "num_input_tokens_seen": 304429425, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10748291, + "step": 14116, + "time_per_iteration": 2.662287712097168 + }, + { + "auxiliary_loss_clip": 0.01112933, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.03789186, + "balance_loss_mlp": 1.02023125, + "epoch": 0.8487599579137232, + "flos": 24150529382400.0, + "grad_norm": 1.834252987823395, + "language_loss": 0.68865287, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.71010005, + "num_input_tokens_seen": 304447460, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11553955, + "step": 14117, + "time_per_iteration": 3.9161806106567383 + }, + { + "auxiliary_loss_clip": 0.01111406, + "auxiliary_loss_mlp": 0.01027442, + "balance_loss_clip": 1.0371294, + "balance_loss_mlp": 1.01611757, + "epoch": 0.8488200811663911, + "flos": 32208915228480.0, + "grad_norm": 2.179565786472641, + "language_loss": 0.64551771, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.66690612, + "num_input_tokens_seen": 304468230, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11340332, + "step": 14118, + "time_per_iteration": 2.6960437297821045 + }, + { + "auxiliary_loss_clip": 0.01109262, + "auxiliary_loss_mlp": 0.01030888, + "balance_loss_clip": 1.03770161, + "balance_loss_mlp": 1.02024257, + "epoch": 0.8488802044190591, + "flos": 19961599050720.0, + "grad_norm": 2.724456165674927, + "language_loss": 0.73105454, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.75245607, + "num_input_tokens_seen": 304484860, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10644531, + "step": 14119, + "time_per_iteration": 2.6314706802368164 + }, + { + "auxiliary_loss_clip": 0.0110989, + "auxiliary_loss_mlp": 0.01031214, + "balance_loss_clip": 1.03689694, + "balance_loss_mlp": 1.01900148, + "epoch": 0.848940327671727, + "flos": 23438539792800.0, + "grad_norm": 1.6720609816279817, + "language_loss": 0.77923983, + "learning_rate": 2.345478926864446e-07, + "loss": 0.80065089, + "num_input_tokens_seen": 304503575, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12219238, + "step": 14120, + "time_per_iteration": 2.6742124557495117 + }, + { + "auxiliary_loss_clip": 0.01111466, + "auxiliary_loss_mlp": 0.01025688, + "balance_loss_clip": 1.03825164, + "balance_loss_mlp": 1.01416671, + "epoch": 0.849000450924395, + "flos": 26687400000480.0, + "grad_norm": 2.131753627958884, + "language_loss": 0.75697517, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.77834672, + "num_input_tokens_seen": 304525005, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11517334, + "step": 14121, + "time_per_iteration": 2.644420862197876 + }, + { + "auxiliary_loss_clip": 0.01028845, + "auxiliary_loss_mlp": 0.01001036, + "balance_loss_clip": 1.00646341, + "balance_loss_mlp": 1.00005448, + "epoch": 0.8490605741770629, + "flos": 86837668786080.0, + "grad_norm": 0.8125986135154504, + "language_loss": 0.60180002, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.62209886, + "num_input_tokens_seen": 304585220, + "router_z_loss_clip": 0.22424316, + "router_z_loss_mlp": 0.0098114, + "step": 14122, + "time_per_iteration": 3.229896306991577 + }, + { + "auxiliary_loss_clip": 0.01111488, + "auxiliary_loss_mlp": 0.01030039, + "balance_loss_clip": 1.03870308, + "balance_loss_mlp": 1.01888704, + "epoch": 0.849120697429731, + "flos": 30472835376960.0, + "grad_norm": 4.261707365676753, + "language_loss": 0.79907346, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.82048875, + "num_input_tokens_seen": 304604665, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11157227, + "step": 14123, + "time_per_iteration": 2.675178050994873 + }, + { + "auxiliary_loss_clip": 0.01108315, + "auxiliary_loss_mlp": 0.01027587, + "balance_loss_clip": 1.03802466, + "balance_loss_mlp": 1.0168879, + "epoch": 0.8491808206823989, + "flos": 28104855897600.0, + "grad_norm": 2.67234628109862, + "language_loss": 0.83168328, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.8530423, + "num_input_tokens_seen": 304620600, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10693359, + "step": 14124, + "time_per_iteration": 2.6030852794647217 + }, + { + "auxiliary_loss_clip": 0.01114413, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.04153407, + "balance_loss_mlp": 1.01932347, + "epoch": 0.8492409439350669, + "flos": 29136887546400.0, + "grad_norm": 2.0641290902226963, + "language_loss": 0.71741748, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.73886979, + "num_input_tokens_seen": 304639540, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11499023, + "step": 14125, + "time_per_iteration": 2.6561410427093506 + }, + { + "auxiliary_loss_clip": 0.01115043, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.03917944, + "balance_loss_mlp": 1.02051771, + "epoch": 0.8493010671877349, + "flos": 27356852796480.0, + "grad_norm": 1.601356091720288, + "language_loss": 0.73546314, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.75694263, + "num_input_tokens_seen": 304660595, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.1237793, + "step": 14126, + "time_per_iteration": 2.64847469329834 + }, + { + "auxiliary_loss_clip": 0.01108932, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.03765821, + "balance_loss_mlp": 1.01910353, + "epoch": 0.8493611904404028, + "flos": 21389265302400.0, + "grad_norm": 1.6489286240656786, + "language_loss": 0.67703998, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.69842833, + "num_input_tokens_seen": 304679580, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10797119, + "step": 14127, + "time_per_iteration": 2.643667459487915 + }, + { + "auxiliary_loss_clip": 0.01112584, + "auxiliary_loss_mlp": 0.01025873, + "balance_loss_clip": 1.03790784, + "balance_loss_mlp": 1.01416087, + "epoch": 0.8494213136930708, + "flos": 23748695635680.0, + "grad_norm": 2.2126319827742225, + "language_loss": 0.69394612, + "learning_rate": 2.330860086502211e-07, + "loss": 0.71533072, + "num_input_tokens_seen": 304698385, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11712646, + "step": 14128, + "time_per_iteration": 2.5908708572387695 + }, + { + "auxiliary_loss_clip": 0.01110071, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.03898001, + "balance_loss_mlp": 1.01933324, + "epoch": 0.8494814369457387, + "flos": 22903423246080.0, + "grad_norm": 2.031102373843954, + "language_loss": 0.78159624, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.80300331, + "num_input_tokens_seen": 304715430, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11297607, + "step": 14129, + "time_per_iteration": 2.682602643966675 + }, + { + "auxiliary_loss_clip": 0.01110352, + "auxiliary_loss_mlp": 0.01030595, + "balance_loss_clip": 1.03841484, + "balance_loss_mlp": 1.01974106, + "epoch": 0.8495415601984068, + "flos": 28469298545280.0, + "grad_norm": 1.934792144212318, + "language_loss": 0.68286008, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.70426953, + "num_input_tokens_seen": 304734345, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10864258, + "step": 14130, + "time_per_iteration": 2.606510877609253 + }, + { + "auxiliary_loss_clip": 0.01112793, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.03859913, + "balance_loss_mlp": 1.01857567, + "epoch": 0.8496016834510747, + "flos": 32473698068160.0, + "grad_norm": 2.069424635978285, + "language_loss": 0.71167552, + "learning_rate": 2.3253890747186e-07, + "loss": 0.73310697, + "num_input_tokens_seen": 304755030, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11779785, + "step": 14131, + "time_per_iteration": 2.6982154846191406 + }, + { + "auxiliary_loss_clip": 0.01109842, + "auxiliary_loss_mlp": 0.01027683, + "balance_loss_clip": 1.03647125, + "balance_loss_mlp": 1.01653731, + "epoch": 0.8496618067037427, + "flos": 31091891027040.0, + "grad_norm": 2.149508535010185, + "language_loss": 0.6854822, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.70685744, + "num_input_tokens_seen": 304774320, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.1114502, + "step": 14132, + "time_per_iteration": 2.709498643875122 + }, + { + "auxiliary_loss_clip": 0.01107036, + "auxiliary_loss_mlp": 0.01034676, + "balance_loss_clip": 1.03578162, + "balance_loss_mlp": 1.02425683, + "epoch": 0.8497219299564106, + "flos": 30962457368640.0, + "grad_norm": 1.6576769404472254, + "language_loss": 0.70383751, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.72525465, + "num_input_tokens_seen": 304795355, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10418701, + "step": 14133, + "time_per_iteration": 2.659181594848633 + }, + { + "auxiliary_loss_clip": 0.01028839, + "auxiliary_loss_mlp": 0.01001856, + "balance_loss_clip": 1.00639963, + "balance_loss_mlp": 1.00091827, + "epoch": 0.8497820532090786, + "flos": 82704929571360.0, + "grad_norm": 0.7248667208363245, + "language_loss": 0.57551146, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.5958184, + "num_input_tokens_seen": 304863915, + "router_z_loss_clip": 0.2244873, + "router_z_loss_mlp": 0.0093689, + "step": 14134, + "time_per_iteration": 3.357311725616455 + }, + { + "auxiliary_loss_clip": 0.01111472, + "auxiliary_loss_mlp": 0.01029703, + "balance_loss_clip": 1.03734446, + "balance_loss_mlp": 1.01804447, + "epoch": 0.8498421764617465, + "flos": 28602419276160.0, + "grad_norm": 2.3554122183877273, + "language_loss": 0.78990358, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.8113153, + "num_input_tokens_seen": 304881555, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11669922, + "step": 14135, + "time_per_iteration": 2.6393232345581055 + }, + { + "auxiliary_loss_clip": 0.01113024, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.03892648, + "balance_loss_mlp": 1.01885557, + "epoch": 0.8499022997144146, + "flos": 21612726866880.0, + "grad_norm": 1.8842339775458907, + "language_loss": 0.63174033, + "learning_rate": 2.316284127127044e-07, + "loss": 0.65317571, + "num_input_tokens_seen": 304898760, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11657715, + "step": 14136, + "time_per_iteration": 2.7408134937286377 + }, + { + "auxiliary_loss_clip": 0.01115534, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.04077744, + "balance_loss_mlp": 1.01833606, + "epoch": 0.8499624229670825, + "flos": 22681541855520.0, + "grad_norm": 4.90620833788747, + "language_loss": 0.83730698, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.85876584, + "num_input_tokens_seen": 304915465, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12036133, + "step": 14137, + "time_per_iteration": 2.593573808670044 + }, + { + "auxiliary_loss_clip": 0.01108142, + "auxiliary_loss_mlp": 0.0102706, + "balance_loss_clip": 1.03798676, + "balance_loss_mlp": 1.0168798, + "epoch": 0.8500225462197505, + "flos": 29707774500960.0, + "grad_norm": 2.870581296370002, + "language_loss": 0.79003203, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.81138408, + "num_input_tokens_seen": 304933190, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10180664, + "step": 14138, + "time_per_iteration": 2.6845953464508057 + }, + { + "auxiliary_loss_clip": 0.0111174, + "auxiliary_loss_mlp": 0.01027988, + "balance_loss_clip": 1.03919566, + "balance_loss_mlp": 1.01703894, + "epoch": 0.8500826694724185, + "flos": 20188504584000.0, + "grad_norm": 2.1595407314125468, + "language_loss": 0.64854717, + "learning_rate": 2.310829204839073e-07, + "loss": 0.66994441, + "num_input_tokens_seen": 304951110, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10949707, + "step": 14139, + "time_per_iteration": 2.591379404067993 + }, + { + "auxiliary_loss_clip": 0.01108962, + "auxiliary_loss_mlp": 0.01028208, + "balance_loss_clip": 1.03777885, + "balance_loss_mlp": 1.01752126, + "epoch": 0.8501427927250864, + "flos": 19876039256160.0, + "grad_norm": 1.566694919554421, + "language_loss": 0.70504373, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.7264154, + "num_input_tokens_seen": 304969095, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10687256, + "step": 14140, + "time_per_iteration": 2.6703078746795654 + }, + { + "auxiliary_loss_clip": 0.01113048, + "auxiliary_loss_mlp": 0.01030305, + "balance_loss_clip": 1.03785205, + "balance_loss_mlp": 1.01874137, + "epoch": 0.8502029159777544, + "flos": 32565092351040.0, + "grad_norm": 2.2904441623243392, + "language_loss": 0.640876, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.66230953, + "num_input_tokens_seen": 304989315, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11560059, + "step": 14141, + "time_per_iteration": 4.107525587081909 + }, + { + "auxiliary_loss_clip": 0.0111284, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.03967667, + "balance_loss_mlp": 1.02067232, + "epoch": 0.8502630392304223, + "flos": 43429021313760.0, + "grad_norm": 1.6772963827265583, + "language_loss": 0.71126664, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.73271471, + "num_input_tokens_seen": 305011020, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11297607, + "step": 14142, + "time_per_iteration": 2.7265751361846924 + }, + { + "auxiliary_loss_clip": 0.01109285, + "auxiliary_loss_mlp": 0.0102858, + "balance_loss_clip": 1.03666008, + "balance_loss_mlp": 1.01787508, + "epoch": 0.8503231624830904, + "flos": 26417106810720.0, + "grad_norm": 4.450926571206834, + "language_loss": 0.64921039, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.67058897, + "num_input_tokens_seen": 305033550, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10699463, + "step": 14143, + "time_per_iteration": 2.687422513961792 + }, + { + "auxiliary_loss_clip": 0.01113685, + "auxiliary_loss_mlp": 0.01031871, + "balance_loss_clip": 1.03772926, + "balance_loss_mlp": 1.0202179, + "epoch": 0.8503832857357583, + "flos": 27354097621440.0, + "grad_norm": 1.9876356081872595, + "language_loss": 0.67841542, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.69987094, + "num_input_tokens_seen": 305052885, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11657715, + "step": 14144, + "time_per_iteration": 2.608710289001465 + }, + { + "auxiliary_loss_clip": 0.01108293, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.0381453, + "balance_loss_mlp": 1.01852798, + "epoch": 0.8504434089884263, + "flos": 22815108276480.0, + "grad_norm": 3.9172768621925673, + "language_loss": 0.64959943, + "learning_rate": 2.299937473050777e-07, + "loss": 0.67097938, + "num_input_tokens_seen": 305071995, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11169434, + "step": 14145, + "time_per_iteration": 2.627633810043335 + }, + { + "auxiliary_loss_clip": 0.01111009, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.03894913, + "balance_loss_mlp": 1.02074504, + "epoch": 0.8505035322410942, + "flos": 24414015669120.0, + "grad_norm": 1.8973286263953772, + "language_loss": 0.85695446, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.87838799, + "num_input_tokens_seen": 305090190, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.1159668, + "step": 14146, + "time_per_iteration": 2.597475051879883 + }, + { + "auxiliary_loss_clip": 0.01107801, + "auxiliary_loss_mlp": 0.01025148, + "balance_loss_clip": 1.03600085, + "balance_loss_mlp": 1.01401949, + "epoch": 0.8505636554937622, + "flos": 25395204481920.0, + "grad_norm": 1.9865048078494532, + "language_loss": 0.83754551, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.85887504, + "num_input_tokens_seen": 305109355, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11120605, + "step": 14147, + "time_per_iteration": 2.6458680629730225 + }, + { + "auxiliary_loss_clip": 0.01115653, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.03970194, + "balance_loss_mlp": 1.01899779, + "epoch": 0.8506237787464301, + "flos": 17295092187840.0, + "grad_norm": 2.7424463099207, + "language_loss": 0.85457277, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.8760342, + "num_input_tokens_seen": 305124165, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11486816, + "step": 14148, + "time_per_iteration": 4.002156496047974 + }, + { + "auxiliary_loss_clip": 0.01110031, + "auxiliary_loss_mlp": 0.01030235, + "balance_loss_clip": 1.03909409, + "balance_loss_mlp": 1.01824856, + "epoch": 0.8506839019990982, + "flos": 28380092195520.0, + "grad_norm": 1.7515485173737704, + "language_loss": 0.71896029, + "learning_rate": 2.292689741370204e-07, + "loss": 0.74036288, + "num_input_tokens_seen": 305143940, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11981201, + "step": 14149, + "time_per_iteration": 2.6283164024353027 + }, + { + "auxiliary_loss_clip": 0.0111049, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.03813148, + "balance_loss_mlp": 1.01674592, + "epoch": 0.8507440252517661, + "flos": 28869268497120.0, + "grad_norm": 1.8592313260989533, + "language_loss": 0.76014012, + "learning_rate": 2.290879486935804e-07, + "loss": 0.78152633, + "num_input_tokens_seen": 305163505, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11383057, + "step": 14150, + "time_per_iteration": 2.665562391281128 + }, + { + "auxiliary_loss_clip": 0.01111798, + "auxiliary_loss_mlp": 0.01030063, + "balance_loss_clip": 1.04136658, + "balance_loss_mlp": 1.0193758, + "epoch": 0.8508041485044341, + "flos": 22770424067040.0, + "grad_norm": 1.8745650583504438, + "language_loss": 0.72638774, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.74780631, + "num_input_tokens_seen": 305182325, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10693359, + "step": 14151, + "time_per_iteration": 2.607757091522217 + }, + { + "auxiliary_loss_clip": 0.01028612, + "auxiliary_loss_mlp": 0.01000671, + "balance_loss_clip": 1.0062952, + "balance_loss_mlp": 0.9997865, + "epoch": 0.8508642717571021, + "flos": 64073768001120.0, + "grad_norm": 0.8805904144931144, + "language_loss": 0.59568441, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.61597723, + "num_input_tokens_seen": 305230775, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.00884247, + "step": 14152, + "time_per_iteration": 3.0021119117736816 + }, + { + "auxiliary_loss_clip": 0.0102914, + "auxiliary_loss_mlp": 0.01002319, + "balance_loss_clip": 1.00672293, + "balance_loss_mlp": 1.0013926, + "epoch": 0.85092439500977, + "flos": 84557355999840.0, + "grad_norm": 0.6882897616395602, + "language_loss": 0.61143637, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63175094, + "num_input_tokens_seen": 305296000, + "router_z_loss_clip": 0.22424316, + "router_z_loss_mlp": 0.00925446, + "step": 14153, + "time_per_iteration": 4.698224306106567 + }, + { + "auxiliary_loss_clip": 0.01110933, + "auxiliary_loss_mlp": 0.01030834, + "balance_loss_clip": 1.03856385, + "balance_loss_mlp": 1.01897311, + "epoch": 0.850984518262438, + "flos": 29760643200960.0, + "grad_norm": 2.099893827675564, + "language_loss": 0.80851996, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.82993758, + "num_input_tokens_seen": 305314705, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11853027, + "step": 14154, + "time_per_iteration": 2.6313326358795166 + }, + { + "auxiliary_loss_clip": 0.01104805, + "auxiliary_loss_mlp": 0.01030518, + "balance_loss_clip": 1.03616416, + "balance_loss_mlp": 1.02022457, + "epoch": 0.851044641515106, + "flos": 28424573818560.0, + "grad_norm": 1.7222555782042226, + "language_loss": 0.79550558, + "learning_rate": 2.281838289110165e-07, + "loss": 0.81685883, + "num_input_tokens_seen": 305333870, + "router_z_loss_clip": 0.68603516, + "router_z_loss_mlp": 0.10290527, + "step": 14155, + "time_per_iteration": 2.6511192321777344 + }, + { + "auxiliary_loss_clip": 0.01110903, + "auxiliary_loss_mlp": 0.01027512, + "balance_loss_clip": 1.0363276, + "balance_loss_mlp": 1.01624036, + "epoch": 0.851104764767774, + "flos": 26906040008640.0, + "grad_norm": 1.9521172236918265, + "language_loss": 0.70792866, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.72931284, + "num_input_tokens_seen": 305352780, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11273193, + "step": 14156, + "time_per_iteration": 3.917283058166504 + }, + { + "auxiliary_loss_clip": 0.01107634, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.03787708, + "balance_loss_mlp": 1.0193578, + "epoch": 0.8511648880204419, + "flos": 25263907028640.0, + "grad_norm": 1.8481854469314265, + "language_loss": 0.74258316, + "learning_rate": 2.278226512621386e-07, + "loss": 0.76396334, + "num_input_tokens_seen": 305371370, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.11029053, + "step": 14157, + "time_per_iteration": 2.6160085201263428 + }, + { + "auxiliary_loss_clip": 0.0110823, + "auxiliary_loss_mlp": 0.01024609, + "balance_loss_clip": 1.03815556, + "balance_loss_mlp": 1.0144825, + "epoch": 0.8512250112731099, + "flos": 29315178694080.0, + "grad_norm": 2.5010043932092003, + "language_loss": 0.79016125, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.81148964, + "num_input_tokens_seen": 305387955, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10125732, + "step": 14158, + "time_per_iteration": 2.738748550415039 + }, + { + "auxiliary_loss_clip": 0.01111624, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.03886628, + "balance_loss_mlp": 1.0171411, + "epoch": 0.8512851345257778, + "flos": 26863624766880.0, + "grad_norm": 2.8256286611061214, + "language_loss": 0.78568596, + "learning_rate": 2.27461742417828e-07, + "loss": 0.80708981, + "num_input_tokens_seen": 305406285, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11621094, + "step": 14159, + "time_per_iteration": 2.613462209701538 + }, + { + "auxiliary_loss_clip": 0.01112889, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.0402317, + "balance_loss_mlp": 1.02019715, + "epoch": 0.8513452577784458, + "flos": 18095396747040.0, + "grad_norm": 1.87458702011481, + "language_loss": 0.7135756, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.73501909, + "num_input_tokens_seen": 305424500, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1126709, + "step": 14160, + "time_per_iteration": 2.6739635467529297 + }, + { + "auxiliary_loss_clip": 0.01117861, + "auxiliary_loss_mlp": 0.01031459, + "balance_loss_clip": 1.04124296, + "balance_loss_mlp": 1.0192759, + "epoch": 0.8514053810311137, + "flos": 40311174938400.0, + "grad_norm": 2.3116747736098553, + "language_loss": 0.70416951, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.72566271, + "num_input_tokens_seen": 305442990, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12182617, + "step": 14161, + "time_per_iteration": 2.744062662124634 + }, + { + "auxiliary_loss_clip": 0.01110584, + "auxiliary_loss_mlp": 0.01027252, + "balance_loss_clip": 1.03582263, + "balance_loss_mlp": 1.01636779, + "epoch": 0.8514655042837818, + "flos": 33634636650720.0, + "grad_norm": 2.243949607753205, + "language_loss": 0.78140521, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.80278361, + "num_input_tokens_seen": 305463065, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.10882568, + "step": 14162, + "time_per_iteration": 2.69026517868042 + }, + { + "auxiliary_loss_clip": 0.01112166, + "auxiliary_loss_mlp": 0.01035817, + "balance_loss_clip": 1.03935409, + "balance_loss_mlp": 1.02422416, + "epoch": 0.8515256275364497, + "flos": 43386484520160.0, + "grad_norm": 2.0028919291453864, + "language_loss": 0.76944232, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.79092216, + "num_input_tokens_seen": 305489070, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11590576, + "step": 14163, + "time_per_iteration": 2.766937017440796 + }, + { + "auxiliary_loss_clip": 0.01028845, + "auxiliary_loss_mlp": 0.0100095, + "balance_loss_clip": 1.00647211, + "balance_loss_mlp": 1.00002146, + "epoch": 0.8515857507891177, + "flos": 85668262092000.0, + "grad_norm": 0.6965719450596456, + "language_loss": 0.5493449, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.56964284, + "num_input_tokens_seen": 305551490, + "router_z_loss_clip": 0.22412109, + "router_z_loss_mlp": 0.00926208, + "step": 14164, + "time_per_iteration": 3.316286563873291 + }, + { + "auxiliary_loss_clip": 0.01110962, + "auxiliary_loss_mlp": 0.01032108, + "balance_loss_clip": 1.03840947, + "balance_loss_mlp": 1.02124786, + "epoch": 0.8516458740417857, + "flos": 27668507778720.0, + "grad_norm": 1.976593758624071, + "language_loss": 0.72388875, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.74531949, + "num_input_tokens_seen": 305570535, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10864258, + "step": 14165, + "time_per_iteration": 2.687303304672241 + }, + { + "auxiliary_loss_clip": 0.01107542, + "auxiliary_loss_mlp": 0.0102904, + "balance_loss_clip": 1.0365597, + "balance_loss_mlp": 1.01748836, + "epoch": 0.8517059972944536, + "flos": 27756620161920.0, + "grad_norm": 1.540651665907644, + "language_loss": 0.67455244, + "learning_rate": 2.26200679088697e-07, + "loss": 0.69591826, + "num_input_tokens_seen": 305590800, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11547852, + "step": 14166, + "time_per_iteration": 2.733945369720459 + }, + { + "auxiliary_loss_clip": 0.01109164, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.03743768, + "balance_loss_mlp": 1.02290547, + "epoch": 0.8517661205471216, + "flos": 26465234988960.0, + "grad_norm": 1.9058352793674158, + "language_loss": 0.73569, + "learning_rate": 2.260207961805125e-07, + "loss": 0.75712055, + "num_input_tokens_seen": 305609495, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.10986328, + "step": 14167, + "time_per_iteration": 2.6494903564453125 + }, + { + "auxiliary_loss_clip": 0.01108952, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.03716326, + "balance_loss_mlp": 1.0189147, + "epoch": 0.8518262437997896, + "flos": 30962740989600.0, + "grad_norm": 12.637474210296423, + "language_loss": 0.80173993, + "learning_rate": 2.258409805417969e-07, + "loss": 0.82312691, + "num_input_tokens_seen": 305629420, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.1083374, + "step": 14168, + "time_per_iteration": 2.665578603744507 + }, + { + "auxiliary_loss_clip": 0.01108803, + "auxiliary_loss_mlp": 0.01024598, + "balance_loss_clip": 1.03662395, + "balance_loss_mlp": 1.01385689, + "epoch": 0.8518863670524576, + "flos": 33233167559520.0, + "grad_norm": 1.9087995646647393, + "language_loss": 0.76416636, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.78550029, + "num_input_tokens_seen": 305649835, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10742188, + "step": 14169, + "time_per_iteration": 2.6939966678619385 + }, + { + "auxiliary_loss_clip": 0.01113196, + "auxiliary_loss_mlp": 0.01028626, + "balance_loss_clip": 1.03873968, + "balance_loss_mlp": 1.01710474, + "epoch": 0.8519464903051255, + "flos": 25574873217120.0, + "grad_norm": 2.0142197170549743, + "language_loss": 0.63709849, + "learning_rate": 2.254815511000452e-07, + "loss": 0.65851671, + "num_input_tokens_seen": 305668840, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11517334, + "step": 14170, + "time_per_iteration": 2.5931596755981445 + }, + { + "auxiliary_loss_clip": 0.01107442, + "auxiliary_loss_mlp": 0.01025152, + "balance_loss_clip": 1.03560102, + "balance_loss_mlp": 1.01392233, + "epoch": 0.8520066135577935, + "flos": 22502521396800.0, + "grad_norm": 2.4892249795233665, + "language_loss": 0.86658537, + "learning_rate": 2.253019373106384e-07, + "loss": 0.88791132, + "num_input_tokens_seen": 305686955, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11230469, + "step": 14171, + "time_per_iteration": 2.6453969478607178 + }, + { + "auxiliary_loss_clip": 0.01113215, + "auxiliary_loss_mlp": 0.01036681, + "balance_loss_clip": 1.03995872, + "balance_loss_mlp": 1.02532601, + "epoch": 0.8520667368104614, + "flos": 35545604198400.0, + "grad_norm": 2.2625285315046155, + "language_loss": 0.5458765, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.56737548, + "num_input_tokens_seen": 305706290, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11352539, + "step": 14172, + "time_per_iteration": 2.68426251411438 + }, + { + "auxiliary_loss_clip": 0.01107281, + "auxiliary_loss_mlp": 0.01026464, + "balance_loss_clip": 1.0372138, + "balance_loss_mlp": 1.01677203, + "epoch": 0.8521268600631294, + "flos": 19564789446720.0, + "grad_norm": 2.9029858630533396, + "language_loss": 0.69390041, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.71523786, + "num_input_tokens_seen": 305723835, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.09698486, + "step": 14173, + "time_per_iteration": 2.67427134513855 + }, + { + "auxiliary_loss_clip": 0.01111265, + "auxiliary_loss_mlp": 0.01027739, + "balance_loss_clip": 1.03789616, + "balance_loss_mlp": 1.0157764, + "epoch": 0.8521869833157973, + "flos": 27399673211040.0, + "grad_norm": 3.7185063030723424, + "language_loss": 0.76705658, + "learning_rate": 2.247634997500205e-07, + "loss": 0.78844666, + "num_input_tokens_seen": 305741655, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11968994, + "step": 14174, + "time_per_iteration": 2.6183602809906006 + }, + { + "auxiliary_loss_clip": 0.01113604, + "auxiliary_loss_mlp": 0.01031099, + "balance_loss_clip": 1.0401268, + "balance_loss_mlp": 1.02003002, + "epoch": 0.8522471065684654, + "flos": 30471336237600.0, + "grad_norm": 1.917425624911353, + "language_loss": 0.81914294, + "learning_rate": 2.245841551883676e-07, + "loss": 0.84059, + "num_input_tokens_seen": 305761890, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11071777, + "step": 14175, + "time_per_iteration": 2.6823410987854004 + }, + { + "auxiliary_loss_clip": 0.01113957, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.03958297, + "balance_loss_mlp": 1.01721203, + "epoch": 0.8523072298211333, + "flos": 21610457899200.0, + "grad_norm": 3.98216725116104, + "language_loss": 0.65647805, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.67790627, + "num_input_tokens_seen": 305779190, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11657715, + "step": 14176, + "time_per_iteration": 2.624987840652466 + }, + { + "auxiliary_loss_clip": 0.01110226, + "auxiliary_loss_mlp": 0.01028543, + "balance_loss_clip": 1.03926444, + "balance_loss_mlp": 1.01754022, + "epoch": 0.8523673530738013, + "flos": 31049961992640.0, + "grad_norm": 1.7195932712819435, + "language_loss": 0.78896588, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.81035364, + "num_input_tokens_seen": 305799870, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10998535, + "step": 14177, + "time_per_iteration": 2.6445038318634033 + }, + { + "auxiliary_loss_clip": 0.0111219, + "auxiliary_loss_mlp": 0.01026712, + "balance_loss_clip": 1.03890324, + "balance_loss_mlp": 1.01532149, + "epoch": 0.8524274763264693, + "flos": 38351390418720.0, + "grad_norm": 1.531989846794693, + "language_loss": 0.73269761, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.75408667, + "num_input_tokens_seen": 305819695, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11395264, + "step": 14178, + "time_per_iteration": 2.7245070934295654 + }, + { + "auxiliary_loss_clip": 0.01113356, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.04033327, + "balance_loss_mlp": 1.02532172, + "epoch": 0.8524875995791372, + "flos": 21612767384160.0, + "grad_norm": 2.382003324242924, + "language_loss": 0.74826944, + "learning_rate": 2.238674502491935e-07, + "loss": 0.76976788, + "num_input_tokens_seen": 305837270, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11181641, + "step": 14179, + "time_per_iteration": 2.599104642868042 + }, + { + "auxiliary_loss_clip": 0.01109639, + "auxiliary_loss_mlp": 0.01028637, + "balance_loss_clip": 1.03882146, + "balance_loss_mlp": 1.01759863, + "epoch": 0.8525477228318052, + "flos": 26463168607680.0, + "grad_norm": 2.2078402611396015, + "language_loss": 0.81461686, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.83599961, + "num_input_tokens_seen": 305855250, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.1104126, + "step": 14180, + "time_per_iteration": 4.142724514007568 + }, + { + "auxiliary_loss_clip": 0.01110447, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.03798652, + "balance_loss_mlp": 1.02022004, + "epoch": 0.8526078460844732, + "flos": 30294341642880.0, + "grad_norm": 2.1006661205409665, + "language_loss": 0.61251324, + "learning_rate": 2.235095018591815e-07, + "loss": 0.63392389, + "num_input_tokens_seen": 305875660, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10400391, + "step": 14181, + "time_per_iteration": 2.7032418251037598 + }, + { + "auxiliary_loss_clip": 0.01110017, + "auxiliary_loss_mlp": 0.01029485, + "balance_loss_clip": 1.03975487, + "balance_loss_mlp": 1.01919723, + "epoch": 0.8526679693371412, + "flos": 16492235040000.0, + "grad_norm": 2.1412108359855995, + "language_loss": 0.72321856, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.74461353, + "num_input_tokens_seen": 305892415, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10290527, + "step": 14182, + "time_per_iteration": 2.5958211421966553 + }, + { + "auxiliary_loss_clip": 0.01109832, + "auxiliary_loss_mlp": 0.01035368, + "balance_loss_clip": 1.03934598, + "balance_loss_mlp": 1.02452564, + "epoch": 0.8527280925898091, + "flos": 28692111833280.0, + "grad_norm": 3.0053086623490155, + "language_loss": 0.70990813, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.73136014, + "num_input_tokens_seen": 305912665, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10845947, + "step": 14183, + "time_per_iteration": 2.7195498943328857 + }, + { + "auxiliary_loss_clip": 0.01109145, + "auxiliary_loss_mlp": 0.0102917, + "balance_loss_clip": 1.03955579, + "balance_loss_mlp": 1.01884031, + "epoch": 0.8527882158424771, + "flos": 24773758312320.0, + "grad_norm": 2.0878254543717345, + "language_loss": 0.72403044, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.7454136, + "num_input_tokens_seen": 305931515, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.10327148, + "step": 14184, + "time_per_iteration": 2.647186517715454 + }, + { + "auxiliary_loss_clip": 0.01111765, + "auxiliary_loss_mlp": 0.01028663, + "balance_loss_clip": 1.03914821, + "balance_loss_mlp": 1.01720726, + "epoch": 0.852848339095145, + "flos": 20990186730720.0, + "grad_norm": 2.3286960945016366, + "language_loss": 0.76703417, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.78843844, + "num_input_tokens_seen": 305949965, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11456299, + "step": 14185, + "time_per_iteration": 2.680271625518799 + }, + { + "auxiliary_loss_clip": 0.01111648, + "auxiliary_loss_mlp": 0.01024673, + "balance_loss_clip": 1.03871644, + "balance_loss_mlp": 1.01317549, + "epoch": 0.852908462347813, + "flos": 22413720219840.0, + "grad_norm": 2.0232804386459624, + "language_loss": 0.79680753, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.81817079, + "num_input_tokens_seen": 305967820, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.1151123, + "step": 14186, + "time_per_iteration": 2.60213565826416 + }, + { + "auxiliary_loss_clip": 0.01108661, + "auxiliary_loss_mlp": 0.01025827, + "balance_loss_clip": 1.03564978, + "balance_loss_mlp": 1.01426375, + "epoch": 0.8529685856004809, + "flos": 22725942444000.0, + "grad_norm": 1.8587446692357295, + "language_loss": 0.62446004, + "learning_rate": 2.224372736588449e-07, + "loss": 0.64580494, + "num_input_tokens_seen": 305985505, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11560059, + "step": 14187, + "time_per_iteration": 3.874960422515869 + }, + { + "auxiliary_loss_clip": 0.011125, + "auxiliary_loss_mlp": 0.0102869, + "balance_loss_clip": 1.03683794, + "balance_loss_mlp": 1.01665568, + "epoch": 0.853028708853149, + "flos": 36127917025920.0, + "grad_norm": 1.6022618245860425, + "language_loss": 0.76579142, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.78720331, + "num_input_tokens_seen": 306005220, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12036133, + "step": 14188, + "time_per_iteration": 2.763199806213379 + }, + { + "auxiliary_loss_clip": 0.01111237, + "auxiliary_loss_mlp": 0.01029338, + "balance_loss_clip": 1.03736329, + "balance_loss_mlp": 1.01711321, + "epoch": 0.8530888321058169, + "flos": 32157181012320.0, + "grad_norm": 1.7662738062296448, + "language_loss": 0.78500742, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.80641317, + "num_input_tokens_seen": 306023785, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12225342, + "step": 14189, + "time_per_iteration": 2.6920108795166016 + }, + { + "auxiliary_loss_clip": 0.01110866, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.03746176, + "balance_loss_mlp": 1.01764762, + "epoch": 0.8531489553584849, + "flos": 25040931671520.0, + "grad_norm": 2.542011857479059, + "language_loss": 0.79496884, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.81636667, + "num_input_tokens_seen": 306041600, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11273193, + "step": 14190, + "time_per_iteration": 2.7043304443359375 + }, + { + "auxiliary_loss_clip": 0.01111059, + "auxiliary_loss_mlp": 0.01029466, + "balance_loss_clip": 1.03843403, + "balance_loss_mlp": 1.01792657, + "epoch": 0.8532090786111529, + "flos": 25263420821280.0, + "grad_norm": 2.0955256461874416, + "language_loss": 0.75980997, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.78121519, + "num_input_tokens_seen": 306060345, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11535645, + "step": 14191, + "time_per_iteration": 2.601593494415283 + }, + { + "auxiliary_loss_clip": 0.01110829, + "auxiliary_loss_mlp": 0.01025892, + "balance_loss_clip": 1.03899801, + "balance_loss_mlp": 1.01463306, + "epoch": 0.8532692018638208, + "flos": 24195173074560.0, + "grad_norm": 1.7801498006084562, + "language_loss": 0.69285291, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.71422017, + "num_input_tokens_seen": 306078285, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11254883, + "step": 14192, + "time_per_iteration": 4.088806390762329 + }, + { + "auxiliary_loss_clip": 0.01115326, + "auxiliary_loss_mlp": 0.01036478, + "balance_loss_clip": 1.0378561, + "balance_loss_mlp": 1.02387142, + "epoch": 0.8533293251164888, + "flos": 25620286737600.0, + "grad_norm": 2.821418048287236, + "language_loss": 0.62705386, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.64857185, + "num_input_tokens_seen": 306093760, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12609863, + "step": 14193, + "time_per_iteration": 2.5968480110168457 + }, + { + "auxiliary_loss_clip": 0.01110594, + "auxiliary_loss_mlp": 0.01031165, + "balance_loss_clip": 1.03838193, + "balance_loss_mlp": 1.02009034, + "epoch": 0.8533894483691568, + "flos": 27356528658240.0, + "grad_norm": 2.013358922991878, + "language_loss": 0.76667988, + "learning_rate": 2.211894078044365e-07, + "loss": 0.78809744, + "num_input_tokens_seen": 306112595, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11077881, + "step": 14194, + "time_per_iteration": 2.6938202381134033 + }, + { + "auxiliary_loss_clip": 0.01110308, + "auxiliary_loss_mlp": 0.0102635, + "balance_loss_clip": 1.03764558, + "balance_loss_mlp": 1.01578236, + "epoch": 0.8534495716218248, + "flos": 26376595881120.0, + "grad_norm": 1.8342555033307444, + "language_loss": 0.69762647, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.71899307, + "num_input_tokens_seen": 306131800, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10565186, + "step": 14195, + "time_per_iteration": 2.6367712020874023 + }, + { + "auxiliary_loss_clip": 0.01110822, + "auxiliary_loss_mlp": 0.01028452, + "balance_loss_clip": 1.03685319, + "balance_loss_mlp": 1.01693606, + "epoch": 0.8535096948744927, + "flos": 27267079204800.0, + "grad_norm": 2.5585085994318564, + "language_loss": 0.86011362, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.88150632, + "num_input_tokens_seen": 306150590, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11517334, + "step": 14196, + "time_per_iteration": 2.687342882156372 + }, + { + "auxiliary_loss_clip": 0.01028607, + "auxiliary_loss_mlp": 0.01001286, + "balance_loss_clip": 1.00632501, + "balance_loss_mlp": 1.00035167, + "epoch": 0.8535698181271607, + "flos": 64381128151680.0, + "grad_norm": 0.7773624661640309, + "language_loss": 0.55085301, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.57115191, + "num_input_tokens_seen": 306205850, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.00933838, + "step": 14197, + "time_per_iteration": 4.505394458770752 + }, + { + "auxiliary_loss_clip": 0.01107804, + "auxiliary_loss_mlp": 0.01036288, + "balance_loss_clip": 1.03747416, + "balance_loss_mlp": 1.02513015, + "epoch": 0.8536299413798286, + "flos": 23260410714240.0, + "grad_norm": 1.6195370698919618, + "language_loss": 0.81131667, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.83275759, + "num_input_tokens_seen": 306225220, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.1116333, + "step": 14198, + "time_per_iteration": 2.6481804847717285 + }, + { + "auxiliary_loss_clip": 0.01108131, + "auxiliary_loss_mlp": 0.01029388, + "balance_loss_clip": 1.03716278, + "balance_loss_mlp": 1.01941633, + "epoch": 0.8536900646324966, + "flos": 60212774247840.0, + "grad_norm": 1.5124866494463274, + "language_loss": 0.68616927, + "learning_rate": 2.203000984963035e-07, + "loss": 0.70754445, + "num_input_tokens_seen": 306249865, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.09967041, + "step": 14199, + "time_per_iteration": 2.9126741886138916 + }, + { + "auxiliary_loss_clip": 0.01105441, + "auxiliary_loss_mlp": 0.01027565, + "balance_loss_clip": 1.03647923, + "balance_loss_mlp": 1.01718235, + "epoch": 0.8537501878851645, + "flos": 26554522373280.0, + "grad_norm": 1.6268806083963778, + "language_loss": 0.86367446, + "learning_rate": 2.201224390669072e-07, + "loss": 0.88500458, + "num_input_tokens_seen": 306270215, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.10375977, + "step": 14200, + "time_per_iteration": 2.6767828464508057 + }, + { + "auxiliary_loss_clip": 0.01109695, + "auxiliary_loss_mlp": 0.01024838, + "balance_loss_clip": 1.03718758, + "balance_loss_mlp": 1.01440668, + "epoch": 0.8538103111378326, + "flos": 27173375436960.0, + "grad_norm": 1.7989651136871032, + "language_loss": 0.77923018, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.80057549, + "num_input_tokens_seen": 306288960, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10430908, + "step": 14201, + "time_per_iteration": 2.598893880844116 + }, + { + "auxiliary_loss_clip": 0.01108949, + "auxiliary_loss_mlp": 0.01025409, + "balance_loss_clip": 1.03802562, + "balance_loss_mlp": 1.01509714, + "epoch": 0.8538704343905005, + "flos": 24774771244320.0, + "grad_norm": 1.7879938438889724, + "language_loss": 0.68587232, + "learning_rate": 2.19767322694256e-07, + "loss": 0.70721585, + "num_input_tokens_seen": 306308735, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10314941, + "step": 14202, + "time_per_iteration": 2.7000184059143066 + }, + { + "auxiliary_loss_clip": 0.0111049, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.03771734, + "balance_loss_mlp": 1.02162254, + "epoch": 0.8539305576431685, + "flos": 30206512880640.0, + "grad_norm": 1.6078443101695992, + "language_loss": 0.80222309, + "learning_rate": 2.195898657644666e-07, + "loss": 0.82365286, + "num_input_tokens_seen": 306329015, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.10882568, + "step": 14203, + "time_per_iteration": 2.644763469696045 + }, + { + "auxiliary_loss_clip": 0.01113154, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.03936243, + "balance_loss_mlp": 1.01816761, + "epoch": 0.8539906808958365, + "flos": 32564768212800.0, + "grad_norm": 3.234316170326808, + "language_loss": 0.66005051, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.681481, + "num_input_tokens_seen": 306349085, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11724854, + "step": 14204, + "time_per_iteration": 2.7418153285980225 + }, + { + "auxiliary_loss_clip": 0.01113007, + "auxiliary_loss_mlp": 0.01034371, + "balance_loss_clip": 1.03880942, + "balance_loss_mlp": 1.02269399, + "epoch": 0.8540508041485044, + "flos": 16308555094080.0, + "grad_norm": 3.72068623450237, + "language_loss": 0.59883648, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.62031031, + "num_input_tokens_seen": 306365385, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11663818, + "step": 14205, + "time_per_iteration": 2.65791654586792 + }, + { + "auxiliary_loss_clip": 0.0110952, + "auxiliary_loss_mlp": 0.01026077, + "balance_loss_clip": 1.03773046, + "balance_loss_mlp": 1.01500821, + "epoch": 0.8541109274011724, + "flos": 40000208749920.0, + "grad_norm": 2.888849576865661, + "language_loss": 0.72386783, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.74522376, + "num_input_tokens_seen": 306384585, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11077881, + "step": 14206, + "time_per_iteration": 2.771253824234009 + }, + { + "auxiliary_loss_clip": 0.01112393, + "auxiliary_loss_mlp": 0.01027306, + "balance_loss_clip": 1.0391016, + "balance_loss_mlp": 1.0162791, + "epoch": 0.8541710506538404, + "flos": 21523277413440.0, + "grad_norm": 2.6080460614582504, + "language_loss": 0.76570171, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.78709871, + "num_input_tokens_seen": 306401565, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11029053, + "step": 14207, + "time_per_iteration": 2.612034797668457 + }, + { + "auxiliary_loss_clip": 0.01112426, + "auxiliary_loss_mlp": 0.01028288, + "balance_loss_clip": 1.03902912, + "balance_loss_mlp": 1.01657605, + "epoch": 0.8542311739065084, + "flos": 24725305995840.0, + "grad_norm": 2.259419469221681, + "language_loss": 0.85225725, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.87366438, + "num_input_tokens_seen": 306419995, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11712646, + "step": 14208, + "time_per_iteration": 2.6734402179718018 + }, + { + "auxiliary_loss_clip": 0.01110579, + "auxiliary_loss_mlp": 0.01030906, + "balance_loss_clip": 1.03882432, + "balance_loss_mlp": 1.02048135, + "epoch": 0.8542912971591763, + "flos": 21701852182080.0, + "grad_norm": 1.5978232880657781, + "language_loss": 0.66324151, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.68465638, + "num_input_tokens_seen": 306439240, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.10418701, + "step": 14209, + "time_per_iteration": 2.618485927581787 + }, + { + "auxiliary_loss_clip": 0.0110901, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.0382266, + "balance_loss_mlp": 1.01875722, + "epoch": 0.8543514204118443, + "flos": 32920945335360.0, + "grad_norm": 2.592952219982089, + "language_loss": 0.70613706, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.72751945, + "num_input_tokens_seen": 306458425, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10461426, + "step": 14210, + "time_per_iteration": 2.7036855220794678 + }, + { + "auxiliary_loss_clip": 0.0110881, + "auxiliary_loss_mlp": 0.01030668, + "balance_loss_clip": 1.03702545, + "balance_loss_mlp": 1.01980782, + "epoch": 0.8544115436645122, + "flos": 29315097659520.0, + "grad_norm": 1.3915524847317466, + "language_loss": 0.70081198, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.72220677, + "num_input_tokens_seen": 306477210, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10864258, + "step": 14211, + "time_per_iteration": 2.6453707218170166 + }, + { + "auxiliary_loss_clip": 0.01111372, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.03817177, + "balance_loss_mlp": 1.01874113, + "epoch": 0.8544716669171802, + "flos": 20276981622720.0, + "grad_norm": 3.124295260196141, + "language_loss": 0.81455982, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.83597606, + "num_input_tokens_seen": 306495820, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11517334, + "step": 14212, + "time_per_iteration": 2.663431167602539 + }, + { + "auxiliary_loss_clip": 0.01111907, + "auxiliary_loss_mlp": 0.01031247, + "balance_loss_clip": 1.038234, + "balance_loss_mlp": 1.01875377, + "epoch": 0.8545317901698481, + "flos": 48816524430720.0, + "grad_norm": 1.9457770719706953, + "language_loss": 0.66454589, + "learning_rate": 2.178190108088105e-07, + "loss": 0.68597746, + "num_input_tokens_seen": 306516420, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12487793, + "step": 14213, + "time_per_iteration": 2.8603570461273193 + }, + { + "auxiliary_loss_clip": 0.01106916, + "auxiliary_loss_mlp": 0.01026358, + "balance_loss_clip": 1.03642416, + "balance_loss_mlp": 1.01520634, + "epoch": 0.8545919134225162, + "flos": 24284946666240.0, + "grad_norm": 1.8682993538083217, + "language_loss": 0.78111446, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.8024472, + "num_input_tokens_seen": 306534785, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11151123, + "step": 14214, + "time_per_iteration": 2.6287171840667725 + }, + { + "auxiliary_loss_clip": 0.0111351, + "auxiliary_loss_mlp": 0.01027505, + "balance_loss_clip": 1.03756857, + "balance_loss_mlp": 1.0154469, + "epoch": 0.8546520366751841, + "flos": 23126317568640.0, + "grad_norm": 2.792765280612471, + "language_loss": 0.6707111, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.69212127, + "num_input_tokens_seen": 306552440, + "router_z_loss_clip": 0.75878906, + "router_z_loss_mlp": 0.12054443, + "step": 14215, + "time_per_iteration": 2.6294374465942383 + }, + { + "auxiliary_loss_clip": 0.01110415, + "auxiliary_loss_mlp": 0.01025406, + "balance_loss_clip": 1.03861403, + "balance_loss_mlp": 1.01470685, + "epoch": 0.8547121599278521, + "flos": 43466979654720.0, + "grad_norm": 1.8694881483107797, + "language_loss": 0.6282118, + "learning_rate": 2.172890718362279e-07, + "loss": 0.64956999, + "num_input_tokens_seen": 306573600, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10693359, + "step": 14216, + "time_per_iteration": 2.783578395843506 + }, + { + "auxiliary_loss_clip": 0.01110173, + "auxiliary_loss_mlp": 0.01031033, + "balance_loss_clip": 1.03651452, + "balance_loss_mlp": 1.01993418, + "epoch": 0.8547722831805201, + "flos": 20633361331680.0, + "grad_norm": 3.5281992018247563, + "language_loss": 0.65487623, + "learning_rate": 2.17112560704259e-07, + "loss": 0.67628825, + "num_input_tokens_seen": 306592840, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11096191, + "step": 14217, + "time_per_iteration": 2.6370809078216553 + }, + { + "auxiliary_loss_clip": 0.0110958, + "auxiliary_loss_mlp": 0.01028087, + "balance_loss_clip": 1.03960991, + "balance_loss_mlp": 1.01778746, + "epoch": 0.854832406433188, + "flos": 28068032040480.0, + "grad_norm": 1.4963161922280492, + "language_loss": 0.64881963, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.67019629, + "num_input_tokens_seen": 306613210, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10296631, + "step": 14218, + "time_per_iteration": 2.6401150226593018 + }, + { + "auxiliary_loss_clip": 0.0111166, + "auxiliary_loss_mlp": 0.01026819, + "balance_loss_clip": 1.03698349, + "balance_loss_mlp": 1.01613164, + "epoch": 0.854892529685856, + "flos": 24907446285120.0, + "grad_norm": 1.7877954561912923, + "language_loss": 0.6996783, + "learning_rate": 2.167597412688238e-07, + "loss": 0.72106314, + "num_input_tokens_seen": 306631620, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.10699463, + "step": 14219, + "time_per_iteration": 2.690108299255371 + }, + { + "auxiliary_loss_clip": 0.01111526, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.03629231, + "balance_loss_mlp": 1.02201211, + "epoch": 0.854952652938524, + "flos": 20009443608000.0, + "grad_norm": 2.668592000368837, + "language_loss": 0.66927814, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.69072706, + "num_input_tokens_seen": 306646695, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11346436, + "step": 14220, + "time_per_iteration": 4.1924285888671875 + }, + { + "auxiliary_loss_clip": 0.01107428, + "auxiliary_loss_mlp": 0.01027368, + "balance_loss_clip": 1.03735149, + "balance_loss_mlp": 1.01668048, + "epoch": 0.855012776191192, + "flos": 25842289680000.0, + "grad_norm": 2.70073220852518, + "language_loss": 0.71292299, + "learning_rate": 2.164071923159827e-07, + "loss": 0.73427093, + "num_input_tokens_seen": 306665465, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10693359, + "step": 14221, + "time_per_iteration": 2.6087334156036377 + }, + { + "auxiliary_loss_clip": 0.01110269, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.03724647, + "balance_loss_mlp": 1.02283263, + "epoch": 0.8550728994438599, + "flos": 31897665419040.0, + "grad_norm": 2.1775569760629603, + "language_loss": 0.5956555, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.61710048, + "num_input_tokens_seen": 306685950, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11395264, + "step": 14222, + "time_per_iteration": 2.6911520957946777 + }, + { + "auxiliary_loss_clip": 0.01109367, + "auxiliary_loss_mlp": 0.0103074, + "balance_loss_clip": 1.03798771, + "balance_loss_mlp": 1.01976705, + "epoch": 0.8551330226965279, + "flos": 27804383684640.0, + "grad_norm": 1.5990201327334235, + "language_loss": 0.84015155, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.8615526, + "num_input_tokens_seen": 306705740, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10968018, + "step": 14223, + "time_per_iteration": 2.646165132522583 + }, + { + "auxiliary_loss_clip": 0.01109917, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.03955948, + "balance_loss_mlp": 1.01825809, + "epoch": 0.8551931459491958, + "flos": 27355880381760.0, + "grad_norm": 1.600737818094644, + "language_loss": 0.73735422, + "learning_rate": 2.158788761585515e-07, + "loss": 0.75874627, + "num_input_tokens_seen": 306725065, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11022949, + "step": 14224, + "time_per_iteration": 2.6782243251800537 + }, + { + "auxiliary_loss_clip": 0.01109288, + "auxiliary_loss_mlp": 0.0102806, + "balance_loss_clip": 1.03749526, + "balance_loss_mlp": 1.01699769, + "epoch": 0.8552532692018638, + "flos": 23883153436800.0, + "grad_norm": 1.8543781676805684, + "language_loss": 0.7594794, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.78085291, + "num_input_tokens_seen": 306743630, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11065674, + "step": 14225, + "time_per_iteration": 2.6125376224517822 + }, + { + "auxiliary_loss_clip": 0.01108631, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.03801358, + "balance_loss_mlp": 1.02454305, + "epoch": 0.8553133924545318, + "flos": 32253720989760.0, + "grad_norm": 1.9905895468551285, + "language_loss": 0.77015173, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.79158849, + "num_input_tokens_seen": 306763105, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.1050415, + "step": 14226, + "time_per_iteration": 2.704714298248291 + }, + { + "auxiliary_loss_clip": 0.01112856, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.03793454, + "balance_loss_mlp": 1.02267563, + "epoch": 0.8553735157071998, + "flos": 19965164571360.0, + "grad_norm": 2.3969036275302704, + "language_loss": 0.54463673, + "learning_rate": 2.153511688875702e-07, + "loss": 0.56610799, + "num_input_tokens_seen": 306779875, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11590576, + "step": 14227, + "time_per_iteration": 4.152902603149414 + }, + { + "auxiliary_loss_clip": 0.01110044, + "auxiliary_loss_mlp": 0.01031686, + "balance_loss_clip": 1.03973413, + "balance_loss_mlp": 1.02087963, + "epoch": 0.8554336389598677, + "flos": 25486720316640.0, + "grad_norm": 2.0503800804968315, + "language_loss": 0.65169168, + "learning_rate": 2.151754018031442e-07, + "loss": 0.67310899, + "num_input_tokens_seen": 306800015, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10809326, + "step": 14228, + "time_per_iteration": 2.6369457244873047 + }, + { + "auxiliary_loss_clip": 0.01112995, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.03907919, + "balance_loss_mlp": 1.02346766, + "epoch": 0.8554937622125357, + "flos": 25972006959360.0, + "grad_norm": 3.4755451586509425, + "language_loss": 0.74158192, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.7630645, + "num_input_tokens_seen": 306814160, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11810303, + "step": 14229, + "time_per_iteration": 2.647045135498047 + }, + { + "auxiliary_loss_clip": 0.01108314, + "auxiliary_loss_mlp": 0.01027842, + "balance_loss_clip": 1.03698969, + "balance_loss_mlp": 1.01728046, + "epoch": 0.8555538854652037, + "flos": 27349276065120.0, + "grad_norm": 1.7066144939933827, + "language_loss": 0.72712386, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.74848539, + "num_input_tokens_seen": 306833310, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10559082, + "step": 14230, + "time_per_iteration": 2.6304566860198975 + }, + { + "auxiliary_loss_clip": 0.01108009, + "auxiliary_loss_mlp": 0.01026603, + "balance_loss_clip": 1.0372659, + "balance_loss_mlp": 1.01561165, + "epoch": 0.8556140087178716, + "flos": 24640353960480.0, + "grad_norm": 1.879781692265414, + "language_loss": 0.82635188, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.84769797, + "num_input_tokens_seen": 306851345, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10986328, + "step": 14231, + "time_per_iteration": 4.115677833557129 + }, + { + "auxiliary_loss_clip": 0.01113333, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.04031646, + "balance_loss_mlp": 1.01944876, + "epoch": 0.8556741319705397, + "flos": 27623215810080.0, + "grad_norm": 2.5480928431683894, + "language_loss": 0.67918074, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.70062971, + "num_input_tokens_seen": 306871040, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12121582, + "step": 14232, + "time_per_iteration": 2.6630959510803223 + }, + { + "auxiliary_loss_clip": 0.01113662, + "auxiliary_loss_mlp": 0.01030622, + "balance_loss_clip": 1.03981161, + "balance_loss_mlp": 1.01878488, + "epoch": 0.8557342552232076, + "flos": 28735215868800.0, + "grad_norm": 2.987260008512691, + "language_loss": 0.67324102, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.69468391, + "num_input_tokens_seen": 306891625, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1184082, + "step": 14233, + "time_per_iteration": 2.625253438949585 + }, + { + "auxiliary_loss_clip": 0.01107679, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.03696632, + "balance_loss_mlp": 1.02009344, + "epoch": 0.8557943784758756, + "flos": 23927918680800.0, + "grad_norm": 1.8021903195846978, + "language_loss": 0.76605117, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.78743732, + "num_input_tokens_seen": 306910020, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.1083374, + "step": 14234, + "time_per_iteration": 2.647444248199463 + }, + { + "auxiliary_loss_clip": 0.01028547, + "auxiliary_loss_mlp": 0.01002396, + "balance_loss_clip": 1.00629425, + "balance_loss_mlp": 1.00145149, + "epoch": 0.8558545017285435, + "flos": 86198800186080.0, + "grad_norm": 0.7547729449047419, + "language_loss": 0.57965374, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.59996319, + "num_input_tokens_seen": 306969505, + "router_z_loss_clip": 0.22241211, + "router_z_loss_mlp": 0.00943756, + "step": 14235, + "time_per_iteration": 3.2116663455963135 + }, + { + "auxiliary_loss_clip": 0.01028857, + "auxiliary_loss_mlp": 0.01002626, + "balance_loss_clip": 1.00652432, + "balance_loss_mlp": 1.00165892, + "epoch": 0.8559146249812115, + "flos": 69123083667840.0, + "grad_norm": 0.7870380886049242, + "language_loss": 0.56591344, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58622825, + "num_input_tokens_seen": 307027710, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.00965881, + "step": 14236, + "time_per_iteration": 4.5015034675598145 + }, + { + "auxiliary_loss_clip": 0.01110007, + "auxiliary_loss_mlp": 0.01028209, + "balance_loss_clip": 1.03772271, + "balance_loss_mlp": 1.01699162, + "epoch": 0.8559747482338794, + "flos": 27928914752160.0, + "grad_norm": 1.770942251389755, + "language_loss": 0.70035851, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.72174072, + "num_input_tokens_seen": 307045515, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11212158, + "step": 14237, + "time_per_iteration": 2.6865384578704834 + }, + { + "auxiliary_loss_clip": 0.01107639, + "auxiliary_loss_mlp": 0.01028328, + "balance_loss_clip": 1.03571463, + "balance_loss_mlp": 1.01746249, + "epoch": 0.8560348714865474, + "flos": 27579017808000.0, + "grad_norm": 2.9250516133778497, + "language_loss": 0.63889778, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.6602574, + "num_input_tokens_seen": 307064470, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10876465, + "step": 14238, + "time_per_iteration": 2.618396043777466 + }, + { + "auxiliary_loss_clip": 0.01104533, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.03558445, + "balance_loss_mlp": 1.02078199, + "epoch": 0.8560949947392154, + "flos": 21879778674240.0, + "grad_norm": 1.5885711467166068, + "language_loss": 0.69312501, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.7144751, + "num_input_tokens_seen": 307083900, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.09698486, + "step": 14239, + "time_per_iteration": 2.644993782043457 + }, + { + "auxiliary_loss_clip": 0.01112664, + "auxiliary_loss_mlp": 0.01031244, + "balance_loss_clip": 1.03807843, + "balance_loss_mlp": 1.02003193, + "epoch": 0.8561551179918834, + "flos": 37859337390240.0, + "grad_norm": 2.0522237612331122, + "language_loss": 0.6661725, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.68761152, + "num_input_tokens_seen": 307104590, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11212158, + "step": 14240, + "time_per_iteration": 2.6743972301483154 + }, + { + "auxiliary_loss_clip": 0.0111146, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.03779173, + "balance_loss_mlp": 1.01792145, + "epoch": 0.8562152412445513, + "flos": 37418451336000.0, + "grad_norm": 1.78586169268098, + "language_loss": 0.61988139, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.64129364, + "num_input_tokens_seen": 307125580, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11846924, + "step": 14241, + "time_per_iteration": 2.695223569869995 + }, + { + "auxiliary_loss_clip": 0.01114085, + "auxiliary_loss_mlp": 0.01032007, + "balance_loss_clip": 1.03762412, + "balance_loss_mlp": 1.01959109, + "epoch": 0.8562753644972193, + "flos": 38530167773760.0, + "grad_norm": 1.5472991912218936, + "language_loss": 0.74506187, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.76652271, + "num_input_tokens_seen": 307147625, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12414551, + "step": 14242, + "time_per_iteration": 2.6870248317718506 + }, + { + "auxiliary_loss_clip": 0.01115297, + "auxiliary_loss_mlp": 0.01039421, + "balance_loss_clip": 1.04025435, + "balance_loss_mlp": 1.02816224, + "epoch": 0.8563354877498872, + "flos": 31986345044160.0, + "grad_norm": 7.253620753383879, + "language_loss": 0.76770329, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.78925049, + "num_input_tokens_seen": 307164665, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11260986, + "step": 14243, + "time_per_iteration": 2.6426572799682617 + }, + { + "auxiliary_loss_clip": 0.01110624, + "auxiliary_loss_mlp": 0.01027868, + "balance_loss_clip": 1.03786492, + "balance_loss_mlp": 1.01691258, + "epoch": 0.8563956110025552, + "flos": 29448988218720.0, + "grad_norm": 2.1716233458990453, + "language_loss": 0.68784571, + "learning_rate": 2.123723375556974e-07, + "loss": 0.70923066, + "num_input_tokens_seen": 307182530, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.109375, + "step": 14244, + "time_per_iteration": 2.6360678672790527 + }, + { + "auxiliary_loss_clip": 0.01028482, + "auxiliary_loss_mlp": 0.01002577, + "balance_loss_clip": 1.00612056, + "balance_loss_mlp": 1.00158501, + "epoch": 0.8564557342552233, + "flos": 68663032940160.0, + "grad_norm": 0.7570746734304836, + "language_loss": 0.5844242, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.60473484, + "num_input_tokens_seen": 307241240, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00991821, + "step": 14245, + "time_per_iteration": 3.171790599822998 + }, + { + "auxiliary_loss_clip": 0.01114085, + "auxiliary_loss_mlp": 0.01029242, + "balance_loss_clip": 1.03845692, + "balance_loss_mlp": 1.01721382, + "epoch": 0.8565158575078912, + "flos": 28602500310720.0, + "grad_norm": 2.3396912593446753, + "language_loss": 0.77414411, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.79557735, + "num_input_tokens_seen": 307261485, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12017822, + "step": 14246, + "time_per_iteration": 2.6604270935058594 + }, + { + "auxiliary_loss_clip": 0.01106512, + "auxiliary_loss_mlp": 0.0102627, + "balance_loss_clip": 1.03445685, + "balance_loss_mlp": 1.01538002, + "epoch": 0.8565759807605592, + "flos": 24862721558400.0, + "grad_norm": 3.375606409186421, + "language_loss": 0.81463969, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.83596754, + "num_input_tokens_seen": 307279160, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10894775, + "step": 14247, + "time_per_iteration": 2.7499845027923584 + }, + { + "auxiliary_loss_clip": 0.01112278, + "auxiliary_loss_mlp": 0.01029401, + "balance_loss_clip": 1.03941929, + "balance_loss_mlp": 1.0175333, + "epoch": 0.8566361040132271, + "flos": 22948310041920.0, + "grad_norm": 1.7337818526417044, + "language_loss": 0.7733022, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.79471904, + "num_input_tokens_seen": 307297920, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11859131, + "step": 14248, + "time_per_iteration": 2.6691997051239014 + }, + { + "auxiliary_loss_clip": 0.01109381, + "auxiliary_loss_mlp": 0.01029601, + "balance_loss_clip": 1.03649735, + "balance_loss_mlp": 1.01815128, + "epoch": 0.8566962272658951, + "flos": 29938367106720.0, + "grad_norm": 2.402229120355768, + "language_loss": 0.77861255, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.8000024, + "num_input_tokens_seen": 307318320, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11450195, + "step": 14249, + "time_per_iteration": 2.629589796066284 + }, + { + "auxiliary_loss_clip": 0.01107362, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.03740573, + "balance_loss_mlp": 1.02096558, + "epoch": 0.856756350518563, + "flos": 28283268597120.0, + "grad_norm": 1.80810549946026, + "language_loss": 0.78460956, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.80599999, + "num_input_tokens_seen": 307336720, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10723877, + "step": 14250, + "time_per_iteration": 2.6689670085906982 + }, + { + "auxiliary_loss_clip": 0.01106935, + "auxiliary_loss_mlp": 0.01028612, + "balance_loss_clip": 1.03731132, + "balance_loss_mlp": 1.01824117, + "epoch": 0.856816473771231, + "flos": 25391152753920.0, + "grad_norm": 2.130905303321015, + "language_loss": 0.79856253, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.81991792, + "num_input_tokens_seen": 307354120, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10369873, + "step": 14251, + "time_per_iteration": 2.5875978469848633 + }, + { + "auxiliary_loss_clip": 0.01109461, + "auxiliary_loss_mlp": 0.01029438, + "balance_loss_clip": 1.0383997, + "balance_loss_mlp": 1.01901889, + "epoch": 0.856876597023899, + "flos": 24682566615840.0, + "grad_norm": 2.0340842752311588, + "language_loss": 0.61150253, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.63289154, + "num_input_tokens_seen": 307373165, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10430908, + "step": 14252, + "time_per_iteration": 2.615156412124634 + }, + { + "auxiliary_loss_clip": 0.01115987, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.04158533, + "balance_loss_mlp": 1.01907527, + "epoch": 0.856936720276567, + "flos": 22324797491040.0, + "grad_norm": 2.101566044697455, + "language_loss": 0.69926083, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.72072834, + "num_input_tokens_seen": 307391000, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11676025, + "step": 14253, + "time_per_iteration": 2.6234819889068604 + }, + { + "auxiliary_loss_clip": 0.01028223, + "auxiliary_loss_mlp": 0.01002276, + "balance_loss_clip": 1.00588822, + "balance_loss_mlp": 1.00129378, + "epoch": 0.8569968435292349, + "flos": 85266874035360.0, + "grad_norm": 0.7963547607248991, + "language_loss": 0.591398, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61170298, + "num_input_tokens_seen": 307452865, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.0098114, + "step": 14254, + "time_per_iteration": 3.2904129028320312 + }, + { + "auxiliary_loss_clip": 0.01108126, + "auxiliary_loss_mlp": 0.01030761, + "balance_loss_clip": 1.03631055, + "balance_loss_mlp": 1.01863134, + "epoch": 0.8570569667819029, + "flos": 31540880537280.0, + "grad_norm": 1.9840118093686536, + "language_loss": 0.81137663, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.83276546, + "num_input_tokens_seen": 307471940, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.12121582, + "step": 14255, + "time_per_iteration": 2.6953678131103516 + }, + { + "auxiliary_loss_clip": 0.01108618, + "auxiliary_loss_mlp": 0.01026407, + "balance_loss_clip": 1.03835106, + "balance_loss_mlp": 1.01523709, + "epoch": 0.8571170900345708, + "flos": 28378674090720.0, + "grad_norm": 1.998533970522801, + "language_loss": 0.67222369, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.69357395, + "num_input_tokens_seen": 307488745, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11181641, + "step": 14256, + "time_per_iteration": 2.7216198444366455 + }, + { + "auxiliary_loss_clip": 0.0110984, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.0374893, + "balance_loss_mlp": 1.02011824, + "epoch": 0.8571772132872388, + "flos": 23083132498560.0, + "grad_norm": 1.7279711405738252, + "language_loss": 0.69958019, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.72098798, + "num_input_tokens_seen": 307506855, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.1081543, + "step": 14257, + "time_per_iteration": 2.659245491027832 + }, + { + "auxiliary_loss_clip": 0.0110946, + "auxiliary_loss_mlp": 0.01030159, + "balance_loss_clip": 1.03777194, + "balance_loss_mlp": 1.01925159, + "epoch": 0.8572373365399069, + "flos": 40573283637600.0, + "grad_norm": 2.255789447512365, + "language_loss": 0.77247846, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.79387468, + "num_input_tokens_seen": 307526115, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10913086, + "step": 14258, + "time_per_iteration": 2.7088189125061035 + }, + { + "auxiliary_loss_clip": 0.0110942, + "auxiliary_loss_mlp": 0.01030293, + "balance_loss_clip": 1.0387795, + "balance_loss_mlp": 1.01914144, + "epoch": 0.8572974597925748, + "flos": 28464841644480.0, + "grad_norm": 1.874515038809229, + "language_loss": 0.68103492, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.70243204, + "num_input_tokens_seen": 307545230, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11157227, + "step": 14259, + "time_per_iteration": 4.116435289382935 + }, + { + "auxiliary_loss_clip": 0.01108589, + "auxiliary_loss_mlp": 0.01031752, + "balance_loss_clip": 1.03655791, + "balance_loss_mlp": 1.02046919, + "epoch": 0.8573575830452428, + "flos": 29938934348640.0, + "grad_norm": 1.6819319505883867, + "language_loss": 0.77117676, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.79258019, + "num_input_tokens_seen": 307564900, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11273193, + "step": 14260, + "time_per_iteration": 2.636791229248047 + }, + { + "auxiliary_loss_clip": 0.01113099, + "auxiliary_loss_mlp": 0.01028105, + "balance_loss_clip": 1.03972387, + "balance_loss_mlp": 1.01685715, + "epoch": 0.8574177062979107, + "flos": 29487392249760.0, + "grad_norm": 1.8206197682152112, + "language_loss": 0.74347341, + "learning_rate": 2.09413096654806e-07, + "loss": 0.76488543, + "num_input_tokens_seen": 307583500, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11248779, + "step": 14261, + "time_per_iteration": 2.6654186248779297 + }, + { + "auxiliary_loss_clip": 0.01114205, + "auxiliary_loss_mlp": 0.01034156, + "balance_loss_clip": 1.03950095, + "balance_loss_mlp": 1.02215147, + "epoch": 0.8574778295505787, + "flos": 21878968328640.0, + "grad_norm": 1.897570041902138, + "language_loss": 0.79070789, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.81219149, + "num_input_tokens_seen": 307601430, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.12005615, + "step": 14262, + "time_per_iteration": 2.6155335903167725 + }, + { + "auxiliary_loss_clip": 0.01108052, + "auxiliary_loss_mlp": 0.01031582, + "balance_loss_clip": 1.03876209, + "balance_loss_mlp": 1.02128839, + "epoch": 0.8575379528032466, + "flos": 26332803051840.0, + "grad_norm": 1.677851386479293, + "language_loss": 0.67762703, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.69902331, + "num_input_tokens_seen": 307621495, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10296631, + "step": 14263, + "time_per_iteration": 2.601759910583496 + }, + { + "auxiliary_loss_clip": 0.01110246, + "auxiliary_loss_mlp": 0.01030282, + "balance_loss_clip": 1.0373683, + "balance_loss_mlp": 1.01899886, + "epoch": 0.8575980760559146, + "flos": 26553671510400.0, + "grad_norm": 1.565830249526282, + "language_loss": 0.79674232, + "learning_rate": 2.088929137266986e-07, + "loss": 0.81814754, + "num_input_tokens_seen": 307640840, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11279297, + "step": 14264, + "time_per_iteration": 2.6739866733551025 + }, + { + "auxiliary_loss_clip": 0.01110993, + "auxiliary_loss_mlp": 0.01028581, + "balance_loss_clip": 1.03918695, + "balance_loss_mlp": 1.01792943, + "epoch": 0.8576581993085826, + "flos": 41959426027680.0, + "grad_norm": 1.3500508424562263, + "language_loss": 0.69685656, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.7182523, + "num_input_tokens_seen": 307663820, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10662842, + "step": 14265, + "time_per_iteration": 2.7189900875091553 + }, + { + "auxiliary_loss_clip": 0.01106945, + "auxiliary_loss_mlp": 0.01024751, + "balance_loss_clip": 1.03734112, + "balance_loss_mlp": 1.01443315, + "epoch": 0.8577183225612506, + "flos": 28336258848960.0, + "grad_norm": 2.410740105865858, + "language_loss": 0.66378391, + "learning_rate": 2.085464646918027e-07, + "loss": 0.68510091, + "num_input_tokens_seen": 307682385, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10314941, + "step": 14266, + "time_per_iteration": 3.9154915809631348 + }, + { + "auxiliary_loss_clip": 0.01108984, + "auxiliary_loss_mlp": 0.01031461, + "balance_loss_clip": 1.03771639, + "balance_loss_mlp": 1.02028489, + "epoch": 0.8577784458139185, + "flos": 35147498041440.0, + "grad_norm": 1.7596031652299726, + "language_loss": 0.75544757, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.77685201, + "num_input_tokens_seen": 307704680, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11175537, + "step": 14267, + "time_per_iteration": 2.6715307235717773 + }, + { + "auxiliary_loss_clip": 0.01108229, + "auxiliary_loss_mlp": 0.0102784, + "balance_loss_clip": 1.03766692, + "balance_loss_mlp": 1.01753426, + "epoch": 0.8578385690665865, + "flos": 24105723621120.0, + "grad_norm": 1.7009247495561635, + "language_loss": 0.87855232, + "learning_rate": 2.082002873852946e-07, + "loss": 0.89991307, + "num_input_tokens_seen": 307723245, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10308838, + "step": 14268, + "time_per_iteration": 2.648815870285034 + }, + { + "auxiliary_loss_clip": 0.01111765, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.03879452, + "balance_loss_mlp": 1.02025974, + "epoch": 0.8578986923192544, + "flos": 25263177717600.0, + "grad_norm": 2.21093983074447, + "language_loss": 0.72879225, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.75022602, + "num_input_tokens_seen": 307742510, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11352539, + "step": 14269, + "time_per_iteration": 2.669445276260376 + }, + { + "auxiliary_loss_clip": 0.01111146, + "auxiliary_loss_mlp": 0.01029778, + "balance_loss_clip": 1.03845596, + "balance_loss_mlp": 1.01863754, + "epoch": 0.8579588155719224, + "flos": 44052493347360.0, + "grad_norm": 1.5830808900642066, + "language_loss": 0.66471982, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.68612897, + "num_input_tokens_seen": 307766030, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11138916, + "step": 14270, + "time_per_iteration": 4.21120548248291 + }, + { + "auxiliary_loss_clip": 0.01107971, + "auxiliary_loss_mlp": 0.01024622, + "balance_loss_clip": 1.03647947, + "balance_loss_mlp": 1.01319575, + "epoch": 0.8580189388245905, + "flos": 27886337441280.0, + "grad_norm": 3.0702884509357142, + "language_loss": 0.73771787, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.75904381, + "num_input_tokens_seen": 307785800, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11425781, + "step": 14271, + "time_per_iteration": 2.639173746109009 + }, + { + "auxiliary_loss_clip": 0.01027863, + "auxiliary_loss_mlp": 0.01001178, + "balance_loss_clip": 1.00559711, + "balance_loss_mlp": 1.00028539, + "epoch": 0.8580790620772584, + "flos": 84976774246080.0, + "grad_norm": 0.8199240305986423, + "language_loss": 0.59426379, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.61455417, + "num_input_tokens_seen": 307850995, + "router_z_loss_clip": 0.22277832, + "router_z_loss_mlp": 0.00891876, + "step": 14272, + "time_per_iteration": 3.379744529724121 + }, + { + "auxiliary_loss_clip": 0.01114006, + "auxiliary_loss_mlp": 0.01028658, + "balance_loss_clip": 1.03832912, + "balance_loss_mlp": 1.01688576, + "epoch": 0.8581391853299264, + "flos": 16269907959360.0, + "grad_norm": 2.9096395347263524, + "language_loss": 0.74929601, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.77072263, + "num_input_tokens_seen": 307868585, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11767578, + "step": 14273, + "time_per_iteration": 2.598862886428833 + }, + { + "auxiliary_loss_clip": 0.01109719, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.03739047, + "balance_loss_mlp": 1.01979482, + "epoch": 0.8581993085825943, + "flos": 23971630475520.0, + "grad_norm": 2.3135429669550907, + "language_loss": 0.81976581, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.84117401, + "num_input_tokens_seen": 307886820, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11309814, + "step": 14274, + "time_per_iteration": 2.6395745277404785 + }, + { + "auxiliary_loss_clip": 0.01028082, + "auxiliary_loss_mlp": 0.01001402, + "balance_loss_clip": 1.00575721, + "balance_loss_mlp": 1.00046313, + "epoch": 0.8582594318352623, + "flos": 68118516384480.0, + "grad_norm": 0.7949139041776025, + "language_loss": 0.60868466, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.62897944, + "num_input_tokens_seen": 307944020, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.00937653, + "step": 14275, + "time_per_iteration": 3.2693066596984863 + }, + { + "auxiliary_loss_clip": 0.01110813, + "auxiliary_loss_mlp": 0.01024332, + "balance_loss_clip": 1.03760397, + "balance_loss_mlp": 1.01236951, + "epoch": 0.8583195550879302, + "flos": 29626266434400.0, + "grad_norm": 2.166623774294587, + "language_loss": 0.59083223, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.61218369, + "num_input_tokens_seen": 307961055, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11962891, + "step": 14276, + "time_per_iteration": 3.9234330654144287 + }, + { + "auxiliary_loss_clip": 0.01111565, + "auxiliary_loss_mlp": 0.01032334, + "balance_loss_clip": 1.03843009, + "balance_loss_mlp": 1.02130675, + "epoch": 0.8583796783405983, + "flos": 16403960587680.0, + "grad_norm": 5.574500518703953, + "language_loss": 0.76270849, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.7841475, + "num_input_tokens_seen": 307978690, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11035156, + "step": 14277, + "time_per_iteration": 2.6263632774353027 + }, + { + "auxiliary_loss_clip": 0.01111162, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.03852773, + "balance_loss_mlp": 1.01743495, + "epoch": 0.8584398015932662, + "flos": 19742837490720.0, + "grad_norm": 1.5737638716168483, + "language_loss": 0.84035367, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.86175644, + "num_input_tokens_seen": 307995870, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11688232, + "step": 14278, + "time_per_iteration": 2.5748684406280518 + }, + { + "auxiliary_loss_clip": 0.01112818, + "auxiliary_loss_mlp": 0.01027828, + "balance_loss_clip": 1.03812099, + "balance_loss_mlp": 1.01606822, + "epoch": 0.8584999248459342, + "flos": 21293414118720.0, + "grad_norm": 2.0831327641097968, + "language_loss": 0.74926776, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.77067423, + "num_input_tokens_seen": 308013645, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11767578, + "step": 14279, + "time_per_iteration": 2.6234495639801025 + }, + { + "auxiliary_loss_clip": 0.01110234, + "auxiliary_loss_mlp": 0.01030979, + "balance_loss_clip": 1.03843307, + "balance_loss_mlp": 1.02005398, + "epoch": 0.8585600480986021, + "flos": 28602946000800.0, + "grad_norm": 3.116024612610694, + "language_loss": 0.66101468, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.68242681, + "num_input_tokens_seen": 308032490, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10906982, + "step": 14280, + "time_per_iteration": 2.6386799812316895 + }, + { + "auxiliary_loss_clip": 0.0110856, + "auxiliary_loss_mlp": 0.01025717, + "balance_loss_clip": 1.03765225, + "balance_loss_mlp": 1.01532173, + "epoch": 0.8586201713512701, + "flos": 24328253288160.0, + "grad_norm": 3.7754889692329647, + "language_loss": 0.62548763, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.64683032, + "num_input_tokens_seen": 308052110, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10394287, + "step": 14281, + "time_per_iteration": 2.646273136138916 + }, + { + "auxiliary_loss_clip": 0.01109828, + "auxiliary_loss_mlp": 0.01028135, + "balance_loss_clip": 1.03714967, + "balance_loss_mlp": 1.01688766, + "epoch": 0.858680294603938, + "flos": 18674468192160.0, + "grad_norm": 1.7181289246135387, + "language_loss": 0.73110068, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.75248027, + "num_input_tokens_seen": 308070660, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11254883, + "step": 14282, + "time_per_iteration": 2.5923516750335693 + }, + { + "auxiliary_loss_clip": 0.01107278, + "auxiliary_loss_mlp": 0.01024127, + "balance_loss_clip": 1.03532517, + "balance_loss_mlp": 1.01377368, + "epoch": 0.858740417856606, + "flos": 27712422159840.0, + "grad_norm": 11.114623609342502, + "language_loss": 0.75699186, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.77830589, + "num_input_tokens_seen": 308089520, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10351562, + "step": 14283, + "time_per_iteration": 2.6382763385772705 + }, + { + "auxiliary_loss_clip": 0.01110677, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.03763998, + "balance_loss_mlp": 1.01694536, + "epoch": 0.8588005411092741, + "flos": 41554472450400.0, + "grad_norm": 2.410488069586683, + "language_loss": 0.60403597, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.62542421, + "num_input_tokens_seen": 308111545, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11199951, + "step": 14284, + "time_per_iteration": 2.700430393218994 + }, + { + "auxiliary_loss_clip": 0.01108084, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.03853357, + "balance_loss_mlp": 1.0171473, + "epoch": 0.858860664361942, + "flos": 35281023945120.0, + "grad_norm": 1.8566806880541005, + "language_loss": 0.76052248, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.78188586, + "num_input_tokens_seen": 308129690, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.11114502, + "step": 14285, + "time_per_iteration": 2.7113959789276123 + }, + { + "auxiliary_loss_clip": 0.01113802, + "auxiliary_loss_mlp": 0.01031936, + "balance_loss_clip": 1.04112196, + "balance_loss_mlp": 1.02043247, + "epoch": 0.85892078761461, + "flos": 24150772486080.0, + "grad_norm": 1.8716974864972284, + "language_loss": 0.74776495, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.76922238, + "num_input_tokens_seen": 308147410, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1149292, + "step": 14286, + "time_per_iteration": 2.5993528366088867 + }, + { + "auxiliary_loss_clip": 0.01028041, + "auxiliary_loss_mlp": 0.01000711, + "balance_loss_clip": 1.00575876, + "balance_loss_mlp": 0.99978602, + "epoch": 0.8589809108672779, + "flos": 81884447406720.0, + "grad_norm": 0.7785723399630139, + "language_loss": 0.49421227, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51449978, + "num_input_tokens_seen": 308204875, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.0092392, + "step": 14287, + "time_per_iteration": 3.221558094024658 + }, + { + "auxiliary_loss_clip": 0.01112579, + "auxiliary_loss_mlp": 0.01030954, + "balance_loss_clip": 1.03948092, + "balance_loss_mlp": 1.01988554, + "epoch": 0.8590410341199459, + "flos": 36260511032160.0, + "grad_norm": 1.944274921857699, + "language_loss": 0.79081309, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.81224841, + "num_input_tokens_seen": 308225690, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.1105957, + "step": 14288, + "time_per_iteration": 2.714142322540283 + }, + { + "auxiliary_loss_clip": 0.01110419, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.0369041, + "balance_loss_mlp": 1.01828432, + "epoch": 0.8591011573726138, + "flos": 29269805690880.0, + "grad_norm": 3.0280742021884834, + "language_loss": 0.80890805, + "learning_rate": 2.045818444528553e-07, + "loss": 0.83031404, + "num_input_tokens_seen": 308245255, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11895752, + "step": 14289, + "time_per_iteration": 2.6410419940948486 + }, + { + "auxiliary_loss_clip": 0.01113297, + "auxiliary_loss_mlp": 0.01027913, + "balance_loss_clip": 1.04050851, + "balance_loss_mlp": 1.01669586, + "epoch": 0.8591612806252819, + "flos": 17605410099840.0, + "grad_norm": 2.1480202083640476, + "language_loss": 0.65313721, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.67454922, + "num_input_tokens_seen": 308261755, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11224365, + "step": 14290, + "time_per_iteration": 2.619889259338379 + }, + { + "auxiliary_loss_clip": 0.01113045, + "auxiliary_loss_mlp": 0.01030479, + "balance_loss_clip": 1.03916812, + "balance_loss_mlp": 1.01855826, + "epoch": 0.8592214038779498, + "flos": 38531302257600.0, + "grad_norm": 2.0723059295925768, + "language_loss": 0.55019593, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.57163119, + "num_input_tokens_seen": 308285145, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11920166, + "step": 14291, + "time_per_iteration": 2.7085301876068115 + }, + { + "auxiliary_loss_clip": 0.01112698, + "auxiliary_loss_mlp": 0.01027445, + "balance_loss_clip": 1.03989458, + "balance_loss_mlp": 1.0164361, + "epoch": 0.8592815271306178, + "flos": 21301152919200.0, + "grad_norm": 3.570328741358612, + "language_loss": 0.71727377, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.73867518, + "num_input_tokens_seen": 308304130, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.10998535, + "step": 14292, + "time_per_iteration": 2.6513023376464844 + }, + { + "auxiliary_loss_clip": 0.01109625, + "auxiliary_loss_mlp": 0.01026771, + "balance_loss_clip": 1.03733015, + "balance_loss_mlp": 1.01571989, + "epoch": 0.8593416503832857, + "flos": 31007303647200.0, + "grad_norm": 1.6228860308265733, + "language_loss": 0.71182114, + "learning_rate": 2.038960195018542e-07, + "loss": 0.73318505, + "num_input_tokens_seen": 308324670, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11047363, + "step": 14293, + "time_per_iteration": 2.637404680252075 + }, + { + "auxiliary_loss_clip": 0.0110668, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.03649902, + "balance_loss_mlp": 1.02088511, + "epoch": 0.8594017736359537, + "flos": 25620124668480.0, + "grad_norm": 1.8025672986315984, + "language_loss": 0.687356, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.70874083, + "num_input_tokens_seen": 308344215, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10906982, + "step": 14294, + "time_per_iteration": 2.655179977416992 + }, + { + "auxiliary_loss_clip": 0.01105374, + "auxiliary_loss_mlp": 0.01026331, + "balance_loss_clip": 1.03509998, + "balance_loss_mlp": 1.01578069, + "epoch": 0.8594618968886216, + "flos": 26955626808960.0, + "grad_norm": 1.9329114394681752, + "language_loss": 0.78040528, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.80172235, + "num_input_tokens_seen": 308360520, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10540771, + "step": 14295, + "time_per_iteration": 2.636852502822876 + }, + { + "auxiliary_loss_clip": 0.01115126, + "auxiliary_loss_mlp": 0.01037729, + "balance_loss_clip": 1.03901553, + "balance_loss_mlp": 1.02527761, + "epoch": 0.8595220201412896, + "flos": 14222254160160.0, + "grad_norm": 3.380639460884487, + "language_loss": 0.69178581, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.71331429, + "num_input_tokens_seen": 308376865, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.12457275, + "step": 14296, + "time_per_iteration": 2.625880718231201 + }, + { + "auxiliary_loss_clip": 0.01108329, + "auxiliary_loss_mlp": 0.01026476, + "balance_loss_clip": 1.03642869, + "balance_loss_mlp": 1.01500189, + "epoch": 0.8595821433939577, + "flos": 30555153789120.0, + "grad_norm": 2.0224503712847226, + "language_loss": 0.79390264, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.81525069, + "num_input_tokens_seen": 308395870, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11480713, + "step": 14297, + "time_per_iteration": 2.625145435333252 + }, + { + "auxiliary_loss_clip": 0.0110634, + "auxiliary_loss_mlp": 0.01027969, + "balance_loss_clip": 1.03669298, + "balance_loss_mlp": 1.0176456, + "epoch": 0.8596422666466256, + "flos": 34790470056000.0, + "grad_norm": 1.681250107945687, + "language_loss": 0.68393558, + "learning_rate": 2.030402708016954e-07, + "loss": 0.70527864, + "num_input_tokens_seen": 308417250, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10327148, + "step": 14298, + "time_per_iteration": 4.220468282699585 + }, + { + "auxiliary_loss_clip": 0.01108498, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.03892112, + "balance_loss_mlp": 1.02205849, + "epoch": 0.8597023898992936, + "flos": 16580995699680.0, + "grad_norm": 1.9295099897372896, + "language_loss": 0.68516946, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.70658606, + "num_input_tokens_seen": 308434565, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.11096191, + "step": 14299, + "time_per_iteration": 2.677579879760742 + }, + { + "auxiliary_loss_clip": 0.01115269, + "auxiliary_loss_mlp": 0.01035078, + "balance_loss_clip": 1.04000366, + "balance_loss_mlp": 1.02345526, + "epoch": 0.8597625131519615, + "flos": 39414938160960.0, + "grad_norm": 2.513590582347706, + "language_loss": 0.71411777, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.73562127, + "num_input_tokens_seen": 308450040, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.1161499, + "step": 14300, + "time_per_iteration": 2.684544086456299 + }, + { + "auxiliary_loss_clip": 0.01108369, + "auxiliary_loss_mlp": 0.01029201, + "balance_loss_clip": 1.03668261, + "balance_loss_mlp": 1.01853812, + "epoch": 0.8598226364046295, + "flos": 35057521863360.0, + "grad_norm": 2.0183903799265304, + "language_loss": 0.68883932, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.71021497, + "num_input_tokens_seen": 308470545, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10656738, + "step": 14301, + "time_per_iteration": 2.7109217643737793 + }, + { + "auxiliary_loss_clip": 0.01110136, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.03727317, + "balance_loss_mlp": 1.01816964, + "epoch": 0.8598827596572974, + "flos": 26688129311520.0, + "grad_norm": 1.7016942436146203, + "language_loss": 0.73908824, + "learning_rate": 2.023568983386641e-07, + "loss": 0.76048374, + "num_input_tokens_seen": 308490020, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11242676, + "step": 14302, + "time_per_iteration": 2.6140925884246826 + }, + { + "auxiliary_loss_clip": 0.0110479, + "auxiliary_loss_mlp": 0.01028596, + "balance_loss_clip": 1.03549671, + "balance_loss_mlp": 1.01834989, + "epoch": 0.8599428829099655, + "flos": 29001295261440.0, + "grad_norm": 1.8095190081696622, + "language_loss": 0.83710575, + "learning_rate": 2.02186225623733e-07, + "loss": 0.85843962, + "num_input_tokens_seen": 308509065, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.10247803, + "step": 14303, + "time_per_iteration": 2.711120128631592 + }, + { + "auxiliary_loss_clip": 0.01109522, + "auxiliary_loss_mlp": 0.010312, + "balance_loss_clip": 1.03705454, + "balance_loss_mlp": 1.01978552, + "epoch": 0.8600030061626334, + "flos": 19782862212960.0, + "grad_norm": 2.1141927912060665, + "language_loss": 0.77216339, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.79357064, + "num_input_tokens_seen": 308524725, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11413574, + "step": 14304, + "time_per_iteration": 2.556785821914673 + }, + { + "auxiliary_loss_clip": 0.01111611, + "auxiliary_loss_mlp": 0.01036143, + "balance_loss_clip": 1.03887606, + "balance_loss_mlp": 1.0239836, + "epoch": 0.8600631294153014, + "flos": 19119405974400.0, + "grad_norm": 2.2988870016393337, + "language_loss": 0.53511566, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.55659318, + "num_input_tokens_seen": 308543525, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.12158203, + "step": 14305, + "time_per_iteration": 2.65220308303833 + }, + { + "auxiliary_loss_clip": 0.01110374, + "auxiliary_loss_mlp": 0.01027487, + "balance_loss_clip": 1.03850555, + "balance_loss_mlp": 1.01570344, + "epoch": 0.8601232526679693, + "flos": 21344905231200.0, + "grad_norm": 1.9090693839160493, + "language_loss": 0.8364073, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.85778588, + "num_input_tokens_seen": 308557995, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11785889, + "step": 14306, + "time_per_iteration": 3.9129481315612793 + }, + { + "auxiliary_loss_clip": 0.0110728, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.03700614, + "balance_loss_mlp": 1.01736045, + "epoch": 0.8601833759206373, + "flos": 32928197928480.0, + "grad_norm": 1.3374615900938658, + "language_loss": 0.71727455, + "learning_rate": 2.01504216561474e-07, + "loss": 0.7386269, + "num_input_tokens_seen": 308582750, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10595703, + "step": 14307, + "time_per_iteration": 2.702458381652832 + }, + { + "auxiliary_loss_clip": 0.01111696, + "auxiliary_loss_mlp": 0.01039531, + "balance_loss_clip": 1.03738892, + "balance_loss_mlp": 1.02718091, + "epoch": 0.8602434991733052, + "flos": 30785624843040.0, + "grad_norm": 1.6157932585340675, + "language_loss": 0.63527399, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.6567862, + "num_input_tokens_seen": 308603770, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12353516, + "step": 14308, + "time_per_iteration": 2.7021725177764893 + }, + { + "auxiliary_loss_clip": 0.0102806, + "auxiliary_loss_mlp": 0.0100254, + "balance_loss_clip": 1.00583267, + "balance_loss_mlp": 1.00159168, + "epoch": 0.8603036224259732, + "flos": 86653745736480.0, + "grad_norm": 0.6224430289569876, + "language_loss": 0.48419517, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.5045011, + "num_input_tokens_seen": 308667735, + "router_z_loss_clip": 0.22229004, + "router_z_loss_mlp": 0.00946808, + "step": 14309, + "time_per_iteration": 3.301514148712158 + }, + { + "auxiliary_loss_clip": 0.01110854, + "auxiliary_loss_mlp": 0.01034525, + "balance_loss_clip": 1.03787601, + "balance_loss_mlp": 1.02216935, + "epoch": 0.8603637456786413, + "flos": 24771043654560.0, + "grad_norm": 2.6970142249171127, + "language_loss": 0.671332, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.6927858, + "num_input_tokens_seen": 308686300, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12341309, + "step": 14310, + "time_per_iteration": 4.119737386703491 + }, + { + "auxiliary_loss_clip": 0.01110915, + "auxiliary_loss_mlp": 0.01029261, + "balance_loss_clip": 1.03740692, + "balance_loss_mlp": 1.01884842, + "epoch": 0.8604238689313092, + "flos": 26644174413120.0, + "grad_norm": 1.8677300607915759, + "language_loss": 0.77915978, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.80056149, + "num_input_tokens_seen": 308705825, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.10406494, + "step": 14311, + "time_per_iteration": 2.670128107070923 + }, + { + "auxiliary_loss_clip": 0.0110885, + "auxiliary_loss_mlp": 0.01026543, + "balance_loss_clip": 1.03795719, + "balance_loss_mlp": 1.01586151, + "epoch": 0.8604839921839772, + "flos": 21968579851200.0, + "grad_norm": 2.4893555809449754, + "language_loss": 0.71504915, + "learning_rate": 2.006532397626639e-07, + "loss": 0.73640305, + "num_input_tokens_seen": 308723340, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10681152, + "step": 14312, + "time_per_iteration": 2.6372101306915283 + }, + { + "auxiliary_loss_clip": 0.01109081, + "auxiliary_loss_mlp": 0.01030755, + "balance_loss_clip": 1.03671026, + "balance_loss_mlp": 1.01982915, + "epoch": 0.8605441154366451, + "flos": 19830666252960.0, + "grad_norm": 2.0551305979099275, + "language_loss": 0.78090584, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.80230421, + "num_input_tokens_seen": 308741280, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10919189, + "step": 14313, + "time_per_iteration": 2.6334786415100098 + }, + { + "auxiliary_loss_clip": 0.01110421, + "auxiliary_loss_mlp": 0.01029806, + "balance_loss_clip": 1.03996706, + "balance_loss_mlp": 1.01786101, + "epoch": 0.8606042386893131, + "flos": 39373495333920.0, + "grad_norm": 1.5650415999584266, + "language_loss": 0.72910464, + "learning_rate": 2.003133266178474e-07, + "loss": 0.75050688, + "num_input_tokens_seen": 308762875, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.1194458, + "step": 14314, + "time_per_iteration": 2.684558153152466 + }, + { + "auxiliary_loss_clip": 0.01108054, + "auxiliary_loss_mlp": 0.01029312, + "balance_loss_clip": 1.0362612, + "balance_loss_mlp": 1.01817775, + "epoch": 0.860664361941981, + "flos": 24684389893440.0, + "grad_norm": 1.8835163844351366, + "language_loss": 0.68962157, + "learning_rate": 2.001434724086657e-07, + "loss": 0.7109952, + "num_input_tokens_seen": 308780315, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11138916, + "step": 14315, + "time_per_iteration": 2.640777349472046 + }, + { + "auxiliary_loss_clip": 0.01108838, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.03813946, + "balance_loss_mlp": 1.02306688, + "epoch": 0.8607244851946491, + "flos": 30739238907840.0, + "grad_norm": 1.9835261153369739, + "language_loss": 0.7227686, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.74419761, + "num_input_tokens_seen": 308799435, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10986328, + "step": 14316, + "time_per_iteration": 3.9970240592956543 + }, + { + "auxiliary_loss_clip": 0.01114558, + "auxiliary_loss_mlp": 0.01024627, + "balance_loss_clip": 1.04133976, + "balance_loss_mlp": 1.01377904, + "epoch": 0.860784608447317, + "flos": 24992560389600.0, + "grad_norm": 2.2407719609149477, + "language_loss": 0.82638824, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.84778005, + "num_input_tokens_seen": 308817730, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10858154, + "step": 14317, + "time_per_iteration": 2.5923917293548584 + }, + { + "auxiliary_loss_clip": 0.01108193, + "auxiliary_loss_mlp": 0.01029216, + "balance_loss_clip": 1.03795338, + "balance_loss_mlp": 1.01840949, + "epoch": 0.860844731699985, + "flos": 61593244218720.0, + "grad_norm": 1.615912891827999, + "language_loss": 0.66826975, + "learning_rate": 1.996343193113108e-07, + "loss": 0.68964386, + "num_input_tokens_seen": 308841735, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10803223, + "step": 14318, + "time_per_iteration": 2.9504752159118652 + }, + { + "auxiliary_loss_clip": 0.01105694, + "auxiliary_loss_mlp": 0.01027676, + "balance_loss_clip": 1.03600025, + "balance_loss_mlp": 1.01723933, + "epoch": 0.8609048549526529, + "flos": 50240625161760.0, + "grad_norm": 1.56992627431222, + "language_loss": 0.71250522, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.73383892, + "num_input_tokens_seen": 308865050, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10437012, + "step": 14319, + "time_per_iteration": 2.758929967880249 + }, + { + "auxiliary_loss_clip": 0.01115142, + "auxiliary_loss_mlp": 0.0103266, + "balance_loss_clip": 1.04133785, + "balance_loss_mlp": 1.02169883, + "epoch": 0.8609649782053209, + "flos": 29225567171520.0, + "grad_norm": 2.5104153777932994, + "language_loss": 0.67248553, + "learning_rate": 1.992952252525839e-07, + "loss": 0.69396353, + "num_input_tokens_seen": 308885375, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.10968018, + "step": 14320, + "time_per_iteration": 2.7909765243530273 + }, + { + "auxiliary_loss_clip": 0.01113049, + "auxiliary_loss_mlp": 0.01036214, + "balance_loss_clip": 1.03731155, + "balance_loss_mlp": 1.02429295, + "epoch": 0.8610251014579888, + "flos": 28202165703360.0, + "grad_norm": 2.2462596791824914, + "language_loss": 0.80240017, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.82389277, + "num_input_tokens_seen": 308904700, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.11932373, + "step": 14321, + "time_per_iteration": 2.6604838371276855 + }, + { + "auxiliary_loss_clip": 0.01106898, + "auxiliary_loss_mlp": 0.01031091, + "balance_loss_clip": 1.03796411, + "balance_loss_mlp": 1.01946259, + "epoch": 0.8610852247106568, + "flos": 23704578668160.0, + "grad_norm": 2.399995675359857, + "language_loss": 0.71282238, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.73420227, + "num_input_tokens_seen": 308922985, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.11627197, + "step": 14322, + "time_per_iteration": 2.7069530487060547 + }, + { + "auxiliary_loss_clip": 0.01114663, + "auxiliary_loss_mlp": 0.01034672, + "balance_loss_clip": 1.03899455, + "balance_loss_mlp": 1.02223814, + "epoch": 0.8611453479633249, + "flos": 23565056207040.0, + "grad_norm": 2.1032293307682304, + "language_loss": 0.55881786, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.5803113, + "num_input_tokens_seen": 308940765, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12438965, + "step": 14323, + "time_per_iteration": 2.7120120525360107 + }, + { + "auxiliary_loss_clip": 0.01107963, + "auxiliary_loss_mlp": 0.01025931, + "balance_loss_clip": 1.03682184, + "balance_loss_mlp": 1.01508284, + "epoch": 0.8612054712159928, + "flos": 28374784431840.0, + "grad_norm": 1.8709348812870497, + "language_loss": 0.7559315, + "learning_rate": 1.986178565813801e-07, + "loss": 0.77727044, + "num_input_tokens_seen": 308960110, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10845947, + "step": 14324, + "time_per_iteration": 2.70859956741333 + }, + { + "auxiliary_loss_clip": 0.01111672, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.03850543, + "balance_loss_mlp": 1.01940191, + "epoch": 0.8612655944686608, + "flos": 19558104095520.0, + "grad_norm": 2.284992278293363, + "language_loss": 0.66594255, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.68736982, + "num_input_tokens_seen": 308976665, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11663818, + "step": 14325, + "time_per_iteration": 2.677050828933716 + }, + { + "auxiliary_loss_clip": 0.01112505, + "auxiliary_loss_mlp": 0.01031985, + "balance_loss_clip": 1.03889227, + "balance_loss_mlp": 1.02032614, + "epoch": 0.8613257177213287, + "flos": 27445735008000.0, + "grad_norm": 2.293818899306774, + "language_loss": 0.64765418, + "learning_rate": 1.982795820716472e-07, + "loss": 0.66909909, + "num_input_tokens_seen": 308997015, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11663818, + "step": 14326, + "time_per_iteration": 2.651315689086914 + }, + { + "auxiliary_loss_clip": 0.01112065, + "auxiliary_loss_mlp": 0.01029317, + "balance_loss_clip": 1.03894448, + "balance_loss_mlp": 1.01776528, + "epoch": 0.8613858409739967, + "flos": 21033655421760.0, + "grad_norm": 8.24875916151248, + "language_loss": 0.84285522, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.86426902, + "num_input_tokens_seen": 309015250, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11541748, + "step": 14327, + "time_per_iteration": 2.6450750827789307 + }, + { + "auxiliary_loss_clip": 0.01108916, + "auxiliary_loss_mlp": 0.01032868, + "balance_loss_clip": 1.03679132, + "balance_loss_mlp": 1.02166235, + "epoch": 0.8614459642266646, + "flos": 27846474788160.0, + "grad_norm": 2.729751620430989, + "language_loss": 0.75171459, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.77313238, + "num_input_tokens_seen": 309034140, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11212158, + "step": 14328, + "time_per_iteration": 2.6085009574890137 + }, + { + "auxiliary_loss_clip": 0.01108956, + "auxiliary_loss_mlp": 0.01026745, + "balance_loss_clip": 1.03792703, + "balance_loss_mlp": 1.01566434, + "epoch": 0.8615060874793327, + "flos": 32341144579200.0, + "grad_norm": 1.6949311802240223, + "language_loss": 0.79866433, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.82002139, + "num_input_tokens_seen": 309055075, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11083984, + "step": 14329, + "time_per_iteration": 2.7299389839172363 + }, + { + "auxiliary_loss_clip": 0.01109337, + "auxiliary_loss_mlp": 0.01025941, + "balance_loss_clip": 1.03716516, + "balance_loss_mlp": 1.01480043, + "epoch": 0.8615662107320006, + "flos": 29359538765280.0, + "grad_norm": 2.386568692467605, + "language_loss": 0.76899862, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.79035139, + "num_input_tokens_seen": 309074650, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11151123, + "step": 14330, + "time_per_iteration": 2.6180331707000732 + }, + { + "auxiliary_loss_clip": 0.0111147, + "auxiliary_loss_mlp": 0.01029304, + "balance_loss_clip": 1.03910208, + "balance_loss_mlp": 1.0182296, + "epoch": 0.8616263339846686, + "flos": 29486987076960.0, + "grad_norm": 4.874210471433458, + "language_loss": 0.64591789, + "learning_rate": 1.974350915342702e-07, + "loss": 0.66732562, + "num_input_tokens_seen": 309094385, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11090088, + "step": 14331, + "time_per_iteration": 2.711178779602051 + }, + { + "auxiliary_loss_clip": 0.01109648, + "auxiliary_loss_mlp": 0.01031675, + "balance_loss_clip": 1.03896117, + "balance_loss_mlp": 1.02124453, + "epoch": 0.8616864572373365, + "flos": 26508298507200.0, + "grad_norm": 1.6067737709680392, + "language_loss": 0.76213956, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.78355277, + "num_input_tokens_seen": 309111815, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10430908, + "step": 14332, + "time_per_iteration": 2.7413716316223145 + }, + { + "auxiliary_loss_clip": 0.01113092, + "auxiliary_loss_mlp": 0.01028046, + "balance_loss_clip": 1.03779888, + "balance_loss_mlp": 1.0157733, + "epoch": 0.8617465804900045, + "flos": 29000565950400.0, + "grad_norm": 2.4740236480759363, + "language_loss": 0.67107058, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.69248199, + "num_input_tokens_seen": 309131385, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.1227417, + "step": 14333, + "time_per_iteration": 2.891570806503296 + }, + { + "auxiliary_loss_clip": 0.01116048, + "auxiliary_loss_mlp": 0.01032277, + "balance_loss_clip": 1.03900385, + "balance_loss_mlp": 1.02001643, + "epoch": 0.8618067037426724, + "flos": 46007942518080.0, + "grad_norm": 2.799937169128825, + "language_loss": 0.6202904, + "learning_rate": 1.969292174019157e-07, + "loss": 0.64177358, + "num_input_tokens_seen": 309155020, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12249756, + "step": 14334, + "time_per_iteration": 2.892805337905884 + }, + { + "auxiliary_loss_clip": 0.01115552, + "auxiliary_loss_mlp": 0.01040322, + "balance_loss_clip": 1.04107523, + "balance_loss_mlp": 1.02875233, + "epoch": 0.8618668269953405, + "flos": 26198993527200.0, + "grad_norm": 2.0172300491491715, + "language_loss": 0.6921984, + "learning_rate": 1.967607294278577e-07, + "loss": 0.71375716, + "num_input_tokens_seen": 309172865, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11566162, + "step": 14335, + "time_per_iteration": 2.873002052307129 + }, + { + "auxiliary_loss_clip": 0.01111319, + "auxiliary_loss_mlp": 0.01031233, + "balance_loss_clip": 1.03818512, + "balance_loss_mlp": 1.02004552, + "epoch": 0.8619269502480085, + "flos": 27128853296640.0, + "grad_norm": 1.5275223076911917, + "language_loss": 0.82761121, + "learning_rate": 1.965923098328135e-07, + "loss": 0.84903681, + "num_input_tokens_seen": 309193575, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11199951, + "step": 14336, + "time_per_iteration": 2.841644287109375 + }, + { + "auxiliary_loss_clip": 0.01113168, + "auxiliary_loss_mlp": 0.01029072, + "balance_loss_clip": 1.03787541, + "balance_loss_mlp": 1.01765811, + "epoch": 0.8619870735006764, + "flos": 27711733366080.0, + "grad_norm": 1.7991810913708188, + "language_loss": 0.67642581, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.6978482, + "num_input_tokens_seen": 309212680, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11419678, + "step": 14337, + "time_per_iteration": 2.759740114212036 + }, + { + "auxiliary_loss_clip": 0.01109195, + "auxiliary_loss_mlp": 0.01027041, + "balance_loss_clip": 1.03727293, + "balance_loss_mlp": 1.01603198, + "epoch": 0.8620471967533444, + "flos": 45783954228960.0, + "grad_norm": 1.8896447961061846, + "language_loss": 0.6710043, + "learning_rate": 1.962556758053089e-07, + "loss": 0.69236672, + "num_input_tokens_seen": 309234485, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11004639, + "step": 14338, + "time_per_iteration": 4.369790315628052 + }, + { + "auxiliary_loss_clip": 0.01112243, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.03919399, + "balance_loss_mlp": 1.01971912, + "epoch": 0.8621073200060123, + "flos": 24016517271360.0, + "grad_norm": 1.9502325174184765, + "language_loss": 0.6188789, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.64030516, + "num_input_tokens_seen": 309253630, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.10662842, + "step": 14339, + "time_per_iteration": 2.794680595397949 + }, + { + "auxiliary_loss_clip": 0.0110832, + "auxiliary_loss_mlp": 0.01031638, + "balance_loss_clip": 1.03627479, + "balance_loss_mlp": 1.02025962, + "epoch": 0.8621674432586803, + "flos": 17737558416000.0, + "grad_norm": 2.327414965179672, + "language_loss": 0.63286757, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.65426719, + "num_input_tokens_seen": 309270950, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11383057, + "step": 14340, + "time_per_iteration": 2.5838863849639893 + }, + { + "auxiliary_loss_clip": 0.01106073, + "auxiliary_loss_mlp": 0.01025052, + "balance_loss_clip": 1.03861308, + "balance_loss_mlp": 1.01478827, + "epoch": 0.8622275665113482, + "flos": 25307173133280.0, + "grad_norm": 1.6519103034989857, + "language_loss": 0.80176061, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.8230719, + "num_input_tokens_seen": 309288780, + "router_z_loss_clip": 0.67382812, + "router_z_loss_mlp": 0.10253906, + "step": 14341, + "time_per_iteration": 2.6354219913482666 + }, + { + "auxiliary_loss_clip": 0.01108365, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.03740478, + "balance_loss_mlp": 1.02066946, + "epoch": 0.8622876897640163, + "flos": 30158870909760.0, + "grad_norm": 2.457590226950822, + "language_loss": 0.74655187, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.7679491, + "num_input_tokens_seen": 309310875, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10687256, + "step": 14342, + "time_per_iteration": 2.6505026817321777 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01029343, + "balance_loss_clip": 1.03844547, + "balance_loss_mlp": 1.0182147, + "epoch": 0.8623478130166842, + "flos": 21301233953760.0, + "grad_norm": 1.8904538319253186, + "language_loss": 0.68797219, + "learning_rate": 1.95415287816028e-07, + "loss": 0.70938039, + "num_input_tokens_seen": 309329900, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11120605, + "step": 14343, + "time_per_iteration": 2.684063673019409 + }, + { + "auxiliary_loss_clip": 0.01110975, + "auxiliary_loss_mlp": 0.01041622, + "balance_loss_clip": 1.03744888, + "balance_loss_mlp": 1.02929592, + "epoch": 0.8624079362693522, + "flos": 22097081612160.0, + "grad_norm": 1.8139259141302269, + "language_loss": 0.67694306, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.69846904, + "num_input_tokens_seen": 309347870, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12335205, + "step": 14344, + "time_per_iteration": 2.5881001949310303 + }, + { + "auxiliary_loss_clip": 0.01111449, + "auxiliary_loss_mlp": 0.01035451, + "balance_loss_clip": 1.03765047, + "balance_loss_mlp": 1.02403677, + "epoch": 0.8624680595220201, + "flos": 37419869440800.0, + "grad_norm": 1.4364580644017606, + "language_loss": 0.81353176, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.83500069, + "num_input_tokens_seen": 309371695, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11407471, + "step": 14345, + "time_per_iteration": 4.150452136993408 + }, + { + "auxiliary_loss_clip": 0.01113579, + "auxiliary_loss_mlp": 0.01030203, + "balance_loss_clip": 1.03917313, + "balance_loss_mlp": 1.01866317, + "epoch": 0.8625281827746881, + "flos": 46367239471200.0, + "grad_norm": 3.0132218973564213, + "language_loss": 0.50595617, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.52739394, + "num_input_tokens_seen": 309394645, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11529541, + "step": 14346, + "time_per_iteration": 2.7904326915740967 + }, + { + "auxiliary_loss_clip": 0.01109145, + "auxiliary_loss_mlp": 0.01032319, + "balance_loss_clip": 1.03692174, + "balance_loss_mlp": 1.02086854, + "epoch": 0.862588306027356, + "flos": 32030421494400.0, + "grad_norm": 1.7782125148672894, + "language_loss": 0.75008827, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.77150285, + "num_input_tokens_seen": 309413170, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11444092, + "step": 14347, + "time_per_iteration": 2.679741859436035 + }, + { + "auxiliary_loss_clip": 0.01111373, + "auxiliary_loss_mlp": 0.01027922, + "balance_loss_clip": 1.03930509, + "balance_loss_mlp": 1.01614976, + "epoch": 0.862648429280024, + "flos": 31578352670880.0, + "grad_norm": 2.1661752524666307, + "language_loss": 0.80827975, + "learning_rate": 1.945766105774449e-07, + "loss": 0.82967269, + "num_input_tokens_seen": 309431315, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11785889, + "step": 14348, + "time_per_iteration": 2.728710889816284 + }, + { + "auxiliary_loss_clip": 0.01105293, + "auxiliary_loss_mlp": 0.01028877, + "balance_loss_clip": 1.03589356, + "balance_loss_mlp": 1.01818419, + "epoch": 0.862708552532692, + "flos": 46144912390560.0, + "grad_norm": 1.7880036991147465, + "language_loss": 0.66182727, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.68316901, + "num_input_tokens_seen": 309453020, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.10699463, + "step": 14349, + "time_per_iteration": 4.247889518737793 + }, + { + "auxiliary_loss_clip": 0.01111174, + "auxiliary_loss_mlp": 0.01037022, + "balance_loss_clip": 1.03913188, + "balance_loss_mlp": 1.02595353, + "epoch": 0.86276867578536, + "flos": 23296383708480.0, + "grad_norm": 2.2080479683491947, + "language_loss": 0.69372261, + "learning_rate": 1.942416188703573e-07, + "loss": 0.7152046, + "num_input_tokens_seen": 309469780, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11065674, + "step": 14350, + "time_per_iteration": 2.6729657649993896 + }, + { + "auxiliary_loss_clip": 0.0110951, + "auxiliary_loss_mlp": 0.01031938, + "balance_loss_clip": 1.03786194, + "balance_loss_mlp": 1.02071476, + "epoch": 0.862828799038028, + "flos": 27044954710560.0, + "grad_norm": 1.7854016601753864, + "language_loss": 0.77088249, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.79229701, + "num_input_tokens_seen": 309489610, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11206055, + "step": 14351, + "time_per_iteration": 2.6615424156188965 + }, + { + "auxiliary_loss_clip": 0.01108745, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.03698957, + "balance_loss_mlp": 1.01808548, + "epoch": 0.8628889222906959, + "flos": 28242757667520.0, + "grad_norm": 4.529392466444507, + "language_loss": 0.85012746, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.87150276, + "num_input_tokens_seen": 309508295, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10693359, + "step": 14352, + "time_per_iteration": 2.605790376663208 + }, + { + "auxiliary_loss_clip": 0.01028456, + "auxiliary_loss_mlp": 0.01002076, + "balance_loss_clip": 1.00615072, + "balance_loss_mlp": 1.00108624, + "epoch": 0.8629490455433639, + "flos": 70548075779040.0, + "grad_norm": 0.7884152954077462, + "language_loss": 0.61851037, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.63881564, + "num_input_tokens_seen": 309567960, + "router_z_loss_clip": 0.22277832, + "router_z_loss_mlp": 0.0098877, + "step": 14353, + "time_per_iteration": 3.283259391784668 + }, + { + "auxiliary_loss_clip": 0.01110295, + "auxiliary_loss_mlp": 0.01027811, + "balance_loss_clip": 1.0398581, + "balance_loss_mlp": 1.01770198, + "epoch": 0.8630091687960318, + "flos": 19425307502880.0, + "grad_norm": 1.6543881375927543, + "language_loss": 0.81764513, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.83902621, + "num_input_tokens_seen": 309586050, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10107422, + "step": 14354, + "time_per_iteration": 2.5941073894500732 + }, + { + "auxiliary_loss_clip": 0.01109487, + "auxiliary_loss_mlp": 0.01026463, + "balance_loss_clip": 1.03742313, + "balance_loss_mlp": 1.01489973, + "epoch": 0.8630692920486999, + "flos": 21917250807840.0, + "grad_norm": 1.9197215697577006, + "language_loss": 0.85758972, + "learning_rate": 1.934053380181031e-07, + "loss": 0.87894928, + "num_input_tokens_seen": 309602910, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11560059, + "step": 14355, + "time_per_iteration": 3.890113115310669 + }, + { + "auxiliary_loss_clip": 0.01109368, + "auxiliary_loss_mlp": 0.01029617, + "balance_loss_clip": 1.0361408, + "balance_loss_mlp": 1.01792812, + "epoch": 0.8631294153013678, + "flos": 27399997349280.0, + "grad_norm": 2.5726516281418554, + "language_loss": 0.58438098, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.60577083, + "num_input_tokens_seen": 309621175, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11688232, + "step": 14356, + "time_per_iteration": 2.661043882369995 + }, + { + "auxiliary_loss_clip": 0.0111172, + "auxiliary_loss_mlp": 0.0103318, + "balance_loss_clip": 1.03723121, + "balance_loss_mlp": 1.02120543, + "epoch": 0.8631895385540358, + "flos": 20544236016480.0, + "grad_norm": 1.7987348692787806, + "language_loss": 0.76952291, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.79097199, + "num_input_tokens_seen": 309639395, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11975098, + "step": 14357, + "time_per_iteration": 2.593330144882202 + }, + { + "auxiliary_loss_clip": 0.01111055, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.03823328, + "balance_loss_mlp": 1.01924539, + "epoch": 0.8632496618067037, + "flos": 22814257413600.0, + "grad_norm": 2.6362754411477285, + "language_loss": 0.7771917, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.7986058, + "num_input_tokens_seen": 309657265, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11108398, + "step": 14358, + "time_per_iteration": 2.6270110607147217 + }, + { + "auxiliary_loss_clip": 0.01108974, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.03631878, + "balance_loss_mlp": 1.0167532, + "epoch": 0.8633097850593717, + "flos": 29626671607200.0, + "grad_norm": 1.3477782522175024, + "language_loss": 0.74903476, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.77039981, + "num_input_tokens_seen": 309678610, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10778809, + "step": 14359, + "time_per_iteration": 2.639599323272705 + }, + { + "auxiliary_loss_clip": 0.01105253, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.03614879, + "balance_loss_mlp": 1.02016652, + "epoch": 0.8633699083120396, + "flos": 25842168128160.0, + "grad_norm": 1.8578380559676233, + "language_loss": 0.7043938, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.72575384, + "num_input_tokens_seen": 309697710, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.105896, + "step": 14360, + "time_per_iteration": 2.647529125213623 + }, + { + "auxiliary_loss_clip": 0.01115181, + "auxiliary_loss_mlp": 0.0103246, + "balance_loss_clip": 1.0405283, + "balance_loss_mlp": 1.02016938, + "epoch": 0.8634300315647077, + "flos": 23482535208480.0, + "grad_norm": 1.8832286516965682, + "language_loss": 0.76022446, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.78170085, + "num_input_tokens_seen": 309715985, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12298584, + "step": 14361, + "time_per_iteration": 2.589226245880127 + }, + { + "auxiliary_loss_clip": 0.01028715, + "auxiliary_loss_mlp": 0.01002867, + "balance_loss_clip": 1.00636947, + "balance_loss_mlp": 1.00191295, + "epoch": 0.8634901548173756, + "flos": 80772690451680.0, + "grad_norm": 0.9778883324093942, + "language_loss": 0.5881964, + "learning_rate": 1.922374222645329e-07, + "loss": 0.60851216, + "num_input_tokens_seen": 309779930, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00953674, + "step": 14362, + "time_per_iteration": 3.3342292308807373 + }, + { + "auxiliary_loss_clip": 0.01113581, + "auxiliary_loss_mlp": 0.01035192, + "balance_loss_clip": 1.03823817, + "balance_loss_mlp": 1.0231998, + "epoch": 0.8635502780700436, + "flos": 30248644501440.0, + "grad_norm": 1.5829184566859233, + "language_loss": 0.80427247, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.82576019, + "num_input_tokens_seen": 309800580, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11981201, + "step": 14363, + "time_per_iteration": 2.703155279159546 + }, + { + "auxiliary_loss_clip": 0.011115, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.03729236, + "balance_loss_mlp": 1.02199852, + "epoch": 0.8636104013227116, + "flos": 30737375112960.0, + "grad_norm": 3.8530295077542784, + "language_loss": 0.7322371, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.75369412, + "num_input_tokens_seen": 309821725, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12194824, + "step": 14364, + "time_per_iteration": 2.784391403198242 + }, + { + "auxiliary_loss_clip": 0.01112203, + "auxiliary_loss_mlp": 0.01027581, + "balance_loss_clip": 1.03721189, + "balance_loss_mlp": 1.01684618, + "epoch": 0.8636705245753795, + "flos": 29136765994560.0, + "grad_norm": 1.622717918069238, + "language_loss": 0.71818244, + "learning_rate": 1.917379150731755e-07, + "loss": 0.73958027, + "num_input_tokens_seen": 309841565, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.1072998, + "step": 14365, + "time_per_iteration": 2.8151135444641113 + }, + { + "auxiliary_loss_clip": 0.01113824, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.03948188, + "balance_loss_mlp": 1.01990557, + "epoch": 0.8637306478280475, + "flos": 28199775183840.0, + "grad_norm": 2.642124005393522, + "language_loss": 0.70699835, + "learning_rate": 1.915715498065993e-07, + "loss": 0.72845638, + "num_input_tokens_seen": 309858635, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12078857, + "step": 14366, + "time_per_iteration": 2.644857168197632 + }, + { + "auxiliary_loss_clip": 0.01108018, + "auxiliary_loss_mlp": 0.01027283, + "balance_loss_clip": 1.03800392, + "balance_loss_mlp": 1.01666117, + "epoch": 0.8637907710807154, + "flos": 26730949726080.0, + "grad_norm": 1.9373623631993784, + "language_loss": 0.81623995, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.83759296, + "num_input_tokens_seen": 309877885, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10620117, + "step": 14367, + "time_per_iteration": 2.692033052444458 + }, + { + "auxiliary_loss_clip": 0.0111219, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.03953254, + "balance_loss_mlp": 1.01848078, + "epoch": 0.8638508943333835, + "flos": 28773781968960.0, + "grad_norm": 2.1606195002914164, + "language_loss": 0.61397028, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.63539809, + "num_input_tokens_seen": 309893140, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12121582, + "step": 14368, + "time_per_iteration": 2.6697452068328857 + }, + { + "auxiliary_loss_clip": 0.01111523, + "auxiliary_loss_mlp": 0.01031082, + "balance_loss_clip": 1.03999019, + "balance_loss_mlp": 1.02010274, + "epoch": 0.8639110175860514, + "flos": 31452119877600.0, + "grad_norm": 2.551134291456106, + "language_loss": 0.75991178, + "learning_rate": 1.91072865486821e-07, + "loss": 0.78133786, + "num_input_tokens_seen": 309914175, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10980225, + "step": 14369, + "time_per_iteration": 2.704763412475586 + }, + { + "auxiliary_loss_clip": 0.01111543, + "auxiliary_loss_mlp": 0.01034956, + "balance_loss_clip": 1.03727233, + "balance_loss_mlp": 1.02335143, + "epoch": 0.8639711408387194, + "flos": 28515481894080.0, + "grad_norm": 1.8513023756895208, + "language_loss": 0.64044565, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.66191065, + "num_input_tokens_seen": 309932395, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.1161499, + "step": 14370, + "time_per_iteration": 2.63724684715271 + }, + { + "auxiliary_loss_clip": 0.01111267, + "auxiliary_loss_mlp": 0.01030747, + "balance_loss_clip": 1.03952479, + "balance_loss_mlp": 1.0195775, + "epoch": 0.8640312640913873, + "flos": 26999784293760.0, + "grad_norm": 1.7544301203419497, + "language_loss": 0.66269255, + "learning_rate": 1.907407522366209e-07, + "loss": 0.68411267, + "num_input_tokens_seen": 309951720, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11169434, + "step": 14371, + "time_per_iteration": 2.665192127227783 + }, + { + "auxiliary_loss_clip": 0.01028904, + "auxiliary_loss_mlp": 0.01002183, + "balance_loss_clip": 1.00652862, + "balance_loss_mlp": 1.00118041, + "epoch": 0.8640913873440553, + "flos": 70250352390720.0, + "grad_norm": 0.8700676882752844, + "language_loss": 0.56874436, + "learning_rate": 1.905747985193107e-07, + "loss": 0.58905524, + "num_input_tokens_seen": 310006120, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.01002502, + "step": 14372, + "time_per_iteration": 3.1019558906555176 + }, + { + "auxiliary_loss_clip": 0.01110583, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.04034078, + "balance_loss_mlp": 1.01608455, + "epoch": 0.8641515105967232, + "flos": 29269805690880.0, + "grad_norm": 1.9073613008244557, + "language_loss": 0.79280961, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.81419599, + "num_input_tokens_seen": 310026740, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11962891, + "step": 14373, + "time_per_iteration": 2.652317762374878 + }, + { + "auxiliary_loss_clip": 0.01109708, + "auxiliary_loss_mlp": 0.0102734, + "balance_loss_clip": 1.03719068, + "balance_loss_mlp": 1.01593184, + "epoch": 0.8642116338493913, + "flos": 23260248645120.0, + "grad_norm": 1.73684643599906, + "language_loss": 0.63508666, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.65645707, + "num_input_tokens_seen": 310044135, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11413574, + "step": 14374, + "time_per_iteration": 2.590214967727661 + }, + { + "auxiliary_loss_clip": 0.01110286, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.0391469, + "balance_loss_mlp": 1.02131438, + "epoch": 0.8642717571020592, + "flos": 22274076206880.0, + "grad_norm": 3.2132119999516053, + "language_loss": 0.77349651, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.79492342, + "num_input_tokens_seen": 310061560, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11108398, + "step": 14375, + "time_per_iteration": 2.606454610824585 + }, + { + "auxiliary_loss_clip": 0.01110977, + "auxiliary_loss_mlp": 0.0103427, + "balance_loss_clip": 1.03824294, + "balance_loss_mlp": 1.02261126, + "epoch": 0.8643318803547272, + "flos": 70363700688960.0, + "grad_norm": 1.8263913119713415, + "language_loss": 0.612149, + "learning_rate": 1.899116698488117e-07, + "loss": 0.63360143, + "num_input_tokens_seen": 310087310, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11663818, + "step": 14376, + "time_per_iteration": 2.964134931564331 + }, + { + "auxiliary_loss_clip": 0.01107306, + "auxiliary_loss_mlp": 0.01030818, + "balance_loss_clip": 1.03646135, + "balance_loss_mlp": 1.02008295, + "epoch": 0.8643920036073952, + "flos": 23927513508000.0, + "grad_norm": 1.5893825601560554, + "language_loss": 0.66626501, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.68764627, + "num_input_tokens_seen": 310106260, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10736084, + "step": 14377, + "time_per_iteration": 2.6120307445526123 + }, + { + "auxiliary_loss_clip": 0.01111243, + "auxiliary_loss_mlp": 0.01031468, + "balance_loss_clip": 1.03694761, + "balance_loss_mlp": 1.02020323, + "epoch": 0.8644521268600631, + "flos": 25441630934400.0, + "grad_norm": 1.8461425759979357, + "language_loss": 0.70343179, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.72485888, + "num_input_tokens_seen": 310125305, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11260986, + "step": 14378, + "time_per_iteration": 4.185255289077759 + }, + { + "auxiliary_loss_clip": 0.01028424, + "auxiliary_loss_mlp": 0.01002161, + "balance_loss_clip": 1.00612187, + "balance_loss_mlp": 1.00115097, + "epoch": 0.8645122501127311, + "flos": 81410789223360.0, + "grad_norm": 0.8196968938508773, + "language_loss": 0.60226661, + "learning_rate": 1.894150440305995e-07, + "loss": 0.62257242, + "num_input_tokens_seen": 310189270, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.01010132, + "step": 14379, + "time_per_iteration": 3.2780416011810303 + }, + { + "auxiliary_loss_clip": 0.01107685, + "auxiliary_loss_mlp": 0.01030417, + "balance_loss_clip": 1.03676772, + "balance_loss_mlp": 1.01943183, + "epoch": 0.864572373365399, + "flos": 26466207403680.0, + "grad_norm": 1.7990684992503576, + "language_loss": 0.74606317, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.76744419, + "num_input_tokens_seen": 310208395, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10992432, + "step": 14380, + "time_per_iteration": 2.6539106369018555 + }, + { + "auxiliary_loss_clip": 0.01111727, + "auxiliary_loss_mlp": 0.01032432, + "balance_loss_clip": 1.03758454, + "balance_loss_mlp": 1.02119613, + "epoch": 0.8646324966180671, + "flos": 24729074102880.0, + "grad_norm": 2.3344831177896004, + "language_loss": 0.75462896, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.77607054, + "num_input_tokens_seen": 310227415, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11236572, + "step": 14381, + "time_per_iteration": 2.7398571968078613 + }, + { + "auxiliary_loss_clip": 0.01108839, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.03799057, + "balance_loss_mlp": 1.02100492, + "epoch": 0.864692619870735, + "flos": 14576243349600.0, + "grad_norm": 2.4732668684258314, + "language_loss": 0.84523165, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.86663812, + "num_input_tokens_seen": 310242625, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10797119, + "step": 14382, + "time_per_iteration": 2.640962839126587 + }, + { + "auxiliary_loss_clip": 0.0111179, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.03825855, + "balance_loss_mlp": 1.0200901, + "epoch": 0.864752743123403, + "flos": 26198628871680.0, + "grad_norm": 1.7688505764206117, + "language_loss": 0.75990796, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.78134584, + "num_input_tokens_seen": 310260585, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11907959, + "step": 14383, + "time_per_iteration": 2.6691534519195557 + }, + { + "auxiliary_loss_clip": 0.01110734, + "auxiliary_loss_mlp": 0.01028692, + "balance_loss_clip": 1.04020619, + "balance_loss_mlp": 1.0175755, + "epoch": 0.8648128663760709, + "flos": 23831095082400.0, + "grad_norm": 2.630382963212073, + "language_loss": 0.84985936, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.87125361, + "num_input_tokens_seen": 310277210, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11114502, + "step": 14384, + "time_per_iteration": 2.6227335929870605 + }, + { + "auxiliary_loss_clip": 0.01107564, + "auxiliary_loss_mlp": 0.01026685, + "balance_loss_clip": 1.03660417, + "balance_loss_mlp": 1.01586127, + "epoch": 0.8648729896287389, + "flos": 25975653514560.0, + "grad_norm": 3.2919788791141618, + "language_loss": 0.80622518, + "learning_rate": 1.884236463176072e-07, + "loss": 0.8275677, + "num_input_tokens_seen": 310296610, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10827637, + "step": 14385, + "time_per_iteration": 3.922849178314209 + }, + { + "auxiliary_loss_clip": 0.01113349, + "auxiliary_loss_mlp": 0.01028899, + "balance_loss_clip": 1.0390594, + "balance_loss_mlp": 1.0177052, + "epoch": 0.8649331128814068, + "flos": 30023076038400.0, + "grad_norm": 2.521093694010299, + "language_loss": 0.72153568, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.74295813, + "num_input_tokens_seen": 310316830, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11199951, + "step": 14386, + "time_per_iteration": 2.721487522125244 + }, + { + "auxiliary_loss_clip": 0.0110972, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.03830373, + "balance_loss_mlp": 1.02069831, + "epoch": 0.8649932361340749, + "flos": 18763593507360.0, + "grad_norm": 1.8903549846379797, + "language_loss": 0.81910628, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.84052163, + "num_input_tokens_seen": 310334355, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11120605, + "step": 14387, + "time_per_iteration": 2.615659713745117 + }, + { + "auxiliary_loss_clip": 0.0110793, + "auxiliary_loss_mlp": 0.010244, + "balance_loss_clip": 1.03820348, + "balance_loss_mlp": 1.01368284, + "epoch": 0.8650533593867428, + "flos": 24283488044160.0, + "grad_norm": 1.9999949671386295, + "language_loss": 0.68614638, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.70746964, + "num_input_tokens_seen": 310352900, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.1071167, + "step": 14388, + "time_per_iteration": 2.7401397228240967 + }, + { + "auxiliary_loss_clip": 0.01106784, + "auxiliary_loss_mlp": 0.01030967, + "balance_loss_clip": 1.03791213, + "balance_loss_mlp": 1.02072096, + "epoch": 0.8651134826394108, + "flos": 31270668382080.0, + "grad_norm": 1.9104303886288703, + "language_loss": 0.90342641, + "learning_rate": 1.877640883285283e-07, + "loss": 0.92480397, + "num_input_tokens_seen": 310372855, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10241699, + "step": 14389, + "time_per_iteration": 4.19947361946106 + }, + { + "auxiliary_loss_clip": 0.01107825, + "auxiliary_loss_mlp": 0.01026738, + "balance_loss_clip": 1.03774071, + "balance_loss_mlp": 1.01608682, + "epoch": 0.8651736058920788, + "flos": 22859711451360.0, + "grad_norm": 1.657951283245825, + "language_loss": 0.70775962, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.72910523, + "num_input_tokens_seen": 310391595, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10650635, + "step": 14390, + "time_per_iteration": 2.641500949859619 + }, + { + "auxiliary_loss_clip": 0.01110137, + "auxiliary_loss_mlp": 0.01036439, + "balance_loss_clip": 1.03716183, + "balance_loss_mlp": 1.02526903, + "epoch": 0.8652337291447467, + "flos": 25351897860000.0, + "grad_norm": 1.6201193737552222, + "language_loss": 0.82405812, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.84552383, + "num_input_tokens_seen": 310410090, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11175537, + "step": 14391, + "time_per_iteration": 2.657581090927124 + }, + { + "auxiliary_loss_clip": 0.01028098, + "auxiliary_loss_mlp": 0.01001357, + "balance_loss_clip": 1.0058111, + "balance_loss_mlp": 1.0004009, + "epoch": 0.8652938523974147, + "flos": 78371250049440.0, + "grad_norm": 0.7921706955054897, + "language_loss": 0.67919195, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.69948649, + "num_input_tokens_seen": 310470055, + "router_z_loss_clip": 0.22290039, + "router_z_loss_mlp": 0.009552, + "step": 14392, + "time_per_iteration": 3.152642011642456 + }, + { + "auxiliary_loss_clip": 0.01112801, + "auxiliary_loss_mlp": 0.01030657, + "balance_loss_clip": 1.03759885, + "balance_loss_mlp": 1.01911139, + "epoch": 0.8653539756500827, + "flos": 22012980439680.0, + "grad_norm": 2.0050398599386026, + "language_loss": 0.76006615, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.78150076, + "num_input_tokens_seen": 310487665, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.11547852, + "step": 14393, + "time_per_iteration": 2.634634256362915 + }, + { + "auxiliary_loss_clip": 0.01109477, + "auxiliary_loss_mlp": 0.01030743, + "balance_loss_clip": 1.03582573, + "balance_loss_mlp": 1.01975787, + "epoch": 0.8654140989027507, + "flos": 21209961222720.0, + "grad_norm": 1.90207248100061, + "language_loss": 0.73413527, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.75553751, + "num_input_tokens_seen": 310506130, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10968018, + "step": 14394, + "time_per_iteration": 2.6087818145751953 + }, + { + "auxiliary_loss_clip": 0.01112202, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.03784919, + "balance_loss_mlp": 1.01742089, + "epoch": 0.8654742221554186, + "flos": 65020841264160.0, + "grad_norm": 1.9143246745950606, + "language_loss": 0.65367824, + "learning_rate": 1.867768130747036e-07, + "loss": 0.67509115, + "num_input_tokens_seen": 310532445, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11676025, + "step": 14395, + "time_per_iteration": 4.213276624679565 + }, + { + "auxiliary_loss_clip": 0.01108851, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.03821492, + "balance_loss_mlp": 1.02390623, + "epoch": 0.8655343454080866, + "flos": 29181045031200.0, + "grad_norm": 1.9180735722060052, + "language_loss": 0.68139744, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.70283532, + "num_input_tokens_seen": 310552300, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11035156, + "step": 14396, + "time_per_iteration": 2.7241458892822266 + }, + { + "auxiliary_loss_clip": 0.01114921, + "auxiliary_loss_mlp": 0.01036913, + "balance_loss_clip": 1.04058993, + "balance_loss_mlp": 1.02545667, + "epoch": 0.8655944686607545, + "flos": 29403736767360.0, + "grad_norm": 3.984817567191364, + "language_loss": 0.69455731, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.71607554, + "num_input_tokens_seen": 310572710, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11462402, + "step": 14397, + "time_per_iteration": 2.622081995010376 + }, + { + "auxiliary_loss_clip": 0.01108373, + "auxiliary_loss_mlp": 0.01026923, + "balance_loss_clip": 1.03615308, + "balance_loss_mlp": 1.01639664, + "epoch": 0.8656545919134225, + "flos": 28202327772480.0, + "grad_norm": 2.0942050914473036, + "language_loss": 0.63618135, + "learning_rate": 1.86284103591253e-07, + "loss": 0.65753436, + "num_input_tokens_seen": 310592460, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10534668, + "step": 14398, + "time_per_iteration": 2.7130379676818848 + }, + { + "auxiliary_loss_clip": 0.01111297, + "auxiliary_loss_mlp": 0.0102837, + "balance_loss_clip": 1.03950167, + "balance_loss_mlp": 1.01765895, + "epoch": 0.8657147151660904, + "flos": 25797281332320.0, + "grad_norm": 8.040086311940295, + "language_loss": 0.7610805, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.78247714, + "num_input_tokens_seen": 310609375, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.1071167, + "step": 14399, + "time_per_iteration": 2.626244306564331 + }, + { + "auxiliary_loss_clip": 0.01107286, + "auxiliary_loss_mlp": 0.01026058, + "balance_loss_clip": 1.03670287, + "balance_loss_mlp": 1.01560903, + "epoch": 0.8657748384187585, + "flos": 19876079773440.0, + "grad_norm": 2.4365243912785735, + "language_loss": 0.93106008, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.95239353, + "num_input_tokens_seen": 310627405, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10449219, + "step": 14400, + "time_per_iteration": 2.691859006881714 + }, + { + "auxiliary_loss_clip": 0.01111119, + "auxiliary_loss_mlp": 0.01029919, + "balance_loss_clip": 1.03851759, + "balance_loss_mlp": 1.01903558, + "epoch": 0.8658349616714264, + "flos": 37636848240480.0, + "grad_norm": 2.3050408069580137, + "language_loss": 0.67840171, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.69981211, + "num_input_tokens_seen": 310649945, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10888672, + "step": 14401, + "time_per_iteration": 2.7277815341949463 + }, + { + "auxiliary_loss_clip": 0.01112211, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.0378902, + "balance_loss_mlp": 1.01740718, + "epoch": 0.8658950849240944, + "flos": 23126196016800.0, + "grad_norm": 2.2464500197075457, + "language_loss": 0.74256295, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.76397371, + "num_input_tokens_seen": 310668285, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11450195, + "step": 14402, + "time_per_iteration": 2.7047536373138428 + }, + { + "auxiliary_loss_clip": 0.01107487, + "auxiliary_loss_mlp": 0.01028434, + "balance_loss_clip": 1.03687251, + "balance_loss_mlp": 1.01797938, + "epoch": 0.8659552081767624, + "flos": 28509728440320.0, + "grad_norm": 1.6940567737378027, + "language_loss": 0.74660516, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.76796442, + "num_input_tokens_seen": 310687015, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10455322, + "step": 14403, + "time_per_iteration": 2.6133625507354736 + }, + { + "auxiliary_loss_clip": 0.01111976, + "auxiliary_loss_mlp": 0.01029006, + "balance_loss_clip": 1.03845489, + "balance_loss_mlp": 1.01678133, + "epoch": 0.8660153314294303, + "flos": 29092284371520.0, + "grad_norm": 1.9007615971275296, + "language_loss": 0.73018014, + "learning_rate": 1.853005417520368e-07, + "loss": 0.75158989, + "num_input_tokens_seen": 310707580, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12237549, + "step": 14404, + "time_per_iteration": 2.690000295639038 + }, + { + "auxiliary_loss_clip": 0.01111077, + "auxiliary_loss_mlp": 0.01031776, + "balance_loss_clip": 1.04018021, + "balance_loss_mlp": 1.02005768, + "epoch": 0.8660754546820983, + "flos": 28202044151520.0, + "grad_norm": 2.388205084992342, + "language_loss": 0.70582068, + "learning_rate": 1.851368555901447e-07, + "loss": 0.72724921, + "num_input_tokens_seen": 310727300, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11724854, + "step": 14405, + "time_per_iteration": 2.614819049835205 + }, + { + "auxiliary_loss_clip": 0.01114031, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.03920388, + "balance_loss_mlp": 1.0196172, + "epoch": 0.8661355779347663, + "flos": 17560685373120.0, + "grad_norm": 2.394701698460315, + "language_loss": 0.66491842, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.68636733, + "num_input_tokens_seen": 310744935, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11242676, + "step": 14406, + "time_per_iteration": 2.635430097579956 + }, + { + "auxiliary_loss_clip": 0.01108974, + "auxiliary_loss_mlp": 0.01024869, + "balance_loss_clip": 1.03743923, + "balance_loss_mlp": 1.01453972, + "epoch": 0.8661957011874343, + "flos": 26686224999360.0, + "grad_norm": 1.5952035849216313, + "language_loss": 0.83027375, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.85161221, + "num_input_tokens_seen": 310765085, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10327148, + "step": 14407, + "time_per_iteration": 2.694638967514038 + }, + { + "auxiliary_loss_clip": 0.01110527, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.03936005, + "balance_loss_mlp": 1.02246785, + "epoch": 0.8662558244401022, + "flos": 26643404584800.0, + "grad_norm": 2.2025775607726232, + "language_loss": 0.70027685, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.72171509, + "num_input_tokens_seen": 310783260, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10809326, + "step": 14408, + "time_per_iteration": 2.6100893020629883 + }, + { + "auxiliary_loss_clip": 0.01104116, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.03523755, + "balance_loss_mlp": 1.02065039, + "epoch": 0.8663159476927702, + "flos": 21211865534880.0, + "grad_norm": 3.378183848157094, + "language_loss": 0.77498096, + "learning_rate": 1.844827992025304e-07, + "loss": 0.79633141, + "num_input_tokens_seen": 310801970, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.1027832, + "step": 14409, + "time_per_iteration": 2.6804885864257812 + }, + { + "auxiliary_loss_clip": 0.01114013, + "auxiliary_loss_mlp": 0.01031632, + "balance_loss_clip": 1.0401752, + "balance_loss_mlp": 1.01959765, + "epoch": 0.8663760709454381, + "flos": 27756944300160.0, + "grad_norm": 1.9412876062699471, + "language_loss": 0.76952398, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.79098046, + "num_input_tokens_seen": 310822070, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.1204834, + "step": 14410, + "time_per_iteration": 2.7163071632385254 + }, + { + "auxiliary_loss_clip": 0.01110082, + "auxiliary_loss_mlp": 0.01032684, + "balance_loss_clip": 1.03794241, + "balance_loss_mlp": 1.02175295, + "epoch": 0.8664361941981061, + "flos": 21203681044320.0, + "grad_norm": 1.8825597206084395, + "language_loss": 0.77692127, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.79834896, + "num_input_tokens_seen": 310838355, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.109375, + "step": 14411, + "time_per_iteration": 2.695333242416382 + }, + { + "auxiliary_loss_clip": 0.01106503, + "auxiliary_loss_mlp": 0.01031891, + "balance_loss_clip": 1.03526974, + "balance_loss_mlp": 1.02152574, + "epoch": 0.866496317450774, + "flos": 19564870481280.0, + "grad_norm": 2.019277240524971, + "language_loss": 0.74047375, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.76185769, + "num_input_tokens_seen": 310856055, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10369873, + "step": 14412, + "time_per_iteration": 2.630208969116211 + }, + { + "auxiliary_loss_clip": 0.01107044, + "auxiliary_loss_mlp": 0.01027854, + "balance_loss_clip": 1.03783691, + "balance_loss_mlp": 1.01762009, + "epoch": 0.8665564407034421, + "flos": 25398202760640.0, + "grad_norm": 1.7100957719928311, + "language_loss": 0.6970517, + "learning_rate": 1.83829844328371e-07, + "loss": 0.71840072, + "num_input_tokens_seen": 310876695, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10241699, + "step": 14413, + "time_per_iteration": 2.717034101486206 + }, + { + "auxiliary_loss_clip": 0.01110218, + "auxiliary_loss_mlp": 0.01026494, + "balance_loss_clip": 1.03849828, + "balance_loss_mlp": 1.01518679, + "epoch": 0.86661656395611, + "flos": 19297332466560.0, + "grad_norm": 2.1130117536505226, + "language_loss": 0.63143718, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.65280426, + "num_input_tokens_seen": 310893880, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11309814, + "step": 14414, + "time_per_iteration": 2.6845738887786865 + }, + { + "auxiliary_loss_clip": 0.01110733, + "auxiliary_loss_mlp": 0.01028326, + "balance_loss_clip": 1.03829217, + "balance_loss_mlp": 1.01718569, + "epoch": 0.866676687208778, + "flos": 28110082626720.0, + "grad_norm": 1.7945269761088214, + "language_loss": 0.63972139, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.66111207, + "num_input_tokens_seen": 310914145, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11138916, + "step": 14415, + "time_per_iteration": 2.7629806995391846 + }, + { + "auxiliary_loss_clip": 0.01028105, + "auxiliary_loss_mlp": 0.01001567, + "balance_loss_clip": 1.00579321, + "balance_loss_mlp": 1.00061893, + "epoch": 0.866736810461446, + "flos": 77850760240800.0, + "grad_norm": 0.8005325695552559, + "language_loss": 0.60361028, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.62390697, + "num_input_tokens_seen": 310972825, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00947571, + "step": 14416, + "time_per_iteration": 3.3327598571777344 + }, + { + "auxiliary_loss_clip": 0.01111058, + "auxiliary_loss_mlp": 0.01033278, + "balance_loss_clip": 1.03669333, + "balance_loss_mlp": 1.0218637, + "epoch": 0.8667969337141139, + "flos": 24952292563680.0, + "grad_norm": 6.424827722404769, + "language_loss": 0.74531651, + "learning_rate": 1.831779913638285e-07, + "loss": 0.76675981, + "num_input_tokens_seen": 310992050, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11419678, + "step": 14417, + "time_per_iteration": 4.127086877822876 + }, + { + "auxiliary_loss_clip": 0.01107387, + "auxiliary_loss_mlp": 0.01035123, + "balance_loss_clip": 1.03651965, + "balance_loss_mlp": 1.02410781, + "epoch": 0.866857056966782, + "flos": 26421887849760.0, + "grad_norm": 1.8913704529231963, + "language_loss": 0.75037229, + "learning_rate": 1.830152003424319e-07, + "loss": 0.77179742, + "num_input_tokens_seen": 311011105, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11016846, + "step": 14418, + "time_per_iteration": 2.6724934577941895 + }, + { + "auxiliary_loss_clip": 0.01108988, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.037184, + "balance_loss_mlp": 1.02129078, + "epoch": 0.8669171802194499, + "flos": 27885608130240.0, + "grad_norm": 3.406854037446285, + "language_loss": 0.68126935, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.70268112, + "num_input_tokens_seen": 311032080, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10894775, + "step": 14419, + "time_per_iteration": 2.688683271408081 + }, + { + "auxiliary_loss_clip": 0.01109266, + "auxiliary_loss_mlp": 0.0103035, + "balance_loss_clip": 1.03702402, + "balance_loss_mlp": 1.02033091, + "epoch": 0.8669773034721179, + "flos": 22859306278560.0, + "grad_norm": 1.8790186531669928, + "language_loss": 0.78684944, + "learning_rate": 1.826898250065465e-07, + "loss": 0.8082456, + "num_input_tokens_seen": 311049735, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.10015869, + "step": 14420, + "time_per_iteration": 2.6472854614257812 + }, + { + "auxiliary_loss_clip": 0.0110842, + "auxiliary_loss_mlp": 0.01026641, + "balance_loss_clip": 1.03759265, + "balance_loss_mlp": 1.0159595, + "epoch": 0.8670374267247858, + "flos": 23081268703680.0, + "grad_norm": 1.5784743628464428, + "language_loss": 0.83676648, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.8581171, + "num_input_tokens_seen": 311067675, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10681152, + "step": 14421, + "time_per_iteration": 2.621469736099243 + }, + { + "auxiliary_loss_clip": 0.01028382, + "auxiliary_loss_mlp": 0.01001399, + "balance_loss_clip": 1.00598836, + "balance_loss_mlp": 1.00037622, + "epoch": 0.8670975499774538, + "flos": 59560831267200.0, + "grad_norm": 0.7032973272809853, + "language_loss": 0.49098924, + "learning_rate": 1.823647253209941e-07, + "loss": 0.51128703, + "num_input_tokens_seen": 311126605, + "router_z_loss_clip": 0.22399902, + "router_z_loss_mlp": 0.01023102, + "step": 14422, + "time_per_iteration": 3.286194086074829 + }, + { + "auxiliary_loss_clip": 0.01108717, + "auxiliary_loss_mlp": 0.01023649, + "balance_loss_clip": 1.03725433, + "balance_loss_mlp": 1.01347458, + "epoch": 0.8671576732301217, + "flos": 31892195586240.0, + "grad_norm": 3.8681335773026877, + "language_loss": 0.73603761, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.75736117, + "num_input_tokens_seen": 311147325, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10168457, + "step": 14423, + "time_per_iteration": 2.6965444087982178 + }, + { + "auxiliary_loss_clip": 0.01105169, + "auxiliary_loss_mlp": 0.01024293, + "balance_loss_clip": 1.03673375, + "balance_loss_mlp": 1.01453543, + "epoch": 0.8672177964827897, + "flos": 22411977976800.0, + "grad_norm": 3.2843384743238486, + "language_loss": 0.76660872, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.78790331, + "num_input_tokens_seen": 311165385, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.09753418, + "step": 14424, + "time_per_iteration": 4.037229061126709 + }, + { + "auxiliary_loss_clip": 0.01105655, + "auxiliary_loss_mlp": 0.01030575, + "balance_loss_clip": 1.03701699, + "balance_loss_mlp": 1.02025771, + "epoch": 0.8672779197354576, + "flos": 34831993917600.0, + "grad_norm": 1.4699150208468361, + "language_loss": 0.7145617, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.73592401, + "num_input_tokens_seen": 311185860, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.10321045, + "step": 14425, + "time_per_iteration": 2.690826416015625 + }, + { + "auxiliary_loss_clip": 0.01111562, + "auxiliary_loss_mlp": 0.01029296, + "balance_loss_clip": 1.03777218, + "balance_loss_mlp": 1.01761913, + "epoch": 0.8673380429881257, + "flos": 27311277206880.0, + "grad_norm": 1.5058019837320031, + "language_loss": 0.67917913, + "learning_rate": 1.817153530980926e-07, + "loss": 0.70058775, + "num_input_tokens_seen": 311205810, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11682129, + "step": 14426, + "time_per_iteration": 2.701817274093628 + }, + { + "auxiliary_loss_clip": 0.01110496, + "auxiliary_loss_mlp": 0.01025954, + "balance_loss_clip": 1.03775311, + "balance_loss_mlp": 1.01440907, + "epoch": 0.8673981662407936, + "flos": 25619881564800.0, + "grad_norm": 1.81004708876254, + "language_loss": 0.70698178, + "learning_rate": 1.815531824008234e-07, + "loss": 0.72834629, + "num_input_tokens_seen": 311226080, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11547852, + "step": 14427, + "time_per_iteration": 2.6307692527770996 + }, + { + "auxiliary_loss_clip": 0.01111139, + "auxiliary_loss_mlp": 0.01028227, + "balance_loss_clip": 1.03899598, + "balance_loss_mlp": 1.01721752, + "epoch": 0.8674582894934616, + "flos": 29805813617760.0, + "grad_norm": 1.6243384450231024, + "language_loss": 0.67977172, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.70116544, + "num_input_tokens_seen": 311246380, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11004639, + "step": 14428, + "time_per_iteration": 4.135229587554932 + }, + { + "auxiliary_loss_clip": 0.01108315, + "auxiliary_loss_mlp": 0.01025427, + "balance_loss_clip": 1.03629518, + "balance_loss_mlp": 1.01493096, + "epoch": 0.8675184127461296, + "flos": 25304255889120.0, + "grad_norm": 2.1358712621901823, + "language_loss": 0.70638251, + "learning_rate": 1.812290478794889e-07, + "loss": 0.72771996, + "num_input_tokens_seen": 311266465, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10491943, + "step": 14429, + "time_per_iteration": 2.5799038410186768 + }, + { + "auxiliary_loss_clip": 0.01109099, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.03771937, + "balance_loss_mlp": 1.01900709, + "epoch": 0.8675785359987975, + "flos": 23349211891200.0, + "grad_norm": 1.9293109480883974, + "language_loss": 0.67086381, + "learning_rate": 1.810670840677151e-07, + "loss": 0.6922552, + "num_input_tokens_seen": 311285075, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11029053, + "step": 14430, + "time_per_iteration": 2.661384344100952 + }, + { + "auxiliary_loss_clip": 0.01112298, + "auxiliary_loss_mlp": 0.01039764, + "balance_loss_clip": 1.03870606, + "balance_loss_mlp": 1.0275805, + "epoch": 0.8676386592514655, + "flos": 27711773883360.0, + "grad_norm": 2.26854728571599, + "language_loss": 0.6955964, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.71711707, + "num_input_tokens_seen": 311303230, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12188721, + "step": 14431, + "time_per_iteration": 2.6253411769866943 + }, + { + "auxiliary_loss_clip": 0.01111174, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.03815246, + "balance_loss_mlp": 1.02360475, + "epoch": 0.8676987825041335, + "flos": 17338398809760.0, + "grad_norm": 2.259401967685122, + "language_loss": 0.63233292, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.65379375, + "num_input_tokens_seen": 311318070, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11315918, + "step": 14432, + "time_per_iteration": 2.6441657543182373 + }, + { + "auxiliary_loss_clip": 0.01110647, + "auxiliary_loss_mlp": 0.01033777, + "balance_loss_clip": 1.03835583, + "balance_loss_mlp": 1.023561, + "epoch": 0.8677589057568015, + "flos": 16581887079840.0, + "grad_norm": 1.890860712537703, + "language_loss": 0.78162414, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.8030684, + "num_input_tokens_seen": 311334885, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.10217285, + "step": 14433, + "time_per_iteration": 2.6240603923797607 + }, + { + "auxiliary_loss_clip": 0.01028294, + "auxiliary_loss_mlp": 0.01002157, + "balance_loss_clip": 1.00586843, + "balance_loss_mlp": 1.00112855, + "epoch": 0.8678190290094694, + "flos": 84113674253280.0, + "grad_norm": 0.7028423090379078, + "language_loss": 0.58428675, + "learning_rate": 1.804199186231805e-07, + "loss": 0.60459125, + "num_input_tokens_seen": 311399780, + "router_z_loss_clip": 0.22436523, + "router_z_loss_mlp": 0.01029205, + "step": 14434, + "time_per_iteration": 3.3327646255493164 + }, + { + "auxiliary_loss_clip": 0.01105967, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.0369333, + "balance_loss_mlp": 1.01933813, + "epoch": 0.8678791522621374, + "flos": 39728294868960.0, + "grad_norm": 1.8083638861030833, + "language_loss": 0.79957271, + "learning_rate": 1.802582997433628e-07, + "loss": 0.82092798, + "num_input_tokens_seen": 311419610, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.10223389, + "step": 14435, + "time_per_iteration": 4.005856513977051 + }, + { + "auxiliary_loss_clip": 0.01108782, + "auxiliary_loss_mlp": 0.01027344, + "balance_loss_clip": 1.03522754, + "balance_loss_mlp": 1.01619828, + "epoch": 0.8679392755148053, + "flos": 42760216794240.0, + "grad_norm": 2.2515627561421705, + "language_loss": 0.62060153, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.64196283, + "num_input_tokens_seen": 311440045, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11151123, + "step": 14436, + "time_per_iteration": 2.812195062637329 + }, + { + "auxiliary_loss_clip": 0.01111992, + "auxiliary_loss_mlp": 0.01028914, + "balance_loss_clip": 1.03945196, + "balance_loss_mlp": 1.01716614, + "epoch": 0.8679993987674733, + "flos": 22637546439840.0, + "grad_norm": 2.761989755435746, + "language_loss": 0.70845127, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.72986031, + "num_input_tokens_seen": 311456660, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11761475, + "step": 14437, + "time_per_iteration": 2.5839602947235107 + }, + { + "auxiliary_loss_clip": 0.01109807, + "auxiliary_loss_mlp": 0.0102497, + "balance_loss_clip": 1.03779387, + "balance_loss_mlp": 1.01386595, + "epoch": 0.8680595220201412, + "flos": 33500381436000.0, + "grad_norm": 4.732637130019985, + "language_loss": 0.80660319, + "learning_rate": 1.797738571571381e-07, + "loss": 0.82795095, + "num_input_tokens_seen": 311475460, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11102295, + "step": 14438, + "time_per_iteration": 2.7384116649627686 + }, + { + "auxiliary_loss_clip": 0.01105045, + "auxiliary_loss_mlp": 0.01024394, + "balance_loss_clip": 1.03545642, + "balance_loss_mlp": 1.0136174, + "epoch": 0.8681196452728093, + "flos": 23438458758240.0, + "grad_norm": 1.9090551746398245, + "language_loss": 0.67304027, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.69433469, + "num_input_tokens_seen": 311494575, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10772705, + "step": 14439, + "time_per_iteration": 2.6065518856048584 + }, + { + "auxiliary_loss_clip": 0.01106614, + "auxiliary_loss_mlp": 0.01030896, + "balance_loss_clip": 1.03575563, + "balance_loss_mlp": 1.02056086, + "epoch": 0.8681797685254772, + "flos": 45832609131840.0, + "grad_norm": 10.59655679919856, + "language_loss": 0.63969004, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.6610651, + "num_input_tokens_seen": 311515805, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10333252, + "step": 14440, + "time_per_iteration": 2.8298795223236084 + }, + { + "auxiliary_loss_clip": 0.01108271, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.03915572, + "balance_loss_mlp": 1.01857626, + "epoch": 0.8682398917781452, + "flos": 28419063468480.0, + "grad_norm": 1.6832604025476396, + "language_loss": 0.6580832, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.67946255, + "num_input_tokens_seen": 311536000, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.11090088, + "step": 14441, + "time_per_iteration": 2.663902997970581 + }, + { + "auxiliary_loss_clip": 0.01108185, + "auxiliary_loss_mlp": 0.01027335, + "balance_loss_clip": 1.03893185, + "balance_loss_mlp": 1.01690435, + "epoch": 0.8683000150308132, + "flos": 26689223278080.0, + "grad_norm": 1.5321581072228496, + "language_loss": 0.66769761, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.68905282, + "num_input_tokens_seen": 311556220, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10443115, + "step": 14442, + "time_per_iteration": 2.715717077255249 + }, + { + "auxiliary_loss_clip": 0.01112138, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.03812468, + "balance_loss_mlp": 1.01721537, + "epoch": 0.8683601382834811, + "flos": 17872137768960.0, + "grad_norm": 1.8527168696230796, + "language_loss": 0.72477567, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.74619347, + "num_input_tokens_seen": 311572530, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12420654, + "step": 14443, + "time_per_iteration": 2.6793980598449707 + }, + { + "auxiliary_loss_clip": 0.01110621, + "auxiliary_loss_mlp": 0.01026519, + "balance_loss_clip": 1.03779626, + "balance_loss_mlp": 1.01519394, + "epoch": 0.8684202615361492, + "flos": 32164798260960.0, + "grad_norm": 1.7701368240904563, + "language_loss": 0.83105469, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.85242611, + "num_input_tokens_seen": 311591105, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11328125, + "step": 14444, + "time_per_iteration": 2.713832378387451 + }, + { + "auxiliary_loss_clip": 0.01111996, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.03908014, + "balance_loss_mlp": 1.01669002, + "epoch": 0.8684803847888171, + "flos": 25263096683040.0, + "grad_norm": 2.154796646363324, + "language_loss": 0.77480155, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.79620236, + "num_input_tokens_seen": 311608350, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11395264, + "step": 14445, + "time_per_iteration": 2.6258127689361572 + }, + { + "auxiliary_loss_clip": 0.01110249, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.03811932, + "balance_loss_mlp": 1.0209012, + "epoch": 0.8685405080414851, + "flos": 27623215810080.0, + "grad_norm": 2.14354153898497, + "language_loss": 0.68070805, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.7021296, + "num_input_tokens_seen": 311626380, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11004639, + "step": 14446, + "time_per_iteration": 2.6687941551208496 + }, + { + "auxiliary_loss_clip": 0.01108327, + "auxiliary_loss_mlp": 0.01028487, + "balance_loss_clip": 1.03737974, + "balance_loss_mlp": 1.01716757, + "epoch": 0.868600631294153, + "flos": 30288709740960.0, + "grad_norm": 1.9369168541774204, + "language_loss": 0.83350348, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.85487163, + "num_input_tokens_seen": 311644345, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11322021, + "step": 14447, + "time_per_iteration": 2.6744277477264404 + }, + { + "auxiliary_loss_clip": 0.01108543, + "auxiliary_loss_mlp": 0.01027953, + "balance_loss_clip": 1.03695953, + "balance_loss_mlp": 1.01720607, + "epoch": 0.868660754546821, + "flos": 30644927380800.0, + "grad_norm": 1.5528793898285207, + "language_loss": 0.74143422, + "learning_rate": 1.781635359686515e-07, + "loss": 0.76279914, + "num_input_tokens_seen": 311663340, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10736084, + "step": 14448, + "time_per_iteration": 2.7140774726867676 + }, + { + "auxiliary_loss_clip": 0.01110618, + "auxiliary_loss_mlp": 0.01027217, + "balance_loss_clip": 1.03824878, + "balance_loss_mlp": 1.01574326, + "epoch": 0.8687208777994889, + "flos": 15468874089120.0, + "grad_norm": 2.763738671728206, + "language_loss": 0.80349284, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.82487118, + "num_input_tokens_seen": 311679860, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11474609, + "step": 14449, + "time_per_iteration": 2.631096839904785 + }, + { + "auxiliary_loss_clip": 0.01028235, + "auxiliary_loss_mlp": 0.01001474, + "balance_loss_clip": 1.00590181, + "balance_loss_mlp": 1.00050712, + "epoch": 0.8687810010521569, + "flos": 80066373281280.0, + "grad_norm": 0.8055580874721137, + "language_loss": 0.60592747, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.62622452, + "num_input_tokens_seen": 311738135, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00965881, + "step": 14450, + "time_per_iteration": 3.2170188426971436 + }, + { + "auxiliary_loss_clip": 0.01111767, + "auxiliary_loss_mlp": 0.01027999, + "balance_loss_clip": 1.038625, + "balance_loss_mlp": 1.01694274, + "epoch": 0.8688411243048249, + "flos": 29582797743360.0, + "grad_norm": 2.2576223844148067, + "language_loss": 0.75802374, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.77942145, + "num_input_tokens_seen": 311756975, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11065674, + "step": 14451, + "time_per_iteration": 2.7004737854003906 + }, + { + "auxiliary_loss_clip": 0.01108006, + "auxiliary_loss_mlp": 0.01024731, + "balance_loss_clip": 1.03703427, + "balance_loss_mlp": 1.01384687, + "epoch": 0.8689012475574929, + "flos": 22234254071040.0, + "grad_norm": 3.443024437815121, + "language_loss": 0.7218467, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.74317408, + "num_input_tokens_seen": 311771830, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10888672, + "step": 14452, + "time_per_iteration": 2.680119514465332 + }, + { + "auxiliary_loss_clip": 0.01113065, + "auxiliary_loss_mlp": 0.0103032, + "balance_loss_clip": 1.03944647, + "balance_loss_mlp": 1.01832783, + "epoch": 0.8689613708101608, + "flos": 23972845993920.0, + "grad_norm": 1.4162449826008403, + "language_loss": 0.72262055, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.74405432, + "num_input_tokens_seen": 311790130, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11999512, + "step": 14453, + "time_per_iteration": 2.7663350105285645 + }, + { + "auxiliary_loss_clip": 0.01110209, + "auxiliary_loss_mlp": 0.01034015, + "balance_loss_clip": 1.0395335, + "balance_loss_mlp": 1.02300572, + "epoch": 0.8690214940628288, + "flos": 14310771716160.0, + "grad_norm": 3.2901489379380986, + "language_loss": 0.73867744, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.76011968, + "num_input_tokens_seen": 311808360, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11004639, + "step": 14454, + "time_per_iteration": 2.6441526412963867 + }, + { + "auxiliary_loss_clip": 0.01110262, + "auxiliary_loss_mlp": 0.01031067, + "balance_loss_clip": 1.03867006, + "balance_loss_mlp": 1.01999879, + "epoch": 0.8690816173154968, + "flos": 42630337445760.0, + "grad_norm": 2.7841416811375947, + "language_loss": 0.59673238, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.6181457, + "num_input_tokens_seen": 311831325, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11071777, + "step": 14455, + "time_per_iteration": 2.7633044719696045 + }, + { + "auxiliary_loss_clip": 0.01110378, + "auxiliary_loss_mlp": 0.01026985, + "balance_loss_clip": 1.03693438, + "balance_loss_mlp": 1.01587498, + "epoch": 0.8691417405681647, + "flos": 14172140635200.0, + "grad_norm": 2.337610378219423, + "language_loss": 0.80413389, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.82550746, + "num_input_tokens_seen": 311848090, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11114502, + "step": 14456, + "time_per_iteration": 2.7068252563476562 + }, + { + "auxiliary_loss_clip": 0.01115577, + "auxiliary_loss_mlp": 0.01034135, + "balance_loss_clip": 1.03968668, + "balance_loss_mlp": 1.02150464, + "epoch": 0.8692018638208328, + "flos": 30027087249120.0, + "grad_norm": 3.1924516656649624, + "language_loss": 0.74423969, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.76573682, + "num_input_tokens_seen": 311867855, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12628174, + "step": 14457, + "time_per_iteration": 4.110087633132935 + }, + { + "auxiliary_loss_clip": 0.0110704, + "auxiliary_loss_mlp": 0.01025323, + "balance_loss_clip": 1.03732562, + "balance_loss_mlp": 1.01453495, + "epoch": 0.8692619870735007, + "flos": 31719657892320.0, + "grad_norm": 1.4911743467096694, + "language_loss": 0.78411257, + "learning_rate": 1.765601232001328e-07, + "loss": 0.80543625, + "num_input_tokens_seen": 311888675, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10784912, + "step": 14458, + "time_per_iteration": 2.7290050983428955 + }, + { + "auxiliary_loss_clip": 0.01110761, + "auxiliary_loss_mlp": 0.01036431, + "balance_loss_clip": 1.03870964, + "balance_loss_mlp": 1.0242238, + "epoch": 0.8693221103261687, + "flos": 22014358027200.0, + "grad_norm": 1.6202440830807356, + "language_loss": 0.70861268, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.73008466, + "num_input_tokens_seen": 311907310, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.12207031, + "step": 14459, + "time_per_iteration": 2.6168434619903564 + }, + { + "auxiliary_loss_clip": 0.01104687, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.03723001, + "balance_loss_mlp": 1.02112556, + "epoch": 0.8693822335788366, + "flos": 33546564784800.0, + "grad_norm": 1.4246347146206058, + "language_loss": 0.73563218, + "learning_rate": 1.762402701923398e-07, + "loss": 0.75699306, + "num_input_tokens_seen": 311929635, + "router_z_loss_clip": 0.67480469, + "router_z_loss_mlp": 0.10266113, + "step": 14460, + "time_per_iteration": 2.7244653701782227 + }, + { + "auxiliary_loss_clip": 0.01114147, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.03926015, + "balance_loss_mlp": 1.01783776, + "epoch": 0.8694423568315046, + "flos": 29404304009280.0, + "grad_norm": 1.8057758145119256, + "language_loss": 0.64082992, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.66226149, + "num_input_tokens_seen": 311948800, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11175537, + "step": 14461, + "time_per_iteration": 2.601417064666748 + }, + { + "auxiliary_loss_clip": 0.01108919, + "auxiliary_loss_mlp": 0.01030339, + "balance_loss_clip": 1.03624475, + "balance_loss_mlp": 1.0185492, + "epoch": 0.8695024800841725, + "flos": 22405981419360.0, + "grad_norm": 2.3161179014137208, + "language_loss": 0.82860184, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.84999442, + "num_input_tokens_seen": 311964090, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11804199, + "step": 14462, + "time_per_iteration": 2.6138508319854736 + }, + { + "auxiliary_loss_clip": 0.01109111, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.03663778, + "balance_loss_mlp": 1.02223635, + "epoch": 0.8695626033368405, + "flos": 17116152763680.0, + "grad_norm": 1.8266597684623267, + "language_loss": 0.65231383, + "learning_rate": 1.757610093744335e-07, + "loss": 0.67374456, + "num_input_tokens_seen": 311981460, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11737061, + "step": 14463, + "time_per_iteration": 2.615518808364868 + }, + { + "auxiliary_loss_clip": 0.01112985, + "auxiliary_loss_mlp": 0.01033528, + "balance_loss_clip": 1.03911805, + "balance_loss_mlp": 1.02188694, + "epoch": 0.8696227265895085, + "flos": 20544073947360.0, + "grad_norm": 2.123700402890201, + "language_loss": 0.66993791, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.69140309, + "num_input_tokens_seen": 312000115, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11651611, + "step": 14464, + "time_per_iteration": 3.9454777240753174 + }, + { + "auxiliary_loss_clip": 0.01113992, + "auxiliary_loss_mlp": 0.01032715, + "balance_loss_clip": 1.03904712, + "balance_loss_mlp": 1.02177727, + "epoch": 0.8696828498421765, + "flos": 26599328134560.0, + "grad_norm": 2.1185222401715444, + "language_loss": 0.62546265, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.64692974, + "num_input_tokens_seen": 312020770, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.10931396, + "step": 14465, + "time_per_iteration": 2.7275891304016113 + }, + { + "auxiliary_loss_clip": 0.01103755, + "auxiliary_loss_mlp": 0.01035622, + "balance_loss_clip": 1.0354054, + "balance_loss_mlp": 1.02541733, + "epoch": 0.8697429730948444, + "flos": 27936329414400.0, + "grad_norm": 1.482756987494401, + "language_loss": 0.84496999, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.86636376, + "num_input_tokens_seen": 312041870, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.10211182, + "step": 14466, + "time_per_iteration": 2.651029109954834 + }, + { + "auxiliary_loss_clip": 0.01114575, + "auxiliary_loss_mlp": 0.0103968, + "balance_loss_clip": 1.03940439, + "balance_loss_mlp": 1.02718091, + "epoch": 0.8698030963475124, + "flos": 30160613152800.0, + "grad_norm": 3.6249899989180574, + "language_loss": 0.61733472, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.63887727, + "num_input_tokens_seen": 312058210, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12487793, + "step": 14467, + "time_per_iteration": 4.1404664516448975 + }, + { + "auxiliary_loss_clip": 0.0110533, + "auxiliary_loss_mlp": 0.01026994, + "balance_loss_clip": 1.03646398, + "balance_loss_mlp": 1.01641452, + "epoch": 0.8698632196001803, + "flos": 34705558537920.0, + "grad_norm": 1.4570889129165523, + "language_loss": 0.68924737, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.71057057, + "num_input_tokens_seen": 312082665, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10583496, + "step": 14468, + "time_per_iteration": 2.7670979499816895 + }, + { + "auxiliary_loss_clip": 0.01106408, + "auxiliary_loss_mlp": 0.01028813, + "balance_loss_clip": 1.03649545, + "balance_loss_mlp": 1.01835799, + "epoch": 0.8699233428528483, + "flos": 33722586964800.0, + "grad_norm": 2.024504504959228, + "language_loss": 0.71059752, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.73194981, + "num_input_tokens_seen": 312101960, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10449219, + "step": 14469, + "time_per_iteration": 2.684818744659424 + }, + { + "auxiliary_loss_clip": 0.01104886, + "auxiliary_loss_mlp": 0.01026762, + "balance_loss_clip": 1.03747749, + "balance_loss_mlp": 1.01634884, + "epoch": 0.8699834661055164, + "flos": 24459874879680.0, + "grad_norm": 2.127136288909355, + "language_loss": 0.84086466, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.86218107, + "num_input_tokens_seen": 312117125, + "router_z_loss_clip": 0.67431641, + "router_z_loss_mlp": 0.10418701, + "step": 14470, + "time_per_iteration": 2.7088682651519775 + }, + { + "auxiliary_loss_clip": 0.01111601, + "auxiliary_loss_mlp": 0.01027874, + "balance_loss_clip": 1.03968918, + "balance_loss_mlp": 1.01715708, + "epoch": 0.8700435893581843, + "flos": 28959001571520.0, + "grad_norm": 1.7569251112938917, + "language_loss": 0.73020339, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.75159818, + "num_input_tokens_seen": 312135775, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10717773, + "step": 14471, + "time_per_iteration": 2.661811113357544 + }, + { + "auxiliary_loss_clip": 0.01109041, + "auxiliary_loss_mlp": 0.01026215, + "balance_loss_clip": 1.0381906, + "balance_loss_mlp": 1.01580787, + "epoch": 0.8701037126108523, + "flos": 28734202936800.0, + "grad_norm": 1.636990723952828, + "language_loss": 0.78976679, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.81111932, + "num_input_tokens_seen": 312156070, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10406494, + "step": 14472, + "time_per_iteration": 2.7098512649536133 + }, + { + "auxiliary_loss_clip": 0.01109398, + "auxiliary_loss_mlp": 0.01026417, + "balance_loss_clip": 1.03726673, + "balance_loss_mlp": 1.01552701, + "epoch": 0.8701638358635202, + "flos": 22992670113120.0, + "grad_norm": 2.371959802034288, + "language_loss": 0.72745395, + "learning_rate": 1.741679706279644e-07, + "loss": 0.7488122, + "num_input_tokens_seen": 312174380, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10888672, + "step": 14473, + "time_per_iteration": 2.614132881164551 + }, + { + "auxiliary_loss_clip": 0.0111279, + "auxiliary_loss_mlp": 0.01029591, + "balance_loss_clip": 1.03852415, + "balance_loss_mlp": 1.01811767, + "epoch": 0.8702239591161882, + "flos": 34078764087360.0, + "grad_norm": 1.4102458686514372, + "language_loss": 0.72170848, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.74313223, + "num_input_tokens_seen": 312195130, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11486816, + "step": 14474, + "time_per_iteration": 3.96116042137146 + }, + { + "auxiliary_loss_clip": 0.01108222, + "auxiliary_loss_mlp": 0.01033093, + "balance_loss_clip": 1.03620887, + "balance_loss_mlp": 1.02126181, + "epoch": 0.8702840823688561, + "flos": 21029076969120.0, + "grad_norm": 3.230817501777707, + "language_loss": 0.67120707, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.69262028, + "num_input_tokens_seen": 312212300, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11834717, + "step": 14475, + "time_per_iteration": 2.676449775695801 + }, + { + "auxiliary_loss_clip": 0.01108705, + "auxiliary_loss_mlp": 0.01022588, + "balance_loss_clip": 1.03539479, + "balance_loss_mlp": 1.01134038, + "epoch": 0.8703442056215241, + "flos": 23705429531040.0, + "grad_norm": 1.6398759580963775, + "language_loss": 0.77606761, + "learning_rate": 1.736914088262349e-07, + "loss": 0.79738051, + "num_input_tokens_seen": 312231735, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11254883, + "step": 14476, + "time_per_iteration": 2.6331300735473633 + }, + { + "auxiliary_loss_clip": 0.01107332, + "auxiliary_loss_mlp": 0.01025612, + "balance_loss_clip": 1.0375185, + "balance_loss_mlp": 1.01521683, + "epoch": 0.8704043288741921, + "flos": 27178642683360.0, + "grad_norm": 1.535377140339159, + "language_loss": 0.72518426, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.74651372, + "num_input_tokens_seen": 312253060, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10406494, + "step": 14477, + "time_per_iteration": 2.6919641494750977 + }, + { + "auxiliary_loss_clip": 0.01111015, + "auxiliary_loss_mlp": 0.01029047, + "balance_loss_clip": 1.03890371, + "balance_loss_mlp": 1.01828265, + "epoch": 0.8704644521268601, + "flos": 20315020998240.0, + "grad_norm": 2.644968481097659, + "language_loss": 0.59033871, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.61173934, + "num_input_tokens_seen": 312269460, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10760498, + "step": 14478, + "time_per_iteration": 2.6140239238739014 + }, + { + "auxiliary_loss_clip": 0.01110153, + "auxiliary_loss_mlp": 0.01025915, + "balance_loss_clip": 1.04136872, + "balance_loss_mlp": 1.01621711, + "epoch": 0.870524575379528, + "flos": 29626995745440.0, + "grad_norm": 1.6172180209720082, + "language_loss": 0.71596742, + "learning_rate": 1.732154703087323e-07, + "loss": 0.73732805, + "num_input_tokens_seen": 312289830, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.0970459, + "step": 14479, + "time_per_iteration": 2.69624662399292 + }, + { + "auxiliary_loss_clip": 0.01110091, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.03880715, + "balance_loss_mlp": 1.01926029, + "epoch": 0.870584698632196, + "flos": 35104110384960.0, + "grad_norm": 1.76855508774551, + "language_loss": 0.70937204, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.73078197, + "num_input_tokens_seen": 312311320, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11633301, + "step": 14480, + "time_per_iteration": 2.682131767272949 + }, + { + "auxiliary_loss_clip": 0.01110394, + "auxiliary_loss_mlp": 0.01029661, + "balance_loss_clip": 1.03732264, + "balance_loss_mlp": 1.01915228, + "epoch": 0.8706448218848639, + "flos": 39596268104640.0, + "grad_norm": 1.6171566009205984, + "language_loss": 0.70120299, + "learning_rate": 1.728985243129666e-07, + "loss": 0.72260356, + "num_input_tokens_seen": 312332095, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.10510254, + "step": 14481, + "time_per_iteration": 2.7200934886932373 + }, + { + "auxiliary_loss_clip": 0.01108515, + "auxiliary_loss_mlp": 0.0103193, + "balance_loss_clip": 1.03711307, + "balance_loss_mlp": 1.02098656, + "epoch": 0.8707049451375319, + "flos": 27757268438400.0, + "grad_norm": 2.0149313924358663, + "language_loss": 0.76854992, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.7899543, + "num_input_tokens_seen": 312351225, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.109375, + "step": 14482, + "time_per_iteration": 2.6705234050750732 + }, + { + "auxiliary_loss_clip": 0.01109494, + "auxiliary_loss_mlp": 0.01033047, + "balance_loss_clip": 1.03889382, + "balance_loss_mlp": 1.02169847, + "epoch": 0.8707650683902, + "flos": 19342219262400.0, + "grad_norm": 2.73518365434651, + "language_loss": 0.76532447, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.78674984, + "num_input_tokens_seen": 312369730, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11358643, + "step": 14483, + "time_per_iteration": 2.6276967525482178 + }, + { + "auxiliary_loss_clip": 0.01115322, + "auxiliary_loss_mlp": 0.01036387, + "balance_loss_clip": 1.03992093, + "balance_loss_mlp": 1.02379227, + "epoch": 0.8708251916428679, + "flos": 20094476677920.0, + "grad_norm": 2.068721767935498, + "language_loss": 0.61808872, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.63960576, + "num_input_tokens_seen": 312386780, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.1260376, + "step": 14484, + "time_per_iteration": 2.6287729740142822 + }, + { + "auxiliary_loss_clip": 0.01111114, + "auxiliary_loss_mlp": 0.01030965, + "balance_loss_clip": 1.03988123, + "balance_loss_mlp": 1.01984882, + "epoch": 0.8708853148955359, + "flos": 18763634024640.0, + "grad_norm": 1.7610518034753797, + "language_loss": 0.67870152, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.70012236, + "num_input_tokens_seen": 312404875, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11114502, + "step": 14485, + "time_per_iteration": 2.653327703475952 + }, + { + "auxiliary_loss_clip": 0.01108472, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.03696918, + "balance_loss_mlp": 1.02041805, + "epoch": 0.8709454381482038, + "flos": 37279901289600.0, + "grad_norm": 2.583448314304666, + "language_loss": 0.62662941, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.64803493, + "num_input_tokens_seen": 312425280, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11669922, + "step": 14486, + "time_per_iteration": 2.7200164794921875 + }, + { + "auxiliary_loss_clip": 0.01111739, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.03678632, + "balance_loss_mlp": 1.02091122, + "epoch": 0.8710055614008718, + "flos": 27578450566080.0, + "grad_norm": 2.6842856722880017, + "language_loss": 0.61805487, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.63950628, + "num_input_tokens_seen": 312443835, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.125, + "step": 14487, + "time_per_iteration": 2.6125283241271973 + }, + { + "auxiliary_loss_clip": 0.01108134, + "auxiliary_loss_mlp": 0.01024473, + "balance_loss_clip": 1.03805757, + "balance_loss_mlp": 1.01430416, + "epoch": 0.8710656846535397, + "flos": 22504101570720.0, + "grad_norm": 1.8853221728361944, + "language_loss": 0.67765528, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.69898134, + "num_input_tokens_seen": 312460830, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10168457, + "step": 14488, + "time_per_iteration": 2.703378438949585 + }, + { + "auxiliary_loss_clip": 0.01112806, + "auxiliary_loss_mlp": 0.01029472, + "balance_loss_clip": 1.04004455, + "balance_loss_mlp": 1.01803946, + "epoch": 0.8711258079062077, + "flos": 20138147955360.0, + "grad_norm": 2.7068009167960727, + "language_loss": 0.85924447, + "learning_rate": 1.716335121648338e-07, + "loss": 0.88066721, + "num_input_tokens_seen": 312477575, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11431885, + "step": 14489, + "time_per_iteration": 2.609095335006714 + }, + { + "auxiliary_loss_clip": 0.01115885, + "auxiliary_loss_mlp": 0.01029668, + "balance_loss_clip": 1.0390209, + "balance_loss_mlp": 1.01777744, + "epoch": 0.8711859311588757, + "flos": 19112599071360.0, + "grad_norm": 2.3413595908411824, + "language_loss": 0.75726235, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.77871788, + "num_input_tokens_seen": 312492140, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11901855, + "step": 14490, + "time_per_iteration": 2.6163673400878906 + }, + { + "auxiliary_loss_clip": 0.01112812, + "auxiliary_loss_mlp": 0.01028998, + "balance_loss_clip": 1.03887653, + "balance_loss_mlp": 1.01720786, + "epoch": 0.8712460544115437, + "flos": 18983165412960.0, + "grad_norm": 2.2938466945032405, + "language_loss": 0.76210165, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.78351974, + "num_input_tokens_seen": 312508400, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11791992, + "step": 14491, + "time_per_iteration": 2.6967899799346924 + }, + { + "auxiliary_loss_clip": 0.01113419, + "auxiliary_loss_mlp": 0.01022341, + "balance_loss_clip": 1.04242992, + "balance_loss_mlp": 1.01144564, + "epoch": 0.8713061776642116, + "flos": 20454665011200.0, + "grad_norm": 2.6475617764715325, + "language_loss": 0.67039388, + "learning_rate": 1.711602764198723e-07, + "loss": 0.69175148, + "num_input_tokens_seen": 312525915, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10900879, + "step": 14492, + "time_per_iteration": 2.628288984298706 + }, + { + "auxiliary_loss_clip": 0.01107192, + "auxiliary_loss_mlp": 0.01026807, + "balance_loss_clip": 1.03787994, + "balance_loss_mlp": 1.01672769, + "epoch": 0.8713663009168796, + "flos": 29626631089920.0, + "grad_norm": 1.964658289648921, + "language_loss": 0.69647318, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.71781313, + "num_input_tokens_seen": 312544735, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.10076904, + "step": 14493, + "time_per_iteration": 2.618412971496582 + }, + { + "auxiliary_loss_clip": 0.01112645, + "auxiliary_loss_mlp": 0.01031164, + "balance_loss_clip": 1.04037356, + "balance_loss_mlp": 1.019279, + "epoch": 0.8714264241695475, + "flos": 29036579461920.0, + "grad_norm": 2.1655930903687515, + "language_loss": 0.89322168, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.91465974, + "num_input_tokens_seen": 312557910, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11883545, + "step": 14494, + "time_per_iteration": 2.708810567855835 + }, + { + "auxiliary_loss_clip": 0.01111036, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.03979361, + "balance_loss_mlp": 1.01806808, + "epoch": 0.8714865474222155, + "flos": 46366510160160.0, + "grad_norm": 1.7596527492543843, + "language_loss": 0.59234035, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.61373711, + "num_input_tokens_seen": 312580360, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10571289, + "step": 14495, + "time_per_iteration": 2.746034622192383 + }, + { + "auxiliary_loss_clip": 0.01111544, + "auxiliary_loss_mlp": 0.0103426, + "balance_loss_clip": 1.03849041, + "balance_loss_mlp": 1.02292371, + "epoch": 0.8715466706748836, + "flos": 27400605108480.0, + "grad_norm": 3.343043224239186, + "language_loss": 0.80514741, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.82660544, + "num_input_tokens_seen": 312597550, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11346436, + "step": 14496, + "time_per_iteration": 4.195731163024902 + }, + { + "auxiliary_loss_clip": 0.01112189, + "auxiliary_loss_mlp": 0.01036098, + "balance_loss_clip": 1.03858447, + "balance_loss_mlp": 1.02353895, + "epoch": 0.8716067939275515, + "flos": 25886771303040.0, + "grad_norm": 2.3194312821705654, + "language_loss": 0.79070449, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.81218731, + "num_input_tokens_seen": 312616435, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.12561035, + "step": 14497, + "time_per_iteration": 2.679985284805298 + }, + { + "auxiliary_loss_clip": 0.01111021, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.03779817, + "balance_loss_mlp": 1.0199616, + "epoch": 0.8716669171802195, + "flos": 28062521690400.0, + "grad_norm": 2.2346327879204835, + "language_loss": 0.67147756, + "learning_rate": 1.70215677535406e-07, + "loss": 0.69290578, + "num_input_tokens_seen": 312632770, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.1184082, + "step": 14498, + "time_per_iteration": 2.6249184608459473 + }, + { + "auxiliary_loss_clip": 0.01107757, + "auxiliary_loss_mlp": 0.01029371, + "balance_loss_clip": 1.03595901, + "balance_loss_mlp": 1.01825428, + "epoch": 0.8717270404328874, + "flos": 36343720824480.0, + "grad_norm": 1.6297827088148173, + "language_loss": 0.56923246, + "learning_rate": 1.700584872028108e-07, + "loss": 0.59060383, + "num_input_tokens_seen": 312651900, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11120605, + "step": 14499, + "time_per_iteration": 2.752863883972168 + }, + { + "auxiliary_loss_clip": 0.011113, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.03698754, + "balance_loss_mlp": 1.01979506, + "epoch": 0.8717871636855554, + "flos": 26866623045600.0, + "grad_norm": 2.538533517742103, + "language_loss": 0.80273342, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.82416075, + "num_input_tokens_seen": 312671380, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11639404, + "step": 14500, + "time_per_iteration": 2.6224377155303955 + }, + { + "auxiliary_loss_clip": 0.01110187, + "auxiliary_loss_mlp": 0.01027478, + "balance_loss_clip": 1.0389142, + "balance_loss_mlp": 1.01674283, + "epoch": 0.8718472869382233, + "flos": 20321260659360.0, + "grad_norm": 2.0444939634754786, + "language_loss": 0.72840178, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.74977845, + "num_input_tokens_seen": 312689215, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10742188, + "step": 14501, + "time_per_iteration": 2.6369400024414062 + }, + { + "auxiliary_loss_clip": 0.01114801, + "auxiliary_loss_mlp": 0.01030307, + "balance_loss_clip": 1.03986335, + "balance_loss_mlp": 1.0185883, + "epoch": 0.8719074101908914, + "flos": 23788598806080.0, + "grad_norm": 1.7097248872330126, + "language_loss": 0.64644951, + "learning_rate": 1.695873325782482e-07, + "loss": 0.66790056, + "num_input_tokens_seen": 312706400, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11724854, + "step": 14502, + "time_per_iteration": 2.6640992164611816 + }, + { + "auxiliary_loss_clip": 0.01112538, + "auxiliary_loss_mlp": 0.01030554, + "balance_loss_clip": 1.03857541, + "balance_loss_mlp": 1.01897335, + "epoch": 0.8719675334435593, + "flos": 40354278973920.0, + "grad_norm": 2.005157479029713, + "language_loss": 0.68556142, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.70699239, + "num_input_tokens_seen": 312727985, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11584473, + "step": 14503, + "time_per_iteration": 4.227763414382935 + }, + { + "auxiliary_loss_clip": 0.01109348, + "auxiliary_loss_mlp": 0.01028031, + "balance_loss_clip": 1.03722119, + "balance_loss_mlp": 1.01652145, + "epoch": 0.8720276566962273, + "flos": 16626247151040.0, + "grad_norm": 2.392296460087416, + "language_loss": 0.69537646, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.71675026, + "num_input_tokens_seen": 312745025, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.1151123, + "step": 14504, + "time_per_iteration": 2.6126022338867188 + }, + { + "auxiliary_loss_clip": 0.01111106, + "auxiliary_loss_mlp": 0.0102579, + "balance_loss_clip": 1.03819299, + "balance_loss_mlp": 1.01500773, + "epoch": 0.8720877799488952, + "flos": 28692192867840.0, + "grad_norm": 2.2571026218797634, + "language_loss": 0.70137799, + "learning_rate": 1.691168026385552e-07, + "loss": 0.72274697, + "num_input_tokens_seen": 312764170, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.10778809, + "step": 14505, + "time_per_iteration": 2.6455769538879395 + }, + { + "auxiliary_loss_clip": 0.01109116, + "auxiliary_loss_mlp": 0.01030883, + "balance_loss_clip": 1.03820765, + "balance_loss_mlp": 1.0204289, + "epoch": 0.8721479032015632, + "flos": 25397432932320.0, + "grad_norm": 1.9664064956234877, + "language_loss": 0.78117752, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.80257761, + "num_input_tokens_seen": 312783830, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10455322, + "step": 14506, + "time_per_iteration": 2.657729387283325 + }, + { + "auxiliary_loss_clip": 0.01110042, + "auxiliary_loss_mlp": 0.01028608, + "balance_loss_clip": 1.03643525, + "balance_loss_mlp": 1.01768827, + "epoch": 0.8722080264542311, + "flos": 23749505981280.0, + "grad_norm": 2.481793074633545, + "language_loss": 0.74523836, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.76662487, + "num_input_tokens_seen": 312802015, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.10919189, + "step": 14507, + "time_per_iteration": 4.093102693557739 + }, + { + "auxiliary_loss_clip": 0.0111445, + "auxiliary_loss_mlp": 0.01030026, + "balance_loss_clip": 1.03905606, + "balance_loss_mlp": 1.01789629, + "epoch": 0.8722681497068991, + "flos": 26553995648640.0, + "grad_norm": 2.2025374487382403, + "language_loss": 0.72454703, + "learning_rate": 1.686468975443156e-07, + "loss": 0.74599177, + "num_input_tokens_seen": 312820650, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12145996, + "step": 14508, + "time_per_iteration": 2.6019275188446045 + }, + { + "auxiliary_loss_clip": 0.01114576, + "auxiliary_loss_mlp": 0.01032093, + "balance_loss_clip": 1.03926516, + "balance_loss_mlp": 1.02008271, + "epoch": 0.8723282729595672, + "flos": 35236542322080.0, + "grad_norm": 1.7451072217043575, + "language_loss": 0.68768054, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.70914721, + "num_input_tokens_seen": 312841310, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.12030029, + "step": 14509, + "time_per_iteration": 2.719395160675049 + }, + { + "auxiliary_loss_clip": 0.01110969, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.03802919, + "balance_loss_mlp": 1.02180314, + "epoch": 0.8723883962122351, + "flos": 32298121578240.0, + "grad_norm": 2.177713910862882, + "language_loss": 0.58207703, + "learning_rate": 1.683339746970558e-07, + "loss": 0.60351658, + "num_input_tokens_seen": 312862100, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11181641, + "step": 14510, + "time_per_iteration": 2.6596508026123047 + }, + { + "auxiliary_loss_clip": 0.01116981, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.03935969, + "balance_loss_mlp": 1.01921654, + "epoch": 0.8724485194649031, + "flos": 25040364429600.0, + "grad_norm": 3.725468597841506, + "language_loss": 0.67176282, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.6932466, + "num_input_tokens_seen": 312880220, + "router_z_loss_clip": 0.77490234, + "router_z_loss_mlp": 0.12182617, + "step": 14511, + "time_per_iteration": 2.64333438873291 + }, + { + "auxiliary_loss_clip": 0.01113502, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.03838611, + "balance_loss_mlp": 1.02007508, + "epoch": 0.872508642717571, + "flos": 29716242612480.0, + "grad_norm": 2.158437451745855, + "language_loss": 0.81553322, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.83698517, + "num_input_tokens_seen": 312900765, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11621094, + "step": 14512, + "time_per_iteration": 2.6381704807281494 + }, + { + "auxiliary_loss_clip": 0.01028159, + "auxiliary_loss_mlp": 0.01001113, + "balance_loss_clip": 1.00572681, + "balance_loss_mlp": 1.00010931, + "epoch": 0.872568765970239, + "flos": 74933442649440.0, + "grad_norm": 0.7870136665182448, + "language_loss": 0.58593118, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.60622394, + "num_input_tokens_seen": 312955840, + "router_z_loss_clip": 0.22436523, + "router_z_loss_mlp": 0.01003265, + "step": 14513, + "time_per_iteration": 3.1242754459381104 + }, + { + "auxiliary_loss_clip": 0.01110594, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.03827524, + "balance_loss_mlp": 1.01802552, + "epoch": 0.8726288892229069, + "flos": 27574641941760.0, + "grad_norm": 2.2458459292216886, + "language_loss": 0.76643825, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.78783917, + "num_input_tokens_seen": 312973565, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11468506, + "step": 14514, + "time_per_iteration": 3.965463638305664 + }, + { + "auxiliary_loss_clip": 0.01114193, + "auxiliary_loss_mlp": 0.01027034, + "balance_loss_clip": 1.04019046, + "balance_loss_mlp": 1.01637101, + "epoch": 0.872689012475575, + "flos": 31585564746720.0, + "grad_norm": 2.047167239206119, + "language_loss": 0.65233743, + "learning_rate": 1.675528831794055e-07, + "loss": 0.67374963, + "num_input_tokens_seen": 312994660, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.10656738, + "step": 14515, + "time_per_iteration": 2.656186580657959 + }, + { + "auxiliary_loss_clip": 0.01113316, + "auxiliary_loss_mlp": 0.01033981, + "balance_loss_clip": 1.03964829, + "balance_loss_mlp": 1.02201867, + "epoch": 0.8727491357282429, + "flos": 26242624287360.0, + "grad_norm": 2.199711435302428, + "language_loss": 0.78859943, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.81007242, + "num_input_tokens_seen": 313009860, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11956787, + "step": 14516, + "time_per_iteration": 2.690558910369873 + }, + { + "auxiliary_loss_clip": 0.0111299, + "auxiliary_loss_mlp": 0.01027847, + "balance_loss_clip": 1.03847265, + "balance_loss_mlp": 1.0166415, + "epoch": 0.8728092589809109, + "flos": 23437000136160.0, + "grad_norm": 6.095372364157057, + "language_loss": 0.7228471, + "learning_rate": 1.672409329369453e-07, + "loss": 0.74425542, + "num_input_tokens_seen": 313027025, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11206055, + "step": 14517, + "time_per_iteration": 2.6100735664367676 + }, + { + "auxiliary_loss_clip": 0.01106094, + "auxiliary_loss_mlp": 0.01024079, + "balance_loss_clip": 1.0356673, + "balance_loss_mlp": 1.01362467, + "epoch": 0.8728693822335788, + "flos": 25130381124960.0, + "grad_norm": 2.566590905951388, + "language_loss": 0.72555172, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.74685347, + "num_input_tokens_seen": 313046830, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10455322, + "step": 14518, + "time_per_iteration": 2.670606851577759 + }, + { + "auxiliary_loss_clip": 0.01108435, + "auxiliary_loss_mlp": 0.01034759, + "balance_loss_clip": 1.03778493, + "balance_loss_mlp": 1.02382171, + "epoch": 0.8729295054862468, + "flos": 26513120063520.0, + "grad_norm": 1.4019068428497958, + "language_loss": 0.7410562, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.76248813, + "num_input_tokens_seen": 313067715, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10931396, + "step": 14519, + "time_per_iteration": 2.640702486038208 + }, + { + "auxiliary_loss_clip": 0.0111294, + "auxiliary_loss_mlp": 0.0102796, + "balance_loss_clip": 1.03805709, + "balance_loss_mlp": 1.01611042, + "epoch": 0.8729896287389147, + "flos": 21565165930560.0, + "grad_norm": 3.5605516390674694, + "language_loss": 0.76647466, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.78788364, + "num_input_tokens_seen": 313082305, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.1184082, + "step": 14520, + "time_per_iteration": 2.6079835891723633 + }, + { + "auxiliary_loss_clip": 0.01113113, + "auxiliary_loss_mlp": 0.010319, + "balance_loss_clip": 1.03920722, + "balance_loss_mlp": 1.01974082, + "epoch": 0.8730497519915827, + "flos": 29982848729760.0, + "grad_norm": 1.6314925703943788, + "language_loss": 0.81821764, + "learning_rate": 1.666178664801816e-07, + "loss": 0.83966774, + "num_input_tokens_seen": 313101190, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12164307, + "step": 14521, + "time_per_iteration": 2.6365790367126465 + }, + { + "auxiliary_loss_clip": 0.01114253, + "auxiliary_loss_mlp": 0.01034614, + "balance_loss_clip": 1.03973675, + "balance_loss_mlp": 1.02280593, + "epoch": 0.8731098752442508, + "flos": 16403514897600.0, + "grad_norm": 1.9060114151600356, + "language_loss": 0.76834059, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.78982925, + "num_input_tokens_seen": 313118965, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11816406, + "step": 14522, + "time_per_iteration": 2.6328928470611572 + }, + { + "auxiliary_loss_clip": 0.011072, + "auxiliary_loss_mlp": 0.01026854, + "balance_loss_clip": 1.03715873, + "balance_loss_mlp": 1.01634622, + "epoch": 0.8731699984969187, + "flos": 28644024172320.0, + "grad_norm": 1.8058086323946265, + "language_loss": 0.75685608, + "learning_rate": 1.66306750360385e-07, + "loss": 0.77819669, + "num_input_tokens_seen": 313139280, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10510254, + "step": 14523, + "time_per_iteration": 2.613830804824829 + }, + { + "auxiliary_loss_clip": 0.01108189, + "auxiliary_loss_mlp": 0.01027393, + "balance_loss_clip": 1.03753519, + "balance_loss_mlp": 1.01657438, + "epoch": 0.8732301217495867, + "flos": 21612767384160.0, + "grad_norm": 2.5300266365658346, + "language_loss": 0.78120637, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.80256212, + "num_input_tokens_seen": 313156655, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10821533, + "step": 14524, + "time_per_iteration": 2.688976287841797 + }, + { + "auxiliary_loss_clip": 0.01107066, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.03681874, + "balance_loss_mlp": 1.01711679, + "epoch": 0.8732902450022546, + "flos": 26910861564960.0, + "grad_norm": 1.9771916678032964, + "language_loss": 0.77283168, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.79417711, + "num_input_tokens_seen": 313174050, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.1036377, + "step": 14525, + "time_per_iteration": 2.665696859359741 + }, + { + "auxiliary_loss_clip": 0.01113553, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.03909326, + "balance_loss_mlp": 1.01997089, + "epoch": 0.8733503682549226, + "flos": 27177670268640.0, + "grad_norm": 1.7632466047333255, + "language_loss": 0.69291061, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.71435791, + "num_input_tokens_seen": 313192765, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11212158, + "step": 14526, + "time_per_iteration": 2.637625217437744 + }, + { + "auxiliary_loss_clip": 0.01115036, + "auxiliary_loss_mlp": 0.01035253, + "balance_loss_clip": 1.0392065, + "balance_loss_mlp": 1.02290905, + "epoch": 0.8734104915075905, + "flos": 28509728440320.0, + "grad_norm": 2.077119483050176, + "language_loss": 0.60940099, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.63090384, + "num_input_tokens_seen": 313210925, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12347412, + "step": 14527, + "time_per_iteration": 2.662273406982422 + }, + { + "auxiliary_loss_clip": 0.01118104, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.04088283, + "balance_loss_mlp": 1.02284765, + "epoch": 0.8734706147602586, + "flos": 21612362211360.0, + "grad_norm": 2.377240201495626, + "language_loss": 0.65619302, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.6777308, + "num_input_tokens_seen": 313228250, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.1282959, + "step": 14528, + "time_per_iteration": 2.6192591190338135 + }, + { + "auxiliary_loss_clip": 0.01109107, + "auxiliary_loss_mlp": 0.0102792, + "balance_loss_clip": 1.03859138, + "balance_loss_mlp": 1.01664233, + "epoch": 0.8735307380129265, + "flos": 26905229663040.0, + "grad_norm": 1.8234445742339283, + "language_loss": 0.89465344, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.91602367, + "num_input_tokens_seen": 313247880, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11273193, + "step": 14529, + "time_per_iteration": 2.7358527183532715 + }, + { + "auxiliary_loss_clip": 0.0110908, + "auxiliary_loss_mlp": 0.01026108, + "balance_loss_clip": 1.03828931, + "balance_loss_mlp": 1.01493227, + "epoch": 0.8735908612655945, + "flos": 30918299883840.0, + "grad_norm": 1.823815910079173, + "language_loss": 0.85012126, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.87147313, + "num_input_tokens_seen": 313266790, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11181641, + "step": 14530, + "time_per_iteration": 2.6704440116882324 + }, + { + "auxiliary_loss_clip": 0.01110933, + "auxiliary_loss_mlp": 0.01032227, + "balance_loss_clip": 1.0376457, + "balance_loss_mlp": 1.02143908, + "epoch": 0.8736509845182624, + "flos": 26284553321760.0, + "grad_norm": 2.3560474602598913, + "language_loss": 0.74460316, + "learning_rate": 1.650650677057128e-07, + "loss": 0.76603472, + "num_input_tokens_seen": 313286805, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10784912, + "step": 14531, + "time_per_iteration": 2.6544649600982666 + }, + { + "auxiliary_loss_clip": 0.01106286, + "auxiliary_loss_mlp": 0.01029558, + "balance_loss_clip": 1.03629851, + "balance_loss_mlp": 1.0187993, + "epoch": 0.8737111077709304, + "flos": 26865610113600.0, + "grad_norm": 2.0890661937989985, + "language_loss": 0.6159718, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.63733017, + "num_input_tokens_seen": 313305415, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10760498, + "step": 14532, + "time_per_iteration": 2.6243600845336914 + }, + { + "auxiliary_loss_clip": 0.01028024, + "auxiliary_loss_mlp": 0.01001347, + "balance_loss_clip": 1.00567532, + "balance_loss_mlp": 1.00034618, + "epoch": 0.8737712310235983, + "flos": 85495683880800.0, + "grad_norm": 0.8195608972808506, + "language_loss": 0.58656472, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60685843, + "num_input_tokens_seen": 313369940, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.01000214, + "step": 14533, + "time_per_iteration": 3.3546173572540283 + }, + { + "auxiliary_loss_clip": 0.01109468, + "auxiliary_loss_mlp": 0.01029318, + "balance_loss_clip": 1.03817177, + "balance_loss_mlp": 1.01850557, + "epoch": 0.8738313542762663, + "flos": 34969531032000.0, + "grad_norm": 2.1937026880067267, + "language_loss": 0.76729101, + "learning_rate": 1.646005846335954e-07, + "loss": 0.78867888, + "num_input_tokens_seen": 313390965, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.1081543, + "step": 14534, + "time_per_iteration": 2.696112871170044 + }, + { + "auxiliary_loss_clip": 0.01109868, + "auxiliary_loss_mlp": 0.01026516, + "balance_loss_clip": 1.03702629, + "balance_loss_mlp": 1.01520944, + "epoch": 0.8738914775289344, + "flos": 27267200756640.0, + "grad_norm": 1.656136639252654, + "language_loss": 0.74936014, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.770724, + "num_input_tokens_seen": 313409680, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11315918, + "step": 14535, + "time_per_iteration": 2.714250087738037 + }, + { + "auxiliary_loss_clip": 0.01109175, + "auxiliary_loss_mlp": 0.01029454, + "balance_loss_clip": 1.036443, + "balance_loss_mlp": 1.0179683, + "epoch": 0.8739516007816023, + "flos": 38753993993760.0, + "grad_norm": 1.8357498045121603, + "language_loss": 0.74576789, + "learning_rate": 1.64291277235048e-07, + "loss": 0.76715416, + "num_input_tokens_seen": 313431335, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11486816, + "step": 14536, + "time_per_iteration": 4.193888425827026 + }, + { + "auxiliary_loss_clip": 0.01108968, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.03604662, + "balance_loss_mlp": 1.02169108, + "epoch": 0.8740117240342703, + "flos": 25882922161440.0, + "grad_norm": 1.6737762576222288, + "language_loss": 0.63742518, + "learning_rate": 1.641367279482304e-07, + "loss": 0.65883851, + "num_input_tokens_seen": 313449225, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.10668945, + "step": 14537, + "time_per_iteration": 2.6273603439331055 + }, + { + "auxiliary_loss_clip": 0.01108311, + "auxiliary_loss_mlp": 0.01027376, + "balance_loss_clip": 1.03685057, + "balance_loss_mlp": 1.01567566, + "epoch": 0.8740718472869382, + "flos": 30733647523200.0, + "grad_norm": 1.7518803684234359, + "language_loss": 0.58386546, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.60522234, + "num_input_tokens_seen": 313467715, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11700439, + "step": 14538, + "time_per_iteration": 2.678434133529663 + }, + { + "auxiliary_loss_clip": 0.0110907, + "auxiliary_loss_mlp": 0.01027524, + "balance_loss_clip": 1.04021478, + "balance_loss_mlp": 1.0168016, + "epoch": 0.8741319705396062, + "flos": 23794919501760.0, + "grad_norm": 2.0034625063311666, + "language_loss": 0.68551195, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.70687783, + "num_input_tokens_seen": 313486805, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10723877, + "step": 14539, + "time_per_iteration": 2.6688084602355957 + }, + { + "auxiliary_loss_clip": 0.01111856, + "auxiliary_loss_mlp": 0.01031273, + "balance_loss_clip": 1.03639841, + "balance_loss_mlp": 1.01975131, + "epoch": 0.8741920937922741, + "flos": 17205399630720.0, + "grad_norm": 3.218350676890319, + "language_loss": 0.74432707, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.7657584, + "num_input_tokens_seen": 313504880, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.11517334, + "step": 14540, + "time_per_iteration": 2.6390602588653564 + }, + { + "auxiliary_loss_clip": 0.01109905, + "auxiliary_loss_mlp": 0.01031262, + "balance_loss_clip": 1.03730285, + "balance_loss_mlp": 1.02018797, + "epoch": 0.8742522170449422, + "flos": 33812117452800.0, + "grad_norm": 1.8213297882175534, + "language_loss": 0.79264855, + "learning_rate": 1.635192270207193e-07, + "loss": 0.81406021, + "num_input_tokens_seen": 313524995, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11071777, + "step": 14541, + "time_per_iteration": 2.730107069015503 + }, + { + "auxiliary_loss_clip": 0.0111649, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.04005039, + "balance_loss_mlp": 1.01616549, + "epoch": 0.8743123402976101, + "flos": 25798213229760.0, + "grad_norm": 2.386790090829348, + "language_loss": 0.66467905, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.68613368, + "num_input_tokens_seen": 313541740, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12805176, + "step": 14542, + "time_per_iteration": 2.6217055320739746 + }, + { + "auxiliary_loss_clip": 0.0102824, + "auxiliary_loss_mlp": 0.0100097, + "balance_loss_clip": 1.00592601, + "balance_loss_mlp": 1.00003099, + "epoch": 0.8743724635502781, + "flos": 74273835552480.0, + "grad_norm": 0.7859835218164868, + "language_loss": 0.54507387, + "learning_rate": 1.632108943707642e-07, + "loss": 0.56536591, + "num_input_tokens_seen": 313593445, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.00937653, + "step": 14543, + "time_per_iteration": 4.351099491119385 + }, + { + "auxiliary_loss_clip": 0.01113688, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.04002225, + "balance_loss_mlp": 1.02036774, + "epoch": 0.874432586802946, + "flos": 34299713580480.0, + "grad_norm": 2.017640901050321, + "language_loss": 0.69563562, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.7170915, + "num_input_tokens_seen": 313615640, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11535645, + "step": 14544, + "time_per_iteration": 2.6592910289764404 + }, + { + "auxiliary_loss_clip": 0.01105366, + "auxiliary_loss_mlp": 0.01025018, + "balance_loss_clip": 1.03691781, + "balance_loss_mlp": 1.01471782, + "epoch": 0.874492710055614, + "flos": 28736309835360.0, + "grad_norm": 2.1714846361380586, + "language_loss": 0.76017678, + "learning_rate": 1.62902840325714e-07, + "loss": 0.78148067, + "num_input_tokens_seen": 313635550, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.10302734, + "step": 14545, + "time_per_iteration": 2.6643428802490234 + }, + { + "auxiliary_loss_clip": 0.01110727, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.03719926, + "balance_loss_mlp": 1.0197351, + "epoch": 0.8745528333082819, + "flos": 49927025350080.0, + "grad_norm": 1.8308783537745237, + "language_loss": 0.6606285, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.68206704, + "num_input_tokens_seen": 313659275, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.13391113, + "step": 14546, + "time_per_iteration": 4.222371816635132 + }, + { + "auxiliary_loss_clip": 0.01109249, + "auxiliary_loss_mlp": 0.0102896, + "balance_loss_clip": 1.03771079, + "balance_loss_mlp": 1.01795149, + "epoch": 0.87461295656095, + "flos": 28824948943200.0, + "grad_norm": 1.6254787096247605, + "language_loss": 0.73028743, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.75166953, + "num_input_tokens_seen": 313680595, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11004639, + "step": 14547, + "time_per_iteration": 2.6629116535186768 + }, + { + "auxiliary_loss_clip": 0.01117387, + "auxiliary_loss_mlp": 0.01035478, + "balance_loss_clip": 1.03962088, + "balance_loss_mlp": 1.02309203, + "epoch": 0.874673079813618, + "flos": 47338663619520.0, + "grad_norm": 2.9783381812838496, + "language_loss": 0.69376546, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.71529412, + "num_input_tokens_seen": 313699730, + "router_z_loss_clip": 0.77685547, + "router_z_loss_mlp": 0.12384033, + "step": 14548, + "time_per_iteration": 2.7641379833221436 + }, + { + "auxiliary_loss_clip": 0.01113799, + "auxiliary_loss_mlp": 0.01031573, + "balance_loss_clip": 1.03855598, + "balance_loss_mlp": 1.01998615, + "epoch": 0.8747332030662859, + "flos": 28914317362080.0, + "grad_norm": 2.356970673087793, + "language_loss": 0.70412529, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.72557902, + "num_input_tokens_seen": 313720090, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11590576, + "step": 14549, + "time_per_iteration": 2.6732499599456787 + }, + { + "auxiliary_loss_clip": 0.01114865, + "auxiliary_loss_mlp": 0.01033064, + "balance_loss_clip": 1.03842688, + "balance_loss_mlp": 1.02008843, + "epoch": 0.8747933263189539, + "flos": 29849403860640.0, + "grad_norm": 3.3588777226055595, + "language_loss": 0.83311939, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.8545987, + "num_input_tokens_seen": 313736795, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12963867, + "step": 14550, + "time_per_iteration": 2.649764060974121 + }, + { + "auxiliary_loss_clip": 0.01113587, + "auxiliary_loss_mlp": 0.01037022, + "balance_loss_clip": 1.03849256, + "balance_loss_mlp": 1.02593529, + "epoch": 0.8748534495716218, + "flos": 16849384577280.0, + "grad_norm": 1.7189360430101757, + "language_loss": 0.71607018, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.73757625, + "num_input_tokens_seen": 313754820, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11083984, + "step": 14551, + "time_per_iteration": 2.8503565788269043 + }, + { + "auxiliary_loss_clip": 0.01108166, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.03776777, + "balance_loss_mlp": 1.01887119, + "epoch": 0.8749135728242898, + "flos": 36438478041600.0, + "grad_norm": 2.051545320926191, + "language_loss": 0.64364403, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.66502672, + "num_input_tokens_seen": 313775830, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.11236572, + "step": 14552, + "time_per_iteration": 2.7409794330596924 + }, + { + "auxiliary_loss_clip": 0.01113198, + "auxiliary_loss_mlp": 0.01025622, + "balance_loss_clip": 1.03797174, + "balance_loss_mlp": 1.01277721, + "epoch": 0.8749736960769577, + "flos": 29448015804000.0, + "grad_norm": 1.8778314286552222, + "language_loss": 0.79549021, + "learning_rate": 1.616734111284479e-07, + "loss": 0.81687844, + "num_input_tokens_seen": 313795745, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.1282959, + "step": 14553, + "time_per_iteration": 3.94911527633667 + }, + { + "auxiliary_loss_clip": 0.01109912, + "auxiliary_loss_mlp": 0.01030458, + "balance_loss_clip": 1.03551602, + "balance_loss_mlp": 1.01959765, + "epoch": 0.8750338193296258, + "flos": 20990227248000.0, + "grad_norm": 1.8747220558985116, + "language_loss": 0.70282102, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.72422469, + "num_input_tokens_seen": 313813895, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.10852051, + "step": 14554, + "time_per_iteration": 2.6941275596618652 + }, + { + "auxiliary_loss_clip": 0.01112234, + "auxiliary_loss_mlp": 0.01024711, + "balance_loss_clip": 1.03866243, + "balance_loss_mlp": 1.01373148, + "epoch": 0.8750939425822937, + "flos": 28959487778880.0, + "grad_norm": 1.4372709865011775, + "language_loss": 0.83667213, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.85804152, + "num_input_tokens_seen": 313834225, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.10986328, + "step": 14555, + "time_per_iteration": 2.748758316040039 + }, + { + "auxiliary_loss_clip": 0.0110919, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.03723919, + "balance_loss_mlp": 1.01937914, + "epoch": 0.8751540658349617, + "flos": 32386679651520.0, + "grad_norm": 1.5083717753543284, + "language_loss": 0.71015644, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.73155773, + "num_input_tokens_seen": 313854430, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11553955, + "step": 14556, + "time_per_iteration": 2.7255699634552 + }, + { + "auxiliary_loss_clip": 0.01113291, + "auxiliary_loss_mlp": 0.01032616, + "balance_loss_clip": 1.03822732, + "balance_loss_mlp": 1.01995611, + "epoch": 0.8752141890876296, + "flos": 23656085834400.0, + "grad_norm": 1.8118660578873729, + "language_loss": 0.76680213, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.78826118, + "num_input_tokens_seen": 313871600, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12664795, + "step": 14557, + "time_per_iteration": 2.6214563846588135 + }, + { + "auxiliary_loss_clip": 0.01113628, + "auxiliary_loss_mlp": 0.01036552, + "balance_loss_clip": 1.04021478, + "balance_loss_mlp": 1.02475071, + "epoch": 0.8752743123402976, + "flos": 30962254782240.0, + "grad_norm": 2.018736674795843, + "language_loss": 0.82917833, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.85068017, + "num_input_tokens_seen": 313891570, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11804199, + "step": 14558, + "time_per_iteration": 2.6882879734039307 + }, + { + "auxiliary_loss_clip": 0.0102834, + "auxiliary_loss_mlp": 0.01001177, + "balance_loss_clip": 1.00592875, + "balance_loss_mlp": 1.00014377, + "epoch": 0.8753344355929655, + "flos": 73154582900640.0, + "grad_norm": 0.804390401564344, + "language_loss": 0.56056964, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.58086485, + "num_input_tokens_seen": 313951290, + "router_z_loss_clip": 0.22424316, + "router_z_loss_mlp": 0.01033783, + "step": 14559, + "time_per_iteration": 3.2676873207092285 + }, + { + "auxiliary_loss_clip": 0.01110768, + "auxiliary_loss_mlp": 0.01028904, + "balance_loss_clip": 1.03899717, + "balance_loss_mlp": 1.01801443, + "epoch": 0.8753945588456336, + "flos": 21834891878400.0, + "grad_norm": 1.6589145958827365, + "language_loss": 0.65879977, + "learning_rate": 1.606013202286407e-07, + "loss": 0.68019646, + "num_input_tokens_seen": 313968645, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10882568, + "step": 14560, + "time_per_iteration": 2.6449925899505615 + }, + { + "auxiliary_loss_clip": 0.01107573, + "auxiliary_loss_mlp": 0.01026641, + "balance_loss_clip": 1.0369271, + "balance_loss_mlp": 1.01583445, + "epoch": 0.8754546820983016, + "flos": 37723664070720.0, + "grad_norm": 2.6919533320196836, + "language_loss": 0.78848577, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.80982792, + "num_input_tokens_seen": 313987580, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10803223, + "step": 14561, + "time_per_iteration": 2.7063233852386475 + }, + { + "auxiliary_loss_clip": 0.01111326, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.03683519, + "balance_loss_mlp": 1.01949513, + "epoch": 0.8755148053509695, + "flos": 25174092919680.0, + "grad_norm": 1.9815510739834281, + "language_loss": 0.77714074, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.79857093, + "num_input_tokens_seen": 314004460, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12188721, + "step": 14562, + "time_per_iteration": 2.6229240894317627 + }, + { + "auxiliary_loss_clip": 0.01104841, + "auxiliary_loss_mlp": 0.01028329, + "balance_loss_clip": 1.03657985, + "balance_loss_mlp": 1.01797569, + "epoch": 0.8755749286036375, + "flos": 42671253548160.0, + "grad_norm": 1.6113394619884605, + "language_loss": 0.71901053, + "learning_rate": 1.601428988367981e-07, + "loss": 0.74034214, + "num_input_tokens_seen": 314026855, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.10357666, + "step": 14563, + "time_per_iteration": 2.826469898223877 + }, + { + "auxiliary_loss_clip": 0.01115184, + "auxiliary_loss_mlp": 0.01031358, + "balance_loss_clip": 1.04060507, + "balance_loss_mlp": 1.01966918, + "epoch": 0.8756350518563054, + "flos": 22190987966400.0, + "grad_norm": 2.527536010084978, + "language_loss": 0.65173626, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.67320168, + "num_input_tokens_seen": 314042830, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11682129, + "step": 14564, + "time_per_iteration": 2.6250691413879395 + }, + { + "auxiliary_loss_clip": 0.01108917, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.03652811, + "balance_loss_mlp": 1.02508712, + "epoch": 0.8756951751089734, + "flos": 24506503918560.0, + "grad_norm": 3.5524828886860838, + "language_loss": 0.70677483, + "learning_rate": 1.598376334037408e-07, + "loss": 0.72822404, + "num_input_tokens_seen": 314062225, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10931396, + "step": 14565, + "time_per_iteration": 2.622710704803467 + }, + { + "auxiliary_loss_clip": 0.0111706, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.03975213, + "balance_loss_mlp": 1.02305746, + "epoch": 0.8757552983616413, + "flos": 33587035197120.0, + "grad_norm": 1.6706453818972973, + "language_loss": 0.77608109, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.79760617, + "num_input_tokens_seen": 314082325, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.12390137, + "step": 14566, + "time_per_iteration": 2.7063474655151367 + }, + { + "auxiliary_loss_clip": 0.01113336, + "auxiliary_loss_mlp": 0.01030369, + "balance_loss_clip": 1.04158258, + "balance_loss_mlp": 1.01921678, + "epoch": 0.8758154216143094, + "flos": 22052924127360.0, + "grad_norm": 1.6833076946534766, + "language_loss": 0.71133745, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.7327745, + "num_input_tokens_seen": 314100310, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11151123, + "step": 14567, + "time_per_iteration": 2.631314754486084 + }, + { + "auxiliary_loss_clip": 0.01109825, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.03839028, + "balance_loss_mlp": 1.02048445, + "epoch": 0.8758755448669773, + "flos": 30561150346560.0, + "grad_norm": 2.4534123477800747, + "language_loss": 0.74404383, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.76546061, + "num_input_tokens_seen": 314121330, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11358643, + "step": 14568, + "time_per_iteration": 2.654623508453369 + }, + { + "auxiliary_loss_clip": 0.01108956, + "auxiliary_loss_mlp": 0.01028246, + "balance_loss_clip": 1.03801823, + "balance_loss_mlp": 1.01750541, + "epoch": 0.8759356681196453, + "flos": 27890105548320.0, + "grad_norm": 2.1791250258246215, + "language_loss": 0.86409515, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.88546717, + "num_input_tokens_seen": 314139875, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10736084, + "step": 14569, + "time_per_iteration": 2.640918016433716 + }, + { + "auxiliary_loss_clip": 0.01110242, + "auxiliary_loss_mlp": 0.01029872, + "balance_loss_clip": 1.03638935, + "balance_loss_mlp": 1.01892829, + "epoch": 0.8759957913723132, + "flos": 25663958015040.0, + "grad_norm": 1.709659747400394, + "language_loss": 0.73547632, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.75687742, + "num_input_tokens_seen": 314157850, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.109375, + "step": 14570, + "time_per_iteration": 2.599036931991577 + }, + { + "auxiliary_loss_clip": 0.01112946, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.03774476, + "balance_loss_mlp": 1.01695776, + "epoch": 0.8760559146249812, + "flos": 24416811361440.0, + "grad_norm": 2.344586312837684, + "language_loss": 0.68024671, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.70166242, + "num_input_tokens_seen": 314176720, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11669922, + "step": 14571, + "time_per_iteration": 2.6523630619049072 + }, + { + "auxiliary_loss_clip": 0.0110793, + "auxiliary_loss_mlp": 0.0102575, + "balance_loss_clip": 1.03693008, + "balance_loss_mlp": 1.01511657, + "epoch": 0.8761160378776491, + "flos": 24372491807520.0, + "grad_norm": 2.032197796489907, + "language_loss": 0.62647867, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.64781547, + "num_input_tokens_seen": 314196645, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10632324, + "step": 14572, + "time_per_iteration": 2.648066282272339 + }, + { + "auxiliary_loss_clip": 0.01107766, + "auxiliary_loss_mlp": 0.01027611, + "balance_loss_clip": 1.03790843, + "balance_loss_mlp": 1.01722801, + "epoch": 0.8761761611303172, + "flos": 35147781662400.0, + "grad_norm": 1.6400598472580101, + "language_loss": 0.73532915, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.75668287, + "num_input_tokens_seen": 314217430, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.1038208, + "step": 14573, + "time_per_iteration": 2.683213472366333 + }, + { + "auxiliary_loss_clip": 0.01105925, + "auxiliary_loss_mlp": 0.01030282, + "balance_loss_clip": 1.03683138, + "balance_loss_mlp": 1.02004838, + "epoch": 0.8762362843829851, + "flos": 22369198079520.0, + "grad_norm": 2.0336757483254493, + "language_loss": 0.72517169, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.74653369, + "num_input_tokens_seen": 314235310, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.10235596, + "step": 14574, + "time_per_iteration": 2.584467887878418 + }, + { + "auxiliary_loss_clip": 0.01108782, + "auxiliary_loss_mlp": 0.01031236, + "balance_loss_clip": 1.03702605, + "balance_loss_mlp": 1.01988745, + "epoch": 0.8762964076356531, + "flos": 19253337050880.0, + "grad_norm": 1.723294709996541, + "language_loss": 0.75699866, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.77839887, + "num_input_tokens_seen": 314252355, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11340332, + "step": 14575, + "time_per_iteration": 4.11812686920166 + }, + { + "auxiliary_loss_clip": 0.0111186, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.04046559, + "balance_loss_mlp": 1.02356863, + "epoch": 0.8763565308883211, + "flos": 40483793666880.0, + "grad_norm": 2.353447250580058, + "language_loss": 0.66598439, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.68744206, + "num_input_tokens_seen": 314272755, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10333252, + "step": 14576, + "time_per_iteration": 2.7421648502349854 + }, + { + "auxiliary_loss_clip": 0.01106131, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.03537381, + "balance_loss_mlp": 1.01755357, + "epoch": 0.876416654140989, + "flos": 19386336229920.0, + "grad_norm": 1.9865534707917814, + "language_loss": 0.66650128, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.68784404, + "num_input_tokens_seen": 314291365, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10601807, + "step": 14577, + "time_per_iteration": 2.5900027751922607 + }, + { + "auxiliary_loss_clip": 0.01112577, + "auxiliary_loss_mlp": 0.01028383, + "balance_loss_clip": 1.03973258, + "balance_loss_mlp": 1.01680827, + "epoch": 0.876476777393657, + "flos": 31585605264000.0, + "grad_norm": 3.264235187794755, + "language_loss": 0.71022522, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.73163486, + "num_input_tokens_seen": 314310075, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11584473, + "step": 14578, + "time_per_iteration": 2.6814794540405273 + }, + { + "auxiliary_loss_clip": 0.01112272, + "auxiliary_loss_mlp": 0.0103181, + "balance_loss_clip": 1.03725183, + "balance_loss_mlp": 1.02036572, + "epoch": 0.876536900646325, + "flos": 16581603458880.0, + "grad_norm": 1.938397525113219, + "language_loss": 0.7160508, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.73749161, + "num_input_tokens_seen": 314325695, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11444092, + "step": 14579, + "time_per_iteration": 2.564708709716797 + }, + { + "auxiliary_loss_clip": 0.01106748, + "auxiliary_loss_mlp": 0.01030412, + "balance_loss_clip": 1.03797269, + "balance_loss_mlp": 1.01962399, + "epoch": 0.876597023898993, + "flos": 14889883678560.0, + "grad_norm": 1.9730839650854493, + "language_loss": 0.69902968, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.72040129, + "num_input_tokens_seen": 314343605, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.10797119, + "step": 14580, + "time_per_iteration": 2.6116175651550293 + }, + { + "auxiliary_loss_clip": 0.0110889, + "auxiliary_loss_mlp": 0.01027575, + "balance_loss_clip": 1.03890014, + "balance_loss_mlp": 1.01702487, + "epoch": 0.8766571471516609, + "flos": 30916436088960.0, + "grad_norm": 3.279050852063623, + "language_loss": 0.65655899, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.67792368, + "num_input_tokens_seen": 314364275, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10559082, + "step": 14581, + "time_per_iteration": 2.746046304702759 + }, + { + "auxiliary_loss_clip": 0.01106332, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.03716135, + "balance_loss_mlp": 1.01950812, + "epoch": 0.8767172704043289, + "flos": 36745392502080.0, + "grad_norm": 1.4990442605122165, + "language_loss": 0.73620409, + "learning_rate": 1.572541512164416e-07, + "loss": 0.75756824, + "num_input_tokens_seen": 314385140, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.10583496, + "step": 14582, + "time_per_iteration": 4.172935724258423 + }, + { + "auxiliary_loss_clip": 0.01107798, + "auxiliary_loss_mlp": 0.01032157, + "balance_loss_clip": 1.03565955, + "balance_loss_mlp": 1.02073073, + "epoch": 0.8767773936569968, + "flos": 23526814245120.0, + "grad_norm": 2.0479039387861078, + "language_loss": 0.66995829, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.69135785, + "num_input_tokens_seen": 314403715, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11419678, + "step": 14583, + "time_per_iteration": 2.646512269973755 + }, + { + "auxiliary_loss_clip": 0.01110811, + "auxiliary_loss_mlp": 0.01027075, + "balance_loss_clip": 1.03694177, + "balance_loss_mlp": 1.01591134, + "epoch": 0.8768375169096648, + "flos": 25926350335200.0, + "grad_norm": 2.5131608802201675, + "language_loss": 0.79144198, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.81282091, + "num_input_tokens_seen": 314421880, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11157227, + "step": 14584, + "time_per_iteration": 2.588324546813965 + }, + { + "auxiliary_loss_clip": 0.01111283, + "auxiliary_loss_mlp": 0.01025259, + "balance_loss_clip": 1.03747296, + "balance_loss_mlp": 1.01417828, + "epoch": 0.8768976401623327, + "flos": 28425384164160.0, + "grad_norm": 1.5780006919614993, + "language_loss": 0.7237466, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.745112, + "num_input_tokens_seen": 314441585, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11077881, + "step": 14585, + "time_per_iteration": 2.671543598175049 + }, + { + "auxiliary_loss_clip": 0.01111049, + "auxiliary_loss_mlp": 0.01028241, + "balance_loss_clip": 1.03874648, + "balance_loss_mlp": 1.01620054, + "epoch": 0.8769577634150008, + "flos": 26065062450720.0, + "grad_norm": 2.4537756525790475, + "language_loss": 0.74417269, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.76556563, + "num_input_tokens_seen": 314459020, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.12042236, + "step": 14586, + "time_per_iteration": 4.051861524581909 + }, + { + "auxiliary_loss_clip": 0.01106694, + "auxiliary_loss_mlp": 0.01027917, + "balance_loss_clip": 1.03570247, + "balance_loss_mlp": 1.01657462, + "epoch": 0.8770178866676687, + "flos": 28692071316000.0, + "grad_norm": 1.768697670460109, + "language_loss": 0.7851004, + "learning_rate": 1.564981454895844e-07, + "loss": 0.80644655, + "num_input_tokens_seen": 314478935, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11352539, + "step": 14587, + "time_per_iteration": 2.670022964477539 + }, + { + "auxiliary_loss_clip": 0.01110224, + "auxiliary_loss_mlp": 0.01026329, + "balance_loss_clip": 1.03809941, + "balance_loss_mlp": 1.01349044, + "epoch": 0.8770780099203367, + "flos": 24061160963520.0, + "grad_norm": 2.2978207350706823, + "language_loss": 0.73948383, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.76084936, + "num_input_tokens_seen": 314497635, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.1282959, + "step": 14588, + "time_per_iteration": 2.644021987915039 + }, + { + "auxiliary_loss_clip": 0.01107643, + "auxiliary_loss_mlp": 0.01032822, + "balance_loss_clip": 1.03628385, + "balance_loss_mlp": 1.02210486, + "epoch": 0.8771381331730047, + "flos": 26107518209760.0, + "grad_norm": 3.7416072910315465, + "language_loss": 0.6642046, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.68560922, + "num_input_tokens_seen": 314515445, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.1071167, + "step": 14589, + "time_per_iteration": 2.588343381881714 + }, + { + "auxiliary_loss_clip": 0.01112482, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.03971267, + "balance_loss_mlp": 1.02012563, + "epoch": 0.8771982564256726, + "flos": 24723847373760.0, + "grad_norm": 2.3509253422547136, + "language_loss": 0.7059288, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.72736734, + "num_input_tokens_seen": 314533040, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11248779, + "step": 14590, + "time_per_iteration": 2.6548728942871094 + }, + { + "auxiliary_loss_clip": 0.01114628, + "auxiliary_loss_mlp": 0.01037968, + "balance_loss_clip": 1.03752148, + "balance_loss_mlp": 1.02583218, + "epoch": 0.8772583796783406, + "flos": 15239456484480.0, + "grad_norm": 2.0299931066002697, + "language_loss": 0.74232757, + "learning_rate": 1.558945991776086e-07, + "loss": 0.76385355, + "num_input_tokens_seen": 314548280, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.12139893, + "step": 14591, + "time_per_iteration": 2.536196231842041 + }, + { + "auxiliary_loss_clip": 0.01105511, + "auxiliary_loss_mlp": 0.01024605, + "balance_loss_clip": 1.03776741, + "balance_loss_mlp": 1.01401281, + "epoch": 0.8773185029310085, + "flos": 19426928194080.0, + "grad_norm": 1.7611051737921488, + "language_loss": 0.80065894, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.82196009, + "num_input_tokens_seen": 314565345, + "router_z_loss_clip": 0.67773438, + "router_z_loss_mlp": 0.10595703, + "step": 14592, + "time_per_iteration": 2.6341850757598877 + }, + { + "auxiliary_loss_clip": 0.01106478, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.03705168, + "balance_loss_mlp": 1.01938117, + "epoch": 0.8773786261836766, + "flos": 26240152733280.0, + "grad_norm": 1.585107896443009, + "language_loss": 0.82942104, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.8507812, + "num_input_tokens_seen": 314584190, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10168457, + "step": 14593, + "time_per_iteration": 4.0047767162323 + }, + { + "auxiliary_loss_clip": 0.01110126, + "auxiliary_loss_mlp": 0.01025879, + "balance_loss_clip": 1.03886664, + "balance_loss_mlp": 1.01555526, + "epoch": 0.8774387494363445, + "flos": 32654825425440.0, + "grad_norm": 1.4228902439838347, + "language_loss": 0.76112485, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.78248489, + "num_input_tokens_seen": 314605625, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10333252, + "step": 14594, + "time_per_iteration": 2.7783186435699463 + }, + { + "auxiliary_loss_clip": 0.01109874, + "auxiliary_loss_mlp": 0.01032602, + "balance_loss_clip": 1.03621149, + "balance_loss_mlp": 1.02111578, + "epoch": 0.8774988726890125, + "flos": 22547732330880.0, + "grad_norm": 2.3183957947965554, + "language_loss": 0.77517456, + "learning_rate": 1.552921717241651e-07, + "loss": 0.79659933, + "num_input_tokens_seen": 314622630, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11486816, + "step": 14595, + "time_per_iteration": 2.59792423248291 + }, + { + "auxiliary_loss_clip": 0.01110759, + "auxiliary_loss_mlp": 0.0103202, + "balance_loss_clip": 1.0387305, + "balance_loss_mlp": 1.02053428, + "epoch": 0.8775589959416804, + "flos": 29805327410400.0, + "grad_norm": 1.3671719785623309, + "language_loss": 0.70561481, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.72704262, + "num_input_tokens_seen": 314642460, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11486816, + "step": 14596, + "time_per_iteration": 2.6558587551116943 + }, + { + "auxiliary_loss_clip": 0.01110653, + "auxiliary_loss_mlp": 0.01023954, + "balance_loss_clip": 1.03895831, + "balance_loss_mlp": 1.0134635, + "epoch": 0.8776191191943484, + "flos": 28601973586080.0, + "grad_norm": 1.8519298330705283, + "language_loss": 0.85693431, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.8782804, + "num_input_tokens_seen": 314659875, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.1048584, + "step": 14597, + "time_per_iteration": 2.6958141326904297 + }, + { + "auxiliary_loss_clip": 0.01111124, + "auxiliary_loss_mlp": 0.01028922, + "balance_loss_clip": 1.03964162, + "balance_loss_mlp": 1.01841378, + "epoch": 0.8776792424470163, + "flos": 32739817978080.0, + "grad_norm": 1.8074627985684268, + "language_loss": 0.73010242, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.75150287, + "num_input_tokens_seen": 314680260, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10516357, + "step": 14598, + "time_per_iteration": 2.649768352508545 + }, + { + "auxiliary_loss_clip": 0.01112114, + "auxiliary_loss_mlp": 0.01029753, + "balance_loss_clip": 1.0398016, + "balance_loss_mlp": 1.01855922, + "epoch": 0.8777393656996844, + "flos": 19067307102720.0, + "grad_norm": 2.3130458081245315, + "language_loss": 0.77380282, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.79522145, + "num_input_tokens_seen": 314696260, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11199951, + "step": 14599, + "time_per_iteration": 2.646021842956543 + }, + { + "auxiliary_loss_clip": 0.01110002, + "auxiliary_loss_mlp": 0.01029284, + "balance_loss_clip": 1.03759551, + "balance_loss_mlp": 1.01823378, + "epoch": 0.8777994889523523, + "flos": 23037273288000.0, + "grad_norm": 2.1882364728604244, + "language_loss": 0.67842233, + "learning_rate": 1.545407113589332e-07, + "loss": 0.69981515, + "num_input_tokens_seen": 314714215, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11053467, + "step": 14600, + "time_per_iteration": 2.6190879344940186 + }, + { + "auxiliary_loss_clip": 0.01110166, + "auxiliary_loss_mlp": 0.0103388, + "balance_loss_clip": 1.03659022, + "balance_loss_mlp": 1.02299035, + "epoch": 0.8778596122050203, + "flos": 59581077206400.0, + "grad_norm": 2.1163638944626926, + "language_loss": 0.69360077, + "learning_rate": 1.543906292031072e-07, + "loss": 0.71504116, + "num_input_tokens_seen": 314735700, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.10894775, + "step": 14601, + "time_per_iteration": 2.888864517211914 + }, + { + "auxiliary_loss_clip": 0.01112718, + "auxiliary_loss_mlp": 0.0102955, + "balance_loss_clip": 1.03781414, + "balance_loss_mlp": 1.01838613, + "epoch": 0.8779197354576883, + "flos": 31311543967200.0, + "grad_norm": 2.800342597420025, + "language_loss": 0.7316941, + "learning_rate": 1.542406170329733e-07, + "loss": 0.75311679, + "num_input_tokens_seen": 314753335, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11169434, + "step": 14602, + "time_per_iteration": 2.682905912399292 + }, + { + "auxiliary_loss_clip": 0.01107241, + "auxiliary_loss_mlp": 0.01028577, + "balance_loss_clip": 1.03643775, + "balance_loss_mlp": 1.01799154, + "epoch": 0.8779798587103562, + "flos": 22992427009440.0, + "grad_norm": 2.8088702355475226, + "language_loss": 0.70887345, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.73023164, + "num_input_tokens_seen": 314770800, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10583496, + "step": 14603, + "time_per_iteration": 2.650726318359375 + }, + { + "auxiliary_loss_clip": 0.01028558, + "auxiliary_loss_mlp": 0.01002209, + "balance_loss_clip": 1.00611281, + "balance_loss_mlp": 1.00119853, + "epoch": 0.8780399819630242, + "flos": 83721807757440.0, + "grad_norm": 0.736645149681602, + "language_loss": 0.54155415, + "learning_rate": 1.539408026725344e-07, + "loss": 0.56186187, + "num_input_tokens_seen": 314837275, + "router_z_loss_clip": 0.2244873, + "router_z_loss_mlp": 0.01010132, + "step": 14604, + "time_per_iteration": 3.309882402420044 + }, + { + "auxiliary_loss_clip": 0.01028439, + "auxiliary_loss_mlp": 0.01002805, + "balance_loss_clip": 1.00606298, + "balance_loss_mlp": 1.00181198, + "epoch": 0.8781001052156922, + "flos": 80209096607520.0, + "grad_norm": 0.7895202210236805, + "language_loss": 0.59217972, + "learning_rate": 1.537910004935976e-07, + "loss": 0.6124922, + "num_input_tokens_seen": 314902220, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.00992584, + "step": 14605, + "time_per_iteration": 3.255059242248535 + }, + { + "auxiliary_loss_clip": 0.01110911, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.03781605, + "balance_loss_mlp": 1.02144504, + "epoch": 0.8781602284683602, + "flos": 26905553801280.0, + "grad_norm": 1.6112142707497497, + "language_loss": 0.85149086, + "learning_rate": 1.536412683230912e-07, + "loss": 0.87292671, + "num_input_tokens_seen": 314921645, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11236572, + "step": 14606, + "time_per_iteration": 2.6728782653808594 + }, + { + "auxiliary_loss_clip": 0.01112914, + "auxiliary_loss_mlp": 0.01029694, + "balance_loss_clip": 1.03936076, + "balance_loss_mlp": 1.01799917, + "epoch": 0.8782203517210281, + "flos": 21430424508480.0, + "grad_norm": 1.8994925763301616, + "language_loss": 0.70229411, + "learning_rate": 1.534916061666931e-07, + "loss": 0.72372019, + "num_input_tokens_seen": 314939390, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11700439, + "step": 14607, + "time_per_iteration": 2.668900728225708 + }, + { + "auxiliary_loss_clip": 0.01108273, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.03713608, + "balance_loss_mlp": 1.02626276, + "epoch": 0.8782804749736961, + "flos": 31140424378080.0, + "grad_norm": 1.8667590004205714, + "language_loss": 0.71537352, + "learning_rate": 1.533420140300785e-07, + "loss": 0.73682308, + "num_input_tokens_seen": 314959205, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10418701, + "step": 14608, + "time_per_iteration": 2.7172391414642334 + }, + { + "auxiliary_loss_clip": 0.01114427, + "auxiliary_loss_mlp": 0.01030755, + "balance_loss_clip": 1.03882599, + "balance_loss_mlp": 1.01979971, + "epoch": 0.878340598226364, + "flos": 26598963479040.0, + "grad_norm": 2.483704483290778, + "language_loss": 0.87760568, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.89905751, + "num_input_tokens_seen": 314977485, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.10961914, + "step": 14609, + "time_per_iteration": 2.638376235961914 + }, + { + "auxiliary_loss_clip": 0.01113565, + "auxiliary_loss_mlp": 0.01027296, + "balance_loss_clip": 1.04167295, + "balance_loss_mlp": 1.01622093, + "epoch": 0.878400721479032, + "flos": 25749517809600.0, + "grad_norm": 1.583608645855205, + "language_loss": 0.7044692, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.72587776, + "num_input_tokens_seen": 314997830, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11083984, + "step": 14610, + "time_per_iteration": 2.70456862449646 + }, + { + "auxiliary_loss_clip": 0.01108729, + "auxiliary_loss_mlp": 0.01028581, + "balance_loss_clip": 1.03864682, + "balance_loss_mlp": 1.01787627, + "epoch": 0.8784608447316999, + "flos": 25530918318720.0, + "grad_norm": 1.965122912275618, + "language_loss": 0.8048563, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.82622945, + "num_input_tokens_seen": 315016480, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10705566, + "step": 14611, + "time_per_iteration": 2.631319284439087 + }, + { + "auxiliary_loss_clip": 0.01110985, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.03798652, + "balance_loss_mlp": 1.02067399, + "epoch": 0.878520967984368, + "flos": 28468366647840.0, + "grad_norm": 1.677175957500176, + "language_loss": 0.76201493, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.78344178, + "num_input_tokens_seen": 315036135, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11035156, + "step": 14612, + "time_per_iteration": 2.6490213871002197 + }, + { + "auxiliary_loss_clip": 0.01109225, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.03792381, + "balance_loss_mlp": 1.02145886, + "epoch": 0.8785810912370359, + "flos": 31138803686880.0, + "grad_norm": 1.7467249179624333, + "language_loss": 0.7222656, + "learning_rate": 1.525951038422002e-07, + "loss": 0.74368536, + "num_input_tokens_seen": 315057995, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11303711, + "step": 14613, + "time_per_iteration": 2.657949924468994 + }, + { + "auxiliary_loss_clip": 0.01028255, + "auxiliary_loss_mlp": 0.01002163, + "balance_loss_clip": 1.00588083, + "balance_loss_mlp": 1.00117064, + "epoch": 0.8786412144897039, + "flos": 75460050567360.0, + "grad_norm": 1.024069065614661, + "language_loss": 0.64513397, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.66543818, + "num_input_tokens_seen": 315104010, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00991821, + "step": 14614, + "time_per_iteration": 4.511784791946411 + }, + { + "auxiliary_loss_clip": 0.0102838, + "auxiliary_loss_mlp": 0.01003356, + "balance_loss_clip": 1.00598907, + "balance_loss_mlp": 1.00235748, + "epoch": 0.8787013377423719, + "flos": 86621054641920.0, + "grad_norm": 0.6665744943553781, + "language_loss": 0.5854404, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.60575783, + "num_input_tokens_seen": 315174550, + "router_z_loss_clip": 0.22399902, + "router_z_loss_mlp": 0.00998688, + "step": 14615, + "time_per_iteration": 3.3315811157226562 + }, + { + "auxiliary_loss_clip": 0.01108653, + "auxiliary_loss_mlp": 0.01026744, + "balance_loss_clip": 1.03592575, + "balance_loss_mlp": 1.015962, + "epoch": 0.8787614609950398, + "flos": 21167100290880.0, + "grad_norm": 11.311655891211938, + "language_loss": 0.72557259, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.74692655, + "num_input_tokens_seen": 315191825, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10791016, + "step": 14616, + "time_per_iteration": 2.5950374603271484 + }, + { + "auxiliary_loss_clip": 0.01028491, + "auxiliary_loss_mlp": 0.01003621, + "balance_loss_clip": 1.00609779, + "balance_loss_mlp": 1.00264096, + "epoch": 0.8788215842477078, + "flos": 88479315558720.0, + "grad_norm": 0.8073326282885265, + "language_loss": 0.57925224, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.59957337, + "num_input_tokens_seen": 315255075, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.00978851, + "step": 14617, + "time_per_iteration": 3.3846304416656494 + }, + { + "auxiliary_loss_clip": 0.01106877, + "auxiliary_loss_mlp": 0.01026641, + "balance_loss_clip": 1.03690743, + "balance_loss_mlp": 1.01590061, + "epoch": 0.8788817075003758, + "flos": 30294787332960.0, + "grad_norm": 1.7862606183842207, + "language_loss": 0.83725649, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.85859168, + "num_input_tokens_seen": 315273995, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10736084, + "step": 14618, + "time_per_iteration": 2.7009661197662354 + }, + { + "auxiliary_loss_clip": 0.01107391, + "auxiliary_loss_mlp": 0.01026721, + "balance_loss_clip": 1.03926706, + "balance_loss_mlp": 1.01640296, + "epoch": 0.8789418307530438, + "flos": 27623945121120.0, + "grad_norm": 2.726891941880594, + "language_loss": 0.69524771, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.71658885, + "num_input_tokens_seen": 315294485, + "router_z_loss_clip": 0.68164062, + "router_z_loss_mlp": 0.10314941, + "step": 14619, + "time_per_iteration": 2.650038957595825 + }, + { + "auxiliary_loss_clip": 0.01112167, + "auxiliary_loss_mlp": 0.01038549, + "balance_loss_clip": 1.03749502, + "balance_loss_mlp": 1.02733159, + "epoch": 0.8790019540057117, + "flos": 24143114720160.0, + "grad_norm": 2.5820264156334667, + "language_loss": 0.77440214, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.79590935, + "num_input_tokens_seen": 315310420, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11224365, + "step": 14620, + "time_per_iteration": 2.696953296661377 + }, + { + "auxiliary_loss_clip": 0.01111649, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.03904414, + "balance_loss_mlp": 1.01898909, + "epoch": 0.8790620772583797, + "flos": 24684389893440.0, + "grad_norm": 1.7716028097048384, + "language_loss": 0.79017878, + "learning_rate": 1.514036906317542e-07, + "loss": 0.81160289, + "num_input_tokens_seen": 315330110, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11773682, + "step": 14621, + "time_per_iteration": 2.6179213523864746 + }, + { + "auxiliary_loss_clip": 0.01112112, + "auxiliary_loss_mlp": 0.01030963, + "balance_loss_clip": 1.03731918, + "balance_loss_mlp": 1.01959682, + "epoch": 0.8791222005110476, + "flos": 29444977008000.0, + "grad_norm": 1.7507891632524077, + "language_loss": 0.66627777, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.6877085, + "num_input_tokens_seen": 315350080, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11364746, + "step": 14622, + "time_per_iteration": 4.135830879211426 + }, + { + "auxiliary_loss_clip": 0.01110558, + "auxiliary_loss_mlp": 0.01033079, + "balance_loss_clip": 1.0393635, + "balance_loss_mlp": 1.02193308, + "epoch": 0.8791823237637156, + "flos": 26373962257920.0, + "grad_norm": 1.9748174310792352, + "language_loss": 0.72951841, + "learning_rate": 1.511065382058687e-07, + "loss": 0.75095475, + "num_input_tokens_seen": 315366360, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11138916, + "step": 14623, + "time_per_iteration": 2.68473744392395 + }, + { + "auxiliary_loss_clip": 0.01103964, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.03302622, + "balance_loss_mlp": 1.01861572, + "epoch": 0.8792424470163835, + "flos": 29582108949600.0, + "grad_norm": 1.648500410562709, + "language_loss": 0.78239095, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.80372733, + "num_input_tokens_seen": 315385890, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.1105957, + "step": 14624, + "time_per_iteration": 2.62528395652771 + }, + { + "auxiliary_loss_clip": 0.01110878, + "auxiliary_loss_mlp": 0.01031291, + "balance_loss_clip": 1.03796697, + "balance_loss_mlp": 1.01950097, + "epoch": 0.8793025702690516, + "flos": 30375687640320.0, + "grad_norm": 1.9048494543160757, + "language_loss": 0.7957601, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.81718183, + "num_input_tokens_seen": 315403400, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11798096, + "step": 14625, + "time_per_iteration": 4.166045904159546 + }, + { + "auxiliary_loss_clip": 0.01109063, + "auxiliary_loss_mlp": 0.01035216, + "balance_loss_clip": 1.03903675, + "balance_loss_mlp": 1.02427888, + "epoch": 0.8793626935217195, + "flos": 30960796160160.0, + "grad_norm": 1.5760137626772095, + "language_loss": 0.74271333, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.7641561, + "num_input_tokens_seen": 315423670, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10943604, + "step": 14626, + "time_per_iteration": 2.699681520462036 + }, + { + "auxiliary_loss_clip": 0.01109859, + "auxiliary_loss_mlp": 0.01033848, + "balance_loss_clip": 1.03533316, + "balance_loss_mlp": 1.02259457, + "epoch": 0.8794228167743875, + "flos": 42315238494720.0, + "grad_norm": 1.7186088203970922, + "language_loss": 0.71230114, + "learning_rate": 1.505130747218246e-07, + "loss": 0.73373818, + "num_input_tokens_seen": 315446265, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11260986, + "step": 14627, + "time_per_iteration": 2.741269111633301 + }, + { + "auxiliary_loss_clip": 0.01109316, + "auxiliary_loss_mlp": 0.01026724, + "balance_loss_clip": 1.03662789, + "balance_loss_mlp": 1.01539373, + "epoch": 0.8794829400270555, + "flos": 23749911154080.0, + "grad_norm": 8.762011079848278, + "language_loss": 0.72489619, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.74625659, + "num_input_tokens_seen": 315464655, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11334229, + "step": 14628, + "time_per_iteration": 2.7218477725982666 + }, + { + "auxiliary_loss_clip": 0.01110526, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.03792071, + "balance_loss_mlp": 1.01941705, + "epoch": 0.8795430632797234, + "flos": 18584978221440.0, + "grad_norm": 3.12722071555414, + "language_loss": 0.69321769, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.71463555, + "num_input_tokens_seen": 315481090, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1184082, + "step": 14629, + "time_per_iteration": 2.580808401107788 + }, + { + "auxiliary_loss_clip": 0.01104933, + "auxiliary_loss_mlp": 0.01029683, + "balance_loss_clip": 1.03473997, + "balance_loss_mlp": 1.01939559, + "epoch": 0.8796031865323914, + "flos": 33854816315520.0, + "grad_norm": 3.65084115060786, + "language_loss": 0.68943983, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.71078598, + "num_input_tokens_seen": 315502010, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10290527, + "step": 14630, + "time_per_iteration": 2.6997320652008057 + }, + { + "auxiliary_loss_clip": 0.01106414, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.0371933, + "balance_loss_mlp": 1.02430892, + "epoch": 0.8796633097850594, + "flos": 38175287204160.0, + "grad_norm": 2.294756591344351, + "language_loss": 0.74369693, + "learning_rate": 1.499207333613999e-07, + "loss": 0.76511687, + "num_input_tokens_seen": 315523040, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.1126709, + "step": 14631, + "time_per_iteration": 2.710752487182617 + }, + { + "auxiliary_loss_clip": 0.01107855, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.03856993, + "balance_loss_mlp": 1.02119637, + "epoch": 0.8797234330377274, + "flos": 29581825328640.0, + "grad_norm": 3.076183442478805, + "language_loss": 0.69806522, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.71946907, + "num_input_tokens_seen": 315541865, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.11340332, + "step": 14632, + "time_per_iteration": 2.7198684215545654 + }, + { + "auxiliary_loss_clip": 0.01110903, + "auxiliary_loss_mlp": 0.01026649, + "balance_loss_clip": 1.0388937, + "balance_loss_mlp": 1.01646864, + "epoch": 0.8797835562903953, + "flos": 29489661217440.0, + "grad_norm": 2.494673328404805, + "language_loss": 0.65613353, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.67750907, + "num_input_tokens_seen": 315561470, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10180664, + "step": 14633, + "time_per_iteration": 4.019506454467773 + }, + { + "auxiliary_loss_clip": 0.01109849, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.03814256, + "balance_loss_mlp": 1.02033532, + "epoch": 0.8798436795430633, + "flos": 23530784938560.0, + "grad_norm": 1.5829941286776295, + "language_loss": 0.84009308, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.86150628, + "num_input_tokens_seen": 315583140, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11126709, + "step": 14634, + "time_per_iteration": 2.6798484325408936 + }, + { + "auxiliary_loss_clip": 0.0110872, + "auxiliary_loss_mlp": 0.01031079, + "balance_loss_clip": 1.03641891, + "balance_loss_mlp": 1.02027833, + "epoch": 0.8799038027957312, + "flos": 34385516478720.0, + "grad_norm": 1.7372596214434888, + "language_loss": 0.79865831, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.82005626, + "num_input_tokens_seen": 315601935, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10803223, + "step": 14635, + "time_per_iteration": 2.7187294960021973 + }, + { + "auxiliary_loss_clip": 0.01109768, + "auxiliary_loss_mlp": 0.01023723, + "balance_loss_clip": 1.03783178, + "balance_loss_mlp": 1.01295841, + "epoch": 0.8799639260483992, + "flos": 30071811975840.0, + "grad_norm": 1.662367417005939, + "language_loss": 0.65201652, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.67335147, + "num_input_tokens_seen": 315619995, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10754395, + "step": 14636, + "time_per_iteration": 2.642890214920044 + }, + { + "auxiliary_loss_clip": 0.01111387, + "auxiliary_loss_mlp": 0.01035943, + "balance_loss_clip": 1.03850675, + "balance_loss_mlp": 1.02396822, + "epoch": 0.8800240493010671, + "flos": 27089152712640.0, + "grad_norm": 1.6206333679548097, + "language_loss": 0.70322663, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.72469997, + "num_input_tokens_seen": 315637895, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11968994, + "step": 14637, + "time_per_iteration": 2.692235231399536 + }, + { + "auxiliary_loss_clip": 0.01113327, + "auxiliary_loss_mlp": 0.01029177, + "balance_loss_clip": 1.04177308, + "balance_loss_mlp": 1.01832867, + "epoch": 0.8800841725537352, + "flos": 17382637329120.0, + "grad_norm": 1.830119817436139, + "language_loss": 0.66380489, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.68523002, + "num_input_tokens_seen": 315655520, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10852051, + "step": 14638, + "time_per_iteration": 2.5770225524902344 + }, + { + "auxiliary_loss_clip": 0.01109329, + "auxiliary_loss_mlp": 0.01027819, + "balance_loss_clip": 1.03689837, + "balance_loss_mlp": 1.01704812, + "epoch": 0.8801442958064031, + "flos": 45655979192640.0, + "grad_norm": 1.83187274529057, + "language_loss": 0.58184206, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.60321349, + "num_input_tokens_seen": 315678955, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10772705, + "step": 14639, + "time_per_iteration": 2.8458805084228516 + }, + { + "auxiliary_loss_clip": 0.0111057, + "auxiliary_loss_mlp": 0.01035648, + "balance_loss_clip": 1.03770137, + "balance_loss_mlp": 1.0242753, + "epoch": 0.8802044190590711, + "flos": 30563014141440.0, + "grad_norm": 1.4845231457371348, + "language_loss": 0.74487996, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.7663421, + "num_input_tokens_seen": 315700360, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1137085, + "step": 14640, + "time_per_iteration": 2.7062771320343018 + }, + { + "auxiliary_loss_clip": 0.01108607, + "auxiliary_loss_mlp": 0.01037003, + "balance_loss_clip": 1.03617978, + "balance_loss_mlp": 1.0259763, + "epoch": 0.8802645423117391, + "flos": 29448461494080.0, + "grad_norm": 1.8437888073158009, + "language_loss": 0.69783628, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.71929234, + "num_input_tokens_seen": 315719270, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11029053, + "step": 14641, + "time_per_iteration": 2.8688271045684814 + }, + { + "auxiliary_loss_clip": 0.01110361, + "auxiliary_loss_mlp": 0.01024538, + "balance_loss_clip": 1.0370208, + "balance_loss_mlp": 1.01280808, + "epoch": 0.880324665564407, + "flos": 21879495053280.0, + "grad_norm": 2.1898484171807455, + "language_loss": 0.84644687, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.86779583, + "num_input_tokens_seen": 315737425, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11724854, + "step": 14642, + "time_per_iteration": 2.5943541526794434 + }, + { + "auxiliary_loss_clip": 0.01110935, + "auxiliary_loss_mlp": 0.01030964, + "balance_loss_clip": 1.03990006, + "balance_loss_mlp": 1.0192045, + "epoch": 0.880384788817075, + "flos": 25975288859040.0, + "grad_norm": 2.004182846587196, + "language_loss": 0.78889424, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.81031322, + "num_input_tokens_seen": 315755725, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11767578, + "step": 14643, + "time_per_iteration": 2.69868803024292 + }, + { + "auxiliary_loss_clip": 0.01104474, + "auxiliary_loss_mlp": 0.01026003, + "balance_loss_clip": 1.03498006, + "balance_loss_mlp": 1.01552463, + "epoch": 0.880444912069743, + "flos": 15201255039840.0, + "grad_norm": 1.959844329352175, + "language_loss": 0.73107851, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.75238323, + "num_input_tokens_seen": 315773835, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10479736, + "step": 14644, + "time_per_iteration": 2.6861886978149414 + }, + { + "auxiliary_loss_clip": 0.01111997, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.03808951, + "balance_loss_mlp": 1.01707387, + "epoch": 0.880505035322411, + "flos": 16625882495520.0, + "grad_norm": 3.5577951076442544, + "language_loss": 0.79167902, + "learning_rate": 1.47856380505911e-07, + "loss": 0.81308806, + "num_input_tokens_seen": 315790615, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11834717, + "step": 14645, + "time_per_iteration": 2.59831166267395 + }, + { + "auxiliary_loss_clip": 0.01108112, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.03761506, + "balance_loss_mlp": 1.01938844, + "epoch": 0.8805651585750789, + "flos": 28288414291680.0, + "grad_norm": 2.126868084821728, + "language_loss": 0.63906682, + "learning_rate": 1.477094533001364e-07, + "loss": 0.66045558, + "num_input_tokens_seen": 315811010, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.11376953, + "step": 14646, + "time_per_iteration": 2.6941661834716797 + }, + { + "auxiliary_loss_clip": 0.01113491, + "auxiliary_loss_mlp": 0.01028073, + "balance_loss_clip": 1.03840089, + "balance_loss_mlp": 1.01643229, + "epoch": 0.8806252818277469, + "flos": 18184238441280.0, + "grad_norm": 2.101530685830835, + "language_loss": 0.77448094, + "learning_rate": 1.475625963334055e-07, + "loss": 0.79589659, + "num_input_tokens_seen": 315828130, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11645508, + "step": 14647, + "time_per_iteration": 2.6105973720550537 + }, + { + "auxiliary_loss_clip": 0.01107534, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.037462, + "balance_loss_mlp": 1.01918244, + "epoch": 0.8806854050804148, + "flos": 21523155861600.0, + "grad_norm": 2.323895553240714, + "language_loss": 0.75102514, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.7723968, + "num_input_tokens_seen": 315844900, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10461426, + "step": 14648, + "time_per_iteration": 2.6644232273101807 + }, + { + "auxiliary_loss_clip": 0.01109529, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.03591669, + "balance_loss_mlp": 1.01841652, + "epoch": 0.8807455283330828, + "flos": 30911492980800.0, + "grad_norm": 1.7606244083525584, + "language_loss": 0.65654916, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.67793834, + "num_input_tokens_seen": 315863745, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.10980225, + "step": 14649, + "time_per_iteration": 2.6421942710876465 + }, + { + "auxiliary_loss_clip": 0.01111892, + "auxiliary_loss_mlp": 0.01028321, + "balance_loss_clip": 1.04023862, + "balance_loss_mlp": 1.01679897, + "epoch": 0.8808056515857507, + "flos": 30828931464960.0, + "grad_norm": 1.3424101129380939, + "language_loss": 0.625166, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.64656812, + "num_input_tokens_seen": 315885765, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11523438, + "step": 14650, + "time_per_iteration": 2.7091012001037598 + }, + { + "auxiliary_loss_clip": 0.01107739, + "auxiliary_loss_mlp": 0.01028222, + "balance_loss_clip": 1.03708315, + "balance_loss_mlp": 1.01740384, + "epoch": 0.8808657748384188, + "flos": 32431971620160.0, + "grad_norm": 1.5194912132078813, + "language_loss": 0.72695947, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.74831915, + "num_input_tokens_seen": 315907340, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10827637, + "step": 14651, + "time_per_iteration": 2.687922239303589 + }, + { + "auxiliary_loss_clip": 0.01112582, + "auxiliary_loss_mlp": 0.01032714, + "balance_loss_clip": 1.0389545, + "balance_loss_mlp": 1.02067375, + "epoch": 0.8809258980910867, + "flos": 22770302515200.0, + "grad_norm": 1.6912327010457051, + "language_loss": 0.72387689, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.74532986, + "num_input_tokens_seen": 315924935, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12042236, + "step": 14652, + "time_per_iteration": 2.6558897495269775 + }, + { + "auxiliary_loss_clip": 0.01107569, + "auxiliary_loss_mlp": 0.01028851, + "balance_loss_clip": 1.03589225, + "balance_loss_mlp": 1.01814651, + "epoch": 0.8809860213437547, + "flos": 24150488865120.0, + "grad_norm": 1.9973905694743919, + "language_loss": 0.74919975, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.77056396, + "num_input_tokens_seen": 315943165, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10705566, + "step": 14653, + "time_per_iteration": 4.079354763031006 + }, + { + "auxiliary_loss_clip": 0.0111202, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.03738165, + "balance_loss_mlp": 1.01854062, + "epoch": 0.8810461445964227, + "flos": 21834689292000.0, + "grad_norm": 1.820877090108622, + "language_loss": 0.71054614, + "learning_rate": 1.465365647269421e-07, + "loss": 0.73196888, + "num_input_tokens_seen": 315961340, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11700439, + "step": 14654, + "time_per_iteration": 2.5937302112579346 + }, + { + "auxiliary_loss_clip": 0.01110754, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.03779948, + "balance_loss_mlp": 1.02161968, + "epoch": 0.8811062678490906, + "flos": 35586965990880.0, + "grad_norm": 2.0388461533196893, + "language_loss": 0.71657741, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.73802429, + "num_input_tokens_seen": 315981335, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12310791, + "step": 14655, + "time_per_iteration": 2.7253451347351074 + }, + { + "auxiliary_loss_clip": 0.01109487, + "auxiliary_loss_mlp": 0.01027788, + "balance_loss_clip": 1.03747082, + "balance_loss_mlp": 1.01666617, + "epoch": 0.8811663911017587, + "flos": 24817510624320.0, + "grad_norm": 1.9832778889538774, + "language_loss": 0.81341219, + "learning_rate": 1.462440453077449e-07, + "loss": 0.83478492, + "num_input_tokens_seen": 316001325, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11126709, + "step": 14656, + "time_per_iteration": 2.6222281455993652 + }, + { + "auxiliary_loss_clip": 0.01110831, + "auxiliary_loss_mlp": 0.01030265, + "balance_loss_clip": 1.03825903, + "balance_loss_mlp": 1.01972055, + "epoch": 0.8812265143544266, + "flos": 31586131988640.0, + "grad_norm": 2.8828880585460728, + "language_loss": 0.68697655, + "learning_rate": 1.460978910372914e-07, + "loss": 0.70838749, + "num_input_tokens_seen": 316022540, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10552979, + "step": 14657, + "time_per_iteration": 2.6918976306915283 + }, + { + "auxiliary_loss_clip": 0.01110523, + "auxiliary_loss_mlp": 0.01032156, + "balance_loss_clip": 1.03747773, + "balance_loss_mlp": 1.02129591, + "epoch": 0.8812866376070946, + "flos": 33184796277600.0, + "grad_norm": 2.3240541726645394, + "language_loss": 0.84163976, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.86306655, + "num_input_tokens_seen": 316037735, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.10864258, + "step": 14658, + "time_per_iteration": 2.6739447116851807 + }, + { + "auxiliary_loss_clip": 0.01116734, + "auxiliary_loss_mlp": 0.01033034, + "balance_loss_clip": 1.04057932, + "balance_loss_mlp": 1.02081466, + "epoch": 0.8813467608597625, + "flos": 29049666543360.0, + "grad_norm": 1.9175618770448006, + "language_loss": 0.77265728, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.79415494, + "num_input_tokens_seen": 316058105, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12219238, + "step": 14659, + "time_per_iteration": 2.7062745094299316 + }, + { + "auxiliary_loss_clip": 0.01108174, + "auxiliary_loss_mlp": 0.01031443, + "balance_loss_clip": 1.03647947, + "balance_loss_mlp": 1.02029705, + "epoch": 0.8814068841124305, + "flos": 25752961778400.0, + "grad_norm": 2.0087195916009715, + "language_loss": 0.60567331, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.62706953, + "num_input_tokens_seen": 316074415, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11151123, + "step": 14660, + "time_per_iteration": 2.6191320419311523 + }, + { + "auxiliary_loss_clip": 0.01110022, + "auxiliary_loss_mlp": 0.01033955, + "balance_loss_clip": 1.03710687, + "balance_loss_mlp": 1.0213728, + "epoch": 0.8814670073650984, + "flos": 30160086428160.0, + "grad_norm": 1.8726869113508708, + "language_loss": 0.77833647, + "learning_rate": 1.455139770123972e-07, + "loss": 0.79977626, + "num_input_tokens_seen": 316094405, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.12573242, + "step": 14661, + "time_per_iteration": 4.051406621932983 + }, + { + "auxiliary_loss_clip": 0.01113592, + "auxiliary_loss_mlp": 0.01038047, + "balance_loss_clip": 1.03965795, + "balance_loss_mlp": 1.02632272, + "epoch": 0.8815271306177664, + "flos": 28023671969280.0, + "grad_norm": 1.6686473371615413, + "language_loss": 0.77016181, + "learning_rate": 1.45368174298081e-07, + "loss": 0.79167819, + "num_input_tokens_seen": 316113390, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11724854, + "step": 14662, + "time_per_iteration": 2.6809167861938477 + }, + { + "auxiliary_loss_clip": 0.01106787, + "auxiliary_loss_mlp": 0.01026466, + "balance_loss_clip": 1.03669214, + "balance_loss_mlp": 1.0165174, + "epoch": 0.8815872538704344, + "flos": 23745454253280.0, + "grad_norm": 1.9283418795647764, + "language_loss": 0.74034262, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.76167512, + "num_input_tokens_seen": 316131085, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.09942627, + "step": 14663, + "time_per_iteration": 2.623771905899048 + }, + { + "auxiliary_loss_clip": 0.01110659, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.03857934, + "balance_loss_mlp": 1.01807261, + "epoch": 0.8816473771231024, + "flos": 39238794429120.0, + "grad_norm": 2.3859442796403445, + "language_loss": 0.70034063, + "learning_rate": 1.450767798584489e-07, + "loss": 0.72174013, + "num_input_tokens_seen": 316151440, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11218262, + "step": 14664, + "time_per_iteration": 2.7380008697509766 + }, + { + "auxiliary_loss_clip": 0.01106936, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.03630209, + "balance_loss_mlp": 1.02211332, + "epoch": 0.8817075003757703, + "flos": 24016314684960.0, + "grad_norm": 1.514461874723354, + "language_loss": 0.81201452, + "learning_rate": 1.449311881441828e-07, + "loss": 0.83340526, + "num_input_tokens_seen": 316170750, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10015869, + "step": 14665, + "time_per_iteration": 4.123984336853027 + }, + { + "auxiliary_loss_clip": 0.01111647, + "auxiliary_loss_mlp": 0.01030013, + "balance_loss_clip": 1.03962898, + "balance_loss_mlp": 1.01955843, + "epoch": 0.8817676236284383, + "flos": 19118555111520.0, + "grad_norm": 2.1640300472123046, + "language_loss": 0.58671641, + "learning_rate": 1.447856667743117e-07, + "loss": 0.60813308, + "num_input_tokens_seen": 316187265, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10449219, + "step": 14666, + "time_per_iteration": 2.6155025959014893 + }, + { + "auxiliary_loss_clip": 0.01113469, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.04045796, + "balance_loss_mlp": 1.01736188, + "epoch": 0.8818277468811063, + "flos": 21835499637600.0, + "grad_norm": 2.385236182747632, + "language_loss": 0.83447558, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.85590547, + "num_input_tokens_seen": 316206555, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12164307, + "step": 14667, + "time_per_iteration": 2.6259756088256836 + }, + { + "auxiliary_loss_clip": 0.011098, + "auxiliary_loss_mlp": 0.0103208, + "balance_loss_clip": 1.03811526, + "balance_loss_mlp": 1.020082, + "epoch": 0.8818878701337742, + "flos": 22903666349760.0, + "grad_norm": 1.9897290431882542, + "language_loss": 0.62445587, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.64587474, + "num_input_tokens_seen": 316225210, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11999512, + "step": 14668, + "time_per_iteration": 2.6004905700683594 + }, + { + "auxiliary_loss_clip": 0.011074, + "auxiliary_loss_mlp": 0.01024225, + "balance_loss_clip": 1.03829956, + "balance_loss_mlp": 1.01402044, + "epoch": 0.8819479933864423, + "flos": 21612362211360.0, + "grad_norm": 2.6512799206103197, + "language_loss": 0.56692326, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.58823949, + "num_input_tokens_seen": 316242685, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10198975, + "step": 14669, + "time_per_iteration": 2.6130528450012207 + }, + { + "auxiliary_loss_clip": 0.01107174, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.03562975, + "balance_loss_mlp": 1.020509, + "epoch": 0.8820081166391102, + "flos": 14310771716160.0, + "grad_norm": 3.0833000662423458, + "language_loss": 0.7112667, + "learning_rate": 1.442042848491043e-07, + "loss": 0.73265272, + "num_input_tokens_seen": 316260935, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10919189, + "step": 14670, + "time_per_iteration": 2.5953986644744873 + }, + { + "auxiliary_loss_clip": 0.01107622, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.03620696, + "balance_loss_mlp": 1.0159831, + "epoch": 0.8820682398917782, + "flos": 33544336334400.0, + "grad_norm": 3.5090724282777055, + "language_loss": 0.73699665, + "learning_rate": 1.44059115283929e-07, + "loss": 0.75834405, + "num_input_tokens_seen": 316281190, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11126709, + "step": 14671, + "time_per_iteration": 2.707956552505493 + }, + { + "auxiliary_loss_clip": 0.01110627, + "auxiliary_loss_mlp": 0.01028468, + "balance_loss_clip": 1.03583431, + "balance_loss_mlp": 1.01711309, + "epoch": 0.8821283631444461, + "flos": 20589163329600.0, + "grad_norm": 2.0160677915090592, + "language_loss": 0.8492707, + "learning_rate": 1.43914016096218e-07, + "loss": 0.87066168, + "num_input_tokens_seen": 316297115, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11364746, + "step": 14672, + "time_per_iteration": 4.0076775550842285 + }, + { + "auxiliary_loss_clip": 0.01106775, + "auxiliary_loss_mlp": 0.01027055, + "balance_loss_clip": 1.03680253, + "balance_loss_mlp": 1.0161593, + "epoch": 0.8821884863971141, + "flos": 29627846608320.0, + "grad_norm": 1.5862350790148756, + "language_loss": 0.72528076, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.74661911, + "num_input_tokens_seen": 316318235, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10900879, + "step": 14673, + "time_per_iteration": 2.6584360599517822 + }, + { + "auxiliary_loss_clip": 0.01028003, + "auxiliary_loss_mlp": 0.0100092, + "balance_loss_clip": 1.00561452, + "balance_loss_mlp": 0.99994349, + "epoch": 0.882248609649782, + "flos": 72517462894080.0, + "grad_norm": 0.8013915647803999, + "language_loss": 0.49409762, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.51438683, + "num_input_tokens_seen": 316384705, + "router_z_loss_clip": 0.22399902, + "router_z_loss_mlp": 0.00975037, + "step": 14674, + "time_per_iteration": 3.3697900772094727 + }, + { + "auxiliary_loss_clip": 0.0111141, + "auxiliary_loss_mlp": 0.01032453, + "balance_loss_clip": 1.03722024, + "balance_loss_mlp": 1.02100325, + "epoch": 0.88230873290245, + "flos": 24328091219040.0, + "grad_norm": 2.6984456350143393, + "language_loss": 0.76339352, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.78483224, + "num_input_tokens_seen": 316401165, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11444092, + "step": 14675, + "time_per_iteration": 2.593104124069214 + }, + { + "auxiliary_loss_clip": 0.01106453, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.0367353, + "balance_loss_mlp": 1.02101827, + "epoch": 0.882368856155118, + "flos": 19965529226880.0, + "grad_norm": 1.903579017551702, + "language_loss": 0.7963146, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.81769776, + "num_input_tokens_seen": 316418780, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10845947, + "step": 14676, + "time_per_iteration": 2.635699510574341 + }, + { + "auxiliary_loss_clip": 0.01027875, + "auxiliary_loss_mlp": 0.0100148, + "balance_loss_clip": 1.00550985, + "balance_loss_mlp": 1.00051939, + "epoch": 0.882428979407786, + "flos": 86138685243360.0, + "grad_norm": 0.6942596685805329, + "language_loss": 0.54721534, + "learning_rate": 1.431895760121109e-07, + "loss": 0.56750894, + "num_input_tokens_seen": 316482030, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00959015, + "step": 14677, + "time_per_iteration": 3.325813055038452 + }, + { + "auxiliary_loss_clip": 0.01106237, + "auxiliary_loss_mlp": 0.01024565, + "balance_loss_clip": 1.03560209, + "balance_loss_mlp": 1.01374078, + "epoch": 0.8824891026604539, + "flos": 22146830481600.0, + "grad_norm": 2.505447865838957, + "language_loss": 0.65004802, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.67135602, + "num_input_tokens_seen": 316499175, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10821533, + "step": 14678, + "time_per_iteration": 2.6862118244171143 + }, + { + "auxiliary_loss_clip": 0.01113164, + "auxiliary_loss_mlp": 0.01033154, + "balance_loss_clip": 1.03898859, + "balance_loss_mlp": 1.02237165, + "epoch": 0.8825492259131219, + "flos": 33230128763520.0, + "grad_norm": 2.415789255422744, + "language_loss": 0.71225929, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.73372257, + "num_input_tokens_seen": 316519495, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.10784912, + "step": 14679, + "time_per_iteration": 2.731736660003662 + }, + { + "auxiliary_loss_clip": 0.01107237, + "auxiliary_loss_mlp": 0.01027894, + "balance_loss_clip": 1.03707457, + "balance_loss_mlp": 1.01835179, + "epoch": 0.8826093491657898, + "flos": 27179290959840.0, + "grad_norm": 1.7555623489383723, + "language_loss": 0.63884485, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.66019613, + "num_input_tokens_seen": 316538180, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.09539795, + "step": 14680, + "time_per_iteration": 2.646681547164917 + }, + { + "auxiliary_loss_clip": 0.01108604, + "auxiliary_loss_mlp": 0.01030242, + "balance_loss_clip": 1.03725052, + "balance_loss_mlp": 1.0189172, + "epoch": 0.8826694724184578, + "flos": 17332645356000.0, + "grad_norm": 5.956865606269894, + "language_loss": 0.7768563, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.79824483, + "num_input_tokens_seen": 316551750, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11328125, + "step": 14681, + "time_per_iteration": 2.5904829502105713 + }, + { + "auxiliary_loss_clip": 0.01111633, + "auxiliary_loss_mlp": 0.01027452, + "balance_loss_clip": 1.03761899, + "balance_loss_mlp": 1.01578164, + "epoch": 0.8827295956711259, + "flos": 25174781713440.0, + "grad_norm": 1.7049932939496617, + "language_loss": 0.72959399, + "learning_rate": 1.424668961888047e-07, + "loss": 0.75098485, + "num_input_tokens_seen": 316570680, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11682129, + "step": 14682, + "time_per_iteration": 2.6554441452026367 + }, + { + "auxiliary_loss_clip": 0.0111555, + "auxiliary_loss_mlp": 0.0102885, + "balance_loss_clip": 1.03980112, + "balance_loss_mlp": 1.01635647, + "epoch": 0.8827897189237938, + "flos": 22589701882560.0, + "grad_norm": 1.8970219824152863, + "language_loss": 0.75086117, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.77230513, + "num_input_tokens_seen": 316588635, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12506104, + "step": 14683, + "time_per_iteration": 2.677119016647339 + }, + { + "auxiliary_loss_clip": 0.01110364, + "auxiliary_loss_mlp": 0.01030386, + "balance_loss_clip": 1.03832722, + "balance_loss_mlp": 1.0190196, + "epoch": 0.8828498421764618, + "flos": 27756579644640.0, + "grad_norm": 1.759729321598481, + "language_loss": 0.65742683, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.67883438, + "num_input_tokens_seen": 316607550, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.1137085, + "step": 14684, + "time_per_iteration": 2.665637731552124 + }, + { + "auxiliary_loss_clip": 0.01108897, + "auxiliary_loss_mlp": 0.01026809, + "balance_loss_clip": 1.03765893, + "balance_loss_mlp": 1.01643157, + "epoch": 0.8829099654291297, + "flos": 18317723827680.0, + "grad_norm": 3.1649107198262194, + "language_loss": 0.69377542, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.71513247, + "num_input_tokens_seen": 316624460, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10375977, + "step": 14685, + "time_per_iteration": 2.6534461975097656 + }, + { + "auxiliary_loss_clip": 0.01112634, + "auxiliary_loss_mlp": 0.01029994, + "balance_loss_clip": 1.03809786, + "balance_loss_mlp": 1.01810312, + "epoch": 0.8829700886817977, + "flos": 20402768725920.0, + "grad_norm": 2.068189883836, + "language_loss": 0.74380076, + "learning_rate": 1.418900201783806e-07, + "loss": 0.76522708, + "num_input_tokens_seen": 316640765, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11895752, + "step": 14686, + "time_per_iteration": 2.713456869125366 + }, + { + "auxiliary_loss_clip": 0.01107616, + "auxiliary_loss_mlp": 0.01025885, + "balance_loss_clip": 1.03654838, + "balance_loss_mlp": 1.0152632, + "epoch": 0.8830302119344656, + "flos": 18624273632640.0, + "grad_norm": 1.9249536863986851, + "language_loss": 0.63065296, + "learning_rate": 1.417459773114007e-07, + "loss": 0.65198791, + "num_input_tokens_seen": 316656120, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10620117, + "step": 14687, + "time_per_iteration": 2.7350521087646484 + }, + { + "auxiliary_loss_clip": 0.01112454, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.03856337, + "balance_loss_mlp": 1.02200508, + "epoch": 0.8830903351871336, + "flos": 34920389921760.0, + "grad_norm": 2.8262313033151916, + "language_loss": 0.69073945, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.71219778, + "num_input_tokens_seen": 316676095, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11376953, + "step": 14688, + "time_per_iteration": 2.6920807361602783 + }, + { + "auxiliary_loss_clip": 0.01106313, + "auxiliary_loss_mlp": 0.01025761, + "balance_loss_clip": 1.03683305, + "balance_loss_mlp": 1.01469874, + "epoch": 0.8831504584398016, + "flos": 34168173023520.0, + "grad_norm": 1.9048539526240518, + "language_loss": 0.67288709, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.69420791, + "num_input_tokens_seen": 316696235, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11065674, + "step": 14689, + "time_per_iteration": 2.684586763381958 + }, + { + "auxiliary_loss_clip": 0.0111111, + "auxiliary_loss_mlp": 0.01026652, + "balance_loss_clip": 1.04111433, + "balance_loss_mlp": 1.01579833, + "epoch": 0.8832105816924696, + "flos": 32432863000320.0, + "grad_norm": 1.4331758562236607, + "language_loss": 0.74798232, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.76936001, + "num_input_tokens_seen": 316719680, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10858154, + "step": 14690, + "time_per_iteration": 2.7561020851135254 + }, + { + "auxiliary_loss_clip": 0.01109193, + "auxiliary_loss_mlp": 0.01032464, + "balance_loss_clip": 1.03725052, + "balance_loss_mlp": 1.02034652, + "epoch": 0.8832707049451375, + "flos": 30381967818720.0, + "grad_norm": 1.6952272224616334, + "language_loss": 0.72582746, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.74724406, + "num_input_tokens_seen": 316739830, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.12127686, + "step": 14691, + "time_per_iteration": 2.6893866062164307 + }, + { + "auxiliary_loss_clip": 0.01113803, + "auxiliary_loss_mlp": 0.01027492, + "balance_loss_clip": 1.03850734, + "balance_loss_mlp": 1.01579773, + "epoch": 0.8833308281978055, + "flos": 18853042960800.0, + "grad_norm": 2.0501471495336254, + "language_loss": 0.52081048, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.54222345, + "num_input_tokens_seen": 316758105, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11688232, + "step": 14692, + "time_per_iteration": 2.6752097606658936 + }, + { + "auxiliary_loss_clip": 0.01111669, + "auxiliary_loss_mlp": 0.0102758, + "balance_loss_clip": 1.03889918, + "balance_loss_mlp": 1.01659489, + "epoch": 0.8833909514504734, + "flos": 24772056586560.0, + "grad_norm": 3.218721305537784, + "language_loss": 0.59911901, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.62051153, + "num_input_tokens_seen": 316777455, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.10986328, + "step": 14693, + "time_per_iteration": 4.054675817489624 + }, + { + "auxiliary_loss_clip": 0.01107029, + "auxiliary_loss_mlp": 0.01024381, + "balance_loss_clip": 1.03818429, + "balance_loss_mlp": 1.01425445, + "epoch": 0.8834510747031414, + "flos": 24860574142560.0, + "grad_norm": 2.219025459013486, + "language_loss": 0.75384283, + "learning_rate": 1.407396505730898e-07, + "loss": 0.77515692, + "num_input_tokens_seen": 316796300, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.10125732, + "step": 14694, + "time_per_iteration": 2.674482583999634 + }, + { + "auxiliary_loss_clip": 0.01110735, + "auxiliary_loss_mlp": 0.01030077, + "balance_loss_clip": 1.03578353, + "balance_loss_mlp": 1.01975942, + "epoch": 0.8835111979558095, + "flos": 36304222826880.0, + "grad_norm": 1.8212924117711848, + "language_loss": 0.72793239, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.74934053, + "num_input_tokens_seen": 316819090, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.10314941, + "step": 14695, + "time_per_iteration": 2.697549343109131 + }, + { + "auxiliary_loss_clip": 0.01105315, + "auxiliary_loss_mlp": 0.01024889, + "balance_loss_clip": 1.03728795, + "balance_loss_mlp": 1.0144459, + "epoch": 0.8835713212084774, + "flos": 29449231322400.0, + "grad_norm": 1.9956265237640547, + "language_loss": 0.79953635, + "learning_rate": 1.404527630961998e-07, + "loss": 0.82083833, + "num_input_tokens_seen": 316839250, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.10443115, + "step": 14696, + "time_per_iteration": 2.6919326782226562 + }, + { + "auxiliary_loss_clip": 0.01111708, + "auxiliary_loss_mlp": 0.01030798, + "balance_loss_clip": 1.03897786, + "balance_loss_mlp": 1.02016401, + "epoch": 0.8836314444611454, + "flos": 33766420311360.0, + "grad_norm": 1.5223729746215848, + "language_loss": 0.75034851, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.77177358, + "num_input_tokens_seen": 316861315, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10638428, + "step": 14697, + "time_per_iteration": 2.698294162750244 + }, + { + "auxiliary_loss_clip": 0.01108396, + "auxiliary_loss_mlp": 0.01031059, + "balance_loss_clip": 1.0368228, + "balance_loss_mlp": 1.02021718, + "epoch": 0.8836915677138133, + "flos": 20544195499200.0, + "grad_norm": 2.1074957176737916, + "language_loss": 0.72105193, + "learning_rate": 1.401661576761779e-07, + "loss": 0.74244648, + "num_input_tokens_seen": 316879325, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10845947, + "step": 14698, + "time_per_iteration": 2.6057257652282715 + }, + { + "auxiliary_loss_clip": 0.01027842, + "auxiliary_loss_mlp": 0.01001198, + "balance_loss_clip": 1.00548947, + "balance_loss_mlp": 1.00022936, + "epoch": 0.8837516909664813, + "flos": 84572468945280.0, + "grad_norm": 0.8331582027321247, + "language_loss": 0.53708577, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.55737615, + "num_input_tokens_seen": 316936425, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.0096817, + "step": 14699, + "time_per_iteration": 3.2469208240509033 + }, + { + "auxiliary_loss_clip": 0.01113732, + "auxiliary_loss_mlp": 0.01024952, + "balance_loss_clip": 1.03832519, + "balance_loss_mlp": 1.01357913, + "epoch": 0.8838118142191492, + "flos": 26020499793120.0, + "grad_norm": 1.725716785884035, + "language_loss": 0.76836997, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.78975677, + "num_input_tokens_seen": 316956360, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11376953, + "step": 14700, + "time_per_iteration": 2.6092910766601562 + }, + { + "auxiliary_loss_clip": 0.01108944, + "auxiliary_loss_mlp": 0.01026099, + "balance_loss_clip": 1.03892589, + "balance_loss_mlp": 1.01557267, + "epoch": 0.8838719374718172, + "flos": 26198466802560.0, + "grad_norm": 2.4075607007220308, + "language_loss": 0.73070174, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.75205219, + "num_input_tokens_seen": 316975295, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10528564, + "step": 14701, + "time_per_iteration": 4.144633769989014 + }, + { + "auxiliary_loss_clip": 0.01112846, + "auxiliary_loss_mlp": 0.01029104, + "balance_loss_clip": 1.03810883, + "balance_loss_mlp": 1.01765382, + "epoch": 0.8839320607244852, + "flos": 32297878474560.0, + "grad_norm": 1.851734711449822, + "language_loss": 0.71306241, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.73448187, + "num_input_tokens_seen": 316994520, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11450195, + "step": 14702, + "time_per_iteration": 2.720423460006714 + }, + { + "auxiliary_loss_clip": 0.01112231, + "auxiliary_loss_mlp": 0.01034235, + "balance_loss_clip": 1.03960943, + "balance_loss_mlp": 1.02308941, + "epoch": 0.8839921839771532, + "flos": 55182177564480.0, + "grad_norm": 1.8063599373985775, + "language_loss": 0.71584797, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.73731261, + "num_input_tokens_seen": 317018095, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11132812, + "step": 14703, + "time_per_iteration": 2.8594141006469727 + }, + { + "auxiliary_loss_clip": 0.0110793, + "auxiliary_loss_mlp": 0.01026175, + "balance_loss_clip": 1.03742003, + "balance_loss_mlp": 1.0158515, + "epoch": 0.8840523072298211, + "flos": 24412759633440.0, + "grad_norm": 2.363543523295642, + "language_loss": 0.66645932, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.68780041, + "num_input_tokens_seen": 317035755, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10314941, + "step": 14704, + "time_per_iteration": 4.117065906524658 + }, + { + "auxiliary_loss_clip": 0.01104404, + "auxiliary_loss_mlp": 0.01024137, + "balance_loss_clip": 1.03496373, + "balance_loss_mlp": 1.01400399, + "epoch": 0.8841124304824891, + "flos": 29804922237600.0, + "grad_norm": 1.7983003609687382, + "language_loss": 0.71005768, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.73134309, + "num_input_tokens_seen": 317055765, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.10125732, + "step": 14705, + "time_per_iteration": 2.6527323722839355 + }, + { + "auxiliary_loss_clip": 0.01109235, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.0381819, + "balance_loss_mlp": 1.02249408, + "epoch": 0.884172553735157, + "flos": 38174395824000.0, + "grad_norm": 4.334175855241299, + "language_loss": 0.70874208, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.7301597, + "num_input_tokens_seen": 317077955, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10040283, + "step": 14706, + "time_per_iteration": 2.725893497467041 + }, + { + "auxiliary_loss_clip": 0.01106768, + "auxiliary_loss_mlp": 0.01029723, + "balance_loss_clip": 1.03572035, + "balance_loss_mlp": 1.01860046, + "epoch": 0.884232676987825, + "flos": 26102169928800.0, + "grad_norm": 1.6077146119561663, + "language_loss": 0.74190176, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.76326662, + "num_input_tokens_seen": 317095825, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11114502, + "step": 14707, + "time_per_iteration": 2.6757571697235107 + }, + { + "auxiliary_loss_clip": 0.01027771, + "auxiliary_loss_mlp": 0.01001821, + "balance_loss_clip": 1.00538015, + "balance_loss_mlp": 1.0008564, + "epoch": 0.8842928002404931, + "flos": 70663855114080.0, + "grad_norm": 0.797010185905807, + "language_loss": 0.60352623, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62382209, + "num_input_tokens_seen": 317152875, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.00963593, + "step": 14708, + "time_per_iteration": 3.0991291999816895 + }, + { + "auxiliary_loss_clip": 0.01104803, + "auxiliary_loss_mlp": 0.01026757, + "balance_loss_clip": 1.03642678, + "balance_loss_mlp": 1.01623678, + "epoch": 0.884352923493161, + "flos": 50595951421440.0, + "grad_norm": 1.7655993967685797, + "language_loss": 0.67749918, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.69881481, + "num_input_tokens_seen": 317176725, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.10522461, + "step": 14709, + "time_per_iteration": 2.840393543243408 + }, + { + "auxiliary_loss_clip": 0.01113726, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.03710771, + "balance_loss_mlp": 1.02309501, + "epoch": 0.884413046745829, + "flos": 56791173759840.0, + "grad_norm": 1.5825361078734825, + "language_loss": 0.62598717, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.64748025, + "num_input_tokens_seen": 317206880, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12506104, + "step": 14710, + "time_per_iteration": 2.88598370552063 + }, + { + "auxiliary_loss_clip": 0.01107588, + "auxiliary_loss_mlp": 0.01024856, + "balance_loss_clip": 1.03847992, + "balance_loss_mlp": 1.0147655, + "epoch": 0.8844731699984969, + "flos": 23348847235680.0, + "grad_norm": 3.3481879104039516, + "language_loss": 0.64082986, + "learning_rate": 1.38310100580431e-07, + "loss": 0.66215432, + "num_input_tokens_seen": 317224135, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.10095215, + "step": 14711, + "time_per_iteration": 2.6869146823883057 + }, + { + "auxiliary_loss_clip": 0.01113036, + "auxiliary_loss_mlp": 0.01028096, + "balance_loss_clip": 1.03694582, + "balance_loss_mlp": 1.0167408, + "epoch": 0.8845332932511649, + "flos": 28382644784160.0, + "grad_norm": 1.8608039461304988, + "language_loss": 0.76109576, + "learning_rate": 1.38167820974606e-07, + "loss": 0.78250706, + "num_input_tokens_seen": 317244505, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.11358643, + "step": 14712, + "time_per_iteration": 3.992379665374756 + }, + { + "auxiliary_loss_clip": 0.01108056, + "auxiliary_loss_mlp": 0.01025193, + "balance_loss_clip": 1.03538358, + "balance_loss_mlp": 1.01412487, + "epoch": 0.8845934165038328, + "flos": 21432288303360.0, + "grad_norm": 3.3325348349324435, + "language_loss": 0.81379092, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.83512342, + "num_input_tokens_seen": 317257830, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11065674, + "step": 14713, + "time_per_iteration": 2.620671510696411 + }, + { + "auxiliary_loss_clip": 0.01107379, + "auxiliary_loss_mlp": 0.01026265, + "balance_loss_clip": 1.03553677, + "balance_loss_mlp": 1.01491594, + "epoch": 0.8846535397565009, + "flos": 33539109605280.0, + "grad_norm": 1.4581911822963756, + "language_loss": 0.55573404, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.57707047, + "num_input_tokens_seen": 317278430, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11358643, + "step": 14714, + "time_per_iteration": 2.7298529148101807 + }, + { + "auxiliary_loss_clip": 0.01109056, + "auxiliary_loss_mlp": 0.01032435, + "balance_loss_clip": 1.03716588, + "balance_loss_mlp": 1.02143764, + "epoch": 0.8847136630091688, + "flos": 35095925894400.0, + "grad_norm": 2.281087748640463, + "language_loss": 0.73795462, + "learning_rate": 1.377414057838755e-07, + "loss": 0.75936949, + "num_input_tokens_seen": 317295970, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10992432, + "step": 14715, + "time_per_iteration": 2.6756174564361572 + }, + { + "auxiliary_loss_clip": 0.01110124, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.0383203, + "balance_loss_mlp": 1.01806045, + "epoch": 0.8847737862618368, + "flos": 28646819864640.0, + "grad_norm": 1.5067551522199074, + "language_loss": 0.75351477, + "learning_rate": 1.375994086138461e-07, + "loss": 0.77490574, + "num_input_tokens_seen": 317316185, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10913086, + "step": 14716, + "time_per_iteration": 2.8180665969848633 + }, + { + "auxiliary_loss_clip": 0.01109541, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.0385685, + "balance_loss_mlp": 1.02393246, + "epoch": 0.8848339095145047, + "flos": 22769897342400.0, + "grad_norm": 2.457290333863129, + "language_loss": 0.71190035, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.7333411, + "num_input_tokens_seen": 317333275, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.1060791, + "step": 14717, + "time_per_iteration": 2.6111598014831543 + }, + { + "auxiliary_loss_clip": 0.01105789, + "auxiliary_loss_mlp": 0.01028528, + "balance_loss_clip": 1.0379796, + "balance_loss_mlp": 1.01807976, + "epoch": 0.8848940327671727, + "flos": 39377141889120.0, + "grad_norm": 1.917709017721779, + "language_loss": 0.73879445, + "learning_rate": 1.373156261464208e-07, + "loss": 0.76013762, + "num_input_tokens_seen": 317351245, + "router_z_loss_clip": 0.67773438, + "router_z_loss_mlp": 0.10461426, + "step": 14718, + "time_per_iteration": 2.7267541885375977 + }, + { + "auxiliary_loss_clip": 0.01111848, + "auxiliary_loss_mlp": 0.01026312, + "balance_loss_clip": 1.03799844, + "balance_loss_mlp": 1.01501679, + "epoch": 0.8849541560198406, + "flos": 29311977828960.0, + "grad_norm": 1.6657689966349765, + "language_loss": 0.78389245, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.80527401, + "num_input_tokens_seen": 317370740, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11291504, + "step": 14719, + "time_per_iteration": 2.6594958305358887 + }, + { + "auxiliary_loss_clip": 0.01111635, + "auxiliary_loss_mlp": 0.01026687, + "balance_loss_clip": 1.03805089, + "balance_loss_mlp": 1.01568437, + "epoch": 0.8850142792725086, + "flos": 20588271949440.0, + "grad_norm": 1.9033420152555576, + "language_loss": 0.72308522, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.74446845, + "num_input_tokens_seen": 317388370, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10998535, + "step": 14720, + "time_per_iteration": 2.6624338626861572 + }, + { + "auxiliary_loss_clip": 0.01112079, + "auxiliary_loss_mlp": 0.01027146, + "balance_loss_clip": 1.03732407, + "balance_loss_mlp": 1.01566029, + "epoch": 0.8850744025251767, + "flos": 29314489900320.0, + "grad_norm": 2.226772048325326, + "language_loss": 0.82390064, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.84529287, + "num_input_tokens_seen": 317407390, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11486816, + "step": 14721, + "time_per_iteration": 2.673396587371826 + }, + { + "auxiliary_loss_clip": 0.01109893, + "auxiliary_loss_mlp": 0.0102827, + "balance_loss_clip": 1.0366087, + "balance_loss_mlp": 1.01649797, + "epoch": 0.8851345257778446, + "flos": 58517853602400.0, + "grad_norm": 1.7292872428353068, + "language_loss": 0.62005961, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.64144123, + "num_input_tokens_seen": 317430825, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11779785, + "step": 14722, + "time_per_iteration": 2.9104130268096924 + }, + { + "auxiliary_loss_clip": 0.01111467, + "auxiliary_loss_mlp": 0.01026719, + "balance_loss_clip": 1.03736424, + "balance_loss_mlp": 1.01561427, + "epoch": 0.8851946490305126, + "flos": 44674101586080.0, + "grad_norm": 3.225169926203961, + "language_loss": 0.68603897, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.70742083, + "num_input_tokens_seen": 317451905, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11102295, + "step": 14723, + "time_per_iteration": 2.783721685409546 + }, + { + "auxiliary_loss_clip": 0.011091, + "auxiliary_loss_mlp": 0.01029372, + "balance_loss_clip": 1.03757977, + "balance_loss_mlp": 1.01805866, + "epoch": 0.8852547722831805, + "flos": 26288321428800.0, + "grad_norm": 1.884485798143411, + "language_loss": 0.78318864, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.80457336, + "num_input_tokens_seen": 317470030, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11309814, + "step": 14724, + "time_per_iteration": 2.78051495552063 + }, + { + "auxiliary_loss_clip": 0.01027811, + "auxiliary_loss_mlp": 0.01001184, + "balance_loss_clip": 1.00541353, + "balance_loss_mlp": 1.00017548, + "epoch": 0.8853148955358485, + "flos": 76945285523520.0, + "grad_norm": 0.8221028157064116, + "language_loss": 0.58901644, + "learning_rate": 1.363246127376143e-07, + "loss": 0.60930645, + "num_input_tokens_seen": 317527460, + "router_z_loss_clip": 0.22424316, + "router_z_loss_mlp": 0.01008606, + "step": 14725, + "time_per_iteration": 3.1632444858551025 + }, + { + "auxiliary_loss_clip": 0.01113512, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.03691268, + "balance_loss_mlp": 1.021824, + "epoch": 0.8853750187885164, + "flos": 22146303756960.0, + "grad_norm": 2.3353673986816323, + "language_loss": 0.69166672, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.71313512, + "num_input_tokens_seen": 317544070, + "router_z_loss_clip": 0.76611328, + "router_z_loss_mlp": 0.11499023, + "step": 14726, + "time_per_iteration": 2.625528573989868 + }, + { + "auxiliary_loss_clip": 0.01109252, + "auxiliary_loss_mlp": 0.0102992, + "balance_loss_clip": 1.03889441, + "balance_loss_mlp": 1.018345, + "epoch": 0.8854351420411845, + "flos": 48285175991040.0, + "grad_norm": 1.4797337186277597, + "language_loss": 0.69753599, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.71892768, + "num_input_tokens_seen": 317570275, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11584473, + "step": 14727, + "time_per_iteration": 2.796229362487793 + }, + { + "auxiliary_loss_clip": 0.01111798, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.04015875, + "balance_loss_mlp": 1.02194297, + "epoch": 0.8854952652938524, + "flos": 29003564229120.0, + "grad_norm": 1.6097993689970305, + "language_loss": 0.70113611, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.7225818, + "num_input_tokens_seen": 317590160, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.1081543, + "step": 14728, + "time_per_iteration": 2.6843926906585693 + }, + { + "auxiliary_loss_clip": 0.01109548, + "auxiliary_loss_mlp": 0.01029674, + "balance_loss_clip": 1.03718543, + "balance_loss_mlp": 1.01939857, + "epoch": 0.8855553885465204, + "flos": 22318233691680.0, + "grad_norm": 2.210639348091458, + "language_loss": 0.66878355, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.69017577, + "num_input_tokens_seen": 317608340, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.1027832, + "step": 14729, + "time_per_iteration": 2.645486831665039 + }, + { + "auxiliary_loss_clip": 0.01108642, + "auxiliary_loss_mlp": 0.01030335, + "balance_loss_clip": 1.0387733, + "balance_loss_mlp": 1.02032113, + "epoch": 0.8856155117991883, + "flos": 44986891052160.0, + "grad_norm": 1.918774805139202, + "language_loss": 0.63019168, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.65158141, + "num_input_tokens_seen": 317629910, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10021973, + "step": 14730, + "time_per_iteration": 2.838611364364624 + }, + { + "auxiliary_loss_clip": 0.01106026, + "auxiliary_loss_mlp": 0.01029937, + "balance_loss_clip": 1.03535306, + "balance_loss_mlp": 1.01890445, + "epoch": 0.8856756350518563, + "flos": 27044509020480.0, + "grad_norm": 1.553782111406976, + "language_loss": 0.79421884, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.81557846, + "num_input_tokens_seen": 317650265, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.1104126, + "step": 14731, + "time_per_iteration": 2.6491360664367676 + }, + { + "auxiliary_loss_clip": 0.01109798, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.03670037, + "balance_loss_mlp": 1.01758718, + "epoch": 0.8857357583045242, + "flos": 25308145548000.0, + "grad_norm": 1.8339907112855425, + "language_loss": 0.83208072, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.85346919, + "num_input_tokens_seen": 317669045, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11468506, + "step": 14732, + "time_per_iteration": 4.163718223571777 + }, + { + "auxiliary_loss_clip": 0.0102783, + "auxiliary_loss_mlp": 0.01001402, + "balance_loss_clip": 1.00543332, + "balance_loss_mlp": 1.00041807, + "epoch": 0.8857958815571922, + "flos": 85282594740000.0, + "grad_norm": 0.8969905578934734, + "language_loss": 0.59874421, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.61903656, + "num_input_tokens_seen": 317728065, + "router_z_loss_clip": 0.22436523, + "router_z_loss_mlp": 0.00982666, + "step": 14733, + "time_per_iteration": 3.266918659210205 + }, + { + "auxiliary_loss_clip": 0.01110033, + "auxiliary_loss_mlp": 0.01030458, + "balance_loss_clip": 1.0378108, + "balance_loss_mlp": 1.0192107, + "epoch": 0.8858560048098603, + "flos": 18451411800480.0, + "grad_norm": 1.7748404868862742, + "language_loss": 0.66782343, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.68922836, + "num_input_tokens_seen": 317746120, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11248779, + "step": 14734, + "time_per_iteration": 2.613656759262085 + }, + { + "auxiliary_loss_clip": 0.01110501, + "auxiliary_loss_mlp": 0.01036636, + "balance_loss_clip": 1.04095078, + "balance_loss_mlp": 1.02645516, + "epoch": 0.8859161280625282, + "flos": 20269121270400.0, + "grad_norm": 2.200951728810776, + "language_loss": 0.74951833, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.77098972, + "num_input_tokens_seen": 317762280, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10186768, + "step": 14735, + "time_per_iteration": 2.761608600616455 + }, + { + "auxiliary_loss_clip": 0.01109793, + "auxiliary_loss_mlp": 0.01029529, + "balance_loss_clip": 1.03730154, + "balance_loss_mlp": 1.01858544, + "epoch": 0.8859762513151962, + "flos": 22811178100320.0, + "grad_norm": 2.3462730612512037, + "language_loss": 0.70516264, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.72655582, + "num_input_tokens_seen": 317780615, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10931396, + "step": 14736, + "time_per_iteration": 2.595658540725708 + }, + { + "auxiliary_loss_clip": 0.01111691, + "auxiliary_loss_mlp": 0.01029776, + "balance_loss_clip": 1.03890121, + "balance_loss_mlp": 1.01861823, + "epoch": 0.8860363745678641, + "flos": 23838833882880.0, + "grad_norm": 2.406570381335431, + "language_loss": 0.84328389, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.86469859, + "num_input_tokens_seen": 317798830, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.1116333, + "step": 14737, + "time_per_iteration": 2.6561551094055176 + }, + { + "auxiliary_loss_clip": 0.01116326, + "auxiliary_loss_mlp": 0.0103209, + "balance_loss_clip": 1.03911352, + "balance_loss_mlp": 1.01962042, + "epoch": 0.8860964978205321, + "flos": 43873432371360.0, + "grad_norm": 2.1797726567633795, + "language_loss": 0.68343186, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.704916, + "num_input_tokens_seen": 317819235, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12469482, + "step": 14738, + "time_per_iteration": 2.738135814666748 + }, + { + "auxiliary_loss_clip": 0.01113869, + "auxiliary_loss_mlp": 0.01027598, + "balance_loss_clip": 1.03738427, + "balance_loss_mlp": 1.01605272, + "epoch": 0.8861566210732, + "flos": 25883894576160.0, + "grad_norm": 2.2044480958816646, + "language_loss": 0.75098848, + "learning_rate": 1.343529763547222e-07, + "loss": 0.77240312, + "num_input_tokens_seen": 317836785, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.11541748, + "step": 14739, + "time_per_iteration": 2.6709160804748535 + }, + { + "auxiliary_loss_clip": 0.01109209, + "auxiliary_loss_mlp": 0.01029327, + "balance_loss_clip": 1.03848088, + "balance_loss_mlp": 1.01907492, + "epoch": 0.886216744325868, + "flos": 17825751833760.0, + "grad_norm": 2.053848795323427, + "language_loss": 0.87283838, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.89422369, + "num_input_tokens_seen": 317854225, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.1026001, + "step": 14740, + "time_per_iteration": 3.895575523376465 + }, + { + "auxiliary_loss_clip": 0.01109013, + "auxiliary_loss_mlp": 0.01033253, + "balance_loss_clip": 1.03788733, + "balance_loss_mlp": 1.02225041, + "epoch": 0.886276867578536, + "flos": 32520813314400.0, + "grad_norm": 1.8327268944292838, + "language_loss": 0.63902783, + "learning_rate": 1.34072445601471e-07, + "loss": 0.66045046, + "num_input_tokens_seen": 317874865, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10998535, + "step": 14741, + "time_per_iteration": 2.6735057830810547 + }, + { + "auxiliary_loss_clip": 0.01110003, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.03831983, + "balance_loss_mlp": 1.01781964, + "epoch": 0.886336990831204, + "flos": 20455232253120.0, + "grad_norm": 2.0678201884928007, + "language_loss": 0.73161054, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.75299847, + "num_input_tokens_seen": 317892830, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10974121, + "step": 14742, + "time_per_iteration": 2.6444878578186035 + }, + { + "auxiliary_loss_clip": 0.01109206, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.03790879, + "balance_loss_mlp": 1.01779306, + "epoch": 0.8863971140838719, + "flos": 30829053016800.0, + "grad_norm": 2.062584777229838, + "language_loss": 0.59439451, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.6157729, + "num_input_tokens_seen": 317911780, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10845947, + "step": 14743, + "time_per_iteration": 2.656660556793213 + }, + { + "auxiliary_loss_clip": 0.01112934, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.0378561, + "balance_loss_mlp": 1.02009571, + "epoch": 0.8864572373365399, + "flos": 28558180756800.0, + "grad_norm": 2.3393867999742075, + "language_loss": 0.59954995, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.62100422, + "num_input_tokens_seen": 317932855, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12414551, + "step": 14744, + "time_per_iteration": 4.07467246055603 + }, + { + "auxiliary_loss_clip": 0.0111101, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.0376364, + "balance_loss_mlp": 1.02006245, + "epoch": 0.8865173605892078, + "flos": 22633332642720.0, + "grad_norm": 1.683324617288442, + "language_loss": 0.76668823, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.78811306, + "num_input_tokens_seen": 317952090, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11401367, + "step": 14745, + "time_per_iteration": 2.618030548095703 + }, + { + "auxiliary_loss_clip": 0.01108313, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.03764427, + "balance_loss_mlp": 1.02187347, + "epoch": 0.8865774838418758, + "flos": 23215280814720.0, + "grad_norm": 2.124740282227065, + "language_loss": 0.77446443, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.79587531, + "num_input_tokens_seen": 317970370, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10906982, + "step": 14746, + "time_per_iteration": 2.6466996669769287 + }, + { + "auxiliary_loss_clip": 0.01112754, + "auxiliary_loss_mlp": 0.01034418, + "balance_loss_clip": 1.03891659, + "balance_loss_mlp": 1.02246094, + "epoch": 0.8866376070945439, + "flos": 27043698674880.0, + "grad_norm": 2.2717019983423503, + "language_loss": 0.76530647, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.78677821, + "num_input_tokens_seen": 317989125, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11956787, + "step": 14747, + "time_per_iteration": 2.687809705734253 + }, + { + "auxiliary_loss_clip": 0.01106037, + "auxiliary_loss_mlp": 0.01024391, + "balance_loss_clip": 1.03547752, + "balance_loss_mlp": 1.01331687, + "epoch": 0.8866977303472118, + "flos": 25263380304000.0, + "grad_norm": 4.102742223327527, + "language_loss": 0.82459098, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.84589529, + "num_input_tokens_seen": 318007820, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11077881, + "step": 14748, + "time_per_iteration": 2.6163525581359863 + }, + { + "auxiliary_loss_clip": 0.0111064, + "auxiliary_loss_mlp": 0.01028393, + "balance_loss_clip": 1.03851676, + "balance_loss_mlp": 1.01675773, + "epoch": 0.8867578535998798, + "flos": 59541538691520.0, + "grad_norm": 2.0630125414013123, + "language_loss": 0.77487326, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.79626358, + "num_input_tokens_seen": 318030435, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11627197, + "step": 14749, + "time_per_iteration": 2.896397829055786 + }, + { + "auxiliary_loss_clip": 0.01112034, + "auxiliary_loss_mlp": 0.01029213, + "balance_loss_clip": 1.03670573, + "balance_loss_mlp": 1.01730359, + "epoch": 0.8868179768525477, + "flos": 25752799709280.0, + "grad_norm": 2.0017382356089595, + "language_loss": 0.6988858, + "learning_rate": 1.328135602550451e-07, + "loss": 0.72029829, + "num_input_tokens_seen": 318049465, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11914062, + "step": 14750, + "time_per_iteration": 2.619086980819702 + }, + { + "auxiliary_loss_clip": 0.01110014, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.03808451, + "balance_loss_mlp": 1.01940656, + "epoch": 0.8868781001052157, + "flos": 26638623545760.0, + "grad_norm": 1.721857394876625, + "language_loss": 0.5899232, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.61132622, + "num_input_tokens_seen": 318067760, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10876465, + "step": 14751, + "time_per_iteration": 3.9591662883758545 + }, + { + "auxiliary_loss_clip": 0.01110287, + "auxiliary_loss_mlp": 0.01027455, + "balance_loss_clip": 1.03887582, + "balance_loss_mlp": 1.01637411, + "epoch": 0.8869382233578836, + "flos": 16492235040000.0, + "grad_norm": 2.054137440911517, + "language_loss": 0.81791389, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.83929133, + "num_input_tokens_seen": 318082785, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11077881, + "step": 14752, + "time_per_iteration": 2.613090991973877 + }, + { + "auxiliary_loss_clip": 0.01114242, + "auxiliary_loss_mlp": 0.01030828, + "balance_loss_clip": 1.03848815, + "balance_loss_mlp": 1.01881135, + "epoch": 0.8869983466105517, + "flos": 27705372153120.0, + "grad_norm": 2.0989133373590176, + "language_loss": 0.79919755, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.82064819, + "num_input_tokens_seen": 318101925, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12011719, + "step": 14753, + "time_per_iteration": 2.6190812587738037 + }, + { + "auxiliary_loss_clip": 0.0110855, + "auxiliary_loss_mlp": 0.01029281, + "balance_loss_clip": 1.03733516, + "balance_loss_mlp": 1.018659, + "epoch": 0.8870584698632196, + "flos": 18718423090560.0, + "grad_norm": 1.6692116563723571, + "language_loss": 0.65549374, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.67687201, + "num_input_tokens_seen": 318119945, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10614014, + "step": 14754, + "time_per_iteration": 2.6714000701904297 + }, + { + "auxiliary_loss_clip": 0.01111869, + "auxiliary_loss_mlp": 0.01029224, + "balance_loss_clip": 1.03917432, + "balance_loss_mlp": 1.01784539, + "epoch": 0.8871185931158876, + "flos": 32476817898720.0, + "grad_norm": 2.002381043622042, + "language_loss": 0.7489593, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.77037019, + "num_input_tokens_seen": 318139685, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11376953, + "step": 14755, + "time_per_iteration": 2.6457064151763916 + }, + { + "auxiliary_loss_clip": 0.01110105, + "auxiliary_loss_mlp": 0.01029583, + "balance_loss_clip": 1.03700089, + "balance_loss_mlp": 1.01745915, + "epoch": 0.8871787163685555, + "flos": 26599287617280.0, + "grad_norm": 1.5028880207303232, + "language_loss": 0.77819526, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.79959214, + "num_input_tokens_seen": 318160375, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.12121582, + "step": 14756, + "time_per_iteration": 2.6758103370666504 + }, + { + "auxiliary_loss_clip": 0.01112424, + "auxiliary_loss_mlp": 0.01032986, + "balance_loss_clip": 1.03927207, + "balance_loss_mlp": 1.02171469, + "epoch": 0.8872388396212235, + "flos": 18184197924000.0, + "grad_norm": 2.3439185344770457, + "language_loss": 0.76402807, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.78548217, + "num_input_tokens_seen": 318177995, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11279297, + "step": 14757, + "time_per_iteration": 2.6043317317962646 + }, + { + "auxiliary_loss_clip": 0.01106502, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.03726149, + "balance_loss_mlp": 1.02195179, + "epoch": 0.8872989628738914, + "flos": 32253032196000.0, + "grad_norm": 2.1929050405881596, + "language_loss": 0.68160826, + "learning_rate": 1.316993656021632e-07, + "loss": 0.70300388, + "num_input_tokens_seen": 318197030, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.11108398, + "step": 14758, + "time_per_iteration": 2.693300247192383 + }, + { + "auxiliary_loss_clip": 0.01109164, + "auxiliary_loss_mlp": 0.01034013, + "balance_loss_clip": 1.0377202, + "balance_loss_mlp": 1.02169263, + "epoch": 0.8873590861265594, + "flos": 58745609998560.0, + "grad_norm": 1.8633874071980085, + "language_loss": 0.68785602, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.70928776, + "num_input_tokens_seen": 318221780, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.12316895, + "step": 14759, + "time_per_iteration": 2.8733468055725098 + }, + { + "auxiliary_loss_clip": 0.01107104, + "auxiliary_loss_mlp": 0.01025688, + "balance_loss_clip": 1.03579974, + "balance_loss_mlp": 1.0145117, + "epoch": 0.8874192093792275, + "flos": 22369724804160.0, + "grad_norm": 1.9827245323948854, + "language_loss": 0.74719226, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.76852012, + "num_input_tokens_seen": 318239710, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11169434, + "step": 14760, + "time_per_iteration": 2.685046672821045 + }, + { + "auxiliary_loss_clip": 0.01114103, + "auxiliary_loss_mlp": 0.01033554, + "balance_loss_clip": 1.03999448, + "balance_loss_mlp": 1.02182353, + "epoch": 0.8874793326318954, + "flos": 21834405671040.0, + "grad_norm": 2.6107036862151007, + "language_loss": 0.76055551, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.78203213, + "num_input_tokens_seen": 318257425, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11737061, + "step": 14761, + "time_per_iteration": 2.6160194873809814 + }, + { + "auxiliary_loss_clip": 0.01112247, + "auxiliary_loss_mlp": 0.01036971, + "balance_loss_clip": 1.03824937, + "balance_loss_mlp": 1.02555656, + "epoch": 0.8875394558845634, + "flos": 37951825639680.0, + "grad_norm": 1.7747920583656098, + "language_loss": 0.61246419, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.63395631, + "num_input_tokens_seen": 318278485, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11419678, + "step": 14762, + "time_per_iteration": 2.71335506439209 + }, + { + "auxiliary_loss_clip": 0.01110714, + "auxiliary_loss_mlp": 0.0103329, + "balance_loss_clip": 1.03795493, + "balance_loss_mlp": 1.02139938, + "epoch": 0.8875995791372313, + "flos": 25797605470560.0, + "grad_norm": 2.2217299152631598, + "language_loss": 0.6353035, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.65674353, + "num_input_tokens_seen": 318297560, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11883545, + "step": 14763, + "time_per_iteration": 2.617978572845459 + }, + { + "auxiliary_loss_clip": 0.01111364, + "auxiliary_loss_mlp": 0.01030188, + "balance_loss_clip": 1.0383625, + "balance_loss_mlp": 1.01870251, + "epoch": 0.8876597023898993, + "flos": 21298721882400.0, + "grad_norm": 2.7729230480333547, + "language_loss": 0.71313435, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.73454988, + "num_input_tokens_seen": 318313060, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11499023, + "step": 14764, + "time_per_iteration": 2.6027910709381104 + }, + { + "auxiliary_loss_clip": 0.01113478, + "auxiliary_loss_mlp": 0.01035253, + "balance_loss_clip": 1.03808093, + "balance_loss_mlp": 1.0243032, + "epoch": 0.8877198256425672, + "flos": 27708694570080.0, + "grad_norm": 2.425787281640522, + "language_loss": 0.65860152, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.68008882, + "num_input_tokens_seen": 318332030, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.10949707, + "step": 14765, + "time_per_iteration": 2.6581685543060303 + }, + { + "auxiliary_loss_clip": 0.01106987, + "auxiliary_loss_mlp": 0.01026081, + "balance_loss_clip": 1.03792524, + "balance_loss_mlp": 1.01621675, + "epoch": 0.8877799488952353, + "flos": 29938083485760.0, + "grad_norm": 1.7351434275042423, + "language_loss": 0.76584011, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.78717083, + "num_input_tokens_seen": 318351090, + "router_z_loss_clip": 0.69042969, + "router_z_loss_mlp": 0.09863281, + "step": 14766, + "time_per_iteration": 2.6858012676239014 + }, + { + "auxiliary_loss_clip": 0.01107414, + "auxiliary_loss_mlp": 0.01028225, + "balance_loss_clip": 1.03771269, + "balance_loss_mlp": 1.01759768, + "epoch": 0.8878400721479032, + "flos": 25574508561600.0, + "grad_norm": 1.7161434158151565, + "language_loss": 0.72936904, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.75072539, + "num_input_tokens_seen": 318372000, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10632324, + "step": 14767, + "time_per_iteration": 2.6555285453796387 + }, + { + "auxiliary_loss_clip": 0.01106122, + "auxiliary_loss_mlp": 0.01028928, + "balance_loss_clip": 1.03717089, + "balance_loss_mlp": 1.01815176, + "epoch": 0.8879001954005712, + "flos": 30866241529440.0, + "grad_norm": 1.8946052878175117, + "language_loss": 0.70654744, + "learning_rate": 1.303129987538778e-07, + "loss": 0.727898, + "num_input_tokens_seen": 318391530, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.10772705, + "step": 14768, + "time_per_iteration": 2.657527208328247 + }, + { + "auxiliary_loss_clip": 0.01107985, + "auxiliary_loss_mlp": 0.01027828, + "balance_loss_clip": 1.03703451, + "balance_loss_mlp": 1.01720023, + "epoch": 0.8879603186532391, + "flos": 28290966880320.0, + "grad_norm": 1.8331093736816693, + "language_loss": 0.70134491, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.72270298, + "num_input_tokens_seen": 318410690, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10626221, + "step": 14769, + "time_per_iteration": 2.661771297454834 + }, + { + "auxiliary_loss_clip": 0.011088, + "auxiliary_loss_mlp": 0.01029455, + "balance_loss_clip": 1.03860986, + "balance_loss_mlp": 1.01882756, + "epoch": 0.8880204419059071, + "flos": 16662503766240.0, + "grad_norm": 2.1224642699945875, + "language_loss": 0.66836888, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.68975139, + "num_input_tokens_seen": 318427380, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10620117, + "step": 14770, + "time_per_iteration": 2.6001696586608887 + }, + { + "auxiliary_loss_clip": 0.0110551, + "auxiliary_loss_mlp": 0.01032251, + "balance_loss_clip": 1.03705752, + "balance_loss_mlp": 1.02108097, + "epoch": 0.888080565158575, + "flos": 25174984299840.0, + "grad_norm": 1.7252903715069632, + "language_loss": 0.65212393, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.67350161, + "num_input_tokens_seen": 318448530, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.1116333, + "step": 14771, + "time_per_iteration": 2.682250738143921 + }, + { + "auxiliary_loss_clip": 0.01108184, + "auxiliary_loss_mlp": 0.01023851, + "balance_loss_clip": 1.03681493, + "balance_loss_mlp": 1.01325893, + "epoch": 0.888140688411243, + "flos": 34922942510400.0, + "grad_norm": 3.7599436778238116, + "language_loss": 0.82479674, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.84611714, + "num_input_tokens_seen": 318468655, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10595703, + "step": 14772, + "time_per_iteration": 4.099614858627319 + }, + { + "auxiliary_loss_clip": 0.01103848, + "auxiliary_loss_mlp": 0.01022557, + "balance_loss_clip": 1.03559208, + "balance_loss_mlp": 1.01246572, + "epoch": 0.8882008116639111, + "flos": 31140221791680.0, + "grad_norm": 1.6469937298473711, + "language_loss": 0.76292819, + "learning_rate": 1.296224737033258e-07, + "loss": 0.7841922, + "num_input_tokens_seen": 318488740, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.10095215, + "step": 14773, + "time_per_iteration": 2.731407403945923 + }, + { + "auxiliary_loss_clip": 0.01107131, + "auxiliary_loss_mlp": 0.01027189, + "balance_loss_clip": 1.03746998, + "balance_loss_mlp": 1.01683521, + "epoch": 0.888260934916579, + "flos": 33767959968000.0, + "grad_norm": 1.822560492341129, + "language_loss": 0.74889749, + "learning_rate": 1.294845814469907e-07, + "loss": 0.77024066, + "num_input_tokens_seen": 318508810, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10351562, + "step": 14774, + "time_per_iteration": 2.7181389331817627 + }, + { + "auxiliary_loss_clip": 0.01112794, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.03907394, + "balance_loss_mlp": 1.0199163, + "epoch": 0.888321058169247, + "flos": 26370437254560.0, + "grad_norm": 2.807188404741945, + "language_loss": 0.71866322, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.74011248, + "num_input_tokens_seen": 318526860, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12213135, + "step": 14775, + "time_per_iteration": 2.615408182144165 + }, + { + "auxiliary_loss_clip": 0.01108293, + "auxiliary_loss_mlp": 0.01026594, + "balance_loss_clip": 1.0369935, + "balance_loss_mlp": 1.01625276, + "epoch": 0.8883811814219149, + "flos": 22145493411360.0, + "grad_norm": 1.8666942415495187, + "language_loss": 0.80149066, + "learning_rate": 1.292090097299432e-07, + "loss": 0.82283956, + "num_input_tokens_seen": 318545180, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10321045, + "step": 14776, + "time_per_iteration": 2.6786770820617676 + }, + { + "auxiliary_loss_clip": 0.0111334, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.03733909, + "balance_loss_mlp": 1.01853633, + "epoch": 0.8884413046745829, + "flos": 34562835211680.0, + "grad_norm": 1.9628214626105673, + "language_loss": 0.69862163, + "learning_rate": 1.290713302796802e-07, + "loss": 0.72005403, + "num_input_tokens_seen": 318564350, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.11364746, + "step": 14777, + "time_per_iteration": 2.6602532863616943 + }, + { + "auxiliary_loss_clip": 0.01106751, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.03502834, + "balance_loss_mlp": 1.02365863, + "epoch": 0.8885014279272508, + "flos": 18496298596320.0, + "grad_norm": 1.8746606934079584, + "language_loss": 0.70023239, + "learning_rate": 1.2893372177522e-07, + "loss": 0.72164381, + "num_input_tokens_seen": 318582275, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.1072998, + "step": 14778, + "time_per_iteration": 2.6993021965026855 + }, + { + "auxiliary_loss_clip": 0.0110907, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.03707099, + "balance_loss_mlp": 1.02199912, + "epoch": 0.8885615511799189, + "flos": 23305175958240.0, + "grad_norm": 1.640609674594338, + "language_loss": 0.77591634, + "learning_rate": 1.287961842217804e-07, + "loss": 0.79733348, + "num_input_tokens_seen": 318601230, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10662842, + "step": 14779, + "time_per_iteration": 2.6188299655914307 + }, + { + "auxiliary_loss_clip": 0.01027993, + "auxiliary_loss_mlp": 0.01001481, + "balance_loss_clip": 1.00561976, + "balance_loss_mlp": 1.00047421, + "epoch": 0.8886216744325868, + "flos": 62458694213760.0, + "grad_norm": 0.8905182500824272, + "language_loss": 0.5675205, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.58781523, + "num_input_tokens_seen": 318645595, + "router_z_loss_clip": 0.22412109, + "router_z_loss_mlp": 0.01006317, + "step": 14780, + "time_per_iteration": 4.479472637176514 + }, + { + "auxiliary_loss_clip": 0.01028107, + "auxiliary_loss_mlp": 0.01001749, + "balance_loss_clip": 1.00565267, + "balance_loss_mlp": 1.00074995, + "epoch": 0.8886817976852548, + "flos": 75179229235200.0, + "grad_norm": 0.7923924236192474, + "language_loss": 0.62435126, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.64464986, + "num_input_tokens_seen": 318707850, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 0.00998688, + "step": 14781, + "time_per_iteration": 3.352903127670288 + }, + { + "auxiliary_loss_clip": 0.01028224, + "auxiliary_loss_mlp": 0.01001636, + "balance_loss_clip": 1.00575614, + "balance_loss_mlp": 1.00066543, + "epoch": 0.8887419209379227, + "flos": 74002002706080.0, + "grad_norm": 0.7842416466041604, + "language_loss": 0.58092034, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.60121894, + "num_input_tokens_seen": 318764915, + "router_z_loss_clip": 0.2244873, + "router_z_loss_mlp": 0.00969696, + "step": 14782, + "time_per_iteration": 3.0895352363586426 + }, + { + "auxiliary_loss_clip": 0.01109303, + "auxiliary_loss_mlp": 0.01025689, + "balance_loss_clip": 1.03984022, + "balance_loss_mlp": 1.01524591, + "epoch": 0.8888020441905907, + "flos": 35637646757760.0, + "grad_norm": 2.1263211235352886, + "language_loss": 0.6590811, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.68043101, + "num_input_tokens_seen": 318785660, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10449219, + "step": 14783, + "time_per_iteration": 4.087193250656128 + }, + { + "auxiliary_loss_clip": 0.01112236, + "auxiliary_loss_mlp": 0.0102847, + "balance_loss_clip": 1.03779483, + "balance_loss_mlp": 1.01672173, + "epoch": 0.8888621674432586, + "flos": 27043171950240.0, + "grad_norm": 1.5411841967021855, + "language_loss": 0.77588475, + "learning_rate": 1.281095609023415e-07, + "loss": 0.79729176, + "num_input_tokens_seen": 318806080, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11743164, + "step": 14784, + "time_per_iteration": 2.6698641777038574 + }, + { + "auxiliary_loss_clip": 0.01111882, + "auxiliary_loss_mlp": 0.01032224, + "balance_loss_clip": 1.0389533, + "balance_loss_mlp": 1.02065516, + "epoch": 0.8889222906959267, + "flos": 33766987553280.0, + "grad_norm": 2.5893717145007726, + "language_loss": 0.60557103, + "learning_rate": 1.279724491644565e-07, + "loss": 0.62701213, + "num_input_tokens_seen": 318826445, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11566162, + "step": 14785, + "time_per_iteration": 2.737661123275757 + }, + { + "auxiliary_loss_clip": 0.01111871, + "auxiliary_loss_mlp": 0.01028816, + "balance_loss_clip": 1.04004622, + "balance_loss_mlp": 1.01740813, + "epoch": 0.8889824139485947, + "flos": 17288528388480.0, + "grad_norm": 1.9529471148093256, + "language_loss": 0.65083146, + "learning_rate": 1.278354084140445e-07, + "loss": 0.67223829, + "num_input_tokens_seen": 318843915, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11413574, + "step": 14786, + "time_per_iteration": 2.6909799575805664 + }, + { + "auxiliary_loss_clip": 0.01114675, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.03830433, + "balance_loss_mlp": 1.01701772, + "epoch": 0.8890425372012626, + "flos": 15683300300160.0, + "grad_norm": 2.4996609578590956, + "language_loss": 0.85190815, + "learning_rate": 1.276984386563009e-07, + "loss": 0.873344, + "num_input_tokens_seen": 318859670, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.11895752, + "step": 14787, + "time_per_iteration": 2.572721242904663 + }, + { + "auxiliary_loss_clip": 0.01110378, + "auxiliary_loss_mlp": 0.01028167, + "balance_loss_clip": 1.03829181, + "balance_loss_mlp": 1.01705718, + "epoch": 0.8891026604539306, + "flos": 26465397058080.0, + "grad_norm": 1.990654602360749, + "language_loss": 0.71444285, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.73582828, + "num_input_tokens_seen": 318877855, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11108398, + "step": 14788, + "time_per_iteration": 2.694603443145752 + }, + { + "auxiliary_loss_clip": 0.01106007, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.03734875, + "balance_loss_mlp": 1.02000833, + "epoch": 0.8891627837065985, + "flos": 26687683621440.0, + "grad_norm": 1.646503058791912, + "language_loss": 0.69978666, + "learning_rate": 1.274247121395935e-07, + "loss": 0.72115231, + "num_input_tokens_seen": 318896045, + "router_z_loss_clip": 0.68603516, + "router_z_loss_mlp": 0.10552979, + "step": 14789, + "time_per_iteration": 2.6212103366851807 + }, + { + "auxiliary_loss_clip": 0.01109898, + "auxiliary_loss_mlp": 0.01026053, + "balance_loss_clip": 1.03904068, + "balance_loss_mlp": 1.01468027, + "epoch": 0.8892229069592665, + "flos": 26332276327200.0, + "grad_norm": 1.552360349543588, + "language_loss": 0.70405328, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.72541285, + "num_input_tokens_seen": 318915515, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11376953, + "step": 14790, + "time_per_iteration": 2.671400785446167 + }, + { + "auxiliary_loss_clip": 0.01110409, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.03836572, + "balance_loss_mlp": 1.01672959, + "epoch": 0.8892830302119344, + "flos": 28157522011200.0, + "grad_norm": 1.8765859999233034, + "language_loss": 0.73075354, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.75212818, + "num_input_tokens_seen": 318934305, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10327148, + "step": 14791, + "time_per_iteration": 3.917259693145752 + }, + { + "auxiliary_loss_clip": 0.01109396, + "auxiliary_loss_mlp": 0.01033161, + "balance_loss_clip": 1.03955555, + "balance_loss_mlp": 1.0220747, + "epoch": 0.8893431534646025, + "flos": 28153186662240.0, + "grad_norm": 1.7529271349871167, + "language_loss": 0.74083745, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.762263, + "num_input_tokens_seen": 318953880, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.11096191, + "step": 14792, + "time_per_iteration": 2.6245009899139404 + }, + { + "auxiliary_loss_clip": 0.01114497, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.03952813, + "balance_loss_mlp": 1.01790977, + "epoch": 0.8894032767172704, + "flos": 27399794762880.0, + "grad_norm": 2.2237936073441866, + "language_loss": 0.66252005, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.68396533, + "num_input_tokens_seen": 318971395, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12115479, + "step": 14793, + "time_per_iteration": 2.6634910106658936 + }, + { + "auxiliary_loss_clip": 0.01112521, + "auxiliary_loss_mlp": 0.01030991, + "balance_loss_clip": 1.03874421, + "balance_loss_mlp": 1.01876616, + "epoch": 0.8894633999699384, + "flos": 30918097297440.0, + "grad_norm": 1.8179372967194314, + "language_loss": 0.71734667, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.73878181, + "num_input_tokens_seen": 318990580, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12213135, + "step": 14794, + "time_per_iteration": 2.6370768547058105 + }, + { + "auxiliary_loss_clip": 0.01115653, + "auxiliary_loss_mlp": 0.01030951, + "balance_loss_clip": 1.04008341, + "balance_loss_mlp": 1.01931, + "epoch": 0.8895235232226063, + "flos": 25617247941600.0, + "grad_norm": 1.6377859298964472, + "language_loss": 0.75271857, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.77418464, + "num_input_tokens_seen": 319010040, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11645508, + "step": 14795, + "time_per_iteration": 2.645515203475952 + }, + { + "auxiliary_loss_clip": 0.010278, + "auxiliary_loss_mlp": 0.01001099, + "balance_loss_clip": 1.00543404, + "balance_loss_mlp": 1.00009704, + "epoch": 0.8895836464752743, + "flos": 85089352716000.0, + "grad_norm": 0.7665183984847201, + "language_loss": 0.56079906, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.58108807, + "num_input_tokens_seen": 319063860, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.0100174, + "step": 14796, + "time_per_iteration": 3.1548871994018555 + }, + { + "auxiliary_loss_clip": 0.01112294, + "auxiliary_loss_mlp": 0.01032402, + "balance_loss_clip": 1.03851414, + "balance_loss_mlp": 1.01994514, + "epoch": 0.8896437697279422, + "flos": 28335489020640.0, + "grad_norm": 1.8892958424079367, + "language_loss": 0.70215166, + "learning_rate": 1.263326468169843e-07, + "loss": 0.7235986, + "num_input_tokens_seen": 319082335, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.12445068, + "step": 14797, + "time_per_iteration": 2.6553547382354736 + }, + { + "auxiliary_loss_clip": 0.010279, + "auxiliary_loss_mlp": 0.01001396, + "balance_loss_clip": 1.00553226, + "balance_loss_mlp": 1.00042927, + "epoch": 0.8897038929806103, + "flos": 86333906263680.0, + "grad_norm": 0.753388294452339, + "language_loss": 0.58011246, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.60040545, + "num_input_tokens_seen": 319147075, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.00965118, + "step": 14798, + "time_per_iteration": 3.264122486114502 + }, + { + "auxiliary_loss_clip": 0.01111394, + "auxiliary_loss_mlp": 0.01026563, + "balance_loss_clip": 1.03911459, + "balance_loss_mlp": 1.01510727, + "epoch": 0.8897640162332782, + "flos": 23482778312160.0, + "grad_norm": 2.8720016133301116, + "language_loss": 0.78859657, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.80997616, + "num_input_tokens_seen": 319166630, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11456299, + "step": 14799, + "time_per_iteration": 2.632625102996826 + }, + { + "auxiliary_loss_clip": 0.01028073, + "auxiliary_loss_mlp": 0.01002216, + "balance_loss_clip": 1.00573456, + "balance_loss_mlp": 1.00125837, + "epoch": 0.8898241394859462, + "flos": 50461094797920.0, + "grad_norm": 0.9110899764147458, + "language_loss": 0.58018291, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.6004858, + "num_input_tokens_seen": 319221865, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00957489, + "step": 14800, + "time_per_iteration": 3.182435989379883 + }, + { + "auxiliary_loss_clip": 0.01111427, + "auxiliary_loss_mlp": 0.01030743, + "balance_loss_clip": 1.03917313, + "balance_loss_mlp": 1.02006173, + "epoch": 0.8898842627386142, + "flos": 23167274188320.0, + "grad_norm": 1.7324726695362722, + "language_loss": 0.65947282, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.68089449, + "num_input_tokens_seen": 319240710, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10675049, + "step": 14801, + "time_per_iteration": 2.624706745147705 + }, + { + "auxiliary_loss_clip": 0.01114234, + "auxiliary_loss_mlp": 0.01039425, + "balance_loss_clip": 1.04040825, + "balance_loss_mlp": 1.02684879, + "epoch": 0.8899443859912821, + "flos": 16127751875040.0, + "grad_norm": 2.3677744086145442, + "language_loss": 0.75698173, + "learning_rate": 1.256524149358682e-07, + "loss": 0.77851832, + "num_input_tokens_seen": 319256495, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12567139, + "step": 14802, + "time_per_iteration": 2.627607583999634 + }, + { + "auxiliary_loss_clip": 0.01108639, + "auxiliary_loss_mlp": 0.01029193, + "balance_loss_clip": 1.0391624, + "balance_loss_mlp": 1.01836312, + "epoch": 0.8900045092439501, + "flos": 27667818984960.0, + "grad_norm": 1.842494219701139, + "language_loss": 0.73686826, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.7582466, + "num_input_tokens_seen": 319273620, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.1083374, + "step": 14803, + "time_per_iteration": 2.709111452102661 + }, + { + "auxiliary_loss_clip": 0.01106897, + "auxiliary_loss_mlp": 0.01028242, + "balance_loss_clip": 1.03597236, + "balance_loss_mlp": 1.01724458, + "epoch": 0.890064632496618, + "flos": 25797119263200.0, + "grad_norm": 2.004386809182406, + "language_loss": 0.71871948, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.74007088, + "num_input_tokens_seen": 319291720, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10998535, + "step": 14804, + "time_per_iteration": 2.663931131362915 + }, + { + "auxiliary_loss_clip": 0.01108644, + "auxiliary_loss_mlp": 0.01030641, + "balance_loss_clip": 1.03616762, + "balance_loss_mlp": 1.01949477, + "epoch": 0.8901247557492861, + "flos": 28549996266240.0, + "grad_norm": 1.9909047100393606, + "language_loss": 0.81166637, + "learning_rate": 1.252451286713123e-07, + "loss": 0.83305919, + "num_input_tokens_seen": 319310380, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11132812, + "step": 14805, + "time_per_iteration": 2.623121976852417 + }, + { + "auxiliary_loss_clip": 0.01112605, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.03804028, + "balance_loss_mlp": 1.01722348, + "epoch": 0.890184879001954, + "flos": 35593367721120.0, + "grad_norm": 1.9514419410987471, + "language_loss": 0.66899931, + "learning_rate": 1.251095087580505e-07, + "loss": 0.69041204, + "num_input_tokens_seen": 319331765, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11444092, + "step": 14806, + "time_per_iteration": 2.713136672973633 + }, + { + "auxiliary_loss_clip": 0.01108321, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.03650832, + "balance_loss_mlp": 1.01979637, + "epoch": 0.890245002254622, + "flos": 17604478202400.0, + "grad_norm": 1.8952248988555342, + "language_loss": 0.67382932, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.69522476, + "num_input_tokens_seen": 319349135, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11431885, + "step": 14807, + "time_per_iteration": 2.63077974319458 + }, + { + "auxiliary_loss_clip": 0.01108083, + "auxiliary_loss_mlp": 0.01027422, + "balance_loss_clip": 1.03635156, + "balance_loss_mlp": 1.01643109, + "epoch": 0.8903051255072899, + "flos": 27310953068640.0, + "grad_norm": 2.190878337200849, + "language_loss": 0.75500119, + "learning_rate": 1.248384822247732e-07, + "loss": 0.77635628, + "num_input_tokens_seen": 319368410, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10992432, + "step": 14808, + "time_per_iteration": 2.628211259841919 + }, + { + "auxiliary_loss_clip": 0.01110514, + "auxiliary_loss_mlp": 0.01032488, + "balance_loss_clip": 1.03779745, + "balance_loss_mlp": 1.02208078, + "epoch": 0.8903652487599579, + "flos": 25352586653760.0, + "grad_norm": 2.517916844655214, + "language_loss": 0.8142181, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.83564818, + "num_input_tokens_seen": 319387535, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.10406494, + "step": 14809, + "time_per_iteration": 2.6558525562286377 + }, + { + "auxiliary_loss_clip": 0.01109374, + "auxiliary_loss_mlp": 0.01028129, + "balance_loss_clip": 1.03753793, + "balance_loss_mlp": 1.01750195, + "epoch": 0.8904253720126258, + "flos": 29803220511840.0, + "grad_norm": 2.1834675039960842, + "language_loss": 0.67993081, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.70130587, + "num_input_tokens_seen": 319407210, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10638428, + "step": 14810, + "time_per_iteration": 2.6653881072998047 + }, + { + "auxiliary_loss_clip": 0.0111136, + "auxiliary_loss_mlp": 0.01026423, + "balance_loss_clip": 1.03711712, + "balance_loss_mlp": 1.01500845, + "epoch": 0.8904854952652939, + "flos": 23749749084960.0, + "grad_norm": 2.3030272271726595, + "language_loss": 0.70598227, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.72736013, + "num_input_tokens_seen": 319425340, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11425781, + "step": 14811, + "time_per_iteration": 4.049888610839844 + }, + { + "auxiliary_loss_clip": 0.01112611, + "auxiliary_loss_mlp": 0.01026366, + "balance_loss_clip": 1.03948843, + "balance_loss_mlp": 1.01589382, + "epoch": 0.8905456185179618, + "flos": 61989810719040.0, + "grad_norm": 2.0264756279230403, + "language_loss": 0.65308857, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.67447829, + "num_input_tokens_seen": 319448150, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.10473633, + "step": 14812, + "time_per_iteration": 2.921128988265991 + }, + { + "auxiliary_loss_clip": 0.01109263, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.03786278, + "balance_loss_mlp": 1.01835084, + "epoch": 0.8906057417706298, + "flos": 21700960801920.0, + "grad_norm": 2.0790086376678927, + "language_loss": 0.68460232, + "learning_rate": 1.24162160341861e-07, + "loss": 0.70598435, + "num_input_tokens_seen": 319466115, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10577393, + "step": 14813, + "time_per_iteration": 2.5907845497131348 + }, + { + "auxiliary_loss_clip": 0.01115779, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.03864646, + "balance_loss_mlp": 1.0180788, + "epoch": 0.8906658650232978, + "flos": 26777254626720.0, + "grad_norm": 2.214585373439158, + "language_loss": 0.75266933, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.77413636, + "num_input_tokens_seen": 319485255, + "router_z_loss_clip": 0.77148438, + "router_z_loss_mlp": 0.12841797, + "step": 14814, + "time_per_iteration": 2.635091781616211 + }, + { + "auxiliary_loss_clip": 0.01111423, + "auxiliary_loss_mlp": 0.01025198, + "balance_loss_clip": 1.03737783, + "balance_loss_mlp": 1.01383686, + "epoch": 0.8907259882759657, + "flos": 25975491445440.0, + "grad_norm": 2.607564639060002, + "language_loss": 0.7417354, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.76310164, + "num_input_tokens_seen": 319501800, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11352539, + "step": 14815, + "time_per_iteration": 2.6186485290527344 + }, + { + "auxiliary_loss_clip": 0.01107118, + "auxiliary_loss_mlp": 0.0102903, + "balance_loss_clip": 1.03750563, + "balance_loss_mlp": 1.0176692, + "epoch": 0.8907861115286337, + "flos": 24550945024320.0, + "grad_norm": 1.9936038079962046, + "language_loss": 0.75308096, + "learning_rate": 1.237572207545914e-07, + "loss": 0.77444243, + "num_input_tokens_seen": 319520415, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.11358643, + "step": 14816, + "time_per_iteration": 2.7061870098114014 + }, + { + "auxiliary_loss_clip": 0.01110026, + "auxiliary_loss_mlp": 0.01025252, + "balance_loss_clip": 1.03703451, + "balance_loss_mlp": 1.01423693, + "epoch": 0.8908462347813016, + "flos": 24506665987680.0, + "grad_norm": 2.07358187842269, + "language_loss": 0.77416074, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.79551357, + "num_input_tokens_seen": 319538410, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11010742, + "step": 14817, + "time_per_iteration": 2.6033217906951904 + }, + { + "auxiliary_loss_clip": 0.01027881, + "auxiliary_loss_mlp": 0.01000903, + "balance_loss_clip": 1.00549817, + "balance_loss_mlp": 0.99992967, + "epoch": 0.8909063580339697, + "flos": 79928147373120.0, + "grad_norm": 0.7376287689430808, + "language_loss": 0.56492484, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58521265, + "num_input_tokens_seen": 319602565, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.00971985, + "step": 14818, + "time_per_iteration": 3.3155806064605713 + }, + { + "auxiliary_loss_clip": 0.01109311, + "auxiliary_loss_mlp": 0.01033981, + "balance_loss_clip": 1.03759539, + "balance_loss_mlp": 1.02275193, + "epoch": 0.8909664812866376, + "flos": 36439572008160.0, + "grad_norm": 1.7502396097413522, + "language_loss": 0.64392412, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.66535699, + "num_input_tokens_seen": 319624645, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11218262, + "step": 14819, + "time_per_iteration": 4.135153532028198 + }, + { + "auxiliary_loss_clip": 0.01110941, + "auxiliary_loss_mlp": 0.01030737, + "balance_loss_clip": 1.03774822, + "balance_loss_mlp": 1.01887012, + "epoch": 0.8910266045393056, + "flos": 31051015441920.0, + "grad_norm": 2.4343091989364676, + "language_loss": 0.78555501, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.80697179, + "num_input_tokens_seen": 319644040, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11877441, + "step": 14820, + "time_per_iteration": 2.6487720012664795 + }, + { + "auxiliary_loss_clip": 0.01111091, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.03842521, + "balance_loss_mlp": 1.02052641, + "epoch": 0.8910867277919735, + "flos": 29894128587360.0, + "grad_norm": 1.8660878759114456, + "language_loss": 0.76609981, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.78752726, + "num_input_tokens_seen": 319663930, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11132812, + "step": 14821, + "time_per_iteration": 2.699219226837158 + }, + { + "auxiliary_loss_clip": 0.01027846, + "auxiliary_loss_mlp": 0.01000951, + "balance_loss_clip": 1.00549519, + "balance_loss_mlp": 0.99997956, + "epoch": 0.8911468510446415, + "flos": 74051832610080.0, + "grad_norm": 0.8677580254690694, + "language_loss": 0.59273559, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.61302358, + "num_input_tokens_seen": 319721245, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00970459, + "step": 14822, + "time_per_iteration": 3.1297054290771484 + }, + { + "auxiliary_loss_clip": 0.01109554, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.03665709, + "balance_loss_mlp": 1.02026486, + "epoch": 0.8912069742973094, + "flos": 30917246434560.0, + "grad_norm": 1.8450225186834859, + "language_loss": 0.6927976, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.71420795, + "num_input_tokens_seen": 319741200, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11218262, + "step": 14823, + "time_per_iteration": 4.1982996463775635 + }, + { + "auxiliary_loss_clip": 0.01106914, + "auxiliary_loss_mlp": 0.01029065, + "balance_loss_clip": 1.03618526, + "balance_loss_mlp": 1.01748335, + "epoch": 0.8912670975499775, + "flos": 22236158383200.0, + "grad_norm": 2.8622756386917585, + "language_loss": 0.69241202, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.71377182, + "num_input_tokens_seen": 319759265, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.1159668, + "step": 14824, + "time_per_iteration": 2.6874477863311768 + }, + { + "auxiliary_loss_clip": 0.01112733, + "auxiliary_loss_mlp": 0.01029728, + "balance_loss_clip": 1.03827727, + "balance_loss_mlp": 1.01771188, + "epoch": 0.8913272208026454, + "flos": 32341347165600.0, + "grad_norm": 2.3366328644879415, + "language_loss": 0.70642436, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.72784901, + "num_input_tokens_seen": 319777560, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12011719, + "step": 14825, + "time_per_iteration": 2.6477537155151367 + }, + { + "auxiliary_loss_clip": 0.01108704, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.0368135, + "balance_loss_mlp": 1.01759744, + "epoch": 0.8913873440553134, + "flos": 22942921243680.0, + "grad_norm": 2.1156374966864746, + "language_loss": 0.71538889, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.73676872, + "num_input_tokens_seen": 319794125, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11676025, + "step": 14826, + "time_per_iteration": 2.646530866622925 + }, + { + "auxiliary_loss_clip": 0.01109034, + "auxiliary_loss_mlp": 0.01027339, + "balance_loss_clip": 1.03791618, + "balance_loss_mlp": 1.01665175, + "epoch": 0.8914474673079814, + "flos": 25483965141600.0, + "grad_norm": 2.097882444992474, + "language_loss": 0.75189275, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.77325648, + "num_input_tokens_seen": 319810310, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10687256, + "step": 14827, + "time_per_iteration": 2.59146785736084 + }, + { + "auxiliary_loss_clip": 0.01111, + "auxiliary_loss_mlp": 0.01027594, + "balance_loss_clip": 1.03812242, + "balance_loss_mlp": 1.01631665, + "epoch": 0.8915075905606493, + "flos": 25566891312960.0, + "grad_norm": 1.9993915983467239, + "language_loss": 0.78257263, + "learning_rate": 1.221438670423336e-07, + "loss": 0.80395854, + "num_input_tokens_seen": 319828505, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11273193, + "step": 14828, + "time_per_iteration": 2.6756672859191895 + }, + { + "auxiliary_loss_clip": 0.01109409, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.03766727, + "balance_loss_mlp": 1.01884651, + "epoch": 0.8915677138133173, + "flos": 28157886666720.0, + "grad_norm": 2.0548022164785733, + "language_loss": 0.75650299, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.77790105, + "num_input_tokens_seen": 319848680, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11547852, + "step": 14829, + "time_per_iteration": 2.637929677963257 + }, + { + "auxiliary_loss_clip": 0.01108901, + "auxiliary_loss_mlp": 0.01030779, + "balance_loss_clip": 1.03646314, + "balance_loss_mlp": 1.02046764, + "epoch": 0.8916278370659853, + "flos": 28602095137920.0, + "grad_norm": 1.5767739833785204, + "language_loss": 0.84560251, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.86699939, + "num_input_tokens_seen": 319868835, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10321045, + "step": 14830, + "time_per_iteration": 2.64618182182312 + }, + { + "auxiliary_loss_clip": 0.01104691, + "auxiliary_loss_mlp": 0.01026581, + "balance_loss_clip": 1.03584671, + "balance_loss_mlp": 1.01634073, + "epoch": 0.8916879603186533, + "flos": 30700753842240.0, + "grad_norm": 1.3215844723704244, + "language_loss": 0.74811167, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.76942432, + "num_input_tokens_seen": 319891585, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.10247803, + "step": 14831, + "time_per_iteration": 4.0203821659088135 + }, + { + "auxiliary_loss_clip": 0.0111152, + "auxiliary_loss_mlp": 0.01026499, + "balance_loss_clip": 1.03723025, + "balance_loss_mlp": 1.01503158, + "epoch": 0.8917480835713212, + "flos": 24684470928000.0, + "grad_norm": 2.4204658500315617, + "language_loss": 0.73589563, + "learning_rate": 1.216083607088847e-07, + "loss": 0.75727582, + "num_input_tokens_seen": 319910315, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11468506, + "step": 14832, + "time_per_iteration": 2.6182048320770264 + }, + { + "auxiliary_loss_clip": 0.01110934, + "auxiliary_loss_mlp": 0.01027931, + "balance_loss_clip": 1.03655577, + "balance_loss_mlp": 1.01667786, + "epoch": 0.8918082068239892, + "flos": 31850226034560.0, + "grad_norm": 2.4659007818916736, + "language_loss": 0.67399341, + "learning_rate": 1.214746621848355e-07, + "loss": 0.695382, + "num_input_tokens_seen": 319932275, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11242676, + "step": 14833, + "time_per_iteration": 2.6949195861816406 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_clip": 1.03954566, + "balance_loss_mlp": 1.01934874, + "epoch": 0.8918683300766571, + "flos": 30299284751040.0, + "grad_norm": 2.0065220987269674, + "language_loss": 0.73691308, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.75837529, + "num_input_tokens_seen": 319955335, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.12164307, + "step": 14834, + "time_per_iteration": 2.6898863315582275 + }, + { + "auxiliary_loss_clip": 0.01109015, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.03761971, + "balance_loss_mlp": 1.02162194, + "epoch": 0.8919284533293251, + "flos": 27217330335360.0, + "grad_norm": 3.075330572077687, + "language_loss": 0.78817606, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.80959463, + "num_input_tokens_seen": 319973990, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11224365, + "step": 14835, + "time_per_iteration": 2.6780519485473633 + }, + { + "auxiliary_loss_clip": 0.0110499, + "auxiliary_loss_mlp": 0.01026744, + "balance_loss_clip": 1.03491712, + "balance_loss_mlp": 1.01615191, + "epoch": 0.891988576581993, + "flos": 37062517317120.0, + "grad_norm": 1.4718725394815193, + "language_loss": 0.74013972, + "learning_rate": 1.210739940361689e-07, + "loss": 0.76145709, + "num_input_tokens_seen": 319995555, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10595703, + "step": 14836, + "time_per_iteration": 2.6829071044921875 + }, + { + "auxiliary_loss_clip": 0.01108306, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.03581774, + "balance_loss_mlp": 1.01860642, + "epoch": 0.8920486998346611, + "flos": 18977979201120.0, + "grad_norm": 2.3443304107355343, + "language_loss": 0.68687719, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.70825696, + "num_input_tokens_seen": 320012385, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.1105957, + "step": 14837, + "time_per_iteration": 2.6203763484954834 + }, + { + "auxiliary_loss_clip": 0.01114891, + "auxiliary_loss_mlp": 0.0103, + "balance_loss_clip": 1.0388149, + "balance_loss_mlp": 1.01828766, + "epoch": 0.892108823087329, + "flos": 25886609233920.0, + "grad_norm": 1.6885471695028627, + "language_loss": 0.67468178, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.69613075, + "num_input_tokens_seen": 320032390, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11706543, + "step": 14838, + "time_per_iteration": 2.633906364440918 + }, + { + "auxiliary_loss_clip": 0.01110134, + "auxiliary_loss_mlp": 0.01027475, + "balance_loss_clip": 1.03662562, + "balance_loss_mlp": 1.01548862, + "epoch": 0.892168946339997, + "flos": 26819102626560.0, + "grad_norm": 2.1915499539923156, + "language_loss": 0.76614869, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.78752482, + "num_input_tokens_seen": 320052885, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11987305, + "step": 14839, + "time_per_iteration": 2.6520566940307617 + }, + { + "auxiliary_loss_clip": 0.01027709, + "auxiliary_loss_mlp": 0.01001048, + "balance_loss_clip": 1.00538862, + "balance_loss_mlp": 1.00005281, + "epoch": 0.892229069592665, + "flos": 82333963641600.0, + "grad_norm": 0.6787102359716426, + "language_loss": 0.49361542, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51390302, + "num_input_tokens_seen": 320113685, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00994873, + "step": 14840, + "time_per_iteration": 3.2235116958618164 + }, + { + "auxiliary_loss_clip": 0.01115174, + "auxiliary_loss_mlp": 0.01036443, + "balance_loss_clip": 1.03847349, + "balance_loss_mlp": 1.02409291, + "epoch": 0.8922891928453329, + "flos": 23744887011360.0, + "grad_norm": 2.2863999820601997, + "language_loss": 0.64374936, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.66526556, + "num_input_tokens_seen": 320130810, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12353516, + "step": 14841, + "time_per_iteration": 2.6077632904052734 + }, + { + "auxiliary_loss_clip": 0.01106963, + "auxiliary_loss_mlp": 0.01031979, + "balance_loss_clip": 1.03753257, + "balance_loss_mlp": 1.02209699, + "epoch": 0.8923493160980009, + "flos": 28513172409120.0, + "grad_norm": 2.0087140030398984, + "language_loss": 0.68215096, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.70354038, + "num_input_tokens_seen": 320152170, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.09881592, + "step": 14842, + "time_per_iteration": 2.6999716758728027 + }, + { + "auxiliary_loss_clip": 0.01109246, + "auxiliary_loss_mlp": 0.01030973, + "balance_loss_clip": 1.03935862, + "balance_loss_mlp": 1.02084064, + "epoch": 0.8924094393506689, + "flos": 31941701352000.0, + "grad_norm": 2.007912793614006, + "language_loss": 0.80058157, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.8219837, + "num_input_tokens_seen": 320172360, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.10137939, + "step": 14843, + "time_per_iteration": 2.6654398441314697 + }, + { + "auxiliary_loss_clip": 0.01113286, + "auxiliary_loss_mlp": 0.01031523, + "balance_loss_clip": 1.03872705, + "balance_loss_mlp": 1.01980507, + "epoch": 0.8924695626033369, + "flos": 26867068735680.0, + "grad_norm": 2.2661742958687983, + "language_loss": 0.67894435, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.70039248, + "num_input_tokens_seen": 320192130, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.1171875, + "step": 14844, + "time_per_iteration": 2.659595489501953 + }, + { + "auxiliary_loss_clip": 0.01110898, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.03863859, + "balance_loss_mlp": 1.01853037, + "epoch": 0.8925296858560048, + "flos": 18051117710400.0, + "grad_norm": 3.211507223166166, + "language_loss": 0.91387522, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.93527639, + "num_input_tokens_seen": 320207760, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10675049, + "step": 14845, + "time_per_iteration": 2.5978682041168213 + }, + { + "auxiliary_loss_clip": 0.01107849, + "auxiliary_loss_mlp": 0.010242, + "balance_loss_clip": 1.03814745, + "balance_loss_mlp": 1.01409698, + "epoch": 0.8925898091086728, + "flos": 27267484377600.0, + "grad_norm": 2.5247903815157464, + "language_loss": 0.72115129, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.74247181, + "num_input_tokens_seen": 320225325, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10107422, + "step": 14846, + "time_per_iteration": 2.6940011978149414 + }, + { + "auxiliary_loss_clip": 0.01111226, + "auxiliary_loss_mlp": 0.01032404, + "balance_loss_clip": 1.0380615, + "balance_loss_mlp": 1.02113867, + "epoch": 0.8926499323613407, + "flos": 55894855947840.0, + "grad_norm": 1.6182177173328547, + "language_loss": 0.57205772, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.593494, + "num_input_tokens_seen": 320247645, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.1126709, + "step": 14847, + "time_per_iteration": 2.817591428756714 + }, + { + "auxiliary_loss_clip": 0.01108784, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.03584898, + "balance_loss_mlp": 1.0182426, + "epoch": 0.8927100556140087, + "flos": 26999581707360.0, + "grad_norm": 9.999370243525457, + "language_loss": 0.76263046, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.78400499, + "num_input_tokens_seen": 320266005, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.10412598, + "step": 14848, + "time_per_iteration": 2.662595748901367 + }, + { + "auxiliary_loss_clip": 0.01108663, + "auxiliary_loss_mlp": 0.01028065, + "balance_loss_clip": 1.03792751, + "balance_loss_mlp": 1.01711547, + "epoch": 0.8927701788666766, + "flos": 34569034355520.0, + "grad_norm": 3.717909783734938, + "language_loss": 0.69134849, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.71271574, + "num_input_tokens_seen": 320285555, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10955811, + "step": 14849, + "time_per_iteration": 2.673251152038574 + }, + { + "auxiliary_loss_clip": 0.01112463, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.03984714, + "balance_loss_mlp": 1.02152061, + "epoch": 0.8928303021193447, + "flos": 30865917391200.0, + "grad_norm": 1.686275827152221, + "language_loss": 0.80745065, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.82889915, + "num_input_tokens_seen": 320305395, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10876465, + "step": 14850, + "time_per_iteration": 2.7255938053131104 + }, + { + "auxiliary_loss_clip": 0.01109622, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.03930473, + "balance_loss_mlp": 1.0208503, + "epoch": 0.8928904253720126, + "flos": 27133391232000.0, + "grad_norm": 1.5518007191726755, + "language_loss": 0.74939001, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.77080709, + "num_input_tokens_seen": 320324220, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11248779, + "step": 14851, + "time_per_iteration": 4.09346604347229 + }, + { + "auxiliary_loss_clip": 0.01107666, + "auxiliary_loss_mlp": 0.01027691, + "balance_loss_clip": 1.03721523, + "balance_loss_mlp": 1.01622891, + "epoch": 0.8929505486246806, + "flos": 33058360897920.0, + "grad_norm": 1.7595168139848003, + "language_loss": 0.78629667, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.80765021, + "num_input_tokens_seen": 320347195, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11462402, + "step": 14852, + "time_per_iteration": 2.6821789741516113 + }, + { + "auxiliary_loss_clip": 0.01107963, + "auxiliary_loss_mlp": 0.01029469, + "balance_loss_clip": 1.03816915, + "balance_loss_mlp": 1.01884186, + "epoch": 0.8930106718773486, + "flos": 28112837801760.0, + "grad_norm": 1.470829130333945, + "language_loss": 0.69198304, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.71335733, + "num_input_tokens_seen": 320366850, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10620117, + "step": 14853, + "time_per_iteration": 2.6479012966156006 + }, + { + "auxiliary_loss_clip": 0.01109629, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.03789794, + "balance_loss_mlp": 1.02011037, + "epoch": 0.8930707951300165, + "flos": 43472854660320.0, + "grad_norm": 1.820971921789754, + "language_loss": 0.67013377, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.69154531, + "num_input_tokens_seen": 320388895, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11419678, + "step": 14854, + "time_per_iteration": 2.768951416015625 + }, + { + "auxiliary_loss_clip": 0.01107111, + "auxiliary_loss_mlp": 0.01027369, + "balance_loss_clip": 1.03769708, + "balance_loss_mlp": 1.01714635, + "epoch": 0.8931309183826845, + "flos": 28113445560960.0, + "grad_norm": 1.5200006421464078, + "language_loss": 0.74563044, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.76697522, + "num_input_tokens_seen": 320408520, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10229492, + "step": 14855, + "time_per_iteration": 2.648144245147705 + }, + { + "auxiliary_loss_clip": 0.01109034, + "auxiliary_loss_mlp": 0.01030556, + "balance_loss_clip": 1.03770351, + "balance_loss_mlp": 1.01983356, + "epoch": 0.8931910416353525, + "flos": 32341509234720.0, + "grad_norm": 2.252260574423321, + "language_loss": 0.64255667, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.66395253, + "num_input_tokens_seen": 320427400, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.1072998, + "step": 14856, + "time_per_iteration": 2.6808629035949707 + }, + { + "auxiliary_loss_clip": 0.01109114, + "auxiliary_loss_mlp": 0.01026453, + "balance_loss_clip": 1.03701043, + "balance_loss_mlp": 1.01591444, + "epoch": 0.8932511648880205, + "flos": 30473118997920.0, + "grad_norm": 1.6951928877321867, + "language_loss": 0.66168392, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.68303967, + "num_input_tokens_seen": 320447570, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10546875, + "step": 14857, + "time_per_iteration": 2.696486234664917 + }, + { + "auxiliary_loss_clip": 0.01112398, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.03940713, + "balance_loss_mlp": 1.02167177, + "epoch": 0.8933112881406884, + "flos": 29847459031200.0, + "grad_norm": 2.4840370600028465, + "language_loss": 0.7482034, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.76965797, + "num_input_tokens_seen": 320464405, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11376953, + "step": 14858, + "time_per_iteration": 2.675906181335449 + }, + { + "auxiliary_loss_clip": 0.01108617, + "auxiliary_loss_mlp": 0.01027838, + "balance_loss_clip": 1.03666461, + "balance_loss_mlp": 1.01644123, + "epoch": 0.8933714113933564, + "flos": 34523863938720.0, + "grad_norm": 1.7708390818278432, + "language_loss": 0.6934191, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.71478361, + "num_input_tokens_seen": 320485525, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11401367, + "step": 14859, + "time_per_iteration": 3.9163875579833984 + }, + { + "auxiliary_loss_clip": 0.01102747, + "auxiliary_loss_mlp": 0.0102704, + "balance_loss_clip": 1.03626561, + "balance_loss_mlp": 1.01686549, + "epoch": 0.8934315346460243, + "flos": 26154836042400.0, + "grad_norm": 1.85730816276212, + "language_loss": 0.75853169, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.7798295, + "num_input_tokens_seen": 320506725, + "router_z_loss_clip": 0.66552734, + "router_z_loss_mlp": 0.10168457, + "step": 14860, + "time_per_iteration": 2.6638665199279785 + }, + { + "auxiliary_loss_clip": 0.01112091, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.03896952, + "balance_loss_mlp": 1.01880109, + "epoch": 0.8934916578986923, + "flos": 29003888367360.0, + "grad_norm": 2.5726433972839904, + "language_loss": 0.57226062, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.59368742, + "num_input_tokens_seen": 320525425, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11791992, + "step": 14861, + "time_per_iteration": 2.6479740142822266 + }, + { + "auxiliary_loss_clip": 0.01106295, + "auxiliary_loss_mlp": 0.01028375, + "balance_loss_clip": 1.03524327, + "balance_loss_mlp": 1.0170083, + "epoch": 0.8935517811513602, + "flos": 23081430772800.0, + "grad_norm": 1.9849257248020444, + "language_loss": 0.63158417, + "learning_rate": 1.176284122190685e-07, + "loss": 0.65293086, + "num_input_tokens_seen": 320543010, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11364746, + "step": 14862, + "time_per_iteration": 4.12746000289917 + }, + { + "auxiliary_loss_clip": 0.01105596, + "auxiliary_loss_mlp": 0.01025691, + "balance_loss_clip": 1.035133, + "balance_loss_mlp": 1.01448572, + "epoch": 0.8936119044040283, + "flos": 29359943938080.0, + "grad_norm": 2.0418801057338296, + "language_loss": 0.77888787, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.80020082, + "num_input_tokens_seen": 320562180, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11206055, + "step": 14863, + "time_per_iteration": 2.6712374687194824 + }, + { + "auxiliary_loss_clip": 0.01105085, + "auxiliary_loss_mlp": 0.01026936, + "balance_loss_clip": 1.03600037, + "balance_loss_mlp": 1.01692867, + "epoch": 0.8936720276566962, + "flos": 26019770482080.0, + "grad_norm": 2.0610316100122508, + "language_loss": 0.70989519, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.73121536, + "num_input_tokens_seen": 320580395, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.10003662, + "step": 14864, + "time_per_iteration": 2.6180338859558105 + }, + { + "auxiliary_loss_clip": 0.01117804, + "auxiliary_loss_mlp": 0.01035134, + "balance_loss_clip": 1.04093719, + "balance_loss_mlp": 1.02359509, + "epoch": 0.8937321509093642, + "flos": 22458971671200.0, + "grad_norm": 2.2150466421655137, + "language_loss": 0.76435971, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.78588909, + "num_input_tokens_seen": 320599505, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.11541748, + "step": 14865, + "time_per_iteration": 2.6202399730682373 + }, + { + "auxiliary_loss_clip": 0.01105589, + "auxiliary_loss_mlp": 0.01029485, + "balance_loss_clip": 1.03538084, + "balance_loss_mlp": 1.01902986, + "epoch": 0.8937922741620322, + "flos": 26911307255040.0, + "grad_norm": 1.962573199587974, + "language_loss": 0.71863794, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.73998868, + "num_input_tokens_seen": 320619825, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10455322, + "step": 14866, + "time_per_iteration": 2.646526336669922 + }, + { + "auxiliary_loss_clip": 0.01112943, + "auxiliary_loss_mlp": 0.01028173, + "balance_loss_clip": 1.03835857, + "balance_loss_mlp": 1.01612091, + "epoch": 0.8938523974147001, + "flos": 31316284488960.0, + "grad_norm": 5.168381884666692, + "language_loss": 0.84053373, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.86194479, + "num_input_tokens_seen": 320638515, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.1204834, + "step": 14867, + "time_per_iteration": 2.657762289047241 + }, + { + "auxiliary_loss_clip": 0.01108572, + "auxiliary_loss_mlp": 0.01025773, + "balance_loss_clip": 1.03658664, + "balance_loss_mlp": 1.01599157, + "epoch": 0.8939125206673681, + "flos": 31411527913440.0, + "grad_norm": 1.580666136469022, + "language_loss": 0.80526352, + "learning_rate": 1.168401272009567e-07, + "loss": 0.82660699, + "num_input_tokens_seen": 320659430, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.09771729, + "step": 14868, + "time_per_iteration": 2.666837692260742 + }, + { + "auxiliary_loss_clip": 0.01109997, + "auxiliary_loss_mlp": 0.01029323, + "balance_loss_clip": 1.03774452, + "balance_loss_mlp": 1.01774156, + "epoch": 0.8939726439200361, + "flos": 33366328807680.0, + "grad_norm": 2.0722742956410882, + "language_loss": 0.77229643, + "learning_rate": 1.167089962692056e-07, + "loss": 0.79368961, + "num_input_tokens_seen": 320679295, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.1159668, + "step": 14869, + "time_per_iteration": 2.682234287261963 + }, + { + "auxiliary_loss_clip": 0.01110279, + "auxiliary_loss_mlp": 0.01022098, + "balance_loss_clip": 1.03899026, + "balance_loss_mlp": 1.01086879, + "epoch": 0.8940327671727041, + "flos": 24817510624320.0, + "grad_norm": 1.4870035520659295, + "language_loss": 0.65613794, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.67746174, + "num_input_tokens_seen": 320697535, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11224365, + "step": 14870, + "time_per_iteration": 3.9386086463928223 + }, + { + "auxiliary_loss_clip": 0.01027579, + "auxiliary_loss_mlp": 0.01001471, + "balance_loss_clip": 1.00520349, + "balance_loss_mlp": 1.00051498, + "epoch": 0.894092890425372, + "flos": 71273996962560.0, + "grad_norm": 0.8009404451408183, + "language_loss": 0.55952245, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.579813, + "num_input_tokens_seen": 320758635, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00955963, + "step": 14871, + "time_per_iteration": 3.2816073894500732 + }, + { + "auxiliary_loss_clip": 0.01108545, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.03875661, + "balance_loss_mlp": 1.01902103, + "epoch": 0.89415301367804, + "flos": 24196023937440.0, + "grad_norm": 2.0412048704789942, + "language_loss": 0.76451701, + "learning_rate": 1.16316031981331e-07, + "loss": 0.7858988, + "num_input_tokens_seen": 320777175, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10620117, + "step": 14872, + "time_per_iteration": 2.7671046257019043 + }, + { + "auxiliary_loss_clip": 0.01105064, + "auxiliary_loss_mlp": 0.01026315, + "balance_loss_clip": 1.03667307, + "balance_loss_mlp": 1.01629567, + "epoch": 0.8942131369307079, + "flos": 31452038843040.0, + "grad_norm": 1.564519778561077, + "language_loss": 0.66933107, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.69064486, + "num_input_tokens_seen": 320797670, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.10028076, + "step": 14873, + "time_per_iteration": 2.6499180793762207 + }, + { + "auxiliary_loss_clip": 0.01106252, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.03681803, + "balance_loss_mlp": 1.01882434, + "epoch": 0.8942732601833759, + "flos": 28246647326400.0, + "grad_norm": 1.745268870378562, + "language_loss": 0.59928656, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.62065154, + "num_input_tokens_seen": 320817410, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.11419678, + "step": 14874, + "time_per_iteration": 2.6627936363220215 + }, + { + "auxiliary_loss_clip": 0.01113062, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.03942525, + "balance_loss_mlp": 1.01857185, + "epoch": 0.8943333834360438, + "flos": 33989922393120.0, + "grad_norm": 2.029297839470577, + "language_loss": 0.75640595, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.77784002, + "num_input_tokens_seen": 320836745, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11773682, + "step": 14875, + "time_per_iteration": 2.707522392272949 + }, + { + "auxiliary_loss_clip": 0.01114627, + "auxiliary_loss_mlp": 0.01033058, + "balance_loss_clip": 1.03859258, + "balance_loss_mlp": 1.02017152, + "epoch": 0.8943935066887119, + "flos": 27044590055040.0, + "grad_norm": 2.423761290665227, + "language_loss": 0.77448696, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.79596382, + "num_input_tokens_seen": 320853305, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12884521, + "step": 14876, + "time_per_iteration": 2.6277899742126465 + }, + { + "auxiliary_loss_clip": 0.01108876, + "auxiliary_loss_mlp": 0.01026656, + "balance_loss_clip": 1.03825271, + "balance_loss_mlp": 1.01623166, + "epoch": 0.8944536299413798, + "flos": 26197899560640.0, + "grad_norm": 1.7397806601245585, + "language_loss": 0.79171866, + "learning_rate": 1.156625201573287e-07, + "loss": 0.81307399, + "num_input_tokens_seen": 320872885, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10418701, + "step": 14877, + "time_per_iteration": 2.761113166809082 + }, + { + "auxiliary_loss_clip": 0.01110668, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.03924179, + "balance_loss_mlp": 1.01869249, + "epoch": 0.8945137531940478, + "flos": 21656884351680.0, + "grad_norm": 2.0841011083495027, + "language_loss": 0.75198603, + "learning_rate": 1.155320321355151e-07, + "loss": 0.77339292, + "num_input_tokens_seen": 320889755, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11340332, + "step": 14878, + "time_per_iteration": 2.596972942352295 + }, + { + "auxiliary_loss_clip": 0.01109802, + "auxiliary_loss_mlp": 0.01023143, + "balance_loss_clip": 1.03732216, + "balance_loss_mlp": 1.01160336, + "epoch": 0.8945738764467158, + "flos": 25797848574240.0, + "grad_norm": 1.6588886285961957, + "language_loss": 0.75691462, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.77824408, + "num_input_tokens_seen": 320907860, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11541748, + "step": 14879, + "time_per_iteration": 2.6923067569732666 + }, + { + "auxiliary_loss_clip": 0.01111972, + "auxiliary_loss_mlp": 0.01029252, + "balance_loss_clip": 1.03975213, + "balance_loss_mlp": 1.01801062, + "epoch": 0.8946339996993837, + "flos": 18184238441280.0, + "grad_norm": 1.9639362863716339, + "language_loss": 0.74915409, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.77056634, + "num_input_tokens_seen": 320925825, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11242676, + "step": 14880, + "time_per_iteration": 2.6200954914093018 + }, + { + "auxiliary_loss_clip": 0.01109576, + "auxiliary_loss_mlp": 0.01028114, + "balance_loss_clip": 1.037395, + "balance_loss_mlp": 1.01664615, + "epoch": 0.8946941229520518, + "flos": 33412188018240.0, + "grad_norm": 1.639374501938277, + "language_loss": 0.82804924, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.84942615, + "num_input_tokens_seen": 320946165, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11462402, + "step": 14881, + "time_per_iteration": 2.7048630714416504 + }, + { + "auxiliary_loss_clip": 0.01105848, + "auxiliary_loss_mlp": 0.0102611, + "balance_loss_clip": 1.03538275, + "balance_loss_mlp": 1.01508319, + "epoch": 0.8947542462047197, + "flos": 38798678203200.0, + "grad_norm": 7.843616713987748, + "language_loss": 0.67334187, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.69466138, + "num_input_tokens_seen": 320969330, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11029053, + "step": 14882, + "time_per_iteration": 2.70357608795166 + }, + { + "auxiliary_loss_clip": 0.01114054, + "auxiliary_loss_mlp": 0.01028478, + "balance_loss_clip": 1.03840756, + "balance_loss_mlp": 1.01589537, + "epoch": 0.8948143694573877, + "flos": 25482506519520.0, + "grad_norm": 2.8757936571355223, + "language_loss": 0.75473166, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.77615696, + "num_input_tokens_seen": 320985055, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12591553, + "step": 14883, + "time_per_iteration": 2.6160354614257812 + }, + { + "auxiliary_loss_clip": 0.01104303, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.03517079, + "balance_loss_mlp": 1.02133811, + "epoch": 0.8948744927100556, + "flos": 34429673963520.0, + "grad_norm": 1.6253488262216635, + "language_loss": 0.72204423, + "learning_rate": 1.147506048211253e-07, + "loss": 0.74340785, + "num_input_tokens_seen": 321004720, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.1072998, + "step": 14884, + "time_per_iteration": 2.668506383895874 + }, + { + "auxiliary_loss_clip": 0.01103942, + "auxiliary_loss_mlp": 0.01025462, + "balance_loss_clip": 1.03471851, + "balance_loss_mlp": 1.01539445, + "epoch": 0.8949346159627236, + "flos": 26726290238880.0, + "grad_norm": 1.9315706461540958, + "language_loss": 0.75660419, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.77789819, + "num_input_tokens_seen": 321022350, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10070801, + "step": 14885, + "time_per_iteration": 2.691718101501465 + }, + { + "auxiliary_loss_clip": 0.01111228, + "auxiliary_loss_mlp": 0.01030331, + "balance_loss_clip": 1.03693926, + "balance_loss_mlp": 1.01854086, + "epoch": 0.8949947392153915, + "flos": 26062671931200.0, + "grad_norm": 1.9719251550941406, + "language_loss": 0.81884027, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.84025586, + "num_input_tokens_seen": 321040450, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11791992, + "step": 14886, + "time_per_iteration": 2.6093225479125977 + }, + { + "auxiliary_loss_clip": 0.01108841, + "auxiliary_loss_mlp": 0.01027386, + "balance_loss_clip": 1.03708696, + "balance_loss_mlp": 1.01654971, + "epoch": 0.8950548624680595, + "flos": 63996426864000.0, + "grad_norm": 1.6382638454830043, + "language_loss": 0.63787746, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.65923971, + "num_input_tokens_seen": 321063970, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.1083374, + "step": 14887, + "time_per_iteration": 2.899798631668091 + }, + { + "auxiliary_loss_clip": 0.01113669, + "auxiliary_loss_mlp": 0.01032004, + "balance_loss_clip": 1.03983045, + "balance_loss_mlp": 1.02084565, + "epoch": 0.8951149857207275, + "flos": 24551957956320.0, + "grad_norm": 1.8240955494357838, + "language_loss": 0.60876, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.63021672, + "num_input_tokens_seen": 321083840, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11157227, + "step": 14888, + "time_per_iteration": 2.6653945446014404 + }, + { + "auxiliary_loss_clip": 0.01110888, + "auxiliary_loss_mlp": 0.0102446, + "balance_loss_clip": 1.03741109, + "balance_loss_mlp": 1.01377892, + "epoch": 0.8951751089733955, + "flos": 36439085800800.0, + "grad_norm": 3.4347412221280913, + "language_loss": 0.70158613, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.72293961, + "num_input_tokens_seen": 321104165, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10681152, + "step": 14889, + "time_per_iteration": 2.7038471698760986 + }, + { + "auxiliary_loss_clip": 0.01110099, + "auxiliary_loss_mlp": 0.01029405, + "balance_loss_clip": 1.0373224, + "balance_loss_mlp": 1.01778758, + "epoch": 0.8952352322260634, + "flos": 18623746908000.0, + "grad_norm": 2.655613360211478, + "language_loss": 0.71391463, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.73530966, + "num_input_tokens_seen": 321117290, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11608887, + "step": 14890, + "time_per_iteration": 4.023431062698364 + }, + { + "auxiliary_loss_clip": 0.01109577, + "auxiliary_loss_mlp": 0.01029834, + "balance_loss_clip": 1.03665495, + "balance_loss_mlp": 1.01839542, + "epoch": 0.8952953554787314, + "flos": 32699752738560.0, + "grad_norm": 1.7650459197224821, + "language_loss": 0.75614524, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.77753931, + "num_input_tokens_seen": 321137115, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11437988, + "step": 14891, + "time_per_iteration": 2.6671273708343506 + }, + { + "auxiliary_loss_clip": 0.01111075, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.03642786, + "balance_loss_mlp": 1.01830447, + "epoch": 0.8953554787313994, + "flos": 17249719184640.0, + "grad_norm": 2.381743039006332, + "language_loss": 0.76279557, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.78419948, + "num_input_tokens_seen": 321154490, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11004639, + "step": 14892, + "time_per_iteration": 2.6119139194488525 + }, + { + "auxiliary_loss_clip": 0.01110303, + "auxiliary_loss_mlp": 0.01030784, + "balance_loss_clip": 1.03801751, + "balance_loss_mlp": 1.01986456, + "epoch": 0.8954156019840673, + "flos": 31363561804320.0, + "grad_norm": 4.884147760478499, + "language_loss": 0.8151862, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.83659708, + "num_input_tokens_seen": 321175625, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10919189, + "step": 14893, + "time_per_iteration": 2.6526637077331543 + }, + { + "auxiliary_loss_clip": 0.01106432, + "auxiliary_loss_mlp": 0.01029168, + "balance_loss_clip": 1.03732598, + "balance_loss_mlp": 1.01839781, + "epoch": 0.8954757252367354, + "flos": 26732286796320.0, + "grad_norm": 1.882507907308314, + "language_loss": 0.74930376, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.77065974, + "num_input_tokens_seen": 321193895, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.10766602, + "step": 14894, + "time_per_iteration": 2.6429431438446045 + }, + { + "auxiliary_loss_clip": 0.01113868, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.0403738, + "balance_loss_mlp": 1.01941943, + "epoch": 0.8955358484894033, + "flos": 15825821040000.0, + "grad_norm": 1.9800821754102715, + "language_loss": 0.66774046, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.68918788, + "num_input_tokens_seen": 321211610, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11444092, + "step": 14895, + "time_per_iteration": 2.626321315765381 + }, + { + "auxiliary_loss_clip": 0.01112341, + "auxiliary_loss_mlp": 0.01025883, + "balance_loss_clip": 1.03788316, + "balance_loss_mlp": 1.01398635, + "epoch": 0.8955959717420713, + "flos": 21077529285600.0, + "grad_norm": 1.5586710096628666, + "language_loss": 0.66852915, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.68991137, + "num_input_tokens_seen": 321229805, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11889648, + "step": 14896, + "time_per_iteration": 2.601544141769409 + }, + { + "auxiliary_loss_clip": 0.01109227, + "auxiliary_loss_mlp": 0.01027661, + "balance_loss_clip": 1.0373323, + "balance_loss_mlp": 1.01649117, + "epoch": 0.8956560949947392, + "flos": 18051036675840.0, + "grad_norm": 2.9652537046014675, + "language_loss": 0.7575298, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.77889872, + "num_input_tokens_seen": 321247165, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.1116333, + "step": 14897, + "time_per_iteration": 2.6456685066223145 + }, + { + "auxiliary_loss_clip": 0.01027436, + "auxiliary_loss_mlp": 0.01000957, + "balance_loss_clip": 1.00506496, + "balance_loss_mlp": 1.00005436, + "epoch": 0.8957162182474072, + "flos": 77614576250400.0, + "grad_norm": 0.738922071612297, + "language_loss": 0.55313885, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57342279, + "num_input_tokens_seen": 321308425, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.00901794, + "step": 14898, + "time_per_iteration": 3.2719764709472656 + }, + { + "auxiliary_loss_clip": 0.01110174, + "auxiliary_loss_mlp": 0.01029919, + "balance_loss_clip": 1.03794241, + "balance_loss_mlp": 1.01868975, + "epoch": 0.8957763415000751, + "flos": 30517357517280.0, + "grad_norm": 1.4384022991172707, + "language_loss": 0.69898111, + "learning_rate": 1.12808298352008e-07, + "loss": 0.72038203, + "num_input_tokens_seen": 321329295, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11230469, + "step": 14899, + "time_per_iteration": 4.062104225158691 + }, + { + "auxiliary_loss_clip": 0.01111722, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.03894734, + "balance_loss_mlp": 1.01957238, + "epoch": 0.8958364647527431, + "flos": 24194848936320.0, + "grad_norm": 1.6123799179632847, + "language_loss": 0.73850226, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.75993443, + "num_input_tokens_seen": 321347580, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11907959, + "step": 14900, + "time_per_iteration": 2.6389431953430176 + }, + { + "auxiliary_loss_clip": 0.01027387, + "auxiliary_loss_mlp": 0.01000603, + "balance_loss_clip": 1.00501156, + "balance_loss_mlp": 0.99966002, + "epoch": 0.895896588005411, + "flos": 79969347096480.0, + "grad_norm": 0.7924572362388421, + "language_loss": 0.61843795, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.63871783, + "num_input_tokens_seen": 321407820, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.0094223, + "step": 14901, + "time_per_iteration": 4.696310520172119 + }, + { + "auxiliary_loss_clip": 0.01110125, + "auxiliary_loss_mlp": 0.01029068, + "balance_loss_clip": 1.03607655, + "balance_loss_mlp": 1.01790404, + "epoch": 0.8959567112580791, + "flos": 31226794518240.0, + "grad_norm": 1.869869634395707, + "language_loss": 0.70871305, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.73010504, + "num_input_tokens_seen": 321426745, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11175537, + "step": 14902, + "time_per_iteration": 2.6627771854400635 + }, + { + "auxiliary_loss_clip": 0.01104555, + "auxiliary_loss_mlp": 0.01026417, + "balance_loss_clip": 1.03527141, + "balance_loss_mlp": 1.01612329, + "epoch": 0.896016834510747, + "flos": 29531792838240.0, + "grad_norm": 1.7371487005595359, + "language_loss": 0.78081399, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.80212367, + "num_input_tokens_seen": 321446165, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10290527, + "step": 14903, + "time_per_iteration": 2.690953493118286 + }, + { + "auxiliary_loss_clip": 0.01114339, + "auxiliary_loss_mlp": 0.01029864, + "balance_loss_clip": 1.03923154, + "balance_loss_mlp": 1.01788366, + "epoch": 0.896076957763415, + "flos": 28157481493920.0, + "grad_norm": 1.8495843425401604, + "language_loss": 0.72758114, + "learning_rate": 1.121644401702877e-07, + "loss": 0.7490232, + "num_input_tokens_seen": 321465285, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11968994, + "step": 14904, + "time_per_iteration": 2.6168630123138428 + }, + { + "auxiliary_loss_clip": 0.01111163, + "auxiliary_loss_mlp": 0.01026084, + "balance_loss_clip": 1.0377357, + "balance_loss_mlp": 1.01410937, + "epoch": 0.8961370810160829, + "flos": 27133269680160.0, + "grad_norm": 2.3036329049803306, + "language_loss": 0.7470246, + "learning_rate": 1.12035883275166e-07, + "loss": 0.76839703, + "num_input_tokens_seen": 321483670, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11981201, + "step": 14905, + "time_per_iteration": 2.7244656085968018 + }, + { + "auxiliary_loss_clip": 0.01108129, + "auxiliary_loss_mlp": 0.01029729, + "balance_loss_clip": 1.03721654, + "balance_loss_mlp": 1.01852322, + "epoch": 0.8961972042687509, + "flos": 28153389248640.0, + "grad_norm": 1.6628194899172453, + "language_loss": 0.76605916, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.78743768, + "num_input_tokens_seen": 321501190, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11193848, + "step": 14906, + "time_per_iteration": 2.619854688644409 + }, + { + "auxiliary_loss_clip": 0.01110661, + "auxiliary_loss_mlp": 0.01029188, + "balance_loss_clip": 1.03860402, + "balance_loss_mlp": 1.01833367, + "epoch": 0.896257327521419, + "flos": 22190339689920.0, + "grad_norm": 1.6124407635328133, + "language_loss": 0.7401464, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.76154494, + "num_input_tokens_seen": 321518540, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10858154, + "step": 14907, + "time_per_iteration": 2.6857945919036865 + }, + { + "auxiliary_loss_clip": 0.01109255, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.03882551, + "balance_loss_mlp": 1.02145791, + "epoch": 0.8963174507740869, + "flos": 21835823775840.0, + "grad_norm": 1.6951259579740907, + "language_loss": 0.83140171, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.85281569, + "num_input_tokens_seen": 321536555, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.10693359, + "step": 14908, + "time_per_iteration": 2.605294704437256 + }, + { + "auxiliary_loss_clip": 0.0111142, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.03709674, + "balance_loss_mlp": 1.01830125, + "epoch": 0.8963775740267549, + "flos": 25663674394080.0, + "grad_norm": 1.8305935006081142, + "language_loss": 0.7051152, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.72653091, + "num_input_tokens_seen": 321557655, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11859131, + "step": 14909, + "time_per_iteration": 2.657712697982788 + }, + { + "auxiliary_loss_clip": 0.01112938, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.03922296, + "balance_loss_mlp": 1.01980853, + "epoch": 0.8964376972794228, + "flos": 28284524632800.0, + "grad_norm": 2.2082627032402153, + "language_loss": 0.72440052, + "learning_rate": 1.113941727737877e-07, + "loss": 0.74583876, + "num_input_tokens_seen": 321576160, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11083984, + "step": 14910, + "time_per_iteration": 3.8786134719848633 + }, + { + "auxiliary_loss_clip": 0.01106465, + "auxiliary_loss_mlp": 0.01027134, + "balance_loss_clip": 1.03537393, + "balance_loss_mlp": 1.01663792, + "epoch": 0.8964978205320908, + "flos": 30472430204160.0, + "grad_norm": 2.036421528415319, + "language_loss": 0.63732064, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.6586566, + "num_input_tokens_seen": 321596205, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10491943, + "step": 14911, + "time_per_iteration": 2.6285347938537598 + }, + { + "auxiliary_loss_clip": 0.01111992, + "auxiliary_loss_mlp": 0.01026617, + "balance_loss_clip": 1.03933179, + "balance_loss_mlp": 1.01522017, + "epoch": 0.8965579437847587, + "flos": 23393612479680.0, + "grad_norm": 1.6887709299233908, + "language_loss": 0.74697018, + "learning_rate": 1.111379898520437e-07, + "loss": 0.7683562, + "num_input_tokens_seen": 321614800, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11376953, + "step": 14912, + "time_per_iteration": 2.630324363708496 + }, + { + "auxiliary_loss_clip": 0.01109707, + "auxiliary_loss_mlp": 0.01031563, + "balance_loss_clip": 1.03679013, + "balance_loss_mlp": 1.02011335, + "epoch": 0.8966180670374267, + "flos": 29623025052000.0, + "grad_norm": 2.033197062807709, + "language_loss": 0.81398535, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.83539808, + "num_input_tokens_seen": 321633445, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11456299, + "step": 14913, + "time_per_iteration": 2.6286118030548096 + }, + { + "auxiliary_loss_clip": 0.01112267, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.03813434, + "balance_loss_mlp": 1.02143383, + "epoch": 0.8966781902900947, + "flos": 16537081318560.0, + "grad_norm": 2.623489614193492, + "language_loss": 0.61077124, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.63222253, + "num_input_tokens_seen": 321650890, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11431885, + "step": 14914, + "time_per_iteration": 2.629528045654297 + }, + { + "auxiliary_loss_clip": 0.01027611, + "auxiliary_loss_mlp": 0.01000709, + "balance_loss_clip": 1.00523686, + "balance_loss_mlp": 0.99974066, + "epoch": 0.8967383135427627, + "flos": 79394205827520.0, + "grad_norm": 0.7170781512990633, + "language_loss": 0.55028319, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57056636, + "num_input_tokens_seen": 321710960, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.0096817, + "step": 14915, + "time_per_iteration": 3.2512829303741455 + }, + { + "auxiliary_loss_clip": 0.01106216, + "auxiliary_loss_mlp": 0.01028619, + "balance_loss_clip": 1.03600907, + "balance_loss_mlp": 1.01831329, + "epoch": 0.8967984367954306, + "flos": 36256864476960.0, + "grad_norm": 1.5845528796272614, + "language_loss": 0.71468854, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.7360369, + "num_input_tokens_seen": 321733290, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10296631, + "step": 14916, + "time_per_iteration": 2.731126308441162 + }, + { + "auxiliary_loss_clip": 0.0111022, + "auxiliary_loss_mlp": 0.01032229, + "balance_loss_clip": 1.03715277, + "balance_loss_mlp": 1.02150595, + "epoch": 0.8968585600480986, + "flos": 31362629906880.0, + "grad_norm": 1.7207148550527165, + "language_loss": 0.78087711, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.80230165, + "num_input_tokens_seen": 321753120, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10723877, + "step": 14917, + "time_per_iteration": 2.6581246852874756 + }, + { + "auxiliary_loss_clip": 0.01114974, + "auxiliary_loss_mlp": 0.01041275, + "balance_loss_clip": 1.04010212, + "balance_loss_mlp": 1.0291748, + "epoch": 0.8969186833007665, + "flos": 36661291329600.0, + "grad_norm": 2.0405770389253943, + "language_loss": 0.67845863, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.70002115, + "num_input_tokens_seen": 321772840, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12091064, + "step": 14918, + "time_per_iteration": 2.692218065261841 + }, + { + "auxiliary_loss_clip": 0.01109945, + "auxiliary_loss_mlp": 0.01028805, + "balance_loss_clip": 1.03691578, + "balance_loss_mlp": 1.01833224, + "epoch": 0.8969788065534345, + "flos": 27843111853920.0, + "grad_norm": 1.9002970621466042, + "language_loss": 0.83451396, + "learning_rate": 1.102436060943881e-07, + "loss": 0.85590142, + "num_input_tokens_seen": 321791020, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10479736, + "step": 14919, + "time_per_iteration": 2.6350231170654297 + }, + { + "auxiliary_loss_clip": 0.01110317, + "auxiliary_loss_mlp": 0.01030899, + "balance_loss_clip": 1.03699195, + "balance_loss_mlp": 1.01921034, + "epoch": 0.8970389298061026, + "flos": 16181106782400.0, + "grad_norm": 2.363131788652141, + "language_loss": 0.71947467, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.74088681, + "num_input_tokens_seen": 321810075, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11694336, + "step": 14920, + "time_per_iteration": 2.625011682510376 + }, + { + "auxiliary_loss_clip": 0.01110451, + "auxiliary_loss_mlp": 0.01028417, + "balance_loss_clip": 1.03785872, + "balance_loss_mlp": 1.01661587, + "epoch": 0.8970990530587705, + "flos": 12526199030880.0, + "grad_norm": 2.563439891275269, + "language_loss": 0.90745497, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.92884362, + "num_input_tokens_seen": 321822635, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11804199, + "step": 14921, + "time_per_iteration": 2.5768442153930664 + }, + { + "auxiliary_loss_clip": 0.01110119, + "auxiliary_loss_mlp": 0.01025763, + "balance_loss_clip": 1.0367372, + "balance_loss_mlp": 1.01450372, + "epoch": 0.8971591763114385, + "flos": 24773393656800.0, + "grad_norm": 1.9101722310250864, + "language_loss": 0.73578411, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.75714296, + "num_input_tokens_seen": 321841130, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11260986, + "step": 14922, + "time_per_iteration": 2.6362364292144775 + }, + { + "auxiliary_loss_clip": 0.01108132, + "auxiliary_loss_mlp": 0.0102991, + "balance_loss_clip": 1.03587961, + "balance_loss_mlp": 1.01847208, + "epoch": 0.8972192995641064, + "flos": 28378593056160.0, + "grad_norm": 1.6762366982822163, + "language_loss": 0.70735765, + "learning_rate": 1.097341060694219e-07, + "loss": 0.72873807, + "num_input_tokens_seen": 321859855, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11437988, + "step": 14923, + "time_per_iteration": 2.658855676651001 + }, + { + "auxiliary_loss_clip": 0.01111067, + "auxiliary_loss_mlp": 0.01027856, + "balance_loss_clip": 1.03746235, + "balance_loss_mlp": 1.01581609, + "epoch": 0.8972794228167744, + "flos": 22414409013600.0, + "grad_norm": 2.19509904016215, + "language_loss": 0.70585823, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.72724742, + "num_input_tokens_seen": 321877990, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12042236, + "step": 14924, + "time_per_iteration": 2.62205171585083 + }, + { + "auxiliary_loss_clip": 0.01108259, + "auxiliary_loss_mlp": 0.01027688, + "balance_loss_clip": 1.03672886, + "balance_loss_mlp": 1.01762676, + "epoch": 0.8973395460694423, + "flos": 29226539586240.0, + "grad_norm": 1.5312854862125675, + "language_loss": 0.72130466, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.74266416, + "num_input_tokens_seen": 321898120, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10058594, + "step": 14925, + "time_per_iteration": 2.670936107635498 + }, + { + "auxiliary_loss_clip": 0.01113551, + "auxiliary_loss_mlp": 0.01027845, + "balance_loss_clip": 1.03986728, + "balance_loss_mlp": 1.0157994, + "epoch": 0.8973996693221103, + "flos": 30469999167360.0, + "grad_norm": 2.092273695829012, + "language_loss": 0.82316887, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.84458292, + "num_input_tokens_seen": 321918140, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.1204834, + "step": 14926, + "time_per_iteration": 2.664754867553711 + }, + { + "auxiliary_loss_clip": 0.01108697, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_clip": 1.03708947, + "balance_loss_mlp": 1.01857936, + "epoch": 0.8974597925747783, + "flos": 31407111529920.0, + "grad_norm": 1.9270525680242185, + "language_loss": 0.79162294, + "learning_rate": 1.092257529095555e-07, + "loss": 0.81300217, + "num_input_tokens_seen": 321938580, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10644531, + "step": 14927, + "time_per_iteration": 2.7556848526000977 + }, + { + "auxiliary_loss_clip": 0.01107999, + "auxiliary_loss_mlp": 0.01024866, + "balance_loss_clip": 1.03703928, + "balance_loss_mlp": 1.01450038, + "epoch": 0.8975199158274463, + "flos": 46456688924640.0, + "grad_norm": 1.7618122046523357, + "language_loss": 0.66409016, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.68541878, + "num_input_tokens_seen": 321961135, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10369873, + "step": 14928, + "time_per_iteration": 2.7880396842956543 + }, + { + "auxiliary_loss_clip": 0.01114004, + "auxiliary_loss_mlp": 0.01036843, + "balance_loss_clip": 1.03924274, + "balance_loss_mlp": 1.02291942, + "epoch": 0.8975800390801142, + "flos": 31007668302720.0, + "grad_norm": 2.5696670948928704, + "language_loss": 0.7152015, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.73670995, + "num_input_tokens_seen": 321980945, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.13922119, + "step": 14929, + "time_per_iteration": 2.6761958599090576 + }, + { + "auxiliary_loss_clip": 0.01110281, + "auxiliary_loss_mlp": 0.0103022, + "balance_loss_clip": 1.03902245, + "balance_loss_mlp": 1.01989651, + "epoch": 0.8976401623327822, + "flos": 26551443060000.0, + "grad_norm": 1.950560036777189, + "language_loss": 0.67770791, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.69911295, + "num_input_tokens_seen": 322000350, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10333252, + "step": 14930, + "time_per_iteration": 4.066127061843872 + }, + { + "auxiliary_loss_clip": 0.01105925, + "auxiliary_loss_mlp": 0.01025646, + "balance_loss_clip": 1.03505325, + "balance_loss_mlp": 1.01501226, + "epoch": 0.8977002855854501, + "flos": 16893542062080.0, + "grad_norm": 2.21074818128173, + "language_loss": 0.75213742, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.77345312, + "num_input_tokens_seen": 322018980, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10626221, + "step": 14931, + "time_per_iteration": 2.6143527030944824 + }, + { + "auxiliary_loss_clip": 0.01109729, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.03902316, + "balance_loss_mlp": 1.01664948, + "epoch": 0.8977604088381181, + "flos": 23705389013760.0, + "grad_norm": 1.9823262713457812, + "language_loss": 0.62864709, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.65001333, + "num_input_tokens_seen": 322037675, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10241699, + "step": 14932, + "time_per_iteration": 2.624940872192383 + }, + { + "auxiliary_loss_clip": 0.01105813, + "auxiliary_loss_mlp": 0.01027311, + "balance_loss_clip": 1.03717387, + "balance_loss_mlp": 1.01727939, + "epoch": 0.8978205320907862, + "flos": 27750583087200.0, + "grad_norm": 1.7900056258776558, + "language_loss": 0.71704054, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.73837179, + "num_input_tokens_seen": 322055130, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.10028076, + "step": 14933, + "time_per_iteration": 2.6113364696502686 + }, + { + "auxiliary_loss_clip": 0.011113, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.03782153, + "balance_loss_mlp": 1.01754999, + "epoch": 0.8978806553434541, + "flos": 26064252105120.0, + "grad_norm": 1.3973860613510167, + "language_loss": 0.74673307, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.76813537, + "num_input_tokens_seen": 322074850, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.1137085, + "step": 14934, + "time_per_iteration": 2.6463513374328613 + }, + { + "auxiliary_loss_clip": 0.01108256, + "auxiliary_loss_mlp": 0.0102539, + "balance_loss_clip": 1.03764617, + "balance_loss_mlp": 1.01485205, + "epoch": 0.8979407785961221, + "flos": 25531647629760.0, + "grad_norm": 1.9384926094607697, + "language_loss": 0.60451996, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.6258564, + "num_input_tokens_seen": 322093315, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10540771, + "step": 14935, + "time_per_iteration": 2.636625289916992 + }, + { + "auxiliary_loss_clip": 0.01107747, + "auxiliary_loss_mlp": 0.01028225, + "balance_loss_clip": 1.03754842, + "balance_loss_mlp": 1.0168103, + "epoch": 0.89800090184879, + "flos": 30784692945600.0, + "grad_norm": 1.96448284036142, + "language_loss": 0.76741642, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.78877616, + "num_input_tokens_seen": 322112555, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11413574, + "step": 14936, + "time_per_iteration": 2.6889679431915283 + }, + { + "auxiliary_loss_clip": 0.01109495, + "auxiliary_loss_mlp": 0.01031949, + "balance_loss_clip": 1.03809738, + "balance_loss_mlp": 1.02078521, + "epoch": 0.898061025101458, + "flos": 27531375837120.0, + "grad_norm": 2.23198177070677, + "language_loss": 0.74310988, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.76452434, + "num_input_tokens_seen": 322130440, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11169434, + "step": 14937, + "time_per_iteration": 2.6153323650360107 + }, + { + "auxiliary_loss_clip": 0.0102736, + "auxiliary_loss_mlp": 0.0100076, + "balance_loss_clip": 1.00499678, + "balance_loss_mlp": 0.99987829, + "epoch": 0.8981211483541259, + "flos": 68566695549120.0, + "grad_norm": 0.8411341226099505, + "language_loss": 0.63452047, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65480173, + "num_input_tokens_seen": 322187295, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00881195, + "step": 14938, + "time_per_iteration": 4.413204193115234 + }, + { + "auxiliary_loss_clip": 0.01108536, + "auxiliary_loss_mlp": 0.0103066, + "balance_loss_clip": 1.03819895, + "balance_loss_mlp": 1.01942432, + "epoch": 0.898181271606794, + "flos": 20002677222240.0, + "grad_norm": 2.4443108553017354, + "language_loss": 0.80243808, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.82383001, + "num_input_tokens_seen": 322202965, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11230469, + "step": 14939, + "time_per_iteration": 2.6723899841308594 + }, + { + "auxiliary_loss_clip": 0.01027311, + "auxiliary_loss_mlp": 0.01001355, + "balance_loss_clip": 1.00500107, + "balance_loss_mlp": 1.00044072, + "epoch": 0.8982413948594619, + "flos": 77408739702720.0, + "grad_norm": 0.7264420341290178, + "language_loss": 0.52852893, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.54881561, + "num_input_tokens_seen": 322269490, + "router_z_loss_clip": 0.22290039, + "router_z_loss_mlp": 0.00914001, + "step": 14940, + "time_per_iteration": 4.8278279304504395 + }, + { + "auxiliary_loss_clip": 0.01109998, + "auxiliary_loss_mlp": 0.01029472, + "balance_loss_clip": 1.03737903, + "balance_loss_mlp": 1.01773572, + "epoch": 0.8983015181121299, + "flos": 26643607171200.0, + "grad_norm": 1.8367751683833733, + "language_loss": 0.78097206, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.80236673, + "num_input_tokens_seen": 322288060, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11724854, + "step": 14941, + "time_per_iteration": 2.6285417079925537 + }, + { + "auxiliary_loss_clip": 0.0110981, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.03758609, + "balance_loss_mlp": 1.02110243, + "epoch": 0.8983616413647978, + "flos": 35325424533600.0, + "grad_norm": 2.604515576503733, + "language_loss": 0.7317816, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.75320947, + "num_input_tokens_seen": 322307930, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11865234, + "step": 14942, + "time_per_iteration": 2.6868886947631836 + }, + { + "auxiliary_loss_clip": 0.01111122, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.03778875, + "balance_loss_mlp": 1.02279401, + "epoch": 0.8984217646174658, + "flos": 21701203905600.0, + "grad_norm": 2.1712237805794983, + "language_loss": 0.79682636, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.81827891, + "num_input_tokens_seen": 322326155, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11352539, + "step": 14943, + "time_per_iteration": 2.6010262966156006 + }, + { + "auxiliary_loss_clip": 0.01111993, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.03778958, + "balance_loss_mlp": 1.02004218, + "epoch": 0.8984818878701337, + "flos": 28558423860480.0, + "grad_norm": 1.7691809277932875, + "language_loss": 0.70994574, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.73138738, + "num_input_tokens_seen": 322345850, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12133789, + "step": 14944, + "time_per_iteration": 2.659146547317505 + }, + { + "auxiliary_loss_clip": 0.0111662, + "auxiliary_loss_mlp": 0.01028144, + "balance_loss_clip": 1.04129314, + "balance_loss_mlp": 1.01647377, + "epoch": 0.8985420111228017, + "flos": 27266836101120.0, + "grad_norm": 3.6057230613902957, + "language_loss": 0.75958329, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.78103089, + "num_input_tokens_seen": 322364715, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11669922, + "step": 14945, + "time_per_iteration": 2.6364617347717285 + }, + { + "auxiliary_loss_clip": 0.0111587, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.0383811, + "balance_loss_mlp": 1.01871073, + "epoch": 0.8986021343754698, + "flos": 26103993206400.0, + "grad_norm": 2.637340773374896, + "language_loss": 0.73740733, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.75887322, + "num_input_tokens_seen": 322383570, + "router_z_loss_clip": 0.77441406, + "router_z_loss_mlp": 0.12005615, + "step": 14946, + "time_per_iteration": 2.677835464477539 + }, + { + "auxiliary_loss_clip": 0.01108608, + "auxiliary_loss_mlp": 0.01025631, + "balance_loss_clip": 1.03630328, + "balance_loss_mlp": 1.01381707, + "epoch": 0.8986622576281377, + "flos": 26021229104160.0, + "grad_norm": 2.162832488663844, + "language_loss": 0.64266551, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.66400784, + "num_input_tokens_seen": 322401375, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11816406, + "step": 14947, + "time_per_iteration": 2.6001498699188232 + }, + { + "auxiliary_loss_clip": 0.01111312, + "auxiliary_loss_mlp": 0.01033604, + "balance_loss_clip": 1.03920555, + "balance_loss_mlp": 1.02214801, + "epoch": 0.8987223808808057, + "flos": 29270778105600.0, + "grad_norm": 1.7306550122069235, + "language_loss": 0.69560099, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.71705008, + "num_input_tokens_seen": 322421890, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11456299, + "step": 14948, + "time_per_iteration": 2.6812093257904053 + }, + { + "auxiliary_loss_clip": 0.0110808, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.03649354, + "balance_loss_mlp": 1.01614082, + "epoch": 0.8987825041334736, + "flos": 50641121838240.0, + "grad_norm": 1.941121040292848, + "language_loss": 0.7484802, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.76983643, + "num_input_tokens_seen": 322445730, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11395264, + "step": 14949, + "time_per_iteration": 4.192850112915039 + }, + { + "auxiliary_loss_clip": 0.01112578, + "auxiliary_loss_mlp": 0.01031784, + "balance_loss_clip": 1.03871322, + "balance_loss_mlp": 1.01955295, + "epoch": 0.8988426273861416, + "flos": 33634434064320.0, + "grad_norm": 1.682370315500328, + "language_loss": 0.75976193, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.78120553, + "num_input_tokens_seen": 322464595, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.12219238, + "step": 14950, + "time_per_iteration": 2.6689000129699707 + }, + { + "auxiliary_loss_clip": 0.01111647, + "auxiliary_loss_mlp": 0.01029908, + "balance_loss_clip": 1.0403806, + "balance_loss_mlp": 1.01947081, + "epoch": 0.8989027506388095, + "flos": 20856174619680.0, + "grad_norm": 1.7426841165276068, + "language_loss": 0.66842937, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.68984497, + "num_input_tokens_seen": 322483305, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10412598, + "step": 14951, + "time_per_iteration": 2.6535215377807617 + }, + { + "auxiliary_loss_clip": 0.01111254, + "auxiliary_loss_mlp": 0.01025753, + "balance_loss_clip": 1.03536034, + "balance_loss_mlp": 1.0154177, + "epoch": 0.8989628738914776, + "flos": 25081685704800.0, + "grad_norm": 2.4956823594076685, + "language_loss": 0.73867846, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.76004851, + "num_input_tokens_seen": 322501905, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.10333252, + "step": 14952, + "time_per_iteration": 2.6108410358428955 + }, + { + "auxiliary_loss_clip": 0.01109872, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.0374527, + "balance_loss_mlp": 1.02180409, + "epoch": 0.8990229971441455, + "flos": 19829450734560.0, + "grad_norm": 2.3318848804218373, + "language_loss": 0.5653882, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.58681554, + "num_input_tokens_seen": 322518135, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11071777, + "step": 14953, + "time_per_iteration": 2.661241292953491 + }, + { + "auxiliary_loss_clip": 0.01109533, + "auxiliary_loss_mlp": 0.0102961, + "balance_loss_clip": 1.03785956, + "balance_loss_mlp": 1.01872611, + "epoch": 0.8990831203968135, + "flos": 26821736249760.0, + "grad_norm": 1.9735939938491662, + "language_loss": 0.81887984, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.84027129, + "num_input_tokens_seen": 322537905, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10882568, + "step": 14954, + "time_per_iteration": 2.6140544414520264 + }, + { + "auxiliary_loss_clip": 0.01108175, + "auxiliary_loss_mlp": 0.01031382, + "balance_loss_clip": 1.0383842, + "balance_loss_mlp": 1.0203793, + "epoch": 0.8991432436494814, + "flos": 33492237462720.0, + "grad_norm": 2.231169171310746, + "language_loss": 0.60381889, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.62521446, + "num_input_tokens_seen": 322557945, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10998535, + "step": 14955, + "time_per_iteration": 2.663184642791748 + }, + { + "auxiliary_loss_clip": 0.01108198, + "auxiliary_loss_mlp": 0.01027334, + "balance_loss_clip": 1.03794622, + "balance_loss_mlp": 1.01687908, + "epoch": 0.8992033669021494, + "flos": 26331830637120.0, + "grad_norm": 2.0047311946129884, + "language_loss": 0.5532341, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.57458943, + "num_input_tokens_seen": 322575765, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10449219, + "step": 14956, + "time_per_iteration": 2.6133298873901367 + }, + { + "auxiliary_loss_clip": 0.01109472, + "auxiliary_loss_mlp": 0.01032984, + "balance_loss_clip": 1.03770638, + "balance_loss_mlp": 1.02195752, + "epoch": 0.8992634901548173, + "flos": 34879190198400.0, + "grad_norm": 1.7812150415423396, + "language_loss": 0.7957595, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.81718409, + "num_input_tokens_seen": 322595665, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.1104126, + "step": 14957, + "time_per_iteration": 2.6636359691619873 + }, + { + "auxiliary_loss_clip": 0.01114945, + "auxiliary_loss_mlp": 0.01029469, + "balance_loss_clip": 1.03968191, + "balance_loss_mlp": 1.01731563, + "epoch": 0.8993236134074853, + "flos": 24242004699840.0, + "grad_norm": 1.7528211536787521, + "language_loss": 0.78593957, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.80738372, + "num_input_tokens_seen": 322614755, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.121521, + "step": 14958, + "time_per_iteration": 2.627645492553711 + }, + { + "auxiliary_loss_clip": 0.01110146, + "auxiliary_loss_mlp": 0.01027541, + "balance_loss_clip": 1.04006982, + "balance_loss_mlp": 1.01727128, + "epoch": 0.8993837366601534, + "flos": 23660421183360.0, + "grad_norm": 1.5751267270076326, + "language_loss": 0.74712253, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.76849943, + "num_input_tokens_seen": 322633425, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10266113, + "step": 14959, + "time_per_iteration": 2.6356639862060547 + }, + { + "auxiliary_loss_clip": 0.01105761, + "auxiliary_loss_mlp": 0.01027693, + "balance_loss_clip": 1.03608215, + "balance_loss_mlp": 1.01726174, + "epoch": 0.8994438599128213, + "flos": 22636695576960.0, + "grad_norm": 1.9265843491593027, + "language_loss": 0.6837976, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.70513219, + "num_input_tokens_seen": 322652065, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10430908, + "step": 14960, + "time_per_iteration": 2.651928186416626 + }, + { + "auxiliary_loss_clip": 0.01109823, + "auxiliary_loss_mlp": 0.01027188, + "balance_loss_clip": 1.03899968, + "balance_loss_mlp": 1.01672721, + "epoch": 0.8995039831654893, + "flos": 29806704997920.0, + "grad_norm": 1.631983273604935, + "language_loss": 0.65985727, + "learning_rate": 1.049510991294591e-07, + "loss": 0.68122739, + "num_input_tokens_seen": 322673275, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10461426, + "step": 14961, + "time_per_iteration": 2.648958444595337 + }, + { + "auxiliary_loss_clip": 0.01105818, + "auxiliary_loss_mlp": 0.01026411, + "balance_loss_clip": 1.03581786, + "balance_loss_mlp": 1.01598048, + "epoch": 0.8995641064181572, + "flos": 25930969305120.0, + "grad_norm": 1.629508536757891, + "language_loss": 0.83272612, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.85404837, + "num_input_tokens_seen": 322693375, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10437012, + "step": 14962, + "time_per_iteration": 2.7265782356262207 + }, + { + "auxiliary_loss_clip": 0.01115073, + "auxiliary_loss_mlp": 0.01026007, + "balance_loss_clip": 1.03928542, + "balance_loss_mlp": 1.01386523, + "epoch": 0.8996242296708252, + "flos": 28691625625920.0, + "grad_norm": 2.0978757132960424, + "language_loss": 0.76384211, + "learning_rate": 1.047022340612298e-07, + "loss": 0.78525287, + "num_input_tokens_seen": 322712615, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12133789, + "step": 14963, + "time_per_iteration": 2.6412394046783447 + }, + { + "auxiliary_loss_clip": 0.01027489, + "auxiliary_loss_mlp": 0.01000689, + "balance_loss_clip": 1.00518835, + "balance_loss_mlp": 0.99973249, + "epoch": 0.8996843529234931, + "flos": 76144697343360.0, + "grad_norm": 0.7799077597942459, + "language_loss": 0.57526267, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.5955444, + "num_input_tokens_seen": 322766855, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.00955963, + "step": 14964, + "time_per_iteration": 3.1842167377471924 + }, + { + "auxiliary_loss_clip": 0.01116724, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.04014444, + "balance_loss_mlp": 1.0182755, + "epoch": 0.8997444761761612, + "flos": 29574694287360.0, + "grad_norm": 2.7571629834218467, + "language_loss": 0.68156207, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.70302999, + "num_input_tokens_seen": 322781130, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11791992, + "step": 14965, + "time_per_iteration": 2.665976047515869 + }, + { + "auxiliary_loss_clip": 0.01112028, + "auxiliary_loss_mlp": 0.01029579, + "balance_loss_clip": 1.03819442, + "balance_loss_mlp": 1.01867175, + "epoch": 0.8998045994288291, + "flos": 26064900381600.0, + "grad_norm": 2.412606233141335, + "language_loss": 0.71869135, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.74010748, + "num_input_tokens_seen": 322800310, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.10913086, + "step": 14966, + "time_per_iteration": 2.7063450813293457 + }, + { + "auxiliary_loss_clip": 0.01112194, + "auxiliary_loss_mlp": 0.01029143, + "balance_loss_clip": 1.03958559, + "balance_loss_mlp": 1.01727605, + "epoch": 0.8998647226814971, + "flos": 35369622535680.0, + "grad_norm": 2.0763723178883016, + "language_loss": 0.7324605, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.75387383, + "num_input_tokens_seen": 322820955, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11871338, + "step": 14967, + "time_per_iteration": 2.684318780899048 + }, + { + "auxiliary_loss_clip": 0.01108863, + "auxiliary_loss_mlp": 0.01026399, + "balance_loss_clip": 1.03642869, + "balance_loss_mlp": 1.01515746, + "epoch": 0.899924845934165, + "flos": 16625841978240.0, + "grad_norm": 2.1930375335202297, + "language_loss": 0.71908462, + "learning_rate": 1.040813291960323e-07, + "loss": 0.74043721, + "num_input_tokens_seen": 322838780, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11242676, + "step": 14968, + "time_per_iteration": 2.6043734550476074 + }, + { + "auxiliary_loss_clip": 0.01109308, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.03731859, + "balance_loss_mlp": 1.01942587, + "epoch": 0.899984969186833, + "flos": 25481412552960.0, + "grad_norm": 2.977640713899267, + "language_loss": 0.70850086, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.7298975, + "num_input_tokens_seen": 322856710, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10943604, + "step": 14969, + "time_per_iteration": 4.098467111587524 + }, + { + "auxiliary_loss_clip": 0.01113983, + "auxiliary_loss_mlp": 0.01030024, + "balance_loss_clip": 1.04047155, + "balance_loss_mlp": 1.01888347, + "epoch": 0.9000450924395009, + "flos": 25528973489280.0, + "grad_norm": 2.6095602616057407, + "language_loss": 0.76225138, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.78369153, + "num_input_tokens_seen": 322876070, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.1114502, + "step": 14970, + "time_per_iteration": 2.6720268726348877 + }, + { + "auxiliary_loss_clip": 0.01111939, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.03784382, + "balance_loss_mlp": 1.01992238, + "epoch": 0.900105215692169, + "flos": 20944570623840.0, + "grad_norm": 1.823715333802797, + "language_loss": 0.73336053, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.75478339, + "num_input_tokens_seen": 322895095, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.10430908, + "step": 14971, + "time_per_iteration": 2.6293158531188965 + }, + { + "auxiliary_loss_clip": 0.01110286, + "auxiliary_loss_mlp": 0.01024763, + "balance_loss_clip": 1.0378828, + "balance_loss_mlp": 1.01364625, + "epoch": 0.900165338944837, + "flos": 24321486902400.0, + "grad_norm": 3.0162676128518613, + "language_loss": 0.80987322, + "learning_rate": 1.035858993572476e-07, + "loss": 0.83122373, + "num_input_tokens_seen": 322911845, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11114502, + "step": 14972, + "time_per_iteration": 2.6069374084472656 + }, + { + "auxiliary_loss_clip": 0.0111258, + "auxiliary_loss_mlp": 0.010287, + "balance_loss_clip": 1.03754783, + "balance_loss_mlp": 1.01767325, + "epoch": 0.9002254621975049, + "flos": 19653550106400.0, + "grad_norm": 4.329097233588385, + "language_loss": 0.81491089, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.83632362, + "num_input_tokens_seen": 322928170, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11035156, + "step": 14973, + "time_per_iteration": 2.6114983558654785 + }, + { + "auxiliary_loss_clip": 0.01107919, + "auxiliary_loss_mlp": 0.01030726, + "balance_loss_clip": 1.03653765, + "balance_loss_mlp": 1.01955628, + "epoch": 0.9002855854501729, + "flos": 34747163434080.0, + "grad_norm": 2.0025385905940993, + "language_loss": 0.57977194, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.60115838, + "num_input_tokens_seen": 322948165, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11181641, + "step": 14974, + "time_per_iteration": 2.662015914916992 + }, + { + "auxiliary_loss_clip": 0.01111719, + "auxiliary_loss_mlp": 0.01031054, + "balance_loss_clip": 1.03974903, + "balance_loss_mlp": 1.01994991, + "epoch": 0.9003457087028408, + "flos": 31274477006400.0, + "grad_norm": 1.596038328147391, + "language_loss": 0.63542092, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.65684867, + "num_input_tokens_seen": 322968880, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11108398, + "step": 14975, + "time_per_iteration": 2.6877388954162598 + }, + { + "auxiliary_loss_clip": 0.01109583, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.03689277, + "balance_loss_mlp": 1.02398634, + "epoch": 0.9004058319555088, + "flos": 29760278545440.0, + "grad_norm": 1.7121828818825446, + "language_loss": 0.73322934, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.75467843, + "num_input_tokens_seen": 322989395, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11328125, + "step": 14976, + "time_per_iteration": 2.627304792404175 + }, + { + "auxiliary_loss_clip": 0.01111043, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.03886926, + "balance_loss_mlp": 1.0186317, + "epoch": 0.9004659552081767, + "flos": 35458909920000.0, + "grad_norm": 1.8705233443475648, + "language_loss": 0.6950388, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.71644789, + "num_input_tokens_seen": 323009060, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11242676, + "step": 14977, + "time_per_iteration": 2.6987814903259277 + }, + { + "auxiliary_loss_clip": 0.01112286, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.0382154, + "balance_loss_mlp": 1.02264786, + "epoch": 0.9005260784608448, + "flos": 20454989149440.0, + "grad_norm": 2.200370625365242, + "language_loss": 0.65635723, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.6778301, + "num_input_tokens_seen": 323027530, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12347412, + "step": 14978, + "time_per_iteration": 3.925612688064575 + }, + { + "auxiliary_loss_clip": 0.01114423, + "auxiliary_loss_mlp": 0.01033514, + "balance_loss_clip": 1.03975809, + "balance_loss_mlp": 1.02173066, + "epoch": 0.9005862017135127, + "flos": 24862802592960.0, + "grad_norm": 2.0565914307729214, + "language_loss": 0.79084522, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.81232458, + "num_input_tokens_seen": 323045370, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11785889, + "step": 14979, + "time_per_iteration": 2.622195243835449 + }, + { + "auxiliary_loss_clip": 0.01027491, + "auxiliary_loss_mlp": 0.01000297, + "balance_loss_clip": 1.00517774, + "balance_loss_mlp": 0.99938321, + "epoch": 0.9006463249661807, + "flos": 82462465402560.0, + "grad_norm": 0.7212097414898424, + "language_loss": 0.53561318, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.55589104, + "num_input_tokens_seen": 323105660, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00912476, + "step": 14980, + "time_per_iteration": 4.7246081829071045 + }, + { + "auxiliary_loss_clip": 0.01115007, + "auxiliary_loss_mlp": 0.01035137, + "balance_loss_clip": 1.03981662, + "balance_loss_mlp": 1.0233531, + "epoch": 0.9007064482188486, + "flos": 34524147559680.0, + "grad_norm": 1.7520521060347283, + "language_loss": 0.8229273, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.84442872, + "num_input_tokens_seen": 323126365, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11798096, + "step": 14981, + "time_per_iteration": 2.702214002609253 + }, + { + "auxiliary_loss_clip": 0.01108788, + "auxiliary_loss_mlp": 0.01028573, + "balance_loss_clip": 1.03899097, + "balance_loss_mlp": 1.01737952, + "epoch": 0.9007665714715166, + "flos": 26376352777440.0, + "grad_norm": 1.9078585493625593, + "language_loss": 0.81483936, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.83621299, + "num_input_tokens_seen": 323145655, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.11181641, + "step": 14982, + "time_per_iteration": 2.6261672973632812 + }, + { + "auxiliary_loss_clip": 0.01106417, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.03828323, + "balance_loss_mlp": 1.02132058, + "epoch": 0.9008266947241845, + "flos": 32386882237920.0, + "grad_norm": 2.239920954593851, + "language_loss": 0.71526182, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.73664099, + "num_input_tokens_seen": 323164540, + "router_z_loss_clip": 0.68212891, + "router_z_loss_mlp": 0.10186768, + "step": 14983, + "time_per_iteration": 2.6545443534851074 + }, + { + "auxiliary_loss_clip": 0.01109405, + "auxiliary_loss_mlp": 0.01022332, + "balance_loss_clip": 1.03953838, + "balance_loss_mlp": 1.01222348, + "epoch": 0.9008868179768525, + "flos": 28199856218400.0, + "grad_norm": 1.378693898055497, + "language_loss": 0.75269663, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.774014, + "num_input_tokens_seen": 323186960, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.10113525, + "step": 14984, + "time_per_iteration": 2.644078016281128 + }, + { + "auxiliary_loss_clip": 0.01104788, + "auxiliary_loss_mlp": 0.01029014, + "balance_loss_clip": 1.03498197, + "balance_loss_mlp": 1.01782656, + "epoch": 0.9009469412295206, + "flos": 23258060712000.0, + "grad_norm": 1.4985131683748647, + "language_loss": 0.70107383, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.72241187, + "num_input_tokens_seen": 323206135, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11193848, + "step": 14985, + "time_per_iteration": 2.62004017829895 + }, + { + "auxiliary_loss_clip": 0.01109702, + "auxiliary_loss_mlp": 0.0103074, + "balance_loss_clip": 1.03536856, + "balance_loss_mlp": 1.01912308, + "epoch": 0.9010070644821885, + "flos": 28336015745280.0, + "grad_norm": 2.233473592069944, + "language_loss": 0.70094258, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.72234702, + "num_input_tokens_seen": 323225980, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11608887, + "step": 14986, + "time_per_iteration": 2.616472005844116 + }, + { + "auxiliary_loss_clip": 0.01108806, + "auxiliary_loss_mlp": 0.01029554, + "balance_loss_clip": 1.03612602, + "balance_loss_mlp": 1.01810956, + "epoch": 0.9010671877348565, + "flos": 21212675880480.0, + "grad_norm": 1.959850705363879, + "language_loss": 0.7710017, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.79238534, + "num_input_tokens_seen": 323243700, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11450195, + "step": 14987, + "time_per_iteration": 2.6139371395111084 + }, + { + "auxiliary_loss_clip": 0.01109995, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.03828251, + "balance_loss_mlp": 1.01934028, + "epoch": 0.9011273109875244, + "flos": 26732489382720.0, + "grad_norm": 1.8822696552373068, + "language_loss": 0.73643994, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.75784516, + "num_input_tokens_seen": 323261535, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11187744, + "step": 14988, + "time_per_iteration": 2.644012451171875 + }, + { + "auxiliary_loss_clip": 0.01116102, + "auxiliary_loss_mlp": 0.01030725, + "balance_loss_clip": 1.04054403, + "balance_loss_mlp": 1.01879847, + "epoch": 0.9011874342401924, + "flos": 29359336178880.0, + "grad_norm": 2.2894391890194106, + "language_loss": 0.69284713, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.71431541, + "num_input_tokens_seen": 323281855, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.1192627, + "step": 14989, + "time_per_iteration": 4.0466697216033936 + }, + { + "auxiliary_loss_clip": 0.0111092, + "auxiliary_loss_mlp": 0.0103106, + "balance_loss_clip": 1.03818619, + "balance_loss_mlp": 1.02022409, + "epoch": 0.9012475574928603, + "flos": 20449276212960.0, + "grad_norm": 2.207826743846369, + "language_loss": 0.79901123, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.82043099, + "num_input_tokens_seen": 323299505, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.1083374, + "step": 14990, + "time_per_iteration": 2.61696720123291 + }, + { + "auxiliary_loss_clip": 0.01115078, + "auxiliary_loss_mlp": 0.01025911, + "balance_loss_clip": 1.03965497, + "balance_loss_mlp": 1.01485395, + "epoch": 0.9013076807455284, + "flos": 24369047838720.0, + "grad_norm": 1.8802589930166924, + "language_loss": 0.77985799, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.80126786, + "num_input_tokens_seen": 323318365, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.1105957, + "step": 14991, + "time_per_iteration": 2.7088823318481445 + }, + { + "auxiliary_loss_clip": 0.01027509, + "auxiliary_loss_mlp": 0.01000803, + "balance_loss_clip": 1.00518179, + "balance_loss_mlp": 0.99985927, + "epoch": 0.9013678039981963, + "flos": 79533080012160.0, + "grad_norm": 0.7780298808157696, + "language_loss": 0.60175622, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.62203938, + "num_input_tokens_seen": 323371835, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00941467, + "step": 14992, + "time_per_iteration": 3.1859397888183594 + }, + { + "auxiliary_loss_clip": 0.01109044, + "auxiliary_loss_mlp": 0.01024924, + "balance_loss_clip": 1.03657258, + "balance_loss_mlp": 1.01370096, + "epoch": 0.9014279272508643, + "flos": 25040080808640.0, + "grad_norm": 1.9166951444487064, + "language_loss": 0.82761872, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.84895837, + "num_input_tokens_seen": 323388495, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11224365, + "step": 14993, + "time_per_iteration": 3.0110106468200684 + }, + { + "auxiliary_loss_clip": 0.0111013, + "auxiliary_loss_mlp": 0.01029499, + "balance_loss_clip": 1.03737497, + "balance_loss_mlp": 1.01785207, + "epoch": 0.9014880505035322, + "flos": 21120471252000.0, + "grad_norm": 2.288112384810497, + "language_loss": 0.73031759, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.75171387, + "num_input_tokens_seen": 323405280, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11645508, + "step": 14994, + "time_per_iteration": 2.715036630630493 + }, + { + "auxiliary_loss_clip": 0.01107827, + "auxiliary_loss_mlp": 0.01025947, + "balance_loss_clip": 1.03768742, + "balance_loss_mlp": 1.01560569, + "epoch": 0.9015481737562002, + "flos": 34657795015200.0, + "grad_norm": 1.694100598004427, + "language_loss": 0.6476711, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.66900885, + "num_input_tokens_seen": 323425310, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10333252, + "step": 14995, + "time_per_iteration": 2.7103264331817627 + }, + { + "auxiliary_loss_clip": 0.01110647, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.03660524, + "balance_loss_mlp": 1.01792169, + "epoch": 0.9016082970088681, + "flos": 36304952137920.0, + "grad_norm": 1.9567025106229852, + "language_loss": 0.6644209, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.68582308, + "num_input_tokens_seen": 323447805, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11651611, + "step": 14996, + "time_per_iteration": 2.70346999168396 + }, + { + "auxiliary_loss_clip": 0.01107516, + "auxiliary_loss_mlp": 0.01025988, + "balance_loss_clip": 1.03647065, + "balance_loss_mlp": 1.01547384, + "epoch": 0.9016684202615362, + "flos": 28691139418560.0, + "grad_norm": 1.7292189630273365, + "language_loss": 0.66065824, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.68199331, + "num_input_tokens_seen": 323467150, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10516357, + "step": 14997, + "time_per_iteration": 2.677955389022827 + }, + { + "auxiliary_loss_clip": 0.01108865, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.03658795, + "balance_loss_mlp": 1.02056003, + "epoch": 0.9017285435142042, + "flos": 20718151297920.0, + "grad_norm": 1.8081953357659175, + "language_loss": 0.77888209, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.80028713, + "num_input_tokens_seen": 323484250, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11090088, + "step": 14998, + "time_per_iteration": 2.5865628719329834 + }, + { + "auxiliary_loss_clip": 0.01109517, + "auxiliary_loss_mlp": 0.01027501, + "balance_loss_clip": 1.03673577, + "balance_loss_mlp": 1.01623583, + "epoch": 0.9017886667668721, + "flos": 26104641482880.0, + "grad_norm": 1.7835370261224512, + "language_loss": 0.75166535, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.77303553, + "num_input_tokens_seen": 323502910, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11273193, + "step": 14999, + "time_per_iteration": 2.6300628185272217 + }, + { + "auxiliary_loss_clip": 0.01109334, + "auxiliary_loss_mlp": 0.01027278, + "balance_loss_clip": 1.03752732, + "balance_loss_mlp": 1.01566076, + "epoch": 0.9018487900195401, + "flos": 25620043633920.0, + "grad_norm": 2.838852605596595, + "language_loss": 0.75775337, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.77911955, + "num_input_tokens_seen": 323521820, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.1161499, + "step": 15000, + "time_per_iteration": 2.6075117588043213 + }, + { + "auxiliary_loss_clip": 0.01106432, + "auxiliary_loss_mlp": 0.01025195, + "balance_loss_clip": 1.03616869, + "balance_loss_mlp": 1.01500845, + "epoch": 0.901908913272208, + "flos": 65110574338560.0, + "grad_norm": 2.6469751019732795, + "language_loss": 0.8106038, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.83192009, + "num_input_tokens_seen": 323543200, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10186768, + "step": 15001, + "time_per_iteration": 2.8852741718292236 + }, + { + "auxiliary_loss_clip": 0.0110758, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.03742039, + "balance_loss_mlp": 1.01883101, + "epoch": 0.901969036524876, + "flos": 26954249221440.0, + "grad_norm": 1.4765462713093127, + "language_loss": 0.78380573, + "learning_rate": 9.990687143794407e-08, + "loss": 0.80517495, + "num_input_tokens_seen": 323563075, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.1050415, + "step": 15002, + "time_per_iteration": 2.6311659812927246 + }, + { + "auxiliary_loss_clip": 0.01112419, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.03952837, + "balance_loss_mlp": 1.02012062, + "epoch": 0.9020291597775439, + "flos": 29084424019200.0, + "grad_norm": 2.288130445696359, + "language_loss": 0.68027097, + "learning_rate": 9.978535328195347e-08, + "loss": 0.70171696, + "num_input_tokens_seen": 323579065, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.12072754, + "step": 15003, + "time_per_iteration": 2.635887622833252 + }, + { + "auxiliary_loss_clip": 0.01110404, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.03631067, + "balance_loss_mlp": 1.02081156, + "epoch": 0.902089283030212, + "flos": 22361864451840.0, + "grad_norm": 1.7253111179685319, + "language_loss": 0.85892266, + "learning_rate": 9.9663907182292e-08, + "loss": 0.8803497, + "num_input_tokens_seen": 323594835, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.1149292, + "step": 15004, + "time_per_iteration": 2.5969479084014893 + }, + { + "auxiliary_loss_clip": 0.01111476, + "auxiliary_loss_mlp": 0.01031244, + "balance_loss_clip": 1.03889859, + "balance_loss_mlp": 1.01985979, + "epoch": 0.9021494062828799, + "flos": 29492983634400.0, + "grad_norm": 2.3731675617079038, + "language_loss": 0.7186743, + "learning_rate": 9.954253314356575e-08, + "loss": 0.74010152, + "num_input_tokens_seen": 323611475, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11376953, + "step": 15005, + "time_per_iteration": 2.6572062969207764 + }, + { + "auxiliary_loss_clip": 0.01110732, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.03519964, + "balance_loss_mlp": 1.01897597, + "epoch": 0.9022095295355479, + "flos": 26376960536640.0, + "grad_norm": 1.8196445725390686, + "language_loss": 0.71188426, + "learning_rate": 9.942123117037748e-08, + "loss": 0.73329937, + "num_input_tokens_seen": 323629730, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11798096, + "step": 15006, + "time_per_iteration": 2.6600759029388428 + }, + { + "auxiliary_loss_clip": 0.01112298, + "auxiliary_loss_mlp": 0.01025667, + "balance_loss_clip": 1.03835225, + "balance_loss_mlp": 1.01491439, + "epoch": 0.9022696527882158, + "flos": 22850351959680.0, + "grad_norm": 1.8806526243803874, + "language_loss": 0.8432467, + "learning_rate": 9.930000126732618e-08, + "loss": 0.86462641, + "num_input_tokens_seen": 323646000, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.10754395, + "step": 15007, + "time_per_iteration": 2.5781102180480957 + }, + { + "auxiliary_loss_clip": 0.0110967, + "auxiliary_loss_mlp": 0.01030967, + "balance_loss_clip": 1.0390898, + "balance_loss_mlp": 1.0200659, + "epoch": 0.9023297760408838, + "flos": 32654501287200.0, + "grad_norm": 1.81124131144261, + "language_loss": 0.78600305, + "learning_rate": 9.917884343900928e-08, + "loss": 0.80740941, + "num_input_tokens_seen": 323667250, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10919189, + "step": 15008, + "time_per_iteration": 2.7420148849487305 + }, + { + "auxiliary_loss_clip": 0.01106567, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.03878069, + "balance_loss_mlp": 1.01700139, + "epoch": 0.9023898992935517, + "flos": 25041579948000.0, + "grad_norm": 1.9269438129118877, + "language_loss": 0.73512083, + "learning_rate": 9.905775769002156e-08, + "loss": 0.75646305, + "num_input_tokens_seen": 323687150, + "router_z_loss_clip": 0.67773438, + "router_z_loss_mlp": 0.10656738, + "step": 15009, + "time_per_iteration": 4.077061891555786 + }, + { + "auxiliary_loss_clip": 0.0110951, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.03812015, + "balance_loss_mlp": 1.0184989, + "epoch": 0.9024500225462198, + "flos": 21300423608160.0, + "grad_norm": 1.8858705469444603, + "language_loss": 0.73307973, + "learning_rate": 9.893674402495399e-08, + "loss": 0.75447273, + "num_input_tokens_seen": 323703660, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11291504, + "step": 15010, + "time_per_iteration": 2.6463987827301025 + }, + { + "auxiliary_loss_clip": 0.0111008, + "auxiliary_loss_mlp": 0.01029146, + "balance_loss_clip": 1.03712428, + "balance_loss_mlp": 1.01746976, + "epoch": 0.9025101457988878, + "flos": 25396825173120.0, + "grad_norm": 2.2375971937770425, + "language_loss": 0.73998702, + "learning_rate": 9.881580244839538e-08, + "loss": 0.76137924, + "num_input_tokens_seen": 323722060, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11682129, + "step": 15011, + "time_per_iteration": 2.6097917556762695 + }, + { + "auxiliary_loss_clip": 0.01113717, + "auxiliary_loss_mlp": 0.01029452, + "balance_loss_clip": 1.03813791, + "balance_loss_mlp": 1.01785874, + "epoch": 0.9025702690515557, + "flos": 23215848056640.0, + "grad_norm": 2.7089534561560584, + "language_loss": 0.73479986, + "learning_rate": 9.869493296493204e-08, + "loss": 0.75623155, + "num_input_tokens_seen": 323740645, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11578369, + "step": 15012, + "time_per_iteration": 2.669255495071411 + }, + { + "auxiliary_loss_clip": 0.01110112, + "auxiliary_loss_mlp": 0.01033434, + "balance_loss_clip": 1.03862631, + "balance_loss_mlp": 1.02259803, + "epoch": 0.9026303923042237, + "flos": 24016922444160.0, + "grad_norm": 1.6605888914259213, + "language_loss": 0.69510841, + "learning_rate": 9.857413557914763e-08, + "loss": 0.71654385, + "num_input_tokens_seen": 323758905, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.1083374, + "step": 15013, + "time_per_iteration": 2.629716396331787 + }, + { + "auxiliary_loss_clip": 0.01105438, + "auxiliary_loss_mlp": 0.01029045, + "balance_loss_clip": 1.0359031, + "balance_loss_mlp": 1.01857817, + "epoch": 0.9026905155568916, + "flos": 30027168283680.0, + "grad_norm": 1.4629174708785377, + "language_loss": 0.72885925, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75020409, + "num_input_tokens_seen": 323780595, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10473633, + "step": 15014, + "time_per_iteration": 2.6774959564208984 + }, + { + "auxiliary_loss_clip": 0.01107951, + "auxiliary_loss_mlp": 0.0102639, + "balance_loss_clip": 1.03589892, + "balance_loss_mlp": 1.01493979, + "epoch": 0.9027506388095596, + "flos": 25040729085120.0, + "grad_norm": 1.7702537427025729, + "language_loss": 0.722049, + "learning_rate": 9.833275711893474e-08, + "loss": 0.74339235, + "num_input_tokens_seen": 323798160, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11462402, + "step": 15015, + "time_per_iteration": 2.6287081241607666 + }, + { + "auxiliary_loss_clip": 0.01110225, + "auxiliary_loss_mlp": 0.01028856, + "balance_loss_clip": 1.0373168, + "balance_loss_mlp": 1.0180552, + "epoch": 0.9028107620622275, + "flos": 27801790578720.0, + "grad_norm": 1.9881878618621167, + "language_loss": 0.68938136, + "learning_rate": 9.821217605365895e-08, + "loss": 0.71077222, + "num_input_tokens_seen": 323816810, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.10797119, + "step": 15016, + "time_per_iteration": 2.6362380981445312 + }, + { + "auxiliary_loss_clip": 0.01106491, + "auxiliary_loss_mlp": 0.01027264, + "balance_loss_clip": 1.03604984, + "balance_loss_mlp": 1.0167439, + "epoch": 0.9028708853148956, + "flos": 31006371749760.0, + "grad_norm": 2.2978609232190252, + "language_loss": 0.70553005, + "learning_rate": 9.809166710436855e-08, + "loss": 0.72686756, + "num_input_tokens_seen": 323836900, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10516357, + "step": 15017, + "time_per_iteration": 4.090786695480347 + }, + { + "auxiliary_loss_clip": 0.01112246, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.04103744, + "balance_loss_mlp": 1.01851392, + "epoch": 0.9029310085675635, + "flos": 26687562069600.0, + "grad_norm": 1.7298074999204933, + "language_loss": 0.69574177, + "learning_rate": 9.797123027563237e-08, + "loss": 0.7171526, + "num_input_tokens_seen": 323855325, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10321045, + "step": 15018, + "time_per_iteration": 2.624093532562256 + }, + { + "auxiliary_loss_clip": 0.01111608, + "auxiliary_loss_mlp": 0.01029088, + "balance_loss_clip": 1.03931165, + "balance_loss_mlp": 1.01757216, + "epoch": 0.9029911318202315, + "flos": 31987195907040.0, + "grad_norm": 1.7350051582559267, + "language_loss": 0.68596172, + "learning_rate": 9.785086557201782e-08, + "loss": 0.70736861, + "num_input_tokens_seen": 323875650, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11517334, + "step": 15019, + "time_per_iteration": 4.1629743576049805 + }, + { + "auxiliary_loss_clip": 0.01106006, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.03605795, + "balance_loss_mlp": 1.02092147, + "epoch": 0.9030512550728994, + "flos": 19475704648800.0, + "grad_norm": 1.840283702812054, + "language_loss": 0.72423124, + "learning_rate": 9.773057299808951e-08, + "loss": 0.74560529, + "num_input_tokens_seen": 323892920, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.10473633, + "step": 15020, + "time_per_iteration": 2.631375789642334 + }, + { + "auxiliary_loss_clip": 0.01108793, + "auxiliary_loss_mlp": 0.01028255, + "balance_loss_clip": 1.03598893, + "balance_loss_mlp": 1.01680493, + "epoch": 0.9031113783255674, + "flos": 29270008277280.0, + "grad_norm": 1.7666916936258215, + "language_loss": 0.74563575, + "learning_rate": 9.7610352558408e-08, + "loss": 0.76700628, + "num_input_tokens_seen": 323913835, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11456299, + "step": 15021, + "time_per_iteration": 2.621669292449951 + }, + { + "auxiliary_loss_clip": 0.01113652, + "auxiliary_loss_mlp": 0.01030484, + "balance_loss_clip": 1.03862607, + "balance_loss_mlp": 1.01897418, + "epoch": 0.9031715015782353, + "flos": 27133958473920.0, + "grad_norm": 3.000469446357992, + "language_loss": 0.73249221, + "learning_rate": 9.749020425753251e-08, + "loss": 0.75393361, + "num_input_tokens_seen": 323933440, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.11517334, + "step": 15022, + "time_per_iteration": 2.6778364181518555 + }, + { + "auxiliary_loss_clip": 0.011051, + "auxiliary_loss_mlp": 0.01025675, + "balance_loss_clip": 1.03758192, + "balance_loss_mlp": 1.01527429, + "epoch": 0.9032316248309034, + "flos": 32119992499680.0, + "grad_norm": 1.9277970218216678, + "language_loss": 0.7230432, + "learning_rate": 9.737012810001943e-08, + "loss": 0.74435097, + "num_input_tokens_seen": 323954090, + "router_z_loss_clip": 0.67480469, + "router_z_loss_mlp": 0.10406494, + "step": 15023, + "time_per_iteration": 2.674656867980957 + }, + { + "auxiliary_loss_clip": 0.01108998, + "auxiliary_loss_mlp": 0.01028945, + "balance_loss_clip": 1.03779447, + "balance_loss_mlp": 1.01837659, + "epoch": 0.9032917480835713, + "flos": 27621595118880.0, + "grad_norm": 1.7800621853163838, + "language_loss": 0.82447541, + "learning_rate": 9.725012409042155e-08, + "loss": 0.84585488, + "num_input_tokens_seen": 323974040, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10565186, + "step": 15024, + "time_per_iteration": 2.697307825088501 + }, + { + "auxiliary_loss_clip": 0.0111005, + "auxiliary_loss_mlp": 0.01026406, + "balance_loss_clip": 1.03668797, + "balance_loss_mlp": 1.01540256, + "epoch": 0.9033518713362393, + "flos": 29137130650080.0, + "grad_norm": 1.5770531598420308, + "language_loss": 0.69853926, + "learning_rate": 9.713019223328966e-08, + "loss": 0.71990383, + "num_input_tokens_seen": 323996125, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11004639, + "step": 15025, + "time_per_iteration": 2.6548948287963867 + }, + { + "auxiliary_loss_clip": 0.01106, + "auxiliary_loss_mlp": 0.01031014, + "balance_loss_clip": 1.03560364, + "balance_loss_mlp": 1.02017832, + "epoch": 0.9034119945889073, + "flos": 32830482949920.0, + "grad_norm": 2.7946463072541405, + "language_loss": 0.76982546, + "learning_rate": 9.70103325331717e-08, + "loss": 0.79119563, + "num_input_tokens_seen": 324017645, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.1083374, + "step": 15026, + "time_per_iteration": 2.6736178398132324 + }, + { + "auxiliary_loss_clip": 0.01110722, + "auxiliary_loss_mlp": 0.01026401, + "balance_loss_clip": 1.03973413, + "balance_loss_mlp": 1.01617861, + "epoch": 0.9034721178415752, + "flos": 25441428348000.0, + "grad_norm": 1.6802054826230735, + "language_loss": 0.68281406, + "learning_rate": 9.68905449946129e-08, + "loss": 0.70418537, + "num_input_tokens_seen": 324036875, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10217285, + "step": 15027, + "time_per_iteration": 2.6481974124908447 + }, + { + "auxiliary_loss_clip": 0.01104911, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.0368911, + "balance_loss_mlp": 1.01853824, + "epoch": 0.9035322410942432, + "flos": 27130676574240.0, + "grad_norm": 1.6639580175665707, + "language_loss": 0.7569344, + "learning_rate": 9.677082962215477e-08, + "loss": 0.7782734, + "num_input_tokens_seen": 324057045, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.10449219, + "step": 15028, + "time_per_iteration": 2.60257625579834 + }, + { + "auxiliary_loss_clip": 0.01108259, + "auxiliary_loss_mlp": 0.01035259, + "balance_loss_clip": 1.0374316, + "balance_loss_mlp": 1.02410758, + "epoch": 0.9035923643469111, + "flos": 31632031716480.0, + "grad_norm": 1.8953389905005484, + "language_loss": 0.69037795, + "learning_rate": 9.665118642033765e-08, + "loss": 0.71181315, + "num_input_tokens_seen": 324079735, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11157227, + "step": 15029, + "time_per_iteration": 3.9866538047790527 + }, + { + "auxiliary_loss_clip": 0.01113297, + "auxiliary_loss_mlp": 0.01027827, + "balance_loss_clip": 1.0390811, + "balance_loss_mlp": 1.01629949, + "epoch": 0.9036524875995792, + "flos": 24818604590880.0, + "grad_norm": 2.05197092496639, + "language_loss": 0.73701286, + "learning_rate": 9.653161539369858e-08, + "loss": 0.7584241, + "num_input_tokens_seen": 324097785, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11529541, + "step": 15030, + "time_per_iteration": 2.6405770778656006 + }, + { + "auxiliary_loss_clip": 0.01110333, + "auxiliary_loss_mlp": 0.0103002, + "balance_loss_clip": 1.03687549, + "balance_loss_mlp": 1.01864767, + "epoch": 0.9037126108522471, + "flos": 48949402057920.0, + "grad_norm": 2.055889696086326, + "language_loss": 0.68356276, + "learning_rate": 9.641211654677151e-08, + "loss": 0.70496631, + "num_input_tokens_seen": 324121625, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11364746, + "step": 15031, + "time_per_iteration": 2.78277587890625 + }, + { + "auxiliary_loss_clip": 0.01105001, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.03505266, + "balance_loss_mlp": 1.01516974, + "epoch": 0.9037727341049151, + "flos": 28469906304480.0, + "grad_norm": 2.095751630918249, + "language_loss": 0.76454568, + "learning_rate": 9.629268988408723e-08, + "loss": 0.78585392, + "num_input_tokens_seen": 324142535, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10650635, + "step": 15032, + "time_per_iteration": 2.667868137359619 + }, + { + "auxiliary_loss_clip": 0.01110568, + "auxiliary_loss_mlp": 0.01030448, + "balance_loss_clip": 1.03771687, + "balance_loss_mlp": 1.01936126, + "epoch": 0.903832857357583, + "flos": 15645706614720.0, + "grad_norm": 14.62834028002444, + "language_loss": 0.75268501, + "learning_rate": 9.617333541017502e-08, + "loss": 0.77409518, + "num_input_tokens_seen": 324159610, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11077881, + "step": 15033, + "time_per_iteration": 2.5757782459259033 + }, + { + "auxiliary_loss_clip": 0.01111467, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.03808784, + "balance_loss_mlp": 1.02417672, + "epoch": 0.903892980610251, + "flos": 31363237666080.0, + "grad_norm": 1.7395608762686061, + "language_loss": 0.73765385, + "learning_rate": 9.605405312956105e-08, + "loss": 0.75912237, + "num_input_tokens_seen": 324182510, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11206055, + "step": 15034, + "time_per_iteration": 2.6814796924591064 + }, + { + "auxiliary_loss_clip": 0.01110016, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.03849554, + "balance_loss_mlp": 1.01764226, + "epoch": 0.9039531038629189, + "flos": 17917105599360.0, + "grad_norm": 3.2546185036426145, + "language_loss": 0.63488913, + "learning_rate": 9.593484304676791e-08, + "loss": 0.65627652, + "num_input_tokens_seen": 324200555, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11077881, + "step": 15035, + "time_per_iteration": 2.66886568069458 + }, + { + "auxiliary_loss_clip": 0.01110569, + "auxiliary_loss_mlp": 0.01029689, + "balance_loss_clip": 1.03825986, + "balance_loss_mlp": 1.0176127, + "epoch": 0.904013227115587, + "flos": 29315462315040.0, + "grad_norm": 3.6902790048613623, + "language_loss": 0.62757987, + "learning_rate": 9.581570516631643e-08, + "loss": 0.64898241, + "num_input_tokens_seen": 324220255, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.12084961, + "step": 15036, + "time_per_iteration": 2.6792008876800537 + }, + { + "auxiliary_loss_clip": 0.0110654, + "auxiliary_loss_mlp": 0.01027446, + "balance_loss_clip": 1.03792953, + "balance_loss_mlp": 1.01662207, + "epoch": 0.9040733503682549, + "flos": 27889700375520.0, + "grad_norm": 1.7872613834768474, + "language_loss": 0.82250583, + "learning_rate": 9.569663949272455e-08, + "loss": 0.84384573, + "num_input_tokens_seen": 324237855, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.10821533, + "step": 15037, + "time_per_iteration": 2.61104416847229 + }, + { + "auxiliary_loss_clip": 0.01112109, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.03851974, + "balance_loss_mlp": 1.01640856, + "epoch": 0.9041334736209229, + "flos": 24373383187680.0, + "grad_norm": 2.002773418158051, + "language_loss": 0.67610478, + "learning_rate": 9.557764603050667e-08, + "loss": 0.69750136, + "num_input_tokens_seen": 324257050, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11126709, + "step": 15038, + "time_per_iteration": 2.675438165664673 + }, + { + "auxiliary_loss_clip": 0.01108384, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.03622365, + "balance_loss_mlp": 1.02009702, + "epoch": 0.9041935968735909, + "flos": 21390521338080.0, + "grad_norm": 1.9835390833876168, + "language_loss": 0.75132138, + "learning_rate": 9.545872478417494e-08, + "loss": 0.7727201, + "num_input_tokens_seen": 324275510, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11395264, + "step": 15039, + "time_per_iteration": 2.6305811405181885 + }, + { + "auxiliary_loss_clip": 0.01108289, + "auxiliary_loss_mlp": 0.01028093, + "balance_loss_clip": 1.03827524, + "balance_loss_mlp": 1.01718545, + "epoch": 0.9042537201262588, + "flos": 27796887987840.0, + "grad_norm": 1.5871539978188007, + "language_loss": 0.70051301, + "learning_rate": 9.533987575823977e-08, + "loss": 0.7218768, + "num_input_tokens_seen": 324295150, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10900879, + "step": 15040, + "time_per_iteration": 2.6917314529418945 + }, + { + "auxiliary_loss_clip": 0.01106443, + "auxiliary_loss_mlp": 0.0102553, + "balance_loss_clip": 1.03629839, + "balance_loss_mlp": 1.01444924, + "epoch": 0.9043138433789268, + "flos": 25130502676800.0, + "grad_norm": 1.8167859521205716, + "language_loss": 0.67663741, + "learning_rate": 9.522109895720709e-08, + "loss": 0.69795716, + "num_input_tokens_seen": 324313855, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11077881, + "step": 15041, + "time_per_iteration": 2.6302895545959473 + }, + { + "auxiliary_loss_clip": 0.01108727, + "auxiliary_loss_mlp": 0.0102932, + "balance_loss_clip": 1.0368228, + "balance_loss_mlp": 1.0180428, + "epoch": 0.9043739666315948, + "flos": 40222454796000.0, + "grad_norm": 2.0268862873800493, + "language_loss": 0.57907772, + "learning_rate": 9.510239438558155e-08, + "loss": 0.6004582, + "num_input_tokens_seen": 324338465, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11279297, + "step": 15042, + "time_per_iteration": 2.766566038131714 + }, + { + "auxiliary_loss_clip": 0.01027429, + "auxiliary_loss_mlp": 0.0100113, + "balance_loss_clip": 1.00511122, + "balance_loss_mlp": 1.00017738, + "epoch": 0.9044340898842628, + "flos": 82115364150720.0, + "grad_norm": 0.7729362144488213, + "language_loss": 0.56851983, + "learning_rate": 9.498376204786351e-08, + "loss": 0.58880544, + "num_input_tokens_seen": 324398740, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00951385, + "step": 15043, + "time_per_iteration": 3.240577459335327 + }, + { + "auxiliary_loss_clip": 0.01109317, + "auxiliary_loss_mlp": 0.01026619, + "balance_loss_clip": 1.03682721, + "balance_loss_mlp": 1.01486468, + "epoch": 0.9044942131369307, + "flos": 21613334626080.0, + "grad_norm": 1.8809631154538726, + "language_loss": 0.70143151, + "learning_rate": 9.486520194855274e-08, + "loss": 0.7227909, + "num_input_tokens_seen": 324417335, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11755371, + "step": 15044, + "time_per_iteration": 2.6596391201019287 + }, + { + "auxiliary_loss_clip": 0.01110148, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.03716254, + "balance_loss_mlp": 1.02341962, + "epoch": 0.9045543363895987, + "flos": 21744591562080.0, + "grad_norm": 3.820870885375902, + "language_loss": 0.69480652, + "learning_rate": 9.474671409214407e-08, + "loss": 0.71626067, + "num_input_tokens_seen": 324433240, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.1184082, + "step": 15045, + "time_per_iteration": 2.622486114501953 + }, + { + "auxiliary_loss_clip": 0.01114489, + "auxiliary_loss_mlp": 0.0103095, + "balance_loss_clip": 1.04054177, + "balance_loss_mlp": 1.01909423, + "epoch": 0.9046144596422666, + "flos": 26689142243520.0, + "grad_norm": 11.247079517232436, + "language_loss": 0.65620613, + "learning_rate": 9.462829848313081e-08, + "loss": 0.67766052, + "num_input_tokens_seen": 324452675, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11846924, + "step": 15046, + "time_per_iteration": 2.6317272186279297 + }, + { + "auxiliary_loss_clip": 0.01110788, + "auxiliary_loss_mlp": 0.01035909, + "balance_loss_clip": 1.03671479, + "balance_loss_mlp": 1.0246551, + "epoch": 0.9046745828949346, + "flos": 21564274550400.0, + "grad_norm": 3.322577513456221, + "language_loss": 0.62319601, + "learning_rate": 9.450995512600379e-08, + "loss": 0.64466298, + "num_input_tokens_seen": 324467865, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11260986, + "step": 15047, + "time_per_iteration": 2.6681995391845703 + }, + { + "auxiliary_loss_clip": 0.01109166, + "auxiliary_loss_mlp": 0.01030851, + "balance_loss_clip": 1.03848505, + "balance_loss_mlp": 1.02054548, + "epoch": 0.9047347061476025, + "flos": 31362710941440.0, + "grad_norm": 1.3967540193462333, + "language_loss": 0.7122333, + "learning_rate": 9.439168402525032e-08, + "loss": 0.73363346, + "num_input_tokens_seen": 324490430, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10308838, + "step": 15048, + "time_per_iteration": 2.6805191040039062 + }, + { + "auxiliary_loss_clip": 0.01109934, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.03558087, + "balance_loss_mlp": 1.02028966, + "epoch": 0.9047948294002706, + "flos": 18495609802560.0, + "grad_norm": 2.0959003722957132, + "language_loss": 0.74813342, + "learning_rate": 9.427348518535483e-08, + "loss": 0.76955408, + "num_input_tokens_seen": 324506620, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11865234, + "step": 15049, + "time_per_iteration": 4.026185035705566 + }, + { + "auxiliary_loss_clip": 0.01109118, + "auxiliary_loss_mlp": 0.01028759, + "balance_loss_clip": 1.03932369, + "balance_loss_mlp": 1.01764882, + "epoch": 0.9048549526529385, + "flos": 26688656036160.0, + "grad_norm": 2.03751641520873, + "language_loss": 0.75776982, + "learning_rate": 9.415535861079993e-08, + "loss": 0.77914852, + "num_input_tokens_seen": 324525505, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.11108398, + "step": 15050, + "time_per_iteration": 2.638787031173706 + }, + { + "auxiliary_loss_clip": 0.01109162, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.0363183, + "balance_loss_mlp": 1.02033901, + "epoch": 0.9049150759056065, + "flos": 28732177072800.0, + "grad_norm": 2.7177223426820167, + "language_loss": 0.81827545, + "learning_rate": 9.403730430606472e-08, + "loss": 0.83968139, + "num_input_tokens_seen": 324544415, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11096191, + "step": 15051, + "time_per_iteration": 2.6456727981567383 + }, + { + "auxiliary_loss_clip": 0.01109803, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.03811193, + "balance_loss_mlp": 1.01866186, + "epoch": 0.9049751991582745, + "flos": 23971914096480.0, + "grad_norm": 2.126771034907158, + "language_loss": 0.8873449, + "learning_rate": 9.391932227562582e-08, + "loss": 0.90873241, + "num_input_tokens_seen": 324562555, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10284424, + "step": 15052, + "time_per_iteration": 2.650536060333252 + }, + { + "auxiliary_loss_clip": 0.01113357, + "auxiliary_loss_mlp": 0.01027684, + "balance_loss_clip": 1.03862345, + "balance_loss_mlp": 1.01657403, + "epoch": 0.9050353224109424, + "flos": 19030159107360.0, + "grad_norm": 1.8847831209587154, + "language_loss": 0.77034497, + "learning_rate": 9.380141252395724e-08, + "loss": 0.79175538, + "num_input_tokens_seen": 324580865, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11096191, + "step": 15053, + "time_per_iteration": 2.5958993434906006 + }, + { + "auxiliary_loss_clip": 0.01107883, + "auxiliary_loss_mlp": 0.01032469, + "balance_loss_clip": 1.03722906, + "balance_loss_mlp": 1.02128112, + "epoch": 0.9050954456636104, + "flos": 34390257000480.0, + "grad_norm": 1.7738972901178722, + "language_loss": 0.73148155, + "learning_rate": 9.368357505553049e-08, + "loss": 0.7528851, + "num_input_tokens_seen": 324600665, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11187744, + "step": 15054, + "time_per_iteration": 2.719968557357788 + }, + { + "auxiliary_loss_clip": 0.01109001, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.0377363, + "balance_loss_mlp": 1.01807201, + "epoch": 0.9051555689162784, + "flos": 31398846004800.0, + "grad_norm": 1.8400286881157026, + "language_loss": 0.83508337, + "learning_rate": 9.356580987481333e-08, + "loss": 0.85646129, + "num_input_tokens_seen": 324618145, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10723877, + "step": 15055, + "time_per_iteration": 2.8079018592834473 + }, + { + "auxiliary_loss_clip": 0.01107159, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.03705752, + "balance_loss_mlp": 1.02034354, + "epoch": 0.9052156921689464, + "flos": 28378836159840.0, + "grad_norm": 1.7662894862063974, + "language_loss": 0.85110378, + "learning_rate": 9.344811698627176e-08, + "loss": 0.8724907, + "num_input_tokens_seen": 324638165, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.11199951, + "step": 15056, + "time_per_iteration": 2.7284181118011475 + }, + { + "auxiliary_loss_clip": 0.01108064, + "auxiliary_loss_mlp": 0.01026042, + "balance_loss_clip": 1.03707123, + "balance_loss_mlp": 1.01536655, + "epoch": 0.9052758154216143, + "flos": 36075615567840.0, + "grad_norm": 1.895003921379226, + "language_loss": 0.72276664, + "learning_rate": 9.333049639436863e-08, + "loss": 0.74410772, + "num_input_tokens_seen": 324658560, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10687256, + "step": 15057, + "time_per_iteration": 4.041747570037842 + }, + { + "auxiliary_loss_clip": 0.01107399, + "auxiliary_loss_mlp": 0.01025094, + "balance_loss_clip": 1.03708613, + "balance_loss_mlp": 1.01444256, + "epoch": 0.9053359386742823, + "flos": 27000311018400.0, + "grad_norm": 1.7545751756845238, + "language_loss": 0.81230247, + "learning_rate": 9.321294810356418e-08, + "loss": 0.8336274, + "num_input_tokens_seen": 324679185, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10644531, + "step": 15058, + "time_per_iteration": 2.670882225036621 + }, + { + "auxiliary_loss_clip": 0.01027656, + "auxiliary_loss_mlp": 0.01001915, + "balance_loss_clip": 1.00537062, + "balance_loss_mlp": 1.00099659, + "epoch": 0.9053960619269502, + "flos": 81862452874080.0, + "grad_norm": 0.6706332298822227, + "language_loss": 0.51378095, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53407669, + "num_input_tokens_seen": 324744830, + "router_z_loss_clip": 0.22277832, + "router_z_loss_mlp": 0.00917053, + "step": 15059, + "time_per_iteration": 4.787571430206299 + }, + { + "auxiliary_loss_clip": 0.01108672, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.03675556, + "balance_loss_mlp": 1.01871252, + "epoch": 0.9054561851796182, + "flos": 19297616087520.0, + "grad_norm": 2.733273475543257, + "language_loss": 0.67204118, + "learning_rate": 9.297806844307831e-08, + "loss": 0.69342619, + "num_input_tokens_seen": 324762905, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11120605, + "step": 15060, + "time_per_iteration": 2.630934715270996 + }, + { + "auxiliary_loss_clip": 0.01112509, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.03864837, + "balance_loss_mlp": 1.02035272, + "epoch": 0.9055163084322861, + "flos": 21435002961120.0, + "grad_norm": 2.116943637794578, + "language_loss": 0.64145303, + "learning_rate": 9.286073708230357e-08, + "loss": 0.66289091, + "num_input_tokens_seen": 324781905, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.10931396, + "step": 15061, + "time_per_iteration": 2.5872750282287598 + }, + { + "auxiliary_loss_clip": 0.01112554, + "auxiliary_loss_mlp": 0.01031547, + "balance_loss_clip": 1.03992414, + "balance_loss_mlp": 1.02032924, + "epoch": 0.9055764316849542, + "flos": 21523601551680.0, + "grad_norm": 2.408575403876542, + "language_loss": 0.71691686, + "learning_rate": 9.274347804044058e-08, + "loss": 0.7383579, + "num_input_tokens_seen": 324799260, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11218262, + "step": 15062, + "time_per_iteration": 2.670830249786377 + }, + { + "auxiliary_loss_clip": 0.01106217, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.03578472, + "balance_loss_mlp": 1.02106249, + "epoch": 0.9056365549376221, + "flos": 24551431231680.0, + "grad_norm": 2.329883650217011, + "language_loss": 0.70715857, + "learning_rate": 9.2626291321936e-08, + "loss": 0.7285378, + "num_input_tokens_seen": 324817800, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.10644531, + "step": 15063, + "time_per_iteration": 2.5976734161376953 + }, + { + "auxiliary_loss_clip": 0.01106315, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.0370295, + "balance_loss_mlp": 1.0207231, + "epoch": 0.9056966781902901, + "flos": 33676727754240.0, + "grad_norm": 1.7130576154915587, + "language_loss": 0.723122, + "learning_rate": 9.250917693123406e-08, + "loss": 0.74450088, + "num_input_tokens_seen": 324838445, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.10864258, + "step": 15064, + "time_per_iteration": 2.7461166381835938 + }, + { + "auxiliary_loss_clip": 0.01108879, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.03524804, + "balance_loss_mlp": 1.02156234, + "epoch": 0.9057568014429581, + "flos": 31628304126720.0, + "grad_norm": 1.7683493123651937, + "language_loss": 0.69314736, + "learning_rate": 9.23921348727752e-08, + "loss": 0.71456492, + "num_input_tokens_seen": 324859895, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11322021, + "step": 15065, + "time_per_iteration": 2.6483964920043945 + }, + { + "auxiliary_loss_clip": 0.01111342, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.03947806, + "balance_loss_mlp": 1.02087903, + "epoch": 0.905816924695626, + "flos": 27979798105440.0, + "grad_norm": 1.9085594108108221, + "language_loss": 0.63161266, + "learning_rate": 9.227516515099743e-08, + "loss": 0.65304679, + "num_input_tokens_seen": 324879580, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11199951, + "step": 15066, + "time_per_iteration": 2.6682398319244385 + }, + { + "auxiliary_loss_clip": 0.01112894, + "auxiliary_loss_mlp": 0.01026467, + "balance_loss_clip": 1.03692532, + "balance_loss_mlp": 1.01406908, + "epoch": 0.905877047948294, + "flos": 27037215910080.0, + "grad_norm": 7.0816586158470125, + "language_loss": 0.79735416, + "learning_rate": 9.215826777033675e-08, + "loss": 0.81874776, + "num_input_tokens_seen": 324898950, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12390137, + "step": 15067, + "time_per_iteration": 2.625978469848633 + }, + { + "auxiliary_loss_clip": 0.01111133, + "auxiliary_loss_mlp": 0.01032673, + "balance_loss_clip": 1.03787303, + "balance_loss_mlp": 1.02139556, + "epoch": 0.905937171200962, + "flos": 18674508709440.0, + "grad_norm": 1.6474666019196567, + "language_loss": 0.69701558, + "learning_rate": 9.204144273522563e-08, + "loss": 0.71845365, + "num_input_tokens_seen": 324917455, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11273193, + "step": 15068, + "time_per_iteration": 4.081158399581909 + }, + { + "auxiliary_loss_clip": 0.0110578, + "auxiliary_loss_mlp": 0.01025636, + "balance_loss_clip": 1.03606129, + "balance_loss_mlp": 1.01496696, + "epoch": 0.90599729445363, + "flos": 23748736152960.0, + "grad_norm": 2.0327975349880405, + "language_loss": 0.85047388, + "learning_rate": 9.19246900500943e-08, + "loss": 0.87178808, + "num_input_tokens_seen": 324934495, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10668945, + "step": 15069, + "time_per_iteration": 2.668409585952759 + }, + { + "auxiliary_loss_clip": 0.01114326, + "auxiliary_loss_mlp": 0.01029594, + "balance_loss_clip": 1.03808737, + "balance_loss_mlp": 1.0176847, + "epoch": 0.9060574177062979, + "flos": 28959204157920.0, + "grad_norm": 3.2990709813513597, + "language_loss": 0.5966301, + "learning_rate": 9.180800971936987e-08, + "loss": 0.61806929, + "num_input_tokens_seen": 324953230, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11907959, + "step": 15070, + "time_per_iteration": 2.626631736755371 + }, + { + "auxiliary_loss_clip": 0.01111166, + "auxiliary_loss_mlp": 0.01024577, + "balance_loss_clip": 1.03655005, + "balance_loss_mlp": 1.01316321, + "epoch": 0.9061175409589659, + "flos": 21122821254240.0, + "grad_norm": 3.1407072164224448, + "language_loss": 0.81173199, + "learning_rate": 9.169140174747724e-08, + "loss": 0.83308941, + "num_input_tokens_seen": 324969880, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11407471, + "step": 15071, + "time_per_iteration": 2.6138672828674316 + }, + { + "auxiliary_loss_clip": 0.01113664, + "auxiliary_loss_mlp": 0.0103375, + "balance_loss_clip": 1.03817701, + "balance_loss_mlp": 1.02151299, + "epoch": 0.9061776642116338, + "flos": 21693708208800.0, + "grad_norm": 1.8292054278537098, + "language_loss": 0.61813247, + "learning_rate": 9.157486613883758e-08, + "loss": 0.6396066, + "num_input_tokens_seen": 324987005, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12249756, + "step": 15072, + "time_per_iteration": 2.5560946464538574 + }, + { + "auxiliary_loss_clip": 0.01108919, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.03708768, + "balance_loss_mlp": 1.02249372, + "epoch": 0.9062377874643018, + "flos": 52199113128480.0, + "grad_norm": 2.1460983301487726, + "language_loss": 0.73062897, + "learning_rate": 9.145840289787021e-08, + "loss": 0.75205576, + "num_input_tokens_seen": 325010700, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11260986, + "step": 15073, + "time_per_iteration": 2.821812629699707 + }, + { + "auxiliary_loss_clip": 0.01107259, + "auxiliary_loss_mlp": 0.01022788, + "balance_loss_clip": 1.03784847, + "balance_loss_mlp": 1.01227939, + "epoch": 0.9062979107169697, + "flos": 19964232673920.0, + "grad_norm": 5.600297574907872, + "language_loss": 0.80862689, + "learning_rate": 9.134201202899161e-08, + "loss": 0.82992733, + "num_input_tokens_seen": 325028760, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.10516357, + "step": 15074, + "time_per_iteration": 2.5688509941101074 + }, + { + "auxiliary_loss_clip": 0.01027562, + "auxiliary_loss_mlp": 0.01001501, + "balance_loss_clip": 1.00525343, + "balance_loss_mlp": 1.0005641, + "epoch": 0.9063580339696378, + "flos": 84576358604160.0, + "grad_norm": 0.7555293110884276, + "language_loss": 0.52311432, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54340494, + "num_input_tokens_seen": 325093545, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.0093689, + "step": 15075, + "time_per_iteration": 3.3209288120269775 + }, + { + "auxiliary_loss_clip": 0.01027584, + "auxiliary_loss_mlp": 0.01001538, + "balance_loss_clip": 1.00523281, + "balance_loss_mlp": 1.00064254, + "epoch": 0.9064181572223057, + "flos": 71741016662400.0, + "grad_norm": 0.7321014052825557, + "language_loss": 0.62096024, + "learning_rate": 9.11094474251517e-08, + "loss": 0.64125144, + "num_input_tokens_seen": 325152295, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00895691, + "step": 15076, + "time_per_iteration": 3.161271095275879 + }, + { + "auxiliary_loss_clip": 0.01108202, + "auxiliary_loss_mlp": 0.01031028, + "balance_loss_clip": 1.03690934, + "balance_loss_mlp": 1.02043653, + "epoch": 0.9064782804749737, + "flos": 26554927546080.0, + "grad_norm": 1.9072447499807181, + "language_loss": 0.82616216, + "learning_rate": 9.09932736990091e-08, + "loss": 0.84755445, + "num_input_tokens_seen": 325169705, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.105896, + "step": 15077, + "time_per_iteration": 2.601745843887329 + }, + { + "auxiliary_loss_clip": 0.01104627, + "auxiliary_loss_mlp": 0.01024383, + "balance_loss_clip": 1.03507173, + "balance_loss_mlp": 1.01405334, + "epoch": 0.9065384037276417, + "flos": 26109179418240.0, + "grad_norm": 1.992487322158482, + "language_loss": 0.84019756, + "learning_rate": 9.08771723625934e-08, + "loss": 0.86148763, + "num_input_tokens_seen": 325189175, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10333252, + "step": 15078, + "time_per_iteration": 2.6507205963134766 + }, + { + "auxiliary_loss_clip": 0.01106859, + "auxiliary_loss_mlp": 0.01028524, + "balance_loss_clip": 1.03874791, + "balance_loss_mlp": 1.01746094, + "epoch": 0.9065985269803096, + "flos": 46715718310560.0, + "grad_norm": 1.5603856640556713, + "language_loss": 0.65282637, + "learning_rate": 9.076114342030617e-08, + "loss": 0.67418021, + "num_input_tokens_seen": 325211020, + "router_z_loss_clip": 0.68115234, + "router_z_loss_mlp": 0.11053467, + "step": 15079, + "time_per_iteration": 2.77642560005188 + }, + { + "auxiliary_loss_clip": 0.01106317, + "auxiliary_loss_mlp": 0.01026771, + "balance_loss_clip": 1.03511262, + "balance_loss_mlp": 1.01579225, + "epoch": 0.9066586502329776, + "flos": 54689759880480.0, + "grad_norm": 1.6254596572286288, + "language_loss": 0.70762539, + "learning_rate": 9.064518687654765e-08, + "loss": 0.72895634, + "num_input_tokens_seen": 325236970, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10986328, + "step": 15080, + "time_per_iteration": 2.8446481227874756 + }, + { + "auxiliary_loss_clip": 0.01113602, + "auxiliary_loss_mlp": 0.010296, + "balance_loss_clip": 1.03954589, + "balance_loss_mlp": 1.01806021, + "epoch": 0.9067187734856456, + "flos": 22724929512000.0, + "grad_norm": 2.0888668274980473, + "language_loss": 0.71086764, + "learning_rate": 9.052930273571547e-08, + "loss": 0.73229969, + "num_input_tokens_seen": 325252670, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11535645, + "step": 15081, + "time_per_iteration": 2.5934062004089355 + }, + { + "auxiliary_loss_clip": 0.01108308, + "auxiliary_loss_mlp": 0.01026969, + "balance_loss_clip": 1.03762758, + "balance_loss_mlp": 1.01604962, + "epoch": 0.9067788967383136, + "flos": 27756782231040.0, + "grad_norm": 2.187743065002408, + "language_loss": 0.74349105, + "learning_rate": 9.04134910022032e-08, + "loss": 0.76484388, + "num_input_tokens_seen": 325273860, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10913086, + "step": 15082, + "time_per_iteration": 2.675814628601074 + }, + { + "auxiliary_loss_clip": 0.01108864, + "auxiliary_loss_mlp": 0.01032634, + "balance_loss_clip": 1.03842759, + "balance_loss_mlp": 1.02186966, + "epoch": 0.9068390199909815, + "flos": 33764678068320.0, + "grad_norm": 1.8454729832324168, + "language_loss": 0.78112781, + "learning_rate": 9.029775168040266e-08, + "loss": 0.80254281, + "num_input_tokens_seen": 325294140, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10766602, + "step": 15083, + "time_per_iteration": 2.68990159034729 + }, + { + "auxiliary_loss_clip": 0.01106964, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.03841925, + "balance_loss_mlp": 1.02265847, + "epoch": 0.9068991432436495, + "flos": 29582797743360.0, + "grad_norm": 1.618048569969236, + "language_loss": 0.69180572, + "learning_rate": 9.01820847747028e-08, + "loss": 0.71320307, + "num_input_tokens_seen": 325313130, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.10107422, + "step": 15084, + "time_per_iteration": 2.781020164489746 + }, + { + "auxiliary_loss_clip": 0.01109058, + "auxiliary_loss_mlp": 0.01029725, + "balance_loss_clip": 1.03783393, + "balance_loss_mlp": 1.01848364, + "epoch": 0.9069592664963174, + "flos": 34207265848320.0, + "grad_norm": 2.2221785310542, + "language_loss": 0.66811574, + "learning_rate": 9.006649028948965e-08, + "loss": 0.68950355, + "num_input_tokens_seen": 325334880, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11242676, + "step": 15085, + "time_per_iteration": 2.6739745140075684 + }, + { + "auxiliary_loss_clip": 0.01027394, + "auxiliary_loss_mlp": 0.01001429, + "balance_loss_clip": 1.00507867, + "balance_loss_mlp": 1.0005312, + "epoch": 0.9070193897489854, + "flos": 83925091716480.0, + "grad_norm": 0.7679731597837252, + "language_loss": 0.61308229, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63337052, + "num_input_tokens_seen": 325394175, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.00896454, + "step": 15086, + "time_per_iteration": 3.257866382598877 + }, + { + "auxiliary_loss_clip": 0.01110303, + "auxiliary_loss_mlp": 0.01037595, + "balance_loss_clip": 1.03899956, + "balance_loss_mlp": 1.02581763, + "epoch": 0.9070795130016533, + "flos": 28603634794560.0, + "grad_norm": 2.6919466033583395, + "language_loss": 0.7208547, + "learning_rate": 8.983551859805416e-08, + "loss": 0.74233371, + "num_input_tokens_seen": 325415020, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11779785, + "step": 15087, + "time_per_iteration": 2.712122678756714 + }, + { + "auxiliary_loss_clip": 0.01108582, + "auxiliary_loss_mlp": 0.01025139, + "balance_loss_clip": 1.0375123, + "balance_loss_mlp": 1.01466632, + "epoch": 0.9071396362543214, + "flos": 23081714393760.0, + "grad_norm": 2.1407385049415715, + "language_loss": 0.77123755, + "learning_rate": 8.972014140059058e-08, + "loss": 0.79257476, + "num_input_tokens_seen": 325433595, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10473633, + "step": 15088, + "time_per_iteration": 4.113672971725464 + }, + { + "auxiliary_loss_clip": 0.01104943, + "auxiliary_loss_mlp": 0.01025811, + "balance_loss_clip": 1.03629255, + "balance_loss_mlp": 1.01552951, + "epoch": 0.9071997595069893, + "flos": 30784368807360.0, + "grad_norm": 1.9759740751608244, + "language_loss": 0.7325334, + "learning_rate": 8.960483664113038e-08, + "loss": 0.75384092, + "num_input_tokens_seen": 325451605, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.10284424, + "step": 15089, + "time_per_iteration": 2.677762746810913 + }, + { + "auxiliary_loss_clip": 0.0110307, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.03563595, + "balance_loss_mlp": 1.0182426, + "epoch": 0.9072598827596573, + "flos": 29708625363840.0, + "grad_norm": 1.9030116302282427, + "language_loss": 0.75769138, + "learning_rate": 8.948960432404628e-08, + "loss": 0.77900505, + "num_input_tokens_seen": 325470645, + "router_z_loss_clip": 0.67382812, + "router_z_loss_mlp": 0.1005249, + "step": 15090, + "time_per_iteration": 2.631556272506714 + }, + { + "auxiliary_loss_clip": 0.01110758, + "auxiliary_loss_mlp": 0.01028649, + "balance_loss_clip": 1.03779006, + "balance_loss_mlp": 1.01676345, + "epoch": 0.9073200060123253, + "flos": 27668143123200.0, + "grad_norm": 2.8579428102865654, + "language_loss": 0.78250092, + "learning_rate": 8.93744444537079e-08, + "loss": 0.803895, + "num_input_tokens_seen": 325488070, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11883545, + "step": 15091, + "time_per_iteration": 2.623140573501587 + }, + { + "auxiliary_loss_clip": 0.01102081, + "auxiliary_loss_mlp": 0.01023645, + "balance_loss_clip": 1.03487635, + "balance_loss_mlp": 1.01382256, + "epoch": 0.9073801292649932, + "flos": 28913547533760.0, + "grad_norm": 1.7597509629165564, + "language_loss": 0.85885453, + "learning_rate": 8.925935703448217e-08, + "loss": 0.88011181, + "num_input_tokens_seen": 325509285, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.0982666, + "step": 15092, + "time_per_iteration": 2.6806163787841797 + }, + { + "auxiliary_loss_clip": 0.01109203, + "auxiliary_loss_mlp": 0.01031239, + "balance_loss_clip": 1.03855753, + "balance_loss_mlp": 1.01989663, + "epoch": 0.9074402525176612, + "flos": 30963389266080.0, + "grad_norm": 1.5330396588746085, + "language_loss": 0.78764498, + "learning_rate": 8.914434207073296e-08, + "loss": 0.80904949, + "num_input_tokens_seen": 325529360, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11334229, + "step": 15093, + "time_per_iteration": 2.649296283721924 + }, + { + "auxiliary_loss_clip": 0.01027383, + "auxiliary_loss_mlp": 0.01002093, + "balance_loss_clip": 1.00503683, + "balance_loss_mlp": 1.0011996, + "epoch": 0.9075003757703292, + "flos": 82546769161440.0, + "grad_norm": 0.7491297135199141, + "language_loss": 0.56958359, + "learning_rate": 8.902939956682188e-08, + "loss": 0.58987838, + "num_input_tokens_seen": 325583565, + "router_z_loss_clip": 0.22338867, + "router_z_loss_mlp": 0.00893402, + "step": 15094, + "time_per_iteration": 3.1919243335723877 + }, + { + "auxiliary_loss_clip": 0.01111122, + "auxiliary_loss_mlp": 0.0103235, + "balance_loss_clip": 1.03794861, + "balance_loss_mlp": 1.02100098, + "epoch": 0.9075604990229972, + "flos": 27398295623520.0, + "grad_norm": 2.513010170964566, + "language_loss": 0.71380258, + "learning_rate": 8.891452952710742e-08, + "loss": 0.7352373, + "num_input_tokens_seen": 325603690, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11352539, + "step": 15095, + "time_per_iteration": 2.653106689453125 + }, + { + "auxiliary_loss_clip": 0.01109607, + "auxiliary_loss_mlp": 0.01027399, + "balance_loss_clip": 1.03760517, + "balance_loss_mlp": 1.01676595, + "epoch": 0.9076206222756651, + "flos": 23838590779200.0, + "grad_norm": 1.779504963689805, + "language_loss": 0.74135506, + "learning_rate": 8.879973195594526e-08, + "loss": 0.76272517, + "num_input_tokens_seen": 325622255, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10626221, + "step": 15096, + "time_per_iteration": 4.085666656494141 + }, + { + "auxiliary_loss_clip": 0.0111078, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.03719127, + "balance_loss_mlp": 1.02156579, + "epoch": 0.9076807455283331, + "flos": 36749484747360.0, + "grad_norm": 1.8844770067482355, + "language_loss": 0.57268846, + "learning_rate": 8.868500685768898e-08, + "loss": 0.59413552, + "num_input_tokens_seen": 325640165, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.12347412, + "step": 15097, + "time_per_iteration": 2.704699993133545 + }, + { + "auxiliary_loss_clip": 0.01104087, + "auxiliary_loss_mlp": 0.01020664, + "balance_loss_clip": 1.03350925, + "balance_loss_mlp": 1.01036453, + "epoch": 0.907740868781001, + "flos": 22814622069120.0, + "grad_norm": 1.7461807322471583, + "language_loss": 0.79680645, + "learning_rate": 8.857035423668935e-08, + "loss": 0.81805396, + "num_input_tokens_seen": 325659455, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10308838, + "step": 15098, + "time_per_iteration": 4.093413352966309 + }, + { + "auxiliary_loss_clip": 0.01112469, + "auxiliary_loss_mlp": 0.01026449, + "balance_loss_clip": 1.03775179, + "balance_loss_mlp": 1.01511192, + "epoch": 0.907800992033669, + "flos": 27623985638400.0, + "grad_norm": 1.7350580060973393, + "language_loss": 0.65993798, + "learning_rate": 8.845577409729266e-08, + "loss": 0.68132716, + "num_input_tokens_seen": 325678095, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11328125, + "step": 15099, + "time_per_iteration": 2.8007233142852783 + }, + { + "auxiliary_loss_clip": 0.01111356, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.03835213, + "balance_loss_mlp": 1.02210307, + "epoch": 0.907861115286337, + "flos": 25975329376320.0, + "grad_norm": 2.4022262568365096, + "language_loss": 0.7047708, + "learning_rate": 8.834126644384477e-08, + "loss": 0.72622132, + "num_input_tokens_seen": 325695825, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11602783, + "step": 15100, + "time_per_iteration": 2.7385003566741943 + }, + { + "auxiliary_loss_clip": 0.01027607, + "auxiliary_loss_mlp": 0.01000598, + "balance_loss_clip": 1.00531816, + "balance_loss_mlp": 0.99962932, + "epoch": 0.907921238539005, + "flos": 85096726860960.0, + "grad_norm": 0.6207549750029705, + "language_loss": 0.5335359, + "learning_rate": 8.822683128068775e-08, + "loss": 0.55381799, + "num_input_tokens_seen": 325764515, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.00967407, + "step": 15101, + "time_per_iteration": 3.31431245803833 + }, + { + "auxiliary_loss_clip": 0.01109236, + "auxiliary_loss_mlp": 0.01026402, + "balance_loss_clip": 1.03737235, + "balance_loss_mlp": 1.01536298, + "epoch": 0.9079813617916729, + "flos": 28647346589280.0, + "grad_norm": 1.7380220090236698, + "language_loss": 0.6844306, + "learning_rate": 8.811246861216081e-08, + "loss": 0.70578694, + "num_input_tokens_seen": 325783235, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.1104126, + "step": 15102, + "time_per_iteration": 2.6421701908111572 + }, + { + "auxiliary_loss_clip": 0.01109659, + "auxiliary_loss_mlp": 0.01025976, + "balance_loss_clip": 1.03878987, + "balance_loss_mlp": 1.01507437, + "epoch": 0.9080414850443409, + "flos": 27979311898080.0, + "grad_norm": 2.024377085078777, + "language_loss": 0.79034507, + "learning_rate": 8.799817844260049e-08, + "loss": 0.81170142, + "num_input_tokens_seen": 325800195, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10894775, + "step": 15103, + "time_per_iteration": 2.6261420249938965 + }, + { + "auxiliary_loss_clip": 0.01109935, + "auxiliary_loss_mlp": 0.01028601, + "balance_loss_clip": 1.03685641, + "balance_loss_mlp": 1.01732373, + "epoch": 0.9081016082970089, + "flos": 31941741869280.0, + "grad_norm": 2.046096186850545, + "language_loss": 0.7151581, + "learning_rate": 8.78839607763413e-08, + "loss": 0.73654342, + "num_input_tokens_seen": 325820215, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11279297, + "step": 15104, + "time_per_iteration": 2.7159461975097656 + }, + { + "auxiliary_loss_clip": 0.01107765, + "auxiliary_loss_mlp": 0.01023238, + "balance_loss_clip": 1.03732562, + "balance_loss_mlp": 1.01322401, + "epoch": 0.9081617315496768, + "flos": 29849079722400.0, + "grad_norm": 1.7784702557893115, + "language_loss": 0.77168441, + "learning_rate": 8.77698156177138e-08, + "loss": 0.79299444, + "num_input_tokens_seen": 325838415, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10015869, + "step": 15105, + "time_per_iteration": 2.6426267623901367 + }, + { + "auxiliary_loss_clip": 0.01107945, + "auxiliary_loss_mlp": 0.01035261, + "balance_loss_clip": 1.03573203, + "balance_loss_mlp": 1.02390587, + "epoch": 0.9082218548023449, + "flos": 30197153388960.0, + "grad_norm": 1.9224722759701882, + "language_loss": 0.73967707, + "learning_rate": 8.765574297104628e-08, + "loss": 0.76110917, + "num_input_tokens_seen": 325855580, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11358643, + "step": 15106, + "time_per_iteration": 2.671008825302124 + }, + { + "auxiliary_loss_clip": 0.01109757, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.03753483, + "balance_loss_mlp": 1.01915419, + "epoch": 0.9082819780550128, + "flos": 29799614473920.0, + "grad_norm": 2.0057429492124452, + "language_loss": 0.80184877, + "learning_rate": 8.754174284066462e-08, + "loss": 0.82324922, + "num_input_tokens_seen": 325874890, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11138916, + "step": 15107, + "time_per_iteration": 2.6794369220733643 + }, + { + "auxiliary_loss_clip": 0.0102758, + "auxiliary_loss_mlp": 0.01001132, + "balance_loss_clip": 1.00521767, + "balance_loss_mlp": 1.00016963, + "epoch": 0.9083421013076808, + "flos": 72734360659200.0, + "grad_norm": 0.8280500905831516, + "language_loss": 0.59743536, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61772251, + "num_input_tokens_seen": 325935835, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00961304, + "step": 15108, + "time_per_iteration": 4.435035943984985 + }, + { + "auxiliary_loss_clip": 0.01109799, + "auxiliary_loss_mlp": 0.01022973, + "balance_loss_clip": 1.03716564, + "balance_loss_mlp": 1.01207685, + "epoch": 0.9084022245603487, + "flos": 41024420563680.0, + "grad_norm": 1.9729315746259262, + "language_loss": 0.73264039, + "learning_rate": 8.73139601460482e-08, + "loss": 0.75396812, + "num_input_tokens_seen": 325958035, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10900879, + "step": 15109, + "time_per_iteration": 2.7290782928466797 + }, + { + "auxiliary_loss_clip": 0.01106305, + "auxiliary_loss_mlp": 0.01025465, + "balance_loss_clip": 1.03615785, + "balance_loss_mlp": 1.01484942, + "epoch": 0.9084623478130167, + "flos": 30470971582080.0, + "grad_norm": 2.0207775115486273, + "language_loss": 0.7173593, + "learning_rate": 8.720017759045073e-08, + "loss": 0.73867697, + "num_input_tokens_seen": 325979870, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10620117, + "step": 15110, + "time_per_iteration": 2.649055004119873 + }, + { + "auxiliary_loss_clip": 0.01106234, + "auxiliary_loss_mlp": 0.01025865, + "balance_loss_clip": 1.03601456, + "balance_loss_mlp": 1.01530361, + "epoch": 0.9085224710656846, + "flos": 38391091002720.0, + "grad_norm": 1.8616245089931163, + "language_loss": 0.6886338, + "learning_rate": 8.708646756841421e-08, + "loss": 0.70995474, + "num_input_tokens_seen": 325998245, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10552979, + "step": 15111, + "time_per_iteration": 2.7195873260498047 + }, + { + "auxiliary_loss_clip": 0.01027675, + "auxiliary_loss_mlp": 0.01001279, + "balance_loss_clip": 1.00534773, + "balance_loss_mlp": 1.00032461, + "epoch": 0.9085825943183526, + "flos": 79212551745600.0, + "grad_norm": 0.6893416125196872, + "language_loss": 0.51682073, + "learning_rate": 8.697283008425026e-08, + "loss": 0.53711027, + "num_input_tokens_seen": 326061770, + "router_z_loss_clip": 0.22338867, + "router_z_loss_mlp": 0.00952911, + "step": 15112, + "time_per_iteration": 3.340282678604126 + }, + { + "auxiliary_loss_clip": 0.01109205, + "auxiliary_loss_mlp": 0.01029044, + "balance_loss_clip": 1.03604579, + "balance_loss_mlp": 1.01793957, + "epoch": 0.9086427175710206, + "flos": 23126601189600.0, + "grad_norm": 1.953203925815893, + "language_loss": 0.69423127, + "learning_rate": 8.685926514226837e-08, + "loss": 0.71561378, + "num_input_tokens_seen": 326080945, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11096191, + "step": 15113, + "time_per_iteration": 2.591817617416382 + }, + { + "auxiliary_loss_clip": 0.01110188, + "auxiliary_loss_mlp": 0.01028029, + "balance_loss_clip": 1.03827286, + "balance_loss_mlp": 1.01700211, + "epoch": 0.9087028408236886, + "flos": 41506627893120.0, + "grad_norm": 2.1844109679871, + "language_loss": 0.79001611, + "learning_rate": 8.674577274677508e-08, + "loss": 0.81139827, + "num_input_tokens_seen": 326100630, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11029053, + "step": 15114, + "time_per_iteration": 2.7759530544281006 + }, + { + "auxiliary_loss_clip": 0.0111242, + "auxiliary_loss_mlp": 0.01031823, + "balance_loss_clip": 1.03762341, + "balance_loss_mlp": 1.01938987, + "epoch": 0.9087629640763565, + "flos": 26776930488480.0, + "grad_norm": 2.053002576381678, + "language_loss": 0.70425051, + "learning_rate": 8.663235290207405e-08, + "loss": 0.72569293, + "num_input_tokens_seen": 326120145, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12445068, + "step": 15115, + "time_per_iteration": 2.6196255683898926 + }, + { + "auxiliary_loss_clip": 0.0111519, + "auxiliary_loss_mlp": 0.0102523, + "balance_loss_clip": 1.04086566, + "balance_loss_mlp": 1.01373196, + "epoch": 0.9088230873290245, + "flos": 26555170649760.0, + "grad_norm": 1.9232200359545768, + "language_loss": 0.65786529, + "learning_rate": 8.651900561246561e-08, + "loss": 0.67926943, + "num_input_tokens_seen": 326140715, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11505127, + "step": 15116, + "time_per_iteration": 2.626509666442871 + }, + { + "auxiliary_loss_clip": 0.01108769, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.0394969, + "balance_loss_mlp": 1.01906347, + "epoch": 0.9088832105816925, + "flos": 26284958494560.0, + "grad_norm": 1.6139999085321786, + "language_loss": 0.69657022, + "learning_rate": 8.640573088224812e-08, + "loss": 0.71796209, + "num_input_tokens_seen": 326159130, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.11352539, + "step": 15117, + "time_per_iteration": 2.609567165374756 + }, + { + "auxiliary_loss_clip": 0.0110782, + "auxiliary_loss_mlp": 0.01025442, + "balance_loss_clip": 1.0371052, + "balance_loss_mlp": 1.01480246, + "epoch": 0.9089433338343604, + "flos": 31721440652640.0, + "grad_norm": 1.5961944854810688, + "language_loss": 0.748882, + "learning_rate": 8.629252871571745e-08, + "loss": 0.77021456, + "num_input_tokens_seen": 326181375, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10644531, + "step": 15118, + "time_per_iteration": 2.7174232006073 + }, + { + "auxiliary_loss_clip": 0.01113148, + "auxiliary_loss_mlp": 0.01032687, + "balance_loss_clip": 1.03653657, + "balance_loss_mlp": 1.02028918, + "epoch": 0.9090034570870285, + "flos": 25842006059040.0, + "grad_norm": 2.284091283922301, + "language_loss": 0.73232293, + "learning_rate": 8.617939911716554e-08, + "loss": 0.7537812, + "num_input_tokens_seen": 326199740, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.12390137, + "step": 15119, + "time_per_iteration": 2.634289026260376 + }, + { + "auxiliary_loss_clip": 0.01115289, + "auxiliary_loss_mlp": 0.01030743, + "balance_loss_clip": 1.03963685, + "balance_loss_mlp": 1.01815498, + "epoch": 0.9090635803396964, + "flos": 19696086900000.0, + "grad_norm": 2.4697211986764613, + "language_loss": 0.71360105, + "learning_rate": 8.60663420908827e-08, + "loss": 0.73506141, + "num_input_tokens_seen": 326214350, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.12591553, + "step": 15120, + "time_per_iteration": 2.6165785789489746 + }, + { + "auxiliary_loss_clip": 0.01110432, + "auxiliary_loss_mlp": 0.01022887, + "balance_loss_clip": 1.03776753, + "balance_loss_mlp": 1.01174068, + "epoch": 0.9091237035923644, + "flos": 25126126810560.0, + "grad_norm": 3.3061357162676415, + "language_loss": 0.65759325, + "learning_rate": 8.595335764115596e-08, + "loss": 0.67892647, + "num_input_tokens_seen": 326234580, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.1114502, + "step": 15121, + "time_per_iteration": 2.699831962585449 + }, + { + "auxiliary_loss_clip": 0.01108696, + "auxiliary_loss_mlp": 0.01037697, + "balance_loss_clip": 1.0366565, + "balance_loss_mlp": 1.02640796, + "epoch": 0.9091838268450323, + "flos": 63730793161440.0, + "grad_norm": 1.893101932468727, + "language_loss": 0.70118088, + "learning_rate": 8.58404457722699e-08, + "loss": 0.72264481, + "num_input_tokens_seen": 326259080, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11273193, + "step": 15122, + "time_per_iteration": 2.8719570636749268 + }, + { + "auxiliary_loss_clip": 0.01106356, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.03553188, + "balance_loss_mlp": 1.01976967, + "epoch": 0.9092439500977003, + "flos": 25086750364800.0, + "grad_norm": 1.3320786437590426, + "language_loss": 0.74515414, + "learning_rate": 8.572760648850575e-08, + "loss": 0.76652378, + "num_input_tokens_seen": 326280175, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10839844, + "step": 15123, + "time_per_iteration": 2.6781694889068604 + }, + { + "auxiliary_loss_clip": 0.01107235, + "auxiliary_loss_mlp": 0.01027245, + "balance_loss_clip": 1.03815114, + "balance_loss_mlp": 1.01651573, + "epoch": 0.9093040733503682, + "flos": 34920511473600.0, + "grad_norm": 3.2921243342920676, + "language_loss": 0.7547642, + "learning_rate": 8.561483979414253e-08, + "loss": 0.77610898, + "num_input_tokens_seen": 326297990, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.10723877, + "step": 15124, + "time_per_iteration": 2.7096152305603027 + }, + { + "auxiliary_loss_clip": 0.01109078, + "auxiliary_loss_mlp": 0.01028105, + "balance_loss_clip": 1.03823757, + "balance_loss_mlp": 1.01698279, + "epoch": 0.9093641966030362, + "flos": 28602014103360.0, + "grad_norm": 2.5295129763853628, + "language_loss": 0.72257572, + "learning_rate": 8.55021456934566e-08, + "loss": 0.74394751, + "num_input_tokens_seen": 326316735, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11120605, + "step": 15125, + "time_per_iteration": 2.702589988708496 + }, + { + "auxiliary_loss_clip": 0.01109696, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.03954983, + "balance_loss_mlp": 1.02150464, + "epoch": 0.9094243198557042, + "flos": 20499349220640.0, + "grad_norm": 1.9480966252930931, + "language_loss": 0.79273003, + "learning_rate": 8.538952419072143e-08, + "loss": 0.8141498, + "num_input_tokens_seen": 326334370, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10778809, + "step": 15126, + "time_per_iteration": 2.670280933380127 + }, + { + "auxiliary_loss_clip": 0.01106248, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.03624797, + "balance_loss_mlp": 1.02217853, + "epoch": 0.9094844431083722, + "flos": 29619175910400.0, + "grad_norm": 1.7714813327953987, + "language_loss": 0.75474167, + "learning_rate": 8.527697529020694e-08, + "loss": 0.77613628, + "num_input_tokens_seen": 326353435, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.11035156, + "step": 15127, + "time_per_iteration": 4.043834686279297 + }, + { + "auxiliary_loss_clip": 0.0110884, + "auxiliary_loss_mlp": 0.01029492, + "balance_loss_clip": 1.03559923, + "balance_loss_mlp": 1.01816726, + "epoch": 0.9095445663610401, + "flos": 26777011523040.0, + "grad_norm": 2.4054470502453085, + "language_loss": 0.62871814, + "learning_rate": 8.516449899618173e-08, + "loss": 0.65010148, + "num_input_tokens_seen": 326371810, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11328125, + "step": 15128, + "time_per_iteration": 2.646301031112671 + }, + { + "auxiliary_loss_clip": 0.01106965, + "auxiliary_loss_mlp": 0.01023521, + "balance_loss_clip": 1.0364182, + "balance_loss_mlp": 1.01265538, + "epoch": 0.9096046896137081, + "flos": 24151137141600.0, + "grad_norm": 1.8902403668640695, + "language_loss": 0.76742762, + "learning_rate": 8.505209531291013e-08, + "loss": 0.78873253, + "num_input_tokens_seen": 326391380, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10870361, + "step": 15129, + "time_per_iteration": 2.695497989654541 + }, + { + "auxiliary_loss_clip": 0.01110844, + "auxiliary_loss_mlp": 0.01026079, + "balance_loss_clip": 1.03815722, + "balance_loss_mlp": 1.01562452, + "epoch": 0.909664812866376, + "flos": 27623175292800.0, + "grad_norm": 2.071964327726855, + "language_loss": 0.8285681, + "learning_rate": 8.49397642446552e-08, + "loss": 0.84993738, + "num_input_tokens_seen": 326408800, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10449219, + "step": 15130, + "time_per_iteration": 2.629960060119629 + }, + { + "auxiliary_loss_clip": 0.01110832, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.03820503, + "balance_loss_mlp": 1.02211595, + "epoch": 0.909724936119044, + "flos": 48629765171520.0, + "grad_norm": 1.7066904100817406, + "language_loss": 0.75064558, + "learning_rate": 8.482750579567644e-08, + "loss": 0.77208936, + "num_input_tokens_seen": 326431565, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11437988, + "step": 15131, + "time_per_iteration": 2.7833073139190674 + }, + { + "auxiliary_loss_clip": 0.01110899, + "auxiliary_loss_mlp": 0.01026417, + "balance_loss_clip": 1.03908038, + "balance_loss_mlp": 1.01506841, + "epoch": 0.9097850593717121, + "flos": 42796270823040.0, + "grad_norm": 2.253091295085119, + "language_loss": 0.59448224, + "learning_rate": 8.471531997023085e-08, + "loss": 0.61585546, + "num_input_tokens_seen": 326451715, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11358643, + "step": 15132, + "time_per_iteration": 2.7434422969818115 + }, + { + "auxiliary_loss_clip": 0.01110582, + "auxiliary_loss_mlp": 0.01027252, + "balance_loss_clip": 1.03895843, + "balance_loss_mlp": 1.0167141, + "epoch": 0.90984518262438, + "flos": 28513253443680.0, + "grad_norm": 1.5233653183109952, + "language_loss": 0.82616842, + "learning_rate": 8.460320677257193e-08, + "loss": 0.84754676, + "num_input_tokens_seen": 326470855, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10540771, + "step": 15133, + "time_per_iteration": 2.60627818107605 + }, + { + "auxiliary_loss_clip": 0.01108684, + "auxiliary_loss_mlp": 0.01029888, + "balance_loss_clip": 1.03589511, + "balance_loss_mlp": 1.0184381, + "epoch": 0.909905305877048, + "flos": 33584847264000.0, + "grad_norm": 1.8913273635785992, + "language_loss": 0.73853004, + "learning_rate": 8.449116620695118e-08, + "loss": 0.75991571, + "num_input_tokens_seen": 326490480, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11444092, + "step": 15134, + "time_per_iteration": 2.661848545074463 + }, + { + "auxiliary_loss_clip": 0.01113386, + "auxiliary_loss_mlp": 0.01028966, + "balance_loss_clip": 1.03752053, + "balance_loss_mlp": 1.01774287, + "epoch": 0.9099654291297159, + "flos": 29710043468640.0, + "grad_norm": 1.6046842385205047, + "language_loss": 0.727633, + "learning_rate": 8.437919827761786e-08, + "loss": 0.74905658, + "num_input_tokens_seen": 326509445, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11218262, + "step": 15135, + "time_per_iteration": 4.031731367111206 + }, + { + "auxiliary_loss_clip": 0.01108887, + "auxiliary_loss_mlp": 0.01025684, + "balance_loss_clip": 1.03863144, + "balance_loss_mlp": 1.0151813, + "epoch": 0.9100255523823839, + "flos": 25886892854880.0, + "grad_norm": 2.1720832176193476, + "language_loss": 0.69571805, + "learning_rate": 8.426730298881702e-08, + "loss": 0.71706378, + "num_input_tokens_seen": 326528380, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.1050415, + "step": 15136, + "time_per_iteration": 2.628190517425537 + }, + { + "auxiliary_loss_clip": 0.01027422, + "auxiliary_loss_mlp": 0.01001071, + "balance_loss_clip": 1.00514936, + "balance_loss_mlp": 1.00010169, + "epoch": 0.9100856756350518, + "flos": 56192498301600.0, + "grad_norm": 0.827806327819981, + "language_loss": 0.59288263, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61316752, + "num_input_tokens_seen": 326576940, + "router_z_loss_clip": 0.22277832, + "router_z_loss_mlp": 0.0096817, + "step": 15137, + "time_per_iteration": 4.455686330795288 + }, + { + "auxiliary_loss_clip": 0.01110769, + "auxiliary_loss_mlp": 0.01029927, + "balance_loss_clip": 1.03867507, + "balance_loss_mlp": 1.01926398, + "epoch": 0.9101457988877198, + "flos": 24684551962560.0, + "grad_norm": 2.287237533213485, + "language_loss": 0.82344013, + "learning_rate": 8.40437303497834e-08, + "loss": 0.84484708, + "num_input_tokens_seen": 326596100, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10662842, + "step": 15138, + "time_per_iteration": 2.608036756515503 + }, + { + "auxiliary_loss_clip": 0.01106686, + "auxiliary_loss_mlp": 0.01022063, + "balance_loss_clip": 1.03876162, + "balance_loss_mlp": 1.01159036, + "epoch": 0.9102059221403878, + "flos": 32476291174080.0, + "grad_norm": 1.4632381993992631, + "language_loss": 0.81114346, + "learning_rate": 8.39320530080283e-08, + "loss": 0.8324309, + "num_input_tokens_seen": 326615700, + "router_z_loss_clip": 0.67871094, + "router_z_loss_mlp": 0.10473633, + "step": 15139, + "time_per_iteration": 2.7097890377044678 + }, + { + "auxiliary_loss_clip": 0.01109509, + "auxiliary_loss_mlp": 0.01028987, + "balance_loss_clip": 1.03813529, + "balance_loss_mlp": 1.01865149, + "epoch": 0.9102660453930558, + "flos": 26732935072800.0, + "grad_norm": 1.7891434249858231, + "language_loss": 0.77527672, + "learning_rate": 8.382044832376167e-08, + "loss": 0.79666162, + "num_input_tokens_seen": 326635905, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10333252, + "step": 15140, + "time_per_iteration": 2.6449670791625977 + }, + { + "auxiliary_loss_clip": 0.01107728, + "auxiliary_loss_mlp": 0.01027437, + "balance_loss_clip": 1.03645205, + "balance_loss_mlp": 1.01673818, + "epoch": 0.9103261686457237, + "flos": 44143766078400.0, + "grad_norm": 1.7786422581397323, + "language_loss": 0.66291499, + "learning_rate": 8.370891630121569e-08, + "loss": 0.68426669, + "num_input_tokens_seen": 326661855, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10693359, + "step": 15141, + "time_per_iteration": 2.809208869934082 + }, + { + "auxiliary_loss_clip": 0.01110879, + "auxiliary_loss_mlp": 0.01028647, + "balance_loss_clip": 1.03744769, + "balance_loss_mlp": 1.01804936, + "epoch": 0.9103862918983917, + "flos": 29136887546400.0, + "grad_norm": 1.7176589999210228, + "language_loss": 0.74875879, + "learning_rate": 8.359745694462005e-08, + "loss": 0.770154, + "num_input_tokens_seen": 326679320, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.105896, + "step": 15142, + "time_per_iteration": 2.6910789012908936 + }, + { + "auxiliary_loss_clip": 0.01106898, + "auxiliary_loss_mlp": 0.0102988, + "balance_loss_clip": 1.03604078, + "balance_loss_mlp": 1.01938343, + "epoch": 0.9104464151510596, + "flos": 18228801098880.0, + "grad_norm": 1.8677641444894302, + "language_loss": 0.63958418, + "learning_rate": 8.348607025820076e-08, + "loss": 0.66095191, + "num_input_tokens_seen": 326698110, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10498047, + "step": 15143, + "time_per_iteration": 2.607888698577881 + }, + { + "auxiliary_loss_clip": 0.01109869, + "auxiliary_loss_mlp": 0.01031504, + "balance_loss_clip": 1.03592122, + "balance_loss_mlp": 1.02026844, + "epoch": 0.9105065384037276, + "flos": 41067929772000.0, + "grad_norm": 1.6773557199730698, + "language_loss": 0.60797763, + "learning_rate": 8.337475624618152e-08, + "loss": 0.62939137, + "num_input_tokens_seen": 326718370, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11242676, + "step": 15144, + "time_per_iteration": 2.7559597492218018 + }, + { + "auxiliary_loss_clip": 0.01105214, + "auxiliary_loss_mlp": 0.010237, + "balance_loss_clip": 1.03678513, + "balance_loss_mlp": 1.01289964, + "epoch": 0.9105666616563957, + "flos": 29671517885760.0, + "grad_norm": 1.581933282212292, + "language_loss": 0.71023595, + "learning_rate": 8.326351491278382e-08, + "loss": 0.73152506, + "num_input_tokens_seen": 326738445, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.10797119, + "step": 15145, + "time_per_iteration": 2.6533761024475098 + }, + { + "auxiliary_loss_clip": 0.01104864, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.03561926, + "balance_loss_mlp": 1.01856732, + "epoch": 0.9106267849090636, + "flos": 36572206531680.0, + "grad_norm": 1.9372755336568555, + "language_loss": 0.70766842, + "learning_rate": 8.315234626222545e-08, + "loss": 0.72900683, + "num_input_tokens_seen": 326758855, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10406494, + "step": 15146, + "time_per_iteration": 2.771707057952881 + }, + { + "auxiliary_loss_clip": 0.0110781, + "auxiliary_loss_mlp": 0.010287, + "balance_loss_clip": 1.03646672, + "balance_loss_mlp": 1.01843596, + "epoch": 0.9106869081617316, + "flos": 30917530055520.0, + "grad_norm": 2.634114745637177, + "language_loss": 0.72841632, + "learning_rate": 8.304125029872233e-08, + "loss": 0.74978143, + "num_input_tokens_seen": 326777140, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.1026001, + "step": 15147, + "time_per_iteration": 4.002994775772095 + }, + { + "auxiliary_loss_clip": 0.01111851, + "auxiliary_loss_mlp": 0.01027851, + "balance_loss_clip": 1.03710914, + "balance_loss_mlp": 1.01680672, + "epoch": 0.9107470314143995, + "flos": 22191636242880.0, + "grad_norm": 11.06164970386969, + "language_loss": 0.80197775, + "learning_rate": 8.293022702648711e-08, + "loss": 0.82337475, + "num_input_tokens_seen": 326794070, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11053467, + "step": 15148, + "time_per_iteration": 2.5911765098571777 + }, + { + "auxiliary_loss_clip": 0.01109493, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.0364697, + "balance_loss_mlp": 1.02181375, + "epoch": 0.9108071546670675, + "flos": 28736917594560.0, + "grad_norm": 1.5802992506430438, + "language_loss": 0.67550099, + "learning_rate": 8.281927644972996e-08, + "loss": 0.69692349, + "num_input_tokens_seen": 326814695, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.10949707, + "step": 15149, + "time_per_iteration": 2.660548686981201 + }, + { + "auxiliary_loss_clip": 0.01109188, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_clip": 1.03745627, + "balance_loss_mlp": 1.01657867, + "epoch": 0.9108672779197354, + "flos": 31274477006400.0, + "grad_norm": 1.7928786220317878, + "language_loss": 0.63388264, + "learning_rate": 8.270839857265776e-08, + "loss": 0.65525198, + "num_input_tokens_seen": 326835295, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.1116333, + "step": 15150, + "time_per_iteration": 2.6413965225219727 + }, + { + "auxiliary_loss_clip": 0.01106753, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.03596425, + "balance_loss_mlp": 1.01976991, + "epoch": 0.9109274011724035, + "flos": 27258975748800.0, + "grad_norm": 2.2100171642966635, + "language_loss": 0.72391599, + "learning_rate": 8.259759339947514e-08, + "loss": 0.74529469, + "num_input_tokens_seen": 326853350, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11346436, + "step": 15151, + "time_per_iteration": 2.632359504699707 + }, + { + "auxiliary_loss_clip": 0.01108509, + "auxiliary_loss_mlp": 0.01026065, + "balance_loss_clip": 1.03724718, + "balance_loss_mlp": 1.01524115, + "epoch": 0.9109875244250714, + "flos": 32565416489280.0, + "grad_norm": 1.6972570901455746, + "language_loss": 0.6457743, + "learning_rate": 8.248686093438429e-08, + "loss": 0.66711998, + "num_input_tokens_seen": 326873425, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10827637, + "step": 15152, + "time_per_iteration": 2.6467761993408203 + }, + { + "auxiliary_loss_clip": 0.01110742, + "auxiliary_loss_mlp": 0.01024862, + "balance_loss_clip": 1.0388186, + "balance_loss_mlp": 1.01357293, + "epoch": 0.9110476476777394, + "flos": 27979514484480.0, + "grad_norm": 1.9789635482585355, + "language_loss": 0.73572302, + "learning_rate": 8.23762011815834e-08, + "loss": 0.75707901, + "num_input_tokens_seen": 326893455, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.112854, + "step": 15153, + "time_per_iteration": 2.6630232334136963 + }, + { + "auxiliary_loss_clip": 0.01111722, + "auxiliary_loss_mlp": 0.01028733, + "balance_loss_clip": 1.03912854, + "balance_loss_mlp": 1.01810551, + "epoch": 0.9111077709304073, + "flos": 16439730995520.0, + "grad_norm": 3.2054698042379997, + "language_loss": 0.72279578, + "learning_rate": 8.226561414526956e-08, + "loss": 0.74420035, + "num_input_tokens_seen": 326910210, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10626221, + "step": 15154, + "time_per_iteration": 2.597996711730957 + }, + { + "auxiliary_loss_clip": 0.01109667, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.03872812, + "balance_loss_mlp": 1.02103722, + "epoch": 0.9111678941830753, + "flos": 25441874038080.0, + "grad_norm": 1.9375358812640153, + "language_loss": 0.82456762, + "learning_rate": 8.215509982963564e-08, + "loss": 0.84598076, + "num_input_tokens_seen": 326929350, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.1060791, + "step": 15155, + "time_per_iteration": 2.627362012863159 + }, + { + "auxiliary_loss_clip": 0.01110335, + "auxiliary_loss_mlp": 0.01030423, + "balance_loss_clip": 1.03872788, + "balance_loss_mlp": 1.01893771, + "epoch": 0.9112280174357432, + "flos": 24016274167680.0, + "grad_norm": 1.6902637287209998, + "language_loss": 0.5953272, + "learning_rate": 8.204465823887252e-08, + "loss": 0.6167348, + "num_input_tokens_seen": 326949060, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.1149292, + "step": 15156, + "time_per_iteration": 2.6383748054504395 + }, + { + "auxiliary_loss_clip": 0.01110664, + "auxiliary_loss_mlp": 0.01030252, + "balance_loss_clip": 1.03589952, + "balance_loss_mlp": 1.01812851, + "epoch": 0.9112881406884112, + "flos": 31051055959200.0, + "grad_norm": 1.934397492393068, + "language_loss": 0.74110568, + "learning_rate": 8.193428937716796e-08, + "loss": 0.76251483, + "num_input_tokens_seen": 326968950, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.12127686, + "step": 15157, + "time_per_iteration": 2.691066265106201 + }, + { + "auxiliary_loss_clip": 0.01110211, + "auxiliary_loss_mlp": 0.01027686, + "balance_loss_clip": 1.03718925, + "balance_loss_mlp": 1.01767862, + "epoch": 0.9113482639410793, + "flos": 40349862590400.0, + "grad_norm": 1.9376601412005734, + "language_loss": 0.5971787, + "learning_rate": 8.182399324870747e-08, + "loss": 0.61855769, + "num_input_tokens_seen": 326989455, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.10015869, + "step": 15158, + "time_per_iteration": 2.694772481918335 + }, + { + "auxiliary_loss_clip": 0.01109014, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.03740835, + "balance_loss_mlp": 1.02102101, + "epoch": 0.9114083871937472, + "flos": 26643890792160.0, + "grad_norm": 1.8802205848580773, + "language_loss": 0.67923355, + "learning_rate": 8.171376985767375e-08, + "loss": 0.7006408, + "num_input_tokens_seen": 327009640, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10693359, + "step": 15159, + "time_per_iteration": 2.6481502056121826 + }, + { + "auxiliary_loss_clip": 0.01107734, + "auxiliary_loss_mlp": 0.01028418, + "balance_loss_clip": 1.03644848, + "balance_loss_mlp": 1.01784444, + "epoch": 0.9114685104464152, + "flos": 33054552273600.0, + "grad_norm": 2.269437536982412, + "language_loss": 0.78728735, + "learning_rate": 8.160361920824588e-08, + "loss": 0.80864888, + "num_input_tokens_seen": 327027690, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10577393, + "step": 15160, + "time_per_iteration": 2.6332850456237793 + }, + { + "auxiliary_loss_clip": 0.01112772, + "auxiliary_loss_mlp": 0.01027085, + "balance_loss_clip": 1.03982842, + "balance_loss_mlp": 1.01536679, + "epoch": 0.9115286336990831, + "flos": 21923287882560.0, + "grad_norm": 1.988443484662038, + "language_loss": 0.69204926, + "learning_rate": 8.149354130460073e-08, + "loss": 0.71344781, + "num_input_tokens_seen": 327045915, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.1171875, + "step": 15161, + "time_per_iteration": 2.647550582885742 + }, + { + "auxiliary_loss_clip": 0.01111446, + "auxiliary_loss_mlp": 0.01029433, + "balance_loss_clip": 1.03876233, + "balance_loss_mlp": 1.01757741, + "epoch": 0.9115887569517511, + "flos": 27979230863520.0, + "grad_norm": 1.749674987183248, + "language_loss": 0.76648003, + "learning_rate": 8.138353615091321e-08, + "loss": 0.78788882, + "num_input_tokens_seen": 327066355, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11865234, + "step": 15162, + "time_per_iteration": 2.632585048675537 + }, + { + "auxiliary_loss_clip": 0.0111022, + "auxiliary_loss_mlp": 0.01033663, + "balance_loss_clip": 1.03800428, + "balance_loss_mlp": 1.02246332, + "epoch": 0.911648880204419, + "flos": 29270940174720.0, + "grad_norm": 2.174415800807176, + "language_loss": 0.66615999, + "learning_rate": 8.127360375135395e-08, + "loss": 0.68759882, + "num_input_tokens_seen": 327086735, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11206055, + "step": 15163, + "time_per_iteration": 2.6540298461914062 + }, + { + "auxiliary_loss_clip": 0.01112483, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.03751564, + "balance_loss_mlp": 1.02101171, + "epoch": 0.911709003457087, + "flos": 20810680064640.0, + "grad_norm": 2.5346604788934983, + "language_loss": 0.70674241, + "learning_rate": 8.116374411009186e-08, + "loss": 0.72819221, + "num_input_tokens_seen": 327104035, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11486816, + "step": 15164, + "time_per_iteration": 2.5996603965759277 + }, + { + "auxiliary_loss_clip": 0.01109707, + "auxiliary_loss_mlp": 0.01032033, + "balance_loss_clip": 1.04034925, + "balance_loss_mlp": 1.02063632, + "epoch": 0.911769126709755, + "flos": 26421482676960.0, + "grad_norm": 1.5504176011487636, + "language_loss": 0.76211941, + "learning_rate": 8.105395723129315e-08, + "loss": 0.78353685, + "num_input_tokens_seen": 327124370, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.11401367, + "step": 15165, + "time_per_iteration": 2.6286003589630127 + }, + { + "auxiliary_loss_clip": 0.01111512, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.0389092, + "balance_loss_mlp": 1.01916659, + "epoch": 0.911829249962423, + "flos": 30249535881600.0, + "grad_norm": 2.348644393290927, + "language_loss": 0.72591758, + "learning_rate": 8.094424311912074e-08, + "loss": 0.74733675, + "num_input_tokens_seen": 327140915, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11236572, + "step": 15166, + "time_per_iteration": 4.045762538909912 + }, + { + "auxiliary_loss_clip": 0.01110522, + "auxiliary_loss_mlp": 0.01035517, + "balance_loss_clip": 1.03679192, + "balance_loss_mlp": 1.02358413, + "epoch": 0.9118893732150909, + "flos": 25574751665280.0, + "grad_norm": 2.4961785150439635, + "language_loss": 0.73108017, + "learning_rate": 8.083460177773482e-08, + "loss": 0.75254059, + "num_input_tokens_seen": 327158940, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.1192627, + "step": 15167, + "time_per_iteration": 2.5964696407318115 + }, + { + "auxiliary_loss_clip": 0.01027214, + "auxiliary_loss_mlp": 0.01001245, + "balance_loss_clip": 1.00486231, + "balance_loss_mlp": 1.00031638, + "epoch": 0.9119494964677589, + "flos": 82873050881760.0, + "grad_norm": 0.7684025233314995, + "language_loss": 0.65541422, + "learning_rate": 8.072503321129298e-08, + "loss": 0.67569882, + "num_input_tokens_seen": 327217450, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00927734, + "step": 15168, + "time_per_iteration": 3.2038118839263916 + }, + { + "auxiliary_loss_clip": 0.01105971, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.03595138, + "balance_loss_mlp": 1.0184927, + "epoch": 0.9120096197204268, + "flos": 22636452473280.0, + "grad_norm": 2.2611756304442188, + "language_loss": 0.77938628, + "learning_rate": 8.061553742395033e-08, + "loss": 0.80073643, + "num_input_tokens_seen": 327233905, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10559082, + "step": 15169, + "time_per_iteration": 2.6380112171173096 + }, + { + "auxiliary_loss_clip": 0.01108087, + "auxiliary_loss_mlp": 0.01026983, + "balance_loss_clip": 1.03637028, + "balance_loss_mlp": 1.0163914, + "epoch": 0.9120697429730948, + "flos": 23215118745600.0, + "grad_norm": 1.6473373053499698, + "language_loss": 0.82243967, + "learning_rate": 8.05061144198591e-08, + "loss": 0.84379041, + "num_input_tokens_seen": 327252430, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.10595703, + "step": 15170, + "time_per_iteration": 2.6298961639404297 + }, + { + "auxiliary_loss_clip": 0.01110636, + "auxiliary_loss_mlp": 0.01028343, + "balance_loss_clip": 1.03844249, + "balance_loss_mlp": 1.01705337, + "epoch": 0.9121298662257629, + "flos": 20943030967200.0, + "grad_norm": 2.6699068249637117, + "language_loss": 0.77156281, + "learning_rate": 8.039676420316799e-08, + "loss": 0.79295254, + "num_input_tokens_seen": 327269215, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11291504, + "step": 15171, + "time_per_iteration": 2.7159125804901123 + }, + { + "auxiliary_loss_clip": 0.01106901, + "auxiliary_loss_mlp": 0.01030757, + "balance_loss_clip": 1.03598773, + "balance_loss_mlp": 1.01960468, + "epoch": 0.9121899894784308, + "flos": 24016800892320.0, + "grad_norm": 1.34534624100544, + "language_loss": 0.66838443, + "learning_rate": 8.02874867780241e-08, + "loss": 0.68976104, + "num_input_tokens_seen": 327290320, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11157227, + "step": 15172, + "time_per_iteration": 2.6321897506713867 + }, + { + "auxiliary_loss_clip": 0.01113167, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.04001987, + "balance_loss_mlp": 1.01764786, + "epoch": 0.9122501127310988, + "flos": 27132297265440.0, + "grad_norm": 1.762694786586019, + "language_loss": 0.74902749, + "learning_rate": 8.017828214857103e-08, + "loss": 0.77044213, + "num_input_tokens_seen": 327310150, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.10638428, + "step": 15173, + "time_per_iteration": 2.684511423110962 + }, + { + "auxiliary_loss_clip": 0.01115238, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.0389539, + "balance_loss_mlp": 1.01955044, + "epoch": 0.9123102359837667, + "flos": 19470923609760.0, + "grad_norm": 2.2324178904858942, + "language_loss": 0.66265213, + "learning_rate": 8.00691503189499e-08, + "loss": 0.68412733, + "num_input_tokens_seen": 327326660, + "router_z_loss_clip": 0.76269531, + "router_z_loss_mlp": 0.1272583, + "step": 15174, + "time_per_iteration": 2.571847915649414 + }, + { + "auxiliary_loss_clip": 0.01113077, + "auxiliary_loss_mlp": 0.01028618, + "balance_loss_clip": 1.03935671, + "balance_loss_mlp": 1.01636958, + "epoch": 0.9123703592364347, + "flos": 30695446078560.0, + "grad_norm": 2.6106946713230887, + "language_loss": 0.74945676, + "learning_rate": 7.996009129329894e-08, + "loss": 0.77087373, + "num_input_tokens_seen": 327346700, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.12237549, + "step": 15175, + "time_per_iteration": 4.020845174789429 + }, + { + "auxiliary_loss_clip": 0.01027405, + "auxiliary_loss_mlp": 0.01001069, + "balance_loss_clip": 1.00511348, + "balance_loss_mlp": 1.00010896, + "epoch": 0.9124304824891026, + "flos": 74189977483680.0, + "grad_norm": 0.9702439307007558, + "language_loss": 0.58384871, + "learning_rate": 7.985110507575421e-08, + "loss": 0.60413349, + "num_input_tokens_seen": 327403050, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.00959015, + "step": 15176, + "time_per_iteration": 4.724566221237183 + }, + { + "auxiliary_loss_clip": 0.01110216, + "auxiliary_loss_mlp": 0.010333, + "balance_loss_clip": 1.03792918, + "balance_loss_mlp": 1.0225296, + "epoch": 0.9124906057417707, + "flos": 22147559792640.0, + "grad_norm": 2.0705595383923967, + "language_loss": 0.65538335, + "learning_rate": 7.97421916704475e-08, + "loss": 0.67681849, + "num_input_tokens_seen": 327422225, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10772705, + "step": 15177, + "time_per_iteration": 2.5762314796447754 + }, + { + "auxiliary_loss_clip": 0.01107512, + "auxiliary_loss_mlp": 0.01027744, + "balance_loss_clip": 1.03615165, + "balance_loss_mlp": 1.01710439, + "epoch": 0.9125507289944386, + "flos": 14260050432000.0, + "grad_norm": 1.9623139432531445, + "language_loss": 0.81218213, + "learning_rate": 7.963335108150926e-08, + "loss": 0.83353466, + "num_input_tokens_seen": 327437025, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10644531, + "step": 15178, + "time_per_iteration": 2.5876028537750244 + }, + { + "auxiliary_loss_clip": 0.01108787, + "auxiliary_loss_mlp": 0.01024481, + "balance_loss_clip": 1.03763723, + "balance_loss_mlp": 1.01356745, + "epoch": 0.9126108522471066, + "flos": 21656924868960.0, + "grad_norm": 2.0310235926252584, + "language_loss": 0.78895265, + "learning_rate": 7.952458331306711e-08, + "loss": 0.81028533, + "num_input_tokens_seen": 327453915, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10919189, + "step": 15179, + "time_per_iteration": 2.5779263973236084 + }, + { + "auxiliary_loss_clip": 0.01106762, + "auxiliary_loss_mlp": 0.01029969, + "balance_loss_clip": 1.03640294, + "balance_loss_mlp": 1.01929998, + "epoch": 0.9126709754997745, + "flos": 33721776619200.0, + "grad_norm": 1.547590861919999, + "language_loss": 0.68178183, + "learning_rate": 7.941588836924507e-08, + "loss": 0.7031492, + "num_input_tokens_seen": 327474415, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10668945, + "step": 15180, + "time_per_iteration": 2.667022466659546 + }, + { + "auxiliary_loss_clip": 0.01105158, + "auxiliary_loss_mlp": 0.01024637, + "balance_loss_clip": 1.03535438, + "balance_loss_mlp": 1.01431918, + "epoch": 0.9127310987524425, + "flos": 19431304060320.0, + "grad_norm": 1.9829780877318361, + "language_loss": 0.75176799, + "learning_rate": 7.930726625416495e-08, + "loss": 0.77306598, + "num_input_tokens_seen": 327492750, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10327148, + "step": 15181, + "time_per_iteration": 2.651921272277832 + }, + { + "auxiliary_loss_clip": 0.0111374, + "auxiliary_loss_mlp": 0.01028361, + "balance_loss_clip": 1.03906369, + "balance_loss_mlp": 1.01748872, + "epoch": 0.9127912220051104, + "flos": 26280015386400.0, + "grad_norm": 1.7241718467045397, + "language_loss": 0.74831885, + "learning_rate": 7.919871697194614e-08, + "loss": 0.76973987, + "num_input_tokens_seen": 327509470, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.10870361, + "step": 15182, + "time_per_iteration": 2.607034921646118 + }, + { + "auxiliary_loss_clip": 0.01111463, + "auxiliary_loss_mlp": 0.01028883, + "balance_loss_clip": 1.03760052, + "balance_loss_mlp": 1.01757622, + "epoch": 0.9128513452577784, + "flos": 29362415492160.0, + "grad_norm": 1.5483208016445773, + "language_loss": 0.76400971, + "learning_rate": 7.909024052670421e-08, + "loss": 0.78541315, + "num_input_tokens_seen": 327530520, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11309814, + "step": 15183, + "time_per_iteration": 2.685307502746582 + }, + { + "auxiliary_loss_clip": 0.01113839, + "auxiliary_loss_mlp": 0.01028735, + "balance_loss_clip": 1.03940129, + "balance_loss_mlp": 1.01755285, + "epoch": 0.9129114685104465, + "flos": 19787035492800.0, + "grad_norm": 2.4539054606997843, + "language_loss": 0.76953888, + "learning_rate": 7.898183692255256e-08, + "loss": 0.7909646, + "num_input_tokens_seen": 327546960, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11175537, + "step": 15184, + "time_per_iteration": 2.5919384956359863 + }, + { + "auxiliary_loss_clip": 0.01111245, + "auxiliary_loss_mlp": 0.01031586, + "balance_loss_clip": 1.0389421, + "balance_loss_mlp": 1.02091646, + "epoch": 0.9129715917631144, + "flos": 23653330659360.0, + "grad_norm": 1.7374030576793298, + "language_loss": 0.74339682, + "learning_rate": 7.887350616360233e-08, + "loss": 0.76482511, + "num_input_tokens_seen": 327564830, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10662842, + "step": 15185, + "time_per_iteration": 2.6672356128692627 + }, + { + "auxiliary_loss_clip": 0.01108583, + "auxiliary_loss_mlp": 0.0102951, + "balance_loss_clip": 1.03684604, + "balance_loss_mlp": 1.01826835, + "epoch": 0.9130317150157824, + "flos": 25124911292160.0, + "grad_norm": 2.6379284564705285, + "language_loss": 0.68570006, + "learning_rate": 7.876524825396158e-08, + "loss": 0.70708096, + "num_input_tokens_seen": 327583675, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11230469, + "step": 15186, + "time_per_iteration": 2.679997444152832 + }, + { + "auxiliary_loss_clip": 0.01116839, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.0396297, + "balance_loss_mlp": 1.01966691, + "epoch": 0.9130918382684503, + "flos": 24635289300480.0, + "grad_norm": 2.054910211103661, + "language_loss": 0.77400041, + "learning_rate": 7.865706319773502e-08, + "loss": 0.79548472, + "num_input_tokens_seen": 327602280, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.11920166, + "step": 15187, + "time_per_iteration": 3.954368829727173 + }, + { + "auxiliary_loss_clip": 0.0110923, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.03701413, + "balance_loss_mlp": 1.01953983, + "epoch": 0.9131519615211183, + "flos": 31184906001120.0, + "grad_norm": 1.8808012676448649, + "language_loss": 0.65670478, + "learning_rate": 7.854895099902515e-08, + "loss": 0.67809343, + "num_input_tokens_seen": 327623515, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10095215, + "step": 15188, + "time_per_iteration": 2.6763980388641357 + }, + { + "auxiliary_loss_clip": 0.01106257, + "auxiliary_loss_mlp": 0.01029729, + "balance_loss_clip": 1.03565013, + "balance_loss_mlp": 1.01926231, + "epoch": 0.9132120847737862, + "flos": 21879900226080.0, + "grad_norm": 2.0345053159589215, + "language_loss": 0.7673136, + "learning_rate": 7.844091166193157e-08, + "loss": 0.78867346, + "num_input_tokens_seen": 327642875, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10467529, + "step": 15189, + "time_per_iteration": 2.60209584236145 + }, + { + "auxiliary_loss_clip": 0.01107193, + "auxiliary_loss_mlp": 0.0102614, + "balance_loss_clip": 1.03701639, + "balance_loss_mlp": 1.01606119, + "epoch": 0.9132722080264543, + "flos": 24462103330080.0, + "grad_norm": 1.7424972629047792, + "language_loss": 0.75307727, + "learning_rate": 7.8332945190551e-08, + "loss": 0.77441061, + "num_input_tokens_seen": 327662450, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10083008, + "step": 15190, + "time_per_iteration": 2.6569130420684814 + }, + { + "auxiliary_loss_clip": 0.01027364, + "auxiliary_loss_mlp": 0.01002033, + "balance_loss_clip": 1.00501132, + "balance_loss_mlp": 1.00113177, + "epoch": 0.9133323312791222, + "flos": 85950832017600.0, + "grad_norm": 0.7109237178920825, + "language_loss": 0.57329649, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59359044, + "num_input_tokens_seen": 327723845, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.00901031, + "step": 15191, + "time_per_iteration": 3.275312662124634 + }, + { + "auxiliary_loss_clip": 0.01112069, + "auxiliary_loss_mlp": 0.01034105, + "balance_loss_clip": 1.03873634, + "balance_loss_mlp": 1.02205324, + "epoch": 0.9133924545317902, + "flos": 31095740168640.0, + "grad_norm": 1.8836159090281082, + "language_loss": 0.74775612, + "learning_rate": 7.81172308613034e-08, + "loss": 0.76921785, + "num_input_tokens_seen": 327742590, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.1204834, + "step": 15192, + "time_per_iteration": 2.7642478942871094 + }, + { + "auxiliary_loss_clip": 0.01108885, + "auxiliary_loss_mlp": 0.01027136, + "balance_loss_clip": 1.03875959, + "balance_loss_mlp": 1.01649654, + "epoch": 0.9134525777844581, + "flos": 48725089630560.0, + "grad_norm": 1.5874546245727625, + "language_loss": 0.69312894, + "learning_rate": 7.800948301161647e-08, + "loss": 0.71448922, + "num_input_tokens_seen": 327764350, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10638428, + "step": 15193, + "time_per_iteration": 2.7813243865966797 + }, + { + "auxiliary_loss_clip": 0.01107769, + "auxiliary_loss_mlp": 0.01039709, + "balance_loss_clip": 1.0381155, + "balance_loss_mlp": 1.02936196, + "epoch": 0.9135127010371261, + "flos": 25486031522880.0, + "grad_norm": 1.649450051440481, + "language_loss": 0.73252583, + "learning_rate": 7.790180804400215e-08, + "loss": 0.7540006, + "num_input_tokens_seen": 327783120, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10345459, + "step": 15194, + "time_per_iteration": 2.63883113861084 + }, + { + "auxiliary_loss_clip": 0.01111671, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.0364455, + "balance_loss_mlp": 1.01700044, + "epoch": 0.913572824289794, + "flos": 25396420000320.0, + "grad_norm": 1.8570267287955782, + "language_loss": 0.61552358, + "learning_rate": 7.779420596254383e-08, + "loss": 0.63693321, + "num_input_tokens_seen": 327801960, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12280273, + "step": 15195, + "time_per_iteration": 2.6202950477600098 + }, + { + "auxiliary_loss_clip": 0.01108989, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.03664017, + "balance_loss_mlp": 1.01712084, + "epoch": 0.913632947542462, + "flos": 31363521287040.0, + "grad_norm": 1.7044483968047037, + "language_loss": 0.71649057, + "learning_rate": 7.768667677132201e-08, + "loss": 0.73786187, + "num_input_tokens_seen": 327823795, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11022949, + "step": 15196, + "time_per_iteration": 2.6872057914733887 + }, + { + "auxiliary_loss_clip": 0.01109812, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.03886414, + "balance_loss_mlp": 1.01960492, + "epoch": 0.9136930707951301, + "flos": 32074943634720.0, + "grad_norm": 1.5880130724576036, + "language_loss": 0.71286142, + "learning_rate": 7.757922047441411e-08, + "loss": 0.73426402, + "num_input_tokens_seen": 327845175, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10845947, + "step": 15197, + "time_per_iteration": 2.732139825820923 + }, + { + "auxiliary_loss_clip": 0.01110283, + "auxiliary_loss_mlp": 0.01027677, + "balance_loss_clip": 1.03666615, + "balance_loss_mlp": 1.01588726, + "epoch": 0.913753194047798, + "flos": 26956558706400.0, + "grad_norm": 3.0563455266842823, + "language_loss": 0.77770001, + "learning_rate": 7.747183707589489e-08, + "loss": 0.7990796, + "num_input_tokens_seen": 327863150, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11785889, + "step": 15198, + "time_per_iteration": 2.637371301651001 + }, + { + "auxiliary_loss_clip": 0.01106094, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.03716874, + "balance_loss_mlp": 1.01800299, + "epoch": 0.913813317300466, + "flos": 28781318183040.0, + "grad_norm": 1.4638542000221215, + "language_loss": 0.68167835, + "learning_rate": 7.736452657983616e-08, + "loss": 0.70302671, + "num_input_tokens_seen": 327883445, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.10736084, + "step": 15199, + "time_per_iteration": 2.687073230743408 + }, + { + "auxiliary_loss_clip": 0.01111425, + "auxiliary_loss_mlp": 0.01032894, + "balance_loss_clip": 1.03841305, + "balance_loss_mlp": 1.02217674, + "epoch": 0.9138734405531339, + "flos": 35235772493760.0, + "grad_norm": 1.636392992435464, + "language_loss": 0.67825586, + "learning_rate": 7.725728899030714e-08, + "loss": 0.69969904, + "num_input_tokens_seen": 327905745, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10717773, + "step": 15200, + "time_per_iteration": 2.656956672668457 + }, + { + "auxiliary_loss_clip": 0.01109196, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.0392592, + "balance_loss_mlp": 1.01769745, + "epoch": 0.9139335638058019, + "flos": 27846434270880.0, + "grad_norm": 1.5657293350648607, + "language_loss": 0.71352005, + "learning_rate": 7.715012431137435e-08, + "loss": 0.73488867, + "num_input_tokens_seen": 327925435, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.09979248, + "step": 15201, + "time_per_iteration": 2.6989564895629883 + }, + { + "auxiliary_loss_clip": 0.01108447, + "auxiliary_loss_mlp": 0.01027579, + "balance_loss_clip": 1.03664017, + "balance_loss_mlp": 1.01742232, + "epoch": 0.9139936870584698, + "flos": 21968539333920.0, + "grad_norm": 1.8622475721380716, + "language_loss": 0.70781159, + "learning_rate": 7.704303254710165e-08, + "loss": 0.72917187, + "num_input_tokens_seen": 327944145, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10150146, + "step": 15202, + "time_per_iteration": 2.5844810009002686 + }, + { + "auxiliary_loss_clip": 0.01109051, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.03743196, + "balance_loss_mlp": 1.01972795, + "epoch": 0.9140538103111379, + "flos": 19295752292640.0, + "grad_norm": 2.863180797845372, + "language_loss": 0.66698581, + "learning_rate": 7.693601370155001e-08, + "loss": 0.6883868, + "num_input_tokens_seen": 327960565, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11328125, + "step": 15203, + "time_per_iteration": 2.6311936378479004 + }, + { + "auxiliary_loss_clip": 0.01111623, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.03943849, + "balance_loss_mlp": 1.01682222, + "epoch": 0.9141139335638058, + "flos": 29269360000800.0, + "grad_norm": 1.6273276089111568, + "language_loss": 0.68989474, + "learning_rate": 7.682906777877751e-08, + "loss": 0.71129811, + "num_input_tokens_seen": 327981180, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11907959, + "step": 15204, + "time_per_iteration": 2.6413662433624268 + }, + { + "auxiliary_loss_clip": 0.01109524, + "auxiliary_loss_mlp": 0.01026637, + "balance_loss_clip": 1.03638339, + "balance_loss_mlp": 1.01484179, + "epoch": 0.9141740568164738, + "flos": 29314895073120.0, + "grad_norm": 2.1579878281068505, + "language_loss": 0.59139764, + "learning_rate": 7.672219478283915e-08, + "loss": 0.61275923, + "num_input_tokens_seen": 328001500, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11779785, + "step": 15205, + "time_per_iteration": 2.69608998298645 + }, + { + "auxiliary_loss_clip": 0.01105738, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.0365001, + "balance_loss_mlp": 1.01939511, + "epoch": 0.9142341800691417, + "flos": 32968060581600.0, + "grad_norm": 1.6547839706380076, + "language_loss": 0.81047988, + "learning_rate": 7.661539471778811e-08, + "loss": 0.83184052, + "num_input_tokens_seen": 328023025, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10931396, + "step": 15206, + "time_per_iteration": 4.143695592880249 + }, + { + "auxiliary_loss_clip": 0.01109271, + "auxiliary_loss_mlp": 0.01025464, + "balance_loss_clip": 1.03611875, + "balance_loss_mlp": 1.01418638, + "epoch": 0.9142943033218097, + "flos": 24907324733280.0, + "grad_norm": 2.3609088661889532, + "language_loss": 0.7375567, + "learning_rate": 7.650866758767382e-08, + "loss": 0.75890404, + "num_input_tokens_seen": 328041410, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11273193, + "step": 15207, + "time_per_iteration": 2.6462690830230713 + }, + { + "auxiliary_loss_clip": 0.01107936, + "auxiliary_loss_mlp": 0.01032726, + "balance_loss_clip": 1.03683543, + "balance_loss_mlp": 1.02113867, + "epoch": 0.9143544265744776, + "flos": 24105440000160.0, + "grad_norm": 1.7197857935401755, + "language_loss": 0.72951859, + "learning_rate": 7.640201339654373e-08, + "loss": 0.75092518, + "num_input_tokens_seen": 328060495, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11590576, + "step": 15208, + "time_per_iteration": 2.6936440467834473 + }, + { + "auxiliary_loss_clip": 0.01108298, + "auxiliary_loss_mlp": 0.01026821, + "balance_loss_clip": 1.03796768, + "balance_loss_mlp": 1.01638985, + "epoch": 0.9144145498271457, + "flos": 20945016313920.0, + "grad_norm": 2.5929497884712753, + "language_loss": 0.86108613, + "learning_rate": 7.629543214844237e-08, + "loss": 0.88243735, + "num_input_tokens_seen": 328076905, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10424805, + "step": 15209, + "time_per_iteration": 2.656874656677246 + }, + { + "auxiliary_loss_clip": 0.01108997, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.03787673, + "balance_loss_mlp": 1.02434552, + "epoch": 0.9144746730798137, + "flos": 28951424840160.0, + "grad_norm": 11.904185101953448, + "language_loss": 0.75325662, + "learning_rate": 7.618892384741093e-08, + "loss": 0.77469432, + "num_input_tokens_seen": 328096960, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10430908, + "step": 15210, + "time_per_iteration": 2.714759349822998 + }, + { + "auxiliary_loss_clip": 0.01105964, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.03355801, + "balance_loss_mlp": 1.01955891, + "epoch": 0.9145347963324816, + "flos": 31540070191680.0, + "grad_norm": 1.9064830652032954, + "language_loss": 0.78294444, + "learning_rate": 7.6082488497488e-08, + "loss": 0.80431414, + "num_input_tokens_seen": 328115445, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11444092, + "step": 15211, + "time_per_iteration": 2.6528053283691406 + }, + { + "auxiliary_loss_clip": 0.01111252, + "auxiliary_loss_mlp": 0.01026321, + "balance_loss_clip": 1.03881562, + "balance_loss_mlp": 1.01568198, + "epoch": 0.9145949195851496, + "flos": 23480144688960.0, + "grad_norm": 2.0934265213336145, + "language_loss": 0.82631099, + "learning_rate": 7.597612610270986e-08, + "loss": 0.84768677, + "num_input_tokens_seen": 328133965, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10638428, + "step": 15212, + "time_per_iteration": 2.659343957901001 + }, + { + "auxiliary_loss_clip": 0.01107215, + "auxiliary_loss_mlp": 0.01025097, + "balance_loss_clip": 1.03716254, + "balance_loss_mlp": 1.01461279, + "epoch": 0.9146550428378175, + "flos": 22325081112000.0, + "grad_norm": 1.7403745685233065, + "language_loss": 0.83827806, + "learning_rate": 7.586983666711022e-08, + "loss": 0.8596012, + "num_input_tokens_seen": 328151520, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.1048584, + "step": 15213, + "time_per_iteration": 2.592397928237915 + }, + { + "auxiliary_loss_clip": 0.01109778, + "auxiliary_loss_mlp": 0.01029029, + "balance_loss_clip": 1.0376929, + "balance_loss_mlp": 1.01825202, + "epoch": 0.9147151660904855, + "flos": 24506787539520.0, + "grad_norm": 1.862877308651324, + "language_loss": 0.70389664, + "learning_rate": 7.576362019471894e-08, + "loss": 0.72528476, + "num_input_tokens_seen": 328171275, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10778809, + "step": 15214, + "time_per_iteration": 2.6985421180725098 + }, + { + "auxiliary_loss_clip": 0.01112323, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.03763008, + "balance_loss_mlp": 1.02211213, + "epoch": 0.9147752893431534, + "flos": 29759954407200.0, + "grad_norm": 1.8498698210338245, + "language_loss": 0.62680793, + "learning_rate": 7.565747668956413e-08, + "loss": 0.64827037, + "num_input_tokens_seen": 328192115, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11810303, + "step": 15215, + "time_per_iteration": 4.100326061248779 + }, + { + "auxiliary_loss_clip": 0.0111519, + "auxiliary_loss_mlp": 0.01026825, + "balance_loss_clip": 1.03953528, + "balance_loss_mlp": 1.01547587, + "epoch": 0.9148354125958215, + "flos": 22146952033440.0, + "grad_norm": 2.6124586043091984, + "language_loss": 0.76289189, + "learning_rate": 7.555140615567058e-08, + "loss": 0.78431207, + "num_input_tokens_seen": 328208990, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11358643, + "step": 15216, + "time_per_iteration": 4.16969633102417 + }, + { + "auxiliary_loss_clip": 0.01111679, + "auxiliary_loss_mlp": 0.01033702, + "balance_loss_clip": 1.04040575, + "balance_loss_mlp": 1.0224551, + "epoch": 0.9148955358484894, + "flos": 28513334478240.0, + "grad_norm": 2.9723716711216874, + "language_loss": 0.68436009, + "learning_rate": 7.544540859706062e-08, + "loss": 0.70581394, + "num_input_tokens_seen": 328227840, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11248779, + "step": 15217, + "time_per_iteration": 2.644556760787964 + }, + { + "auxiliary_loss_clip": 0.01109406, + "auxiliary_loss_mlp": 0.01028665, + "balance_loss_clip": 1.0387665, + "balance_loss_mlp": 1.01790643, + "epoch": 0.9149556591011574, + "flos": 22056854303520.0, + "grad_norm": 1.7440571159244933, + "language_loss": 0.80037045, + "learning_rate": 7.533948401775347e-08, + "loss": 0.82175118, + "num_input_tokens_seen": 328246250, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10760498, + "step": 15218, + "time_per_iteration": 2.660430908203125 + }, + { + "auxiliary_loss_clip": 0.01027519, + "auxiliary_loss_mlp": 0.01000982, + "balance_loss_clip": 1.00518131, + "balance_loss_mlp": 1.00001454, + "epoch": 0.9150157823538253, + "flos": 66606789477600.0, + "grad_norm": 0.8551755598107138, + "language_loss": 0.5917474, + "learning_rate": 7.523363242176595e-08, + "loss": 0.61203235, + "num_input_tokens_seen": 328303625, + "router_z_loss_clip": 0.22338867, + "router_z_loss_mlp": 0.00966644, + "step": 15219, + "time_per_iteration": 3.1787655353546143 + }, + { + "auxiliary_loss_clip": 0.01106651, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.03691101, + "balance_loss_mlp": 1.02148318, + "epoch": 0.9150759056064933, + "flos": 21833149635360.0, + "grad_norm": 3.095625344413666, + "language_loss": 0.78649664, + "learning_rate": 7.512785381311216e-08, + "loss": 0.80788916, + "num_input_tokens_seen": 328322135, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.11126709, + "step": 15220, + "time_per_iteration": 2.615605115890503 + }, + { + "auxiliary_loss_clip": 0.01112796, + "auxiliary_loss_mlp": 0.01031624, + "balance_loss_clip": 1.03710508, + "balance_loss_mlp": 1.01989985, + "epoch": 0.9151360288591612, + "flos": 22053491369280.0, + "grad_norm": 2.2735529019842335, + "language_loss": 0.66276193, + "learning_rate": 7.50221481958031e-08, + "loss": 0.68420619, + "num_input_tokens_seen": 328340750, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11730957, + "step": 15221, + "time_per_iteration": 2.6212587356567383 + }, + { + "auxiliary_loss_clip": 0.01108101, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.0362308, + "balance_loss_mlp": 1.01875782, + "epoch": 0.9151961521118293, + "flos": 23616304215840.0, + "grad_norm": 1.8566760734534644, + "language_loss": 0.84109753, + "learning_rate": 7.491651557384692e-08, + "loss": 0.86246973, + "num_input_tokens_seen": 328359995, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.1036377, + "step": 15222, + "time_per_iteration": 2.7396466732025146 + }, + { + "auxiliary_loss_clip": 0.01027308, + "auxiliary_loss_mlp": 0.01001256, + "balance_loss_clip": 1.0049113, + "balance_loss_mlp": 1.00029075, + "epoch": 0.9152562753644973, + "flos": 88034175190080.0, + "grad_norm": 0.7441813475938308, + "language_loss": 0.49565288, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51593852, + "num_input_tokens_seen": 328426865, + "router_z_loss_clip": 0.22412109, + "router_z_loss_mlp": 0.00963593, + "step": 15223, + "time_per_iteration": 3.2735962867736816 + }, + { + "auxiliary_loss_clip": 0.01111177, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.03801143, + "balance_loss_mlp": 1.02519619, + "epoch": 0.9153163986171652, + "flos": 25352181480960.0, + "grad_norm": 2.0631664675239807, + "language_loss": 0.72634012, + "learning_rate": 7.470546933201349e-08, + "loss": 0.74781895, + "num_input_tokens_seen": 328445970, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11505127, + "step": 15224, + "time_per_iteration": 2.630380392074585 + }, + { + "auxiliary_loss_clip": 0.01107376, + "auxiliary_loss_mlp": 0.01024613, + "balance_loss_clip": 1.03665173, + "balance_loss_mlp": 1.01341915, + "epoch": 0.9153765218698332, + "flos": 28113688664640.0, + "grad_norm": 1.8444678116817939, + "language_loss": 0.8125667, + "learning_rate": 7.460005572013895e-08, + "loss": 0.8338865, + "num_input_tokens_seen": 328464585, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11193848, + "step": 15225, + "time_per_iteration": 2.6304075717926025 + }, + { + "auxiliary_loss_clip": 0.01107869, + "auxiliary_loss_mlp": 0.01022352, + "balance_loss_clip": 1.03614926, + "balance_loss_mlp": 1.01189113, + "epoch": 0.9154366451225011, + "flos": 35374970816640.0, + "grad_norm": 1.4756836057524982, + "language_loss": 0.71331954, + "learning_rate": 7.44947151196238e-08, + "loss": 0.73462176, + "num_input_tokens_seen": 328490155, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10461426, + "step": 15226, + "time_per_iteration": 3.999922037124634 + }, + { + "auxiliary_loss_clip": 0.01110809, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.03729129, + "balance_loss_mlp": 1.01714182, + "epoch": 0.9154967683751691, + "flos": 27222232926240.0, + "grad_norm": 2.077208052347883, + "language_loss": 0.74948883, + "learning_rate": 7.43894475344613e-08, + "loss": 0.77088386, + "num_input_tokens_seen": 328508275, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11553955, + "step": 15227, + "time_per_iteration": 2.7125794887542725 + }, + { + "auxiliary_loss_clip": 0.01107944, + "auxiliary_loss_mlp": 0.01027767, + "balance_loss_clip": 1.03667021, + "balance_loss_mlp": 1.01735449, + "epoch": 0.915556891627837, + "flos": 29982484074240.0, + "grad_norm": 1.5318315770118343, + "language_loss": 0.739694, + "learning_rate": 7.428425296864404e-08, + "loss": 0.76105112, + "num_input_tokens_seen": 328529425, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10412598, + "step": 15228, + "time_per_iteration": 2.6562840938568115 + }, + { + "auxiliary_loss_clip": 0.01105406, + "auxiliary_loss_mlp": 0.01030559, + "balance_loss_clip": 1.03431463, + "balance_loss_mlp": 1.02005696, + "epoch": 0.9156170148805051, + "flos": 27045400400640.0, + "grad_norm": 2.209432200641819, + "language_loss": 0.72422612, + "learning_rate": 7.417913142616106e-08, + "loss": 0.74558574, + "num_input_tokens_seen": 328550200, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10510254, + "step": 15229, + "time_per_iteration": 2.7531514167785645 + }, + { + "auxiliary_loss_clip": 0.01111638, + "auxiliary_loss_mlp": 0.01033474, + "balance_loss_clip": 1.038908, + "balance_loss_mlp": 1.02180898, + "epoch": 0.915677138133173, + "flos": 25527676936320.0, + "grad_norm": 1.6043827706307827, + "language_loss": 0.8310045, + "learning_rate": 7.407408291099848e-08, + "loss": 0.85245568, + "num_input_tokens_seen": 328568540, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11657715, + "step": 15230, + "time_per_iteration": 2.638814926147461 + }, + { + "auxiliary_loss_clip": 0.01106987, + "auxiliary_loss_mlp": 0.01027323, + "balance_loss_clip": 1.03703618, + "balance_loss_mlp": 1.01699948, + "epoch": 0.915737261385841, + "flos": 29706964155360.0, + "grad_norm": 1.9652931778365974, + "language_loss": 0.83456618, + "learning_rate": 7.396910742713957e-08, + "loss": 0.85590923, + "num_input_tokens_seen": 328587300, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10327148, + "step": 15231, + "time_per_iteration": 2.7185046672821045 + }, + { + "auxiliary_loss_clip": 0.01105379, + "auxiliary_loss_mlp": 0.0102573, + "balance_loss_clip": 1.0351516, + "balance_loss_mlp": 1.0150075, + "epoch": 0.9157973846385089, + "flos": 32654703873600.0, + "grad_norm": 1.576150643478121, + "language_loss": 0.72394907, + "learning_rate": 7.386420497856516e-08, + "loss": 0.74526012, + "num_input_tokens_seen": 328610055, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10723877, + "step": 15232, + "time_per_iteration": 2.7098324298858643 + }, + { + "auxiliary_loss_clip": 0.01109392, + "auxiliary_loss_mlp": 0.01031613, + "balance_loss_clip": 1.0363009, + "balance_loss_mlp": 1.02064538, + "epoch": 0.9158575078911769, + "flos": 22547610779040.0, + "grad_norm": 2.1664570671367183, + "language_loss": 0.67732882, + "learning_rate": 7.375937556925338e-08, + "loss": 0.69873887, + "num_input_tokens_seen": 328626815, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.10980225, + "step": 15233, + "time_per_iteration": 2.6175692081451416 + }, + { + "auxiliary_loss_clip": 0.01112234, + "auxiliary_loss_mlp": 0.01034659, + "balance_loss_clip": 1.03848755, + "balance_loss_mlp": 1.02345967, + "epoch": 0.9159176311438448, + "flos": 26599166065440.0, + "grad_norm": 2.102675893233823, + "language_loss": 0.69534695, + "learning_rate": 7.365461920317861e-08, + "loss": 0.71681589, + "num_input_tokens_seen": 328643995, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11212158, + "step": 15234, + "time_per_iteration": 2.675833225250244 + }, + { + "auxiliary_loss_clip": 0.01110809, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.03853893, + "balance_loss_mlp": 1.02053607, + "epoch": 0.9159777543965129, + "flos": 30242121219360.0, + "grad_norm": 1.6979031829543008, + "language_loss": 0.88621974, + "learning_rate": 7.354993588431391e-08, + "loss": 0.90764499, + "num_input_tokens_seen": 328659565, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11193848, + "step": 15235, + "time_per_iteration": 2.640608549118042 + }, + { + "auxiliary_loss_clip": 0.01111418, + "auxiliary_loss_mlp": 0.01033663, + "balance_loss_clip": 1.03856015, + "balance_loss_mlp": 1.02248681, + "epoch": 0.9160378776491809, + "flos": 32785920292320.0, + "grad_norm": 1.6574302088619275, + "language_loss": 0.77279007, + "learning_rate": 7.344532561662853e-08, + "loss": 0.79424083, + "num_input_tokens_seen": 328679045, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11187744, + "step": 15236, + "time_per_iteration": 2.734463691711426 + }, + { + "auxiliary_loss_clip": 0.01027384, + "auxiliary_loss_mlp": 0.01001806, + "balance_loss_clip": 1.00504947, + "balance_loss_mlp": 1.00087452, + "epoch": 0.9160980009018488, + "flos": 86120249880960.0, + "grad_norm": 0.6762173077377309, + "language_loss": 0.62176549, + "learning_rate": 7.334078840409019e-08, + "loss": 0.64205742, + "num_input_tokens_seen": 328744565, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.00930023, + "step": 15237, + "time_per_iteration": 3.182685136795044 + }, + { + "auxiliary_loss_clip": 0.01113087, + "auxiliary_loss_mlp": 0.01031718, + "balance_loss_clip": 1.03908408, + "balance_loss_mlp": 1.01996946, + "epoch": 0.9161581241545168, + "flos": 19875958221600.0, + "grad_norm": 1.8992946423527242, + "language_loss": 0.74959058, + "learning_rate": 7.323632425066151e-08, + "loss": 0.77103865, + "num_input_tokens_seen": 328762455, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.11749268, + "step": 15238, + "time_per_iteration": 2.717564821243286 + }, + { + "auxiliary_loss_clip": 0.01110978, + "auxiliary_loss_mlp": 0.01024687, + "balance_loss_clip": 1.03735983, + "balance_loss_mlp": 1.01389253, + "epoch": 0.9162182474071847, + "flos": 22498469668800.0, + "grad_norm": 2.268244821517277, + "language_loss": 0.74626094, + "learning_rate": 7.313193316030464e-08, + "loss": 0.76761758, + "num_input_tokens_seen": 328780320, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.10797119, + "step": 15239, + "time_per_iteration": 2.5824451446533203 + }, + { + "auxiliary_loss_clip": 0.01111497, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.03797019, + "balance_loss_mlp": 1.0191828, + "epoch": 0.9162783706598527, + "flos": 23388223681440.0, + "grad_norm": 2.121145658102493, + "language_loss": 0.6329186, + "learning_rate": 7.302761513697819e-08, + "loss": 0.65433878, + "num_input_tokens_seen": 328797570, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11334229, + "step": 15240, + "time_per_iteration": 2.6529793739318848 + }, + { + "auxiliary_loss_clip": 0.01107792, + "auxiliary_loss_mlp": 0.01021463, + "balance_loss_clip": 1.03810954, + "balance_loss_mlp": 1.01138997, + "epoch": 0.9163384939125206, + "flos": 24907891975200.0, + "grad_norm": 1.9510007434229582, + "language_loss": 0.76398218, + "learning_rate": 7.292337018463746e-08, + "loss": 0.78527474, + "num_input_tokens_seen": 328814075, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10070801, + "step": 15241, + "time_per_iteration": 2.5807507038116455 + }, + { + "auxiliary_loss_clip": 0.0111577, + "auxiliary_loss_mlp": 0.0102771, + "balance_loss_clip": 1.03761649, + "balance_loss_mlp": 1.01546717, + "epoch": 0.9163986171651887, + "flos": 23971914096480.0, + "grad_norm": 2.2626790509709167, + "language_loss": 0.67694199, + "learning_rate": 7.281919830723549e-08, + "loss": 0.69837677, + "num_input_tokens_seen": 328831990, + "router_z_loss_clip": 0.78173828, + "router_z_loss_mlp": 0.12249756, + "step": 15242, + "time_per_iteration": 2.618602991104126 + }, + { + "auxiliary_loss_clip": 0.01109784, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.03741586, + "balance_loss_mlp": 1.01929796, + "epoch": 0.9164587404178566, + "flos": 15646152304800.0, + "grad_norm": 2.5707288884075776, + "language_loss": 0.80668402, + "learning_rate": 7.271509950872334e-08, + "loss": 0.82808661, + "num_input_tokens_seen": 328849105, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11175537, + "step": 15243, + "time_per_iteration": 2.597104787826538 + }, + { + "auxiliary_loss_clip": 0.01110989, + "auxiliary_loss_mlp": 0.01027994, + "balance_loss_clip": 1.03625035, + "balance_loss_mlp": 1.01709259, + "epoch": 0.9165188636705246, + "flos": 27222111374400.0, + "grad_norm": 1.9070245249836406, + "language_loss": 0.81926429, + "learning_rate": 7.261107379304721e-08, + "loss": 0.84065413, + "num_input_tokens_seen": 328866810, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.10913086, + "step": 15244, + "time_per_iteration": 2.625624656677246 + }, + { + "auxiliary_loss_clip": 0.0111383, + "auxiliary_loss_mlp": 0.01031767, + "balance_loss_clip": 1.03833687, + "balance_loss_mlp": 1.01991177, + "epoch": 0.9165789869231925, + "flos": 22235955796800.0, + "grad_norm": 3.342301954564491, + "language_loss": 0.72149837, + "learning_rate": 7.250712116415214e-08, + "loss": 0.74295425, + "num_input_tokens_seen": 328885325, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.1184082, + "step": 15245, + "time_per_iteration": 4.04247522354126 + }, + { + "auxiliary_loss_clip": 0.01107119, + "auxiliary_loss_mlp": 0.01028387, + "balance_loss_clip": 1.03582561, + "balance_loss_mlp": 1.01800942, + "epoch": 0.9166391101758605, + "flos": 16706620733760.0, + "grad_norm": 1.6708596530704518, + "language_loss": 0.74665618, + "learning_rate": 7.240324162598033e-08, + "loss": 0.76801127, + "num_input_tokens_seen": 328902655, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.1038208, + "step": 15246, + "time_per_iteration": 2.592876672744751 + }, + { + "auxiliary_loss_clip": 0.0110928, + "auxiliary_loss_mlp": 0.01028908, + "balance_loss_clip": 1.03785944, + "balance_loss_mlp": 1.0171833, + "epoch": 0.9166992334285284, + "flos": 21166411497120.0, + "grad_norm": 2.226553719839198, + "language_loss": 0.75160325, + "learning_rate": 7.229943518247106e-08, + "loss": 0.7729851, + "num_input_tokens_seen": 328918440, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11724854, + "step": 15247, + "time_per_iteration": 2.6540820598602295 + }, + { + "auxiliary_loss_clip": 0.0111305, + "auxiliary_loss_mlp": 0.01027237, + "balance_loss_clip": 1.03949344, + "balance_loss_mlp": 1.01588225, + "epoch": 0.9167593566811965, + "flos": 28957137776640.0, + "grad_norm": 1.810449286810903, + "language_loss": 0.76180255, + "learning_rate": 7.219570183756052e-08, + "loss": 0.78320545, + "num_input_tokens_seen": 328938055, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11358643, + "step": 15248, + "time_per_iteration": 2.6573898792266846 + }, + { + "auxiliary_loss_clip": 0.01110218, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.03755355, + "balance_loss_mlp": 1.02271891, + "epoch": 0.9168194799338644, + "flos": 33944427838080.0, + "grad_norm": 2.823755838624366, + "language_loss": 0.73058152, + "learning_rate": 7.209204159518178e-08, + "loss": 0.75203097, + "num_input_tokens_seen": 328957895, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12005615, + "step": 15249, + "time_per_iteration": 2.6808888912200928 + }, + { + "auxiliary_loss_clip": 0.01112768, + "auxiliary_loss_mlp": 0.01026049, + "balance_loss_clip": 1.03971839, + "balance_loss_mlp": 1.01449144, + "epoch": 0.9168796031865324, + "flos": 26500640741280.0, + "grad_norm": 2.4155106261723343, + "language_loss": 0.76139939, + "learning_rate": 7.198845445926616e-08, + "loss": 0.78278756, + "num_input_tokens_seen": 328971365, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11553955, + "step": 15250, + "time_per_iteration": 2.616600275039673 + }, + { + "auxiliary_loss_clip": 0.01107311, + "auxiliary_loss_mlp": 0.01025416, + "balance_loss_clip": 1.03613138, + "balance_loss_mlp": 1.01431799, + "epoch": 0.9169397264392004, + "flos": 28558302308640.0, + "grad_norm": 1.7634237438473097, + "language_loss": 0.75977534, + "learning_rate": 7.188494043374138e-08, + "loss": 0.78110266, + "num_input_tokens_seen": 328990830, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11090088, + "step": 15251, + "time_per_iteration": 2.654444456100464 + }, + { + "auxiliary_loss_clip": 0.01113966, + "auxiliary_loss_mlp": 0.01030268, + "balance_loss_clip": 1.04007435, + "balance_loss_mlp": 1.01765561, + "epoch": 0.9169998496918683, + "flos": 29225324067840.0, + "grad_norm": 2.4141318139487438, + "language_loss": 0.7998879, + "learning_rate": 7.178149952253298e-08, + "loss": 0.82133031, + "num_input_tokens_seen": 329008345, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12609863, + "step": 15252, + "time_per_iteration": 2.633248805999756 + }, + { + "auxiliary_loss_clip": 0.01109319, + "auxiliary_loss_mlp": 0.01029642, + "balance_loss_clip": 1.03681719, + "balance_loss_mlp": 1.01869905, + "epoch": 0.9170599729445363, + "flos": 22369157562240.0, + "grad_norm": 2.950195923115148, + "language_loss": 0.77322042, + "learning_rate": 7.167813172956316e-08, + "loss": 0.79461002, + "num_input_tokens_seen": 329027440, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10943604, + "step": 15253, + "time_per_iteration": 2.635369300842285 + }, + { + "auxiliary_loss_clip": 0.01111744, + "auxiliary_loss_mlp": 0.01024944, + "balance_loss_clip": 1.03892183, + "balance_loss_mlp": 1.01410794, + "epoch": 0.9171200961972042, + "flos": 27668021571360.0, + "grad_norm": 1.870128035921369, + "language_loss": 0.73341918, + "learning_rate": 7.157483705875256e-08, + "loss": 0.75478601, + "num_input_tokens_seen": 329046445, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.10827637, + "step": 15254, + "time_per_iteration": 4.039762020111084 + }, + { + "auxiliary_loss_clip": 0.01107363, + "auxiliary_loss_mlp": 0.01024679, + "balance_loss_clip": 1.03776789, + "balance_loss_mlp": 1.01423001, + "epoch": 0.9171802194498723, + "flos": 32603861037600.0, + "grad_norm": 1.5051597015530842, + "language_loss": 0.79160714, + "learning_rate": 7.14716155140167e-08, + "loss": 0.81292754, + "num_input_tokens_seen": 329065555, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10461426, + "step": 15255, + "time_per_iteration": 4.188175916671753 + }, + { + "auxiliary_loss_clip": 0.01112024, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.03850913, + "balance_loss_mlp": 1.02044773, + "epoch": 0.9172403427025402, + "flos": 46233794602080.0, + "grad_norm": 2.0433480436320144, + "language_loss": 0.68555135, + "learning_rate": 7.136846709927047e-08, + "loss": 0.70698774, + "num_input_tokens_seen": 329087515, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11181641, + "step": 15256, + "time_per_iteration": 2.7571024894714355 + }, + { + "auxiliary_loss_clip": 0.01107568, + "auxiliary_loss_mlp": 0.0103144, + "balance_loss_clip": 1.03686178, + "balance_loss_mlp": 1.02090812, + "epoch": 0.9173004659552082, + "flos": 20810517995520.0, + "grad_norm": 1.7113715033517456, + "language_loss": 0.8404817, + "learning_rate": 7.126539181842561e-08, + "loss": 0.86187178, + "num_input_tokens_seen": 329106820, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10522461, + "step": 15257, + "time_per_iteration": 2.6529994010925293 + }, + { + "auxiliary_loss_clip": 0.01107405, + "auxiliary_loss_mlp": 0.01029497, + "balance_loss_clip": 1.03716421, + "balance_loss_mlp": 1.01918483, + "epoch": 0.9173605892078761, + "flos": 27090408748320.0, + "grad_norm": 1.716924814627798, + "language_loss": 0.77689904, + "learning_rate": 7.116238967539012e-08, + "loss": 0.79826808, + "num_input_tokens_seen": 329126515, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10308838, + "step": 15258, + "time_per_iteration": 2.618696451187134 + }, + { + "auxiliary_loss_clip": 0.01112851, + "auxiliary_loss_mlp": 0.01029314, + "balance_loss_clip": 1.04110253, + "balance_loss_mlp": 1.01803672, + "epoch": 0.9174207124605441, + "flos": 20142564338880.0, + "grad_norm": 1.8632826012287949, + "language_loss": 0.78419006, + "learning_rate": 7.105946067406999e-08, + "loss": 0.80561167, + "num_input_tokens_seen": 329142660, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.112854, + "step": 15259, + "time_per_iteration": 2.6299524307250977 + }, + { + "auxiliary_loss_clip": 0.01106387, + "auxiliary_loss_mlp": 0.01032751, + "balance_loss_clip": 1.03581095, + "balance_loss_mlp": 1.02222538, + "epoch": 0.917480835713212, + "flos": 29938650727680.0, + "grad_norm": 1.7037605075405788, + "language_loss": 0.76451528, + "learning_rate": 7.095660481836895e-08, + "loss": 0.78590667, + "num_input_tokens_seen": 329162575, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10528564, + "step": 15260, + "time_per_iteration": 2.640399694442749 + }, + { + "auxiliary_loss_clip": 0.01108117, + "auxiliary_loss_mlp": 0.01025746, + "balance_loss_clip": 1.03660786, + "balance_loss_mlp": 1.01512456, + "epoch": 0.9175409589658801, + "flos": 25479265137120.0, + "grad_norm": 1.667949737198613, + "language_loss": 0.60890812, + "learning_rate": 7.085382211218637e-08, + "loss": 0.63024676, + "num_input_tokens_seen": 329182090, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10626221, + "step": 15261, + "time_per_iteration": 2.6878271102905273 + }, + { + "auxiliary_loss_clip": 0.01107231, + "auxiliary_loss_mlp": 0.01024872, + "balance_loss_clip": 1.03680742, + "balance_loss_mlp": 1.0143162, + "epoch": 0.917601082218548, + "flos": 17419785324480.0, + "grad_norm": 1.8461288162471483, + "language_loss": 0.73959291, + "learning_rate": 7.075111255942002e-08, + "loss": 0.76091397, + "num_input_tokens_seen": 329196535, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10559082, + "step": 15262, + "time_per_iteration": 2.5840559005737305 + }, + { + "auxiliary_loss_clip": 0.01111384, + "auxiliary_loss_mlp": 0.01033715, + "balance_loss_clip": 1.0359565, + "balance_loss_mlp": 1.02263486, + "epoch": 0.917661205471216, + "flos": 23304770785440.0, + "grad_norm": 2.18616785366695, + "language_loss": 0.7757144, + "learning_rate": 7.064847616396496e-08, + "loss": 0.79716539, + "num_input_tokens_seen": 329215135, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11077881, + "step": 15263, + "time_per_iteration": 2.600297451019287 + }, + { + "auxiliary_loss_clip": 0.01112874, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.03790641, + "balance_loss_mlp": 1.01965141, + "epoch": 0.917721328723884, + "flos": 25754217814080.0, + "grad_norm": 1.9440820633778695, + "language_loss": 0.75628352, + "learning_rate": 7.054591292971324e-08, + "loss": 0.77771974, + "num_input_tokens_seen": 329235150, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11096191, + "step": 15264, + "time_per_iteration": 2.6066784858703613 + }, + { + "auxiliary_loss_clip": 0.01109117, + "auxiliary_loss_mlp": 0.01034562, + "balance_loss_clip": 1.03668928, + "balance_loss_mlp": 1.02386904, + "epoch": 0.9177814519765519, + "flos": 26775390831840.0, + "grad_norm": 2.3337342831370003, + "language_loss": 0.83165014, + "learning_rate": 7.044342286055394e-08, + "loss": 0.85308689, + "num_input_tokens_seen": 329254365, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10693359, + "step": 15265, + "time_per_iteration": 2.6505231857299805 + }, + { + "auxiliary_loss_clip": 0.01115295, + "auxiliary_loss_mlp": 0.0103797, + "balance_loss_clip": 1.03986764, + "balance_loss_mlp": 1.02595389, + "epoch": 0.9178415752292199, + "flos": 29536533360000.0, + "grad_norm": 1.7781043021472562, + "language_loss": 0.73419404, + "learning_rate": 7.034100596037306e-08, + "loss": 0.7557267, + "num_input_tokens_seen": 329274385, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12017822, + "step": 15266, + "time_per_iteration": 3.962756872177124 + }, + { + "auxiliary_loss_clip": 0.01108682, + "auxiliary_loss_mlp": 0.01026924, + "balance_loss_clip": 1.03661323, + "balance_loss_mlp": 1.01620734, + "epoch": 0.9179016984818879, + "flos": 24455823151680.0, + "grad_norm": 1.628518304946535, + "language_loss": 0.77927738, + "learning_rate": 7.023866223305486e-08, + "loss": 0.80063343, + "num_input_tokens_seen": 329292160, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.1071167, + "step": 15267, + "time_per_iteration": 2.596832036972046 + }, + { + "auxiliary_loss_clip": 0.01027302, + "auxiliary_loss_mlp": 0.0100141, + "balance_loss_clip": 1.00498962, + "balance_loss_mlp": 1.00046647, + "epoch": 0.9179618217345559, + "flos": 79990456599360.0, + "grad_norm": 0.7418269195376092, + "language_loss": 0.56244105, + "learning_rate": 7.013639168247975e-08, + "loss": 0.58272815, + "num_input_tokens_seen": 329351870, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.00941467, + "step": 15268, + "time_per_iteration": 3.326892375946045 + }, + { + "auxiliary_loss_clip": 0.01111394, + "auxiliary_loss_mlp": 0.01026583, + "balance_loss_clip": 1.03787208, + "balance_loss_mlp": 1.0155381, + "epoch": 0.9180219449872238, + "flos": 26020378241280.0, + "grad_norm": 1.8958967787464158, + "language_loss": 0.76336962, + "learning_rate": 7.0034194312526e-08, + "loss": 0.78474939, + "num_input_tokens_seen": 329370930, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.1104126, + "step": 15269, + "time_per_iteration": 2.593294143676758 + }, + { + "auxiliary_loss_clip": 0.01108997, + "auxiliary_loss_mlp": 0.0102701, + "balance_loss_clip": 1.03683972, + "balance_loss_mlp": 1.01559019, + "epoch": 0.9180820682398918, + "flos": 50103331151040.0, + "grad_norm": 1.6941983858910954, + "language_loss": 0.72767514, + "learning_rate": 6.993207012706936e-08, + "loss": 0.74903524, + "num_input_tokens_seen": 329391275, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11413574, + "step": 15270, + "time_per_iteration": 2.8126678466796875 + }, + { + "auxiliary_loss_clip": 0.0110548, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.03528094, + "balance_loss_mlp": 1.01922822, + "epoch": 0.9181421914925597, + "flos": 34255312992000.0, + "grad_norm": 1.6174952236725162, + "language_loss": 0.7981267, + "learning_rate": 6.98300191299821e-08, + "loss": 0.81948698, + "num_input_tokens_seen": 329412775, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11315918, + "step": 15271, + "time_per_iteration": 2.6746602058410645 + }, + { + "auxiliary_loss_clip": 0.01110999, + "auxiliary_loss_mlp": 0.01030325, + "balance_loss_clip": 1.03783047, + "balance_loss_mlp": 1.01931024, + "epoch": 0.9182023147452277, + "flos": 35593165134720.0, + "grad_norm": 2.390642659251496, + "language_loss": 0.72874236, + "learning_rate": 6.972804132513355e-08, + "loss": 0.75015557, + "num_input_tokens_seen": 329432440, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11016846, + "step": 15272, + "time_per_iteration": 2.674886703491211 + }, + { + "auxiliary_loss_clip": 0.01109864, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.03781188, + "balance_loss_mlp": 1.0212779, + "epoch": 0.9182624379978956, + "flos": 29359336178880.0, + "grad_norm": 5.714437713920468, + "language_loss": 0.72471094, + "learning_rate": 6.962613671639105e-08, + "loss": 0.7461288, + "num_input_tokens_seen": 329450605, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10644531, + "step": 15273, + "time_per_iteration": 2.6990272998809814 + }, + { + "auxiliary_loss_clip": 0.01101841, + "auxiliary_loss_mlp": 0.0102479, + "balance_loss_clip": 1.03453064, + "balance_loss_mlp": 1.01454997, + "epoch": 0.9183225612505637, + "flos": 28423115196480.0, + "grad_norm": 1.5425135856041625, + "language_loss": 0.74483705, + "learning_rate": 6.952430530761933e-08, + "loss": 0.76610339, + "num_input_tokens_seen": 329470550, + "router_z_loss_clip": 0.67236328, + "router_z_loss_mlp": 0.10247803, + "step": 15274, + "time_per_iteration": 2.63269305229187 + }, + { + "auxiliary_loss_clip": 0.01109726, + "auxiliary_loss_mlp": 0.01035532, + "balance_loss_clip": 1.0362401, + "balance_loss_mlp": 1.02495849, + "epoch": 0.9183826845032316, + "flos": 23927189369760.0, + "grad_norm": 5.061816587270997, + "language_loss": 0.68611205, + "learning_rate": 6.942254710267902e-08, + "loss": 0.70756459, + "num_input_tokens_seen": 329489765, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10583496, + "step": 15275, + "time_per_iteration": 2.660555601119995 + }, + { + "auxiliary_loss_clip": 0.01107152, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.03662229, + "balance_loss_mlp": 1.01915073, + "epoch": 0.9184428077558996, + "flos": 22547975434560.0, + "grad_norm": 1.835002612847836, + "language_loss": 0.72136188, + "learning_rate": 6.932086210542953e-08, + "loss": 0.74273717, + "num_input_tokens_seen": 329507040, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11242676, + "step": 15276, + "time_per_iteration": 2.5723464488983154 + }, + { + "auxiliary_loss_clip": 0.01111303, + "auxiliary_loss_mlp": 0.01030388, + "balance_loss_clip": 1.0399915, + "balance_loss_mlp": 1.01989818, + "epoch": 0.9185029310085676, + "flos": 25308226582560.0, + "grad_norm": 1.9620024450815385, + "language_loss": 0.73264992, + "learning_rate": 6.921925031972642e-08, + "loss": 0.75406682, + "num_input_tokens_seen": 329525540, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.1050415, + "step": 15277, + "time_per_iteration": 2.658905267715454 + }, + { + "auxiliary_loss_clip": 0.0102743, + "auxiliary_loss_mlp": 0.01001489, + "balance_loss_clip": 1.00506186, + "balance_loss_mlp": 1.00052381, + "epoch": 0.9185630542612355, + "flos": 83229187487040.0, + "grad_norm": 0.7162262367331952, + "language_loss": 0.59171557, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61200476, + "num_input_tokens_seen": 329592905, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.00964355, + "step": 15278, + "time_per_iteration": 3.3168857097625732 + }, + { + "auxiliary_loss_clip": 0.01103649, + "auxiliary_loss_mlp": 0.01025577, + "balance_loss_clip": 1.03363562, + "balance_loss_mlp": 1.01552725, + "epoch": 0.9186231775139035, + "flos": 14934000646080.0, + "grad_norm": 2.090054204936111, + "language_loss": 0.64942783, + "learning_rate": 6.901624639836879e-08, + "loss": 0.6707201, + "num_input_tokens_seen": 329610150, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10058594, + "step": 15279, + "time_per_iteration": 2.60445499420166 + }, + { + "auxiliary_loss_clip": 0.01027435, + "auxiliary_loss_mlp": 0.01001535, + "balance_loss_clip": 1.005077, + "balance_loss_mlp": 1.0005976, + "epoch": 0.9186833007665715, + "flos": 78018719482080.0, + "grad_norm": 0.8572090025392778, + "language_loss": 0.60149407, + "learning_rate": 6.891485427041211e-08, + "loss": 0.62178373, + "num_input_tokens_seen": 329673650, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.0093689, + "step": 15280, + "time_per_iteration": 3.2176172733306885 + }, + { + "auxiliary_loss_clip": 0.01111253, + "auxiliary_loss_mlp": 0.01032702, + "balance_loss_clip": 1.03773046, + "balance_loss_mlp": 1.02128804, + "epoch": 0.9187434240192395, + "flos": 24373099566720.0, + "grad_norm": 2.0517188165539073, + "language_loss": 0.69477046, + "learning_rate": 6.881353536939815e-08, + "loss": 0.71620995, + "num_input_tokens_seen": 329692520, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11413574, + "step": 15281, + "time_per_iteration": 2.595890522003174 + }, + { + "auxiliary_loss_clip": 0.01110628, + "auxiliary_loss_mlp": 0.01027553, + "balance_loss_clip": 1.03701603, + "balance_loss_mlp": 1.0159781, + "epoch": 0.9188035472719074, + "flos": 30783436909920.0, + "grad_norm": 1.8906960071349614, + "language_loss": 0.84665495, + "learning_rate": 6.871228969916831e-08, + "loss": 0.86803675, + "num_input_tokens_seen": 329713750, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11578369, + "step": 15282, + "time_per_iteration": 2.669976234436035 + }, + { + "auxiliary_loss_clip": 0.01108229, + "auxiliary_loss_mlp": 0.01031032, + "balance_loss_clip": 1.03752303, + "balance_loss_mlp": 1.01986182, + "epoch": 0.9188636705245754, + "flos": 22458323394720.0, + "grad_norm": 1.9435269049576558, + "language_loss": 0.59875226, + "learning_rate": 6.861111726356194e-08, + "loss": 0.62014484, + "num_input_tokens_seen": 329730960, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11175537, + "step": 15283, + "time_per_iteration": 2.573151111602783 + }, + { + "auxiliary_loss_clip": 0.0111353, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.03812766, + "balance_loss_mlp": 1.01998675, + "epoch": 0.9189237937772433, + "flos": 29003361642720.0, + "grad_norm": 1.8357487440061917, + "language_loss": 0.65295011, + "learning_rate": 6.851001806641554e-08, + "loss": 0.67439812, + "num_input_tokens_seen": 329750975, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.112854, + "step": 15284, + "time_per_iteration": 4.139976263046265 + }, + { + "auxiliary_loss_clip": 0.01108223, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.03711486, + "balance_loss_mlp": 1.01801825, + "epoch": 0.9189839170299113, + "flos": 25886204061120.0, + "grad_norm": 3.011848523770493, + "language_loss": 0.73718673, + "learning_rate": 6.840899211156292e-08, + "loss": 0.75856274, + "num_input_tokens_seen": 329769645, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.11364746, + "step": 15285, + "time_per_iteration": 2.6292929649353027 + }, + { + "auxiliary_loss_clip": 0.01108852, + "auxiliary_loss_mlp": 0.01033475, + "balance_loss_clip": 1.03706479, + "balance_loss_mlp": 1.02206647, + "epoch": 0.9190440402825792, + "flos": 20410467009120.0, + "grad_norm": 1.8855919960367105, + "language_loss": 0.71664584, + "learning_rate": 6.830803940283458e-08, + "loss": 0.73806912, + "num_input_tokens_seen": 329788185, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11413574, + "step": 15286, + "time_per_iteration": 2.6152291297912598 + }, + { + "auxiliary_loss_clip": 0.01110826, + "auxiliary_loss_mlp": 0.0102954, + "balance_loss_clip": 1.03782082, + "balance_loss_mlp": 1.01801276, + "epoch": 0.9191041635352473, + "flos": 28603148587200.0, + "grad_norm": 2.0467464711970216, + "language_loss": 0.73942304, + "learning_rate": 6.820715994405945e-08, + "loss": 0.76082671, + "num_input_tokens_seen": 329806780, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11535645, + "step": 15287, + "time_per_iteration": 2.6522278785705566 + }, + { + "auxiliary_loss_clip": 0.0111204, + "auxiliary_loss_mlp": 0.01028071, + "balance_loss_clip": 1.03944337, + "balance_loss_mlp": 1.01583457, + "epoch": 0.9191642867879152, + "flos": 22948796249280.0, + "grad_norm": 4.9032040259217915, + "language_loss": 0.65310955, + "learning_rate": 6.810635373906226e-08, + "loss": 0.67451072, + "num_input_tokens_seen": 329826350, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.12249756, + "step": 15288, + "time_per_iteration": 2.6017346382141113 + }, + { + "auxiliary_loss_clip": 0.01114843, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.0424546, + "balance_loss_mlp": 1.02129245, + "epoch": 0.9192244100405832, + "flos": 39243697020000.0, + "grad_norm": 3.4699197905837385, + "language_loss": 0.70997834, + "learning_rate": 6.800562079166549e-08, + "loss": 0.73144662, + "num_input_tokens_seen": 329846160, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10705566, + "step": 15289, + "time_per_iteration": 2.711693048477173 + }, + { + "auxiliary_loss_clip": 0.01111866, + "auxiliary_loss_mlp": 0.01029119, + "balance_loss_clip": 1.03893721, + "balance_loss_mlp": 1.01825333, + "epoch": 0.9192845332932512, + "flos": 19959775773120.0, + "grad_norm": 2.438501214267494, + "language_loss": 0.74693978, + "learning_rate": 6.790496110568921e-08, + "loss": 0.76834971, + "num_input_tokens_seen": 329862020, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.10870361, + "step": 15290, + "time_per_iteration": 2.579779863357544 + }, + { + "auxiliary_loss_clip": 0.01107638, + "auxiliary_loss_mlp": 0.01028634, + "balance_loss_clip": 1.0370717, + "balance_loss_mlp": 1.01826894, + "epoch": 0.9193446565459191, + "flos": 32475521345760.0, + "grad_norm": 2.1333601619376155, + "language_loss": 0.72017777, + "learning_rate": 6.78043746849506e-08, + "loss": 0.74154049, + "num_input_tokens_seen": 329880185, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.1036377, + "step": 15291, + "time_per_iteration": 2.6482596397399902 + }, + { + "auxiliary_loss_clip": 0.01107788, + "auxiliary_loss_mlp": 0.01025379, + "balance_loss_clip": 1.03725624, + "balance_loss_mlp": 1.01466775, + "epoch": 0.9194047797985871, + "flos": 27445127248800.0, + "grad_norm": 2.002897611691877, + "language_loss": 0.70993704, + "learning_rate": 6.770386153326346e-08, + "loss": 0.7312687, + "num_input_tokens_seen": 329900255, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.1071167, + "step": 15292, + "time_per_iteration": 2.6008784770965576 + }, + { + "auxiliary_loss_clip": 0.01109314, + "auxiliary_loss_mlp": 0.01028582, + "balance_loss_clip": 1.03754795, + "balance_loss_mlp": 1.01728654, + "epoch": 0.9194649030512551, + "flos": 30601863862560.0, + "grad_norm": 1.8140875075306748, + "language_loss": 0.73182976, + "learning_rate": 6.760342165443988e-08, + "loss": 0.75320876, + "num_input_tokens_seen": 329919095, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11297607, + "step": 15293, + "time_per_iteration": 2.637376546859741 + }, + { + "auxiliary_loss_clip": 0.01108501, + "auxiliary_loss_mlp": 0.01027152, + "balance_loss_clip": 1.03723431, + "balance_loss_mlp": 1.01573813, + "epoch": 0.9195250263039231, + "flos": 14533341900480.0, + "grad_norm": 2.278181675016831, + "language_loss": 0.78336763, + "learning_rate": 6.750305505228837e-08, + "loss": 0.80472416, + "num_input_tokens_seen": 329936505, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11401367, + "step": 15294, + "time_per_iteration": 4.2202770709991455 + }, + { + "auxiliary_loss_clip": 0.01112466, + "auxiliary_loss_mlp": 0.01030891, + "balance_loss_clip": 1.03797555, + "balance_loss_mlp": 1.01878488, + "epoch": 0.919585149556591, + "flos": 26642270100960.0, + "grad_norm": 10.462172976399309, + "language_loss": 0.7707355, + "learning_rate": 6.74027617306141e-08, + "loss": 0.79216903, + "num_input_tokens_seen": 329956795, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.12115479, + "step": 15295, + "time_per_iteration": 2.6303350925445557 + }, + { + "auxiliary_loss_clip": 0.01107015, + "auxiliary_loss_mlp": 0.01027163, + "balance_loss_clip": 1.03822219, + "balance_loss_mlp": 1.01765013, + "epoch": 0.919645272809259, + "flos": 34390783725120.0, + "grad_norm": 2.2313064176106705, + "language_loss": 0.71286112, + "learning_rate": 6.730254169322114e-08, + "loss": 0.73420292, + "num_input_tokens_seen": 329977195, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.09503174, + "step": 15296, + "time_per_iteration": 2.6652333736419678 + }, + { + "auxiliary_loss_clip": 0.01108915, + "auxiliary_loss_mlp": 0.01036314, + "balance_loss_clip": 1.03712273, + "balance_loss_mlp": 1.0253582, + "epoch": 0.9197053960619269, + "flos": 22369562735040.0, + "grad_norm": 2.0082510597411853, + "language_loss": 0.75354159, + "learning_rate": 6.720239494390912e-08, + "loss": 0.7749939, + "num_input_tokens_seen": 329992095, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.10943604, + "step": 15297, + "time_per_iteration": 2.586642026901245 + }, + { + "auxiliary_loss_clip": 0.01108873, + "auxiliary_loss_mlp": 0.01026571, + "balance_loss_clip": 1.03722847, + "balance_loss_mlp": 1.01547873, + "epoch": 0.9197655193145949, + "flos": 34390094931360.0, + "grad_norm": 1.7712895366582992, + "language_loss": 0.7384907, + "learning_rate": 6.710232148647676e-08, + "loss": 0.75984514, + "num_input_tokens_seen": 330011490, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11090088, + "step": 15298, + "time_per_iteration": 2.744488000869751 + }, + { + "auxiliary_loss_clip": 0.01109576, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.03712821, + "balance_loss_mlp": 1.02143478, + "epoch": 0.9198256425672628, + "flos": 21117391938720.0, + "grad_norm": 2.351018864948536, + "language_loss": 0.79301667, + "learning_rate": 6.70023213247175e-08, + "loss": 0.81444061, + "num_input_tokens_seen": 330027885, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.1137085, + "step": 15299, + "time_per_iteration": 2.593348979949951 + }, + { + "auxiliary_loss_clip": 0.0110933, + "auxiliary_loss_mlp": 0.01023198, + "balance_loss_clip": 1.03850305, + "balance_loss_mlp": 1.01271379, + "epoch": 0.9198857658199309, + "flos": 21790774910880.0, + "grad_norm": 2.069832584220607, + "language_loss": 0.63937616, + "learning_rate": 6.690239446242385e-08, + "loss": 0.66070139, + "num_input_tokens_seen": 330046230, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10479736, + "step": 15300, + "time_per_iteration": 2.66021990776062 + }, + { + "auxiliary_loss_clip": 0.0110401, + "auxiliary_loss_mlp": 0.01028386, + "balance_loss_clip": 1.03703904, + "balance_loss_mlp": 1.01905811, + "epoch": 0.9199458890725988, + "flos": 27000148949280.0, + "grad_norm": 1.8558326417084179, + "language_loss": 0.69120479, + "learning_rate": 6.680254090338545e-08, + "loss": 0.71252877, + "num_input_tokens_seen": 330065535, + "router_z_loss_clip": 0.66992188, + "router_z_loss_mlp": 0.09332275, + "step": 15301, + "time_per_iteration": 2.612790822982788 + }, + { + "auxiliary_loss_clip": 0.01112634, + "auxiliary_loss_mlp": 0.01032104, + "balance_loss_clip": 1.03837681, + "balance_loss_mlp": 1.02003431, + "epoch": 0.9200060123252668, + "flos": 19564424791200.0, + "grad_norm": 1.72670431327423, + "language_loss": 0.71247101, + "learning_rate": 6.670276065138814e-08, + "loss": 0.73391837, + "num_input_tokens_seen": 330082920, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.12072754, + "step": 15302, + "time_per_iteration": 2.603276252746582 + }, + { + "auxiliary_loss_clip": 0.01109372, + "auxiliary_loss_mlp": 0.01030359, + "balance_loss_clip": 1.03680444, + "balance_loss_mlp": 1.01965976, + "epoch": 0.9200661355779348, + "flos": 32781139253280.0, + "grad_norm": 1.9088646006930508, + "language_loss": 0.76683056, + "learning_rate": 6.660305371021579e-08, + "loss": 0.78822792, + "num_input_tokens_seen": 330101165, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10705566, + "step": 15303, + "time_per_iteration": 2.6342477798461914 + }, + { + "auxiliary_loss_clip": 0.01110642, + "auxiliary_loss_mlp": 0.01031383, + "balance_loss_clip": 1.03924119, + "balance_loss_mlp": 1.02007604, + "epoch": 0.9201262588306027, + "flos": 15601832750880.0, + "grad_norm": 3.723900858710783, + "language_loss": 0.87615132, + "learning_rate": 6.650342008365006e-08, + "loss": 0.89757162, + "num_input_tokens_seen": 330118775, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11309814, + "step": 15304, + "time_per_iteration": 2.628815174102783 + }, + { + "auxiliary_loss_clip": 0.01113961, + "auxiliary_loss_mlp": 0.01029742, + "balance_loss_clip": 1.03923512, + "balance_loss_mlp": 1.01668215, + "epoch": 0.9201863820832707, + "flos": 25174700678880.0, + "grad_norm": 2.122260743186412, + "language_loss": 0.77532339, + "learning_rate": 6.64038597754677e-08, + "loss": 0.79676038, + "num_input_tokens_seen": 330135570, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.13049316, + "step": 15305, + "time_per_iteration": 2.5972445011138916 + }, + { + "auxiliary_loss_clip": 0.01108507, + "auxiliary_loss_mlp": 0.01033775, + "balance_loss_clip": 1.03626251, + "balance_loss_mlp": 1.02234876, + "epoch": 0.9202465053359387, + "flos": 32208672124800.0, + "grad_norm": 3.165314066848032, + "language_loss": 0.81590039, + "learning_rate": 6.630437278944501e-08, + "loss": 0.83732319, + "num_input_tokens_seen": 330152840, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11419678, + "step": 15306, + "time_per_iteration": 4.002558708190918 + }, + { + "auxiliary_loss_clip": 0.01105946, + "auxiliary_loss_mlp": 0.01029955, + "balance_loss_clip": 1.03608477, + "balance_loss_mlp": 1.01991749, + "epoch": 0.9203066285886067, + "flos": 12796937910720.0, + "grad_norm": 2.045837696555656, + "language_loss": 0.71743089, + "learning_rate": 6.62049591293541e-08, + "loss": 0.73878992, + "num_input_tokens_seen": 330168605, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.1003418, + "step": 15307, + "time_per_iteration": 2.64591908454895 + }, + { + "auxiliary_loss_clip": 0.01110799, + "auxiliary_loss_mlp": 0.01029328, + "balance_loss_clip": 1.03656471, + "balance_loss_mlp": 1.01795602, + "epoch": 0.9203667518412746, + "flos": 23660502217920.0, + "grad_norm": 1.961404423983814, + "language_loss": 0.78841406, + "learning_rate": 6.610561879896526e-08, + "loss": 0.80981529, + "num_input_tokens_seen": 330186160, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1137085, + "step": 15308, + "time_per_iteration": 2.578202962875366 + }, + { + "auxiliary_loss_clip": 0.01106976, + "auxiliary_loss_mlp": 0.01028004, + "balance_loss_clip": 1.03534853, + "balance_loss_mlp": 1.01713169, + "epoch": 0.9204268750939426, + "flos": 19431304060320.0, + "grad_norm": 2.757722770059241, + "language_loss": 0.78298062, + "learning_rate": 6.600635180204484e-08, + "loss": 0.80433047, + "num_input_tokens_seen": 330201780, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10864258, + "step": 15309, + "time_per_iteration": 2.5972414016723633 + }, + { + "auxiliary_loss_clip": 0.01108552, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.03649473, + "balance_loss_mlp": 1.01816845, + "epoch": 0.9204869983466105, + "flos": 20098568923200.0, + "grad_norm": 2.507403662069543, + "language_loss": 0.66511023, + "learning_rate": 6.590715814235781e-08, + "loss": 0.68648863, + "num_input_tokens_seen": 330219165, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11120605, + "step": 15310, + "time_per_iteration": 2.580326795578003 + }, + { + "auxiliary_loss_clip": 0.01108127, + "auxiliary_loss_mlp": 0.01031609, + "balance_loss_clip": 1.0355829, + "balance_loss_mlp": 1.02009916, + "epoch": 0.9205471215992785, + "flos": 26282649009600.0, + "grad_norm": 2.5505148228261394, + "language_loss": 0.66099977, + "learning_rate": 6.580803782366495e-08, + "loss": 0.68239713, + "num_input_tokens_seen": 330238975, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11517334, + "step": 15311, + "time_per_iteration": 2.6276960372924805 + }, + { + "auxiliary_loss_clip": 0.01108656, + "auxiliary_loss_mlp": 0.01030376, + "balance_loss_clip": 1.03579879, + "balance_loss_mlp": 1.01918828, + "epoch": 0.9206072448519464, + "flos": 30512900616480.0, + "grad_norm": 1.9447416554588306, + "language_loss": 0.75959742, + "learning_rate": 6.570899084972503e-08, + "loss": 0.78098768, + "num_input_tokens_seen": 330259755, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11181641, + "step": 15312, + "time_per_iteration": 2.633798599243164 + }, + { + "auxiliary_loss_clip": 0.01107582, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.03757358, + "balance_loss_mlp": 1.021559, + "epoch": 0.9206673681046145, + "flos": 25041701499840.0, + "grad_norm": 1.7195574600420709, + "language_loss": 0.79305202, + "learning_rate": 6.561001722429394e-08, + "loss": 0.81444764, + "num_input_tokens_seen": 330277660, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10418701, + "step": 15313, + "time_per_iteration": 2.672804594039917 + }, + { + "auxiliary_loss_clip": 0.01110959, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.03798938, + "balance_loss_mlp": 1.01881492, + "epoch": 0.9207274913572824, + "flos": 25482344450400.0, + "grad_norm": 2.08589040317032, + "language_loss": 0.78514063, + "learning_rate": 6.55111169511251e-08, + "loss": 0.80654687, + "num_input_tokens_seen": 330295455, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.10864258, + "step": 15314, + "time_per_iteration": 2.6107242107391357 + }, + { + "auxiliary_loss_clip": 0.01113937, + "auxiliary_loss_mlp": 0.01027105, + "balance_loss_clip": 1.03851175, + "balance_loss_mlp": 1.01511836, + "epoch": 0.9207876146099504, + "flos": 27709059225600.0, + "grad_norm": 2.8201264532758263, + "language_loss": 0.78943002, + "learning_rate": 6.541229003396864e-08, + "loss": 0.81084043, + "num_input_tokens_seen": 330315310, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11981201, + "step": 15315, + "time_per_iteration": 2.707098960876465 + }, + { + "auxiliary_loss_clip": 0.01112042, + "auxiliary_loss_mlp": 0.01031206, + "balance_loss_clip": 1.03704798, + "balance_loss_mlp": 1.02029276, + "epoch": 0.9208477378626184, + "flos": 22583259635040.0, + "grad_norm": 1.9934287876142507, + "language_loss": 0.76192433, + "learning_rate": 6.531353647657156e-08, + "loss": 0.78335679, + "num_input_tokens_seen": 330333260, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.10906982, + "step": 15316, + "time_per_iteration": 2.5739455223083496 + }, + { + "auxiliary_loss_clip": 0.01108181, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.03517854, + "balance_loss_mlp": 1.0202055, + "epoch": 0.9209078611152863, + "flos": 28064669106240.0, + "grad_norm": 1.7720297693732765, + "language_loss": 0.69483185, + "learning_rate": 6.521485628267931e-08, + "loss": 0.71623003, + "num_input_tokens_seen": 330352465, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11444092, + "step": 15317, + "time_per_iteration": 2.7283506393432617 + }, + { + "auxiliary_loss_clip": 0.01110855, + "auxiliary_loss_mlp": 0.01030853, + "balance_loss_clip": 1.0393486, + "balance_loss_mlp": 1.01965952, + "epoch": 0.9209679843679544, + "flos": 29360106007200.0, + "grad_norm": 1.854017580465312, + "language_loss": 0.83774364, + "learning_rate": 6.511624945603378e-08, + "loss": 0.85916072, + "num_input_tokens_seen": 330372685, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11187744, + "step": 15318, + "time_per_iteration": 2.6398587226867676 + }, + { + "auxiliary_loss_clip": 0.01111727, + "auxiliary_loss_mlp": 0.01028921, + "balance_loss_clip": 1.0402559, + "balance_loss_mlp": 1.01841879, + "epoch": 0.9210281076206223, + "flos": 16537283904960.0, + "grad_norm": 2.3107491226071817, + "language_loss": 0.85762876, + "learning_rate": 6.501771600037354e-08, + "loss": 0.87903523, + "num_input_tokens_seen": 330388860, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10498047, + "step": 15319, + "time_per_iteration": 2.652315855026245 + }, + { + "auxiliary_loss_clip": 0.01027182, + "auxiliary_loss_mlp": 0.01001912, + "balance_loss_clip": 1.00482273, + "balance_loss_mlp": 1.00098205, + "epoch": 0.9210882308732903, + "flos": 87154307393760.0, + "grad_norm": 0.7728851397933354, + "language_loss": 0.561463, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58175397, + "num_input_tokens_seen": 330448735, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.00928497, + "step": 15320, + "time_per_iteration": 3.2936370372772217 + }, + { + "auxiliary_loss_clip": 0.01114379, + "auxiliary_loss_mlp": 0.01038146, + "balance_loss_clip": 1.03839445, + "balance_loss_mlp": 1.02601612, + "epoch": 0.9211483541259582, + "flos": 22585244981760.0, + "grad_norm": 2.380222760096315, + "language_loss": 0.6396209, + "learning_rate": 6.482086921695384e-08, + "loss": 0.66114616, + "num_input_tokens_seen": 330465600, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.12127686, + "step": 15321, + "time_per_iteration": 2.6028783321380615 + }, + { + "auxiliary_loss_clip": 0.01104918, + "auxiliary_loss_mlp": 0.01026003, + "balance_loss_clip": 1.0370183, + "balance_loss_mlp": 1.01554799, + "epoch": 0.9212084773786262, + "flos": 28379930126400.0, + "grad_norm": 1.654316159566403, + "language_loss": 0.71860117, + "learning_rate": 6.47225558966582e-08, + "loss": 0.73991036, + "num_input_tokens_seen": 330485770, + "router_z_loss_clip": 0.67871094, + "router_z_loss_mlp": 0.10455322, + "step": 15322, + "time_per_iteration": 2.7033982276916504 + }, + { + "auxiliary_loss_clip": 0.01108033, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.03594398, + "balance_loss_mlp": 1.01998234, + "epoch": 0.9212686006312941, + "flos": 19876160808000.0, + "grad_norm": 2.614493193570559, + "language_loss": 0.70302916, + "learning_rate": 6.462431596227725e-08, + "loss": 0.72441471, + "num_input_tokens_seen": 330504255, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10540771, + "step": 15323, + "time_per_iteration": 2.5800278186798096 + }, + { + "auxiliary_loss_clip": 0.01111818, + "auxiliary_loss_mlp": 0.01032953, + "balance_loss_clip": 1.03704381, + "balance_loss_mlp": 1.02098417, + "epoch": 0.9213287238839621, + "flos": 24143155237440.0, + "grad_norm": 1.9852089969966122, + "language_loss": 0.74777246, + "learning_rate": 6.452614941753597e-08, + "loss": 0.76922017, + "num_input_tokens_seen": 330520705, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11968994, + "step": 15324, + "time_per_iteration": 3.9959449768066406 + }, + { + "auxiliary_loss_clip": 0.01111617, + "auxiliary_loss_mlp": 0.01038345, + "balance_loss_clip": 1.03808045, + "balance_loss_mlp": 1.02744889, + "epoch": 0.92138884713663, + "flos": 25661567495520.0, + "grad_norm": 2.527580598691541, + "language_loss": 0.71227312, + "learning_rate": 6.442805626615744e-08, + "loss": 0.73377281, + "num_input_tokens_seen": 330539245, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.10894775, + "step": 15325, + "time_per_iteration": 2.6531803607940674 + }, + { + "auxiliary_loss_clip": 0.01108402, + "auxiliary_loss_mlp": 0.01029348, + "balance_loss_clip": 1.0368166, + "balance_loss_mlp": 1.01830339, + "epoch": 0.9214489703892981, + "flos": 34882634167200.0, + "grad_norm": 1.4810312618606634, + "language_loss": 0.7875489, + "learning_rate": 6.433003651186109e-08, + "loss": 0.8089264, + "num_input_tokens_seen": 330561815, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.1104126, + "step": 15326, + "time_per_iteration": 2.665036678314209 + }, + { + "auxiliary_loss_clip": 0.01113503, + "auxiliary_loss_mlp": 0.01034073, + "balance_loss_clip": 1.03962374, + "balance_loss_mlp": 1.02239037, + "epoch": 0.921509093641966, + "flos": 19964070604800.0, + "grad_norm": 9.350806541607119, + "language_loss": 0.71283442, + "learning_rate": 6.42320901583635e-08, + "loss": 0.73431015, + "num_input_tokens_seen": 330579760, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11688232, + "step": 15327, + "time_per_iteration": 2.6001248359680176 + }, + { + "auxiliary_loss_clip": 0.0111472, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.04018021, + "balance_loss_mlp": 1.02338386, + "epoch": 0.921569216894634, + "flos": 32739777460800.0, + "grad_norm": 3.0385455776935872, + "language_loss": 0.77364421, + "learning_rate": 6.413421720937906e-08, + "loss": 0.79514003, + "num_input_tokens_seen": 330598545, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.1149292, + "step": 15328, + "time_per_iteration": 2.6450936794281006 + }, + { + "auxiliary_loss_clip": 0.0110841, + "auxiliary_loss_mlp": 0.01028419, + "balance_loss_clip": 1.03723979, + "balance_loss_mlp": 1.01754117, + "epoch": 0.921629340147302, + "flos": 30072298183200.0, + "grad_norm": 2.5586380365599974, + "language_loss": 0.70695817, + "learning_rate": 6.4036417668619e-08, + "loss": 0.72832644, + "num_input_tokens_seen": 330616700, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10888672, + "step": 15329, + "time_per_iteration": 2.6386282444000244 + }, + { + "auxiliary_loss_clip": 0.01107911, + "auxiliary_loss_mlp": 0.01023933, + "balance_loss_clip": 1.03646398, + "balance_loss_mlp": 1.01377606, + "epoch": 0.9216894633999699, + "flos": 18408064661280.0, + "grad_norm": 2.0192707655327213, + "language_loss": 0.86629045, + "learning_rate": 6.393869153979192e-08, + "loss": 0.88760895, + "num_input_tokens_seen": 330633355, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10162354, + "step": 15330, + "time_per_iteration": 2.5503716468811035 + }, + { + "auxiliary_loss_clip": 0.01111522, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.03855121, + "balance_loss_mlp": 1.0184505, + "epoch": 0.921749586652638, + "flos": 23433637201920.0, + "grad_norm": 2.1756024667079092, + "language_loss": 0.75838482, + "learning_rate": 6.384103882660397e-08, + "loss": 0.77979267, + "num_input_tokens_seen": 330651470, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.1081543, + "step": 15331, + "time_per_iteration": 2.6032803058624268 + }, + { + "auxiliary_loss_clip": 0.01108661, + "auxiliary_loss_mlp": 0.0102514, + "balance_loss_clip": 1.0366683, + "balance_loss_mlp": 1.01435196, + "epoch": 0.9218097099053059, + "flos": 25041458396160.0, + "grad_norm": 2.317044328136042, + "language_loss": 0.74996471, + "learning_rate": 6.374345953275794e-08, + "loss": 0.7713027, + "num_input_tokens_seen": 330669170, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10784912, + "step": 15332, + "time_per_iteration": 2.661578416824341 + }, + { + "auxiliary_loss_clip": 0.01109349, + "auxiliary_loss_mlp": 0.01027512, + "balance_loss_clip": 1.03699398, + "balance_loss_mlp": 1.01732004, + "epoch": 0.9218698331579739, + "flos": 21168477878400.0, + "grad_norm": 1.9352438810057744, + "language_loss": 0.74609995, + "learning_rate": 6.364595366195358e-08, + "loss": 0.76746863, + "num_input_tokens_seen": 330686635, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10192871, + "step": 15333, + "time_per_iteration": 5.541851043701172 + }, + { + "auxiliary_loss_clip": 0.01027068, + "auxiliary_loss_mlp": 0.01001115, + "balance_loss_clip": 1.00474954, + "balance_loss_mlp": 1.00021529, + "epoch": 0.9219299564106418, + "flos": 75602449755360.0, + "grad_norm": 0.8505985140333123, + "language_loss": 0.52841097, + "learning_rate": 6.354852121788879e-08, + "loss": 0.54869282, + "num_input_tokens_seen": 330749160, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.00900269, + "step": 15334, + "time_per_iteration": 3.2289223670959473 + }, + { + "auxiliary_loss_clip": 0.01107538, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.03826714, + "balance_loss_mlp": 1.01878428, + "epoch": 0.9219900796633098, + "flos": 19159309144800.0, + "grad_norm": 2.114901772058005, + "language_loss": 0.6258859, + "learning_rate": 6.345116220425839e-08, + "loss": 0.64725143, + "num_input_tokens_seen": 330766840, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.10229492, + "step": 15335, + "time_per_iteration": 2.620171308517456 + }, + { + "auxiliary_loss_clip": 0.01107878, + "auxiliary_loss_mlp": 0.01030001, + "balance_loss_clip": 1.03739631, + "balance_loss_mlp": 1.01899791, + "epoch": 0.9220502029159777, + "flos": 30423572714880.0, + "grad_norm": 2.168945501816292, + "language_loss": 0.70955086, + "learning_rate": 6.335387662475366e-08, + "loss": 0.73092961, + "num_input_tokens_seen": 330785585, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10998535, + "step": 15336, + "time_per_iteration": 2.631309986114502 + }, + { + "auxiliary_loss_clip": 0.01104866, + "auxiliary_loss_mlp": 0.0102696, + "balance_loss_clip": 1.03569746, + "balance_loss_mlp": 1.01708305, + "epoch": 0.9221103261686457, + "flos": 19116974937600.0, + "grad_norm": 3.652301063565994, + "language_loss": 0.71580565, + "learning_rate": 6.325666448306433e-08, + "loss": 0.73712391, + "num_input_tokens_seen": 330800750, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.09881592, + "step": 15337, + "time_per_iteration": 2.6217663288116455 + }, + { + "auxiliary_loss_clip": 0.01027115, + "auxiliary_loss_mlp": 0.01001711, + "balance_loss_clip": 1.00478196, + "balance_loss_mlp": 1.00085783, + "epoch": 0.9221704494213137, + "flos": 82383590959200.0, + "grad_norm": 1.0654782214287506, + "language_loss": 0.65331292, + "learning_rate": 6.31595257828763e-08, + "loss": 0.67360127, + "num_input_tokens_seen": 330863640, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00854492, + "step": 15338, + "time_per_iteration": 3.2091994285583496 + }, + { + "auxiliary_loss_clip": 0.01111739, + "auxiliary_loss_mlp": 0.0102975, + "balance_loss_clip": 1.03881907, + "balance_loss_mlp": 1.0182817, + "epoch": 0.9222305726739817, + "flos": 36883699444800.0, + "grad_norm": 1.6105865000753161, + "language_loss": 0.67088556, + "learning_rate": 6.306246052787289e-08, + "loss": 0.69230044, + "num_input_tokens_seen": 330884675, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11456299, + "step": 15339, + "time_per_iteration": 2.7252109050750732 + }, + { + "auxiliary_loss_clip": 0.0110972, + "auxiliary_loss_mlp": 0.01028045, + "balance_loss_clip": 1.03749883, + "balance_loss_mlp": 1.01731038, + "epoch": 0.9222906959266496, + "flos": 30916922296320.0, + "grad_norm": 2.9380895574524017, + "language_loss": 0.7204085, + "learning_rate": 6.296546872173513e-08, + "loss": 0.74178612, + "num_input_tokens_seen": 330904125, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10723877, + "step": 15340, + "time_per_iteration": 2.6788883209228516 + }, + { + "auxiliary_loss_clip": 0.0110787, + "auxiliary_loss_mlp": 0.0102938, + "balance_loss_clip": 1.0372355, + "balance_loss_mlp": 1.01866901, + "epoch": 0.9223508191793176, + "flos": 33677943272640.0, + "grad_norm": 1.4704214014658428, + "language_loss": 0.70240825, + "learning_rate": 6.286855036814098e-08, + "loss": 0.72378075, + "num_input_tokens_seen": 330925140, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.1071167, + "step": 15341, + "time_per_iteration": 2.7068045139312744 + }, + { + "auxiliary_loss_clip": 0.01103814, + "auxiliary_loss_mlp": 0.01025996, + "balance_loss_clip": 1.03578591, + "balance_loss_mlp": 1.01578033, + "epoch": 0.9224109424319856, + "flos": 33321320460000.0, + "grad_norm": 2.169942410342618, + "language_loss": 0.6751287, + "learning_rate": 6.277170547076571e-08, + "loss": 0.69642675, + "num_input_tokens_seen": 330946625, + "router_z_loss_clip": 0.68017578, + "router_z_loss_mlp": 0.10223389, + "step": 15342, + "time_per_iteration": 2.65051007270813 + }, + { + "auxiliary_loss_clip": 0.0110939, + "auxiliary_loss_mlp": 0.01029823, + "balance_loss_clip": 1.03737676, + "balance_loss_mlp": 1.0197258, + "epoch": 0.9224710656846535, + "flos": 58826510305920.0, + "grad_norm": 1.9392829057768142, + "language_loss": 0.69256473, + "learning_rate": 6.26749340332815e-08, + "loss": 0.71395683, + "num_input_tokens_seen": 330967795, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10095215, + "step": 15343, + "time_per_iteration": 2.896969795227051 + }, + { + "auxiliary_loss_clip": 0.01027252, + "auxiliary_loss_mlp": 0.0100129, + "balance_loss_clip": 1.0049212, + "balance_loss_mlp": 1.00043046, + "epoch": 0.9225311889373216, + "flos": 81413746984800.0, + "grad_norm": 0.7260696562429266, + "language_loss": 0.51976061, + "learning_rate": 6.257823605935786e-08, + "loss": 0.54004604, + "num_input_tokens_seen": 331040850, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00859451, + "step": 15344, + "time_per_iteration": 3.3909151554107666 + }, + { + "auxiliary_loss_clip": 0.01104466, + "auxiliary_loss_mlp": 0.01025866, + "balance_loss_clip": 1.03705311, + "balance_loss_mlp": 1.01575744, + "epoch": 0.9225913121899895, + "flos": 27175765956480.0, + "grad_norm": 1.813233879762759, + "language_loss": 0.70143211, + "learning_rate": 6.248161155266162e-08, + "loss": 0.7227354, + "num_input_tokens_seen": 331060595, + "router_z_loss_clip": 0.67431641, + "router_z_loss_mlp": 0.10107422, + "step": 15345, + "time_per_iteration": 2.6079752445220947 + }, + { + "auxiliary_loss_clip": 0.01108499, + "auxiliary_loss_mlp": 0.01035913, + "balance_loss_clip": 1.03599954, + "balance_loss_mlp": 1.02485657, + "epoch": 0.9226514354426575, + "flos": 24505045296480.0, + "grad_norm": 1.9133405550285332, + "language_loss": 0.77363485, + "learning_rate": 6.238506051685677e-08, + "loss": 0.79507899, + "num_input_tokens_seen": 331080195, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.1105957, + "step": 15346, + "time_per_iteration": 4.001711130142212 + }, + { + "auxiliary_loss_clip": 0.01114331, + "auxiliary_loss_mlp": 0.01031771, + "balance_loss_clip": 1.03844047, + "balance_loss_mlp": 1.02068424, + "epoch": 0.9227115586953254, + "flos": 19609311587040.0, + "grad_norm": 2.000798842619896, + "language_loss": 0.76638681, + "learning_rate": 6.228858295560457e-08, + "loss": 0.78784776, + "num_input_tokens_seen": 331097645, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.11077881, + "step": 15347, + "time_per_iteration": 2.577622175216675 + }, + { + "auxiliary_loss_clip": 0.0110749, + "auxiliary_loss_mlp": 0.01026502, + "balance_loss_clip": 1.03863776, + "balance_loss_mlp": 1.01651812, + "epoch": 0.9227716819479934, + "flos": 24948524456640.0, + "grad_norm": 1.707887531211883, + "language_loss": 0.7671479, + "learning_rate": 6.219217887256367e-08, + "loss": 0.78848779, + "num_input_tokens_seen": 331116830, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.09991455, + "step": 15348, + "time_per_iteration": 2.633495330810547 + }, + { + "auxiliary_loss_clip": 0.01110023, + "auxiliary_loss_mlp": 0.01030126, + "balance_loss_clip": 1.03661692, + "balance_loss_mlp": 1.01878905, + "epoch": 0.9228318052006613, + "flos": 30514602342240.0, + "grad_norm": 2.343035506034801, + "language_loss": 0.67713594, + "learning_rate": 6.209584827138959e-08, + "loss": 0.69853747, + "num_input_tokens_seen": 331137235, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11334229, + "step": 15349, + "time_per_iteration": 2.703153371810913 + }, + { + "auxiliary_loss_clip": 0.01108065, + "auxiliary_loss_mlp": 0.01026501, + "balance_loss_clip": 1.03539419, + "balance_loss_mlp": 1.01546812, + "epoch": 0.9228919284533293, + "flos": 15468225812640.0, + "grad_norm": 3.753026668300098, + "language_loss": 0.87357938, + "learning_rate": 6.199959115573495e-08, + "loss": 0.89492512, + "num_input_tokens_seen": 331153155, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11035156, + "step": 15350, + "time_per_iteration": 2.5685386657714844 + }, + { + "auxiliary_loss_clip": 0.01027366, + "auxiliary_loss_mlp": 0.01000609, + "balance_loss_clip": 1.00506055, + "balance_loss_mlp": 0.99980366, + "epoch": 0.9229520517059973, + "flos": 85398536144160.0, + "grad_norm": 0.7785121927672711, + "language_loss": 0.60371786, + "learning_rate": 6.190340752924994e-08, + "loss": 0.62399763, + "num_input_tokens_seen": 331214895, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.00805664, + "step": 15351, + "time_per_iteration": 3.1981282234191895 + }, + { + "auxiliary_loss_clip": 0.01109917, + "auxiliary_loss_mlp": 0.01023987, + "balance_loss_clip": 1.03566337, + "balance_loss_mlp": 1.01314473, + "epoch": 0.9230121749586653, + "flos": 18050996158560.0, + "grad_norm": 2.101251499519415, + "language_loss": 0.77110189, + "learning_rate": 6.180729739558233e-08, + "loss": 0.79244101, + "num_input_tokens_seen": 331232185, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.10839844, + "step": 15352, + "time_per_iteration": 2.578345775604248 + }, + { + "auxiliary_loss_clip": 0.01111237, + "auxiliary_loss_mlp": 0.01034814, + "balance_loss_clip": 1.03604484, + "balance_loss_mlp": 1.02270257, + "epoch": 0.9230722982113332, + "flos": 28024725418560.0, + "grad_norm": 1.8560297762178268, + "language_loss": 0.59400672, + "learning_rate": 6.171126075837585e-08, + "loss": 0.61546725, + "num_input_tokens_seen": 331251065, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12115479, + "step": 15353, + "time_per_iteration": 2.7227959632873535 + }, + { + "auxiliary_loss_clip": 0.01107876, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.03754234, + "balance_loss_mlp": 1.01986337, + "epoch": 0.9231324214640012, + "flos": 22636371438720.0, + "grad_norm": 1.5916361834438342, + "language_loss": 0.74627781, + "learning_rate": 6.161529762127293e-08, + "loss": 0.76765573, + "num_input_tokens_seen": 331269110, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.1005249, + "step": 15354, + "time_per_iteration": 2.578850507736206 + }, + { + "auxiliary_loss_clip": 0.01112643, + "auxiliary_loss_mlp": 0.0103087, + "balance_loss_clip": 1.03778541, + "balance_loss_mlp": 1.01916957, + "epoch": 0.9231925447166691, + "flos": 26946226800000.0, + "grad_norm": 2.3372292962205616, + "language_loss": 0.65115643, + "learning_rate": 6.1519407987912e-08, + "loss": 0.67259157, + "num_input_tokens_seen": 331286555, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11700439, + "step": 15355, + "time_per_iteration": 2.635230541229248 + }, + { + "auxiliary_loss_clip": 0.01106898, + "auxiliary_loss_mlp": 0.01031019, + "balance_loss_clip": 1.03727436, + "balance_loss_mlp": 1.02096939, + "epoch": 0.9232526679693371, + "flos": 32386071892320.0, + "grad_norm": 1.5965736481876336, + "language_loss": 0.74390817, + "learning_rate": 6.142359186192947e-08, + "loss": 0.76528728, + "num_input_tokens_seen": 331307660, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.1005249, + "step": 15356, + "time_per_iteration": 2.6313602924346924 + }, + { + "auxiliary_loss_clip": 0.011108, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.0380497, + "balance_loss_mlp": 1.0209527, + "epoch": 0.9233127912220052, + "flos": 18005704189920.0, + "grad_norm": 1.7248259416947591, + "language_loss": 0.61285478, + "learning_rate": 6.132784924695844e-08, + "loss": 0.63428468, + "num_input_tokens_seen": 331324885, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11254883, + "step": 15357, + "time_per_iteration": 2.594317674636841 + }, + { + "auxiliary_loss_clip": 0.01110302, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.03668833, + "balance_loss_mlp": 1.01860487, + "epoch": 0.9233729144746731, + "flos": 30825122840640.0, + "grad_norm": 1.9939714407853442, + "language_loss": 0.69973361, + "learning_rate": 6.123218014662956e-08, + "loss": 0.721138, + "num_input_tokens_seen": 331345885, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11541748, + "step": 15358, + "time_per_iteration": 2.6524438858032227 + }, + { + "auxiliary_loss_clip": 0.01108215, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.03634918, + "balance_loss_mlp": 1.01957893, + "epoch": 0.9234330377273411, + "flos": 33983480145600.0, + "grad_norm": 2.1542917072453616, + "language_loss": 0.73443139, + "learning_rate": 6.113658456457104e-08, + "loss": 0.75582147, + "num_input_tokens_seen": 331364320, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11206055, + "step": 15359, + "time_per_iteration": 2.651492118835449 + }, + { + "auxiliary_loss_clip": 0.01109928, + "auxiliary_loss_mlp": 0.01028155, + "balance_loss_clip": 1.03766751, + "balance_loss_mlp": 1.01727688, + "epoch": 0.923493160980009, + "flos": 30027451904640.0, + "grad_norm": 2.204401941112098, + "language_loss": 0.64976859, + "learning_rate": 6.104106250440732e-08, + "loss": 0.67114943, + "num_input_tokens_seen": 331384135, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10876465, + "step": 15360, + "time_per_iteration": 2.6407506465911865 + }, + { + "auxiliary_loss_clip": 0.01027271, + "auxiliary_loss_mlp": 0.01001446, + "balance_loss_clip": 1.0049088, + "balance_loss_mlp": 1.00051403, + "epoch": 0.923553284232677, + "flos": 82608673214880.0, + "grad_norm": 0.7533710813581566, + "language_loss": 0.55032265, + "learning_rate": 6.094561396976083e-08, + "loss": 0.57060981, + "num_input_tokens_seen": 331440645, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00931549, + "step": 15361, + "time_per_iteration": 3.2714314460754395 + }, + { + "auxiliary_loss_clip": 0.01111453, + "auxiliary_loss_mlp": 0.01027945, + "balance_loss_clip": 1.03688884, + "balance_loss_mlp": 1.01660228, + "epoch": 0.9236134074853449, + "flos": 22948472111040.0, + "grad_norm": 2.1550630447956785, + "language_loss": 0.6982044, + "learning_rate": 6.085023896425112e-08, + "loss": 0.71959841, + "num_input_tokens_seen": 331459580, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11334229, + "step": 15362, + "time_per_iteration": 2.6567959785461426 + }, + { + "auxiliary_loss_clip": 0.01113433, + "auxiliary_loss_mlp": 0.01030928, + "balance_loss_clip": 1.03852081, + "balance_loss_mlp": 1.01844692, + "epoch": 0.923673530738013, + "flos": 33900635008800.0, + "grad_norm": 1.519934047031904, + "language_loss": 0.75990975, + "learning_rate": 6.075493749149463e-08, + "loss": 0.78135335, + "num_input_tokens_seen": 331481560, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12481689, + "step": 15363, + "time_per_iteration": 4.132835865020752 + }, + { + "auxiliary_loss_clip": 0.01109328, + "auxiliary_loss_mlp": 0.01026841, + "balance_loss_clip": 1.03722882, + "balance_loss_mlp": 1.01598072, + "epoch": 0.9237336539906809, + "flos": 32698375151040.0, + "grad_norm": 2.821703275027284, + "language_loss": 0.83912075, + "learning_rate": 6.065970955510514e-08, + "loss": 0.86048245, + "num_input_tokens_seen": 331499090, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10858154, + "step": 15364, + "time_per_iteration": 2.7880020141601562 + }, + { + "auxiliary_loss_clip": 0.01107775, + "auxiliary_loss_mlp": 0.01022848, + "balance_loss_clip": 1.03704822, + "balance_loss_mlp": 1.01251221, + "epoch": 0.9237937772433489, + "flos": 28781885424960.0, + "grad_norm": 1.6170107708665142, + "language_loss": 0.68207347, + "learning_rate": 6.056455515869419e-08, + "loss": 0.70337969, + "num_input_tokens_seen": 331519420, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10333252, + "step": 15365, + "time_per_iteration": 2.6690855026245117 + }, + { + "auxiliary_loss_clip": 0.01112081, + "auxiliary_loss_mlp": 0.01028468, + "balance_loss_clip": 1.03943443, + "balance_loss_mlp": 1.01756036, + "epoch": 0.9238539004960168, + "flos": 31897179211680.0, + "grad_norm": 2.3563943555741065, + "language_loss": 0.62820864, + "learning_rate": 6.046947430586913e-08, + "loss": 0.6496141, + "num_input_tokens_seen": 331538720, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10906982, + "step": 15366, + "time_per_iteration": 2.647327184677124 + }, + { + "auxiliary_loss_clip": 0.0111099, + "auxiliary_loss_mlp": 0.01028431, + "balance_loss_clip": 1.03960657, + "balance_loss_mlp": 1.0169394, + "epoch": 0.9239140237486848, + "flos": 25707345671520.0, + "grad_norm": 1.7206781853300588, + "language_loss": 0.74318027, + "learning_rate": 6.037446700023619e-08, + "loss": 0.76457447, + "num_input_tokens_seen": 331558505, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11480713, + "step": 15367, + "time_per_iteration": 2.6760973930358887 + }, + { + "auxiliary_loss_clip": 0.01105352, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.03701234, + "balance_loss_mlp": 1.01967812, + "epoch": 0.9239741470013527, + "flos": 30026965697280.0, + "grad_norm": 2.284284321126926, + "language_loss": 0.64920616, + "learning_rate": 6.027953324539759e-08, + "loss": 0.67055833, + "num_input_tokens_seen": 331578440, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.10174561, + "step": 15368, + "time_per_iteration": 2.616563081741333 + }, + { + "auxiliary_loss_clip": 0.01112701, + "auxiliary_loss_mlp": 0.01030389, + "balance_loss_clip": 1.03799629, + "balance_loss_mlp": 1.01892686, + "epoch": 0.9240342702540207, + "flos": 30161504532960.0, + "grad_norm": 3.7642507612950435, + "language_loss": 0.74596387, + "learning_rate": 6.018467304495401e-08, + "loss": 0.76739478, + "num_input_tokens_seen": 331598945, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11468506, + "step": 15369, + "time_per_iteration": 2.6362428665161133 + }, + { + "auxiliary_loss_clip": 0.01115659, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.03974664, + "balance_loss_mlp": 1.02281725, + "epoch": 0.9240943935066888, + "flos": 25441225761600.0, + "grad_norm": 2.1435500828914, + "language_loss": 0.76494187, + "learning_rate": 6.008988640250145e-08, + "loss": 0.78645253, + "num_input_tokens_seen": 331616700, + "router_z_loss_clip": 0.75927734, + "router_z_loss_mlp": 0.12591553, + "step": 15370, + "time_per_iteration": 2.5955135822296143 + }, + { + "auxiliary_loss_clip": 0.01109441, + "auxiliary_loss_mlp": 0.01028523, + "balance_loss_clip": 1.0370121, + "balance_loss_mlp": 1.01769304, + "epoch": 0.9241545167593567, + "flos": 29849241791520.0, + "grad_norm": 2.4004853727828728, + "language_loss": 0.66800427, + "learning_rate": 5.999517332163528e-08, + "loss": 0.68938386, + "num_input_tokens_seen": 331635625, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10839844, + "step": 15371, + "time_per_iteration": 2.690812110900879 + }, + { + "auxiliary_loss_clip": 0.01027045, + "auxiliary_loss_mlp": 0.01000957, + "balance_loss_clip": 1.00478935, + "balance_loss_mlp": 1.00007391, + "epoch": 0.9242146400120247, + "flos": 75441135348000.0, + "grad_norm": 0.7175379928491884, + "language_loss": 0.57629228, + "learning_rate": 5.99005338059464e-08, + "loss": 0.59657228, + "num_input_tokens_seen": 331698595, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.00883484, + "step": 15372, + "time_per_iteration": 3.1783721446990967 + }, + { + "auxiliary_loss_clip": 0.01109632, + "auxiliary_loss_mlp": 0.0103015, + "balance_loss_clip": 1.03949583, + "balance_loss_mlp": 1.02029097, + "epoch": 0.9242747632646926, + "flos": 26904135696480.0, + "grad_norm": 1.9064524187592666, + "language_loss": 0.6954276, + "learning_rate": 5.98059678590237e-08, + "loss": 0.71682543, + "num_input_tokens_seen": 331717975, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.09857178, + "step": 15373, + "time_per_iteration": 5.3064234256744385 + }, + { + "auxiliary_loss_clip": 0.01110493, + "auxiliary_loss_mlp": 0.01037803, + "balance_loss_clip": 1.03836894, + "balance_loss_mlp": 1.02665102, + "epoch": 0.9243348865173606, + "flos": 22547165088960.0, + "grad_norm": 2.4534031287336115, + "language_loss": 0.7561217, + "learning_rate": 5.971147548445299e-08, + "loss": 0.7776047, + "num_input_tokens_seen": 331737220, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11157227, + "step": 15374, + "time_per_iteration": 2.6222565174102783 + }, + { + "auxiliary_loss_clip": 0.01111263, + "auxiliary_loss_mlp": 0.01035037, + "balance_loss_clip": 1.03901613, + "balance_loss_mlp": 1.02423048, + "epoch": 0.9243950097700285, + "flos": 28380537885600.0, + "grad_norm": 2.151578311886108, + "language_loss": 0.64616454, + "learning_rate": 5.961705668581784e-08, + "loss": 0.66762757, + "num_input_tokens_seen": 331757300, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10803223, + "step": 15375, + "time_per_iteration": 2.620859384536743 + }, + { + "auxiliary_loss_clip": 0.01107951, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.03778148, + "balance_loss_mlp": 1.02025104, + "epoch": 0.9244551330226966, + "flos": 36300900409920.0, + "grad_norm": 1.9136296072486279, + "language_loss": 0.66075933, + "learning_rate": 5.952271146669829e-08, + "loss": 0.6821506, + "num_input_tokens_seen": 331776995, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.109375, + "step": 15376, + "time_per_iteration": 2.7407588958740234 + }, + { + "auxiliary_loss_clip": 0.01027017, + "auxiliary_loss_mlp": 0.01001062, + "balance_loss_clip": 1.0046798, + "balance_loss_mlp": 1.00018525, + "epoch": 0.9245152562753645, + "flos": 84029451528960.0, + "grad_norm": 0.6520541511015192, + "language_loss": 0.61093974, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63122052, + "num_input_tokens_seen": 331845015, + "router_z_loss_clip": 0.22338867, + "router_z_loss_mlp": 0.0087738, + "step": 15377, + "time_per_iteration": 3.3557257652282715 + }, + { + "auxiliary_loss_clip": 0.01108863, + "auxiliary_loss_mlp": 0.01032958, + "balance_loss_clip": 1.03741407, + "balance_loss_mlp": 1.02181768, + "epoch": 0.9245753795280325, + "flos": 26331182360640.0, + "grad_norm": 2.062234657906062, + "language_loss": 0.7367171, + "learning_rate": 5.933424178131341e-08, + "loss": 0.75813532, + "num_input_tokens_seen": 331862795, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11157227, + "step": 15378, + "time_per_iteration": 2.65434193611145 + }, + { + "auxiliary_loss_clip": 0.01112206, + "auxiliary_loss_mlp": 0.01028295, + "balance_loss_clip": 1.038553, + "balance_loss_mlp": 1.01689219, + "epoch": 0.9246355027807004, + "flos": 42093195035040.0, + "grad_norm": 2.303426157395922, + "language_loss": 0.62629664, + "learning_rate": 5.924011732219503e-08, + "loss": 0.64770162, + "num_input_tokens_seen": 331882535, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11401367, + "step": 15379, + "time_per_iteration": 2.7367262840270996 + }, + { + "auxiliary_loss_clip": 0.01107024, + "auxiliary_loss_mlp": 0.01028221, + "balance_loss_clip": 1.03771949, + "balance_loss_mlp": 1.01729596, + "epoch": 0.9246956260333684, + "flos": 19467236537280.0, + "grad_norm": 2.675742576726605, + "language_loss": 0.83684289, + "learning_rate": 5.914606645688591e-08, + "loss": 0.8581953, + "num_input_tokens_seen": 331899335, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10919189, + "step": 15380, + "time_per_iteration": 2.5878560543060303 + }, + { + "auxiliary_loss_clip": 0.01111079, + "auxiliary_loss_mlp": 0.01031066, + "balance_loss_clip": 1.03673911, + "balance_loss_mlp": 1.01921082, + "epoch": 0.9247557492860363, + "flos": 28514266375680.0, + "grad_norm": 1.616544816497905, + "language_loss": 0.73201108, + "learning_rate": 5.905208918895233e-08, + "loss": 0.75343251, + "num_input_tokens_seen": 331919030, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11859131, + "step": 15381, + "time_per_iteration": 2.682544231414795 + }, + { + "auxiliary_loss_clip": 0.01110433, + "auxiliary_loss_mlp": 0.01029977, + "balance_loss_clip": 1.03884268, + "balance_loss_mlp": 1.0191288, + "epoch": 0.9248158725387043, + "flos": 28113405043680.0, + "grad_norm": 1.8978667774111615, + "language_loss": 0.78493214, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.80633622, + "num_input_tokens_seen": 331936465, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10845947, + "step": 15382, + "time_per_iteration": 2.6307926177978516 + }, + { + "auxiliary_loss_clip": 0.01109704, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.03679609, + "balance_loss_mlp": 1.0214721, + "epoch": 0.9248759957913724, + "flos": 27482599382400.0, + "grad_norm": 1.7222895216529568, + "language_loss": 0.74916267, + "learning_rate": 5.886435545946455e-08, + "loss": 0.77058852, + "num_input_tokens_seen": 331954625, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11407471, + "step": 15383, + "time_per_iteration": 2.6881871223449707 + }, + { + "auxiliary_loss_clip": 0.01106479, + "auxiliary_loss_mlp": 0.01025698, + "balance_loss_clip": 1.03571439, + "balance_loss_mlp": 1.01496935, + "epoch": 0.9249361190440403, + "flos": 31051177511040.0, + "grad_norm": 1.7194994284338976, + "language_loss": 0.75580323, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.777125, + "num_input_tokens_seen": 331975865, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10736084, + "step": 15384, + "time_per_iteration": 2.658214807510376 + }, + { + "auxiliary_loss_clip": 0.01105598, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.03555262, + "balance_loss_mlp": 1.01960111, + "epoch": 0.9249962422967083, + "flos": 15104431441440.0, + "grad_norm": 3.8272391465100197, + "language_loss": 0.66220933, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.68357438, + "num_input_tokens_seen": 331992760, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11303711, + "step": 15385, + "time_per_iteration": 3.9626786708831787 + }, + { + "auxiliary_loss_clip": 0.01107306, + "auxiliary_loss_mlp": 0.01029944, + "balance_loss_clip": 1.03580892, + "balance_loss_mlp": 1.01896477, + "epoch": 0.9250563655493762, + "flos": 27978744656160.0, + "grad_norm": 2.14372951383341, + "language_loss": 0.80570126, + "learning_rate": 5.85833069345496e-08, + "loss": 0.82707375, + "num_input_tokens_seen": 332011890, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10974121, + "step": 15386, + "time_per_iteration": 2.652029275894165 + }, + { + "auxiliary_loss_clip": 0.01107995, + "auxiliary_loss_mlp": 0.01036523, + "balance_loss_clip": 1.03865612, + "balance_loss_mlp": 1.02511525, + "epoch": 0.9251164888020442, + "flos": 22547529744480.0, + "grad_norm": 2.094884819997878, + "language_loss": 0.75380647, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.77525163, + "num_input_tokens_seen": 332029485, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.11407471, + "step": 15387, + "time_per_iteration": 2.5749833583831787 + }, + { + "auxiliary_loss_clip": 0.01107324, + "auxiliary_loss_mlp": 0.0103064, + "balance_loss_clip": 1.03716516, + "balance_loss_mlp": 1.02035785, + "epoch": 0.9251766120547121, + "flos": 40312755112320.0, + "grad_norm": 1.3667757596362269, + "language_loss": 0.70202494, + "learning_rate": 5.839630933893014e-08, + "loss": 0.72340453, + "num_input_tokens_seen": 332052970, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10284424, + "step": 15388, + "time_per_iteration": 2.735051155090332 + }, + { + "auxiliary_loss_clip": 0.01111613, + "auxiliary_loss_mlp": 0.01028271, + "balance_loss_clip": 1.0383575, + "balance_loss_mlp": 1.01717234, + "epoch": 0.9252367353073802, + "flos": 29758576819680.0, + "grad_norm": 1.7756505651259888, + "language_loss": 0.81978726, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84118617, + "num_input_tokens_seen": 332070395, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11083984, + "step": 15389, + "time_per_iteration": 2.6244888305664062 + }, + { + "auxiliary_loss_clip": 0.01116756, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.03958917, + "balance_loss_mlp": 1.01995587, + "epoch": 0.9252968585600481, + "flos": 23082200601120.0, + "grad_norm": 9.852569458931004, + "language_loss": 0.79021752, + "learning_rate": 5.820960624653381e-08, + "loss": 0.81170553, + "num_input_tokens_seen": 332090185, + "router_z_loss_clip": 0.77246094, + "router_z_loss_mlp": 0.12078857, + "step": 15390, + "time_per_iteration": 2.6294384002685547 + }, + { + "auxiliary_loss_clip": 0.01112146, + "auxiliary_loss_mlp": 0.01036758, + "balance_loss_clip": 1.03779185, + "balance_loss_mlp": 1.02518892, + "epoch": 0.9253569818127161, + "flos": 26509432991040.0, + "grad_norm": 1.902636175486021, + "language_loss": 0.75439209, + "learning_rate": 5.811636514789597e-08, + "loss": 0.77588117, + "num_input_tokens_seen": 332109050, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11560059, + "step": 15391, + "time_per_iteration": 2.6131398677825928 + }, + { + "auxiliary_loss_clip": 0.01108964, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.03661561, + "balance_loss_mlp": 1.01941252, + "epoch": 0.925417105065384, + "flos": 41781175397280.0, + "grad_norm": 9.121499133013243, + "language_loss": 0.51917309, + "learning_rate": 5.80231976856802e-08, + "loss": 0.54058331, + "num_input_tokens_seen": 332131180, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.12658691, + "step": 15392, + "time_per_iteration": 2.845233678817749 + }, + { + "auxiliary_loss_clip": 0.01107604, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.03502738, + "balance_loss_mlp": 1.01717305, + "epoch": 0.925477228318052, + "flos": 31676148684000.0, + "grad_norm": 1.8590606718226512, + "language_loss": 0.77393746, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.79529387, + "num_input_tokens_seen": 332149555, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10864258, + "step": 15393, + "time_per_iteration": 2.662806510925293 + }, + { + "auxiliary_loss_clip": 0.01107036, + "auxiliary_loss_mlp": 0.01028754, + "balance_loss_clip": 1.036201, + "balance_loss_mlp": 1.01808512, + "epoch": 0.9255373515707199, + "flos": 14444378654400.0, + "grad_norm": 1.8290395476917376, + "language_loss": 0.69638109, + "learning_rate": 5.783708368464357e-08, + "loss": 0.71773899, + "num_input_tokens_seen": 332165830, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10681152, + "step": 15394, + "time_per_iteration": 2.61265230178833 + }, + { + "auxiliary_loss_clip": 0.01108999, + "auxiliary_loss_mlp": 0.01026418, + "balance_loss_clip": 1.03671169, + "balance_loss_mlp": 1.01512253, + "epoch": 0.925597474823388, + "flos": 26154228283200.0, + "grad_norm": 1.7540215322265378, + "language_loss": 0.72912717, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.75048131, + "num_input_tokens_seen": 332185130, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11303711, + "step": 15395, + "time_per_iteration": 2.658055067062378 + }, + { + "auxiliary_loss_clip": 0.01104396, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.03498209, + "balance_loss_mlp": 1.01688957, + "epoch": 0.925657598076056, + "flos": 27890956411200.0, + "grad_norm": 3.0117249784825417, + "language_loss": 0.71770257, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.73901772, + "num_input_tokens_seen": 332203695, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10235596, + "step": 15396, + "time_per_iteration": 2.6058223247528076 + }, + { + "auxiliary_loss_clip": 0.01106819, + "auxiliary_loss_mlp": 0.01027527, + "balance_loss_clip": 1.03622651, + "balance_loss_mlp": 1.01657748, + "epoch": 0.9257177213287239, + "flos": 31364372149920.0, + "grad_norm": 2.3073621591128877, + "language_loss": 0.87222898, + "learning_rate": 5.755846504448603e-08, + "loss": 0.89357245, + "num_input_tokens_seen": 332224850, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10955811, + "step": 15397, + "time_per_iteration": 2.671283483505249 + }, + { + "auxiliary_loss_clip": 0.010272, + "auxiliary_loss_mlp": 0.01001009, + "balance_loss_clip": 1.00490081, + "balance_loss_mlp": 1.00011802, + "epoch": 0.9257778445813919, + "flos": 72715439089440.0, + "grad_norm": 0.8151208672549572, + "language_loss": 0.55109996, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57138205, + "num_input_tokens_seen": 332278085, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.00891113, + "step": 15398, + "time_per_iteration": 3.1056275367736816 + }, + { + "auxiliary_loss_clip": 0.01114286, + "auxiliary_loss_mlp": 0.0102987, + "balance_loss_clip": 1.03776741, + "balance_loss_mlp": 1.0169481, + "epoch": 0.9258379678340598, + "flos": 33811631245440.0, + "grad_norm": 2.259585111403342, + "language_loss": 0.76429844, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.78574002, + "num_input_tokens_seen": 332297875, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.12902832, + "step": 15399, + "time_per_iteration": 2.6881895065307617 + }, + { + "auxiliary_loss_clip": 0.01104627, + "auxiliary_loss_mlp": 0.01025911, + "balance_loss_clip": 1.03578353, + "balance_loss_mlp": 1.01618385, + "epoch": 0.9258980910867278, + "flos": 30338863783200.0, + "grad_norm": 1.5283735903220927, + "language_loss": 0.78124905, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.80255449, + "num_input_tokens_seen": 332318500, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.09716797, + "step": 15400, + "time_per_iteration": 2.658369779586792 + }, + { + "auxiliary_loss_clip": 0.01027113, + "auxiliary_loss_mlp": 0.01001857, + "balance_loss_clip": 1.00477898, + "balance_loss_mlp": 1.00100601, + "epoch": 0.9259582143393957, + "flos": 77037003944640.0, + "grad_norm": 0.7118257838733797, + "language_loss": 0.51281077, + "learning_rate": 5.718800474673946e-08, + "loss": 0.53310043, + "num_input_tokens_seen": 332381980, + "router_z_loss_clip": 0.22338867, + "router_z_loss_mlp": 0.00851059, + "step": 15401, + "time_per_iteration": 3.208099842071533 + }, + { + "auxiliary_loss_clip": 0.0110668, + "auxiliary_loss_mlp": 0.01028481, + "balance_loss_clip": 1.0384531, + "balance_loss_mlp": 1.0182116, + "epoch": 0.9260183375920638, + "flos": 29440844245440.0, + "grad_norm": 1.8122253010266054, + "language_loss": 0.82264817, + "learning_rate": 5.709557384259378e-08, + "loss": 0.84399974, + "num_input_tokens_seen": 332399510, + "router_z_loss_clip": 0.68164062, + "router_z_loss_mlp": 0.10266113, + "step": 15402, + "time_per_iteration": 4.1188108921051025 + }, + { + "auxiliary_loss_clip": 0.01027505, + "auxiliary_loss_mlp": 0.0100115, + "balance_loss_clip": 1.00514221, + "balance_loss_mlp": 1.00026453, + "epoch": 0.9260784608447317, + "flos": 76923453060000.0, + "grad_norm": 0.7285858017006028, + "language_loss": 0.50968981, + "learning_rate": 5.700321661357876e-08, + "loss": 0.52997637, + "num_input_tokens_seen": 332459130, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.0088501, + "step": 15403, + "time_per_iteration": 3.378483533859253 + }, + { + "auxiliary_loss_clip": 0.01028208, + "auxiliary_loss_mlp": 0.01001205, + "balance_loss_clip": 1.00571799, + "balance_loss_mlp": 1.00028014, + "epoch": 0.9261385840973997, + "flos": 86129001613440.0, + "grad_norm": 0.6833418656769423, + "language_loss": 0.58723271, + "learning_rate": 5.69109330631965e-08, + "loss": 0.60752678, + "num_input_tokens_seen": 332526555, + "router_z_loss_clip": 0.22497559, + "router_z_loss_mlp": 0.00925446, + "step": 15404, + "time_per_iteration": 3.276005983352661 + }, + { + "auxiliary_loss_clip": 0.01110643, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.03707695, + "balance_loss_mlp": 1.01942825, + "epoch": 0.9261987073500676, + "flos": 24684551962560.0, + "grad_norm": 2.1285370936789967, + "language_loss": 0.71095312, + "learning_rate": 5.681872319494596e-08, + "loss": 0.73237038, + "num_input_tokens_seen": 332544005, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11663818, + "step": 15405, + "time_per_iteration": 2.6660733222961426 + }, + { + "auxiliary_loss_clip": 0.01113318, + "auxiliary_loss_mlp": 0.01038345, + "balance_loss_clip": 1.03867054, + "balance_loss_mlp": 1.02666807, + "epoch": 0.9262588306027356, + "flos": 25568836142400.0, + "grad_norm": 3.7297753593267586, + "language_loss": 0.68314433, + "learning_rate": 5.672658701232458e-08, + "loss": 0.70466089, + "num_input_tokens_seen": 332563070, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11676025, + "step": 15406, + "time_per_iteration": 2.709188222885132 + }, + { + "auxiliary_loss_clip": 0.01109816, + "auxiliary_loss_mlp": 0.0103476, + "balance_loss_clip": 1.03716242, + "balance_loss_mlp": 1.02292204, + "epoch": 0.9263189538554035, + "flos": 27038634014880.0, + "grad_norm": 2.1250433849021397, + "language_loss": 0.76619256, + "learning_rate": 5.663452451882555e-08, + "loss": 0.78763831, + "num_input_tokens_seen": 332579620, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11828613, + "step": 15407, + "time_per_iteration": 2.6797683238983154 + }, + { + "auxiliary_loss_clip": 0.0111277, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.03586352, + "balance_loss_mlp": 1.02354074, + "epoch": 0.9263790771080715, + "flos": 22191838829280.0, + "grad_norm": 2.1354674020301734, + "language_loss": 0.72234225, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.74382257, + "num_input_tokens_seen": 332597795, + "router_z_loss_clip": 0.76904297, + "router_z_loss_mlp": 0.11730957, + "step": 15408, + "time_per_iteration": 2.589054584503174 + }, + { + "auxiliary_loss_clip": 0.01107936, + "auxiliary_loss_mlp": 0.01025697, + "balance_loss_clip": 1.03753459, + "balance_loss_mlp": 1.01604128, + "epoch": 0.9264392003607396, + "flos": 58789564896960.0, + "grad_norm": 1.9182490956706442, + "language_loss": 0.68781543, + "learning_rate": 5.645062061315675e-08, + "loss": 0.70915174, + "num_input_tokens_seen": 332620375, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.09649658, + "step": 15409, + "time_per_iteration": 2.8814141750335693 + }, + { + "auxiliary_loss_clip": 0.01112226, + "auxiliary_loss_mlp": 0.01029128, + "balance_loss_clip": 1.03980231, + "balance_loss_mlp": 1.01763034, + "epoch": 0.9264993236134075, + "flos": 32201500566240.0, + "grad_norm": 1.9890465138962778, + "language_loss": 0.75770509, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.7791186, + "num_input_tokens_seen": 332639510, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.1149292, + "step": 15410, + "time_per_iteration": 2.649066925048828 + }, + { + "auxiliary_loss_clip": 0.0110929, + "auxiliary_loss_mlp": 0.01026164, + "balance_loss_clip": 1.037292, + "balance_loss_mlp": 1.0154115, + "epoch": 0.9265594468660755, + "flos": 25527028659840.0, + "grad_norm": 1.7150477388643734, + "language_loss": 0.82066947, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.84202403, + "num_input_tokens_seen": 332658350, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10748291, + "step": 15411, + "time_per_iteration": 2.640878677368164 + }, + { + "auxiliary_loss_clip": 0.01112151, + "auxiliary_loss_mlp": 0.01031877, + "balance_loss_clip": 1.04016125, + "balance_loss_mlp": 1.02116585, + "epoch": 0.9266195701187434, + "flos": 21384281676960.0, + "grad_norm": 1.763657512328949, + "language_loss": 0.75443667, + "learning_rate": 5.617531751025728e-08, + "loss": 0.77587694, + "num_input_tokens_seen": 332676715, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10717773, + "step": 15412, + "time_per_iteration": 4.0431859493255615 + }, + { + "auxiliary_loss_clip": 0.01106191, + "auxiliary_loss_mlp": 0.01025827, + "balance_loss_clip": 1.03487468, + "balance_loss_mlp": 1.01462781, + "epoch": 0.9266796933714114, + "flos": 41108440701600.0, + "grad_norm": 1.6961649365354894, + "language_loss": 0.66974193, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.69106209, + "num_input_tokens_seen": 332701470, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11199951, + "step": 15413, + "time_per_iteration": 4.233536958694458 + }, + { + "auxiliary_loss_clip": 0.01110548, + "auxiliary_loss_mlp": 0.01032536, + "balance_loss_clip": 1.03722358, + "balance_loss_mlp": 1.02118182, + "epoch": 0.9267398166240793, + "flos": 23081835945600.0, + "grad_norm": 1.7336261818532548, + "language_loss": 0.76107037, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.78250122, + "num_input_tokens_seen": 332719060, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11358643, + "step": 15414, + "time_per_iteration": 2.6010518074035645 + }, + { + "auxiliary_loss_clip": 0.01109707, + "auxiliary_loss_mlp": 0.01028379, + "balance_loss_clip": 1.03817558, + "balance_loss_mlp": 1.01741159, + "epoch": 0.9267999398767474, + "flos": 24989035386240.0, + "grad_norm": 2.5917055691686173, + "language_loss": 0.81527543, + "learning_rate": 5.59006777975819e-08, + "loss": 0.83665627, + "num_input_tokens_seen": 332736345, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10980225, + "step": 15415, + "time_per_iteration": 2.667799949645996 + }, + { + "auxiliary_loss_clip": 0.01109774, + "auxiliary_loss_mlp": 0.01031865, + "balance_loss_clip": 1.03645062, + "balance_loss_mlp": 1.02033138, + "epoch": 0.9268600631294153, + "flos": 30248441915040.0, + "grad_norm": 1.5968963073883424, + "language_loss": 0.5403409, + "learning_rate": 5.580927866294671e-08, + "loss": 0.56175733, + "num_input_tokens_seen": 332756270, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11535645, + "step": 15416, + "time_per_iteration": 2.6281442642211914 + }, + { + "auxiliary_loss_clip": 0.01106838, + "auxiliary_loss_mlp": 0.01030521, + "balance_loss_clip": 1.03650904, + "balance_loss_mlp": 1.01989317, + "epoch": 0.9269201863820833, + "flos": 22814216896320.0, + "grad_norm": 1.50261753072739, + "language_loss": 0.71939284, + "learning_rate": 5.571795325221807e-08, + "loss": 0.74076641, + "num_input_tokens_seen": 332775185, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10638428, + "step": 15417, + "time_per_iteration": 2.641123056411743 + }, + { + "auxiliary_loss_clip": 0.01109805, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.0381763, + "balance_loss_mlp": 1.02006304, + "epoch": 0.9269803096347512, + "flos": 25531445043360.0, + "grad_norm": 2.0653904914469265, + "language_loss": 0.75823331, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.77964872, + "num_input_tokens_seen": 332794320, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11669922, + "step": 15418, + "time_per_iteration": 2.5968315601348877 + }, + { + "auxiliary_loss_clip": 0.01107277, + "auxiliary_loss_mlp": 0.01027171, + "balance_loss_clip": 1.03589404, + "balance_loss_mlp": 1.0159477, + "epoch": 0.9270404328874192, + "flos": 34169388541920.0, + "grad_norm": 1.553420180636867, + "language_loss": 0.75984931, + "learning_rate": 5.553552361633174e-08, + "loss": 0.78119379, + "num_input_tokens_seen": 332818095, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11230469, + "step": 15419, + "time_per_iteration": 2.743790864944458 + }, + { + "auxiliary_loss_clip": 0.01105104, + "auxiliary_loss_mlp": 0.01028816, + "balance_loss_clip": 1.03570235, + "balance_loss_mlp": 1.01833713, + "epoch": 0.9271005561400871, + "flos": 31590831993120.0, + "grad_norm": 1.732179686457114, + "language_loss": 0.75912052, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.78045964, + "num_input_tokens_seen": 332839860, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.10479736, + "step": 15420, + "time_per_iteration": 2.6821272373199463 + }, + { + "auxiliary_loss_clip": 0.01111997, + "auxiliary_loss_mlp": 0.010303, + "balance_loss_clip": 1.03772557, + "balance_loss_mlp": 1.01851654, + "epoch": 0.9271606793927551, + "flos": 33011610307200.0, + "grad_norm": 1.8213184267981632, + "language_loss": 0.76663798, + "learning_rate": 5.535338891759389e-08, + "loss": 0.78806096, + "num_input_tokens_seen": 332861155, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11785889, + "step": 15421, + "time_per_iteration": 2.734160900115967 + }, + { + "auxiliary_loss_clip": 0.01109882, + "auxiliary_loss_mlp": 0.01028261, + "balance_loss_clip": 1.03837895, + "balance_loss_mlp": 1.01750243, + "epoch": 0.9272208026454232, + "flos": 31982090729760.0, + "grad_norm": 2.195021897000781, + "language_loss": 0.72811806, + "learning_rate": 5.526243217829041e-08, + "loss": 0.7494995, + "num_input_tokens_seen": 332881110, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10766602, + "step": 15422, + "time_per_iteration": 2.644383430480957 + }, + { + "auxiliary_loss_clip": 0.01111021, + "auxiliary_loss_mlp": 0.01035464, + "balance_loss_clip": 1.03799152, + "balance_loss_mlp": 1.02383494, + "epoch": 0.9272809258980911, + "flos": 15201214522560.0, + "grad_norm": 3.7910223541564854, + "language_loss": 0.77556479, + "learning_rate": 5.517154918363065e-08, + "loss": 0.79702961, + "num_input_tokens_seen": 332899350, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11633301, + "step": 15423, + "time_per_iteration": 2.640831232070923 + }, + { + "auxiliary_loss_clip": 0.0110988, + "auxiliary_loss_mlp": 0.01027412, + "balance_loss_clip": 1.0368948, + "balance_loss_mlp": 1.01597428, + "epoch": 0.9273410491507591, + "flos": 27890186582880.0, + "grad_norm": 1.789499727370972, + "language_loss": 0.7523222, + "learning_rate": 5.508073993706053e-08, + "loss": 0.77369511, + "num_input_tokens_seen": 332918105, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11431885, + "step": 15424, + "time_per_iteration": 2.6731159687042236 + }, + { + "auxiliary_loss_clip": 0.01027103, + "auxiliary_loss_mlp": 0.0100137, + "balance_loss_clip": 1.00472713, + "balance_loss_mlp": 1.00046635, + "epoch": 0.927401172403427, + "flos": 58160224208160.0, + "grad_norm": 0.7796166261623841, + "language_loss": 0.60653722, + "learning_rate": 5.499000444202351e-08, + "loss": 0.62682199, + "num_input_tokens_seen": 332969490, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00902557, + "step": 15425, + "time_per_iteration": 4.300410509109497 + }, + { + "auxiliary_loss_clip": 0.01111177, + "auxiliary_loss_mlp": 0.01029374, + "balance_loss_clip": 1.03894591, + "balance_loss_mlp": 1.01828742, + "epoch": 0.927461295656095, + "flos": 36572976360000.0, + "grad_norm": 1.4475460040792294, + "language_loss": 0.70519066, + "learning_rate": 5.489934270196106e-08, + "loss": 0.72659624, + "num_input_tokens_seen": 332988805, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11083984, + "step": 15426, + "time_per_iteration": 2.717172622680664 + }, + { + "auxiliary_loss_clip": 0.01108788, + "auxiliary_loss_mlp": 0.01025067, + "balance_loss_clip": 1.03755593, + "balance_loss_mlp": 1.01460671, + "epoch": 0.9275214189087629, + "flos": 24862154316480.0, + "grad_norm": 1.819151444953551, + "language_loss": 0.83045208, + "learning_rate": 5.480875472030977e-08, + "loss": 0.85179061, + "num_input_tokens_seen": 333007960, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10461426, + "step": 15427, + "time_per_iteration": 2.702406644821167 + }, + { + "auxiliary_loss_clip": 0.01112117, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.03988314, + "balance_loss_mlp": 1.01898694, + "epoch": 0.927581542161431, + "flos": 27312087552480.0, + "grad_norm": 1.7803915210875203, + "language_loss": 0.76872283, + "learning_rate": 5.471824050050555e-08, + "loss": 0.7901448, + "num_input_tokens_seen": 333026035, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11090088, + "step": 15428, + "time_per_iteration": 2.642235040664673 + }, + { + "auxiliary_loss_clip": 0.01107183, + "auxiliary_loss_mlp": 0.01030705, + "balance_loss_clip": 1.0354346, + "balance_loss_mlp": 1.01946974, + "epoch": 0.9276416654140989, + "flos": 29226985276320.0, + "grad_norm": 1.9767424029552703, + "language_loss": 0.74464387, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.7660228, + "num_input_tokens_seen": 333045590, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11236572, + "step": 15429, + "time_per_iteration": 2.6588079929351807 + }, + { + "auxiliary_loss_clip": 0.0110712, + "auxiliary_loss_mlp": 0.01033831, + "balance_loss_clip": 1.03752637, + "balance_loss_mlp": 1.02284026, + "epoch": 0.9277017886667669, + "flos": 16981938066240.0, + "grad_norm": 2.2737512659670993, + "language_loss": 0.75047541, + "learning_rate": 5.45374333601647e-08, + "loss": 0.77188492, + "num_input_tokens_seen": 333063355, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10986328, + "step": 15430, + "time_per_iteration": 2.6045875549316406 + }, + { + "auxiliary_loss_clip": 0.01110056, + "auxiliary_loss_mlp": 0.0102572, + "balance_loss_clip": 1.03753722, + "balance_loss_mlp": 1.0136385, + "epoch": 0.9277619119194348, + "flos": 43521144907680.0, + "grad_norm": 1.5624001392607723, + "language_loss": 0.76591039, + "learning_rate": 5.444714044648391e-08, + "loss": 0.78726816, + "num_input_tokens_seen": 333088045, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.12103271, + "step": 15431, + "time_per_iteration": 2.79278564453125 + }, + { + "auxiliary_loss_clip": 0.01107091, + "auxiliary_loss_mlp": 0.01027091, + "balance_loss_clip": 1.03720152, + "balance_loss_mlp": 1.01648688, + "epoch": 0.9278220351721028, + "flos": 29091433508640.0, + "grad_norm": 1.6096412504041453, + "language_loss": 0.7087664, + "learning_rate": 5.4356921308363e-08, + "loss": 0.73010826, + "num_input_tokens_seen": 333108005, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.1060791, + "step": 15432, + "time_per_iteration": 2.6397266387939453 + }, + { + "auxiliary_loss_clip": 0.01112253, + "auxiliary_loss_mlp": 0.01030016, + "balance_loss_clip": 1.03842771, + "balance_loss_mlp": 1.01910794, + "epoch": 0.9278821584247707, + "flos": 18581291148960.0, + "grad_norm": 4.3479589329128085, + "language_loss": 0.81972015, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.84114289, + "num_input_tokens_seen": 333124335, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.10906982, + "step": 15433, + "time_per_iteration": 2.6026077270507812 + }, + { + "auxiliary_loss_clip": 0.01106483, + "auxiliary_loss_mlp": 0.01026176, + "balance_loss_clip": 1.03771937, + "balance_loss_mlp": 1.01612675, + "epoch": 0.9279422816774388, + "flos": 30116577219840.0, + "grad_norm": 1.9529188809055238, + "language_loss": 0.6632179, + "learning_rate": 5.417670437248056e-08, + "loss": 0.6845445, + "num_input_tokens_seen": 333143995, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.1005249, + "step": 15434, + "time_per_iteration": 2.6684203147888184 + }, + { + "auxiliary_loss_clip": 0.01103168, + "auxiliary_loss_mlp": 0.01027606, + "balance_loss_clip": 1.03561723, + "balance_loss_mlp": 1.01731253, + "epoch": 0.9280024049301068, + "flos": 23390411614560.0, + "grad_norm": 1.8269698731435875, + "language_loss": 0.69115782, + "learning_rate": 5.40867065815529e-08, + "loss": 0.71246558, + "num_input_tokens_seen": 333162805, + "router_z_loss_clip": 0.67529297, + "router_z_loss_mlp": 0.10290527, + "step": 15435, + "time_per_iteration": 2.667041301727295 + }, + { + "auxiliary_loss_clip": 0.01109723, + "auxiliary_loss_mlp": 0.01027403, + "balance_loss_clip": 1.03748059, + "balance_loss_mlp": 1.01609015, + "epoch": 0.9280625281827747, + "flos": 13902536239200.0, + "grad_norm": 2.7143961980298426, + "language_loss": 0.72708011, + "learning_rate": 5.399678257985263e-08, + "loss": 0.74845141, + "num_input_tokens_seen": 333175770, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11303711, + "step": 15436, + "time_per_iteration": 2.5366837978363037 + }, + { + "auxiliary_loss_clip": 0.01109829, + "auxiliary_loss_mlp": 0.0102773, + "balance_loss_clip": 1.03826761, + "balance_loss_mlp": 1.01632166, + "epoch": 0.9281226514354427, + "flos": 30244228117920.0, + "grad_norm": 2.0756698999358663, + "language_loss": 0.66804957, + "learning_rate": 5.390693237078925e-08, + "loss": 0.68942517, + "num_input_tokens_seen": 333194775, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11407471, + "step": 15437, + "time_per_iteration": 2.7030019760131836 + }, + { + "auxiliary_loss_clip": 0.01112802, + "auxiliary_loss_mlp": 0.01033278, + "balance_loss_clip": 1.03801632, + "balance_loss_mlp": 1.02132726, + "epoch": 0.9281827746881106, + "flos": 18404256036960.0, + "grad_norm": 2.7604197602158145, + "language_loss": 0.71597928, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.73744005, + "num_input_tokens_seen": 333208920, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.11950684, + "step": 15438, + "time_per_iteration": 2.5629374980926514 + }, + { + "auxiliary_loss_clip": 0.01109943, + "auxiliary_loss_mlp": 0.01026757, + "balance_loss_clip": 1.03769016, + "balance_loss_mlp": 1.01592112, + "epoch": 0.9282428979407786, + "flos": 29449798564320.0, + "grad_norm": 1.6416734132101571, + "language_loss": 0.64609981, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.66746676, + "num_input_tokens_seen": 333229350, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10827637, + "step": 15439, + "time_per_iteration": 2.675766706466675 + }, + { + "auxiliary_loss_clip": 0.0110951, + "auxiliary_loss_mlp": 0.01027707, + "balance_loss_clip": 1.03821874, + "balance_loss_mlp": 1.01674628, + "epoch": 0.9283030211934465, + "flos": 30290006293920.0, + "grad_norm": 1.6919428604284201, + "language_loss": 0.70132804, + "learning_rate": 5.363782453347876e-08, + "loss": 0.72270018, + "num_input_tokens_seen": 333246125, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10968018, + "step": 15440, + "time_per_iteration": 2.627387046813965 + }, + { + "auxiliary_loss_clip": 0.01113454, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.03878868, + "balance_loss_mlp": 1.02404785, + "epoch": 0.9283631444461146, + "flos": 29262593615040.0, + "grad_norm": 2.9229379798237063, + "language_loss": 0.76576698, + "learning_rate": 5.354826952900682e-08, + "loss": 0.78725713, + "num_input_tokens_seen": 333263685, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11523438, + "step": 15441, + "time_per_iteration": 2.6741769313812256 + }, + { + "auxiliary_loss_clip": 0.01105109, + "auxiliary_loss_mlp": 0.01026119, + "balance_loss_clip": 1.03693497, + "balance_loss_mlp": 1.01688671, + "epoch": 0.9284232676987825, + "flos": 27801831096000.0, + "grad_norm": 4.955928567292082, + "language_loss": 0.63931978, + "learning_rate": 5.345878833417949e-08, + "loss": 0.66063201, + "num_input_tokens_seen": 333282435, + "router_z_loss_clip": 0.68115234, + "router_z_loss_mlp": 0.09234619, + "step": 15442, + "time_per_iteration": 4.138249397277832 + }, + { + "auxiliary_loss_clip": 0.01112119, + "auxiliary_loss_mlp": 0.01032673, + "balance_loss_clip": 1.03829324, + "balance_loss_mlp": 1.02132988, + "epoch": 0.9284833909514505, + "flos": 23794433294400.0, + "grad_norm": 1.8482703689901923, + "language_loss": 0.80498981, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.82643771, + "num_input_tokens_seen": 333300400, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11334229, + "step": 15443, + "time_per_iteration": 2.616975784301758 + }, + { + "auxiliary_loss_clip": 0.01111194, + "auxiliary_loss_mlp": 0.01026707, + "balance_loss_clip": 1.03907251, + "balance_loss_mlp": 1.01545334, + "epoch": 0.9285435142041184, + "flos": 28291696191360.0, + "grad_norm": 1.7470171293005394, + "language_loss": 0.65382624, + "learning_rate": 5.328004738702896e-08, + "loss": 0.67520523, + "num_input_tokens_seen": 333318980, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11248779, + "step": 15444, + "time_per_iteration": 2.621920347213745 + }, + { + "auxiliary_loss_clip": 0.01109593, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.03729939, + "balance_loss_mlp": 1.01904476, + "epoch": 0.9286036374567864, + "flos": 21567515932800.0, + "grad_norm": 2.623049542771193, + "language_loss": 0.73644525, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.75783843, + "num_input_tokens_seen": 333334135, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.10675049, + "step": 15445, + "time_per_iteration": 2.6004960536956787 + }, + { + "auxiliary_loss_clip": 0.01111638, + "auxiliary_loss_mlp": 0.01031414, + "balance_loss_clip": 1.03978562, + "balance_loss_mlp": 1.01957679, + "epoch": 0.9286637607094543, + "flos": 25485991005600.0, + "grad_norm": 1.6593185014074736, + "language_loss": 0.71276736, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.73419791, + "num_input_tokens_seen": 333353325, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.1184082, + "step": 15446, + "time_per_iteration": 2.7038989067077637 + }, + { + "auxiliary_loss_clip": 0.01112505, + "auxiliary_loss_mlp": 0.01027447, + "balance_loss_clip": 1.0373354, + "balance_loss_mlp": 1.01608586, + "epoch": 0.9287238839621224, + "flos": 23215442883840.0, + "grad_norm": 1.92925360096701, + "language_loss": 0.69287276, + "learning_rate": 5.301248962337523e-08, + "loss": 0.71427226, + "num_input_tokens_seen": 333371110, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.1137085, + "step": 15447, + "time_per_iteration": 2.6017932891845703 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.01028342, + "balance_loss_clip": 1.03504705, + "balance_loss_mlp": 1.01836979, + "epoch": 0.9287840072147904, + "flos": 25083833120640.0, + "grad_norm": 1.5945505950134589, + "language_loss": 0.72259653, + "learning_rate": 5.292345135757403e-08, + "loss": 0.74389505, + "num_input_tokens_seen": 333391420, + "router_z_loss_clip": 0.66455078, + "router_z_loss_mlp": 0.09967041, + "step": 15448, + "time_per_iteration": 2.6638646125793457 + }, + { + "auxiliary_loss_clip": 0.01109057, + "auxiliary_loss_mlp": 0.01027845, + "balance_loss_clip": 1.03727579, + "balance_loss_mlp": 1.01582885, + "epoch": 0.9288441304674583, + "flos": 25930402063200.0, + "grad_norm": 1.7102480453303002, + "language_loss": 0.74500728, + "learning_rate": 5.283448692511072e-08, + "loss": 0.76637626, + "num_input_tokens_seen": 333410365, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.12023926, + "step": 15449, + "time_per_iteration": 2.6291732788085938 + }, + { + "auxiliary_loss_clip": 0.01109621, + "auxiliary_loss_mlp": 0.01024319, + "balance_loss_clip": 1.03722715, + "balance_loss_mlp": 1.0127852, + "epoch": 0.9289042537201263, + "flos": 33763624619040.0, + "grad_norm": 1.8291549948742316, + "language_loss": 0.67743784, + "learning_rate": 5.27455963293586e-08, + "loss": 0.6987772, + "num_input_tokens_seen": 333430000, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11529541, + "step": 15450, + "time_per_iteration": 2.7080342769622803 + }, + { + "auxiliary_loss_clip": 0.01110463, + "auxiliary_loss_mlp": 0.01025756, + "balance_loss_clip": 1.03766131, + "balance_loss_mlp": 1.01433539, + "epoch": 0.9289643769727942, + "flos": 23571457937280.0, + "grad_norm": 4.8240647574757825, + "language_loss": 0.71968853, + "learning_rate": 5.265677957368875e-08, + "loss": 0.74105072, + "num_input_tokens_seen": 333445800, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11419678, + "step": 15451, + "time_per_iteration": 2.610745906829834 + }, + { + "auxiliary_loss_clip": 0.01111263, + "auxiliary_loss_mlp": 0.01031986, + "balance_loss_clip": 1.03846741, + "balance_loss_mlp": 1.021245, + "epoch": 0.9290245002254622, + "flos": 17153705931840.0, + "grad_norm": 2.117905555839074, + "language_loss": 0.73367488, + "learning_rate": 5.25680366614687e-08, + "loss": 0.7551074, + "num_input_tokens_seen": 333461550, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.10742188, + "step": 15452, + "time_per_iteration": 4.036312103271484 + }, + { + "auxiliary_loss_clip": 0.01111541, + "auxiliary_loss_mlp": 0.01027004, + "balance_loss_clip": 1.04056001, + "balance_loss_mlp": 1.01595974, + "epoch": 0.9290846234781301, + "flos": 24461536088160.0, + "grad_norm": 2.4880479102226327, + "language_loss": 0.73986953, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.76125491, + "num_input_tokens_seen": 333478835, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11047363, + "step": 15453, + "time_per_iteration": 3.8987646102905273 + }, + { + "auxiliary_loss_clip": 0.01027089, + "auxiliary_loss_mlp": 0.01002032, + "balance_loss_clip": 1.00478482, + "balance_loss_mlp": 1.00114465, + "epoch": 0.9291447467307982, + "flos": 74710750913280.0, + "grad_norm": 0.8805270179792912, + "language_loss": 0.60559922, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62589037, + "num_input_tokens_seen": 333535250, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.0088768, + "step": 15454, + "time_per_iteration": 3.151735782623291 + }, + { + "auxiliary_loss_clip": 0.01109911, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.03730917, + "balance_loss_mlp": 1.02121043, + "epoch": 0.9292048699834661, + "flos": 25079659840800.0, + "grad_norm": 1.7678384882050984, + "language_loss": 0.69107223, + "learning_rate": 5.230225101914709e-08, + "loss": 0.71249312, + "num_input_tokens_seen": 333553805, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10974121, + "step": 15455, + "time_per_iteration": 2.631749153137207 + }, + { + "auxiliary_loss_clip": 0.01110286, + "auxiliary_loss_mlp": 0.01031937, + "balance_loss_clip": 1.03894556, + "balance_loss_mlp": 1.02065384, + "epoch": 0.9292649932361341, + "flos": 28825799806080.0, + "grad_norm": 1.8557756978909885, + "language_loss": 0.6480366, + "learning_rate": 5.22138035143509e-08, + "loss": 0.66945881, + "num_input_tokens_seen": 333572800, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11291504, + "step": 15456, + "time_per_iteration": 2.6846208572387695 + }, + { + "auxiliary_loss_clip": 0.01110815, + "auxiliary_loss_mlp": 0.01030552, + "balance_loss_clip": 1.0399946, + "balance_loss_mlp": 1.01901293, + "epoch": 0.929325116488802, + "flos": 18314968652640.0, + "grad_norm": 2.09221364123511, + "language_loss": 0.68003654, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.70145023, + "num_input_tokens_seen": 333588520, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11547852, + "step": 15457, + "time_per_iteration": 2.580420970916748 + }, + { + "auxiliary_loss_clip": 0.01109254, + "auxiliary_loss_mlp": 0.01027171, + "balance_loss_clip": 1.03554654, + "balance_loss_mlp": 1.01635265, + "epoch": 0.92938523974147, + "flos": 21923652538080.0, + "grad_norm": 2.935039986303922, + "language_loss": 0.80989778, + "learning_rate": 5.203713008885291e-08, + "loss": 0.83126199, + "num_input_tokens_seen": 333603435, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.1081543, + "step": 15458, + "time_per_iteration": 2.588738441467285 + }, + { + "auxiliary_loss_clip": 0.011093, + "auxiliary_loss_mlp": 0.01032024, + "balance_loss_clip": 1.03628588, + "balance_loss_mlp": 1.02091384, + "epoch": 0.9294453629941379, + "flos": 28068518247840.0, + "grad_norm": 1.6455043264226767, + "language_loss": 0.7242775, + "learning_rate": 5.194890417485065e-08, + "loss": 0.7456907, + "num_input_tokens_seen": 333623305, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11114502, + "step": 15459, + "time_per_iteration": 2.6816000938415527 + }, + { + "auxiliary_loss_clip": 0.01109729, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.03719926, + "balance_loss_mlp": 1.0206399, + "epoch": 0.929505486246806, + "flos": 20810599030080.0, + "grad_norm": 2.688589059417893, + "language_loss": 0.58455998, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.60596788, + "num_input_tokens_seen": 333641205, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10424805, + "step": 15460, + "time_per_iteration": 2.617001533508301 + }, + { + "auxiliary_loss_clip": 0.01112209, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.03837109, + "balance_loss_mlp": 1.01801014, + "epoch": 0.9295656094994739, + "flos": 33361709837760.0, + "grad_norm": 1.7698176946320823, + "language_loss": 0.80113375, + "learning_rate": 5.177267396106733e-08, + "loss": 0.82254815, + "num_input_tokens_seen": 333659615, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11224365, + "step": 15461, + "time_per_iteration": 2.6608357429504395 + }, + { + "auxiliary_loss_clip": 0.01106157, + "auxiliary_loss_mlp": 0.01023188, + "balance_loss_clip": 1.03615069, + "balance_loss_mlp": 1.01219654, + "epoch": 0.9296257327521419, + "flos": 26198912492640.0, + "grad_norm": 11.483520307115226, + "language_loss": 0.78912127, + "learning_rate": 5.168466966796869e-08, + "loss": 0.81041473, + "num_input_tokens_seen": 333678985, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10986328, + "step": 15462, + "time_per_iteration": 2.647263765335083 + }, + { + "auxiliary_loss_clip": 0.01108751, + "auxiliary_loss_mlp": 0.01021639, + "balance_loss_clip": 1.03588605, + "balance_loss_mlp": 1.01104152, + "epoch": 0.9296858560048099, + "flos": 19965407675040.0, + "grad_norm": 2.806081968856655, + "language_loss": 0.62989414, + "learning_rate": 5.159673925518282e-08, + "loss": 0.65119803, + "num_input_tokens_seen": 333696410, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.1060791, + "step": 15463, + "time_per_iteration": 2.6038472652435303 + }, + { + "auxiliary_loss_clip": 0.01106085, + "auxiliary_loss_mlp": 0.01026553, + "balance_loss_clip": 1.03492343, + "balance_loss_mlp": 1.01625884, + "epoch": 0.9297459792574778, + "flos": 36434547865440.0, + "grad_norm": 1.521114036201086, + "language_loss": 0.71217555, + "learning_rate": 5.15088827260437e-08, + "loss": 0.73350191, + "num_input_tokens_seen": 333716615, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10290527, + "step": 15464, + "time_per_iteration": 2.7396552562713623 + }, + { + "auxiliary_loss_clip": 0.01108946, + "auxiliary_loss_mlp": 0.01025788, + "balance_loss_clip": 1.03666556, + "balance_loss_mlp": 1.01502991, + "epoch": 0.9298061025101458, + "flos": 19431141991200.0, + "grad_norm": 2.1656521468097965, + "language_loss": 0.77307492, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.79442227, + "num_input_tokens_seen": 333732800, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10754395, + "step": 15465, + "time_per_iteration": 3.9507062435150146 + }, + { + "auxiliary_loss_clip": 0.01026972, + "auxiliary_loss_mlp": 0.01001144, + "balance_loss_clip": 1.00466681, + "balance_loss_mlp": 1.00028527, + "epoch": 0.9298662257628137, + "flos": 78210138228480.0, + "grad_norm": 0.6920842629126873, + "language_loss": 0.56477213, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58505332, + "num_input_tokens_seen": 333799300, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.0085907, + "step": 15466, + "time_per_iteration": 3.382103204727173 + }, + { + "auxiliary_loss_clip": 0.01107723, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.03559446, + "balance_loss_mlp": 1.01902199, + "epoch": 0.9299263490154818, + "flos": 29626995745440.0, + "grad_norm": 1.7300055103499887, + "language_loss": 0.72954226, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.75093073, + "num_input_tokens_seen": 333820360, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.12091064, + "step": 15467, + "time_per_iteration": 2.7478842735290527 + }, + { + "auxiliary_loss_clip": 0.01111896, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.03893757, + "balance_loss_mlp": 1.02072382, + "epoch": 0.9299864722681497, + "flos": 28424047093920.0, + "grad_norm": 5.372208604815634, + "language_loss": 0.71821058, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.73964912, + "num_input_tokens_seen": 333840415, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11224365, + "step": 15468, + "time_per_iteration": 2.6454386711120605 + }, + { + "auxiliary_loss_clip": 0.01110078, + "auxiliary_loss_mlp": 0.01027521, + "balance_loss_clip": 1.03600419, + "balance_loss_mlp": 1.01647604, + "epoch": 0.9300465955208177, + "flos": 26107599244320.0, + "grad_norm": 2.0947586838051744, + "language_loss": 0.7596488, + "learning_rate": 5.107070845155737e-08, + "loss": 0.78102475, + "num_input_tokens_seen": 333859910, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.1104126, + "step": 15469, + "time_per_iteration": 2.6686527729034424 + }, + { + "auxiliary_loss_clip": 0.0110993, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.03728914, + "balance_loss_mlp": 1.01763463, + "epoch": 0.9301067187734856, + "flos": 29982443556960.0, + "grad_norm": 2.4293879912403242, + "language_loss": 0.75792974, + "learning_rate": 5.098329529416379e-08, + "loss": 0.77931666, + "num_input_tokens_seen": 333880495, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11126709, + "step": 15470, + "time_per_iteration": 2.6295783519744873 + }, + { + "auxiliary_loss_clip": 0.01109115, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.03691363, + "balance_loss_mlp": 1.01877499, + "epoch": 0.9301668420261536, + "flos": 27085182019200.0, + "grad_norm": 1.5936459470872162, + "language_loss": 0.74854851, + "learning_rate": 5.089595604367902e-08, + "loss": 0.76993215, + "num_input_tokens_seen": 333897640, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10473633, + "step": 15471, + "time_per_iteration": 2.6567676067352295 + }, + { + "auxiliary_loss_clip": 0.01109923, + "auxiliary_loss_mlp": 0.01028146, + "balance_loss_clip": 1.03772116, + "balance_loss_mlp": 1.01677942, + "epoch": 0.9302269652788215, + "flos": 21655587798720.0, + "grad_norm": 3.0813659073892676, + "language_loss": 0.6958003, + "learning_rate": 5.080869070341487e-08, + "loss": 0.71718103, + "num_input_tokens_seen": 333913670, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11358643, + "step": 15472, + "time_per_iteration": 2.582361936569214 + }, + { + "auxiliary_loss_clip": 0.01103262, + "auxiliary_loss_mlp": 0.01026908, + "balance_loss_clip": 1.03528225, + "balance_loss_mlp": 1.01679873, + "epoch": 0.9302870885314896, + "flos": 23660583252480.0, + "grad_norm": 1.9978617711527253, + "language_loss": 0.88387185, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.90517354, + "num_input_tokens_seen": 333934105, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.10101318, + "step": 15473, + "time_per_iteration": 2.659600257873535 + }, + { + "auxiliary_loss_clip": 0.01115455, + "auxiliary_loss_mlp": 0.01037567, + "balance_loss_clip": 1.03960824, + "balance_loss_mlp": 1.02479434, + "epoch": 0.9303472117841575, + "flos": 26552212888320.0, + "grad_norm": 1.99411149991171, + "language_loss": 0.64491081, + "learning_rate": 5.063438176678203e-08, + "loss": 0.66644108, + "num_input_tokens_seen": 333953635, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12780762, + "step": 15474, + "time_per_iteration": 2.633322238922119 + }, + { + "auxiliary_loss_clip": 0.01108945, + "auxiliary_loss_mlp": 0.01028989, + "balance_loss_clip": 1.03745437, + "balance_loss_mlp": 1.01785481, + "epoch": 0.9304073350368255, + "flos": 23927756611680.0, + "grad_norm": 21.14932261771929, + "language_loss": 0.74697375, + "learning_rate": 5.054733817702339e-08, + "loss": 0.7683531, + "num_input_tokens_seen": 333971825, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11132812, + "step": 15475, + "time_per_iteration": 2.61202073097229 + }, + { + "auxiliary_loss_clip": 0.01108098, + "auxiliary_loss_mlp": 0.01025112, + "balance_loss_clip": 1.03605223, + "balance_loss_mlp": 1.01467538, + "epoch": 0.9304674582894935, + "flos": 37146010730400.0, + "grad_norm": 2.257531717890774, + "language_loss": 0.66533923, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.68667132, + "num_input_tokens_seen": 333990120, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10437012, + "step": 15476, + "time_per_iteration": 2.686797618865967 + }, + { + "auxiliary_loss_clip": 0.01111461, + "auxiliary_loss_mlp": 0.01033523, + "balance_loss_clip": 1.03907228, + "balance_loss_mlp": 1.02197146, + "epoch": 0.9305275815421614, + "flos": 21701892699360.0, + "grad_norm": 2.091659275367194, + "language_loss": 0.69555205, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.71700191, + "num_input_tokens_seen": 334007970, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11566162, + "step": 15477, + "time_per_iteration": 2.603426218032837 + }, + { + "auxiliary_loss_clip": 0.01108782, + "auxiliary_loss_mlp": 0.0102993, + "balance_loss_clip": 1.03815007, + "balance_loss_mlp": 1.01925457, + "epoch": 0.9305877047948294, + "flos": 30869847567360.0, + "grad_norm": 3.21365240295355, + "language_loss": 0.58344781, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.60483491, + "num_input_tokens_seen": 334027120, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10675049, + "step": 15478, + "time_per_iteration": 2.625664234161377 + }, + { + "auxiliary_loss_clip": 0.0111511, + "auxiliary_loss_mlp": 0.0102966, + "balance_loss_clip": 1.03834569, + "balance_loss_mlp": 1.017501, + "epoch": 0.9306478280474973, + "flos": 20715882330240.0, + "grad_norm": 2.0107931546838844, + "language_loss": 0.78527898, + "learning_rate": 5.01999030853566e-08, + "loss": 0.80672669, + "num_input_tokens_seen": 334042785, + "router_z_loss_clip": 0.76757812, + "router_z_loss_mlp": 0.12164307, + "step": 15479, + "time_per_iteration": 2.610522985458374 + }, + { + "auxiliary_loss_clip": 0.01109698, + "auxiliary_loss_mlp": 0.01033492, + "balance_loss_clip": 1.03763688, + "balance_loss_mlp": 1.02291799, + "epoch": 0.9307079513001654, + "flos": 43517619904320.0, + "grad_norm": 1.879569799152852, + "language_loss": 0.69184339, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.71327525, + "num_input_tokens_seen": 334063480, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.10577393, + "step": 15480, + "time_per_iteration": 2.803595542907715 + }, + { + "auxiliary_loss_clip": 0.01109342, + "auxiliary_loss_mlp": 0.01030984, + "balance_loss_clip": 1.03744566, + "balance_loss_mlp": 1.02029061, + "epoch": 0.9307680745528333, + "flos": 23437729447200.0, + "grad_norm": 1.7920379172794778, + "language_loss": 0.67460454, + "learning_rate": 5.002662914604583e-08, + "loss": 0.69600785, + "num_input_tokens_seen": 334082005, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10699463, + "step": 15481, + "time_per_iteration": 4.193808555603027 + }, + { + "auxiliary_loss_clip": 0.01106692, + "auxiliary_loss_mlp": 0.01025665, + "balance_loss_clip": 1.03592575, + "balance_loss_mlp": 1.01521635, + "epoch": 0.9308281978055013, + "flos": 23260127093280.0, + "grad_norm": 2.0062649981096574, + "language_loss": 0.74399257, + "learning_rate": 4.994010308952701e-08, + "loss": 0.76531613, + "num_input_tokens_seen": 334101375, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10449219, + "step": 15482, + "time_per_iteration": 2.601161241531372 + }, + { + "auxiliary_loss_clip": 0.01105707, + "auxiliary_loss_mlp": 0.01025855, + "balance_loss_clip": 1.03579044, + "balance_loss_mlp": 1.01585388, + "epoch": 0.9308883210581692, + "flos": 25040445464160.0, + "grad_norm": 2.06108630627733, + "language_loss": 0.79977691, + "learning_rate": 4.985365097947469e-08, + "loss": 0.82109255, + "num_input_tokens_seen": 334119460, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10003662, + "step": 15483, + "time_per_iteration": 2.655421495437622 + }, + { + "auxiliary_loss_clip": 0.0111029, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.03828323, + "balance_loss_mlp": 1.01866841, + "epoch": 0.9309484443108372, + "flos": 15864184553760.0, + "grad_norm": 2.435058206662866, + "language_loss": 0.74690437, + "learning_rate": 4.976727281916782e-08, + "loss": 0.76830405, + "num_input_tokens_seen": 334136065, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11010742, + "step": 15484, + "time_per_iteration": 2.601332187652588 + }, + { + "auxiliary_loss_clip": 0.01111639, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.03837848, + "balance_loss_mlp": 1.01813865, + "epoch": 0.9310085675635051, + "flos": 15334618874400.0, + "grad_norm": 2.343328809132338, + "language_loss": 0.76174617, + "learning_rate": 4.968096861188087e-08, + "loss": 0.78315622, + "num_input_tokens_seen": 334153690, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11218262, + "step": 15485, + "time_per_iteration": 2.642732858657837 + }, + { + "auxiliary_loss_clip": 0.01109548, + "auxiliary_loss_mlp": 0.01029866, + "balance_loss_clip": 1.03638768, + "balance_loss_mlp": 1.01733685, + "epoch": 0.9310686908161732, + "flos": 28647549175680.0, + "grad_norm": 1.908997532287569, + "language_loss": 0.7780484, + "learning_rate": 4.959473836088723e-08, + "loss": 0.79944253, + "num_input_tokens_seen": 334171880, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.12530518, + "step": 15486, + "time_per_iteration": 2.6175079345703125 + }, + { + "auxiliary_loss_clip": 0.01114447, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.03946733, + "balance_loss_mlp": 1.01755619, + "epoch": 0.9311288140688411, + "flos": 29492902599840.0, + "grad_norm": 5.0492540864119615, + "language_loss": 0.76888371, + "learning_rate": 4.950858206945674e-08, + "loss": 0.79032052, + "num_input_tokens_seen": 334190005, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11676025, + "step": 15487, + "time_per_iteration": 2.681251049041748 + }, + { + "auxiliary_loss_clip": 0.01109247, + "auxiliary_loss_mlp": 0.01024824, + "balance_loss_clip": 1.03800941, + "balance_loss_mlp": 1.01329041, + "epoch": 0.9311889373215091, + "flos": 43428373037280.0, + "grad_norm": 3.3534959354996277, + "language_loss": 0.66885078, + "learning_rate": 4.942249974085633e-08, + "loss": 0.69019151, + "num_input_tokens_seen": 334209545, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11529541, + "step": 15488, + "time_per_iteration": 2.706105947494507 + }, + { + "auxiliary_loss_clip": 0.0110632, + "auxiliary_loss_mlp": 0.01029188, + "balance_loss_clip": 1.03732562, + "balance_loss_mlp": 1.01774454, + "epoch": 0.9312490605741771, + "flos": 24685240756320.0, + "grad_norm": 3.6892251791722726, + "language_loss": 0.74904561, + "learning_rate": 4.933649137834983e-08, + "loss": 0.77040064, + "num_input_tokens_seen": 334228900, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.11450195, + "step": 15489, + "time_per_iteration": 2.6390492916107178 + }, + { + "auxiliary_loss_clip": 0.01111389, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.03746128, + "balance_loss_mlp": 1.01821542, + "epoch": 0.931309183826845, + "flos": 17022367961280.0, + "grad_norm": 2.541546262947319, + "language_loss": 0.81231272, + "learning_rate": 4.925055698519931e-08, + "loss": 0.83372116, + "num_input_tokens_seen": 334245500, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.11248779, + "step": 15490, + "time_per_iteration": 2.605886459350586 + }, + { + "auxiliary_loss_clip": 0.01110846, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.0377053, + "balance_loss_mlp": 1.01955783, + "epoch": 0.931369307079513, + "flos": 24595264578240.0, + "grad_norm": 1.7445717900354325, + "language_loss": 0.72117907, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.74259806, + "num_input_tokens_seen": 334264370, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11499023, + "step": 15491, + "time_per_iteration": 4.136736869812012 + }, + { + "auxiliary_loss_clip": 0.01105242, + "auxiliary_loss_mlp": 0.01026604, + "balance_loss_clip": 1.03584611, + "balance_loss_mlp": 1.01666164, + "epoch": 0.931429430332181, + "flos": 30918461952960.0, + "grad_norm": 1.9820830113485817, + "language_loss": 0.73975378, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.76107228, + "num_input_tokens_seen": 334283905, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.09936523, + "step": 15492, + "time_per_iteration": 4.095034599304199 + }, + { + "auxiliary_loss_clip": 0.01026902, + "auxiliary_loss_mlp": 0.01001997, + "balance_loss_clip": 1.00461841, + "balance_loss_mlp": 1.0011133, + "epoch": 0.931489553584849, + "flos": 86911762190400.0, + "grad_norm": 0.7142564968908368, + "language_loss": 0.53403991, + "learning_rate": 4.899319765445442e-08, + "loss": 0.55432892, + "num_input_tokens_seen": 334339925, + "router_z_loss_clip": 0.22290039, + "router_z_loss_mlp": 0.00883484, + "step": 15493, + "time_per_iteration": 3.1630163192749023 + }, + { + "auxiliary_loss_clip": 0.0110849, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.03750312, + "balance_loss_mlp": 1.02084839, + "epoch": 0.9315496768375169, + "flos": 17868126558240.0, + "grad_norm": 1.8427592262355081, + "language_loss": 0.70524454, + "learning_rate": 4.890755917128531e-08, + "loss": 0.72664189, + "num_input_tokens_seen": 334357225, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10406494, + "step": 15494, + "time_per_iteration": 2.5866618156433105 + }, + { + "auxiliary_loss_clip": 0.01110629, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.03610706, + "balance_loss_mlp": 1.01562738, + "epoch": 0.9316098000901849, + "flos": 34568791251840.0, + "grad_norm": 1.6858324662392437, + "language_loss": 0.68697476, + "learning_rate": 4.882199467373671e-08, + "loss": 0.70835233, + "num_input_tokens_seen": 334375945, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.1151123, + "step": 15495, + "time_per_iteration": 2.6996843814849854 + }, + { + "auxiliary_loss_clip": 0.01104898, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.03472209, + "balance_loss_mlp": 1.01827121, + "epoch": 0.9316699233428528, + "flos": 34791766608960.0, + "grad_norm": 2.208044542491506, + "language_loss": 0.61960423, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.64093924, + "num_input_tokens_seen": 334395310, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10339355, + "step": 15496, + "time_per_iteration": 2.66135835647583 + }, + { + "auxiliary_loss_clip": 0.01110936, + "auxiliary_loss_mlp": 0.01030412, + "balance_loss_clip": 1.03791809, + "balance_loss_mlp": 1.01895547, + "epoch": 0.9317300465955208, + "flos": 41113707948000.0, + "grad_norm": 2.1923720585334507, + "language_loss": 0.7691763, + "learning_rate": 4.865108764847825e-08, + "loss": 0.79058981, + "num_input_tokens_seen": 334416965, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11456299, + "step": 15497, + "time_per_iteration": 2.7344820499420166 + }, + { + "auxiliary_loss_clip": 0.01113014, + "auxiliary_loss_mlp": 0.01030495, + "balance_loss_clip": 1.03946781, + "balance_loss_mlp": 1.01890755, + "epoch": 0.9317901698481887, + "flos": 28868174530560.0, + "grad_norm": 1.8569414940395603, + "language_loss": 0.66404665, + "learning_rate": 4.856574512724898e-08, + "loss": 0.68548179, + "num_input_tokens_seen": 334435620, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11590576, + "step": 15498, + "time_per_iteration": 2.6071579456329346 + }, + { + "auxiliary_loss_clip": 0.01110746, + "auxiliary_loss_mlp": 0.01035863, + "balance_loss_clip": 1.03823102, + "balance_loss_mlp": 1.02396584, + "epoch": 0.9318502931008568, + "flos": 25575440459040.0, + "grad_norm": 1.8423265871877719, + "language_loss": 0.79622412, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.81769025, + "num_input_tokens_seen": 334456210, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11901855, + "step": 15499, + "time_per_iteration": 2.6305325031280518 + }, + { + "auxiliary_loss_clip": 0.01109124, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.03905344, + "balance_loss_mlp": 1.02367282, + "epoch": 0.9319104163535247, + "flos": 28602986518080.0, + "grad_norm": 1.615398655028043, + "language_loss": 0.77016133, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.79160303, + "num_input_tokens_seen": 334475485, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.1137085, + "step": 15500, + "time_per_iteration": 2.6786274909973145 + }, + { + "auxiliary_loss_clip": 0.01105587, + "auxiliary_loss_mlp": 0.01025782, + "balance_loss_clip": 1.03507304, + "balance_loss_mlp": 1.0146122, + "epoch": 0.9319705396061927, + "flos": 27268375757760.0, + "grad_norm": 1.649716003995145, + "language_loss": 0.72563618, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.74694979, + "num_input_tokens_seen": 334494740, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.1116333, + "step": 15501, + "time_per_iteration": 2.6065332889556885 + }, + { + "auxiliary_loss_clip": 0.01112425, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.03808033, + "balance_loss_mlp": 1.02143073, + "epoch": 0.9320306628588607, + "flos": 25616640182400.0, + "grad_norm": 1.8082751249118523, + "language_loss": 0.65788746, + "learning_rate": 4.822511506047666e-08, + "loss": 0.67933881, + "num_input_tokens_seen": 334511910, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11273193, + "step": 15502, + "time_per_iteration": 2.581153631210327 + }, + { + "auxiliary_loss_clip": 0.0111197, + "auxiliary_loss_mlp": 0.0103308, + "balance_loss_clip": 1.03841841, + "balance_loss_mlp": 1.0222435, + "epoch": 0.9320907861115286, + "flos": 29941689523680.0, + "grad_norm": 1.6206198888965333, + "language_loss": 0.65621293, + "learning_rate": 4.814014256446586e-08, + "loss": 0.67766339, + "num_input_tokens_seen": 334533150, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10839844, + "step": 15503, + "time_per_iteration": 2.640951633453369 + }, + { + "auxiliary_loss_clip": 0.01111374, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.03785717, + "balance_loss_mlp": 1.02050543, + "epoch": 0.9321509093641966, + "flos": 24142750064640.0, + "grad_norm": 1.6409122581000901, + "language_loss": 0.75037444, + "learning_rate": 4.805524408317652e-08, + "loss": 0.77180743, + "num_input_tokens_seen": 334550940, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11425781, + "step": 15504, + "time_per_iteration": 3.9477486610412598 + }, + { + "auxiliary_loss_clip": 0.01110759, + "auxiliary_loss_mlp": 0.01024466, + "balance_loss_clip": 1.03824043, + "balance_loss_mlp": 1.01271749, + "epoch": 0.9322110326168646, + "flos": 30472065548640.0, + "grad_norm": 2.8963476903283, + "language_loss": 0.71632576, + "learning_rate": 4.797041961982762e-08, + "loss": 0.73767799, + "num_input_tokens_seen": 334570935, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11743164, + "step": 15505, + "time_per_iteration": 2.6430740356445312 + }, + { + "auxiliary_loss_clip": 0.01111069, + "auxiliary_loss_mlp": 0.01028605, + "balance_loss_clip": 1.03847945, + "balance_loss_mlp": 1.01706004, + "epoch": 0.9322711558695326, + "flos": 19698153281280.0, + "grad_norm": 1.73184120227821, + "language_loss": 0.7576409, + "learning_rate": 4.788566917763614e-08, + "loss": 0.77903759, + "num_input_tokens_seen": 334589315, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11541748, + "step": 15506, + "time_per_iteration": 2.6569464206695557 + }, + { + "auxiliary_loss_clip": 0.01106887, + "auxiliary_loss_mlp": 0.01022971, + "balance_loss_clip": 1.03756225, + "balance_loss_mlp": 1.01278484, + "epoch": 0.9323312791222005, + "flos": 28958596398720.0, + "grad_norm": 1.8163000074953548, + "language_loss": 0.83103538, + "learning_rate": 4.780099275981597e-08, + "loss": 0.85233396, + "num_input_tokens_seen": 334608990, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10174561, + "step": 15507, + "time_per_iteration": 2.716434955596924 + }, + { + "auxiliary_loss_clip": 0.01111046, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.03757083, + "balance_loss_mlp": 1.01793742, + "epoch": 0.9323914023748685, + "flos": 25353194412960.0, + "grad_norm": 1.568544316764138, + "language_loss": 0.67929292, + "learning_rate": 4.771639036957742e-08, + "loss": 0.7006942, + "num_input_tokens_seen": 334628655, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11151123, + "step": 15508, + "time_per_iteration": 2.7024290561676025 + }, + { + "auxiliary_loss_clip": 0.01108509, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.03764188, + "balance_loss_mlp": 1.01898503, + "epoch": 0.9324515256275364, + "flos": 29181571755840.0, + "grad_norm": 1.9220284720334668, + "language_loss": 0.72338253, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.74476856, + "num_input_tokens_seen": 334648295, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11108398, + "step": 15509, + "time_per_iteration": 2.6442103385925293 + }, + { + "auxiliary_loss_clip": 0.01109249, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.03740335, + "balance_loss_mlp": 1.01633334, + "epoch": 0.9325116488802044, + "flos": 21969714335040.0, + "grad_norm": 3.978378450427089, + "language_loss": 0.74501079, + "learning_rate": 4.754740768467624e-08, + "loss": 0.76637822, + "num_input_tokens_seen": 334666280, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11157227, + "step": 15510, + "time_per_iteration": 2.644160270690918 + }, + { + "auxiliary_loss_clip": 0.01111179, + "auxiliary_loss_mlp": 0.0102836, + "balance_loss_clip": 1.03688323, + "balance_loss_mlp": 1.01730299, + "epoch": 0.9325717721328723, + "flos": 35412969674880.0, + "grad_norm": 1.6996016654341342, + "language_loss": 0.70239621, + "learning_rate": 4.746302739642161e-08, + "loss": 0.7237916, + "num_input_tokens_seen": 334688830, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11065674, + "step": 15511, + "time_per_iteration": 2.762432098388672 + }, + { + "auxiliary_loss_clip": 0.01110076, + "auxiliary_loss_mlp": 0.01032245, + "balance_loss_clip": 1.03735614, + "balance_loss_mlp": 1.02101564, + "epoch": 0.9326318953855404, + "flos": 26413581807360.0, + "grad_norm": 2.0631926987664477, + "language_loss": 0.7796672, + "learning_rate": 4.737872114856412e-08, + "loss": 0.80109036, + "num_input_tokens_seen": 334705205, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11224365, + "step": 15512, + "time_per_iteration": 2.6203722953796387 + }, + { + "auxiliary_loss_clip": 0.01108792, + "auxiliary_loss_mlp": 0.01024681, + "balance_loss_clip": 1.03703332, + "balance_loss_mlp": 1.01329052, + "epoch": 0.9326920186382083, + "flos": 31806028032480.0, + "grad_norm": 1.595099636065712, + "language_loss": 0.80598456, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.82731926, + "num_input_tokens_seen": 334723830, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.1138916, + "step": 15513, + "time_per_iteration": 2.6462807655334473 + }, + { + "auxiliary_loss_clip": 0.01115503, + "auxiliary_loss_mlp": 0.01028543, + "balance_loss_clip": 1.03903365, + "balance_loss_mlp": 1.016294, + "epoch": 0.9327521418908763, + "flos": 14711470979040.0, + "grad_norm": 2.2155871285736284, + "language_loss": 0.79868472, + "learning_rate": 4.721033078682768e-08, + "loss": 0.82012516, + "num_input_tokens_seen": 334740825, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12249756, + "step": 15514, + "time_per_iteration": 2.6242895126342773 + }, + { + "auxiliary_loss_clip": 0.0110678, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.037395, + "balance_loss_mlp": 1.02374887, + "epoch": 0.9328122651435443, + "flos": 53488715541120.0, + "grad_norm": 1.7599990915184938, + "language_loss": 0.71462089, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.73603284, + "num_input_tokens_seen": 334765825, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10656738, + "step": 15515, + "time_per_iteration": 2.851485013961792 + }, + { + "auxiliary_loss_clip": 0.01113939, + "auxiliary_loss_mlp": 0.01029748, + "balance_loss_clip": 1.0389154, + "balance_loss_mlp": 1.01796389, + "epoch": 0.9328723883962122, + "flos": 18540496598400.0, + "grad_norm": 2.775243703885276, + "language_loss": 0.81334579, + "learning_rate": 4.704223662500806e-08, + "loss": 0.83478266, + "num_input_tokens_seen": 334782680, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11779785, + "step": 15516, + "time_per_iteration": 2.5655949115753174 + }, + { + "auxiliary_loss_clip": 0.01110443, + "auxiliary_loss_mlp": 0.0102747, + "balance_loss_clip": 1.03698134, + "balance_loss_mlp": 1.01638377, + "epoch": 0.9329325116488802, + "flos": 24723847373760.0, + "grad_norm": 1.6827742340193366, + "language_loss": 0.80997133, + "learning_rate": 4.695830062703643e-08, + "loss": 0.83135045, + "num_input_tokens_seen": 334800160, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11071777, + "step": 15517, + "time_per_iteration": 2.739654541015625 + }, + { + "auxiliary_loss_clip": 0.01110712, + "auxiliary_loss_mlp": 0.01030555, + "balance_loss_clip": 1.03726244, + "balance_loss_mlp": 1.01863444, + "epoch": 0.9329926349015482, + "flos": 16002410461920.0, + "grad_norm": 2.236365923414965, + "language_loss": 0.74459958, + "learning_rate": 4.687443868860219e-08, + "loss": 0.76601219, + "num_input_tokens_seen": 334815840, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11907959, + "step": 15518, + "time_per_iteration": 2.5915846824645996 + }, + { + "auxiliary_loss_clip": 0.01109456, + "auxiliary_loss_mlp": 0.01034422, + "balance_loss_clip": 1.03791404, + "balance_loss_mlp": 1.02325821, + "epoch": 0.9330527581542162, + "flos": 28113850733760.0, + "grad_norm": 2.6930168004017077, + "language_loss": 0.75636977, + "learning_rate": 4.679065081288458e-08, + "loss": 0.77780855, + "num_input_tokens_seen": 334834735, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.1116333, + "step": 15519, + "time_per_iteration": 2.64490008354187 + }, + { + "auxiliary_loss_clip": 0.01107658, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.03614676, + "balance_loss_mlp": 1.02134955, + "epoch": 0.9331128814068841, + "flos": 18985515415200.0, + "grad_norm": 2.0843040317348724, + "language_loss": 0.8240574, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.84546399, + "num_input_tokens_seen": 334853490, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11651611, + "step": 15520, + "time_per_iteration": 4.054867744445801 + }, + { + "auxiliary_loss_clip": 0.01107643, + "auxiliary_loss_mlp": 0.01027384, + "balance_loss_clip": 1.03723145, + "balance_loss_mlp": 1.01642263, + "epoch": 0.9331730046595521, + "flos": 27176292681120.0, + "grad_norm": 2.0166206909144324, + "language_loss": 0.76557857, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.78692883, + "num_input_tokens_seen": 334873675, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10961914, + "step": 15521, + "time_per_iteration": 2.6077282428741455 + }, + { + "auxiliary_loss_clip": 0.01111822, + "auxiliary_loss_mlp": 0.01030687, + "balance_loss_clip": 1.04022956, + "balance_loss_mlp": 1.01948154, + "epoch": 0.93323312791222, + "flos": 19251635325120.0, + "grad_norm": 1.7539041517427378, + "language_loss": 0.77591681, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.79734188, + "num_input_tokens_seen": 334890970, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11206055, + "step": 15522, + "time_per_iteration": 2.6195766925811768 + }, + { + "auxiliary_loss_clip": 0.01109297, + "auxiliary_loss_mlp": 0.01025923, + "balance_loss_clip": 1.03682005, + "balance_loss_mlp": 1.01476467, + "epoch": 0.933293251164888, + "flos": 26858560106880.0, + "grad_norm": 2.6697351069734223, + "language_loss": 0.62820929, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.64956152, + "num_input_tokens_seen": 334906635, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11157227, + "step": 15523, + "time_per_iteration": 2.573151111602783 + }, + { + "auxiliary_loss_clip": 0.01107862, + "auxiliary_loss_mlp": 0.01029009, + "balance_loss_clip": 1.03699636, + "balance_loss_mlp": 1.01847124, + "epoch": 0.933353374417556, + "flos": 31763896411680.0, + "grad_norm": 1.7014609110103958, + "language_loss": 0.68412954, + "learning_rate": 4.63728224861577e-08, + "loss": 0.70549828, + "num_input_tokens_seen": 334926230, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10534668, + "step": 15524, + "time_per_iteration": 2.6784722805023193 + }, + { + "auxiliary_loss_clip": 0.01110339, + "auxiliary_loss_mlp": 0.01034002, + "balance_loss_clip": 1.03780913, + "balance_loss_mlp": 1.02224815, + "epoch": 0.933413497670224, + "flos": 30383466958080.0, + "grad_norm": 2.032199023598177, + "language_loss": 0.74107981, + "learning_rate": 4.628947905336589e-08, + "loss": 0.76252317, + "num_input_tokens_seen": 334946680, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11749268, + "step": 15525, + "time_per_iteration": 2.6397478580474854 + }, + { + "auxiliary_loss_clip": 0.01108182, + "auxiliary_loss_mlp": 0.0103486, + "balance_loss_clip": 1.03743672, + "balance_loss_mlp": 1.02380371, + "epoch": 0.9334736209228919, + "flos": 28905889767840.0, + "grad_norm": 1.8614548530485644, + "language_loss": 0.83764029, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.85907078, + "num_input_tokens_seen": 334964785, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.1105957, + "step": 15526, + "time_per_iteration": 2.6350975036621094 + }, + { + "auxiliary_loss_clip": 0.01111998, + "auxiliary_loss_mlp": 0.01028118, + "balance_loss_clip": 1.03867793, + "balance_loss_mlp": 1.01694846, + "epoch": 0.9335337441755599, + "flos": 18763066782720.0, + "grad_norm": 1.9859678110209043, + "language_loss": 0.68860805, + "learning_rate": 4.61230144456366e-08, + "loss": 0.71000922, + "num_input_tokens_seen": 334982400, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11169434, + "step": 15527, + "time_per_iteration": 2.5884907245635986 + }, + { + "auxiliary_loss_clip": 0.01112512, + "auxiliary_loss_mlp": 0.01026178, + "balance_loss_clip": 1.03862453, + "balance_loss_mlp": 1.01383352, + "epoch": 0.9335938674282279, + "flos": 19653144933600.0, + "grad_norm": 2.0911079539290296, + "language_loss": 0.64760929, + "learning_rate": 4.603989327701141e-08, + "loss": 0.66899616, + "num_input_tokens_seen": 334999685, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.12335205, + "step": 15528, + "time_per_iteration": 2.5974783897399902 + }, + { + "auxiliary_loss_clip": 0.01109282, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.03598869, + "balance_loss_mlp": 1.02174032, + "epoch": 0.9336539906808958, + "flos": 23126236534080.0, + "grad_norm": 1.8917995868365611, + "language_loss": 0.74404466, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.76547188, + "num_input_tokens_seen": 335019160, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11700439, + "step": 15529, + "time_per_iteration": 2.5872092247009277 + }, + { + "auxiliary_loss_clip": 0.01107611, + "auxiliary_loss_mlp": 0.01026703, + "balance_loss_clip": 1.03638577, + "balance_loss_mlp": 1.01620698, + "epoch": 0.9337141139335638, + "flos": 22097000577600.0, + "grad_norm": 1.8479900366261208, + "language_loss": 0.6317333, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.65307647, + "num_input_tokens_seen": 335037350, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10491943, + "step": 15530, + "time_per_iteration": 2.5958211421966553 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01027221, + "balance_loss_clip": 1.03728545, + "balance_loss_mlp": 1.01650965, + "epoch": 0.9337742371862318, + "flos": 21165195978720.0, + "grad_norm": 1.6787030377892922, + "language_loss": 0.72161376, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.74295676, + "num_input_tokens_seen": 335056060, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10699463, + "step": 15531, + "time_per_iteration": 5.378061294555664 + }, + { + "auxiliary_loss_clip": 0.01109649, + "auxiliary_loss_mlp": 0.01028752, + "balance_loss_clip": 1.03804672, + "balance_loss_mlp": 1.01715875, + "epoch": 0.9338343604388998, + "flos": 35458504747200.0, + "grad_norm": 1.923660212560177, + "language_loss": 0.70714128, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.72852528, + "num_input_tokens_seen": 335075410, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11602783, + "step": 15532, + "time_per_iteration": 2.6727306842803955 + }, + { + "auxiliary_loss_clip": 0.0111138, + "auxiliary_loss_mlp": 0.01031151, + "balance_loss_clip": 1.03752255, + "balance_loss_mlp": 1.01999879, + "epoch": 0.9338944836915677, + "flos": 22769937859680.0, + "grad_norm": 3.6004903432959825, + "language_loss": 0.73564565, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.75707096, + "num_input_tokens_seen": 335095190, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.1114502, + "step": 15533, + "time_per_iteration": 2.6670987606048584 + }, + { + "auxiliary_loss_clip": 0.01108057, + "auxiliary_loss_mlp": 0.01025815, + "balance_loss_clip": 1.03751183, + "balance_loss_mlp": 1.01497912, + "epoch": 0.9339546069442357, + "flos": 20276698001760.0, + "grad_norm": 2.1022509853761426, + "language_loss": 0.79867649, + "learning_rate": 4.554272235700507e-08, + "loss": 0.82001525, + "num_input_tokens_seen": 335113825, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.1083374, + "step": 15534, + "time_per_iteration": 2.5864415168762207 + }, + { + "auxiliary_loss_clip": 0.01103359, + "auxiliary_loss_mlp": 0.01024805, + "balance_loss_clip": 1.03696537, + "balance_loss_mlp": 1.01504803, + "epoch": 0.9340147301969036, + "flos": 28910873393280.0, + "grad_norm": 1.7083768353944382, + "language_loss": 0.74414426, + "learning_rate": 4.546011991495513e-08, + "loss": 0.76542592, + "num_input_tokens_seen": 335136425, + "router_z_loss_clip": 0.66357422, + "router_z_loss_mlp": 0.09759521, + "step": 15535, + "time_per_iteration": 2.667186737060547 + }, + { + "auxiliary_loss_clip": 0.01110921, + "auxiliary_loss_mlp": 0.01026518, + "balance_loss_clip": 1.03807521, + "balance_loss_mlp": 1.01525235, + "epoch": 0.9340748534495716, + "flos": 34965155165760.0, + "grad_norm": 1.9626547688745042, + "language_loss": 0.77823627, + "learning_rate": 4.537759158925292e-08, + "loss": 0.79961061, + "num_input_tokens_seen": 335157925, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.1126709, + "step": 15536, + "time_per_iteration": 2.6687867641448975 + }, + { + "auxiliary_loss_clip": 0.01106994, + "auxiliary_loss_mlp": 0.01027933, + "balance_loss_clip": 1.03601825, + "balance_loss_mlp": 1.01664996, + "epoch": 0.9341349767022396, + "flos": 30382940233440.0, + "grad_norm": 1.640328317873504, + "language_loss": 0.80567873, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.82702798, + "num_input_tokens_seen": 335177840, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11291504, + "step": 15537, + "time_per_iteration": 2.6453683376312256 + }, + { + "auxiliary_loss_clip": 0.01111435, + "auxiliary_loss_mlp": 0.01030636, + "balance_loss_clip": 1.03794837, + "balance_loss_mlp": 1.01976442, + "epoch": 0.9341950999549076, + "flos": 35459112506400.0, + "grad_norm": 1.8707902231539277, + "language_loss": 0.77709442, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.79851508, + "num_input_tokens_seen": 335199470, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.10870361, + "step": 15538, + "time_per_iteration": 2.69599986076355 + }, + { + "auxiliary_loss_clip": 0.0110712, + "auxiliary_loss_mlp": 0.01026757, + "balance_loss_clip": 1.03609204, + "balance_loss_mlp": 1.01570606, + "epoch": 0.9342552232075755, + "flos": 28781156113920.0, + "grad_norm": 2.578379645027032, + "language_loss": 0.73305178, + "learning_rate": 4.513045134151672e-08, + "loss": 0.7543906, + "num_input_tokens_seen": 335218885, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.1105957, + "step": 15539, + "time_per_iteration": 2.678504228591919 + }, + { + "auxiliary_loss_clip": 0.01109141, + "auxiliary_loss_mlp": 0.01028456, + "balance_loss_clip": 1.03798318, + "balance_loss_mlp": 1.01842451, + "epoch": 0.9343153464602435, + "flos": 44808721456320.0, + "grad_norm": 1.7326461846309693, + "language_loss": 0.64713418, + "learning_rate": 4.504821951247373e-08, + "loss": 0.66851014, + "num_input_tokens_seen": 335239485, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.1003418, + "step": 15540, + "time_per_iteration": 2.7932631969451904 + }, + { + "auxiliary_loss_clip": 0.01108145, + "auxiliary_loss_mlp": 0.01027323, + "balance_loss_clip": 1.03624773, + "balance_loss_mlp": 1.01606369, + "epoch": 0.9343754697129115, + "flos": 27133148128320.0, + "grad_norm": 1.7300165259012563, + "language_loss": 0.76832771, + "learning_rate": 4.496606181539864e-08, + "loss": 0.78968239, + "num_input_tokens_seen": 335258355, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11260986, + "step": 15541, + "time_per_iteration": 2.6621041297912598 + }, + { + "auxiliary_loss_clip": 0.01109737, + "auxiliary_loss_mlp": 0.01029704, + "balance_loss_clip": 1.03809285, + "balance_loss_mlp": 1.018749, + "epoch": 0.9344355929655794, + "flos": 36253825680960.0, + "grad_norm": 2.03415537270056, + "language_loss": 0.66842484, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.68981922, + "num_input_tokens_seen": 335276835, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10961914, + "step": 15542, + "time_per_iteration": 2.6748600006103516 + }, + { + "auxiliary_loss_clip": 0.01108227, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.03631544, + "balance_loss_mlp": 1.01714563, + "epoch": 0.9344957162182475, + "flos": 23037718978080.0, + "grad_norm": 1.896301046859053, + "language_loss": 0.69727606, + "learning_rate": 4.480196882960907e-08, + "loss": 0.71864426, + "num_input_tokens_seen": 335296220, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11444092, + "step": 15543, + "time_per_iteration": 2.6383109092712402 + }, + { + "auxiliary_loss_clip": 0.01112049, + "auxiliary_loss_mlp": 0.01032643, + "balance_loss_clip": 1.0373528, + "balance_loss_mlp": 1.02048397, + "epoch": 0.9345558394709154, + "flos": 33455697226560.0, + "grad_norm": 1.9159806793864258, + "language_loss": 0.70282733, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.72427428, + "num_input_tokens_seen": 335316335, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.121521, + "step": 15544, + "time_per_iteration": 3.978583335876465 + }, + { + "auxiliary_loss_clip": 0.01110247, + "auxiliary_loss_mlp": 0.010362, + "balance_loss_clip": 1.03718078, + "balance_loss_mlp": 1.02538157, + "epoch": 0.9346159627235834, + "flos": 25308510203520.0, + "grad_norm": 2.672257345992321, + "language_loss": 0.77186191, + "learning_rate": 4.463817240903789e-08, + "loss": 0.79332638, + "num_input_tokens_seen": 335335545, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.10821533, + "step": 15545, + "time_per_iteration": 2.624403238296509 + }, + { + "auxiliary_loss_clip": 0.01111362, + "auxiliary_loss_mlp": 0.01026878, + "balance_loss_clip": 1.03816664, + "balance_loss_mlp": 1.01646531, + "epoch": 0.9346760859762513, + "flos": 25708723259040.0, + "grad_norm": 1.722508564629324, + "language_loss": 0.68931234, + "learning_rate": 4.455638541847495e-08, + "loss": 0.71069467, + "num_input_tokens_seen": 335355350, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.10406494, + "step": 15546, + "time_per_iteration": 2.6554453372955322 + }, + { + "auxiliary_loss_clip": 0.01104541, + "auxiliary_loss_mlp": 0.01028807, + "balance_loss_clip": 1.03603911, + "balance_loss_mlp": 1.01838768, + "epoch": 0.9347362092289193, + "flos": 35636593308480.0, + "grad_norm": 2.2430413916461007, + "language_loss": 0.82508624, + "learning_rate": 4.447467257852966e-08, + "loss": 0.84641969, + "num_input_tokens_seen": 335375160, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.10418701, + "step": 15547, + "time_per_iteration": 2.7636890411376953 + }, + { + "auxiliary_loss_clip": 0.01105118, + "auxiliary_loss_mlp": 0.01029462, + "balance_loss_clip": 1.03498912, + "balance_loss_mlp": 1.01901889, + "epoch": 0.9347963324815872, + "flos": 23705186427360.0, + "grad_norm": 2.0447748819448694, + "language_loss": 0.83456159, + "learning_rate": 4.439303389230087e-08, + "loss": 0.85590744, + "num_input_tokens_seen": 335394080, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10449219, + "step": 15548, + "time_per_iteration": 2.574352979660034 + }, + { + "auxiliary_loss_clip": 0.0111411, + "auxiliary_loss_mlp": 0.01029981, + "balance_loss_clip": 1.0383631, + "balance_loss_mlp": 1.01783359, + "epoch": 0.9348564557342552, + "flos": 45029103707520.0, + "grad_norm": 1.6034010908272613, + "language_loss": 0.65509832, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.67653924, + "num_input_tokens_seen": 335414230, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.12145996, + "step": 15549, + "time_per_iteration": 2.7670624256134033 + }, + { + "auxiliary_loss_clip": 0.01110838, + "auxiliary_loss_mlp": 0.01031772, + "balance_loss_clip": 1.03825402, + "balance_loss_mlp": 1.02032828, + "epoch": 0.9349165789869232, + "flos": 26466328955520.0, + "grad_norm": 12.177407959484457, + "language_loss": 0.80505234, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.82647842, + "num_input_tokens_seen": 335432890, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11444092, + "step": 15550, + "time_per_iteration": 2.5929582118988037 + }, + { + "auxiliary_loss_clip": 0.01109422, + "auxiliary_loss_mlp": 0.01026744, + "balance_loss_clip": 1.03854442, + "balance_loss_mlp": 1.01595545, + "epoch": 0.9349767022395912, + "flos": 22993358906880.0, + "grad_norm": 1.7910002544023582, + "language_loss": 0.75469446, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.77605611, + "num_input_tokens_seen": 335452085, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10778809, + "step": 15551, + "time_per_iteration": 2.6565706729888916 + }, + { + "auxiliary_loss_clip": 0.01105844, + "auxiliary_loss_mlp": 0.01031582, + "balance_loss_clip": 1.03685153, + "balance_loss_mlp": 1.02190852, + "epoch": 0.9350368254922591, + "flos": 30472916411520.0, + "grad_norm": 1.6108796955567566, + "language_loss": 0.73633641, + "learning_rate": 4.406722074642255e-08, + "loss": 0.7577107, + "num_input_tokens_seen": 335472130, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.09674072, + "step": 15552, + "time_per_iteration": 2.6413285732269287 + }, + { + "auxiliary_loss_clip": 0.01108252, + "auxiliary_loss_mlp": 0.01032692, + "balance_loss_clip": 1.03674185, + "balance_loss_mlp": 1.02156365, + "epoch": 0.9350969487449271, + "flos": 28151444419200.0, + "grad_norm": 1.8407367609928886, + "language_loss": 0.77264988, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.79405934, + "num_input_tokens_seen": 335489970, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11120605, + "step": 15553, + "time_per_iteration": 2.649404764175415 + }, + { + "auxiliary_loss_clip": 0.01113442, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.03913844, + "balance_loss_mlp": 1.02147007, + "epoch": 0.9351570719975951, + "flos": 22726266582240.0, + "grad_norm": 1.8334826609872357, + "language_loss": 0.78086102, + "learning_rate": 4.390475917613723e-08, + "loss": 0.80232561, + "num_input_tokens_seen": 335509125, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11541748, + "step": 15554, + "time_per_iteration": 2.623441457748413 + }, + { + "auxiliary_loss_clip": 0.01103598, + "auxiliary_loss_mlp": 0.01022821, + "balance_loss_clip": 1.0356071, + "balance_loss_mlp": 1.01343977, + "epoch": 0.935217195250263, + "flos": 19386619850880.0, + "grad_norm": 1.7032874640882993, + "language_loss": 0.69104546, + "learning_rate": 4.382363965244695e-08, + "loss": 0.71230972, + "num_input_tokens_seen": 335525620, + "router_z_loss_clip": 0.67871094, + "router_z_loss_mlp": 0.09387207, + "step": 15555, + "time_per_iteration": 2.6484782695770264 + }, + { + "auxiliary_loss_clip": 0.01107247, + "auxiliary_loss_mlp": 0.01035783, + "balance_loss_clip": 1.03642964, + "balance_loss_mlp": 1.02477968, + "epoch": 0.935277318502931, + "flos": 29761291477440.0, + "grad_norm": 1.5676328266638417, + "language_loss": 0.75781929, + "learning_rate": 4.374259430715965e-08, + "loss": 0.77924961, + "num_input_tokens_seen": 335547565, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10998535, + "step": 15556, + "time_per_iteration": 2.676917314529419 + }, + { + "auxiliary_loss_clip": 0.01108516, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.03672028, + "balance_loss_mlp": 1.02165771, + "epoch": 0.935337441755599, + "flos": 33678388962720.0, + "grad_norm": 2.0786853489402004, + "language_loss": 0.72334456, + "learning_rate": 4.366162314334953e-08, + "loss": 0.74474978, + "num_input_tokens_seen": 335570285, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10345459, + "step": 15557, + "time_per_iteration": 2.696399450302124 + }, + { + "auxiliary_loss_clip": 0.01109376, + "auxiliary_loss_mlp": 0.01031267, + "balance_loss_clip": 1.03701806, + "balance_loss_mlp": 1.01994228, + "epoch": 0.935397565008267, + "flos": 24993451769760.0, + "grad_norm": 1.650419559156566, + "language_loss": 0.63589031, + "learning_rate": 4.358072616408681e-08, + "loss": 0.65729672, + "num_input_tokens_seen": 335588600, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11322021, + "step": 15558, + "time_per_iteration": 2.6437504291534424 + }, + { + "auxiliary_loss_clip": 0.0110914, + "auxiliary_loss_mlp": 0.01029053, + "balance_loss_clip": 1.03730404, + "balance_loss_mlp": 1.01709032, + "epoch": 0.9354576882609349, + "flos": 28863839181600.0, + "grad_norm": 1.8875934992101933, + "language_loss": 0.72905016, + "learning_rate": 4.34999033724388e-08, + "loss": 0.75043207, + "num_input_tokens_seen": 335606235, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11968994, + "step": 15559, + "time_per_iteration": 2.6329197883605957 + }, + { + "auxiliary_loss_clip": 0.01106391, + "auxiliary_loss_mlp": 0.01025672, + "balance_loss_clip": 1.03690636, + "balance_loss_mlp": 1.01587903, + "epoch": 0.9355178115136029, + "flos": 44764199316000.0, + "grad_norm": 1.5551428446529567, + "language_loss": 0.6377055, + "learning_rate": 4.341915477147062e-08, + "loss": 0.65902615, + "num_input_tokens_seen": 335628240, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.09802246, + "step": 15560, + "time_per_iteration": 4.232922554016113 + }, + { + "auxiliary_loss_clip": 0.01117796, + "auxiliary_loss_mlp": 0.01035008, + "balance_loss_clip": 1.03963304, + "balance_loss_mlp": 1.02210391, + "epoch": 0.9355779347662708, + "flos": 17644583959200.0, + "grad_norm": 2.1851915699781324, + "language_loss": 0.63650203, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.65803015, + "num_input_tokens_seen": 335643755, + "router_z_loss_clip": 0.78125, + "router_z_loss_mlp": 0.12908936, + "step": 15561, + "time_per_iteration": 2.5681605339050293 + }, + { + "auxiliary_loss_clip": 0.01110604, + "auxiliary_loss_mlp": 0.01034404, + "balance_loss_clip": 1.03952885, + "balance_loss_mlp": 1.02280474, + "epoch": 0.9356380580189388, + "flos": 28291696191360.0, + "grad_norm": 2.841693952408097, + "language_loss": 0.75142169, + "learning_rate": 4.325788015381859e-08, + "loss": 0.77287173, + "num_input_tokens_seen": 335665160, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.1159668, + "step": 15562, + "time_per_iteration": 2.6207642555236816 + }, + { + "auxiliary_loss_clip": 0.0102718, + "auxiliary_loss_mlp": 0.01000866, + "balance_loss_clip": 1.00481534, + "balance_loss_mlp": 0.99997371, + "epoch": 0.9356981812716068, + "flos": 82329830879040.0, + "grad_norm": 0.9388014252317026, + "language_loss": 0.62290573, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64318621, + "num_input_tokens_seen": 335715240, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00892639, + "step": 15563, + "time_per_iteration": 3.0925257205963135 + }, + { + "auxiliary_loss_clip": 0.01106915, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.03651798, + "balance_loss_mlp": 1.02124143, + "epoch": 0.9357583045242748, + "flos": 30116820323520.0, + "grad_norm": 1.9098907803375744, + "language_loss": 0.78228676, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.80367678, + "num_input_tokens_seen": 335734970, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10845947, + "step": 15564, + "time_per_iteration": 2.6350033283233643 + }, + { + "auxiliary_loss_clip": 0.01109762, + "auxiliary_loss_mlp": 0.01027084, + "balance_loss_clip": 1.03611994, + "balance_loss_mlp": 1.01461506, + "epoch": 0.9358184277769427, + "flos": 23749587015840.0, + "grad_norm": 2.4672735746379915, + "language_loss": 0.78309584, + "learning_rate": 4.301652473389694e-08, + "loss": 0.80446434, + "num_input_tokens_seen": 335753435, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12463379, + "step": 15565, + "time_per_iteration": 2.612150192260742 + }, + { + "auxiliary_loss_clip": 0.01106296, + "auxiliary_loss_mlp": 0.01028109, + "balance_loss_clip": 1.03635764, + "balance_loss_mlp": 1.01782751, + "epoch": 0.9358785510296107, + "flos": 23082362670240.0, + "grad_norm": 2.1528294868493423, + "language_loss": 0.72025853, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.74160254, + "num_input_tokens_seen": 335772105, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.1027832, + "step": 15566, + "time_per_iteration": 2.611224889755249 + }, + { + "auxiliary_loss_clip": 0.01107669, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.03549218, + "balance_loss_mlp": 1.01839185, + "epoch": 0.9359386742822787, + "flos": 28603229621760.0, + "grad_norm": 2.8419778408292564, + "language_loss": 0.6796838, + "learning_rate": 4.285599216057889e-08, + "loss": 0.70105064, + "num_input_tokens_seen": 335789125, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10626221, + "step": 15567, + "time_per_iteration": 2.64973783493042 + }, + { + "auxiliary_loss_clip": 0.01109559, + "auxiliary_loss_mlp": 0.01030468, + "balance_loss_clip": 1.03782439, + "balance_loss_mlp": 1.01953089, + "epoch": 0.9359987975349466, + "flos": 39955321954080.0, + "grad_norm": 2.5083092159047347, + "language_loss": 0.62174338, + "learning_rate": 4.277583719504418e-08, + "loss": 0.64314365, + "num_input_tokens_seen": 335810995, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.109375, + "step": 15568, + "time_per_iteration": 2.6893396377563477 + }, + { + "auxiliary_loss_clip": 0.01107979, + "auxiliary_loss_mlp": 0.01032343, + "balance_loss_clip": 1.03610945, + "balance_loss_mlp": 1.02151871, + "epoch": 0.9360589207876147, + "flos": 27844813579680.0, + "grad_norm": 1.7901984806849638, + "language_loss": 0.78549653, + "learning_rate": 4.269575644764556e-08, + "loss": 0.80689979, + "num_input_tokens_seen": 335830580, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.1081543, + "step": 15569, + "time_per_iteration": 2.6629743576049805 + }, + { + "auxiliary_loss_clip": 0.01112807, + "auxiliary_loss_mlp": 0.01033682, + "balance_loss_clip": 1.03897524, + "balance_loss_mlp": 1.02229166, + "epoch": 0.9361190440402826, + "flos": 25484370314400.0, + "grad_norm": 2.6285472178870415, + "language_loss": 0.69504094, + "learning_rate": 4.261574992142014e-08, + "loss": 0.71650577, + "num_input_tokens_seen": 335846515, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11383057, + "step": 15570, + "time_per_iteration": 4.1206018924713135 + }, + { + "auxiliary_loss_clip": 0.01108951, + "auxiliary_loss_mlp": 0.01029041, + "balance_loss_clip": 1.03633273, + "balance_loss_mlp": 1.01800251, + "epoch": 0.9361791672929506, + "flos": 23571538971840.0, + "grad_norm": 2.3353696048564454, + "language_loss": 0.7924763, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.81385624, + "num_input_tokens_seen": 335863350, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.1104126, + "step": 15571, + "time_per_iteration": 3.953484535217285 + }, + { + "auxiliary_loss_clip": 0.01107932, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.03618288, + "balance_loss_mlp": 1.01766312, + "epoch": 0.9362392905456185, + "flos": 18495650319840.0, + "grad_norm": 2.750564239478883, + "language_loss": 0.77606297, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.79743135, + "num_input_tokens_seen": 335880510, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11230469, + "step": 15572, + "time_per_iteration": 2.614025115966797 + }, + { + "auxiliary_loss_clip": 0.01107223, + "auxiliary_loss_mlp": 0.01037209, + "balance_loss_clip": 1.03757083, + "balance_loss_mlp": 1.02645624, + "epoch": 0.9362994137982865, + "flos": 26949792320640.0, + "grad_norm": 1.8070872760660157, + "language_loss": 0.77850437, + "learning_rate": 4.237617570010688e-08, + "loss": 0.79994863, + "num_input_tokens_seen": 335899440, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.10748291, + "step": 15573, + "time_per_iteration": 2.7015504837036133 + }, + { + "auxiliary_loss_clip": 0.01105584, + "auxiliary_loss_mlp": 0.01027028, + "balance_loss_clip": 1.03646398, + "balance_loss_mlp": 1.01649559, + "epoch": 0.9363595370509544, + "flos": 28690045452000.0, + "grad_norm": 1.724249219939581, + "language_loss": 0.74489278, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.7662189, + "num_input_tokens_seen": 335919540, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10528564, + "step": 15574, + "time_per_iteration": 2.665461301803589 + }, + { + "auxiliary_loss_clip": 0.01106396, + "auxiliary_loss_mlp": 0.01032739, + "balance_loss_clip": 1.03665304, + "balance_loss_mlp": 1.02173638, + "epoch": 0.9364196603036224, + "flos": 33096805446240.0, + "grad_norm": 1.8702576875718342, + "language_loss": 0.68492943, + "learning_rate": 4.221683071397564e-08, + "loss": 0.70632076, + "num_input_tokens_seen": 335939665, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.11010742, + "step": 15575, + "time_per_iteration": 2.646869659423828 + }, + { + "auxiliary_loss_clip": 0.01107686, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.03773284, + "balance_loss_mlp": 1.02253902, + "epoch": 0.9364797835562904, + "flos": 22547975434560.0, + "grad_norm": 7.0731404827026285, + "language_loss": 0.65237296, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.67378777, + "num_input_tokens_seen": 335958580, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11242676, + "step": 15576, + "time_per_iteration": 2.64282488822937 + }, + { + "auxiliary_loss_clip": 0.0110833, + "auxiliary_loss_mlp": 0.01026355, + "balance_loss_clip": 1.03477538, + "balance_loss_mlp": 1.01438022, + "epoch": 0.9365399068089584, + "flos": 15868844040960.0, + "grad_norm": 2.462268207563084, + "language_loss": 0.75900137, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.78034818, + "num_input_tokens_seen": 335974965, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11975098, + "step": 15577, + "time_per_iteration": 2.6329832077026367 + }, + { + "auxiliary_loss_clip": 0.01107951, + "auxiliary_loss_mlp": 0.01028015, + "balance_loss_clip": 1.03539324, + "balance_loss_mlp": 1.01675558, + "epoch": 0.9366000300616263, + "flos": 31318593973920.0, + "grad_norm": 2.9003730787416884, + "language_loss": 0.52730882, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.5486685, + "num_input_tokens_seen": 335996575, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11242676, + "step": 15578, + "time_per_iteration": 2.6791908740997314 + }, + { + "auxiliary_loss_clip": 0.01107688, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.03729188, + "balance_loss_mlp": 1.02113247, + "epoch": 0.9366601533142943, + "flos": 26153134316640.0, + "grad_norm": 1.8351215765291098, + "language_loss": 0.70804137, + "learning_rate": 4.189903163783692e-08, + "loss": 0.72943848, + "num_input_tokens_seen": 336017265, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10888672, + "step": 15579, + "time_per_iteration": 2.6579036712646484 + }, + { + "auxiliary_loss_clip": 0.01107529, + "auxiliary_loss_mlp": 0.01026308, + "balance_loss_clip": 1.03737617, + "balance_loss_mlp": 1.01574564, + "epoch": 0.9367202765669622, + "flos": 29399239349280.0, + "grad_norm": 2.1242519860571867, + "language_loss": 0.76611668, + "learning_rate": 4.181976748973959e-08, + "loss": 0.78745502, + "num_input_tokens_seen": 336035905, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10565186, + "step": 15580, + "time_per_iteration": 2.610196113586426 + }, + { + "auxiliary_loss_clip": 0.01112425, + "auxiliary_loss_mlp": 0.01030698, + "balance_loss_clip": 1.03811073, + "balance_loss_mlp": 1.01898539, + "epoch": 0.9367803998196302, + "flos": 25488422042400.0, + "grad_norm": 1.8421674936177899, + "language_loss": 0.66145658, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.68288779, + "num_input_tokens_seen": 336055585, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.1171875, + "step": 15581, + "time_per_iteration": 2.6205475330352783 + }, + { + "auxiliary_loss_clip": 0.01109888, + "auxiliary_loss_mlp": 0.01026998, + "balance_loss_clip": 1.03769684, + "balance_loss_mlp": 1.01551187, + "epoch": 0.9368405230722983, + "flos": 27533280149280.0, + "grad_norm": 1.5961002064877423, + "language_loss": 0.76221895, + "learning_rate": 4.166146195972042e-08, + "loss": 0.78358781, + "num_input_tokens_seen": 336076695, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.1149292, + "step": 15582, + "time_per_iteration": 2.6163411140441895 + }, + { + "auxiliary_loss_clip": 0.01107801, + "auxiliary_loss_mlp": 0.01031169, + "balance_loss_clip": 1.03712821, + "balance_loss_mlp": 1.01998138, + "epoch": 0.9369006463249662, + "flos": 23037516391680.0, + "grad_norm": 2.789136106048784, + "language_loss": 0.73602068, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.75741047, + "num_input_tokens_seen": 336094740, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11193848, + "step": 15583, + "time_per_iteration": 3.9640955924987793 + }, + { + "auxiliary_loss_clip": 0.01114161, + "auxiliary_loss_mlp": 0.01031754, + "balance_loss_clip": 1.03877699, + "balance_loss_mlp": 1.02014923, + "epoch": 0.9369607695776342, + "flos": 32253437368800.0, + "grad_norm": 2.574540020583964, + "language_loss": 0.83919585, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.86065495, + "num_input_tokens_seen": 336113985, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.1159668, + "step": 15584, + "time_per_iteration": 2.687748432159424 + }, + { + "auxiliary_loss_clip": 0.01114203, + "auxiliary_loss_mlp": 0.01035864, + "balance_loss_clip": 1.03868699, + "balance_loss_mlp": 1.02349615, + "epoch": 0.9370208928303021, + "flos": 48281569953120.0, + "grad_norm": 1.757264000548427, + "language_loss": 0.723068, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.74456865, + "num_input_tokens_seen": 336136395, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.1237793, + "step": 15585, + "time_per_iteration": 2.882443428039551 + }, + { + "auxiliary_loss_clip": 0.01104291, + "auxiliary_loss_mlp": 0.01025219, + "balance_loss_clip": 1.03432739, + "balance_loss_mlp": 1.01483536, + "epoch": 0.9370810160829701, + "flos": 28020106448640.0, + "grad_norm": 1.8668498277567418, + "language_loss": 0.80302376, + "learning_rate": 4.134574204836316e-08, + "loss": 0.82431883, + "num_input_tokens_seen": 336156345, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10375977, + "step": 15586, + "time_per_iteration": 2.675232410430908 + }, + { + "auxiliary_loss_clip": 0.01109889, + "auxiliary_loss_mlp": 0.01030326, + "balance_loss_clip": 1.03802514, + "balance_loss_mlp": 1.01902485, + "epoch": 0.937141139335638, + "flos": 28156468561920.0, + "grad_norm": 4.770717769042582, + "language_loss": 0.7651304, + "learning_rate": 4.126699774396258e-08, + "loss": 0.78653252, + "num_input_tokens_seen": 336176760, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11303711, + "step": 15587, + "time_per_iteration": 2.6062276363372803 + }, + { + "auxiliary_loss_clip": 0.01112803, + "auxiliary_loss_mlp": 0.01030232, + "balance_loss_clip": 1.03832841, + "balance_loss_mlp": 1.01869261, + "epoch": 0.937201262588306, + "flos": 19958641289280.0, + "grad_norm": 2.0661269960594755, + "language_loss": 0.8788653, + "learning_rate": 4.118832771491387e-08, + "loss": 0.90029567, + "num_input_tokens_seen": 336193285, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11535645, + "step": 15588, + "time_per_iteration": 2.615537166595459 + }, + { + "auxiliary_loss_clip": 0.01105607, + "auxiliary_loss_mlp": 0.01025731, + "balance_loss_clip": 1.03660917, + "balance_loss_mlp": 1.01543677, + "epoch": 0.937261385840974, + "flos": 24640759133280.0, + "grad_norm": 1.8013406059048886, + "language_loss": 0.78203034, + "learning_rate": 4.11097319642002e-08, + "loss": 0.80334365, + "num_input_tokens_seen": 336211425, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.10290527, + "step": 15589, + "time_per_iteration": 2.5803775787353516 + }, + { + "auxiliary_loss_clip": 0.0110645, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.03662634, + "balance_loss_mlp": 1.02099991, + "epoch": 0.937321509093642, + "flos": 22324594904640.0, + "grad_norm": 1.7715050210385914, + "language_loss": 0.77281737, + "learning_rate": 4.103121049480163e-08, + "loss": 0.79420257, + "num_input_tokens_seen": 336230205, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11071777, + "step": 15590, + "time_per_iteration": 2.604673147201538 + }, + { + "auxiliary_loss_clip": 0.01112401, + "auxiliary_loss_mlp": 0.0103633, + "balance_loss_clip": 1.03752398, + "balance_loss_mlp": 1.02383101, + "epoch": 0.9373816323463099, + "flos": 31584956987520.0, + "grad_norm": 1.7897274102364151, + "language_loss": 0.71634406, + "learning_rate": 4.095276330969577e-08, + "loss": 0.73783135, + "num_input_tokens_seen": 336252440, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12506104, + "step": 15591, + "time_per_iteration": 2.642792224884033 + }, + { + "auxiliary_loss_clip": 0.01114819, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.03996301, + "balance_loss_mlp": 1.01877344, + "epoch": 0.9374417555989779, + "flos": 33188564384640.0, + "grad_norm": 2.7079289782039195, + "language_loss": 0.53892362, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.5603832, + "num_input_tokens_seen": 336273845, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.12347412, + "step": 15592, + "time_per_iteration": 2.680734872817993 + }, + { + "auxiliary_loss_clip": 0.01107368, + "auxiliary_loss_mlp": 0.01027415, + "balance_loss_clip": 1.03621531, + "balance_loss_mlp": 1.01678705, + "epoch": 0.9375018788516458, + "flos": 28823085148320.0, + "grad_norm": 1.4828869393714932, + "language_loss": 0.67354137, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.69488919, + "num_input_tokens_seen": 336292790, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10632324, + "step": 15593, + "time_per_iteration": 2.621912956237793 + }, + { + "auxiliary_loss_clip": 0.01108943, + "auxiliary_loss_mlp": 0.01025812, + "balance_loss_clip": 1.036309, + "balance_loss_mlp": 1.0149461, + "epoch": 0.9375620021043138, + "flos": 27672437954880.0, + "grad_norm": 1.6906720927517782, + "language_loss": 0.74191916, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.76326668, + "num_input_tokens_seen": 336312600, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.10870361, + "step": 15594, + "time_per_iteration": 2.6770284175872803 + }, + { + "auxiliary_loss_clip": 0.01105038, + "auxiliary_loss_mlp": 0.01029277, + "balance_loss_clip": 1.03552246, + "balance_loss_mlp": 1.01925778, + "epoch": 0.9376221253569819, + "flos": 33629896128960.0, + "grad_norm": 1.6587558622521084, + "language_loss": 0.73711741, + "learning_rate": 4.063971747165351e-08, + "loss": 0.75846058, + "num_input_tokens_seen": 336332770, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10028076, + "step": 15595, + "time_per_iteration": 2.685621738433838 + }, + { + "auxiliary_loss_clip": 0.01109867, + "auxiliary_loss_mlp": 0.01026821, + "balance_loss_clip": 1.03656733, + "balance_loss_mlp": 1.01616383, + "epoch": 0.9376822486096498, + "flos": 29443964076000.0, + "grad_norm": 2.399994270034324, + "language_loss": 0.76251549, + "learning_rate": 4.056164175257626e-08, + "loss": 0.78388238, + "num_input_tokens_seen": 336351445, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.10662842, + "step": 15596, + "time_per_iteration": 2.6730337142944336 + }, + { + "auxiliary_loss_clip": 0.011087, + "auxiliary_loss_mlp": 0.0102986, + "balance_loss_clip": 1.0368005, + "balance_loss_mlp": 1.01898789, + "epoch": 0.9377423718623178, + "flos": 27801263854080.0, + "grad_norm": 2.435928308492425, + "language_loss": 0.7867434, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.80812895, + "num_input_tokens_seen": 336368690, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10864258, + "step": 15597, + "time_per_iteration": 2.624967098236084 + }, + { + "auxiliary_loss_clip": 0.01114378, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.03905737, + "balance_loss_mlp": 1.01946354, + "epoch": 0.9378024951149857, + "flos": 23390087476320.0, + "grad_norm": 1.7912832976524748, + "language_loss": 0.81192493, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.83337879, + "num_input_tokens_seen": 336388165, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.11523438, + "step": 15598, + "time_per_iteration": 2.6536881923675537 + }, + { + "auxiliary_loss_clip": 0.01113573, + "auxiliary_loss_mlp": 0.01028846, + "balance_loss_clip": 1.037112, + "balance_loss_mlp": 1.01743221, + "epoch": 0.9378626183676537, + "flos": 28682428203360.0, + "grad_norm": 2.0195058926985814, + "language_loss": 0.62701434, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.64843857, + "num_input_tokens_seen": 336406475, + "router_z_loss_clip": 0.76464844, + "router_z_loss_mlp": 0.11413574, + "step": 15599, + "time_per_iteration": 4.227042198181152 + }, + { + "auxiliary_loss_clip": 0.01110007, + "auxiliary_loss_mlp": 0.01026609, + "balance_loss_clip": 1.03719032, + "balance_loss_mlp": 1.01564157, + "epoch": 0.9379227416203216, + "flos": 22458444946560.0, + "grad_norm": 2.3290362732350762, + "language_loss": 0.73408902, + "learning_rate": 4.0250081926821e-08, + "loss": 0.75545514, + "num_input_tokens_seen": 336424690, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10961914, + "step": 15600, + "time_per_iteration": 2.5957179069519043 + }, + { + "auxiliary_loss_clip": 0.01107494, + "auxiliary_loss_mlp": 0.01027296, + "balance_loss_clip": 1.03718519, + "balance_loss_mlp": 1.01694286, + "epoch": 0.9379828648729897, + "flos": 21746050184160.0, + "grad_norm": 2.016035745292346, + "language_loss": 0.69463044, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.71597832, + "num_input_tokens_seen": 336443055, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10345459, + "step": 15601, + "time_per_iteration": 2.6469361782073975 + }, + { + "auxiliary_loss_clip": 0.01027049, + "auxiliary_loss_mlp": 0.01000883, + "balance_loss_clip": 1.00469077, + "balance_loss_mlp": 0.99994111, + "epoch": 0.9380429881256576, + "flos": 83003862127680.0, + "grad_norm": 0.751823930327589, + "language_loss": 0.58112407, + "learning_rate": 4.009474788561573e-08, + "loss": 0.60140342, + "num_input_tokens_seen": 336510190, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00940704, + "step": 15602, + "time_per_iteration": 3.4148027896881104 + }, + { + "auxiliary_loss_clip": 0.01109705, + "auxiliary_loss_mlp": 0.01030817, + "balance_loss_clip": 1.0362258, + "balance_loss_mlp": 1.02019525, + "epoch": 0.9381031113783256, + "flos": 25352140963680.0, + "grad_norm": 2.2457493675444775, + "language_loss": 0.71467602, + "learning_rate": 4.001719234324663e-08, + "loss": 0.73608124, + "num_input_tokens_seen": 336529250, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10620117, + "step": 15603, + "time_per_iteration": 2.615548610687256 + }, + { + "auxiliary_loss_clip": 0.01099871, + "auxiliary_loss_mlp": 0.01025032, + "balance_loss_clip": 1.03323197, + "balance_loss_mlp": 1.01496506, + "epoch": 0.9381632346309935, + "flos": 23214389434560.0, + "grad_norm": 1.7625582807766316, + "language_loss": 0.7592324, + "learning_rate": 3.993971112362171e-08, + "loss": 0.78048146, + "num_input_tokens_seen": 336548530, + "router_z_loss_clip": 0.66699219, + "router_z_loss_mlp": 0.10058594, + "step": 15604, + "time_per_iteration": 2.62554669380188 + }, + { + "auxiliary_loss_clip": 0.01111018, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.03747141, + "balance_loss_mlp": 1.0193131, + "epoch": 0.9382233578836615, + "flos": 28691828212320.0, + "grad_norm": 2.651884186272918, + "language_loss": 0.65773153, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.67915064, + "num_input_tokens_seen": 336568510, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11578369, + "step": 15605, + "time_per_iteration": 2.617792844772339 + }, + { + "auxiliary_loss_clip": 0.01114409, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.03837526, + "balance_loss_mlp": 1.0203377, + "epoch": 0.9382834811363294, + "flos": 52553385938880.0, + "grad_norm": 1.7388592958799005, + "language_loss": 0.67652154, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.69798619, + "num_input_tokens_seen": 336592020, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.11706543, + "step": 15606, + "time_per_iteration": 2.8120315074920654 + }, + { + "auxiliary_loss_clip": 0.01103849, + "auxiliary_loss_mlp": 0.01024084, + "balance_loss_clip": 1.03514886, + "balance_loss_mlp": 1.01376677, + "epoch": 0.9383436043889974, + "flos": 20055140749440.0, + "grad_norm": 1.860953781706671, + "language_loss": 0.77397037, + "learning_rate": 3.970771343058166e-08, + "loss": 0.7952497, + "num_input_tokens_seen": 336610010, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.10314941, + "step": 15607, + "time_per_iteration": 2.582397699356079 + }, + { + "auxiliary_loss_clip": 0.01109914, + "auxiliary_loss_mlp": 0.01028474, + "balance_loss_clip": 1.03728414, + "balance_loss_mlp": 1.01822782, + "epoch": 0.9384037276416655, + "flos": 25307740375200.0, + "grad_norm": 3.488129729146492, + "language_loss": 0.82862246, + "learning_rate": 3.963052953128776e-08, + "loss": 0.85000634, + "num_input_tokens_seen": 336628520, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10247803, + "step": 15608, + "time_per_iteration": 2.6226861476898193 + }, + { + "auxiliary_loss_clip": 0.01113304, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.04106963, + "balance_loss_mlp": 1.0215981, + "epoch": 0.9384638508943334, + "flos": 23260167610560.0, + "grad_norm": 19.129454773146843, + "language_loss": 0.68749237, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.70895791, + "num_input_tokens_seen": 336647365, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11663818, + "step": 15609, + "time_per_iteration": 2.6074490547180176 + }, + { + "auxiliary_loss_clip": 0.01111358, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.03602672, + "balance_loss_mlp": 1.01775908, + "epoch": 0.9385239741470014, + "flos": 28558829033280.0, + "grad_norm": 2.2886785547466904, + "language_loss": 0.74991125, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.77131975, + "num_input_tokens_seen": 336667165, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11749268, + "step": 15610, + "time_per_iteration": 4.127153158187866 + }, + { + "auxiliary_loss_clip": 0.01109594, + "auxiliary_loss_mlp": 0.01026914, + "balance_loss_clip": 1.03691494, + "balance_loss_mlp": 1.0166198, + "epoch": 0.9385840973996693, + "flos": 15648056616960.0, + "grad_norm": 3.1982578359230622, + "language_loss": 0.74739027, + "learning_rate": 3.939942386953987e-08, + "loss": 0.76875532, + "num_input_tokens_seen": 336684130, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.10290527, + "step": 15611, + "time_per_iteration": 4.040822982788086 + }, + { + "auxiliary_loss_clip": 0.01110834, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.0390842, + "balance_loss_mlp": 1.01736903, + "epoch": 0.9386442206523373, + "flos": 18896835790080.0, + "grad_norm": 2.328116835770316, + "language_loss": 0.6592139, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.68060672, + "num_input_tokens_seen": 336701520, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11083984, + "step": 15612, + "time_per_iteration": 2.5888144969940186 + }, + { + "auxiliary_loss_clip": 0.01106337, + "auxiliary_loss_mlp": 0.01029875, + "balance_loss_clip": 1.03665352, + "balance_loss_mlp": 1.01956308, + "epoch": 0.9387043439050052, + "flos": 25842451749120.0, + "grad_norm": 2.7539440718618673, + "language_loss": 0.57261562, + "learning_rate": 3.924572515435742e-08, + "loss": 0.59397769, + "num_input_tokens_seen": 336720675, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10302734, + "step": 15613, + "time_per_iteration": 2.659411907196045 + }, + { + "auxiliary_loss_clip": 0.01107989, + "auxiliary_loss_mlp": 0.01034335, + "balance_loss_clip": 1.03538465, + "balance_loss_mlp": 1.02342772, + "epoch": 0.9387644671576733, + "flos": 33761841858720.0, + "grad_norm": 2.3882223316040765, + "language_loss": 0.70847917, + "learning_rate": 3.916898732330764e-08, + "loss": 0.72990239, + "num_input_tokens_seen": 336741005, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10906982, + "step": 15614, + "time_per_iteration": 2.657606840133667 + }, + { + "auxiliary_loss_clip": 0.01112219, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.03813434, + "balance_loss_mlp": 1.02011466, + "epoch": 0.9388245904103412, + "flos": 22984647691680.0, + "grad_norm": 3.0407057853494313, + "language_loss": 0.81031477, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.83175069, + "num_input_tokens_seen": 336757990, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11254883, + "step": 15615, + "time_per_iteration": 2.6973917484283447 + }, + { + "auxiliary_loss_clip": 0.01107937, + "auxiliary_loss_mlp": 0.0102694, + "balance_loss_clip": 1.03788042, + "balance_loss_mlp": 1.01622939, + "epoch": 0.9388847136630092, + "flos": 31095780685920.0, + "grad_norm": 1.5946143298594349, + "language_loss": 0.71979618, + "learning_rate": 3.901573472884134e-08, + "loss": 0.74114496, + "num_input_tokens_seen": 336777705, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.1071167, + "step": 15616, + "time_per_iteration": 2.659844398498535 + }, + { + "auxiliary_loss_clip": 0.01109162, + "auxiliary_loss_mlp": 0.01026167, + "balance_loss_clip": 1.03838634, + "balance_loss_mlp": 1.01528347, + "epoch": 0.9389448369156771, + "flos": 22860238176000.0, + "grad_norm": 1.8711524690885382, + "language_loss": 0.6623624, + "learning_rate": 3.89392199712355e-08, + "loss": 0.6837157, + "num_input_tokens_seen": 336798275, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10876465, + "step": 15617, + "time_per_iteration": 2.6472690105438232 + }, + { + "auxiliary_loss_clip": 0.01113747, + "auxiliary_loss_mlp": 0.0103388, + "balance_loss_clip": 1.03865457, + "balance_loss_mlp": 1.02156568, + "epoch": 0.9390049601683451, + "flos": 26500964879520.0, + "grad_norm": 2.870106062016885, + "language_loss": 0.73091042, + "learning_rate": 3.886277957725092e-08, + "loss": 0.75238669, + "num_input_tokens_seen": 336813835, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12322998, + "step": 15618, + "time_per_iteration": 2.6998045444488525 + }, + { + "auxiliary_loss_clip": 0.0111503, + "auxiliary_loss_mlp": 0.0103274, + "balance_loss_clip": 1.03838849, + "balance_loss_mlp": 1.0201695, + "epoch": 0.939065083421013, + "flos": 23661069459840.0, + "grad_norm": 1.9847821147847726, + "language_loss": 0.70059437, + "learning_rate": 3.878641354978662e-08, + "loss": 0.72207206, + "num_input_tokens_seen": 336832210, + "router_z_loss_clip": 0.76513672, + "router_z_loss_mlp": 0.12579346, + "step": 15619, + "time_per_iteration": 2.5971953868865967 + }, + { + "auxiliary_loss_clip": 0.01110475, + "auxiliary_loss_mlp": 0.01029418, + "balance_loss_clip": 1.03804851, + "balance_loss_mlp": 1.01805747, + "epoch": 0.939125206673681, + "flos": 30116698771680.0, + "grad_norm": 1.984448210641579, + "language_loss": 0.77887797, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.80027688, + "num_input_tokens_seen": 336851380, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11358643, + "step": 15620, + "time_per_iteration": 2.668994188308716 + }, + { + "auxiliary_loss_clip": 0.01104777, + "auxiliary_loss_mlp": 0.01023951, + "balance_loss_clip": 1.03577876, + "balance_loss_mlp": 1.01306677, + "epoch": 0.9391853299263491, + "flos": 19920642431040.0, + "grad_norm": 2.6174348173911297, + "language_loss": 0.73833716, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.75962448, + "num_input_tokens_seen": 336868525, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.10888672, + "step": 15621, + "time_per_iteration": 2.6432857513427734 + }, + { + "auxiliary_loss_clip": 0.01114809, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.03991985, + "balance_loss_mlp": 1.01840878, + "epoch": 0.939245453179017, + "flos": 14221727435520.0, + "grad_norm": 2.0545250955838936, + "language_loss": 0.66018742, + "learning_rate": 3.855776169545688e-08, + "loss": 0.68163669, + "num_input_tokens_seen": 336886200, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.1171875, + "step": 15622, + "time_per_iteration": 2.613736152648926 + }, + { + "auxiliary_loss_clip": 0.01108389, + "auxiliary_loss_mlp": 0.01028227, + "balance_loss_clip": 1.03739095, + "balance_loss_mlp": 1.01753402, + "epoch": 0.939305576431685, + "flos": 28246282670880.0, + "grad_norm": 1.6196810622240743, + "language_loss": 0.71263564, + "learning_rate": 3.848169316300209e-08, + "loss": 0.73400182, + "num_input_tokens_seen": 336905815, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10693359, + "step": 15623, + "time_per_iteration": 3.992401599884033 + }, + { + "auxiliary_loss_clip": 0.01113399, + "auxiliary_loss_mlp": 0.01029741, + "balance_loss_clip": 1.04005241, + "balance_loss_mlp": 1.01854181, + "epoch": 0.9393656996843529, + "flos": 40619142848160.0, + "grad_norm": 2.2967058621637175, + "language_loss": 0.7239204, + "learning_rate": 3.84056990115178e-08, + "loss": 0.74535179, + "num_input_tokens_seen": 336928460, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11199951, + "step": 15624, + "time_per_iteration": 2.7375762462615967 + }, + { + "auxiliary_loss_clip": 0.01107565, + "auxiliary_loss_mlp": 0.01028148, + "balance_loss_clip": 1.03716969, + "balance_loss_mlp": 1.01725185, + "epoch": 0.9394258229370209, + "flos": 26465559127200.0, + "grad_norm": 1.8757967492760768, + "language_loss": 0.89079511, + "learning_rate": 3.832977924388614e-08, + "loss": 0.91215223, + "num_input_tokens_seen": 336948320, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10894775, + "step": 15625, + "time_per_iteration": 2.646519184112549 + }, + { + "auxiliary_loss_clip": 0.01108621, + "auxiliary_loss_mlp": 0.01030439, + "balance_loss_clip": 1.03721595, + "balance_loss_mlp": 1.01889372, + "epoch": 0.9394859461896888, + "flos": 29131984955520.0, + "grad_norm": 1.6268239782684126, + "language_loss": 0.83677661, + "learning_rate": 3.825393386298592e-08, + "loss": 0.85816717, + "num_input_tokens_seen": 336967670, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11553955, + "step": 15626, + "time_per_iteration": 2.6021876335144043 + }, + { + "auxiliary_loss_clip": 0.01027406, + "auxiliary_loss_mlp": 0.01001102, + "balance_loss_clip": 1.00501776, + "balance_loss_mlp": 1.00015259, + "epoch": 0.9395460694423569, + "flos": 75122997600960.0, + "grad_norm": 0.8122210810871594, + "language_loss": 0.55987048, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.58015561, + "num_input_tokens_seen": 337028395, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.00948334, + "step": 15627, + "time_per_iteration": 3.198380470275879 + }, + { + "auxiliary_loss_clip": 0.01109037, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.03813362, + "balance_loss_mlp": 1.01891828, + "epoch": 0.9396061926950248, + "flos": 25619071219200.0, + "grad_norm": 1.4576618171145204, + "language_loss": 0.7011807, + "learning_rate": 3.810246627288105e-08, + "loss": 0.72256899, + "num_input_tokens_seen": 337048150, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10870361, + "step": 15628, + "time_per_iteration": 2.609358072280884 + }, + { + "auxiliary_loss_clip": 0.01109416, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_clip": 1.03801215, + "balance_loss_mlp": 1.0191685, + "epoch": 0.9396663159476928, + "flos": 33541419090240.0, + "grad_norm": 1.6821815298194154, + "language_loss": 0.75420845, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.77560711, + "num_input_tokens_seen": 337069315, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11279297, + "step": 15629, + "time_per_iteration": 2.7076070308685303 + }, + { + "auxiliary_loss_clip": 0.01105548, + "auxiliary_loss_mlp": 0.01023811, + "balance_loss_clip": 1.03762758, + "balance_loss_mlp": 1.01370847, + "epoch": 0.9397264392003607, + "flos": 23705307979200.0, + "grad_norm": 2.1859038945375, + "language_loss": 0.74602938, + "learning_rate": 3.795129626417748e-08, + "loss": 0.76732296, + "num_input_tokens_seen": 337087765, + "router_z_loss_clip": 0.67919922, + "router_z_loss_mlp": 0.10101318, + "step": 15630, + "time_per_iteration": 2.5831446647644043 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01028815, + "balance_loss_clip": 1.03755987, + "balance_loss_mlp": 1.01803851, + "epoch": 0.9397865624530287, + "flos": 21969268644960.0, + "grad_norm": 3.334862330216622, + "language_loss": 0.69373423, + "learning_rate": 3.787582286001845e-08, + "loss": 0.71508169, + "num_input_tokens_seen": 337106265, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.10778809, + "step": 15631, + "time_per_iteration": 2.6304092407226562 + }, + { + "auxiliary_loss_clip": 0.0110851, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.03764236, + "balance_loss_mlp": 1.02309096, + "epoch": 0.9398466857056966, + "flos": 27533563770240.0, + "grad_norm": 1.5750447294459462, + "language_loss": 0.75161958, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77304041, + "num_input_tokens_seen": 337126090, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.1048584, + "step": 15632, + "time_per_iteration": 2.687856912612915 + }, + { + "auxiliary_loss_clip": 0.01115181, + "auxiliary_loss_mlp": 0.0102997, + "balance_loss_clip": 1.04012084, + "balance_loss_mlp": 1.01782262, + "epoch": 0.9399068089583646, + "flos": 29938731762240.0, + "grad_norm": 1.8036932637789682, + "language_loss": 0.74207127, + "learning_rate": 3.772509926639622e-08, + "loss": 0.76352274, + "num_input_tokens_seen": 337145655, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.121521, + "step": 15633, + "time_per_iteration": 2.6561858654022217 + }, + { + "auxiliary_loss_clip": 0.011105, + "auxiliary_loss_mlp": 0.01034355, + "balance_loss_clip": 1.03715277, + "balance_loss_mlp": 1.02224338, + "epoch": 0.9399669322110327, + "flos": 31274517523680.0, + "grad_norm": 2.1890288973841785, + "language_loss": 0.7249254, + "learning_rate": 3.764984908264823e-08, + "loss": 0.74637395, + "num_input_tokens_seen": 337164805, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12115479, + "step": 15634, + "time_per_iteration": 2.633213758468628 + }, + { + "auxiliary_loss_clip": 0.01110694, + "auxiliary_loss_mlp": 0.0102548, + "balance_loss_clip": 1.03686476, + "balance_loss_mlp": 1.01375544, + "epoch": 0.9400270554637006, + "flos": 20852487547200.0, + "grad_norm": 1.6257192105189795, + "language_loss": 0.69210863, + "learning_rate": 3.75746733114144e-08, + "loss": 0.7134704, + "num_input_tokens_seen": 337182280, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11737061, + "step": 15635, + "time_per_iteration": 2.6457772254943848 + }, + { + "auxiliary_loss_clip": 0.01107032, + "auxiliary_loss_mlp": 0.01023939, + "balance_loss_clip": 1.03741491, + "balance_loss_mlp": 1.01308489, + "epoch": 0.9400871787163686, + "flos": 26911712427840.0, + "grad_norm": 1.554536219400522, + "language_loss": 0.742208, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.76351774, + "num_input_tokens_seen": 337203495, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10852051, + "step": 15636, + "time_per_iteration": 2.652653932571411 + }, + { + "auxiliary_loss_clip": 0.01111529, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.03863239, + "balance_loss_mlp": 1.01928592, + "epoch": 0.9401473019690365, + "flos": 20722121991360.0, + "grad_norm": 2.556674521210176, + "language_loss": 0.82528156, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.84670216, + "num_input_tokens_seen": 337220435, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11248779, + "step": 15637, + "time_per_iteration": 2.63222074508667 + }, + { + "auxiliary_loss_clip": 0.01109397, + "auxiliary_loss_mlp": 0.01030203, + "balance_loss_clip": 1.03694177, + "balance_loss_mlp": 1.01888442, + "epoch": 0.9402074252217045, + "flos": 24015950029440.0, + "grad_norm": 2.2553639749679135, + "language_loss": 0.69223559, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.71363163, + "num_input_tokens_seen": 337238095, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11315918, + "step": 15638, + "time_per_iteration": 4.1018452644348145 + }, + { + "auxiliary_loss_clip": 0.01105361, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.03707731, + "balance_loss_mlp": 1.02311337, + "epoch": 0.9402675484743724, + "flos": 30205540465920.0, + "grad_norm": 1.7578680390474124, + "language_loss": 0.84676778, + "learning_rate": 3.727471440859498e-08, + "loss": 0.86815131, + "num_input_tokens_seen": 337256645, + "router_z_loss_clip": 0.68212891, + "router_z_loss_mlp": 0.09875488, + "step": 15639, + "time_per_iteration": 2.6275832653045654 + }, + { + "auxiliary_loss_clip": 0.01108143, + "auxiliary_loss_mlp": 0.01026856, + "balance_loss_clip": 1.0359385, + "balance_loss_mlp": 1.0162046, + "epoch": 0.9403276717270405, + "flos": 31186891347840.0, + "grad_norm": 1.5302524366950008, + "language_loss": 0.78343135, + "learning_rate": 3.719991074263662e-08, + "loss": 0.80478132, + "num_input_tokens_seen": 337278360, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10650635, + "step": 15640, + "time_per_iteration": 2.7697877883911133 + }, + { + "auxiliary_loss_clip": 0.01111606, + "auxiliary_loss_mlp": 0.01027452, + "balance_loss_clip": 1.03680253, + "balance_loss_mlp": 1.01609707, + "epoch": 0.9403877949797084, + "flos": 32567280284160.0, + "grad_norm": 1.5153531289462523, + "language_loss": 0.74228752, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76367813, + "num_input_tokens_seen": 337302480, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11346436, + "step": 15641, + "time_per_iteration": 2.6809046268463135 + }, + { + "auxiliary_loss_clip": 0.01112533, + "auxiliary_loss_mlp": 0.01030481, + "balance_loss_clip": 1.03691137, + "balance_loss_mlp": 1.01770186, + "epoch": 0.9404479182323764, + "flos": 18316913482080.0, + "grad_norm": 2.313296734056322, + "language_loss": 0.82474881, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.84617895, + "num_input_tokens_seen": 337316600, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12774658, + "step": 15642, + "time_per_iteration": 2.601872682571411 + }, + { + "auxiliary_loss_clip": 0.01106052, + "auxiliary_loss_mlp": 0.01029945, + "balance_loss_clip": 1.03603721, + "balance_loss_mlp": 1.01932931, + "epoch": 0.9405080414850443, + "flos": 30473078480640.0, + "grad_norm": 2.9120017320287146, + "language_loss": 0.68232977, + "learning_rate": 3.697594633355084e-08, + "loss": 0.70368981, + "num_input_tokens_seen": 337336895, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10614014, + "step": 15643, + "time_per_iteration": 2.667935609817505 + }, + { + "auxiliary_loss_clip": 0.01115821, + "auxiliary_loss_mlp": 0.01038102, + "balance_loss_clip": 1.04069328, + "balance_loss_mlp": 1.02627671, + "epoch": 0.9405681647377123, + "flos": 25435310238720.0, + "grad_norm": 1.7989471701363842, + "language_loss": 0.76441741, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.78595668, + "num_input_tokens_seen": 337355105, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.1182251, + "step": 15644, + "time_per_iteration": 2.6628336906433105 + }, + { + "auxiliary_loss_clip": 0.01106809, + "auxiliary_loss_mlp": 0.01030519, + "balance_loss_clip": 1.03723621, + "balance_loss_mlp": 1.02036214, + "epoch": 0.9406282879903802, + "flos": 29047843265760.0, + "grad_norm": 1.7740458351543977, + "language_loss": 0.67621559, + "learning_rate": 3.682700891311974e-08, + "loss": 0.69758886, + "num_input_tokens_seen": 337374905, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.1015625, + "step": 15645, + "time_per_iteration": 2.7003533840179443 + }, + { + "auxiliary_loss_clip": 0.01105211, + "auxiliary_loss_mlp": 0.0102995, + "balance_loss_clip": 1.03664708, + "balance_loss_mlp": 1.01926839, + "epoch": 0.9406884112430483, + "flos": 33769378072800.0, + "grad_norm": 1.9642570619613837, + "language_loss": 0.70200634, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.72335792, + "num_input_tokens_seen": 337397130, + "router_z_loss_clip": 0.68603516, + "router_z_loss_mlp": 0.10675049, + "step": 15646, + "time_per_iteration": 2.6413772106170654 + }, + { + "auxiliary_loss_clip": 0.01105072, + "auxiliary_loss_mlp": 0.0102909, + "balance_loss_clip": 1.03560829, + "balance_loss_mlp": 1.01842117, + "epoch": 0.9407485344957163, + "flos": 28156468561920.0, + "grad_norm": 1.6372845619600358, + "language_loss": 0.74163234, + "learning_rate": 3.667836926755208e-08, + "loss": 0.7629739, + "num_input_tokens_seen": 337418660, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10668945, + "step": 15647, + "time_per_iteration": 2.6419296264648438 + }, + { + "auxiliary_loss_clip": 0.01027406, + "auxiliary_loss_mlp": 0.01000874, + "balance_loss_clip": 1.00501227, + "balance_loss_mlp": 0.99999845, + "epoch": 0.9408086577483842, + "flos": 86652246597120.0, + "grad_norm": 0.8878523539416111, + "language_loss": 0.63551223, + "learning_rate": 3.660416111738907e-08, + "loss": 0.65579504, + "num_input_tokens_seen": 337478055, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.00875854, + "step": 15648, + "time_per_iteration": 3.3865156173706055 + }, + { + "auxiliary_loss_clip": 0.01105884, + "auxiliary_loss_mlp": 0.0102945, + "balance_loss_clip": 1.03717971, + "balance_loss_mlp": 1.0194838, + "epoch": 0.9408687810010522, + "flos": 28956935190240.0, + "grad_norm": 1.6797400992499376, + "language_loss": 0.66440439, + "learning_rate": 3.653002741939337e-08, + "loss": 0.68575776, + "num_input_tokens_seen": 337499405, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.09967041, + "step": 15649, + "time_per_iteration": 4.115861892700195 + }, + { + "auxiliary_loss_clip": 0.01108317, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.03611422, + "balance_loss_mlp": 1.01901364, + "epoch": 0.9409289042537201, + "flos": 22414084875360.0, + "grad_norm": 2.5518414288722986, + "language_loss": 0.77804983, + "learning_rate": 3.645596817637586e-08, + "loss": 0.7994312, + "num_input_tokens_seen": 337517195, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10809326, + "step": 15650, + "time_per_iteration": 3.925501585006714 + }, + { + "auxiliary_loss_clip": 0.0110998, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.03903747, + "balance_loss_mlp": 1.02109599, + "epoch": 0.9409890275063881, + "flos": 29136644442720.0, + "grad_norm": 1.7337743511227472, + "language_loss": 0.74175918, + "learning_rate": 3.638198339114451e-08, + "loss": 0.76317728, + "num_input_tokens_seen": 337535245, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10736084, + "step": 15651, + "time_per_iteration": 2.615750789642334 + }, + { + "auxiliary_loss_clip": 0.01107777, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.0364058, + "balance_loss_mlp": 1.02008986, + "epoch": 0.941049150759056, + "flos": 20188018376640.0, + "grad_norm": 1.9924271596458738, + "language_loss": 0.72012699, + "learning_rate": 3.630807306650507e-08, + "loss": 0.74152339, + "num_input_tokens_seen": 337553040, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11767578, + "step": 15652, + "time_per_iteration": 2.594836950302124 + }, + { + "auxiliary_loss_clip": 0.01115094, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.03901911, + "balance_loss_mlp": 1.02300084, + "epoch": 0.9411092740117241, + "flos": 30649911006240.0, + "grad_norm": 3.2954222242074622, + "language_loss": 0.66138637, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.6828804, + "num_input_tokens_seen": 337574580, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11309814, + "step": 15653, + "time_per_iteration": 2.6614296436309814 + }, + { + "auxiliary_loss_clip": 0.01110595, + "auxiliary_loss_mlp": 0.01033147, + "balance_loss_clip": 1.0381434, + "balance_loss_mlp": 1.02164936, + "epoch": 0.941169397264392, + "flos": 25797848574240.0, + "grad_norm": 1.9914118715521258, + "language_loss": 0.77723432, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.79867172, + "num_input_tokens_seen": 337593010, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11499023, + "step": 15654, + "time_per_iteration": 2.6132798194885254 + }, + { + "auxiliary_loss_clip": 0.01113514, + "auxiliary_loss_mlp": 0.01027077, + "balance_loss_clip": 1.03778052, + "balance_loss_mlp": 1.01586545, + "epoch": 0.94122952051706, + "flos": 46990022711040.0, + "grad_norm": 2.0619631491732964, + "language_loss": 0.70221388, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.7236197, + "num_input_tokens_seen": 337616170, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11212158, + "step": 15655, + "time_per_iteration": 2.7837424278259277 + }, + { + "auxiliary_loss_clip": 0.01110013, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.03791094, + "balance_loss_mlp": 1.0183692, + "epoch": 0.9412896437697279, + "flos": 22413963323520.0, + "grad_norm": 1.862213533655766, + "language_loss": 0.72229838, + "learning_rate": 3.601317642987944e-08, + "loss": 0.74370193, + "num_input_tokens_seen": 337635215, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11981201, + "step": 15656, + "time_per_iteration": 2.6059064865112305 + }, + { + "auxiliary_loss_clip": 0.01108364, + "auxiliary_loss_mlp": 0.01028233, + "balance_loss_clip": 1.03710127, + "balance_loss_mlp": 1.01767147, + "epoch": 0.9413497670223959, + "flos": 31585281125760.0, + "grad_norm": 2.0615335573394478, + "language_loss": 0.77645355, + "learning_rate": 3.593963845018377e-08, + "loss": 0.7978195, + "num_input_tokens_seen": 337654195, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10571289, + "step": 15657, + "time_per_iteration": 2.680696964263916 + }, + { + "auxiliary_loss_clip": 0.01107438, + "auxiliary_loss_mlp": 0.01024241, + "balance_loss_clip": 1.03531003, + "balance_loss_mlp": 1.01311278, + "epoch": 0.9414098902750638, + "flos": 20277346278240.0, + "grad_norm": 2.2233433731565695, + "language_loss": 0.84525508, + "learning_rate": 3.586617494785371e-08, + "loss": 0.8665719, + "num_input_tokens_seen": 337671810, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.1114502, + "step": 15658, + "time_per_iteration": 2.588094711303711 + }, + { + "auxiliary_loss_clip": 0.01115563, + "auxiliary_loss_mlp": 0.01033629, + "balance_loss_clip": 1.03941202, + "balance_loss_mlp": 1.02073061, + "epoch": 0.9414700135277319, + "flos": 22726104513120.0, + "grad_norm": 2.3545555856947558, + "language_loss": 0.70517999, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.72667193, + "num_input_tokens_seen": 337689410, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.12890625, + "step": 15659, + "time_per_iteration": 2.674067974090576 + }, + { + "auxiliary_loss_clip": 0.01107224, + "auxiliary_loss_mlp": 0.01037065, + "balance_loss_clip": 1.03643107, + "balance_loss_mlp": 1.02654517, + "epoch": 0.9415301367803999, + "flos": 32068096214400.0, + "grad_norm": 1.7774750102577501, + "language_loss": 0.79809213, + "learning_rate": 3.571947138643172e-08, + "loss": 0.81953508, + "num_input_tokens_seen": 337709950, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10516357, + "step": 15660, + "time_per_iteration": 2.6827926635742188 + }, + { + "auxiliary_loss_clip": 0.01106813, + "auxiliary_loss_mlp": 0.01027383, + "balance_loss_clip": 1.03674102, + "balance_loss_mlp": 1.01665986, + "epoch": 0.9415902600330678, + "flos": 28383495647040.0, + "grad_norm": 1.6421462899404835, + "language_loss": 0.68149316, + "learning_rate": 3.564623133290201e-08, + "loss": 0.70283514, + "num_input_tokens_seen": 337731320, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10736084, + "step": 15661, + "time_per_iteration": 2.719226598739624 + }, + { + "auxiliary_loss_clip": 0.01108521, + "auxiliary_loss_mlp": 0.01026792, + "balance_loss_clip": 1.03651118, + "balance_loss_mlp": 1.01604581, + "epoch": 0.9416503832857358, + "flos": 17960007048480.0, + "grad_norm": 2.424972042856097, + "language_loss": 0.66654712, + "learning_rate": 3.557306576786434e-08, + "loss": 0.68790019, + "num_input_tokens_seen": 337747720, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10748291, + "step": 15662, + "time_per_iteration": 3.8365960121154785 + }, + { + "auxiliary_loss_clip": 0.01027158, + "auxiliary_loss_mlp": 0.0100056, + "balance_loss_clip": 1.00480127, + "balance_loss_mlp": 0.99968255, + "epoch": 0.9417105065384037, + "flos": 85797169025760.0, + "grad_norm": 0.771717890109685, + "language_loss": 0.59222728, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61250448, + "num_input_tokens_seen": 337806930, + "router_z_loss_clip": 0.22338867, + "router_z_loss_mlp": 0.0087738, + "step": 15663, + "time_per_iteration": 3.2960755825042725 + }, + { + "auxiliary_loss_clip": 0.01114385, + "auxiliary_loss_mlp": 0.01033501, + "balance_loss_clip": 1.03855491, + "balance_loss_mlp": 1.02122235, + "epoch": 0.9417706297910717, + "flos": 41558524178400.0, + "grad_norm": 1.9369655512486927, + "language_loss": 0.66728181, + "learning_rate": 3.542695811435914e-08, + "loss": 0.6887607, + "num_input_tokens_seen": 337828100, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12280273, + "step": 15664, + "time_per_iteration": 2.824753761291504 + }, + { + "auxiliary_loss_clip": 0.01108701, + "auxiliary_loss_mlp": 0.01027194, + "balance_loss_clip": 1.03811383, + "balance_loss_mlp": 1.01653051, + "epoch": 0.9418307530437396, + "flos": 20098406854080.0, + "grad_norm": 2.3732392259433777, + "language_loss": 0.73393786, + "learning_rate": 3.535401603143207e-08, + "loss": 0.75529683, + "num_input_tokens_seen": 337844805, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10662842, + "step": 15665, + "time_per_iteration": 2.5642802715301514 + }, + { + "auxiliary_loss_clip": 0.01107221, + "auxiliary_loss_mlp": 0.01026236, + "balance_loss_clip": 1.03794646, + "balance_loss_mlp": 1.01601386, + "epoch": 0.9418908762964077, + "flos": 13730970960000.0, + "grad_norm": 2.7572133710512414, + "language_loss": 0.63604629, + "learning_rate": 3.528114844807773e-08, + "loss": 0.65738088, + "num_input_tokens_seen": 337860490, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10217285, + "step": 15666, + "time_per_iteration": 2.635209560394287 + }, + { + "auxiliary_loss_clip": 0.01109145, + "auxiliary_loss_mlp": 0.01029497, + "balance_loss_clip": 1.03680253, + "balance_loss_mlp": 1.01769543, + "epoch": 0.9419509995490756, + "flos": 22499239497120.0, + "grad_norm": 1.8887610934874663, + "language_loss": 0.78544754, + "learning_rate": 3.520835536705902e-08, + "loss": 0.80683398, + "num_input_tokens_seen": 337878360, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11804199, + "step": 15667, + "time_per_iteration": 2.592219114303589 + }, + { + "auxiliary_loss_clip": 0.01106049, + "auxiliary_loss_mlp": 0.01026342, + "balance_loss_clip": 1.0360713, + "balance_loss_mlp": 1.01651287, + "epoch": 0.9420111228017436, + "flos": 25304742096480.0, + "grad_norm": 1.9148797276258178, + "language_loss": 0.75610566, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.77742958, + "num_input_tokens_seen": 337895635, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.0982666, + "step": 15668, + "time_per_iteration": 2.6319496631622314 + }, + { + "auxiliary_loss_clip": 0.01111129, + "auxiliary_loss_mlp": 0.01029129, + "balance_loss_clip": 1.03708625, + "balance_loss_mlp": 1.0179528, + "epoch": 0.9420712460544115, + "flos": 25797483918720.0, + "grad_norm": 3.356819042944211, + "language_loss": 0.58999574, + "learning_rate": 3.506299272306723e-08, + "loss": 0.61139828, + "num_input_tokens_seen": 337913940, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11169434, + "step": 15669, + "time_per_iteration": 2.60115647315979 + }, + { + "auxiliary_loss_clip": 0.01105182, + "auxiliary_loss_mlp": 0.01025156, + "balance_loss_clip": 1.03616953, + "balance_loss_mlp": 1.01473069, + "epoch": 0.9421313693070795, + "flos": 19341976158720.0, + "grad_norm": 1.6936769642820493, + "language_loss": 0.76809287, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.78939617, + "num_input_tokens_seen": 337932015, + "router_z_loss_clip": 0.68994141, + "router_z_loss_mlp": 0.10424805, + "step": 15670, + "time_per_iteration": 2.616058588027954 + }, + { + "auxiliary_loss_clip": 0.01109123, + "auxiliary_loss_mlp": 0.01027378, + "balance_loss_clip": 1.03729558, + "balance_loss_mlp": 1.01618385, + "epoch": 0.9421914925597474, + "flos": 39554865794880.0, + "grad_norm": 2.043484724611047, + "language_loss": 0.64927429, + "learning_rate": 3.491792812150574e-08, + "loss": 0.67063928, + "num_input_tokens_seen": 337953345, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11199951, + "step": 15671, + "time_per_iteration": 2.67342209815979 + }, + { + "auxiliary_loss_clip": 0.01108356, + "auxiliary_loss_mlp": 0.01029039, + "balance_loss_clip": 1.03681445, + "balance_loss_mlp": 1.01783943, + "epoch": 0.9422516158124155, + "flos": 24060553204320.0, + "grad_norm": 1.6190729889277018, + "language_loss": 0.79458022, + "learning_rate": 3.48455075935139e-08, + "loss": 0.81595421, + "num_input_tokens_seen": 337973685, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11187744, + "step": 15672, + "time_per_iteration": 2.6366865634918213 + }, + { + "auxiliary_loss_clip": 0.01112885, + "auxiliary_loss_mlp": 0.01030846, + "balance_loss_clip": 1.0372355, + "balance_loss_mlp": 1.01876974, + "epoch": 0.9423117390650835, + "flos": 19831922288640.0, + "grad_norm": 2.407187239956423, + "language_loss": 0.73451543, + "learning_rate": 3.47731615843776e-08, + "loss": 0.75595272, + "num_input_tokens_seen": 337989175, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12078857, + "step": 15673, + "time_per_iteration": 2.646233320236206 + }, + { + "auxiliary_loss_clip": 0.01108386, + "auxiliary_loss_mlp": 0.01028308, + "balance_loss_clip": 1.0371182, + "balance_loss_mlp": 1.01712048, + "epoch": 0.9423718623177514, + "flos": 38799002341440.0, + "grad_norm": 1.6822424589919778, + "language_loss": 0.7029435, + "learning_rate": 3.470089009683974e-08, + "loss": 0.72431046, + "num_input_tokens_seen": 338011800, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11187744, + "step": 15674, + "time_per_iteration": 2.721785068511963 + }, + { + "auxiliary_loss_clip": 0.01108456, + "auxiliary_loss_mlp": 0.01022855, + "balance_loss_clip": 1.03657317, + "balance_loss_mlp": 1.0120486, + "epoch": 0.9424319855704194, + "flos": 28469663200800.0, + "grad_norm": 1.781103308872403, + "language_loss": 0.81310898, + "learning_rate": 3.462869313364125e-08, + "loss": 0.83442211, + "num_input_tokens_seen": 338032120, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.10803223, + "step": 15675, + "time_per_iteration": 2.669743776321411 + }, + { + "auxiliary_loss_clip": 0.01108166, + "auxiliary_loss_mlp": 0.01025806, + "balance_loss_clip": 1.0374285, + "balance_loss_mlp": 1.01510131, + "epoch": 0.9424921088230873, + "flos": 25352870274720.0, + "grad_norm": 2.34194859627406, + "language_loss": 0.62830728, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.649647, + "num_input_tokens_seen": 338051880, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10699463, + "step": 15676, + "time_per_iteration": 2.6081252098083496 + }, + { + "auxiliary_loss_clip": 0.01108068, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.03685999, + "balance_loss_mlp": 1.02529478, + "epoch": 0.9425522320757553, + "flos": 23216172194880.0, + "grad_norm": 1.7138562743099515, + "language_loss": 0.66969156, + "learning_rate": 3.448452279120984e-08, + "loss": 0.69113493, + "num_input_tokens_seen": 338069665, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10980225, + "step": 15677, + "time_per_iteration": 2.6061675548553467 + }, + { + "auxiliary_loss_clip": 0.01110386, + "auxiliary_loss_mlp": 0.01033402, + "balance_loss_clip": 1.03635073, + "balance_loss_mlp": 1.02129054, + "epoch": 0.9426123553284232, + "flos": 30694959871200.0, + "grad_norm": 1.8297271778670234, + "language_loss": 0.64186037, + "learning_rate": 3.441254941744387e-08, + "loss": 0.66329825, + "num_input_tokens_seen": 338090490, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.12109375, + "step": 15678, + "time_per_iteration": 4.2132673263549805 + }, + { + "auxiliary_loss_clip": 0.01108666, + "auxiliary_loss_mlp": 0.01027636, + "balance_loss_clip": 1.03761554, + "balance_loss_mlp": 1.01639497, + "epoch": 0.9426724785810913, + "flos": 25843059508320.0, + "grad_norm": 1.5353776732717923, + "language_loss": 0.74203372, + "learning_rate": 3.434065057895097e-08, + "loss": 0.76339674, + "num_input_tokens_seen": 338109825, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11242676, + "step": 15679, + "time_per_iteration": 2.665532112121582 + }, + { + "auxiliary_loss_clip": 0.01112319, + "auxiliary_loss_mlp": 0.01032515, + "balance_loss_clip": 1.03809786, + "balance_loss_mlp": 1.02130938, + "epoch": 0.9427326018337592, + "flos": 18005987810880.0, + "grad_norm": 2.4782310731257984, + "language_loss": 0.77797711, + "learning_rate": 3.426882627845762e-08, + "loss": 0.79942548, + "num_input_tokens_seen": 338125790, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11218262, + "step": 15680, + "time_per_iteration": 2.6146395206451416 + }, + { + "auxiliary_loss_clip": 0.01108042, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.03659475, + "balance_loss_mlp": 1.02083254, + "epoch": 0.9427927250864272, + "flos": 25531039870560.0, + "grad_norm": 1.8309986000663165, + "language_loss": 0.75399959, + "learning_rate": 3.419707651868742e-08, + "loss": 0.77539992, + "num_input_tokens_seen": 338145610, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.1116333, + "step": 15681, + "time_per_iteration": 2.6334784030914307 + }, + { + "auxiliary_loss_clip": 0.01111787, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.03883433, + "balance_loss_mlp": 1.01962709, + "epoch": 0.9428528483390951, + "flos": 24102279652320.0, + "grad_norm": 2.0822826676572164, + "language_loss": 0.65507811, + "learning_rate": 3.412540130236086e-08, + "loss": 0.67650497, + "num_input_tokens_seen": 338165960, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11273193, + "step": 15682, + "time_per_iteration": 2.6362404823303223 + }, + { + "auxiliary_loss_clip": 0.01106568, + "auxiliary_loss_mlp": 0.01027954, + "balance_loss_clip": 1.03552413, + "balance_loss_mlp": 1.01698065, + "epoch": 0.9429129715917631, + "flos": 29937961933920.0, + "grad_norm": 1.9127923834447622, + "language_loss": 0.77254361, + "learning_rate": 3.405380063219665e-08, + "loss": 0.79388887, + "num_input_tokens_seen": 338187215, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10968018, + "step": 15683, + "time_per_iteration": 2.6322925090789795 + }, + { + "auxiliary_loss_clip": 0.0111374, + "auxiliary_loss_mlp": 0.01041755, + "balance_loss_clip": 1.04017735, + "balance_loss_mlp": 1.02993524, + "epoch": 0.942973094844431, + "flos": 21915143909280.0, + "grad_norm": 3.0631982723634383, + "language_loss": 0.75529689, + "learning_rate": 3.398227451090885e-08, + "loss": 0.77685183, + "num_input_tokens_seen": 338201825, + "router_z_loss_clip": 0.73535156, + "router_z_loss_mlp": 0.11834717, + "step": 15684, + "time_per_iteration": 2.611534357070923 + }, + { + "auxiliary_loss_clip": 0.01106891, + "auxiliary_loss_mlp": 0.01023749, + "balance_loss_clip": 1.03676486, + "balance_loss_mlp": 1.01352692, + "epoch": 0.9430332180970991, + "flos": 31893937829280.0, + "grad_norm": 1.7755912273342283, + "language_loss": 0.7724154, + "learning_rate": 3.391082294121017e-08, + "loss": 0.7937218, + "num_input_tokens_seen": 338220865, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.10217285, + "step": 15685, + "time_per_iteration": 2.6396920680999756 + }, + { + "auxiliary_loss_clip": 0.01105341, + "auxiliary_loss_mlp": 0.01025167, + "balance_loss_clip": 1.03579414, + "balance_loss_mlp": 1.01461136, + "epoch": 0.943093341349767, + "flos": 29225769757920.0, + "grad_norm": 1.7599293676542445, + "language_loss": 0.7589047, + "learning_rate": 3.383944592581023e-08, + "loss": 0.78020978, + "num_input_tokens_seen": 338240160, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10559082, + "step": 15686, + "time_per_iteration": 2.6645987033843994 + }, + { + "auxiliary_loss_clip": 0.01111915, + "auxiliary_loss_mlp": 0.01027744, + "balance_loss_clip": 1.03742456, + "balance_loss_mlp": 1.01646113, + "epoch": 0.943153464602435, + "flos": 21924827539200.0, + "grad_norm": 3.1420074518782197, + "language_loss": 0.803532, + "learning_rate": 3.376814346741575e-08, + "loss": 0.82492864, + "num_input_tokens_seen": 338259305, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11297607, + "step": 15687, + "time_per_iteration": 2.6306278705596924 + }, + { + "auxiliary_loss_clip": 0.01113593, + "auxiliary_loss_mlp": 0.01035012, + "balance_loss_clip": 1.03810084, + "balance_loss_mlp": 1.02257299, + "epoch": 0.943213587855103, + "flos": 17694332828640.0, + "grad_norm": 4.642729408386016, + "language_loss": 0.76083124, + "learning_rate": 3.369691556873011e-08, + "loss": 0.78231728, + "num_input_tokens_seen": 338274950, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12451172, + "step": 15688, + "time_per_iteration": 2.6688687801361084 + }, + { + "auxiliary_loss_clip": 0.01103766, + "auxiliary_loss_mlp": 0.01024944, + "balance_loss_clip": 1.03519917, + "balance_loss_mlp": 1.01406014, + "epoch": 0.9432737111077709, + "flos": 35369906156640.0, + "grad_norm": 1.7622293812521919, + "language_loss": 0.68902212, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.71030921, + "num_input_tokens_seen": 338295585, + "router_z_loss_clip": 0.68554688, + "router_z_loss_mlp": 0.10882568, + "step": 15689, + "time_per_iteration": 4.288061618804932 + }, + { + "auxiliary_loss_clip": 0.0110849, + "auxiliary_loss_mlp": 0.01030513, + "balance_loss_clip": 1.03765464, + "balance_loss_mlp": 1.02089334, + "epoch": 0.9433338343604389, + "flos": 26368168286880.0, + "grad_norm": 1.6976998845524172, + "language_loss": 0.80114555, + "learning_rate": 3.35546834612872e-08, + "loss": 0.82253563, + "num_input_tokens_seen": 338314555, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.09619141, + "step": 15690, + "time_per_iteration": 4.087229251861572 + }, + { + "auxiliary_loss_clip": 0.01109468, + "auxiliary_loss_mlp": 0.01030066, + "balance_loss_clip": 1.03796113, + "balance_loss_mlp": 1.01923561, + "epoch": 0.9433939576131068, + "flos": 40490033328000.0, + "grad_norm": 2.3103458082791186, + "language_loss": 0.60091114, + "learning_rate": 3.348367925792317e-08, + "loss": 0.62230647, + "num_input_tokens_seen": 338336260, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10839844, + "step": 15691, + "time_per_iteration": 2.7033815383911133 + }, + { + "auxiliary_loss_clip": 0.01113315, + "auxiliary_loss_mlp": 0.01026379, + "balance_loss_clip": 1.04007232, + "balance_loss_mlp": 1.01563787, + "epoch": 0.9434540808657749, + "flos": 24997422463200.0, + "grad_norm": 1.610698426525359, + "language_loss": 0.66628689, + "learning_rate": 3.341274962505514e-08, + "loss": 0.68768376, + "num_input_tokens_seen": 338354680, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.10742188, + "step": 15692, + "time_per_iteration": 2.605177164077759 + }, + { + "auxiliary_loss_clip": 0.01110457, + "auxiliary_loss_mlp": 0.01028764, + "balance_loss_clip": 1.03789973, + "balance_loss_mlp": 1.01793385, + "epoch": 0.9435142041184428, + "flos": 26286862806720.0, + "grad_norm": 2.2515582393810254, + "language_loss": 0.74299061, + "learning_rate": 3.334189456537251e-08, + "loss": 0.76438284, + "num_input_tokens_seen": 338372490, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10839844, + "step": 15693, + "time_per_iteration": 2.619253158569336 + }, + { + "auxiliary_loss_clip": 0.01111328, + "auxiliary_loss_mlp": 0.01030478, + "balance_loss_clip": 1.03951645, + "balance_loss_mlp": 1.01966012, + "epoch": 0.9435743273711108, + "flos": 30516506654400.0, + "grad_norm": 2.075296070004299, + "language_loss": 0.73225665, + "learning_rate": 3.327111408156291e-08, + "loss": 0.75367469, + "num_input_tokens_seen": 338390870, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10821533, + "step": 15694, + "time_per_iteration": 2.6447949409484863 + }, + { + "auxiliary_loss_clip": 0.01027237, + "auxiliary_loss_mlp": 0.01001225, + "balance_loss_clip": 1.00489998, + "balance_loss_mlp": 1.00035012, + "epoch": 0.9436344506237787, + "flos": 73404137593440.0, + "grad_norm": 0.6920581873402172, + "language_loss": 0.50623155, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52651614, + "num_input_tokens_seen": 338453075, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00875092, + "step": 15695, + "time_per_iteration": 3.310940742492676 + }, + { + "auxiliary_loss_clip": 0.01104754, + "auxiliary_loss_mlp": 0.01028169, + "balance_loss_clip": 1.03671157, + "balance_loss_mlp": 1.0176127, + "epoch": 0.9436945738764467, + "flos": 27133998991200.0, + "grad_norm": 2.386901788782591, + "language_loss": 0.64864326, + "learning_rate": 3.312977685229335e-08, + "loss": 0.66997248, + "num_input_tokens_seen": 338471770, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.10552979, + "step": 15696, + "time_per_iteration": 2.6611597537994385 + }, + { + "auxiliary_loss_clip": 0.0110972, + "auxiliary_loss_mlp": 0.01027291, + "balance_loss_clip": 1.03820634, + "balance_loss_mlp": 1.01696193, + "epoch": 0.9437546971291146, + "flos": 30560583104640.0, + "grad_norm": 1.637691209144837, + "language_loss": 0.65978324, + "learning_rate": 3.305922011219353e-08, + "loss": 0.6811533, + "num_input_tokens_seen": 338492190, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10327148, + "step": 15697, + "time_per_iteration": 2.633803367614746 + }, + { + "auxiliary_loss_clip": 0.01027214, + "auxiliary_loss_mlp": 0.01001248, + "balance_loss_clip": 1.00490475, + "balance_loss_mlp": 1.00034094, + "epoch": 0.9438148203817827, + "flos": 69295378258080.0, + "grad_norm": 0.8445366865657542, + "language_loss": 0.63212776, + "learning_rate": 3.298873795868506e-08, + "loss": 0.65241241, + "num_input_tokens_seen": 338552560, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.00906372, + "step": 15698, + "time_per_iteration": 3.1571044921875 + }, + { + "auxiliary_loss_clip": 0.01112218, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.03791499, + "balance_loss_mlp": 1.02465796, + "epoch": 0.9438749436344506, + "flos": 27267200756640.0, + "grad_norm": 2.0811457076121083, + "language_loss": 0.69930691, + "learning_rate": 3.291833039444092e-08, + "loss": 0.72079325, + "num_input_tokens_seen": 338571770, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11761475, + "step": 15699, + "time_per_iteration": 2.6270864009857178 + }, + { + "auxiliary_loss_clip": 0.01104918, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.0357151, + "balance_loss_mlp": 1.01966071, + "epoch": 0.9439350668871186, + "flos": 16314511134240.0, + "grad_norm": 2.1238987635509052, + "language_loss": 0.74183768, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.76318699, + "num_input_tokens_seen": 338587310, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.10357666, + "step": 15700, + "time_per_iteration": 2.6089553833007812 + }, + { + "auxiliary_loss_clip": 0.01108046, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.03793824, + "balance_loss_mlp": 1.01875734, + "epoch": 0.9439951901397866, + "flos": 21701487526560.0, + "grad_norm": 1.6671876784927553, + "language_loss": 0.70436233, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.72573459, + "num_input_tokens_seen": 338606235, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10412598, + "step": 15701, + "time_per_iteration": 3.907787322998047 + }, + { + "auxiliary_loss_clip": 0.01112948, + "auxiliary_loss_mlp": 0.01027939, + "balance_loss_clip": 1.03680539, + "balance_loss_mlp": 1.01675153, + "epoch": 0.9440553133924545, + "flos": 23036787080640.0, + "grad_norm": 2.1216719065787406, + "language_loss": 0.77965271, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.80106157, + "num_input_tokens_seen": 338624090, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.11187744, + "step": 15702, + "time_per_iteration": 2.574552536010742 + }, + { + "auxiliary_loss_clip": 0.01111312, + "auxiliary_loss_mlp": 0.01032735, + "balance_loss_clip": 1.03801537, + "balance_loss_mlp": 1.02226853, + "epoch": 0.9441154366451225, + "flos": 23882910333120.0, + "grad_norm": 2.8107340417600915, + "language_loss": 0.66647542, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.68791592, + "num_input_tokens_seen": 338643695, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.10461426, + "step": 15703, + "time_per_iteration": 2.625704050064087 + }, + { + "auxiliary_loss_clip": 0.01111926, + "auxiliary_loss_mlp": 0.01025955, + "balance_loss_clip": 1.03858709, + "balance_loss_mlp": 1.01433861, + "epoch": 0.9441755598977905, + "flos": 36967192858080.0, + "grad_norm": 1.606856630752469, + "language_loss": 0.7335676, + "learning_rate": 3.256741150552833e-08, + "loss": 0.75494647, + "num_input_tokens_seen": 338664725, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11621094, + "step": 15704, + "time_per_iteration": 2.667867660522461 + }, + { + "auxiliary_loss_clip": 0.01107288, + "auxiliary_loss_mlp": 0.01032143, + "balance_loss_clip": 1.03750348, + "balance_loss_mlp": 1.02093768, + "epoch": 0.9442356831504585, + "flos": 25218979715520.0, + "grad_norm": 1.8025645886260295, + "language_loss": 0.74325049, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.76464474, + "num_input_tokens_seen": 338683990, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.11212158, + "step": 15705, + "time_per_iteration": 2.6584787368774414 + }, + { + "auxiliary_loss_clip": 0.01110013, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.0384661, + "balance_loss_mlp": 1.02015734, + "epoch": 0.9442958064031264, + "flos": 19653752692800.0, + "grad_norm": 2.0891211162585623, + "language_loss": 0.77187777, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.79328191, + "num_input_tokens_seen": 338702025, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10253906, + "step": 15706, + "time_per_iteration": 2.572995901107788 + }, + { + "auxiliary_loss_clip": 0.01104873, + "auxiliary_loss_mlp": 0.01025411, + "balance_loss_clip": 1.03556967, + "balance_loss_mlp": 1.01528978, + "epoch": 0.9443559296557944, + "flos": 24950064113280.0, + "grad_norm": 1.7958854559222508, + "language_loss": 0.69255924, + "learning_rate": 3.23577554137866e-08, + "loss": 0.71386206, + "num_input_tokens_seen": 338720920, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.10119629, + "step": 15707, + "time_per_iteration": 2.629690408706665 + }, + { + "auxiliary_loss_clip": 0.01100812, + "auxiliary_loss_mlp": 0.01027784, + "balance_loss_clip": 1.0329597, + "balance_loss_mlp": 1.0179553, + "epoch": 0.9444160529084623, + "flos": 26370234668160.0, + "grad_norm": 1.838019671035452, + "language_loss": 0.69392389, + "learning_rate": 3.22880192727244e-08, + "loss": 0.71520984, + "num_input_tokens_seen": 338739590, + "router_z_loss_clip": 0.67822266, + "router_z_loss_mlp": 0.0982666, + "step": 15708, + "time_per_iteration": 2.6077094078063965 + }, + { + "auxiliary_loss_clip": 0.01107924, + "auxiliary_loss_mlp": 0.01027344, + "balance_loss_clip": 1.03775525, + "balance_loss_mlp": 1.01668119, + "epoch": 0.9444761761611303, + "flos": 22502561914080.0, + "grad_norm": 2.701996616511746, + "language_loss": 0.70968366, + "learning_rate": 3.221835774749748e-08, + "loss": 0.73103637, + "num_input_tokens_seen": 338757240, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10656738, + "step": 15709, + "time_per_iteration": 2.591312885284424 + }, + { + "auxiliary_loss_clip": 0.01108196, + "auxiliary_loss_mlp": 0.01030152, + "balance_loss_clip": 1.03775132, + "balance_loss_mlp": 1.0193224, + "epoch": 0.9445362994137982, + "flos": 25572199076640.0, + "grad_norm": 2.2067965862563725, + "language_loss": 0.84973639, + "learning_rate": 3.214877084074774e-08, + "loss": 0.87111986, + "num_input_tokens_seen": 338773750, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.10827637, + "step": 15710, + "time_per_iteration": 2.6156980991363525 + }, + { + "auxiliary_loss_clip": 0.0111308, + "auxiliary_loss_mlp": 0.01029656, + "balance_loss_clip": 1.03881884, + "balance_loss_mlp": 1.01777053, + "epoch": 0.9445964226664663, + "flos": 24774041933280.0, + "grad_norm": 1.81527778642662, + "language_loss": 0.71606028, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.73748755, + "num_input_tokens_seen": 338792115, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11895752, + "step": 15711, + "time_per_iteration": 2.605882406234741 + }, + { + "auxiliary_loss_clip": 0.01112736, + "auxiliary_loss_mlp": 0.01026169, + "balance_loss_clip": 1.04028237, + "balance_loss_mlp": 1.0150826, + "epoch": 0.9446565459191342, + "flos": 32208915228480.0, + "grad_norm": 1.7574240910422345, + "language_loss": 0.68810904, + "learning_rate": 3.200982089323179e-08, + "loss": 0.70949817, + "num_input_tokens_seen": 338812480, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11083984, + "step": 15712, + "time_per_iteration": 2.641295909881592 + }, + { + "auxiliary_loss_clip": 0.01114386, + "auxiliary_loss_mlp": 0.01033515, + "balance_loss_clip": 1.03977978, + "balance_loss_mlp": 1.02123046, + "epoch": 0.9447166691718022, + "flos": 20187653721120.0, + "grad_norm": 15.398183040019047, + "language_loss": 0.70633221, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.72781122, + "num_input_tokens_seen": 338829105, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.12280273, + "step": 15713, + "time_per_iteration": 2.6140151023864746 + }, + { + "auxiliary_loss_clip": 0.01105836, + "auxiliary_loss_mlp": 0.01035781, + "balance_loss_clip": 1.0364995, + "balance_loss_mlp": 1.02411044, + "epoch": 0.9447767924244702, + "flos": 35587776336480.0, + "grad_norm": 1.960865028877255, + "language_loss": 0.7653693, + "learning_rate": 3.187116945125212e-08, + "loss": 0.78678542, + "num_input_tokens_seen": 338850670, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.11676025, + "step": 15714, + "time_per_iteration": 2.6594624519348145 + }, + { + "auxiliary_loss_clip": 0.01109302, + "auxiliary_loss_mlp": 0.01030239, + "balance_loss_clip": 1.03630316, + "balance_loss_mlp": 1.01862192, + "epoch": 0.9448369156771381, + "flos": 23519764238400.0, + "grad_norm": 2.158493679808563, + "language_loss": 0.6757046, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.69710004, + "num_input_tokens_seen": 338867795, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11602783, + "step": 15715, + "time_per_iteration": 2.6036229133605957 + }, + { + "auxiliary_loss_clip": 0.01110678, + "auxiliary_loss_mlp": 0.01031153, + "balance_loss_clip": 1.03744829, + "balance_loss_mlp": 1.01914227, + "epoch": 0.9448970389298061, + "flos": 29091352474080.0, + "grad_norm": 2.0464492693390377, + "language_loss": 0.74641311, + "learning_rate": 3.173281653583948e-08, + "loss": 0.76783144, + "num_input_tokens_seen": 338887205, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.12011719, + "step": 15716, + "time_per_iteration": 2.6383719444274902 + }, + { + "auxiliary_loss_clip": 0.01114135, + "auxiliary_loss_mlp": 0.01028206, + "balance_loss_clip": 1.04147673, + "balance_loss_mlp": 1.01663041, + "epoch": 0.944957162182474, + "flos": 27311034103200.0, + "grad_norm": 1.8822706847217336, + "language_loss": 0.62599754, + "learning_rate": 3.166375203215565e-08, + "loss": 0.647421, + "num_input_tokens_seen": 338906130, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11566162, + "step": 15717, + "time_per_iteration": 4.14451789855957 + }, + { + "auxiliary_loss_clip": 0.01109613, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.0375855, + "balance_loss_mlp": 1.02062678, + "epoch": 0.9450172854351421, + "flos": 21211622431200.0, + "grad_norm": 1.7164275331561, + "language_loss": 0.79262698, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.81403995, + "num_input_tokens_seen": 338923045, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11047363, + "step": 15718, + "time_per_iteration": 2.604348659515381 + }, + { + "auxiliary_loss_clip": 0.01027119, + "auxiliary_loss_mlp": 0.0100134, + "balance_loss_clip": 1.00476444, + "balance_loss_mlp": 1.00044203, + "epoch": 0.94507740868781, + "flos": 83542787298720.0, + "grad_norm": 0.6958459854486949, + "language_loss": 0.57832849, + "learning_rate": 3.152584694592719e-08, + "loss": 0.59861314, + "num_input_tokens_seen": 338987545, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00897217, + "step": 15719, + "time_per_iteration": 3.292724847793579 + }, + { + "auxiliary_loss_clip": 0.0111018, + "auxiliary_loss_mlp": 0.01030123, + "balance_loss_clip": 1.03732014, + "balance_loss_mlp": 1.01868534, + "epoch": 0.945137531940478, + "flos": 25798091677920.0, + "grad_norm": 4.1209579565197085, + "language_loss": 0.75685889, + "learning_rate": 3.145700636861193e-08, + "loss": 0.7782619, + "num_input_tokens_seen": 339007830, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11450195, + "step": 15720, + "time_per_iteration": 2.604100227355957 + }, + { + "auxiliary_loss_clip": 0.0110633, + "auxiliary_loss_mlp": 0.01027855, + "balance_loss_clip": 1.03582191, + "balance_loss_mlp": 1.01787174, + "epoch": 0.9451976551931459, + "flos": 29936746415520.0, + "grad_norm": 2.1130130412311554, + "language_loss": 0.72727704, + "learning_rate": 3.138824043864452e-08, + "loss": 0.74861896, + "num_input_tokens_seen": 339028980, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.09991455, + "step": 15721, + "time_per_iteration": 2.6744894981384277 + }, + { + "auxiliary_loss_clip": 0.01109844, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.03790033, + "balance_loss_mlp": 1.02337754, + "epoch": 0.9452577784458139, + "flos": 28602459793440.0, + "grad_norm": 1.8635770316532683, + "language_loss": 0.85045224, + "learning_rate": 3.131954915863244e-08, + "loss": 0.8718968, + "num_input_tokens_seen": 339047950, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11242676, + "step": 15722, + "time_per_iteration": 2.6573357582092285 + }, + { + "auxiliary_loss_clip": 0.01027192, + "auxiliary_loss_mlp": 0.01001739, + "balance_loss_clip": 1.00481844, + "balance_loss_mlp": 1.00084245, + "epoch": 0.9453179016984818, + "flos": 63470270986560.0, + "grad_norm": 0.8912917644996221, + "language_loss": 0.64443266, + "learning_rate": 3.125093253118005e-08, + "loss": 0.66472203, + "num_input_tokens_seen": 339104535, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.00895691, + "step": 15723, + "time_per_iteration": 3.2399868965148926 + }, + { + "auxiliary_loss_clip": 0.01111635, + "auxiliary_loss_mlp": 0.01026272, + "balance_loss_clip": 1.03833628, + "balance_loss_mlp": 1.01485157, + "epoch": 0.9453780249511499, + "flos": 16439893064640.0, + "grad_norm": 2.8611322821507845, + "language_loss": 0.73209584, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.75347489, + "num_input_tokens_seen": 339122050, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11419678, + "step": 15724, + "time_per_iteration": 2.6387360095977783 + }, + { + "auxiliary_loss_clip": 0.0110857, + "auxiliary_loss_mlp": 0.0102584, + "balance_loss_clip": 1.03701258, + "balance_loss_mlp": 1.01508141, + "epoch": 0.9454381482038178, + "flos": 28380254264640.0, + "grad_norm": 2.364725524363571, + "language_loss": 0.84509474, + "learning_rate": 3.111392324436024e-08, + "loss": 0.86643887, + "num_input_tokens_seen": 339138940, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10760498, + "step": 15725, + "time_per_iteration": 2.887249708175659 + }, + { + "auxiliary_loss_clip": 0.01109372, + "auxiliary_loss_mlp": 0.0102603, + "balance_loss_clip": 1.03747475, + "balance_loss_mlp": 1.01491344, + "epoch": 0.9454982714564858, + "flos": 23790057428160.0, + "grad_norm": 1.886654509226049, + "language_loss": 0.71088898, + "learning_rate": 3.104553059018822e-08, + "loss": 0.73224306, + "num_input_tokens_seen": 339158245, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11114502, + "step": 15726, + "time_per_iteration": 2.671566963195801 + }, + { + "auxiliary_loss_clip": 0.0110864, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.03685927, + "balance_loss_mlp": 1.01740801, + "epoch": 0.9455583947091538, + "flos": 28379808574560.0, + "grad_norm": 1.7618153002278905, + "language_loss": 0.60801184, + "learning_rate": 3.097721259896735e-08, + "loss": 0.62939268, + "num_input_tokens_seen": 339178200, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.12036133, + "step": 15727, + "time_per_iteration": 2.6762804985046387 + }, + { + "auxiliary_loss_clip": 0.01103872, + "auxiliary_loss_mlp": 0.01027387, + "balance_loss_clip": 1.03481627, + "balance_loss_mlp": 1.0170815, + "epoch": 0.9456185179618217, + "flos": 21564720240480.0, + "grad_norm": 2.218348615802093, + "language_loss": 0.81986523, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.84117782, + "num_input_tokens_seen": 339193950, + "router_z_loss_clip": 0.69140625, + "router_z_loss_mlp": 0.10302734, + "step": 15728, + "time_per_iteration": 4.038335800170898 + }, + { + "auxiliary_loss_clip": 0.01027266, + "auxiliary_loss_mlp": 0.01001625, + "balance_loss_clip": 1.00491786, + "balance_loss_mlp": 1.00069308, + "epoch": 0.9456786412144897, + "flos": 74939115068640.0, + "grad_norm": 0.7337050140876101, + "language_loss": 0.58981359, + "learning_rate": 3.08408006157368e-08, + "loss": 0.61010253, + "num_input_tokens_seen": 339252330, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00930786, + "step": 15729, + "time_per_iteration": 4.560804843902588 + }, + { + "auxiliary_loss_clip": 0.01107142, + "auxiliary_loss_mlp": 0.01023419, + "balance_loss_clip": 1.03676808, + "balance_loss_mlp": 1.01219583, + "epoch": 0.9457387644671577, + "flos": 22681379786400.0, + "grad_norm": 2.03829528156011, + "language_loss": 0.76591551, + "learning_rate": 3.077270662890052e-08, + "loss": 0.78722119, + "num_input_tokens_seen": 339270325, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11230469, + "step": 15730, + "time_per_iteration": 2.590599298477173 + }, + { + "auxiliary_loss_clip": 0.01107811, + "auxiliary_loss_mlp": 0.01033793, + "balance_loss_clip": 1.03543139, + "balance_loss_mlp": 1.02205658, + "epoch": 0.9457988877198257, + "flos": 25756446264480.0, + "grad_norm": 1.4909669626923405, + "language_loss": 0.62261069, + "learning_rate": 3.070468731536047e-08, + "loss": 0.6440267, + "num_input_tokens_seen": 339291980, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11730957, + "step": 15731, + "time_per_iteration": 2.6877644062042236 + }, + { + "auxiliary_loss_clip": 0.01110889, + "auxiliary_loss_mlp": 0.01027609, + "balance_loss_clip": 1.03701758, + "balance_loss_mlp": 1.01605797, + "epoch": 0.9458590109724936, + "flos": 32565619075680.0, + "grad_norm": 1.7742353759703926, + "language_loss": 0.63797945, + "learning_rate": 3.063674267769589e-08, + "loss": 0.65936446, + "num_input_tokens_seen": 339311795, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11541748, + "step": 15732, + "time_per_iteration": 2.6360678672790527 + }, + { + "auxiliary_loss_clip": 0.01113663, + "auxiliary_loss_mlp": 0.01025491, + "balance_loss_clip": 1.03752875, + "balance_loss_mlp": 1.01376653, + "epoch": 0.9459191342251616, + "flos": 22770707688000.0, + "grad_norm": 1.9178751567073924, + "language_loss": 0.84403872, + "learning_rate": 3.056887271848363e-08, + "loss": 0.8654303, + "num_input_tokens_seen": 339327745, + "router_z_loss_clip": 0.76123047, + "router_z_loss_mlp": 0.1171875, + "step": 15733, + "time_per_iteration": 2.6194679737091064 + }, + { + "auxiliary_loss_clip": 0.01106611, + "auxiliary_loss_mlp": 0.01030149, + "balance_loss_clip": 1.03715599, + "balance_loss_mlp": 1.020082, + "epoch": 0.9459792574778295, + "flos": 28551576440160.0, + "grad_norm": 1.4922489299766728, + "language_loss": 0.72176993, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.74313748, + "num_input_tokens_seen": 339346445, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10070801, + "step": 15734, + "time_per_iteration": 2.6174328327178955 + }, + { + "auxiliary_loss_clip": 0.0110209, + "auxiliary_loss_mlp": 0.01028154, + "balance_loss_clip": 1.03429186, + "balance_loss_mlp": 1.01852798, + "epoch": 0.9460393807304975, + "flos": 29764289756160.0, + "grad_norm": 1.503007349089794, + "language_loss": 0.86915362, + "learning_rate": 3.043335684570692e-08, + "loss": 0.89045608, + "num_input_tokens_seen": 339367945, + "router_z_loss_clip": 0.67773438, + "router_z_loss_mlp": 0.09631348, + "step": 15735, + "time_per_iteration": 2.6579606533050537 + }, + { + "auxiliary_loss_clip": 0.01108676, + "auxiliary_loss_mlp": 0.01028991, + "balance_loss_clip": 1.03684556, + "balance_loss_mlp": 1.01860166, + "epoch": 0.9460995039831654, + "flos": 26771096000160.0, + "grad_norm": 1.8122108579712997, + "language_loss": 0.66999477, + "learning_rate": 3.036571093728102e-08, + "loss": 0.69137138, + "num_input_tokens_seen": 339386060, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.1038208, + "step": 15736, + "time_per_iteration": 2.5928521156311035 + }, + { + "auxiliary_loss_clip": 0.01027264, + "auxiliary_loss_mlp": 0.01001756, + "balance_loss_clip": 1.00490022, + "balance_loss_mlp": 1.00086367, + "epoch": 0.9461596272358335, + "flos": 85808189725920.0, + "grad_norm": 0.8726748504332412, + "language_loss": 0.65288883, + "learning_rate": 3.029813971758499e-08, + "loss": 0.67317909, + "num_input_tokens_seen": 339446695, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00892639, + "step": 15737, + "time_per_iteration": 3.299790382385254 + }, + { + "auxiliary_loss_clip": 0.01027536, + "auxiliary_loss_mlp": 0.01001632, + "balance_loss_clip": 1.00521398, + "balance_loss_mlp": 1.00070167, + "epoch": 0.9462197504885014, + "flos": 71491265733600.0, + "grad_norm": 0.8025753538179627, + "language_loss": 0.58770299, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.60799468, + "num_input_tokens_seen": 339510080, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.0092926, + "step": 15738, + "time_per_iteration": 3.2171387672424316 + }, + { + "auxiliary_loss_clip": 0.01106932, + "auxiliary_loss_mlp": 0.01030554, + "balance_loss_clip": 1.03655243, + "balance_loss_mlp": 1.02065313, + "epoch": 0.9462798737411694, + "flos": 28594761510240.0, + "grad_norm": 1.653708857616519, + "language_loss": 0.71771008, + "learning_rate": 3.016322135462834e-08, + "loss": 0.73908496, + "num_input_tokens_seen": 339529335, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.09906006, + "step": 15739, + "time_per_iteration": 2.6484858989715576 + }, + { + "auxiliary_loss_clip": 0.01109618, + "auxiliary_loss_mlp": 0.01027861, + "balance_loss_clip": 1.03699136, + "balance_loss_mlp": 1.01694131, + "epoch": 0.9463399969938374, + "flos": 30561596036640.0, + "grad_norm": 2.5065771565178623, + "language_loss": 0.6463756, + "learning_rate": 3.009587421648363e-08, + "loss": 0.66775042, + "num_input_tokens_seen": 339548820, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10925293, + "step": 15740, + "time_per_iteration": 2.714813470840454 + }, + { + "auxiliary_loss_clip": 0.01106839, + "auxiliary_loss_mlp": 0.01029384, + "balance_loss_clip": 1.03693509, + "balance_loss_mlp": 1.01885176, + "epoch": 0.9464001202465053, + "flos": 29715391749600.0, + "grad_norm": 1.6892468079306069, + "language_loss": 0.66562623, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.68698847, + "num_input_tokens_seen": 339566775, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10540771, + "step": 15741, + "time_per_iteration": 3.9911649227142334 + }, + { + "auxiliary_loss_clip": 0.01109933, + "auxiliary_loss_mlp": 0.01023342, + "balance_loss_clip": 1.03786778, + "balance_loss_mlp": 1.01258969, + "epoch": 0.9464602434991733, + "flos": 20945016313920.0, + "grad_norm": 4.159583480903439, + "language_loss": 0.75836194, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.77969474, + "num_input_tokens_seen": 339581905, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10748291, + "step": 15742, + "time_per_iteration": 2.5870234966278076 + }, + { + "auxiliary_loss_clip": 0.01107658, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.03736913, + "balance_loss_mlp": 1.01919436, + "epoch": 0.9465203667518413, + "flos": 24328455874560.0, + "grad_norm": 1.914664793596821, + "language_loss": 0.72423989, + "learning_rate": 2.989428100602187e-08, + "loss": 0.74561214, + "num_input_tokens_seen": 339599870, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10369873, + "step": 15743, + "time_per_iteration": 2.6586623191833496 + }, + { + "auxiliary_loss_clip": 0.01111797, + "auxiliary_loss_mlp": 0.01027422, + "balance_loss_clip": 1.0380379, + "balance_loss_mlp": 1.01625228, + "epoch": 0.9465804900045093, + "flos": 24550823472480.0, + "grad_norm": 2.0538468019076306, + "language_loss": 0.79721677, + "learning_rate": 2.982723267901943e-08, + "loss": 0.818609, + "num_input_tokens_seen": 339620250, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11169434, + "step": 15744, + "time_per_iteration": 2.644500255584717 + }, + { + "auxiliary_loss_clip": 0.01112032, + "auxiliary_loss_mlp": 0.01034807, + "balance_loss_clip": 1.03783393, + "balance_loss_mlp": 1.02335083, + "epoch": 0.9466406132571772, + "flos": 29177560545120.0, + "grad_norm": 1.6763558034155268, + "language_loss": 0.78246707, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.80393547, + "num_input_tokens_seen": 339639900, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11456299, + "step": 15745, + "time_per_iteration": 2.6872920989990234 + }, + { + "auxiliary_loss_clip": 0.01110894, + "auxiliary_loss_mlp": 0.01031856, + "balance_loss_clip": 1.03674912, + "balance_loss_mlp": 1.02064419, + "epoch": 0.9467007365098452, + "flos": 24322945524480.0, + "grad_norm": 2.6720517656596514, + "language_loss": 0.70384175, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.7252692, + "num_input_tokens_seen": 339658970, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.11206055, + "step": 15746, + "time_per_iteration": 2.6288537979125977 + }, + { + "auxiliary_loss_clip": 0.01109153, + "auxiliary_loss_mlp": 0.01025929, + "balance_loss_clip": 1.03755879, + "balance_loss_mlp": 1.01461053, + "epoch": 0.9467608597625131, + "flos": 23563273446720.0, + "grad_norm": 2.073690819303121, + "language_loss": 0.5626542, + "learning_rate": 2.962653596305964e-08, + "loss": 0.58400506, + "num_input_tokens_seen": 339675600, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11315918, + "step": 15747, + "time_per_iteration": 2.6391842365264893 + }, + { + "auxiliary_loss_clip": 0.01027348, + "auxiliary_loss_mlp": 0.01001583, + "balance_loss_clip": 1.00494099, + "balance_loss_mlp": 1.00068784, + "epoch": 0.9468209830151811, + "flos": 84963403543680.0, + "grad_norm": 0.6610413326110345, + "language_loss": 0.53238225, + "learning_rate": 2.955978648787871e-08, + "loss": 0.55267155, + "num_input_tokens_seen": 339744505, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.00894165, + "step": 15748, + "time_per_iteration": 3.4247732162475586 + }, + { + "auxiliary_loss_clip": 0.01110581, + "auxiliary_loss_mlp": 0.01033922, + "balance_loss_clip": 1.03846383, + "balance_loss_mlp": 1.02256143, + "epoch": 0.946881106267849, + "flos": 32966196786720.0, + "grad_norm": 1.614866894613967, + "language_loss": 0.65971196, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.68115699, + "num_input_tokens_seen": 339765810, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.1137085, + "step": 15749, + "time_per_iteration": 2.690325975418091 + }, + { + "auxiliary_loss_clip": 0.01110324, + "auxiliary_loss_mlp": 0.01029064, + "balance_loss_clip": 1.03704834, + "balance_loss_mlp": 1.01687443, + "epoch": 0.9469412295205171, + "flos": 24636261715200.0, + "grad_norm": 2.3202939007132524, + "language_loss": 0.76228511, + "learning_rate": 2.942651169791621e-08, + "loss": 0.78367907, + "num_input_tokens_seen": 339784125, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12200928, + "step": 15750, + "time_per_iteration": 2.626934766769409 + }, + { + "auxiliary_loss_clip": 0.011089, + "auxiliary_loss_mlp": 0.01029563, + "balance_loss_clip": 1.03846765, + "balance_loss_mlp": 1.01834583, + "epoch": 0.947001352773185, + "flos": 26020702379520.0, + "grad_norm": 1.8835162459086074, + "language_loss": 0.67963827, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.70102292, + "num_input_tokens_seen": 339803450, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11212158, + "step": 15751, + "time_per_iteration": 2.6823525428771973 + }, + { + "auxiliary_loss_clip": 0.01109106, + "auxiliary_loss_mlp": 0.010283, + "balance_loss_clip": 1.03712249, + "balance_loss_mlp": 1.0179764, + "epoch": 0.947061476025853, + "flos": 26776322729280.0, + "grad_norm": 1.7742475779796374, + "language_loss": 0.65464103, + "learning_rate": 2.929353580532723e-08, + "loss": 0.67601502, + "num_input_tokens_seen": 339823215, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10314941, + "step": 15752, + "time_per_iteration": 2.6272122859954834 + }, + { + "auxiliary_loss_clip": 0.01109087, + "auxiliary_loss_mlp": 0.01027893, + "balance_loss_clip": 1.03708625, + "balance_loss_mlp": 1.01680064, + "epoch": 0.947121599278521, + "flos": 26106181139520.0, + "grad_norm": 1.584398070204161, + "language_loss": 0.71769786, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.73906761, + "num_input_tokens_seen": 339842230, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11090088, + "step": 15753, + "time_per_iteration": 2.6040821075439453 + }, + { + "auxiliary_loss_clip": 0.01110649, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.03594553, + "balance_loss_mlp": 1.01921821, + "epoch": 0.9471817225311889, + "flos": 28157238390240.0, + "grad_norm": 1.9715759958825958, + "language_loss": 0.70038068, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.72180557, + "num_input_tokens_seen": 339861640, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.12634277, + "step": 15754, + "time_per_iteration": 2.6348438262939453 + }, + { + "auxiliary_loss_clip": 0.0111201, + "auxiliary_loss_mlp": 0.01027759, + "balance_loss_clip": 1.03741729, + "balance_loss_mlp": 1.01714909, + "epoch": 0.947241845783857, + "flos": 14532896210400.0, + "grad_norm": 2.3224490872940375, + "language_loss": 0.78980339, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.8112011, + "num_input_tokens_seen": 339878210, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.1060791, + "step": 15755, + "time_per_iteration": 2.5716006755828857 + }, + { + "auxiliary_loss_clip": 0.0111555, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.03796697, + "balance_loss_mlp": 1.02305686, + "epoch": 0.9473019690365249, + "flos": 25307821409760.0, + "grad_norm": 4.469909923123778, + "language_loss": 0.75155866, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.77307451, + "num_input_tokens_seen": 339894255, + "router_z_loss_clip": 0.77587891, + "router_z_loss_mlp": 0.12976074, + "step": 15756, + "time_per_iteration": 2.627782106399536 + }, + { + "auxiliary_loss_clip": 0.01109484, + "auxiliary_loss_mlp": 0.01027023, + "balance_loss_clip": 1.03738153, + "balance_loss_mlp": 1.01643741, + "epoch": 0.9473620922891929, + "flos": 21523966207200.0, + "grad_norm": 10.982492000765987, + "language_loss": 0.74726152, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.76862657, + "num_input_tokens_seen": 339912425, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10577393, + "step": 15757, + "time_per_iteration": 4.011837720870972 + }, + { + "auxiliary_loss_clip": 0.01113046, + "auxiliary_loss_mlp": 0.01030779, + "balance_loss_clip": 1.03814101, + "balance_loss_mlp": 1.01885223, + "epoch": 0.9474222155418608, + "flos": 28735985697120.0, + "grad_norm": 2.4126519955307684, + "language_loss": 0.79231942, + "learning_rate": 2.889640171327512e-08, + "loss": 0.81375766, + "num_input_tokens_seen": 339929635, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11938477, + "step": 15758, + "time_per_iteration": 2.6188719272613525 + }, + { + "auxiliary_loss_clip": 0.01108694, + "auxiliary_loss_mlp": 0.01028085, + "balance_loss_clip": 1.03812063, + "balance_loss_mlp": 1.0175705, + "epoch": 0.9474823387945288, + "flos": 33055403136480.0, + "grad_norm": 1.6823499144779899, + "language_loss": 0.72072303, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.74209088, + "num_input_tokens_seen": 339951200, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10516357, + "step": 15759, + "time_per_iteration": 2.652215003967285 + }, + { + "auxiliary_loss_clip": 0.01105731, + "auxiliary_loss_mlp": 0.01025383, + "balance_loss_clip": 1.03785086, + "balance_loss_mlp": 1.01581037, + "epoch": 0.9475424620471967, + "flos": 28023955590240.0, + "grad_norm": 1.5070556711183674, + "language_loss": 0.75475818, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.77606928, + "num_input_tokens_seen": 339971820, + "router_z_loss_clip": 0.67919922, + "router_z_loss_mlp": 0.09570312, + "step": 15760, + "time_per_iteration": 2.6405813694000244 + }, + { + "auxiliary_loss_clip": 0.01109619, + "auxiliary_loss_mlp": 0.0102808, + "balance_loss_clip": 1.03863382, + "balance_loss_mlp": 1.01733923, + "epoch": 0.9476025852998647, + "flos": 24462143847360.0, + "grad_norm": 1.7762562070461188, + "language_loss": 0.7265709, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.74794793, + "num_input_tokens_seen": 339989420, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10742188, + "step": 15761, + "time_per_iteration": 2.6279635429382324 + }, + { + "auxiliary_loss_clip": 0.01109209, + "auxiliary_loss_mlp": 0.01033943, + "balance_loss_clip": 1.03908932, + "balance_loss_mlp": 1.02356601, + "epoch": 0.9476627085525327, + "flos": 18272715480000.0, + "grad_norm": 2.174751131144568, + "language_loss": 0.71815324, + "learning_rate": 2.863314050734722e-08, + "loss": 0.7395848, + "num_input_tokens_seen": 340006690, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10375977, + "step": 15762, + "time_per_iteration": 2.6604998111724854 + }, + { + "auxiliary_loss_clip": 0.01113008, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.03660107, + "balance_loss_mlp": 1.02355182, + "epoch": 0.9477228318052007, + "flos": 22814905690080.0, + "grad_norm": 2.0536485021844912, + "language_loss": 0.67249292, + "learning_rate": 2.856751208570518e-08, + "loss": 0.69397885, + "num_input_tokens_seen": 340025480, + "router_z_loss_clip": 0.76367188, + "router_z_loss_mlp": 0.12036133, + "step": 15763, + "time_per_iteration": 2.639162063598633 + }, + { + "auxiliary_loss_clip": 0.01108547, + "auxiliary_loss_mlp": 0.01029318, + "balance_loss_clip": 1.03488159, + "balance_loss_mlp": 1.01811826, + "epoch": 0.9477829550578686, + "flos": 29132957370240.0, + "grad_norm": 1.7341798414043572, + "language_loss": 0.69911826, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.72049689, + "num_input_tokens_seen": 340043785, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11206055, + "step": 15764, + "time_per_iteration": 2.6140060424804688 + }, + { + "auxiliary_loss_clip": 0.01105906, + "auxiliary_loss_mlp": 0.01026746, + "balance_loss_clip": 1.03927135, + "balance_loss_mlp": 1.01722693, + "epoch": 0.9478430783105366, + "flos": 27531618940800.0, + "grad_norm": 1.6531321490406368, + "language_loss": 0.71193171, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.73325825, + "num_input_tokens_seen": 340064360, + "router_z_loss_clip": 0.66601562, + "router_z_loss_mlp": 0.09515381, + "step": 15765, + "time_per_iteration": 2.6957006454467773 + }, + { + "auxiliary_loss_clip": 0.01026993, + "auxiliary_loss_mlp": 0.01001551, + "balance_loss_clip": 1.00461793, + "balance_loss_mlp": 1.00068235, + "epoch": 0.9479032015632046, + "flos": 74260140711840.0, + "grad_norm": 0.8168426308009474, + "language_loss": 0.58945966, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.60974514, + "num_input_tokens_seen": 340114425, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.00869751, + "step": 15766, + "time_per_iteration": 3.020796298980713 + }, + { + "auxiliary_loss_clip": 0.0111104, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.03912377, + "balance_loss_mlp": 1.02301097, + "epoch": 0.9479633248158725, + "flos": 17916862495680.0, + "grad_norm": 2.2765971020365185, + "language_loss": 0.7421279, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.76357341, + "num_input_tokens_seen": 340132200, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10491943, + "step": 15767, + "time_per_iteration": 2.6185550689697266 + }, + { + "auxiliary_loss_clip": 0.01115198, + "auxiliary_loss_mlp": 0.0103622, + "balance_loss_clip": 1.04001713, + "balance_loss_mlp": 1.02395952, + "epoch": 0.9480234480685406, + "flos": 24811919239680.0, + "grad_norm": 2.5919712089670877, + "language_loss": 0.73570073, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.7572149, + "num_input_tokens_seen": 340149175, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12280273, + "step": 15768, + "time_per_iteration": 4.201109170913696 + }, + { + "auxiliary_loss_clip": 0.0102709, + "auxiliary_loss_mlp": 0.01001221, + "balance_loss_clip": 1.00470281, + "balance_loss_mlp": 1.0003283, + "epoch": 0.9480835713212085, + "flos": 85771690007040.0, + "grad_norm": 0.7657157028401383, + "language_loss": 0.55238742, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57267052, + "num_input_tokens_seen": 340208155, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.00891876, + "step": 15769, + "time_per_iteration": 4.580230712890625 + }, + { + "auxiliary_loss_clip": 0.01107344, + "auxiliary_loss_mlp": 0.01026864, + "balance_loss_clip": 1.03513932, + "balance_loss_mlp": 1.01575375, + "epoch": 0.9481436945738765, + "flos": 31053973203360.0, + "grad_norm": 1.394829134267905, + "language_loss": 0.77423, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.79557216, + "num_input_tokens_seen": 340229275, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11096191, + "step": 15770, + "time_per_iteration": 2.7467784881591797 + }, + { + "auxiliary_loss_clip": 0.0111421, + "auxiliary_loss_mlp": 0.01032705, + "balance_loss_clip": 1.04110348, + "balance_loss_mlp": 1.0207963, + "epoch": 0.9482038178265444, + "flos": 32921391025440.0, + "grad_norm": 1.934774549156461, + "language_loss": 0.80022073, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.8216899, + "num_input_tokens_seen": 340248920, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11907959, + "step": 15771, + "time_per_iteration": 2.6945226192474365 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.01026244, + "balance_loss_clip": 1.03749883, + "balance_loss_mlp": 1.0148952, + "epoch": 0.9482639410792124, + "flos": 21701609078400.0, + "grad_norm": 1.9845969504050849, + "language_loss": 0.69684041, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.71818125, + "num_input_tokens_seen": 340266775, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.11334229, + "step": 15772, + "time_per_iteration": 2.6292734146118164 + }, + { + "auxiliary_loss_clip": 0.01110668, + "auxiliary_loss_mlp": 0.01027348, + "balance_loss_clip": 1.03911376, + "balance_loss_mlp": 1.01626766, + "epoch": 0.9483240643318803, + "flos": 25620165185760.0, + "grad_norm": 1.722812473534672, + "language_loss": 0.73757195, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.75895214, + "num_input_tokens_seen": 340285295, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11077881, + "step": 15773, + "time_per_iteration": 2.668950080871582 + }, + { + "auxiliary_loss_clip": 0.01110627, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_clip": 1.03652716, + "balance_loss_mlp": 1.02262855, + "epoch": 0.9483841875845483, + "flos": 24505693572960.0, + "grad_norm": 2.5326838317782387, + "language_loss": 0.62838316, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.64984095, + "num_input_tokens_seen": 340304265, + "router_z_loss_clip": 0.74169922, + "router_z_loss_mlp": 0.12536621, + "step": 15774, + "time_per_iteration": 2.624147415161133 + }, + { + "auxiliary_loss_clip": 0.0110902, + "auxiliary_loss_mlp": 0.0103165, + "balance_loss_clip": 1.03603375, + "balance_loss_mlp": 1.02000296, + "epoch": 0.9484443108372163, + "flos": 25396865690400.0, + "grad_norm": 1.743934237233606, + "language_loss": 0.59180391, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.61321068, + "num_input_tokens_seen": 340323690, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11645508, + "step": 15775, + "time_per_iteration": 2.7886970043182373 + }, + { + "auxiliary_loss_clip": 0.01111652, + "auxiliary_loss_mlp": 0.01026924, + "balance_loss_clip": 1.03862834, + "balance_loss_mlp": 1.01519418, + "epoch": 0.9485044340898843, + "flos": 44452179678240.0, + "grad_norm": 1.6531009244187083, + "language_loss": 0.61722237, + "learning_rate": 2.772114638584555e-08, + "loss": 0.6386081, + "num_input_tokens_seen": 340345830, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11743164, + "step": 15776, + "time_per_iteration": 2.7563045024871826 + }, + { + "auxiliary_loss_clip": 0.01108582, + "auxiliary_loss_mlp": 0.01028041, + "balance_loss_clip": 1.03589976, + "balance_loss_mlp": 1.01638818, + "epoch": 0.9485645573425522, + "flos": 27579706601760.0, + "grad_norm": 1.9191817830222717, + "language_loss": 0.73157215, + "learning_rate": 2.765656478622458e-08, + "loss": 0.75293839, + "num_input_tokens_seen": 340365910, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.11657715, + "step": 15777, + "time_per_iteration": 2.7270071506500244 + }, + { + "auxiliary_loss_clip": 0.01119889, + "auxiliary_loss_mlp": 0.01036389, + "balance_loss_clip": 1.04130793, + "balance_loss_mlp": 1.02415264, + "epoch": 0.9486246805952202, + "flos": 26866217872800.0, + "grad_norm": 14.572803908608877, + "language_loss": 0.72463757, + "learning_rate": 2.759205797806441e-08, + "loss": 0.74620032, + "num_input_tokens_seen": 340383935, + "router_z_loss_clip": 0.78564453, + "router_z_loss_mlp": 0.12243652, + "step": 15778, + "time_per_iteration": 2.5936925411224365 + }, + { + "auxiliary_loss_clip": 0.01104928, + "auxiliary_loss_mlp": 0.0102958, + "balance_loss_clip": 1.03776634, + "balance_loss_mlp": 1.01995361, + "epoch": 0.9486848038478882, + "flos": 20143577270880.0, + "grad_norm": 1.8778549577416006, + "language_loss": 0.70098168, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.72232676, + "num_input_tokens_seen": 340402760, + "router_z_loss_clip": 0.67138672, + "router_z_loss_mlp": 0.09625244, + "step": 15779, + "time_per_iteration": 2.596003293991089 + }, + { + "auxiliary_loss_clip": 0.01110119, + "auxiliary_loss_mlp": 0.01029509, + "balance_loss_clip": 1.03804755, + "balance_loss_mlp": 1.01817775, + "epoch": 0.9487449271005561, + "flos": 23481967966560.0, + "grad_norm": 2.286941946610088, + "language_loss": 0.78007269, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.80146891, + "num_input_tokens_seen": 340422105, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11340332, + "step": 15780, + "time_per_iteration": 2.5767269134521484 + }, + { + "auxiliary_loss_clip": 0.0111032, + "auxiliary_loss_mlp": 0.01030066, + "balance_loss_clip": 1.03863335, + "balance_loss_mlp": 1.01894939, + "epoch": 0.9488050503532242, + "flos": 26554927546080.0, + "grad_norm": 2.0399985989418488, + "language_loss": 0.66325963, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.68466347, + "num_input_tokens_seen": 340441160, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11108398, + "step": 15781, + "time_per_iteration": 3.9573214054107666 + }, + { + "auxiliary_loss_clip": 0.01108159, + "auxiliary_loss_mlp": 0.01029747, + "balance_loss_clip": 1.0371362, + "balance_loss_mlp": 1.01894069, + "epoch": 0.9488651736058921, + "flos": 22413639185280.0, + "grad_norm": 2.91249878295373, + "language_loss": 0.79858243, + "learning_rate": 2.733477870890999e-08, + "loss": 0.81996149, + "num_input_tokens_seen": 340458200, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10803223, + "step": 15782, + "time_per_iteration": 2.596651077270508 + }, + { + "auxiliary_loss_clip": 0.01027038, + "auxiliary_loss_mlp": 0.01000758, + "balance_loss_clip": 1.00460339, + "balance_loss_mlp": 0.9998768, + "epoch": 0.9489252968585601, + "flos": 85515699417120.0, + "grad_norm": 0.715774344890833, + "language_loss": 0.59736383, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.61764181, + "num_input_tokens_seen": 340526420, + "router_z_loss_clip": 0.22424316, + "router_z_loss_mlp": 0.00881195, + "step": 15783, + "time_per_iteration": 3.334455966949463 + }, + { + "auxiliary_loss_clip": 0.01109032, + "auxiliary_loss_mlp": 0.01032571, + "balance_loss_clip": 1.03589296, + "balance_loss_mlp": 1.02135348, + "epoch": 0.948985420111228, + "flos": 33989800841280.0, + "grad_norm": 1.81141298955318, + "language_loss": 0.74180102, + "learning_rate": 2.720658788656105e-08, + "loss": 0.76321703, + "num_input_tokens_seen": 340546325, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11218262, + "step": 15784, + "time_per_iteration": 2.6498541831970215 + }, + { + "auxiliary_loss_clip": 0.01109151, + "auxiliary_loss_mlp": 0.01029535, + "balance_loss_clip": 1.03636861, + "balance_loss_mlp": 1.01805484, + "epoch": 0.949045543363896, + "flos": 29670788574720.0, + "grad_norm": 1.9854577173315646, + "language_loss": 0.69699222, + "learning_rate": 2.714260468695806e-08, + "loss": 0.71837908, + "num_input_tokens_seen": 340565145, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11480713, + "step": 15785, + "time_per_iteration": 2.6865553855895996 + }, + { + "auxiliary_loss_clip": 0.01109997, + "auxiliary_loss_mlp": 0.0102807, + "balance_loss_clip": 1.03608358, + "balance_loss_mlp": 1.0173229, + "epoch": 0.9491056666165639, + "flos": 29580164120160.0, + "grad_norm": 1.6135517738277387, + "language_loss": 0.76082492, + "learning_rate": 2.707869629830495e-08, + "loss": 0.78220558, + "num_input_tokens_seen": 340585465, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.10736084, + "step": 15786, + "time_per_iteration": 2.642953634262085 + }, + { + "auxiliary_loss_clip": 0.01109432, + "auxiliary_loss_mlp": 0.01030651, + "balance_loss_clip": 1.03775668, + "balance_loss_mlp": 1.02032709, + "epoch": 0.949165789869232, + "flos": 29933748136800.0, + "grad_norm": 1.6605234170804446, + "language_loss": 0.78863001, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.81003088, + "num_input_tokens_seen": 340606010, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10327148, + "step": 15787, + "time_per_iteration": 2.698633909225464 + }, + { + "auxiliary_loss_clip": 0.01109159, + "auxiliary_loss_mlp": 0.01025452, + "balance_loss_clip": 1.03978598, + "balance_loss_mlp": 1.01486611, + "epoch": 0.9492259131218999, + "flos": 27131851575360.0, + "grad_norm": 1.8554368811230617, + "language_loss": 0.76420677, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.78555298, + "num_input_tokens_seen": 340626135, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.105896, + "step": 15788, + "time_per_iteration": 2.643458604812622 + }, + { + "auxiliary_loss_clip": 0.0111042, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.03677344, + "balance_loss_mlp": 1.01846099, + "epoch": 0.9492860363745679, + "flos": 28023712486560.0, + "grad_norm": 1.7465434906534976, + "language_loss": 0.715886, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.73729289, + "num_input_tokens_seen": 340644870, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11810303, + "step": 15789, + "time_per_iteration": 2.7374565601348877 + }, + { + "auxiliary_loss_clip": 0.01107547, + "auxiliary_loss_mlp": 0.01027847, + "balance_loss_clip": 1.0375911, + "balance_loss_mlp": 1.01655757, + "epoch": 0.9493461596272358, + "flos": 22415178841920.0, + "grad_norm": 4.324988008601962, + "language_loss": 0.73650581, + "learning_rate": 2.682381090161989e-08, + "loss": 0.75785977, + "num_input_tokens_seen": 340663695, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.11291504, + "step": 15790, + "time_per_iteration": 2.6422364711761475 + }, + { + "auxiliary_loss_clip": 0.01111716, + "auxiliary_loss_mlp": 0.01031995, + "balance_loss_clip": 1.0370822, + "balance_loss_mlp": 1.02021718, + "epoch": 0.9494062828799038, + "flos": 24418229466240.0, + "grad_norm": 1.765891906940901, + "language_loss": 0.77621496, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.79765207, + "num_input_tokens_seen": 340682970, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11779785, + "step": 15791, + "time_per_iteration": 2.6944518089294434 + }, + { + "auxiliary_loss_clip": 0.01114008, + "auxiliary_loss_mlp": 0.0102776, + "balance_loss_clip": 1.03910959, + "balance_loss_mlp": 1.01590466, + "epoch": 0.9494664061325718, + "flos": 33225469276320.0, + "grad_norm": 2.123509122035129, + "language_loss": 0.73827839, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.75969601, + "num_input_tokens_seen": 340702275, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.11853027, + "step": 15792, + "time_per_iteration": 2.682591199874878 + }, + { + "auxiliary_loss_clip": 0.01108483, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.0369699, + "balance_loss_mlp": 1.0236479, + "epoch": 0.9495265293852397, + "flos": 22414652117280.0, + "grad_norm": 2.00398112543458, + "language_loss": 0.7833783, + "learning_rate": 2.663343248754679e-08, + "loss": 0.80480939, + "num_input_tokens_seen": 340719060, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10968018, + "step": 15793, + "time_per_iteration": 2.622732162475586 + }, + { + "auxiliary_loss_clip": 0.01108654, + "auxiliary_loss_mlp": 0.01028584, + "balance_loss_clip": 1.03675592, + "balance_loss_mlp": 1.01808143, + "epoch": 0.9495866526379078, + "flos": 28157278907520.0, + "grad_norm": 1.6714608503657296, + "language_loss": 0.77544832, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.7968207, + "num_input_tokens_seen": 340737815, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10498047, + "step": 15794, + "time_per_iteration": 2.688056707382202 + }, + { + "auxiliary_loss_clip": 0.01112182, + "auxiliary_loss_mlp": 0.01030346, + "balance_loss_clip": 1.03763616, + "balance_loss_mlp": 1.01785326, + "epoch": 0.9496467758905757, + "flos": 21389792027040.0, + "grad_norm": 3.9119550418656335, + "language_loss": 0.61039352, + "learning_rate": 2.650688769211107e-08, + "loss": 0.63181877, + "num_input_tokens_seen": 340756035, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12493896, + "step": 15795, + "time_per_iteration": 2.6908628940582275 + }, + { + "auxiliary_loss_clip": 0.01107667, + "auxiliary_loss_mlp": 0.01032672, + "balance_loss_clip": 1.03802836, + "balance_loss_mlp": 1.02136517, + "epoch": 0.9497068991432437, + "flos": 29448502011360.0, + "grad_norm": 2.2263028167182677, + "language_loss": 0.79126787, + "learning_rate": 2.644372754577895e-08, + "loss": 0.8126713, + "num_input_tokens_seen": 340775620, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.11315918, + "step": 15796, + "time_per_iteration": 4.113338232040405 + }, + { + "auxiliary_loss_clip": 0.01109373, + "auxiliary_loss_mlp": 0.01027972, + "balance_loss_clip": 1.03660107, + "balance_loss_mlp": 1.01624739, + "epoch": 0.9497670223959116, + "flos": 24774082450560.0, + "grad_norm": 9.747816029599388, + "language_loss": 0.75596827, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.77734172, + "num_input_tokens_seen": 340794510, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11730957, + "step": 15797, + "time_per_iteration": 2.621526002883911 + }, + { + "auxiliary_loss_clip": 0.01111271, + "auxiliary_loss_mlp": 0.01030267, + "balance_loss_clip": 1.03906989, + "balance_loss_mlp": 1.01908565, + "epoch": 0.9498271456485796, + "flos": 16714926776160.0, + "grad_norm": 3.035056858181428, + "language_loss": 0.65872931, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.68014467, + "num_input_tokens_seen": 340812955, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11175537, + "step": 15798, + "time_per_iteration": 2.6138978004455566 + }, + { + "auxiliary_loss_clip": 0.0111219, + "auxiliary_loss_mlp": 0.01029293, + "balance_loss_clip": 1.03856421, + "balance_loss_mlp": 1.01895761, + "epoch": 0.9498872689012475, + "flos": 25397270863200.0, + "grad_norm": 2.040498946120244, + "language_loss": 0.77579558, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.79721045, + "num_input_tokens_seen": 340829200, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.10339355, + "step": 15799, + "time_per_iteration": 2.6415908336639404 + }, + { + "auxiliary_loss_clip": 0.01107603, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.03767836, + "balance_loss_mlp": 1.01882768, + "epoch": 0.9499473921539155, + "flos": 25664160601440.0, + "grad_norm": 1.950654526069986, + "language_loss": 0.70954883, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.73091495, + "num_input_tokens_seen": 340848035, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10174561, + "step": 15800, + "time_per_iteration": 2.629279375076294 + }, + { + "auxiliary_loss_clip": 0.01105847, + "auxiliary_loss_mlp": 0.01026016, + "balance_loss_clip": 1.03557062, + "balance_loss_mlp": 1.0147326, + "epoch": 0.9500075154065835, + "flos": 25622434153440.0, + "grad_norm": 1.662852160744883, + "language_loss": 0.71361554, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.73493421, + "num_input_tokens_seen": 340870025, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.112854, + "step": 15801, + "time_per_iteration": 2.6839540004730225 + }, + { + "auxiliary_loss_clip": 0.01109975, + "auxiliary_loss_mlp": 0.01030124, + "balance_loss_clip": 1.03867507, + "balance_loss_mlp": 1.01950836, + "epoch": 0.9500676386592515, + "flos": 30653922216960.0, + "grad_norm": 1.637775394029683, + "language_loss": 0.81107593, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.83247691, + "num_input_tokens_seen": 340892290, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10620117, + "step": 15802, + "time_per_iteration": 2.6759159564971924 + }, + { + "auxiliary_loss_clip": 0.01111442, + "auxiliary_loss_mlp": 0.01030277, + "balance_loss_clip": 1.03855598, + "balance_loss_mlp": 1.01899362, + "epoch": 0.9501277619119194, + "flos": 33585211919520.0, + "grad_norm": 1.8367310451570964, + "language_loss": 0.67667687, + "learning_rate": 2.60037021038646e-08, + "loss": 0.69809407, + "num_input_tokens_seen": 340912260, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.112854, + "step": 15803, + "time_per_iteration": 2.6586883068084717 + }, + { + "auxiliary_loss_clip": 0.01107783, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.0369904, + "balance_loss_mlp": 1.01942682, + "epoch": 0.9501878851645874, + "flos": 25397230345920.0, + "grad_norm": 1.7463412637237645, + "language_loss": 0.76009786, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.78147835, + "num_input_tokens_seen": 340928930, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10845947, + "step": 15804, + "time_per_iteration": 2.626376152038574 + }, + { + "auxiliary_loss_clip": 0.01114042, + "auxiliary_loss_mlp": 0.01030315, + "balance_loss_clip": 1.04081178, + "balance_loss_mlp": 1.01924622, + "epoch": 0.9502480084172553, + "flos": 22414165909920.0, + "grad_norm": 1.8635945104280571, + "language_loss": 0.72899312, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.75043666, + "num_input_tokens_seen": 340946615, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.1105957, + "step": 15805, + "time_per_iteration": 2.6747937202453613 + }, + { + "auxiliary_loss_clip": 0.01110834, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.03844833, + "balance_loss_mlp": 1.02162075, + "epoch": 0.9503081316699233, + "flos": 28735540007040.0, + "grad_norm": 1.597262458202604, + "language_loss": 0.80227339, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.82370627, + "num_input_tokens_seen": 340967545, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.1083374, + "step": 15806, + "time_per_iteration": 2.650723457336426 + }, + { + "auxiliary_loss_clip": 0.01112473, + "auxiliary_loss_mlp": 0.01028584, + "balance_loss_clip": 1.03911543, + "balance_loss_mlp": 1.01780736, + "epoch": 0.9503682549225914, + "flos": 22012818370560.0, + "grad_norm": 4.521051729866437, + "language_loss": 0.82440698, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.84581757, + "num_input_tokens_seen": 340984955, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.10772705, + "step": 15807, + "time_per_iteration": 4.096617937088013 + }, + { + "auxiliary_loss_clip": 0.01105873, + "auxiliary_loss_mlp": 0.01028932, + "balance_loss_clip": 1.0347898, + "balance_loss_mlp": 1.01802993, + "epoch": 0.9504283781752593, + "flos": 31586618196000.0, + "grad_norm": 1.8368254288080215, + "language_loss": 0.71457624, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.7359243, + "num_input_tokens_seen": 341007300, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10900879, + "step": 15808, + "time_per_iteration": 2.6449992656707764 + }, + { + "auxiliary_loss_clip": 0.01110354, + "auxiliary_loss_mlp": 0.01027143, + "balance_loss_clip": 1.03894317, + "balance_loss_mlp": 1.01605082, + "epoch": 0.9504885014279273, + "flos": 26996178255840.0, + "grad_norm": 1.5626004753996927, + "language_loss": 0.70017785, + "learning_rate": 2.562945671948058e-08, + "loss": 0.72155273, + "num_input_tokens_seen": 341026695, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.11083984, + "step": 15809, + "time_per_iteration": 3.9939544200897217 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.0102678, + "balance_loss_clip": 1.03568494, + "balance_loss_mlp": 1.01542532, + "epoch": 0.9505486246805952, + "flos": 26375947604640.0, + "grad_norm": 1.690010432335258, + "language_loss": 0.75533152, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.77667773, + "num_input_tokens_seen": 341047080, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11358643, + "step": 15810, + "time_per_iteration": 2.6318533420562744 + }, + { + "auxiliary_loss_clip": 0.01108567, + "auxiliary_loss_mlp": 0.0103801, + "balance_loss_clip": 1.03628957, + "balance_loss_mlp": 1.026173, + "epoch": 0.9506087479332632, + "flos": 27489163181760.0, + "grad_norm": 1.4314384863721967, + "language_loss": 0.7977531, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.81921887, + "num_input_tokens_seen": 341067310, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11834717, + "step": 15811, + "time_per_iteration": 2.6258721351623535 + }, + { + "auxiliary_loss_clip": 0.01108695, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.03686547, + "balance_loss_mlp": 1.02166152, + "epoch": 0.9506688711859311, + "flos": 33589425716640.0, + "grad_norm": 1.9247173575973164, + "language_loss": 0.69710743, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.71852195, + "num_input_tokens_seen": 341085110, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11096191, + "step": 15812, + "time_per_iteration": 2.7103383541107178 + }, + { + "auxiliary_loss_clip": 0.01111468, + "auxiliary_loss_mlp": 0.01029283, + "balance_loss_clip": 1.03824568, + "balance_loss_mlp": 1.01833999, + "epoch": 0.9507289944385992, + "flos": 24240140904960.0, + "grad_norm": 1.7518274596829888, + "language_loss": 0.65777439, + "learning_rate": 2.538145713158446e-08, + "loss": 0.67918187, + "num_input_tokens_seen": 341103190, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.10943604, + "step": 15813, + "time_per_iteration": 2.622678756713867 + }, + { + "auxiliary_loss_clip": 0.01110442, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.03740418, + "balance_loss_mlp": 1.02332568, + "epoch": 0.9507891176912671, + "flos": 30741102702720.0, + "grad_norm": 1.4680773476117235, + "language_loss": 0.70740014, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.72885096, + "num_input_tokens_seen": 341125695, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11309814, + "step": 15814, + "time_per_iteration": 2.7120156288146973 + }, + { + "auxiliary_loss_clip": 0.01107349, + "auxiliary_loss_mlp": 0.01026108, + "balance_loss_clip": 1.03692913, + "balance_loss_mlp": 1.01528335, + "epoch": 0.9508492409439351, + "flos": 30382859198880.0, + "grad_norm": 2.001081445736412, + "language_loss": 0.63140237, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.65273696, + "num_input_tokens_seen": 341143930, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10827637, + "step": 15815, + "time_per_iteration": 2.621624231338501 + }, + { + "auxiliary_loss_clip": 0.01106537, + "auxiliary_loss_mlp": 0.01023812, + "balance_loss_clip": 1.03483033, + "balance_loss_mlp": 1.01369715, + "epoch": 0.950909364196603, + "flos": 36343761341760.0, + "grad_norm": 2.120738290278599, + "language_loss": 0.58868408, + "learning_rate": 2.519624364862061e-08, + "loss": 0.60998762, + "num_input_tokens_seen": 341164280, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10113525, + "step": 15816, + "time_per_iteration": 2.7205235958099365 + }, + { + "auxiliary_loss_clip": 0.01110259, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.03793025, + "balance_loss_mlp": 1.02626991, + "epoch": 0.950969487449271, + "flos": 30160977808320.0, + "grad_norm": 1.4733384035683088, + "language_loss": 0.73522621, + "learning_rate": 2.513465558735994e-08, + "loss": 0.75670356, + "num_input_tokens_seen": 341183670, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11199951, + "step": 15817, + "time_per_iteration": 2.6606497764587402 + }, + { + "auxiliary_loss_clip": 0.01112917, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.03884912, + "balance_loss_mlp": 1.02012455, + "epoch": 0.9510296107019389, + "flos": 16715210397120.0, + "grad_norm": 1.8127253204844618, + "language_loss": 0.60236931, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.62382734, + "num_input_tokens_seen": 341201900, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.12756348, + "step": 15818, + "time_per_iteration": 2.639911651611328 + }, + { + "auxiliary_loss_clip": 0.01110059, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.03851795, + "balance_loss_mlp": 1.0198586, + "epoch": 0.9510897339546069, + "flos": 21123631599840.0, + "grad_norm": 2.8014737742086164, + "language_loss": 0.69607031, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.71748334, + "num_input_tokens_seen": 341218340, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11383057, + "step": 15819, + "time_per_iteration": 2.60444712638855 + }, + { + "auxiliary_loss_clip": 0.01113473, + "auxiliary_loss_mlp": 0.01028156, + "balance_loss_clip": 1.03983974, + "balance_loss_mlp": 1.01741517, + "epoch": 0.951149857207275, + "flos": 18051117710400.0, + "grad_norm": 1.8980218075414148, + "language_loss": 0.74251413, + "learning_rate": 2.49503407354561e-08, + "loss": 0.76393044, + "num_input_tokens_seen": 341235885, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.10736084, + "step": 15820, + "time_per_iteration": 3.9099338054656982 + }, + { + "auxiliary_loss_clip": 0.01113416, + "auxiliary_loss_mlp": 0.0103201, + "balance_loss_clip": 1.03906012, + "balance_loss_mlp": 1.02041054, + "epoch": 0.9512099804599429, + "flos": 23660988425280.0, + "grad_norm": 2.097203126309214, + "language_loss": 0.78566456, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.80711877, + "num_input_tokens_seen": 341255280, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11590576, + "step": 15821, + "time_per_iteration": 2.7114222049713135 + }, + { + "auxiliary_loss_clip": 0.01108645, + "auxiliary_loss_mlp": 0.01029545, + "balance_loss_clip": 1.03685784, + "balance_loss_mlp": 1.01819003, + "epoch": 0.9512701037126109, + "flos": 44853243596640.0, + "grad_norm": 1.697074708160104, + "language_loss": 0.71019828, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.73158014, + "num_input_tokens_seen": 341279055, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11340332, + "step": 15822, + "time_per_iteration": 2.740401029586792 + }, + { + "auxiliary_loss_clip": 0.01110075, + "auxiliary_loss_mlp": 0.01032088, + "balance_loss_clip": 1.03888154, + "balance_loss_mlp": 1.02141905, + "epoch": 0.9513302269652788, + "flos": 27623377879200.0, + "grad_norm": 1.5392981686705103, + "language_loss": 0.66442192, + "learning_rate": 2.47666999302647e-08, + "loss": 0.68584353, + "num_input_tokens_seen": 341298560, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10656738, + "step": 15823, + "time_per_iteration": 2.6538894176483154 + }, + { + "auxiliary_loss_clip": 0.01107524, + "auxiliary_loss_mlp": 0.01029283, + "balance_loss_clip": 1.03758621, + "balance_loss_mlp": 1.01852489, + "epoch": 0.9513903502179468, + "flos": 27934546654080.0, + "grad_norm": 1.7059598105677138, + "language_loss": 0.77510834, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.79647636, + "num_input_tokens_seen": 341316650, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10772705, + "step": 15824, + "time_per_iteration": 2.589099168777466 + }, + { + "auxiliary_loss_clip": 0.01112603, + "auxiliary_loss_mlp": 0.01029056, + "balance_loss_clip": 1.03676772, + "balance_loss_mlp": 1.01722431, + "epoch": 0.9514504734706147, + "flos": 34079088225600.0, + "grad_norm": 2.1159843371296847, + "language_loss": 0.73714781, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.75856447, + "num_input_tokens_seen": 341336185, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.11834717, + "step": 15825, + "time_per_iteration": 2.686131715774536 + }, + { + "auxiliary_loss_clip": 0.01026967, + "auxiliary_loss_mlp": 0.01001424, + "balance_loss_clip": 1.00464821, + "balance_loss_mlp": 1.00061882, + "epoch": 0.9515105967232828, + "flos": 82201450669920.0, + "grad_norm": 0.8178587390297147, + "language_loss": 0.5343588, + "learning_rate": 2.458373323445806e-08, + "loss": 0.55464274, + "num_input_tokens_seen": 341395795, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.00805664, + "step": 15826, + "time_per_iteration": 3.1878912448883057 + }, + { + "auxiliary_loss_clip": 0.01111053, + "auxiliary_loss_mlp": 0.01035152, + "balance_loss_clip": 1.03810453, + "balance_loss_mlp": 1.02400053, + "epoch": 0.9515707199759507, + "flos": 31538571052320.0, + "grad_norm": 2.2492961545693553, + "language_loss": 0.72409505, + "learning_rate": 2.452289414874076e-08, + "loss": 0.74555707, + "num_input_tokens_seen": 341415675, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11151123, + "step": 15827, + "time_per_iteration": 2.8792455196380615 + }, + { + "auxiliary_loss_clip": 0.01110834, + "auxiliary_loss_mlp": 0.01028456, + "balance_loss_clip": 1.0387001, + "balance_loss_mlp": 1.01735759, + "epoch": 0.9516308432286187, + "flos": 26636435612640.0, + "grad_norm": 1.8050339846900918, + "language_loss": 0.74254709, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.76393998, + "num_input_tokens_seen": 341432990, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11090088, + "step": 15828, + "time_per_iteration": 2.6418802738189697 + }, + { + "auxiliary_loss_clip": 0.01108867, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.03896761, + "balance_loss_mlp": 1.02027369, + "epoch": 0.9516909664812866, + "flos": 33275096593920.0, + "grad_norm": 1.6981427393312407, + "language_loss": 0.72973263, + "learning_rate": 2.440144071047978e-08, + "loss": 0.75112891, + "num_input_tokens_seen": 341454100, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.10491943, + "step": 15829, + "time_per_iteration": 2.684755563735962 + }, + { + "auxiliary_loss_clip": 0.011091, + "auxiliary_loss_mlp": 0.01027493, + "balance_loss_clip": 1.03671324, + "balance_loss_mlp": 1.01684785, + "epoch": 0.9517510897339546, + "flos": 25887014406720.0, + "grad_norm": 2.35930496884311, + "language_loss": 0.61919141, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.64055729, + "num_input_tokens_seen": 341472955, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10650635, + "step": 15830, + "time_per_iteration": 2.6538355350494385 + }, + { + "auxiliary_loss_clip": 0.01111812, + "auxiliary_loss_mlp": 0.01031642, + "balance_loss_clip": 1.0376184, + "balance_loss_mlp": 1.01906586, + "epoch": 0.9518112129866225, + "flos": 22859103692160.0, + "grad_norm": 10.72107630506921, + "language_loss": 0.73037267, + "learning_rate": 2.428028693179729e-08, + "loss": 0.75180721, + "num_input_tokens_seen": 341490165, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.12591553, + "step": 15831, + "time_per_iteration": 2.599641799926758 + }, + { + "auxiliary_loss_clip": 0.01105033, + "auxiliary_loss_mlp": 0.01025051, + "balance_loss_clip": 1.03550541, + "balance_loss_mlp": 1.01484108, + "epoch": 0.9518713362392905, + "flos": 20454786563040.0, + "grad_norm": 1.7864875394497561, + "language_loss": 0.65774393, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.67904478, + "num_input_tokens_seen": 341508055, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10198975, + "step": 15832, + "time_per_iteration": 2.613363027572632 + }, + { + "auxiliary_loss_clip": 0.01109329, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.04102874, + "balance_loss_mlp": 1.01865745, + "epoch": 0.9519314594919586, + "flos": 18584127358560.0, + "grad_norm": 1.823415073294859, + "language_loss": 0.77700996, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.79839659, + "num_input_tokens_seen": 341526155, + "router_z_loss_clip": 0.68310547, + "router_z_loss_mlp": 0.10675049, + "step": 15833, + "time_per_iteration": 2.600917339324951 + }, + { + "auxiliary_loss_clip": 0.01107515, + "auxiliary_loss_mlp": 0.01029107, + "balance_loss_clip": 1.03797984, + "balance_loss_mlp": 1.01844394, + "epoch": 0.9519915827446265, + "flos": 23614521455520.0, + "grad_norm": 2.17466980321928, + "language_loss": 0.74533141, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.76669765, + "num_input_tokens_seen": 341540450, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10656738, + "step": 15834, + "time_per_iteration": 2.627960443496704 + }, + { + "auxiliary_loss_clip": 0.01114714, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.03860641, + "balance_loss_mlp": 1.01894736, + "epoch": 0.9520517059972945, + "flos": 27170134054560.0, + "grad_norm": 2.2127949849395834, + "language_loss": 0.75928253, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.78073698, + "num_input_tokens_seen": 341557865, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11791992, + "step": 15835, + "time_per_iteration": 2.6450083255767822 + }, + { + "auxiliary_loss_clip": 0.01112193, + "auxiliary_loss_mlp": 0.01030786, + "balance_loss_clip": 1.03875339, + "balance_loss_mlp": 1.01921642, + "epoch": 0.9521118292499624, + "flos": 18134975779200.0, + "grad_norm": 2.3405722480070716, + "language_loss": 0.66396576, + "learning_rate": 2.397871361623238e-08, + "loss": 0.68539554, + "num_input_tokens_seen": 341573890, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11578369, + "step": 15836, + "time_per_iteration": 4.005847454071045 + }, + { + "auxiliary_loss_clip": 0.01106364, + "auxiliary_loss_mlp": 0.01025838, + "balance_loss_clip": 1.03624249, + "balance_loss_mlp": 1.01489449, + "epoch": 0.9521719525026304, + "flos": 28686155793120.0, + "grad_norm": 2.680046735515054, + "language_loss": 0.70337653, + "learning_rate": 2.391862373676057e-08, + "loss": 0.72469854, + "num_input_tokens_seen": 341593770, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.109375, + "step": 15837, + "time_per_iteration": 2.721135139465332 + }, + { + "auxiliary_loss_clip": 0.01111833, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.03772759, + "balance_loss_mlp": 1.01780903, + "epoch": 0.9522320757552983, + "flos": 24056501476320.0, + "grad_norm": 2.112110342960942, + "language_loss": 0.73479575, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.75622129, + "num_input_tokens_seen": 341612065, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.12896729, + "step": 15838, + "time_per_iteration": 2.61849045753479 + }, + { + "auxiliary_loss_clip": 0.01109081, + "auxiliary_loss_mlp": 0.01027054, + "balance_loss_clip": 1.03596401, + "balance_loss_mlp": 1.01630163, + "epoch": 0.9522921990079664, + "flos": 31629641196960.0, + "grad_norm": 1.815530738396375, + "language_loss": 0.77877617, + "learning_rate": 2.379866877970449e-08, + "loss": 0.80013746, + "num_input_tokens_seen": 341631365, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10754395, + "step": 15839, + "time_per_iteration": 2.6547629833221436 + }, + { + "auxiliary_loss_clip": 0.01111761, + "auxiliary_loss_mlp": 0.0103463, + "balance_loss_clip": 1.03896046, + "balance_loss_mlp": 1.02361536, + "epoch": 0.9523523222606343, + "flos": 23437688929920.0, + "grad_norm": 1.5684033121285248, + "language_loss": 0.80262715, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.82409108, + "num_input_tokens_seen": 341650300, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11010742, + "step": 15840, + "time_per_iteration": 2.5796892642974854 + }, + { + "auxiliary_loss_clip": 0.01102644, + "auxiliary_loss_mlp": 0.01028411, + "balance_loss_clip": 1.03467143, + "balance_loss_mlp": 1.01863575, + "epoch": 0.9524124455133023, + "flos": 25528365730080.0, + "grad_norm": 2.0265313992098344, + "language_loss": 0.72775584, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.74906635, + "num_input_tokens_seen": 341667680, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 0.09777832, + "step": 15841, + "time_per_iteration": 2.6247920989990234 + }, + { + "auxiliary_loss_clip": 0.0110302, + "auxiliary_loss_mlp": 0.01027544, + "balance_loss_clip": 1.03542697, + "balance_loss_mlp": 1.01728046, + "epoch": 0.9524725687659702, + "flos": 22992791664960.0, + "grad_norm": 1.8433347299384695, + "language_loss": 0.79073417, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.81203979, + "num_input_tokens_seen": 341685760, + "router_z_loss_clip": 0.67626953, + "router_z_loss_mlp": 0.10272217, + "step": 15842, + "time_per_iteration": 2.5700435638427734 + }, + { + "auxiliary_loss_clip": 0.01111296, + "auxiliary_loss_mlp": 0.01033107, + "balance_loss_clip": 1.03966141, + "balance_loss_mlp": 1.02173471, + "epoch": 0.9525326920186382, + "flos": 27667818984960.0, + "grad_norm": 1.6006856525089166, + "language_loss": 0.72117001, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.74261409, + "num_input_tokens_seen": 341705300, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11364746, + "step": 15843, + "time_per_iteration": 2.672125816345215 + }, + { + "auxiliary_loss_clip": 0.01111509, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.03830957, + "balance_loss_mlp": 1.01695144, + "epoch": 0.9525928152713061, + "flos": 26950643183520.0, + "grad_norm": 2.6932215618636985, + "language_loss": 0.78256917, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.8039726, + "num_input_tokens_seen": 341724565, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11895752, + "step": 15844, + "time_per_iteration": 2.634309768676758 + }, + { + "auxiliary_loss_clip": 0.01113981, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.03779149, + "balance_loss_mlp": 1.01953244, + "epoch": 0.9526529385239741, + "flos": 25263907028640.0, + "grad_norm": 1.7898445015972633, + "language_loss": 0.70229244, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.72375751, + "num_input_tokens_seen": 341743605, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12988281, + "step": 15845, + "time_per_iteration": 2.633857250213623 + }, + { + "auxiliary_loss_clip": 0.01111356, + "auxiliary_loss_mlp": 0.010349, + "balance_loss_clip": 1.0370295, + "balance_loss_mlp": 1.02379537, + "epoch": 0.9527130617766422, + "flos": 28513901720160.0, + "grad_norm": 1.5835799429407116, + "language_loss": 0.75781596, + "learning_rate": 2.338118708818282e-08, + "loss": 0.77927846, + "num_input_tokens_seen": 341763475, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11108398, + "step": 15846, + "time_per_iteration": 2.6365907192230225 + }, + { + "auxiliary_loss_clip": 0.01110216, + "auxiliary_loss_mlp": 0.01024211, + "balance_loss_clip": 1.03725374, + "balance_loss_mlp": 1.01344037, + "epoch": 0.9527731850293101, + "flos": 23166869015520.0, + "grad_norm": 1.8768642299377587, + "language_loss": 0.77886611, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.80021036, + "num_input_tokens_seen": 341781265, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.10772705, + "step": 15847, + "time_per_iteration": 4.133933782577515 + }, + { + "auxiliary_loss_clip": 0.01107368, + "auxiliary_loss_mlp": 0.01031874, + "balance_loss_clip": 1.03695428, + "balance_loss_mlp": 1.02150929, + "epoch": 0.9528333082819781, + "flos": 23571782075520.0, + "grad_norm": 1.8281924557483067, + "language_loss": 0.77996957, + "learning_rate": 2.326258115328672e-08, + "loss": 0.80136192, + "num_input_tokens_seen": 341798825, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.10357666, + "step": 15848, + "time_per_iteration": 3.9056859016418457 + }, + { + "auxiliary_loss_clip": 0.01115338, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.04001379, + "balance_loss_mlp": 1.02515435, + "epoch": 0.952893431534646, + "flos": 29225729240640.0, + "grad_norm": 1.6793717897997786, + "language_loss": 0.72006166, + "learning_rate": 2.320339062183674e-08, + "loss": 0.74158549, + "num_input_tokens_seen": 341819480, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.11895752, + "step": 15849, + "time_per_iteration": 2.733614683151245 + }, + { + "auxiliary_loss_clip": 0.01118059, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.04113269, + "balance_loss_mlp": 1.02055502, + "epoch": 0.952953554787314, + "flos": 25661729564640.0, + "grad_norm": 1.6958287948940034, + "language_loss": 0.75365162, + "learning_rate": 2.314427505071226e-08, + "loss": 0.7751559, + "num_input_tokens_seen": 341838035, + "router_z_loss_clip": 0.76855469, + "router_z_loss_mlp": 0.11798096, + "step": 15850, + "time_per_iteration": 2.6226255893707275 + }, + { + "auxiliary_loss_clip": 0.01107651, + "auxiliary_loss_mlp": 0.01029128, + "balance_loss_clip": 1.03552651, + "balance_loss_mlp": 1.01857233, + "epoch": 0.9530136780399819, + "flos": 27311722896960.0, + "grad_norm": 2.09168660567295, + "language_loss": 0.72303486, + "learning_rate": 2.308523444215482e-08, + "loss": 0.74440265, + "num_input_tokens_seen": 341855895, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10559082, + "step": 15851, + "time_per_iteration": 2.690512180328369 + }, + { + "auxiliary_loss_clip": 0.01106965, + "auxiliary_loss_mlp": 0.010261, + "balance_loss_clip": 1.03640294, + "balance_loss_mlp": 1.01532936, + "epoch": 0.95307380129265, + "flos": 27039687464160.0, + "grad_norm": 1.8166115408468733, + "language_loss": 0.79984385, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.8211745, + "num_input_tokens_seen": 341875240, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10772705, + "step": 15852, + "time_per_iteration": 2.642711639404297 + }, + { + "auxiliary_loss_clip": 0.01110131, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.03705513, + "balance_loss_mlp": 1.01838923, + "epoch": 0.9531339245453179, + "flos": 53718011593920.0, + "grad_norm": 1.66601538329487, + "language_loss": 0.59822381, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.61962187, + "num_input_tokens_seen": 341901020, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11291504, + "step": 15853, + "time_per_iteration": 2.926989793777466 + }, + { + "auxiliary_loss_clip": 0.01105852, + "auxiliary_loss_mlp": 0.01027745, + "balance_loss_clip": 1.03667903, + "balance_loss_mlp": 1.01713586, + "epoch": 0.9531940477979859, + "flos": 24729398241120.0, + "grad_norm": 1.913200513675894, + "language_loss": 0.72685796, + "learning_rate": 2.290856241425998e-08, + "loss": 0.74819392, + "num_input_tokens_seen": 341919365, + "router_z_loss_clip": 0.69189453, + "router_z_loss_mlp": 0.1060791, + "step": 15854, + "time_per_iteration": 2.5806362628936768 + }, + { + "auxiliary_loss_clip": 0.01108878, + "auxiliary_loss_mlp": 0.01026386, + "balance_loss_clip": 1.0357635, + "balance_loss_mlp": 1.01563907, + "epoch": 0.9532541710506538, + "flos": 30915261087840.0, + "grad_norm": 3.2606033497055535, + "language_loss": 0.67925251, + "learning_rate": 2.284982167833127e-08, + "loss": 0.70060509, + "num_input_tokens_seen": 341939985, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.10742188, + "step": 15855, + "time_per_iteration": 2.674290895462036 + }, + { + "auxiliary_loss_clip": 0.01109357, + "auxiliary_loss_mlp": 0.01028527, + "balance_loss_clip": 1.03694856, + "balance_loss_mlp": 1.01777411, + "epoch": 0.9533142943033218, + "flos": 32298283647360.0, + "grad_norm": 1.691761715360391, + "language_loss": 0.76735348, + "learning_rate": 2.279115591613556e-08, + "loss": 0.78873229, + "num_input_tokens_seen": 341959255, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10748291, + "step": 15856, + "time_per_iteration": 2.6507372856140137 + }, + { + "auxiliary_loss_clip": 0.01106403, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.03537261, + "balance_loss_mlp": 1.01994061, + "epoch": 0.9533744175559897, + "flos": 28866715908480.0, + "grad_norm": 1.9657093369292795, + "language_loss": 0.77931464, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.8006804, + "num_input_tokens_seen": 341977205, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10235596, + "step": 15857, + "time_per_iteration": 2.675657272338867 + }, + { + "auxiliary_loss_clip": 0.01026927, + "auxiliary_loss_mlp": 0.01001532, + "balance_loss_clip": 1.00462461, + "balance_loss_mlp": 1.0006485, + "epoch": 0.9534345408086577, + "flos": 74496000564000.0, + "grad_norm": 0.7130758444275502, + "language_loss": 0.6260348, + "learning_rate": 2.267404932183803e-08, + "loss": 0.64631939, + "num_input_tokens_seen": 342038545, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00883484, + "step": 15858, + "time_per_iteration": 3.2045888900756836 + }, + { + "auxiliary_loss_clip": 0.01108949, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.0383532, + "balance_loss_mlp": 1.01690626, + "epoch": 0.9534946640613258, + "flos": 23124858946560.0, + "grad_norm": 1.5473631195541828, + "language_loss": 0.56686866, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.58823204, + "num_input_tokens_seen": 342058195, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.1048584, + "step": 15859, + "time_per_iteration": 3.9955999851226807 + }, + { + "auxiliary_loss_clip": 0.01107182, + "auxiliary_loss_mlp": 0.01025556, + "balance_loss_clip": 1.03733766, + "balance_loss_mlp": 1.01534021, + "epoch": 0.9535547873139937, + "flos": 20321301176640.0, + "grad_norm": 23.75564120122094, + "language_loss": 0.81920737, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.84053469, + "num_input_tokens_seen": 342075025, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.10205078, + "step": 15860, + "time_per_iteration": 2.638157367706299 + }, + { + "auxiliary_loss_clip": 0.01108439, + "auxiliary_loss_mlp": 0.01026467, + "balance_loss_clip": 1.03597701, + "balance_loss_mlp": 1.01611996, + "epoch": 0.9536149105666617, + "flos": 25218858163680.0, + "grad_norm": 1.812913258763936, + "language_loss": 0.66719735, + "learning_rate": 2.249895178891159e-08, + "loss": 0.68854642, + "num_input_tokens_seen": 342094595, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.10345459, + "step": 15861, + "time_per_iteration": 2.6080517768859863 + }, + { + "auxiliary_loss_clip": 0.01111002, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.03825712, + "balance_loss_mlp": 1.02209115, + "epoch": 0.9536750338193296, + "flos": 37461636406080.0, + "grad_norm": 1.9408128318124174, + "language_loss": 0.65466821, + "learning_rate": 2.244073591573037e-08, + "loss": 0.67611426, + "num_input_tokens_seen": 342115970, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11523438, + "step": 15862, + "time_per_iteration": 2.6918118000030518 + }, + { + "auxiliary_loss_clip": 0.01106666, + "auxiliary_loss_mlp": 0.01027149, + "balance_loss_clip": 1.03792799, + "balance_loss_mlp": 1.01680779, + "epoch": 0.9537351570719976, + "flos": 24899423863680.0, + "grad_norm": 1.6208725863411815, + "language_loss": 0.67816216, + "learning_rate": 2.238259503179485e-08, + "loss": 0.69950032, + "num_input_tokens_seen": 342134080, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.10339355, + "step": 15863, + "time_per_iteration": 2.678640842437744 + }, + { + "auxiliary_loss_clip": 0.01109239, + "auxiliary_loss_mlp": 0.01027287, + "balance_loss_clip": 1.03678083, + "balance_loss_mlp": 1.01640368, + "epoch": 0.9537952803246655, + "flos": 36520958522880.0, + "grad_norm": 4.414778438510839, + "language_loss": 0.78410196, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.80546725, + "num_input_tokens_seen": 342154725, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10888672, + "step": 15864, + "time_per_iteration": 2.730821371078491 + }, + { + "auxiliary_loss_clip": 0.01109178, + "auxiliary_loss_mlp": 0.01026987, + "balance_loss_clip": 1.03830945, + "balance_loss_mlp": 1.01612687, + "epoch": 0.9538554035773336, + "flos": 25040283395040.0, + "grad_norm": 2.0817541650862705, + "language_loss": 0.59717858, + "learning_rate": 2.226653824047586e-08, + "loss": 0.61854029, + "num_input_tokens_seen": 342172275, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10858154, + "step": 15865, + "time_per_iteration": 2.629143714904785 + }, + { + "auxiliary_loss_clip": 0.01110836, + "auxiliary_loss_mlp": 0.01029757, + "balance_loss_clip": 1.03763294, + "balance_loss_mlp": 1.01855147, + "epoch": 0.9539155268300015, + "flos": 22459417361280.0, + "grad_norm": 2.029319657934533, + "language_loss": 0.70807898, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.72948486, + "num_input_tokens_seen": 342190880, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11199951, + "step": 15866, + "time_per_iteration": 2.626905679702759 + }, + { + "auxiliary_loss_clip": 0.01109758, + "auxiliary_loss_mlp": 0.01028803, + "balance_loss_clip": 1.03699112, + "balance_loss_mlp": 1.01748407, + "epoch": 0.9539756500826695, + "flos": 31985939871360.0, + "grad_norm": 2.2464906768504043, + "language_loss": 0.85028571, + "learning_rate": 2.215078143255855e-08, + "loss": 0.87167138, + "num_input_tokens_seen": 342208165, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11315918, + "step": 15867, + "time_per_iteration": 2.6589107513427734 + }, + { + "auxiliary_loss_clip": 0.01027054, + "auxiliary_loss_mlp": 0.0100055, + "balance_loss_clip": 1.00469661, + "balance_loss_mlp": 0.99971938, + "epoch": 0.9540357733353374, + "flos": 83326011085440.0, + "grad_norm": 0.779315456846039, + "language_loss": 0.61812633, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.6384024, + "num_input_tokens_seen": 342277110, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00831223, + "step": 15868, + "time_per_iteration": 3.2970900535583496 + }, + { + "auxiliary_loss_clip": 0.01109729, + "auxiliary_loss_mlp": 0.01024168, + "balance_loss_clip": 1.03752005, + "balance_loss_mlp": 1.01298666, + "epoch": 0.9540958965880054, + "flos": 25976018170080.0, + "grad_norm": 1.9429139958663806, + "language_loss": 0.60579765, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.62713659, + "num_input_tokens_seen": 342294695, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11181641, + "step": 15869, + "time_per_iteration": 2.65285587310791 + }, + { + "auxiliary_loss_clip": 0.01109263, + "auxiliary_loss_mlp": 0.01027297, + "balance_loss_clip": 1.03806663, + "balance_loss_mlp": 1.0176053, + "epoch": 0.9541560198406733, + "flos": 24101023616640.0, + "grad_norm": 1.9680860717299813, + "language_loss": 0.70762908, + "learning_rate": 2.197770872795579e-08, + "loss": 0.72899467, + "num_input_tokens_seen": 342314970, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.09692383, + "step": 15870, + "time_per_iteration": 2.6328539848327637 + }, + { + "auxiliary_loss_clip": 0.01105872, + "auxiliary_loss_mlp": 0.01027699, + "balance_loss_clip": 1.03514791, + "balance_loss_mlp": 1.01654673, + "epoch": 0.9542161430933414, + "flos": 30158546771520.0, + "grad_norm": 2.2035704142756116, + "language_loss": 0.76654482, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.78788054, + "num_input_tokens_seen": 342334255, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11151123, + "step": 15871, + "time_per_iteration": 2.6796274185180664 + }, + { + "auxiliary_loss_clip": 0.01111436, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.03823209, + "balance_loss_mlp": 1.01855683, + "epoch": 0.9542762663460094, + "flos": 37907384533920.0, + "grad_norm": 1.7933251403601607, + "language_loss": 0.58278269, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.60419798, + "num_input_tokens_seen": 342354730, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11529541, + "step": 15872, + "time_per_iteration": 2.6705751419067383 + }, + { + "auxiliary_loss_clip": 0.01112112, + "auxiliary_loss_mlp": 0.01028643, + "balance_loss_clip": 1.0375185, + "balance_loss_mlp": 1.01680529, + "epoch": 0.9543363895986773, + "flos": 25351857342720.0, + "grad_norm": 1.5765285261145592, + "language_loss": 0.75042897, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.77183652, + "num_input_tokens_seen": 342374565, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.1184082, + "step": 15873, + "time_per_iteration": 2.631169319152832 + }, + { + "auxiliary_loss_clip": 0.01111223, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.03822565, + "balance_loss_mlp": 1.02002037, + "epoch": 0.9543965128513453, + "flos": 29849727998880.0, + "grad_norm": 1.944953899744343, + "language_loss": 0.62292671, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.64435512, + "num_input_tokens_seen": 342394590, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.1159668, + "step": 15874, + "time_per_iteration": 2.640195608139038 + }, + { + "auxiliary_loss_clip": 0.0110673, + "auxiliary_loss_mlp": 0.01033951, + "balance_loss_clip": 1.0365274, + "balance_loss_mlp": 1.02281725, + "epoch": 0.9544566361040132, + "flos": 18622328803200.0, + "grad_norm": 2.162566976837643, + "language_loss": 0.89585543, + "learning_rate": 2.169075438538104e-08, + "loss": 0.9172622, + "num_input_tokens_seen": 342410445, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11132812, + "step": 15875, + "time_per_iteration": 3.9948010444641113 + }, + { + "auxiliary_loss_clip": 0.01113396, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.03782976, + "balance_loss_mlp": 1.01973546, + "epoch": 0.9545167593566812, + "flos": 31626804987360.0, + "grad_norm": 1.7205154917594692, + "language_loss": 0.6763404, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.69778913, + "num_input_tokens_seen": 342430970, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.11737061, + "step": 15876, + "time_per_iteration": 2.6545732021331787 + }, + { + "auxiliary_loss_clip": 0.01111452, + "auxiliary_loss_mlp": 0.01028121, + "balance_loss_clip": 1.03780699, + "balance_loss_mlp": 1.01610446, + "epoch": 0.9545768826093491, + "flos": 31272086486880.0, + "grad_norm": 2.711282264599967, + "language_loss": 0.69691968, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.71831536, + "num_input_tokens_seen": 342449505, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12017822, + "step": 15877, + "time_per_iteration": 2.640376329421997 + }, + { + "auxiliary_loss_clip": 0.01112057, + "auxiliary_loss_mlp": 0.01027694, + "balance_loss_clip": 1.03862274, + "balance_loss_mlp": 1.01644087, + "epoch": 0.9546370058620172, + "flos": 27444357420480.0, + "grad_norm": 3.2851362779027786, + "language_loss": 0.70872653, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.73012412, + "num_input_tokens_seen": 342470390, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11248779, + "step": 15878, + "time_per_iteration": 2.6821813583374023 + }, + { + "auxiliary_loss_clip": 0.0110612, + "auxiliary_loss_mlp": 0.01028499, + "balance_loss_clip": 1.03604102, + "balance_loss_mlp": 1.01717448, + "epoch": 0.9546971291146851, + "flos": 30028829492160.0, + "grad_norm": 1.3647138887095032, + "language_loss": 0.68100232, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.70234847, + "num_input_tokens_seen": 342492560, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11328125, + "step": 15879, + "time_per_iteration": 2.7114784717559814 + }, + { + "auxiliary_loss_clip": 0.01107243, + "auxiliary_loss_mlp": 0.01029192, + "balance_loss_clip": 1.03702736, + "balance_loss_mlp": 1.01846349, + "epoch": 0.9547572523673531, + "flos": 34968882755520.0, + "grad_norm": 1.9257408768818605, + "language_loss": 0.84758806, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.86895245, + "num_input_tokens_seen": 342512315, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10723877, + "step": 15880, + "time_per_iteration": 2.727041721343994 + }, + { + "auxiliary_loss_clip": 0.01108272, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.03592658, + "balance_loss_mlp": 1.02027857, + "epoch": 0.954817375620021, + "flos": 41246990748000.0, + "grad_norm": 1.8847343095253009, + "language_loss": 0.71926999, + "learning_rate": 2.134888478151753e-08, + "loss": 0.74066925, + "num_input_tokens_seen": 342533060, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11364746, + "step": 15881, + "time_per_iteration": 2.750095844268799 + }, + { + "auxiliary_loss_clip": 0.01108077, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.0376668, + "balance_loss_mlp": 1.01943755, + "epoch": 0.954877498872689, + "flos": 17604923892480.0, + "grad_norm": 2.3669582538159593, + "language_loss": 0.71143913, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.73282361, + "num_input_tokens_seen": 342550830, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.10931396, + "step": 15882, + "time_per_iteration": 2.6431009769439697 + }, + { + "auxiliary_loss_clip": 0.01109185, + "auxiliary_loss_mlp": 0.0103078, + "balance_loss_clip": 1.03776824, + "balance_loss_mlp": 1.01990247, + "epoch": 0.9549376221253569, + "flos": 72323606760480.0, + "grad_norm": 1.7104215045171574, + "language_loss": 0.65284336, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.67424297, + "num_input_tokens_seen": 342575070, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10888672, + "step": 15883, + "time_per_iteration": 2.9690451622009277 + }, + { + "auxiliary_loss_clip": 0.01114311, + "auxiliary_loss_mlp": 0.01029202, + "balance_loss_clip": 1.04061687, + "balance_loss_mlp": 1.01744175, + "epoch": 0.954997745378025, + "flos": 21078015492960.0, + "grad_norm": 2.179291382680772, + "language_loss": 0.78049314, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.80192828, + "num_input_tokens_seen": 342592215, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11773682, + "step": 15884, + "time_per_iteration": 2.566725254058838 + }, + { + "auxiliary_loss_clip": 0.01111252, + "auxiliary_loss_mlp": 0.01027789, + "balance_loss_clip": 1.03673458, + "balance_loss_mlp": 1.01642799, + "epoch": 0.955057868630693, + "flos": 15869775938400.0, + "grad_norm": 1.96152623182243, + "language_loss": 0.77449644, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.79588687, + "num_input_tokens_seen": 342610030, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11352539, + "step": 15885, + "time_per_iteration": 2.5772976875305176 + }, + { + "auxiliary_loss_clip": 0.0111063, + "auxiliary_loss_mlp": 0.01031816, + "balance_loss_clip": 1.03679943, + "balance_loss_mlp": 1.02084887, + "epoch": 0.9551179918833609, + "flos": 27622729602720.0, + "grad_norm": 1.8508498876710546, + "language_loss": 0.69914711, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.72057152, + "num_input_tokens_seen": 342626475, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.10955811, + "step": 15886, + "time_per_iteration": 4.161782741546631 + }, + { + "auxiliary_loss_clip": 0.01113696, + "auxiliary_loss_mlp": 0.01032029, + "balance_loss_clip": 1.03881407, + "balance_loss_mlp": 1.01953614, + "epoch": 0.9551781151360289, + "flos": 26286700737600.0, + "grad_norm": 1.6348104702231445, + "language_loss": 0.72345275, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.74491, + "num_input_tokens_seen": 342646645, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.125, + "step": 15887, + "time_per_iteration": 2.629117965698242 + }, + { + "auxiliary_loss_clip": 0.01105263, + "auxiliary_loss_mlp": 0.01027742, + "balance_loss_clip": 1.03590763, + "balance_loss_mlp": 1.01720369, + "epoch": 0.9552382383886968, + "flos": 25261111336320.0, + "grad_norm": 2.105398881183544, + "language_loss": 0.57173002, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.59306008, + "num_input_tokens_seen": 342663615, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.10540771, + "step": 15888, + "time_per_iteration": 3.938817262649536 + }, + { + "auxiliary_loss_clip": 0.0102696, + "auxiliary_loss_mlp": 0.01001318, + "balance_loss_clip": 1.00462651, + "balance_loss_mlp": 1.00040293, + "epoch": 0.9552983616413648, + "flos": 82693098525600.0, + "grad_norm": 0.7044046570430773, + "language_loss": 0.57801658, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.59829938, + "num_input_tokens_seen": 342728275, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00914001, + "step": 15889, + "time_per_iteration": 3.2627735137939453 + }, + { + "auxiliary_loss_clip": 0.01109697, + "auxiliary_loss_mlp": 0.0102888, + "balance_loss_clip": 1.03548634, + "balance_loss_mlp": 1.01723278, + "epoch": 0.9553584848940327, + "flos": 26331992706240.0, + "grad_norm": 2.4198814532283426, + "language_loss": 0.66878849, + "learning_rate": 2.084114508877466e-08, + "loss": 0.69017428, + "num_input_tokens_seen": 342748860, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11639404, + "step": 15890, + "time_per_iteration": 2.6774799823760986 + }, + { + "auxiliary_loss_clip": 0.0111081, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.03898907, + "balance_loss_mlp": 1.01993048, + "epoch": 0.9554186081467008, + "flos": 29539004914080.0, + "grad_norm": 1.7739908618428972, + "language_loss": 0.73982626, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.76124334, + "num_input_tokens_seen": 342769705, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.10955811, + "step": 15891, + "time_per_iteration": 2.69145131111145 + }, + { + "auxiliary_loss_clip": 0.01105231, + "auxiliary_loss_mlp": 0.01030957, + "balance_loss_clip": 1.03635049, + "balance_loss_mlp": 1.02068162, + "epoch": 0.9554787313993687, + "flos": 19829977459200.0, + "grad_norm": 1.8484086126614674, + "language_loss": 0.78225577, + "learning_rate": 2.072913954011435e-08, + "loss": 0.80361766, + "num_input_tokens_seen": 342787000, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10266113, + "step": 15892, + "time_per_iteration": 2.6719295978546143 + }, + { + "auxiliary_loss_clip": 0.01109534, + "auxiliary_loss_mlp": 0.01029774, + "balance_loss_clip": 1.03837585, + "balance_loss_mlp": 1.01787114, + "epoch": 0.9555388546520367, + "flos": 28557978170400.0, + "grad_norm": 1.6170965395377241, + "language_loss": 0.70042038, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.72181344, + "num_input_tokens_seen": 342807795, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.11895752, + "step": 15893, + "time_per_iteration": 2.659662961959839 + }, + { + "auxiliary_loss_clip": 0.01111166, + "auxiliary_loss_mlp": 0.01030664, + "balance_loss_clip": 1.04032671, + "balance_loss_mlp": 1.01811135, + "epoch": 0.9555989779047046, + "flos": 18051239262240.0, + "grad_norm": 2.4273382366270773, + "language_loss": 0.66457009, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.68598843, + "num_input_tokens_seen": 342825490, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.12542725, + "step": 15894, + "time_per_iteration": 2.6303763389587402 + }, + { + "auxiliary_loss_clip": 0.01110544, + "auxiliary_loss_mlp": 0.01029427, + "balance_loss_clip": 1.03758788, + "balance_loss_mlp": 1.01820314, + "epoch": 0.9556591011573726, + "flos": 27133431749280.0, + "grad_norm": 3.9528794638981175, + "language_loss": 0.81604981, + "learning_rate": 2.056169412853581e-08, + "loss": 0.83744955, + "num_input_tokens_seen": 342844965, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11236572, + "step": 15895, + "time_per_iteration": 2.660247564315796 + }, + { + "auxiliary_loss_clip": 0.01110821, + "auxiliary_loss_mlp": 0.01031099, + "balance_loss_clip": 1.03838205, + "balance_loss_mlp": 1.01936841, + "epoch": 0.9557192244100405, + "flos": 33989638772160.0, + "grad_norm": 1.5791954730880136, + "language_loss": 0.72325432, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.74467349, + "num_input_tokens_seen": 342865915, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11730957, + "step": 15896, + "time_per_iteration": 2.676097869873047 + }, + { + "auxiliary_loss_clip": 0.01106749, + "auxiliary_loss_mlp": 0.01031241, + "balance_loss_clip": 1.03630137, + "balance_loss_mlp": 1.02030945, + "epoch": 0.9557793476627086, + "flos": 21478755273120.0, + "grad_norm": 1.8337211478006188, + "language_loss": 0.79299968, + "learning_rate": 2.045043915311706e-08, + "loss": 0.81437957, + "num_input_tokens_seen": 342884000, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10931396, + "step": 15897, + "time_per_iteration": 2.613365888595581 + }, + { + "auxiliary_loss_clip": 0.01107237, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.03568852, + "balance_loss_mlp": 1.0189147, + "epoch": 0.9558394709153766, + "flos": 29133443577600.0, + "grad_norm": 1.9399586961406392, + "language_loss": 0.7269901, + "learning_rate": 2.03949242614303e-08, + "loss": 0.74836868, + "num_input_tokens_seen": 342903095, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11706543, + "step": 15898, + "time_per_iteration": 2.613130807876587 + }, + { + "auxiliary_loss_clip": 0.01027032, + "auxiliary_loss_mlp": 0.01001713, + "balance_loss_clip": 1.00468946, + "balance_loss_mlp": 1.00080967, + "epoch": 0.9558995941680445, + "flos": 65502487702080.0, + "grad_norm": 0.8418896016816955, + "language_loss": 0.52288771, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54317516, + "num_input_tokens_seen": 342958155, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00901794, + "step": 15899, + "time_per_iteration": 4.4465086460113525 + }, + { + "auxiliary_loss_clip": 0.01113836, + "auxiliary_loss_mlp": 0.01032087, + "balance_loss_clip": 1.0384537, + "balance_loss_mlp": 1.01990938, + "epoch": 0.9559597174207125, + "flos": 16795624497120.0, + "grad_norm": 2.3515957709483923, + "language_loss": 0.68578309, + "learning_rate": 2.028411968062782e-08, + "loss": 0.70724231, + "num_input_tokens_seen": 342972500, + "router_z_loss_clip": 0.75244141, + "router_z_loss_mlp": 0.12176514, + "step": 15900, + "time_per_iteration": 2.591606616973877 + }, + { + "auxiliary_loss_clip": 0.01110484, + "auxiliary_loss_mlp": 0.01026442, + "balance_loss_clip": 1.03730273, + "balance_loss_mlp": 1.01505208, + "epoch": 0.9560198406733804, + "flos": 24326470527840.0, + "grad_norm": 3.613752112570943, + "language_loss": 0.83243501, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.85380423, + "num_input_tokens_seen": 342989035, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.1138916, + "step": 15901, + "time_per_iteration": 2.610659122467041 + }, + { + "auxiliary_loss_clip": 0.01027028, + "auxiliary_loss_mlp": 0.01000883, + "balance_loss_clip": 1.00469244, + "balance_loss_mlp": 1.00000083, + "epoch": 0.9560799639260484, + "flos": 69905114933760.0, + "grad_norm": 0.7289202742861057, + "language_loss": 0.54304588, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56332499, + "num_input_tokens_seen": 343051675, + "router_z_loss_clip": 0.22338867, + "router_z_loss_mlp": 0.00882721, + "step": 15902, + "time_per_iteration": 3.3622076511383057 + }, + { + "auxiliary_loss_clip": 0.01102985, + "auxiliary_loss_mlp": 0.01025876, + "balance_loss_clip": 1.03603196, + "balance_loss_mlp": 1.0165894, + "epoch": 0.9561400871787163, + "flos": 23082565256640.0, + "grad_norm": 1.7364327941795055, + "language_loss": 0.85479748, + "learning_rate": 2.01184758473425e-08, + "loss": 0.87608612, + "num_input_tokens_seen": 343068895, + "router_z_loss_clip": 0.66992188, + "router_z_loss_mlp": 0.09283447, + "step": 15903, + "time_per_iteration": 2.6257786750793457 + }, + { + "auxiliary_loss_clip": 0.01107953, + "auxiliary_loss_mlp": 0.01023326, + "balance_loss_clip": 1.03684115, + "balance_loss_mlp": 1.01315129, + "epoch": 0.9562002104313844, + "flos": 22010711472000.0, + "grad_norm": 1.9733632599671478, + "language_loss": 0.80718291, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.82849562, + "num_input_tokens_seen": 343087115, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10174561, + "step": 15904, + "time_per_iteration": 2.701134443283081 + }, + { + "auxiliary_loss_clip": 0.01111368, + "auxiliary_loss_mlp": 0.01028405, + "balance_loss_clip": 1.03800583, + "balance_loss_mlp": 1.01708055, + "epoch": 0.9562603336840523, + "flos": 30160856256480.0, + "grad_norm": 11.477106110571805, + "language_loss": 0.60399389, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.62539166, + "num_input_tokens_seen": 343105575, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11328125, + "step": 15905, + "time_per_iteration": 2.632614850997925 + }, + { + "auxiliary_loss_clip": 0.01107551, + "auxiliary_loss_mlp": 0.01030397, + "balance_loss_clip": 1.03603792, + "balance_loss_mlp": 1.01968598, + "epoch": 0.9563204569367203, + "flos": 25840628471520.0, + "grad_norm": 1.9422724391627169, + "language_loss": 0.70790488, + "learning_rate": 1.995350770979254e-08, + "loss": 0.72928441, + "num_input_tokens_seen": 343123025, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.1071167, + "step": 15906, + "time_per_iteration": 2.6139650344848633 + }, + { + "auxiliary_loss_clip": 0.01112547, + "auxiliary_loss_mlp": 0.01028826, + "balance_loss_clip": 1.03908122, + "balance_loss_mlp": 1.01717949, + "epoch": 0.9563805801893882, + "flos": 24684349376160.0, + "grad_norm": 7.908230282798161, + "language_loss": 0.70852029, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.72993404, + "num_input_tokens_seen": 343141625, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11657715, + "step": 15907, + "time_per_iteration": 2.603646755218506 + }, + { + "auxiliary_loss_clip": 0.01107843, + "auxiliary_loss_mlp": 0.01028229, + "balance_loss_clip": 1.03731048, + "balance_loss_mlp": 1.01699376, + "epoch": 0.9564407034420562, + "flos": 31006898474400.0, + "grad_norm": 2.013986667490908, + "language_loss": 0.70276272, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.72412348, + "num_input_tokens_seen": 343161300, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11236572, + "step": 15908, + "time_per_iteration": 2.649683952331543 + }, + { + "auxiliary_loss_clip": 0.01109726, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.03928506, + "balance_loss_mlp": 1.01841569, + "epoch": 0.9565008266947241, + "flos": 22724807960160.0, + "grad_norm": 2.1339502812670315, + "language_loss": 0.83161485, + "learning_rate": 1.978921532427802e-08, + "loss": 0.8530035, + "num_input_tokens_seen": 343177815, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.1071167, + "step": 15909, + "time_per_iteration": 2.6398138999938965 + }, + { + "auxiliary_loss_clip": 0.01108055, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.03632939, + "balance_loss_mlp": 1.01803136, + "epoch": 0.9565609499473922, + "flos": 30337283609280.0, + "grad_norm": 1.76801038257312, + "language_loss": 0.67647707, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.69784892, + "num_input_tokens_seen": 343198140, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11108398, + "step": 15910, + "time_per_iteration": 2.690441608428955 + }, + { + "auxiliary_loss_clip": 0.01113321, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.0389266, + "balance_loss_mlp": 1.02040112, + "epoch": 0.9566210732000601, + "flos": 26598639340800.0, + "grad_norm": 1.94141710538669, + "language_loss": 0.74916148, + "learning_rate": 1.968006251276444e-08, + "loss": 0.77060872, + "num_input_tokens_seen": 343218280, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11016846, + "step": 15911, + "time_per_iteration": 2.638012409210205 + }, + { + "auxiliary_loss_clip": 0.01108538, + "auxiliary_loss_mlp": 0.0102734, + "balance_loss_clip": 1.03616309, + "balance_loss_mlp": 1.01658094, + "epoch": 0.9566811964527281, + "flos": 22814703103680.0, + "grad_norm": 2.068565092803164, + "language_loss": 0.69463134, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.71599007, + "num_input_tokens_seen": 343236850, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10760498, + "step": 15912, + "time_per_iteration": 2.582610607147217 + }, + { + "auxiliary_loss_clip": 0.01110479, + "auxiliary_loss_mlp": 0.01037144, + "balance_loss_clip": 1.03805733, + "balance_loss_mlp": 1.02564049, + "epoch": 0.9567413197053961, + "flos": 15863293173600.0, + "grad_norm": 2.8995099968866973, + "language_loss": 0.72426766, + "learning_rate": 1.95712100769696e-08, + "loss": 0.74574387, + "num_input_tokens_seen": 343253065, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11499023, + "step": 15913, + "time_per_iteration": 2.6333870887756348 + }, + { + "auxiliary_loss_clip": 0.01108844, + "auxiliary_loss_mlp": 0.01024614, + "balance_loss_clip": 1.03795528, + "balance_loss_mlp": 1.01431453, + "epoch": 0.956801442958064, + "flos": 24061404067200.0, + "grad_norm": 1.9828227730200845, + "language_loss": 0.73347384, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.75480843, + "num_input_tokens_seen": 343270330, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10308838, + "step": 15914, + "time_per_iteration": 4.098360061645508 + }, + { + "auxiliary_loss_clip": 0.01109345, + "auxiliary_loss_mlp": 0.01025764, + "balance_loss_clip": 1.03798628, + "balance_loss_mlp": 1.01492155, + "epoch": 0.956861566210732, + "flos": 22235388554880.0, + "grad_norm": 1.5223630260084757, + "language_loss": 0.67217708, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.69352818, + "num_input_tokens_seen": 343289625, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10839844, + "step": 15915, + "time_per_iteration": 2.6278059482574463 + }, + { + "auxiliary_loss_clip": 0.01106558, + "auxiliary_loss_mlp": 0.01025207, + "balance_loss_clip": 1.03644061, + "balance_loss_mlp": 1.01419175, + "epoch": 0.9569216894634, + "flos": 27085182019200.0, + "grad_norm": 2.098095635387582, + "language_loss": 0.64327359, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.66459119, + "num_input_tokens_seen": 343309200, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11016846, + "step": 15916, + "time_per_iteration": 2.615257740020752 + }, + { + "auxiliary_loss_clip": 0.0110256, + "auxiliary_loss_mlp": 0.01025187, + "balance_loss_clip": 1.03621936, + "balance_loss_mlp": 1.01495886, + "epoch": 0.956981812716068, + "flos": 26465397058080.0, + "grad_norm": 1.972721992383768, + "language_loss": 0.80780065, + "learning_rate": 1.935440639853536e-08, + "loss": 0.82907814, + "num_input_tokens_seen": 343326270, + "router_z_loss_clip": 0.66357422, + "router_z_loss_mlp": 0.10229492, + "step": 15917, + "time_per_iteration": 2.6522388458251953 + }, + { + "auxiliary_loss_clip": 0.01108644, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.03777766, + "balance_loss_mlp": 1.02149677, + "epoch": 0.9570419359687359, + "flos": 17071225450560.0, + "grad_norm": 2.6607791151625406, + "language_loss": 0.726264, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.74767458, + "num_input_tokens_seen": 343344430, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10906982, + "step": 15918, + "time_per_iteration": 2.630056619644165 + }, + { + "auxiliary_loss_clip": 0.01026958, + "auxiliary_loss_mlp": 0.01001738, + "balance_loss_clip": 1.004668, + "balance_loss_mlp": 1.00086915, + "epoch": 0.9571020592214039, + "flos": 79551880030080.0, + "grad_norm": 0.6267831874449689, + "language_loss": 0.53092104, + "learning_rate": 1.924645518878032e-08, + "loss": 0.55120802, + "num_input_tokens_seen": 343416155, + "router_z_loss_clip": 0.22277832, + "router_z_loss_mlp": 0.00868988, + "step": 15919, + "time_per_iteration": 3.3620221614837646 + }, + { + "auxiliary_loss_clip": 0.01117038, + "auxiliary_loss_mlp": 0.01035974, + "balance_loss_clip": 1.04173076, + "balance_loss_mlp": 1.02370751, + "epoch": 0.9571621824740718, + "flos": 21211825017600.0, + "grad_norm": 2.9329002489466913, + "language_loss": 0.75241256, + "learning_rate": 1.919259224843972e-08, + "loss": 0.77394271, + "num_input_tokens_seen": 343431715, + "router_z_loss_clip": 0.75292969, + "router_z_loss_mlp": 0.12261963, + "step": 15920, + "time_per_iteration": 2.6068875789642334 + }, + { + "auxiliary_loss_clip": 0.01113998, + "auxiliary_loss_mlp": 0.01031404, + "balance_loss_clip": 1.03970981, + "balance_loss_mlp": 1.01960814, + "epoch": 0.9572223057267398, + "flos": 17739057555360.0, + "grad_norm": 2.120355490794248, + "language_loss": 0.79050767, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.81196165, + "num_input_tokens_seen": 343450425, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11810303, + "step": 15921, + "time_per_iteration": 2.573927402496338 + }, + { + "auxiliary_loss_clip": 0.01113242, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.03696752, + "balance_loss_mlp": 1.01796925, + "epoch": 0.9572824289794077, + "flos": 41424228446400.0, + "grad_norm": 2.033038731766721, + "language_loss": 0.51233077, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.5337621, + "num_input_tokens_seen": 343470445, + "router_z_loss_clip": 0.76220703, + "router_z_loss_mlp": 0.11914062, + "step": 15922, + "time_per_iteration": 2.740575075149536 + }, + { + "auxiliary_loss_clip": 0.01111116, + "auxiliary_loss_mlp": 0.01031117, + "balance_loss_clip": 1.03786349, + "balance_loss_mlp": 1.01933289, + "epoch": 0.9573425522320758, + "flos": 22811664307680.0, + "grad_norm": 2.665647338709057, + "language_loss": 0.83906138, + "learning_rate": 1.903145411006557e-08, + "loss": 0.86048377, + "num_input_tokens_seen": 343485200, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11791992, + "step": 15923, + "time_per_iteration": 2.574817657470703 + }, + { + "auxiliary_loss_clip": 0.01106875, + "auxiliary_loss_mlp": 0.01028089, + "balance_loss_clip": 1.03590941, + "balance_loss_mlp": 1.01744318, + "epoch": 0.9574026754847437, + "flos": 34789092468480.0, + "grad_norm": 1.5943536043363111, + "language_loss": 0.75145125, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.77280086, + "num_input_tokens_seen": 343505080, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10650635, + "step": 15924, + "time_per_iteration": 2.6935007572174072 + }, + { + "auxiliary_loss_clip": 0.01109853, + "auxiliary_loss_mlp": 0.01027703, + "balance_loss_clip": 1.03655815, + "balance_loss_mlp": 1.01619983, + "epoch": 0.9574627987374117, + "flos": 29715148645920.0, + "grad_norm": 2.646583762984512, + "language_loss": 0.85946733, + "learning_rate": 1.892440427371711e-08, + "loss": 0.88084292, + "num_input_tokens_seen": 343523995, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.1151123, + "step": 15925, + "time_per_iteration": 2.7501797676086426 + }, + { + "auxiliary_loss_clip": 0.01114037, + "auxiliary_loss_mlp": 0.01033445, + "balance_loss_clip": 1.03857923, + "balance_loss_mlp": 1.02199507, + "epoch": 0.9575229219900797, + "flos": 28688262691680.0, + "grad_norm": 3.5841002813387495, + "language_loss": 0.75886256, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.78033739, + "num_input_tokens_seen": 343542015, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11456299, + "step": 15926, + "time_per_iteration": 4.108238935470581 + }, + { + "auxiliary_loss_clip": 0.01110023, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.03805232, + "balance_loss_mlp": 1.01642489, + "epoch": 0.9575830452427476, + "flos": 27667981054080.0, + "grad_norm": 2.08626945779734, + "language_loss": 0.77780044, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.79916966, + "num_input_tokens_seen": 343561680, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10473633, + "step": 15927, + "time_per_iteration": 3.946136236190796 + }, + { + "auxiliary_loss_clip": 0.01112727, + "auxiliary_loss_mlp": 0.01031523, + "balance_loss_clip": 1.03811085, + "balance_loss_mlp": 1.01917291, + "epoch": 0.9576431684954156, + "flos": 37198636326720.0, + "grad_norm": 1.7483848553234786, + "language_loss": 0.68684947, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.70829189, + "num_input_tokens_seen": 343585290, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.12353516, + "step": 15928, + "time_per_iteration": 2.7140862941741943 + }, + { + "auxiliary_loss_clip": 0.01110285, + "auxiliary_loss_mlp": 0.01029809, + "balance_loss_clip": 1.03871071, + "balance_loss_mlp": 1.01878178, + "epoch": 0.9577032917480836, + "flos": 26463249642240.0, + "grad_norm": 1.7207754914591333, + "language_loss": 0.821778, + "learning_rate": 1.871120608822485e-08, + "loss": 0.84317899, + "num_input_tokens_seen": 343604045, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11035156, + "step": 15929, + "time_per_iteration": 2.6144843101501465 + }, + { + "auxiliary_loss_clip": 0.01114002, + "auxiliary_loss_mlp": 0.01038176, + "balance_loss_clip": 1.03840697, + "balance_loss_mlp": 1.02721524, + "epoch": 0.9577634150007516, + "flos": 35414711917920.0, + "grad_norm": 1.537367251277294, + "language_loss": 0.72148943, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.74301118, + "num_input_tokens_seen": 343626595, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.10968018, + "step": 15930, + "time_per_iteration": 2.692969560623169 + }, + { + "auxiliary_loss_clip": 0.01108156, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.0374825, + "balance_loss_mlp": 1.0180496, + "epoch": 0.9578235382534195, + "flos": 23528029763520.0, + "grad_norm": 1.5742214033120985, + "language_loss": 0.61968791, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.64105576, + "num_input_tokens_seen": 343646195, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10577393, + "step": 15931, + "time_per_iteration": 2.586421251296997 + }, + { + "auxiliary_loss_clip": 0.01107651, + "auxiliary_loss_mlp": 0.01026958, + "balance_loss_clip": 1.03803182, + "balance_loss_mlp": 1.01649737, + "epoch": 0.9578836615060875, + "flos": 16715331948960.0, + "grad_norm": 1.9243531220202188, + "language_loss": 0.69281483, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.71416092, + "num_input_tokens_seen": 343663665, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10461426, + "step": 15932, + "time_per_iteration": 2.593075752258301 + }, + { + "auxiliary_loss_clip": 0.01113183, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.03771925, + "balance_loss_mlp": 1.01972532, + "epoch": 0.9579437847587554, + "flos": 20809788684480.0, + "grad_norm": 1.7463920883985338, + "language_loss": 0.75459439, + "learning_rate": 1.849920999338961e-08, + "loss": 0.77604437, + "num_input_tokens_seen": 343682145, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12097168, + "step": 15933, + "time_per_iteration": 2.566310405731201 + }, + { + "auxiliary_loss_clip": 0.01026923, + "auxiliary_loss_mlp": 0.01002271, + "balance_loss_clip": 1.00458908, + "balance_loss_mlp": 1.00137949, + "epoch": 0.9580039080114234, + "flos": 73909473939360.0, + "grad_norm": 0.7050770935904501, + "language_loss": 0.57232463, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.59261656, + "num_input_tokens_seen": 343744685, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00891113, + "step": 15934, + "time_per_iteration": 3.3739166259765625 + }, + { + "auxiliary_loss_clip": 0.01027019, + "auxiliary_loss_mlp": 0.01002144, + "balance_loss_clip": 1.00473332, + "balance_loss_mlp": 1.00121212, + "epoch": 0.9580640312640913, + "flos": 80820859147200.0, + "grad_norm": 0.9099026133469462, + "language_loss": 0.65829211, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.67858374, + "num_input_tokens_seen": 343801835, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.00930786, + "step": 15935, + "time_per_iteration": 3.177013874053955 + }, + { + "auxiliary_loss_clip": 0.010276, + "auxiliary_loss_mlp": 0.0100132, + "balance_loss_clip": 1.00528455, + "balance_loss_mlp": 1.00038469, + "epoch": 0.9581241545167594, + "flos": 75918926293920.0, + "grad_norm": 0.7825184290040865, + "language_loss": 0.57001638, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.59030569, + "num_input_tokens_seen": 343861515, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.00934601, + "step": 15936, + "time_per_iteration": 3.2401418685913086 + }, + { + "auxiliary_loss_clip": 0.01110397, + "auxiliary_loss_mlp": 0.01026841, + "balance_loss_clip": 1.03734779, + "balance_loss_mlp": 1.01519966, + "epoch": 0.9581842777694273, + "flos": 29002186641600.0, + "grad_norm": 1.680743769747993, + "language_loss": 0.78382695, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.80519933, + "num_input_tokens_seen": 343881240, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11657715, + "step": 15937, + "time_per_iteration": 2.6786186695098877 + }, + { + "auxiliary_loss_clip": 0.01112308, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.03932703, + "balance_loss_mlp": 1.02070117, + "epoch": 0.9582444010220953, + "flos": 25884461818080.0, + "grad_norm": 1.589024239358445, + "language_loss": 0.68540072, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.70684719, + "num_input_tokens_seen": 343900885, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11633301, + "step": 15938, + "time_per_iteration": 2.620943307876587 + }, + { + "auxiliary_loss_clip": 0.01109806, + "auxiliary_loss_mlp": 0.01028119, + "balance_loss_clip": 1.03748775, + "balance_loss_mlp": 1.01698446, + "epoch": 0.9583045242747633, + "flos": 29047357058400.0, + "grad_norm": 2.4775860305614223, + "language_loss": 0.65504682, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.67642611, + "num_input_tokens_seen": 343918460, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.1114502, + "step": 15939, + "time_per_iteration": 3.8906776905059814 + }, + { + "auxiliary_loss_clip": 0.01107312, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.03593349, + "balance_loss_mlp": 1.01966667, + "epoch": 0.9583646475274312, + "flos": 29445746836320.0, + "grad_norm": 1.588290870733746, + "language_loss": 0.73720121, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.75858086, + "num_input_tokens_seen": 343938030, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10992432, + "step": 15940, + "time_per_iteration": 2.6451759338378906 + }, + { + "auxiliary_loss_clip": 0.01109692, + "auxiliary_loss_mlp": 0.01030506, + "balance_loss_clip": 1.03744364, + "balance_loss_mlp": 1.01888347, + "epoch": 0.9584247707800992, + "flos": 25485950488320.0, + "grad_norm": 1.733958884229455, + "language_loss": 0.73024976, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.75165176, + "num_input_tokens_seen": 343956635, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11627197, + "step": 15941, + "time_per_iteration": 2.6140670776367188 + }, + { + "auxiliary_loss_clip": 0.0110917, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.03779078, + "balance_loss_mlp": 1.02156866, + "epoch": 0.9584848940327672, + "flos": 31808134931040.0, + "grad_norm": 1.6393416560561138, + "language_loss": 0.71403337, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.73544723, + "num_input_tokens_seen": 343976625, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10650635, + "step": 15942, + "time_per_iteration": 2.6580824851989746 + }, + { + "auxiliary_loss_clip": 0.01110212, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.03679729, + "balance_loss_mlp": 1.0198133, + "epoch": 0.9585450172854352, + "flos": 42089710548960.0, + "grad_norm": 1.6970100209064272, + "language_loss": 0.71897233, + "learning_rate": 1.797447974521571e-08, + "loss": 0.74038923, + "num_input_tokens_seen": 343997790, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11663818, + "step": 15943, + "time_per_iteration": 2.7201390266418457 + }, + { + "auxiliary_loss_clip": 0.01112585, + "auxiliary_loss_mlp": 0.01032868, + "balance_loss_clip": 1.0381422, + "balance_loss_mlp": 1.02074444, + "epoch": 0.9586051405381031, + "flos": 28200099322080.0, + "grad_norm": 1.7030988284016029, + "language_loss": 0.68118274, + "learning_rate": 1.792242006001965e-08, + "loss": 0.70263726, + "num_input_tokens_seen": 344016935, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.12133789, + "step": 15944, + "time_per_iteration": 2.652442693710327 + }, + { + "auxiliary_loss_clip": 0.01110189, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.03688478, + "balance_loss_mlp": 1.02112699, + "epoch": 0.9586652637907711, + "flos": 23921273846880.0, + "grad_norm": 1.6348514780263568, + "language_loss": 0.6568495, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.67827749, + "num_input_tokens_seen": 344035590, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11486816, + "step": 15945, + "time_per_iteration": 2.581585645675659 + }, + { + "auxiliary_loss_clip": 0.01026931, + "auxiliary_loss_mlp": 0.01001905, + "balance_loss_clip": 1.00462174, + "balance_loss_mlp": 1.00104952, + "epoch": 0.958725387043439, + "flos": 87944766253920.0, + "grad_norm": 0.8339689696386972, + "language_loss": 0.6188361, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.63912451, + "num_input_tokens_seen": 344100845, + "router_z_loss_clip": 0.22277832, + "router_z_loss_mlp": 0.00855255, + "step": 15946, + "time_per_iteration": 3.360194206237793 + }, + { + "auxiliary_loss_clip": 0.01107776, + "auxiliary_loss_mlp": 0.01026289, + "balance_loss_clip": 1.03768826, + "balance_loss_mlp": 1.01540542, + "epoch": 0.958785510296107, + "flos": 35279160150240.0, + "grad_norm": 1.7348854861107763, + "language_loss": 0.75115144, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.77249205, + "num_input_tokens_seen": 344121780, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10888672, + "step": 15947, + "time_per_iteration": 2.6676032543182373 + }, + { + "auxiliary_loss_clip": 0.01107341, + "auxiliary_loss_mlp": 0.01028939, + "balance_loss_clip": 1.0359906, + "balance_loss_mlp": 1.01819873, + "epoch": 0.958845633548775, + "flos": 22545868536000.0, + "grad_norm": 3.182995561762683, + "language_loss": 0.69687146, + "learning_rate": 1.771493294473747e-08, + "loss": 0.7182343, + "num_input_tokens_seen": 344140150, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10742188, + "step": 15948, + "time_per_iteration": 2.6355032920837402 + }, + { + "auxiliary_loss_clip": 0.0110685, + "auxiliary_loss_mlp": 0.01026435, + "balance_loss_clip": 1.03563535, + "balance_loss_mlp": 1.01573646, + "epoch": 0.958905756801443, + "flos": 29537586809280.0, + "grad_norm": 2.110846126227678, + "language_loss": 0.7855618, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.80689466, + "num_input_tokens_seen": 344158200, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10699463, + "step": 15949, + "time_per_iteration": 2.614729166030884 + }, + { + "auxiliary_loss_clip": 0.01110752, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.03844738, + "balance_loss_mlp": 1.02016258, + "epoch": 0.9589658800541109, + "flos": 30515210101440.0, + "grad_norm": 1.8475942993637333, + "language_loss": 0.68222833, + "learning_rate": 1.761164038992602e-08, + "loss": 0.70365047, + "num_input_tokens_seen": 344174720, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11315918, + "step": 15950, + "time_per_iteration": 2.6311843395233154 + }, + { + "auxiliary_loss_clip": 0.01110389, + "auxiliary_loss_mlp": 0.01031343, + "balance_loss_clip": 1.03778505, + "balance_loss_mlp": 1.02077508, + "epoch": 0.9590260033067789, + "flos": 28692638557920.0, + "grad_norm": 1.791578472103837, + "language_loss": 0.85898364, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.88040102, + "num_input_tokens_seen": 344192580, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.10577393, + "step": 15951, + "time_per_iteration": 2.627798080444336 + }, + { + "auxiliary_loss_clip": 0.01114057, + "auxiliary_loss_mlp": 0.01034296, + "balance_loss_clip": 1.03884614, + "balance_loss_mlp": 1.02256525, + "epoch": 0.9590861265594469, + "flos": 31140343343520.0, + "grad_norm": 2.3037054904065286, + "language_loss": 0.8052969, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.8267805, + "num_input_tokens_seen": 344210345, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11749268, + "step": 15952, + "time_per_iteration": 2.652837038040161 + }, + { + "auxiliary_loss_clip": 0.01110288, + "auxiliary_loss_mlp": 0.01026516, + "balance_loss_clip": 1.03816938, + "balance_loss_mlp": 1.01461315, + "epoch": 0.9591462498121148, + "flos": 25843018991040.0, + "grad_norm": 2.84043063096599, + "language_loss": 0.69704497, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.718413, + "num_input_tokens_seen": 344229540, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11895752, + "step": 15953, + "time_per_iteration": 2.586167097091675 + }, + { + "auxiliary_loss_clip": 0.01111215, + "auxiliary_loss_mlp": 0.01030797, + "balance_loss_clip": 1.03779185, + "balance_loss_mlp": 1.01863134, + "epoch": 0.9592063730647828, + "flos": 26506839885120.0, + "grad_norm": 4.261099885171508, + "language_loss": 0.58441705, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.60583723, + "num_input_tokens_seen": 344247830, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.12176514, + "step": 15954, + "time_per_iteration": 4.08018159866333 + }, + { + "auxiliary_loss_clip": 0.01112285, + "auxiliary_loss_mlp": 0.01032605, + "balance_loss_clip": 1.03848052, + "balance_loss_mlp": 1.0206064, + "epoch": 0.9592664963174508, + "flos": 36474896725920.0, + "grad_norm": 3.108596025363599, + "language_loss": 0.73839164, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.75984055, + "num_input_tokens_seen": 344267760, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11999512, + "step": 15955, + "time_per_iteration": 2.7985951900482178 + }, + { + "auxiliary_loss_clip": 0.01111336, + "auxiliary_loss_mlp": 0.01033468, + "balance_loss_clip": 1.03721452, + "balance_loss_mlp": 1.02175522, + "epoch": 0.9593266195701188, + "flos": 21963393639360.0, + "grad_norm": 1.8065892557862282, + "language_loss": 0.62413734, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.64558542, + "num_input_tokens_seen": 344284905, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11712646, + "step": 15956, + "time_per_iteration": 2.587190628051758 + }, + { + "auxiliary_loss_clip": 0.01110578, + "auxiliary_loss_mlp": 0.01028645, + "balance_loss_clip": 1.03802681, + "balance_loss_mlp": 1.01740348, + "epoch": 0.9593867428227867, + "flos": 22986835624800.0, + "grad_norm": 1.9771246691360307, + "language_loss": 0.60081065, + "learning_rate": 1.725248447997507e-08, + "loss": 0.62220287, + "num_input_tokens_seen": 344302025, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11248779, + "step": 15957, + "time_per_iteration": 2.61413311958313 + }, + { + "auxiliary_loss_clip": 0.01110829, + "auxiliary_loss_mlp": 0.01038882, + "balance_loss_clip": 1.03805697, + "balance_loss_mlp": 1.02757525, + "epoch": 0.9594468660754547, + "flos": 36078735398400.0, + "grad_norm": 2.1642358250365477, + "language_loss": 0.74136829, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.76286542, + "num_input_tokens_seen": 344321935, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11309814, + "step": 15958, + "time_per_iteration": 2.6536059379577637 + }, + { + "auxiliary_loss_clip": 0.01108278, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.03641582, + "balance_loss_mlp": 1.01792192, + "epoch": 0.9595069893281226, + "flos": 25262245820160.0, + "grad_norm": 1.8777884871130799, + "language_loss": 0.7439245, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.7653017, + "num_input_tokens_seen": 344340405, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11517334, + "step": 15959, + "time_per_iteration": 2.652940511703491 + }, + { + "auxiliary_loss_clip": 0.01111874, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.03847635, + "balance_loss_mlp": 1.01784432, + "epoch": 0.9595671125807906, + "flos": 27398255106240.0, + "grad_norm": 2.404807450801196, + "language_loss": 0.65060604, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.67201638, + "num_input_tokens_seen": 344359925, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11315918, + "step": 15960, + "time_per_iteration": 2.6970651149749756 + }, + { + "auxiliary_loss_clip": 0.01107587, + "auxiliary_loss_mlp": 0.01032306, + "balance_loss_clip": 1.03740907, + "balance_loss_mlp": 1.02096295, + "epoch": 0.9596272358334585, + "flos": 29181612273120.0, + "grad_norm": 1.8280787225514445, + "language_loss": 0.77976465, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.80116355, + "num_input_tokens_seen": 344379100, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11346436, + "step": 15961, + "time_per_iteration": 2.63688325881958 + }, + { + "auxiliary_loss_clip": 0.01107361, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.03697371, + "balance_loss_mlp": 1.01694918, + "epoch": 0.9596873590861266, + "flos": 21564152998560.0, + "grad_norm": 11.883570799383996, + "language_loss": 0.75445735, + "learning_rate": 1.699820008484698e-08, + "loss": 0.7758075, + "num_input_tokens_seen": 344396895, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.1071167, + "step": 15962, + "time_per_iteration": 2.6027581691741943 + }, + { + "auxiliary_loss_clip": 0.0111177, + "auxiliary_loss_mlp": 0.01029033, + "balance_loss_clip": 1.03728604, + "balance_loss_mlp": 1.01750541, + "epoch": 0.9597474823387945, + "flos": 31492347186240.0, + "grad_norm": 2.6866661033686867, + "language_loss": 0.7184183, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.73982632, + "num_input_tokens_seen": 344415115, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11523438, + "step": 15963, + "time_per_iteration": 2.618250608444214 + }, + { + "auxiliary_loss_clip": 0.01107303, + "auxiliary_loss_mlp": 0.01030958, + "balance_loss_clip": 1.03885567, + "balance_loss_mlp": 1.02043796, + "epoch": 0.9598076055914625, + "flos": 29002672848960.0, + "grad_norm": 2.128977472037575, + "language_loss": 0.74386132, + "learning_rate": 1.689701268270527e-08, + "loss": 0.76524395, + "num_input_tokens_seen": 344435185, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.10516357, + "step": 15964, + "time_per_iteration": 2.6576311588287354 + }, + { + "auxiliary_loss_clip": 0.01027121, + "auxiliary_loss_mlp": 0.01002045, + "balance_loss_clip": 1.00478745, + "balance_loss_mlp": 1.00115418, + "epoch": 0.9598677288441305, + "flos": 68961803427360.0, + "grad_norm": 0.8825434868853207, + "language_loss": 0.57569194, + "learning_rate": 1.684653177987161e-08, + "loss": 0.59598356, + "num_input_tokens_seen": 344488950, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00891876, + "step": 15965, + "time_per_iteration": 4.766626834869385 + }, + { + "auxiliary_loss_clip": 0.01109309, + "auxiliary_loss_mlp": 0.01028359, + "balance_loss_clip": 1.03666043, + "balance_loss_mlp": 1.01780927, + "epoch": 0.9599278520967984, + "flos": 28062238069440.0, + "grad_norm": 1.9904809616871562, + "language_loss": 0.78975224, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.81112897, + "num_input_tokens_seen": 344506740, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10552979, + "step": 15966, + "time_per_iteration": 2.6160027980804443 + }, + { + "auxiliary_loss_clip": 0.01103944, + "auxiliary_loss_mlp": 0.01027882, + "balance_loss_clip": 1.034127, + "balance_loss_mlp": 1.01670063, + "epoch": 0.9599879753494664, + "flos": 28113202457280.0, + "grad_norm": 1.703441683336394, + "language_loss": 0.79447055, + "learning_rate": 1.674579558025102e-08, + "loss": 0.81578887, + "num_input_tokens_seen": 344526670, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.11187744, + "step": 15967, + "time_per_iteration": 4.08367133140564 + }, + { + "auxiliary_loss_clip": 0.01114229, + "auxiliary_loss_mlp": 0.01024199, + "balance_loss_clip": 1.03916037, + "balance_loss_mlp": 1.01189017, + "epoch": 0.9600480986021344, + "flos": 20002191014880.0, + "grad_norm": 2.170215437894623, + "language_loss": 0.80662262, + "learning_rate": 1.669554028728348e-08, + "loss": 0.82800686, + "num_input_tokens_seen": 344541995, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.12316895, + "step": 15968, + "time_per_iteration": 2.6166820526123047 + }, + { + "auxiliary_loss_clip": 0.01115157, + "auxiliary_loss_mlp": 0.0103438, + "balance_loss_clip": 1.04001021, + "balance_loss_mlp": 1.02155256, + "epoch": 0.9601082218548024, + "flos": 29622741431040.0, + "grad_norm": 3.072299368148948, + "language_loss": 0.67775774, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.69925308, + "num_input_tokens_seen": 344559980, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12835693, + "step": 15969, + "time_per_iteration": 2.638941526412964 + }, + { + "auxiliary_loss_clip": 0.01110248, + "auxiliary_loss_mlp": 0.01033388, + "balance_loss_clip": 1.03965235, + "balance_loss_mlp": 1.02328551, + "epoch": 0.9601683451074703, + "flos": 23928040232640.0, + "grad_norm": 3.202722939954169, + "language_loss": 0.78902483, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.81046116, + "num_input_tokens_seen": 344577765, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10095215, + "step": 15970, + "time_per_iteration": 2.611454963684082 + }, + { + "auxiliary_loss_clip": 0.01109289, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.03965473, + "balance_loss_mlp": 1.0183773, + "epoch": 0.9602284683601383, + "flos": 32520205555200.0, + "grad_norm": 1.5021200824905374, + "language_loss": 0.77320802, + "learning_rate": 1.654522565861316e-08, + "loss": 0.7945981, + "num_input_tokens_seen": 344597650, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.11340332, + "step": 15971, + "time_per_iteration": 2.662058115005493 + }, + { + "auxiliary_loss_clip": 0.01111368, + "auxiliary_loss_mlp": 0.01023701, + "balance_loss_clip": 1.03705108, + "balance_loss_mlp": 1.01272786, + "epoch": 0.9602885916128062, + "flos": 18979275754080.0, + "grad_norm": 2.033241146245272, + "language_loss": 0.6789248, + "learning_rate": 1.64952712054669e-08, + "loss": 0.70027548, + "num_input_tokens_seen": 344613580, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.10980225, + "step": 15972, + "time_per_iteration": 2.5740292072296143 + }, + { + "auxiliary_loss_clip": 0.01108516, + "auxiliary_loss_mlp": 0.01025576, + "balance_loss_clip": 1.03640389, + "balance_loss_mlp": 1.01439428, + "epoch": 0.9603487148654742, + "flos": 20136446229600.0, + "grad_norm": 2.536201903329633, + "language_loss": 0.76077199, + "learning_rate": 1.644539196701844e-08, + "loss": 0.78211296, + "num_input_tokens_seen": 344626910, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11187744, + "step": 15973, + "time_per_iteration": 2.579993486404419 + }, + { + "auxiliary_loss_clip": 0.01110751, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.03996897, + "balance_loss_mlp": 1.02403367, + "epoch": 0.9604088381181421, + "flos": 25435999032480.0, + "grad_norm": 1.6397919085120245, + "language_loss": 0.69102263, + "learning_rate": 1.639558794515983e-08, + "loss": 0.7124843, + "num_input_tokens_seen": 344644330, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.1137085, + "step": 15974, + "time_per_iteration": 2.6075596809387207 + }, + { + "auxiliary_loss_clip": 0.0110863, + "auxiliary_loss_mlp": 0.01027145, + "balance_loss_clip": 1.03509498, + "balance_loss_mlp": 1.01566529, + "epoch": 0.9604689613708102, + "flos": 24017246582400.0, + "grad_norm": 1.6668755921113276, + "language_loss": 0.67864877, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.70000648, + "num_input_tokens_seen": 344663910, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11486816, + "step": 15975, + "time_per_iteration": 2.592973232269287 + }, + { + "auxiliary_loss_clip": 0.01106269, + "auxiliary_loss_mlp": 0.01025089, + "balance_loss_clip": 1.03713536, + "balance_loss_mlp": 1.01416337, + "epoch": 0.9605290846234781, + "flos": 29404141940160.0, + "grad_norm": 2.4292332173085596, + "language_loss": 0.56015086, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.58146447, + "num_input_tokens_seen": 344682320, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.10931396, + "step": 15976, + "time_per_iteration": 2.6491451263427734 + }, + { + "auxiliary_loss_clip": 0.01105987, + "auxiliary_loss_mlp": 0.0102522, + "balance_loss_clip": 1.03584957, + "balance_loss_mlp": 1.01460469, + "epoch": 0.9605892078761461, + "flos": 33095792514240.0, + "grad_norm": 1.8205593272749305, + "language_loss": 0.68486923, + "learning_rate": 1.624662719799219e-08, + "loss": 0.70618129, + "num_input_tokens_seen": 344701355, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10614014, + "step": 15977, + "time_per_iteration": 2.6310479640960693 + }, + { + "auxiliary_loss_clip": 0.01108451, + "auxiliary_loss_mlp": 0.01033944, + "balance_loss_clip": 1.03688741, + "balance_loss_mlp": 1.02224374, + "epoch": 0.9606493311288141, + "flos": 17249395046400.0, + "grad_norm": 1.9321396534647597, + "language_loss": 0.81699097, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.83841491, + "num_input_tokens_seen": 344717980, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11694336, + "step": 15978, + "time_per_iteration": 3.9825236797332764 + }, + { + "auxiliary_loss_clip": 0.01114034, + "auxiliary_loss_mlp": 0.01027828, + "balance_loss_clip": 1.03923893, + "balance_loss_mlp": 1.01651478, + "epoch": 0.960709454381482, + "flos": 19295266085280.0, + "grad_norm": 2.26819132721024, + "language_loss": 0.83138359, + "learning_rate": 1.614769615070921e-08, + "loss": 0.85280216, + "num_input_tokens_seen": 344733480, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11309814, + "step": 15979, + "time_per_iteration": 2.5487067699432373 + }, + { + "auxiliary_loss_clip": 0.01109765, + "auxiliary_loss_mlp": 0.01030785, + "balance_loss_clip": 1.03725791, + "balance_loss_mlp": 1.02031207, + "epoch": 0.96076957763415, + "flos": 27534455150400.0, + "grad_norm": 3.776649031689069, + "language_loss": 0.80334932, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.82475483, + "num_input_tokens_seen": 344752130, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10479736, + "step": 15980, + "time_per_iteration": 2.6585071086883545 + }, + { + "auxiliary_loss_clip": 0.01110064, + "auxiliary_loss_mlp": 0.01026182, + "balance_loss_clip": 1.03701258, + "balance_loss_mlp": 1.01539385, + "epoch": 0.960829700886818, + "flos": 30116415150720.0, + "grad_norm": 3.251813819605887, + "language_loss": 0.68336409, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.70472658, + "num_input_tokens_seen": 344771195, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10778809, + "step": 15981, + "time_per_iteration": 2.7242345809936523 + }, + { + "auxiliary_loss_clip": 0.01107621, + "auxiliary_loss_mlp": 0.01026103, + "balance_loss_clip": 1.03720379, + "balance_loss_mlp": 1.01574993, + "epoch": 0.960889824139486, + "flos": 32387287410720.0, + "grad_norm": 1.7271141985038225, + "language_loss": 0.69643617, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.71777338, + "num_input_tokens_seen": 344793150, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.10351562, + "step": 15982, + "time_per_iteration": 2.7301323413848877 + }, + { + "auxiliary_loss_clip": 0.01027166, + "auxiliary_loss_mlp": 0.01001566, + "balance_loss_clip": 1.00481641, + "balance_loss_mlp": 1.00066531, + "epoch": 0.9609499473921539, + "flos": 86776169905440.0, + "grad_norm": 0.6712351661012684, + "language_loss": 0.53269291, + "learning_rate": 1.595073680563286e-08, + "loss": 0.55298018, + "num_input_tokens_seen": 344852855, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.0090065, + "step": 15983, + "time_per_iteration": 3.348877429962158 + }, + { + "auxiliary_loss_clip": 0.0110958, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.03826237, + "balance_loss_mlp": 1.02192068, + "epoch": 0.9610100706448219, + "flos": 25078687426080.0, + "grad_norm": 2.055099820901924, + "language_loss": 0.68153685, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.70296216, + "num_input_tokens_seen": 344869830, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11016846, + "step": 15984, + "time_per_iteration": 2.6073052883148193 + }, + { + "auxiliary_loss_clip": 0.01104995, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.03654957, + "balance_loss_mlp": 1.0226295, + "epoch": 0.9610701938974898, + "flos": 17160634386720.0, + "grad_norm": 2.549646579199551, + "language_loss": 0.67504603, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.69642746, + "num_input_tokens_seen": 344888905, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.10516357, + "step": 15985, + "time_per_iteration": 2.62200927734375 + }, + { + "auxiliary_loss_clip": 0.01111213, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.03817296, + "balance_loss_mlp": 1.01950693, + "epoch": 0.9611303171501578, + "flos": 24684632997120.0, + "grad_norm": 2.4803990233759623, + "language_loss": 0.78963816, + "learning_rate": 1.580380726142283e-08, + "loss": 0.81105459, + "num_input_tokens_seen": 344907160, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.10931396, + "step": 15986, + "time_per_iteration": 2.5771143436431885 + }, + { + "auxiliary_loss_clip": 0.01112068, + "auxiliary_loss_mlp": 0.01029891, + "balance_loss_clip": 1.03936708, + "balance_loss_mlp": 1.01762986, + "epoch": 0.9611904404028258, + "flos": 25565351656320.0, + "grad_norm": 2.1924367292515328, + "language_loss": 0.63756502, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.65898454, + "num_input_tokens_seen": 344922400, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.12261963, + "step": 15987, + "time_per_iteration": 2.6150481700897217 + }, + { + "auxiliary_loss_clip": 0.01106714, + "auxiliary_loss_mlp": 0.01026391, + "balance_loss_clip": 1.0379982, + "balance_loss_mlp": 1.01668167, + "epoch": 0.9612505636554938, + "flos": 30295111471200.0, + "grad_norm": 1.7954145851196013, + "language_loss": 0.66472936, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.68606043, + "num_input_tokens_seen": 344941910, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.09710693, + "step": 15988, + "time_per_iteration": 2.644217014312744 + }, + { + "auxiliary_loss_clip": 0.0110831, + "auxiliary_loss_mlp": 0.01037495, + "balance_loss_clip": 1.03650665, + "balance_loss_mlp": 1.02671814, + "epoch": 0.9613106869081617, + "flos": 20944489589280.0, + "grad_norm": 3.25445664175611, + "language_loss": 0.74429888, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.76575691, + "num_input_tokens_seen": 344960020, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.10784912, + "step": 15989, + "time_per_iteration": 2.6646366119384766 + }, + { + "auxiliary_loss_clip": 0.01027053, + "auxiliary_loss_mlp": 0.01001584, + "balance_loss_clip": 1.00475264, + "balance_loss_mlp": 1.00065541, + "epoch": 0.9613708101608297, + "flos": 75120404495040.0, + "grad_norm": 0.8700188821336444, + "language_loss": 0.631567, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.65185332, + "num_input_tokens_seen": 345018290, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.00926971, + "step": 15990, + "time_per_iteration": 3.19085431098938 + }, + { + "auxiliary_loss_clip": 0.01108695, + "auxiliary_loss_mlp": 0.01029099, + "balance_loss_clip": 1.0364598, + "balance_loss_mlp": 1.01850748, + "epoch": 0.9614309334134977, + "flos": 33455859295680.0, + "grad_norm": 2.742653454877012, + "language_loss": 0.77637649, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.79775447, + "num_input_tokens_seen": 345040235, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.105896, + "step": 15991, + "time_per_iteration": 2.65702223777771 + }, + { + "auxiliary_loss_clip": 0.01113324, + "auxiliary_loss_mlp": 0.01031769, + "balance_loss_clip": 1.03689337, + "balance_loss_mlp": 1.01970482, + "epoch": 0.9614910566661656, + "flos": 27844246337760.0, + "grad_norm": 2.5573163841269393, + "language_loss": 0.85060036, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.8720513, + "num_input_tokens_seen": 345054540, + "router_z_loss_clip": 0.76416016, + "router_z_loss_mlp": 0.12060547, + "step": 15992, + "time_per_iteration": 2.627816915512085 + }, + { + "auxiliary_loss_clip": 0.01108478, + "auxiliary_loss_mlp": 0.01029601, + "balance_loss_clip": 1.03523922, + "balance_loss_mlp": 1.0178529, + "epoch": 0.9615511799188337, + "flos": 25218898680960.0, + "grad_norm": 1.9500610018734617, + "language_loss": 0.72301507, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.74439585, + "num_input_tokens_seen": 345074035, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11743164, + "step": 15993, + "time_per_iteration": 2.6061813831329346 + }, + { + "auxiliary_loss_clip": 0.01109151, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.03569627, + "balance_loss_mlp": 1.01875985, + "epoch": 0.9616113031715016, + "flos": 40449360329280.0, + "grad_norm": 2.084698743317128, + "language_loss": 0.68329966, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.70469534, + "num_input_tokens_seen": 345099270, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11663818, + "step": 15994, + "time_per_iteration": 4.161406517028809 + }, + { + "auxiliary_loss_clip": 0.01110607, + "auxiliary_loss_mlp": 0.01028496, + "balance_loss_clip": 1.03855634, + "balance_loss_mlp": 1.01686168, + "epoch": 0.9616714264241696, + "flos": 30517317000000.0, + "grad_norm": 1.9358854921687956, + "language_loss": 0.84515297, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.86654401, + "num_input_tokens_seen": 345116975, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11633301, + "step": 15995, + "time_per_iteration": 2.6353471279144287 + }, + { + "auxiliary_loss_clip": 0.01111476, + "auxiliary_loss_mlp": 0.01030787, + "balance_loss_clip": 1.03678083, + "balance_loss_mlp": 1.01895523, + "epoch": 0.9617315496768375, + "flos": 16530639071040.0, + "grad_norm": 2.3549137114741643, + "language_loss": 0.76031178, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.78173441, + "num_input_tokens_seen": 345133645, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11834717, + "step": 15996, + "time_per_iteration": 2.595144271850586 + }, + { + "auxiliary_loss_clip": 0.01107284, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.03553057, + "balance_loss_mlp": 1.01851416, + "epoch": 0.9617916729295055, + "flos": 13732064926560.0, + "grad_norm": 2.04477984945901, + "language_loss": 0.77144277, + "learning_rate": 1.52708595287494e-08, + "loss": 0.79281199, + "num_input_tokens_seen": 345150740, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11120605, + "step": 15997, + "time_per_iteration": 2.7018380165100098 + }, + { + "auxiliary_loss_clip": 0.01104753, + "auxiliary_loss_mlp": 0.01028237, + "balance_loss_clip": 1.03609002, + "balance_loss_mlp": 1.01801527, + "epoch": 0.9618517961821734, + "flos": 27844894614240.0, + "grad_norm": 1.5544925493693387, + "language_loss": 0.67132115, + "learning_rate": 1.522286126505001e-08, + "loss": 0.69265103, + "num_input_tokens_seen": 345170365, + "router_z_loss_clip": 0.68652344, + "router_z_loss_mlp": 0.10223389, + "step": 15998, + "time_per_iteration": 2.6159019470214844 + }, + { + "auxiliary_loss_clip": 0.01107809, + "auxiliary_loss_mlp": 0.010287, + "balance_loss_clip": 1.03623724, + "balance_loss_mlp": 1.0170716, + "epoch": 0.9619119194348414, + "flos": 20276252311680.0, + "grad_norm": 1.9448164476763803, + "language_loss": 0.73044044, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.75180548, + "num_input_tokens_seen": 345188930, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11639404, + "step": 15999, + "time_per_iteration": 2.597996950149536 + }, + { + "auxiliary_loss_clip": 0.0110537, + "auxiliary_loss_mlp": 0.01025926, + "balance_loss_clip": 1.0365566, + "balance_loss_mlp": 1.0158112, + "epoch": 0.9619720426875094, + "flos": 29938124003040.0, + "grad_norm": 2.0636062564322697, + "language_loss": 0.65082729, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.67214018, + "num_input_tokens_seen": 345209615, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10107422, + "step": 16000, + "time_per_iteration": 2.6489205360412598 + }, + { + "auxiliary_loss_clip": 0.01108351, + "auxiliary_loss_mlp": 0.010261, + "balance_loss_clip": 1.03639984, + "balance_loss_mlp": 1.0148114, + "epoch": 0.9620321659401774, + "flos": 25175389472640.0, + "grad_norm": 1.9622502065285896, + "language_loss": 0.75498426, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.7763288, + "num_input_tokens_seen": 345229175, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.112854, + "step": 16001, + "time_per_iteration": 2.632371425628662 + }, + { + "auxiliary_loss_clip": 0.01106417, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.0346601, + "balance_loss_mlp": 1.0182426, + "epoch": 0.9620922891928453, + "flos": 22592213953920.0, + "grad_norm": 1.7055006243528226, + "language_loss": 0.68124264, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.70260525, + "num_input_tokens_seen": 345247815, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11602783, + "step": 16002, + "time_per_iteration": 2.6932106018066406 + }, + { + "auxiliary_loss_clip": 0.01110429, + "auxiliary_loss_mlp": 0.01031688, + "balance_loss_clip": 1.03916478, + "balance_loss_mlp": 1.02046466, + "epoch": 0.9621524124455133, + "flos": 35102651762880.0, + "grad_norm": 1.4087598251985027, + "language_loss": 0.64568019, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.66710132, + "num_input_tokens_seen": 345269935, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11224365, + "step": 16003, + "time_per_iteration": 2.7273213863372803 + }, + { + "auxiliary_loss_clip": 0.01111414, + "auxiliary_loss_mlp": 0.01037068, + "balance_loss_clip": 1.038903, + "balance_loss_mlp": 1.02644086, + "epoch": 0.9622125356981813, + "flos": 23304365612640.0, + "grad_norm": 1.7720484482797696, + "language_loss": 0.75604111, + "learning_rate": 1.493645226826512e-08, + "loss": 0.7775259, + "num_input_tokens_seen": 345288310, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10632324, + "step": 16004, + "time_per_iteration": 2.584766387939453 + }, + { + "auxiliary_loss_clip": 0.01107578, + "auxiliary_loss_mlp": 0.01027454, + "balance_loss_clip": 1.03722286, + "balance_loss_mlp": 1.01587248, + "epoch": 0.9622726589508492, + "flos": 24773515208640.0, + "grad_norm": 2.095067005670735, + "language_loss": 0.79670924, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.81805956, + "num_input_tokens_seen": 345306615, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11584473, + "step": 16005, + "time_per_iteration": 4.08805513381958 + }, + { + "auxiliary_loss_clip": 0.01105287, + "auxiliary_loss_mlp": 0.0102729, + "balance_loss_clip": 1.03570485, + "balance_loss_mlp": 1.01663303, + "epoch": 0.9623327822035173, + "flos": 67027781547360.0, + "grad_norm": 2.197371173666002, + "language_loss": 0.67014039, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.69146621, + "num_input_tokens_seen": 345331935, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10662842, + "step": 16006, + "time_per_iteration": 4.3981640338897705 + }, + { + "auxiliary_loss_clip": 0.01104579, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.03740704, + "balance_loss_mlp": 1.01943874, + "epoch": 0.9623929054561852, + "flos": 26550794783520.0, + "grad_norm": 1.6019872591007491, + "language_loss": 0.78227806, + "learning_rate": 1.479426394188521e-08, + "loss": 0.80362117, + "num_input_tokens_seen": 345351510, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 0.10296631, + "step": 16007, + "time_per_iteration": 2.6383213996887207 + }, + { + "auxiliary_loss_clip": 0.01111321, + "auxiliary_loss_mlp": 0.0103148, + "balance_loss_clip": 1.03841519, + "balance_loss_mlp": 1.02003038, + "epoch": 0.9624530287088532, + "flos": 21879900226080.0, + "grad_norm": 2.055048290582686, + "language_loss": 0.67908847, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.70051646, + "num_input_tokens_seen": 345367750, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11450195, + "step": 16008, + "time_per_iteration": 2.611590623855591 + }, + { + "auxiliary_loss_clip": 0.01110864, + "auxiliary_loss_mlp": 0.0102835, + "balance_loss_clip": 1.03783607, + "balance_loss_mlp": 1.01612496, + "epoch": 0.9625131519615211, + "flos": 28374865466400.0, + "grad_norm": 2.196922107767491, + "language_loss": 0.73679233, + "learning_rate": 1.469984811730529e-08, + "loss": 0.75818449, + "num_input_tokens_seen": 345384790, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12231445, + "step": 16009, + "time_per_iteration": 2.5970592498779297 + }, + { + "auxiliary_loss_clip": 0.01106471, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.03537023, + "balance_loss_mlp": 1.0187763, + "epoch": 0.9625732752141891, + "flos": 23081957497440.0, + "grad_norm": 1.9031840950837757, + "language_loss": 0.76016521, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.78152722, + "num_input_tokens_seen": 345403390, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.10955811, + "step": 16010, + "time_per_iteration": 2.6101529598236084 + }, + { + "auxiliary_loss_clip": 0.01117189, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.04056442, + "balance_loss_mlp": 1.01771641, + "epoch": 0.962633398466857, + "flos": 19831517115840.0, + "grad_norm": 1.849575293360489, + "language_loss": 0.69546378, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.71694613, + "num_input_tokens_seen": 345418685, + "router_z_loss_clip": 0.76660156, + "router_z_loss_mlp": 0.13317871, + "step": 16011, + "time_per_iteration": 2.699880361557007 + }, + { + "auxiliary_loss_clip": 0.01107316, + "auxiliary_loss_mlp": 0.01031241, + "balance_loss_clip": 1.0370667, + "balance_loss_mlp": 1.02079189, + "epoch": 0.962693521719525, + "flos": 66134380979520.0, + "grad_norm": 1.8311364273267452, + "language_loss": 0.68745846, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.70884407, + "num_input_tokens_seen": 345442380, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10455322, + "step": 16012, + "time_per_iteration": 2.883556842803955 + }, + { + "auxiliary_loss_clip": 0.01116838, + "auxiliary_loss_mlp": 0.01033057, + "balance_loss_clip": 1.04027426, + "balance_loss_mlp": 1.02060556, + "epoch": 0.962753644972193, + "flos": 40397018353920.0, + "grad_norm": 2.9099934023268195, + "language_loss": 0.72351468, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.74501359, + "num_input_tokens_seen": 345463815, + "router_z_loss_clip": 0.765625, + "router_z_loss_mlp": 0.12451172, + "step": 16013, + "time_per_iteration": 2.7073910236358643 + }, + { + "auxiliary_loss_clip": 0.0110801, + "auxiliary_loss_mlp": 0.01028345, + "balance_loss_clip": 1.03622103, + "balance_loss_mlp": 1.0164659, + "epoch": 0.962813768224861, + "flos": 51531564644640.0, + "grad_norm": 3.0359999614138617, + "language_loss": 0.6337201, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.65508366, + "num_input_tokens_seen": 345484525, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11889648, + "step": 16014, + "time_per_iteration": 2.775521755218506 + }, + { + "auxiliary_loss_clip": 0.0110517, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.03713703, + "balance_loss_mlp": 1.01999044, + "epoch": 0.9628738914775289, + "flos": 53623213859520.0, + "grad_norm": 1.6806149739634544, + "language_loss": 0.71823633, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.73958653, + "num_input_tokens_seen": 345508295, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.09875488, + "step": 16015, + "time_per_iteration": 2.798283815383911 + }, + { + "auxiliary_loss_clip": 0.01107628, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.03643441, + "balance_loss_mlp": 1.01716244, + "epoch": 0.9629340147301969, + "flos": 19030118590080.0, + "grad_norm": 2.2332652313753587, + "language_loss": 0.77298403, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.79434168, + "num_input_tokens_seen": 345525155, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10980225, + "step": 16016, + "time_per_iteration": 2.5680670738220215 + }, + { + "auxiliary_loss_clip": 0.01027028, + "auxiliary_loss_mlp": 0.01001782, + "balance_loss_clip": 1.00475848, + "balance_loss_mlp": 1.00090778, + "epoch": 0.9629941379828649, + "flos": 76813096690080.0, + "grad_norm": 0.8145469497649257, + "language_loss": 0.63121605, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.65150416, + "num_input_tokens_seen": 345578905, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.00874329, + "step": 16017, + "time_per_iteration": 3.144444227218628 + }, + { + "auxiliary_loss_clip": 0.01110893, + "auxiliary_loss_mlp": 0.0102703, + "balance_loss_clip": 1.03795075, + "balance_loss_mlp": 1.0160327, + "epoch": 0.9630542612355328, + "flos": 36483607941120.0, + "grad_norm": 2.0820924768327203, + "language_loss": 0.66231543, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.68369466, + "num_input_tokens_seen": 345598965, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.10998535, + "step": 16018, + "time_per_iteration": 4.020913362503052 + }, + { + "auxiliary_loss_clip": 0.01109219, + "auxiliary_loss_mlp": 0.01035718, + "balance_loss_clip": 1.03822088, + "balance_loss_mlp": 1.02469683, + "epoch": 0.9631143844882009, + "flos": 21834810843840.0, + "grad_norm": 1.9389704561840593, + "language_loss": 0.79465806, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.81610745, + "num_input_tokens_seen": 345617945, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11029053, + "step": 16019, + "time_per_iteration": 2.5954718589782715 + }, + { + "auxiliary_loss_clip": 0.01105914, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.03564477, + "balance_loss_mlp": 1.01823211, + "epoch": 0.9631745077408688, + "flos": 31895599037760.0, + "grad_norm": 1.4874595742391434, + "language_loss": 0.71719193, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.73853326, + "num_input_tokens_seen": 345637920, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.09979248, + "step": 16020, + "time_per_iteration": 2.656179904937744 + }, + { + "auxiliary_loss_clip": 0.01108559, + "auxiliary_loss_mlp": 0.01025603, + "balance_loss_clip": 1.03648686, + "balance_loss_mlp": 1.01527929, + "epoch": 0.9632346309935368, + "flos": 30473159515200.0, + "grad_norm": 1.849904487230292, + "language_loss": 0.77051407, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.79185569, + "num_input_tokens_seen": 345656195, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10321045, + "step": 16021, + "time_per_iteration": 2.650395631790161 + }, + { + "auxiliary_loss_clip": 0.01112394, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.03725016, + "balance_loss_mlp": 1.01735437, + "epoch": 0.9632947542462047, + "flos": 28818547212960.0, + "grad_norm": 2.1218233066182224, + "language_loss": 0.64614916, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.66757762, + "num_input_tokens_seen": 345676700, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.13098145, + "step": 16022, + "time_per_iteration": 2.6391971111297607 + }, + { + "auxiliary_loss_clip": 0.01106155, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.03601623, + "balance_loss_mlp": 1.01904798, + "epoch": 0.9633548774988727, + "flos": 32208429021120.0, + "grad_norm": 2.005388701894513, + "language_loss": 0.73258257, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.75393701, + "num_input_tokens_seen": 345696725, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10241699, + "step": 16023, + "time_per_iteration": 2.636584997177124 + }, + { + "auxiliary_loss_clip": 0.01107193, + "auxiliary_loss_mlp": 0.0102483, + "balance_loss_clip": 1.03623223, + "balance_loss_mlp": 1.01450086, + "epoch": 0.9634150007515406, + "flos": 29001781468800.0, + "grad_norm": 2.1798994777868246, + "language_loss": 0.81421244, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.83553267, + "num_input_tokens_seen": 345716245, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10333252, + "step": 16024, + "time_per_iteration": 2.626708745956421 + }, + { + "auxiliary_loss_clip": 0.01113084, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.03774118, + "balance_loss_mlp": 1.02204561, + "epoch": 0.9634751240042086, + "flos": 29450122702560.0, + "grad_norm": 2.405483814707075, + "language_loss": 0.81474078, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.83620942, + "num_input_tokens_seen": 345739060, + "router_z_loss_clip": 0.75341797, + "router_z_loss_mlp": 0.11737061, + "step": 16025, + "time_per_iteration": 2.659428834915161 + }, + { + "auxiliary_loss_clip": 0.0110983, + "auxiliary_loss_mlp": 0.01027288, + "balance_loss_clip": 1.03595543, + "balance_loss_mlp": 1.01590908, + "epoch": 0.9635352472568766, + "flos": 29711421056160.0, + "grad_norm": 1.9176751117389508, + "language_loss": 0.76696318, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.78833437, + "num_input_tokens_seen": 345758325, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.1137085, + "step": 16026, + "time_per_iteration": 2.682950735092163 + }, + { + "auxiliary_loss_clip": 0.01109861, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.03673089, + "balance_loss_mlp": 1.01885033, + "epoch": 0.9635953705095446, + "flos": 29266807412160.0, + "grad_norm": 4.1242095165013595, + "language_loss": 0.63280267, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.65420717, + "num_input_tokens_seen": 345778530, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11743164, + "step": 16027, + "time_per_iteration": 2.640382766723633 + }, + { + "auxiliary_loss_clip": 0.01112149, + "auxiliary_loss_mlp": 0.01025572, + "balance_loss_clip": 1.03869081, + "balance_loss_mlp": 1.01406264, + "epoch": 0.9636554937622125, + "flos": 24194646349920.0, + "grad_norm": 2.1071644534934677, + "language_loss": 0.87346619, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.8948434, + "num_input_tokens_seen": 345796535, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.1151123, + "step": 16028, + "time_per_iteration": 2.648934841156006 + }, + { + "auxiliary_loss_clip": 0.0102697, + "auxiliary_loss_mlp": 0.01001412, + "balance_loss_clip": 1.00465715, + "balance_loss_mlp": 1.00051928, + "epoch": 0.9637156170148805, + "flos": 82285713911520.0, + "grad_norm": 0.6856236052013638, + "language_loss": 0.5322364, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.55252022, + "num_input_tokens_seen": 345859700, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00892639, + "step": 16029, + "time_per_iteration": 3.2130062580108643 + }, + { + "auxiliary_loss_clip": 0.01110714, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.03704643, + "balance_loss_mlp": 1.01699924, + "epoch": 0.9637757402675484, + "flos": 24771367792800.0, + "grad_norm": 1.6442469652980793, + "language_loss": 0.74094486, + "learning_rate": 1.372666546129797e-08, + "loss": 0.76233763, + "num_input_tokens_seen": 345878760, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11572266, + "step": 16030, + "time_per_iteration": 2.6285247802734375 + }, + { + "auxiliary_loss_clip": 0.01107996, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.03825653, + "balance_loss_mlp": 1.01848912, + "epoch": 0.9638358635202164, + "flos": 33232195144800.0, + "grad_norm": 1.738765672747237, + "language_loss": 0.66280055, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.68417025, + "num_input_tokens_seen": 345900445, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.1048584, + "step": 16031, + "time_per_iteration": 2.7306971549987793 + }, + { + "auxiliary_loss_clip": 0.01027161, + "auxiliary_loss_mlp": 0.01001843, + "balance_loss_clip": 1.00482154, + "balance_loss_mlp": 1.00088453, + "epoch": 0.9638959867728845, + "flos": 85767071037120.0, + "grad_norm": 0.8323320916118573, + "language_loss": 0.60734689, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62763691, + "num_input_tokens_seen": 345961020, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00957489, + "step": 16032, + "time_per_iteration": 3.2646865844726562 + }, + { + "auxiliary_loss_clip": 0.01101516, + "auxiliary_loss_mlp": 0.01026325, + "balance_loss_clip": 1.03466845, + "balance_loss_mlp": 1.01634121, + "epoch": 0.9639561100255524, + "flos": 31002968298240.0, + "grad_norm": 1.8233770906760538, + "language_loss": 0.66678166, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.68806005, + "num_input_tokens_seen": 345980210, + "router_z_loss_clip": 0.66845703, + "router_z_loss_mlp": 0.09985352, + "step": 16033, + "time_per_iteration": 2.687814474105835 + }, + { + "auxiliary_loss_clip": 0.01109326, + "auxiliary_loss_mlp": 0.01028362, + "balance_loss_clip": 1.03850508, + "balance_loss_mlp": 1.01766288, + "epoch": 0.9640162332782204, + "flos": 22102389375840.0, + "grad_norm": 1.6975752694094426, + "language_loss": 0.65637088, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.67774773, + "num_input_tokens_seen": 345998280, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10699463, + "step": 16034, + "time_per_iteration": 4.042732000350952 + }, + { + "auxiliary_loss_clip": 0.01111197, + "auxiliary_loss_mlp": 0.0102743, + "balance_loss_clip": 1.03823102, + "balance_loss_mlp": 1.016469, + "epoch": 0.9640763565308883, + "flos": 28602135655200.0, + "grad_norm": 2.5104018133947577, + "language_loss": 0.74193513, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.7633214, + "num_input_tokens_seen": 346015545, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.10961914, + "step": 16035, + "time_per_iteration": 2.619570016860962 + }, + { + "auxiliary_loss_clip": 0.01112129, + "auxiliary_loss_mlp": 0.01027961, + "balance_loss_clip": 1.04057157, + "balance_loss_mlp": 1.01682663, + "epoch": 0.9641364797835563, + "flos": 27356245037280.0, + "grad_norm": 2.464947392647243, + "language_loss": 0.8230052, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.84440607, + "num_input_tokens_seen": 346034055, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11126709, + "step": 16036, + "time_per_iteration": 2.6294898986816406 + }, + { + "auxiliary_loss_clip": 0.0110782, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.03597522, + "balance_loss_mlp": 1.02050602, + "epoch": 0.9641966030362242, + "flos": 37366757637120.0, + "grad_norm": 1.8576708135860718, + "language_loss": 0.70116055, + "learning_rate": 1.340965177371789e-08, + "loss": 0.72255695, + "num_input_tokens_seen": 346054130, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11315918, + "step": 16037, + "time_per_iteration": 2.6605165004730225 + }, + { + "auxiliary_loss_clip": 0.01108436, + "auxiliary_loss_mlp": 0.01024876, + "balance_loss_clip": 1.03590798, + "balance_loss_mlp": 1.01399255, + "epoch": 0.9642567262888923, + "flos": 25567661141280.0, + "grad_norm": 2.4913429618020198, + "language_loss": 0.6296894, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.65102255, + "num_input_tokens_seen": 346072990, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10876465, + "step": 16038, + "time_per_iteration": 2.610072374343872 + }, + { + "auxiliary_loss_clip": 0.0111008, + "auxiliary_loss_mlp": 0.0102993, + "balance_loss_clip": 1.03722453, + "balance_loss_mlp": 1.01919508, + "epoch": 0.9643168495415602, + "flos": 27624674432160.0, + "grad_norm": 1.7849252569064984, + "language_loss": 0.71289492, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.73429501, + "num_input_tokens_seen": 346093745, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1072998, + "step": 16039, + "time_per_iteration": 2.6342859268188477 + }, + { + "auxiliary_loss_clip": 0.01111907, + "auxiliary_loss_mlp": 0.01027597, + "balance_loss_clip": 1.03860378, + "balance_loss_mlp": 1.01591432, + "epoch": 0.9643769727942282, + "flos": 24728912033760.0, + "grad_norm": 2.0139561080749835, + "language_loss": 0.73162472, + "learning_rate": 1.327491870605657e-08, + "loss": 0.75301981, + "num_input_tokens_seen": 346110115, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11688232, + "step": 16040, + "time_per_iteration": 2.6150028705596924 + }, + { + "auxiliary_loss_clip": 0.0111045, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.03695941, + "balance_loss_mlp": 1.01918745, + "epoch": 0.9644370960468961, + "flos": 16938104719680.0, + "grad_norm": 2.160042322624322, + "language_loss": 0.72961223, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.75102699, + "num_input_tokens_seen": 346127165, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.1184082, + "step": 16041, + "time_per_iteration": 2.5857176780700684 + }, + { + "auxiliary_loss_clip": 0.0110403, + "auxiliary_loss_mlp": 0.01027306, + "balance_loss_clip": 1.03631783, + "balance_loss_mlp": 1.0168221, + "epoch": 0.9644972192995641, + "flos": 21033817490880.0, + "grad_norm": 1.7819753869678023, + "language_loss": 0.7185756, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.73988891, + "num_input_tokens_seen": 346145950, + "router_z_loss_clip": 0.67724609, + "router_z_loss_mlp": 0.1048584, + "step": 16042, + "time_per_iteration": 2.6015625 + }, + { + "auxiliary_loss_clip": 0.01111887, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.03716612, + "balance_loss_mlp": 1.01814151, + "epoch": 0.964557342552232, + "flos": 29090420576640.0, + "grad_norm": 1.8173253361468018, + "language_loss": 0.81102544, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.83243501, + "num_input_tokens_seen": 346165005, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.10943604, + "step": 16043, + "time_per_iteration": 2.6064841747283936 + }, + { + "auxiliary_loss_clip": 0.01108092, + "auxiliary_loss_mlp": 0.01028358, + "balance_loss_clip": 1.0375247, + "balance_loss_mlp": 1.01794457, + "epoch": 0.9646174658049, + "flos": 26421239573280.0, + "grad_norm": 1.6657636950374257, + "language_loss": 0.71627969, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.7376442, + "num_input_tokens_seen": 346185095, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10418701, + "step": 16044, + "time_per_iteration": 4.0700953006744385 + }, + { + "auxiliary_loss_clip": 0.01106633, + "auxiliary_loss_mlp": 0.0102743, + "balance_loss_clip": 1.0364455, + "balance_loss_mlp": 1.01599205, + "epoch": 0.9646775890575681, + "flos": 20901182967360.0, + "grad_norm": 1.9241082146666573, + "language_loss": 0.70209026, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.72343087, + "num_input_tokens_seen": 346202580, + "router_z_loss_clip": 0.70117188, + "router_z_loss_mlp": 0.11437988, + "step": 16045, + "time_per_iteration": 2.6048624515533447 + }, + { + "auxiliary_loss_clip": 0.01110956, + "auxiliary_loss_mlp": 0.0103087, + "balance_loss_clip": 1.03778934, + "balance_loss_mlp": 1.01879418, + "epoch": 0.964737712310236, + "flos": 15869046627360.0, + "grad_norm": 2.9412655899867306, + "language_loss": 0.75099182, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.77241009, + "num_input_tokens_seen": 346219395, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.1206665, + "step": 16046, + "time_per_iteration": 3.9559080600738525 + }, + { + "auxiliary_loss_clip": 0.0111238, + "auxiliary_loss_mlp": 0.01033243, + "balance_loss_clip": 1.03785408, + "balance_loss_mlp": 1.02110767, + "epoch": 0.964797835562904, + "flos": 29625901778880.0, + "grad_norm": 1.8064642742377441, + "language_loss": 0.62525821, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.64671445, + "num_input_tokens_seen": 346239715, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.12127686, + "step": 16047, + "time_per_iteration": 2.7104930877685547 + }, + { + "auxiliary_loss_clip": 0.01110787, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.03948092, + "balance_loss_mlp": 1.02178848, + "epoch": 0.9648579588155719, + "flos": 25041742017120.0, + "grad_norm": 2.229623361095638, + "language_loss": 0.69658989, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.71802664, + "num_input_tokens_seen": 346258500, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11090088, + "step": 16048, + "time_per_iteration": 2.6731760501861572 + }, + { + "auxiliary_loss_clip": 0.01111554, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.03743804, + "balance_loss_mlp": 1.01715171, + "epoch": 0.9649180820682399, + "flos": 39236809082400.0, + "grad_norm": 1.8052533177807395, + "language_loss": 0.64058781, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.66199005, + "num_input_tokens_seen": 346279110, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11523438, + "step": 16049, + "time_per_iteration": 2.6822845935821533 + }, + { + "auxiliary_loss_clip": 0.01111692, + "auxiliary_loss_mlp": 0.0102415, + "balance_loss_clip": 1.03964865, + "balance_loss_mlp": 1.01305711, + "epoch": 0.9649782053209078, + "flos": 25041620465280.0, + "grad_norm": 1.6280006532404876, + "language_loss": 0.71244878, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.73380721, + "num_input_tokens_seen": 346297860, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11096191, + "step": 16050, + "time_per_iteration": 2.6480965614318848 + }, + { + "auxiliary_loss_clip": 0.01112282, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.0364027, + "balance_loss_mlp": 1.01968741, + "epoch": 0.9650383285735759, + "flos": 52553466973440.0, + "grad_norm": 2.1563074261454473, + "language_loss": 0.6980747, + "learning_rate": 1.278669873970606e-08, + "loss": 0.71952295, + "num_input_tokens_seen": 346319860, + "router_z_loss_clip": 0.75830078, + "router_z_loss_mlp": 0.12854004, + "step": 16051, + "time_per_iteration": 2.773390293121338 + }, + { + "auxiliary_loss_clip": 0.01026947, + "auxiliary_loss_mlp": 0.01002722, + "balance_loss_clip": 1.00463295, + "balance_loss_mlp": 1.0018779, + "epoch": 0.9650984518262438, + "flos": 75345284164320.0, + "grad_norm": 0.8390064646258434, + "language_loss": 0.59095585, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61125255, + "num_input_tokens_seen": 346379025, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.00845337, + "step": 16052, + "time_per_iteration": 3.3141965866088867 + }, + { + "auxiliary_loss_clip": 0.01105126, + "auxiliary_loss_mlp": 0.01024592, + "balance_loss_clip": 1.03554595, + "balance_loss_mlp": 1.01311815, + "epoch": 0.9651585750789118, + "flos": 36349676864640.0, + "grad_norm": 1.68387855884497, + "language_loss": 0.74692935, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.7682265, + "num_input_tokens_seen": 346402250, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.11480713, + "step": 16053, + "time_per_iteration": 2.687751293182373 + }, + { + "auxiliary_loss_clip": 0.01110995, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.03772593, + "balance_loss_mlp": 1.01780987, + "epoch": 0.9652186983315797, + "flos": 20588271949440.0, + "grad_norm": 3.035355284290909, + "language_loss": 0.68448806, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.70588356, + "num_input_tokens_seen": 346419555, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.10742188, + "step": 16054, + "time_per_iteration": 2.6617681980133057 + }, + { + "auxiliary_loss_clip": 0.01108544, + "auxiliary_loss_mlp": 0.01031772, + "balance_loss_clip": 1.0382756, + "balance_loss_mlp": 1.02144229, + "epoch": 0.9652788215842477, + "flos": 38619941365440.0, + "grad_norm": 4.612821745832562, + "language_loss": 0.62360799, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.64501119, + "num_input_tokens_seen": 346441245, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10333252, + "step": 16055, + "time_per_iteration": 2.7353334426879883 + }, + { + "auxiliary_loss_clip": 0.01105919, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.03649116, + "balance_loss_mlp": 1.02020288, + "epoch": 0.9653389448369156, + "flos": 30205499948640.0, + "grad_norm": 1.8340763936352913, + "language_loss": 0.76401007, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.78538281, + "num_input_tokens_seen": 346460065, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.11151123, + "step": 16056, + "time_per_iteration": 2.641511917114258 + }, + { + "auxiliary_loss_clip": 0.01109778, + "auxiliary_loss_mlp": 0.01027612, + "balance_loss_clip": 1.0361315, + "balance_loss_mlp": 1.01716876, + "epoch": 0.9653990680895836, + "flos": 24767235030240.0, + "grad_norm": 1.7653182611979192, + "language_loss": 0.71721101, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.73858494, + "num_input_tokens_seen": 346478005, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10437012, + "step": 16057, + "time_per_iteration": 3.8685526847839355 + }, + { + "auxiliary_loss_clip": 0.0110589, + "auxiliary_loss_mlp": 0.01028503, + "balance_loss_clip": 1.03550875, + "balance_loss_mlp": 1.01821518, + "epoch": 0.9654591913422517, + "flos": 27489770940960.0, + "grad_norm": 2.0198908773658784, + "language_loss": 0.71815413, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.73949808, + "num_input_tokens_seen": 346497575, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10284424, + "step": 16058, + "time_per_iteration": 2.5896782875061035 + }, + { + "auxiliary_loss_clip": 0.0110786, + "auxiliary_loss_mlp": 0.01030456, + "balance_loss_clip": 1.03702021, + "balance_loss_mlp": 1.01968527, + "epoch": 0.9655193145949196, + "flos": 32656527151200.0, + "grad_norm": 1.3424653382576026, + "language_loss": 0.74060082, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.76198393, + "num_input_tokens_seen": 346520000, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10772705, + "step": 16059, + "time_per_iteration": 2.66215181350708 + }, + { + "auxiliary_loss_clip": 0.01113161, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.03758049, + "balance_loss_mlp": 1.02112341, + "epoch": 0.9655794378475876, + "flos": 51214763967840.0, + "grad_norm": 2.216789387527933, + "language_loss": 0.73768485, + "learning_rate": 1.239402791721722e-08, + "loss": 0.7591356, + "num_input_tokens_seen": 346541605, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.10784912, + "step": 16060, + "time_per_iteration": 2.7620675563812256 + }, + { + "auxiliary_loss_clip": 0.0110665, + "auxiliary_loss_mlp": 0.01027829, + "balance_loss_clip": 1.03820777, + "balance_loss_mlp": 1.01776814, + "epoch": 0.9656395611002555, + "flos": 33811631245440.0, + "grad_norm": 1.57589037293368, + "language_loss": 0.76794112, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.7892859, + "num_input_tokens_seen": 346560955, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.10064697, + "step": 16061, + "time_per_iteration": 2.66353440284729 + }, + { + "auxiliary_loss_clip": 0.010268, + "auxiliary_loss_mlp": 0.01001793, + "balance_loss_clip": 1.00454354, + "balance_loss_mlp": 1.0009172, + "epoch": 0.9656996843529235, + "flos": 84156413633280.0, + "grad_norm": 0.7226473802365583, + "language_loss": 0.64185131, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.66213727, + "num_input_tokens_seen": 346621615, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 0.00876617, + "step": 16062, + "time_per_iteration": 3.27329421043396 + }, + { + "auxiliary_loss_clip": 0.01103553, + "auxiliary_loss_mlp": 0.01026973, + "balance_loss_clip": 1.03542805, + "balance_loss_mlp": 1.01678646, + "epoch": 0.9657598076055914, + "flos": 25174376540640.0, + "grad_norm": 2.6470588696176103, + "language_loss": 0.93187487, + "learning_rate": 1.226449424760867e-08, + "loss": 0.95318007, + "num_input_tokens_seen": 346637460, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.10186768, + "step": 16063, + "time_per_iteration": 2.5831398963928223 + }, + { + "auxiliary_loss_clip": 0.01111073, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.03860307, + "balance_loss_mlp": 1.02048409, + "epoch": 0.9658199308582595, + "flos": 24951401183520.0, + "grad_norm": 2.174209455097874, + "language_loss": 0.82190514, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.84333414, + "num_input_tokens_seen": 346655625, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11340332, + "step": 16064, + "time_per_iteration": 2.614738702774048 + }, + { + "auxiliary_loss_clip": 0.01110904, + "auxiliary_loss_mlp": 0.01024639, + "balance_loss_clip": 1.04032576, + "balance_loss_mlp": 1.01456571, + "epoch": 0.9658800541109274, + "flos": 30161261429280.0, + "grad_norm": 1.795173072539539, + "language_loss": 0.84408081, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.86543626, + "num_input_tokens_seen": 346675220, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10070801, + "step": 16065, + "time_per_iteration": 2.648796319961548 + }, + { + "auxiliary_loss_clip": 0.01107828, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.03669405, + "balance_loss_mlp": 1.01770389, + "epoch": 0.9659401773635954, + "flos": 26369262253440.0, + "grad_norm": 2.5923974021296265, + "language_loss": 0.67621756, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.6975854, + "num_input_tokens_seen": 346694710, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11260986, + "step": 16066, + "time_per_iteration": 2.6303021907806396 + }, + { + "auxiliary_loss_clip": 0.01107141, + "auxiliary_loss_mlp": 0.01026711, + "balance_loss_clip": 1.03580809, + "balance_loss_mlp": 1.0158627, + "epoch": 0.9660003006162633, + "flos": 24771691931040.0, + "grad_norm": 2.3305096560601735, + "language_loss": 0.8201198, + "learning_rate": 1.209283794752558e-08, + "loss": 0.84145832, + "num_input_tokens_seen": 346712645, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10839844, + "step": 16067, + "time_per_iteration": 2.603393077850342 + }, + { + "auxiliary_loss_clip": 0.01107874, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.03695273, + "balance_loss_mlp": 1.01582134, + "epoch": 0.9660604238689313, + "flos": 29848836618720.0, + "grad_norm": 3.154890164304174, + "language_loss": 0.69366461, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.71501476, + "num_input_tokens_seen": 346732375, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.11309814, + "step": 16068, + "time_per_iteration": 2.662778377532959 + }, + { + "auxiliary_loss_clip": 0.01103167, + "auxiliary_loss_mlp": 0.01027823, + "balance_loss_clip": 1.03588271, + "balance_loss_mlp": 1.01826847, + "epoch": 0.9661205471215992, + "flos": 24238155558240.0, + "grad_norm": 1.8063758208729288, + "language_loss": 0.67804289, + "learning_rate": 1.20074620808146e-08, + "loss": 0.69935274, + "num_input_tokens_seen": 346750430, + "router_z_loss_clip": 0.67236328, + "router_z_loss_mlp": 0.09558105, + "step": 16069, + "time_per_iteration": 2.610358238220215 + }, + { + "auxiliary_loss_clip": 0.01112431, + "auxiliary_loss_mlp": 0.0102712, + "balance_loss_clip": 1.04026282, + "balance_loss_mlp": 1.01633167, + "epoch": 0.9661806703742672, + "flos": 25084805535360.0, + "grad_norm": 2.082648042396889, + "language_loss": 0.88609964, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.90749514, + "num_input_tokens_seen": 346768455, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10797119, + "step": 16070, + "time_per_iteration": 2.6653268337249756 + }, + { + "auxiliary_loss_clip": 0.01112612, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.0404644, + "balance_loss_mlp": 1.02039623, + "epoch": 0.9662407936269353, + "flos": 26150419658880.0, + "grad_norm": 1.838618475079558, + "language_loss": 0.77298856, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.79443359, + "num_input_tokens_seen": 346786530, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.1149292, + "step": 16071, + "time_per_iteration": 2.6472973823547363 + }, + { + "auxiliary_loss_clip": 0.01107424, + "auxiliary_loss_mlp": 0.01028447, + "balance_loss_clip": 1.0367105, + "balance_loss_mlp": 1.01651454, + "epoch": 0.9663009168796032, + "flos": 18184116889440.0, + "grad_norm": 1.9618939842303198, + "language_loss": 0.6628257, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.68418443, + "num_input_tokens_seen": 346804635, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11932373, + "step": 16072, + "time_per_iteration": 2.6520371437072754 + }, + { + "auxiliary_loss_clip": 0.011112, + "auxiliary_loss_mlp": 0.01028479, + "balance_loss_clip": 1.03822434, + "balance_loss_mlp": 1.01732755, + "epoch": 0.9663610401322712, + "flos": 29664954086400.0, + "grad_norm": 1.782429770695296, + "language_loss": 0.7766729, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.79806972, + "num_input_tokens_seen": 346823070, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11151123, + "step": 16073, + "time_per_iteration": 4.096599578857422 + }, + { + "auxiliary_loss_clip": 0.01113322, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.03902006, + "balance_loss_mlp": 1.01928759, + "epoch": 0.9664211633849391, + "flos": 21521656722240.0, + "grad_norm": 3.0247236182677897, + "language_loss": 0.75958669, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.78102577, + "num_input_tokens_seen": 346841180, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11309814, + "step": 16074, + "time_per_iteration": 2.5793979167938232 + }, + { + "auxiliary_loss_clip": 0.01111693, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.03850114, + "balance_loss_mlp": 1.01710057, + "epoch": 0.9664812866376071, + "flos": 36349757899200.0, + "grad_norm": 1.8557683545288444, + "language_loss": 0.75558913, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.77698797, + "num_input_tokens_seen": 346864250, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11102295, + "step": 16075, + "time_per_iteration": 2.7009456157684326 + }, + { + "auxiliary_loss_clip": 0.01110329, + "auxiliary_loss_mlp": 0.01036457, + "balance_loss_clip": 1.03861451, + "balance_loss_mlp": 1.02523339, + "epoch": 0.966541409890275, + "flos": 17426875848480.0, + "grad_norm": 2.542352926058014, + "language_loss": 0.78809136, + "learning_rate": 1.171102125547696e-08, + "loss": 0.80955917, + "num_input_tokens_seen": 346881955, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11236572, + "step": 16076, + "time_per_iteration": 2.601236581802368 + }, + { + "auxiliary_loss_clip": 0.01113169, + "auxiliary_loss_mlp": 0.01038647, + "balance_loss_clip": 1.03999662, + "balance_loss_mlp": 1.02686894, + "epoch": 0.9666015331429431, + "flos": 24234103830240.0, + "grad_norm": 1.9965934397436411, + "language_loss": 0.72247499, + "learning_rate": 1.166897413780532e-08, + "loss": 0.74399316, + "num_input_tokens_seen": 346900445, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11791992, + "step": 16077, + "time_per_iteration": 2.6185622215270996 + }, + { + "auxiliary_loss_clip": 0.01109134, + "auxiliary_loss_mlp": 0.01033751, + "balance_loss_clip": 1.03687239, + "balance_loss_mlp": 1.02189565, + "epoch": 0.966661656395611, + "flos": 33099601138560.0, + "grad_norm": 2.171269003286751, + "language_loss": 0.59583294, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.61726183, + "num_input_tokens_seen": 346920135, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11853027, + "step": 16078, + "time_per_iteration": 2.6384894847869873 + }, + { + "auxiliary_loss_clip": 0.01112747, + "auxiliary_loss_mlp": 0.01028327, + "balance_loss_clip": 1.03784823, + "balance_loss_mlp": 1.0162456, + "epoch": 0.966721779648279, + "flos": 26244326013120.0, + "grad_norm": 1.9064913858116548, + "language_loss": 0.71779168, + "learning_rate": 1.158510609718899e-08, + "loss": 0.73920238, + "num_input_tokens_seen": 346940450, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.12084961, + "step": 16079, + "time_per_iteration": 2.6383864879608154 + }, + { + "auxiliary_loss_clip": 0.01105927, + "auxiliary_loss_mlp": 0.01026674, + "balance_loss_clip": 1.03660154, + "balance_loss_mlp": 1.01638603, + "epoch": 0.9667819029009469, + "flos": 29136401339040.0, + "grad_norm": 1.7771220274439952, + "language_loss": 0.72092676, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.74225283, + "num_input_tokens_seen": 346960935, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.10290527, + "step": 16080, + "time_per_iteration": 2.6307015419006348 + }, + { + "auxiliary_loss_clip": 0.01110658, + "auxiliary_loss_mlp": 0.01024145, + "balance_loss_clip": 1.03874779, + "balance_loss_mlp": 1.01376152, + "epoch": 0.9668420261536149, + "flos": 26243191529280.0, + "grad_norm": 2.2004056469179667, + "language_loss": 0.74138814, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.7627362, + "num_input_tokens_seen": 346980100, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.1038208, + "step": 16081, + "time_per_iteration": 2.6303634643554688 + }, + { + "auxiliary_loss_clip": 0.01108833, + "auxiliary_loss_mlp": 0.01025113, + "balance_loss_clip": 1.0363462, + "balance_loss_mlp": 1.01415801, + "epoch": 0.9669021494062828, + "flos": 32564363040000.0, + "grad_norm": 3.455811440790516, + "language_loss": 0.6706444, + "learning_rate": 1.145986954691236e-08, + "loss": 0.69198382, + "num_input_tokens_seen": 347001250, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10961914, + "step": 16082, + "time_per_iteration": 2.654635429382324 + }, + { + "auxiliary_loss_clip": 0.01109, + "auxiliary_loss_mlp": 0.01029066, + "balance_loss_clip": 1.03769517, + "balance_loss_mlp": 1.01809907, + "epoch": 0.9669622726589508, + "flos": 36393307624800.0, + "grad_norm": 3.983846882321529, + "language_loss": 0.76755178, + "learning_rate": 1.141827483932789e-08, + "loss": 0.78893244, + "num_input_tokens_seen": 347022975, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10968018, + "step": 16083, + "time_per_iteration": 2.665890693664551 + }, + { + "auxiliary_loss_clip": 0.01110956, + "auxiliary_loss_mlp": 0.01029816, + "balance_loss_clip": 1.03812146, + "balance_loss_mlp": 1.01870024, + "epoch": 0.9670223959116189, + "flos": 27970924821120.0, + "grad_norm": 2.6856659617483336, + "language_loss": 0.79382861, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.81523633, + "num_input_tokens_seen": 347038780, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11108398, + "step": 16084, + "time_per_iteration": 4.103554964065552 + }, + { + "auxiliary_loss_clip": 0.01111543, + "auxiliary_loss_mlp": 0.01029432, + "balance_loss_clip": 1.03641665, + "balance_loss_mlp": 1.01837492, + "epoch": 0.9670825191642868, + "flos": 22725496753920.0, + "grad_norm": 2.6025092389198723, + "language_loss": 0.67830038, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.69971013, + "num_input_tokens_seen": 347056705, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.11047363, + "step": 16085, + "time_per_iteration": 3.869868755340576 + }, + { + "auxiliary_loss_clip": 0.01114525, + "auxiliary_loss_mlp": 0.01029405, + "balance_loss_clip": 1.03835773, + "balance_loss_mlp": 1.0168345, + "epoch": 0.9671426424169548, + "flos": 29892791517120.0, + "grad_norm": 2.5430769042798076, + "language_loss": 0.68794709, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.70938647, + "num_input_tokens_seen": 347075710, + "router_z_loss_clip": 0.76074219, + "router_z_loss_mlp": 0.12573242, + "step": 16086, + "time_per_iteration": 2.647989273071289 + }, + { + "auxiliary_loss_clip": 0.01108825, + "auxiliary_loss_mlp": 0.01030181, + "balance_loss_clip": 1.03793263, + "balance_loss_mlp": 1.0190351, + "epoch": 0.9672027656696227, + "flos": 24857373277440.0, + "grad_norm": 1.5318661932073918, + "language_loss": 0.78801668, + "learning_rate": 1.125265009690235e-08, + "loss": 0.80940682, + "num_input_tokens_seen": 347092325, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.1114502, + "step": 16087, + "time_per_iteration": 2.6489150524139404 + }, + { + "auxiliary_loss_clip": 0.01108228, + "auxiliary_loss_mlp": 0.01023183, + "balance_loss_clip": 1.03672767, + "balance_loss_mlp": 1.01262665, + "epoch": 0.9672628889222907, + "flos": 23037475874400.0, + "grad_norm": 2.422650503312592, + "language_loss": 0.71352267, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.7348367, + "num_input_tokens_seen": 347110595, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.10559082, + "step": 16088, + "time_per_iteration": 2.570990562438965 + }, + { + "auxiliary_loss_clip": 0.01107077, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.03780711, + "balance_loss_mlp": 1.0185504, + "epoch": 0.9673230121749586, + "flos": 35014539379680.0, + "grad_norm": 1.907775550907385, + "language_loss": 0.70637393, + "learning_rate": 1.117029020040916e-08, + "loss": 0.72773463, + "num_input_tokens_seen": 347131625, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.10449219, + "step": 16089, + "time_per_iteration": 2.6804869174957275 + }, + { + "auxiliary_loss_clip": 0.01111491, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.03829265, + "balance_loss_mlp": 1.02029037, + "epoch": 0.9673831354276267, + "flos": 24995234530080.0, + "grad_norm": 3.4139646489080766, + "language_loss": 0.74943167, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.77086091, + "num_input_tokens_seen": 347147910, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11151123, + "step": 16090, + "time_per_iteration": 2.5694172382354736 + }, + { + "auxiliary_loss_clip": 0.01112866, + "auxiliary_loss_mlp": 0.01030463, + "balance_loss_clip": 1.03710186, + "balance_loss_mlp": 1.0191201, + "epoch": 0.9674432586802946, + "flos": 32698861358400.0, + "grad_norm": 2.139462065765716, + "language_loss": 0.69237447, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.71380776, + "num_input_tokens_seen": 347168805, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11328125, + "step": 16091, + "time_per_iteration": 2.6535017490386963 + }, + { + "auxiliary_loss_clip": 0.01106707, + "auxiliary_loss_mlp": 0.01029229, + "balance_loss_clip": 1.03596723, + "balance_loss_mlp": 1.01779652, + "epoch": 0.9675033819329626, + "flos": 27223853617440.0, + "grad_norm": 1.8143103989961058, + "language_loss": 0.76758653, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.78894591, + "num_input_tokens_seen": 347189455, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11437988, + "step": 16092, + "time_per_iteration": 2.6128487586975098 + }, + { + "auxiliary_loss_clip": 0.01110001, + "auxiliary_loss_mlp": 0.01025146, + "balance_loss_clip": 1.03920865, + "balance_loss_mlp": 1.01467299, + "epoch": 0.9675635051856305, + "flos": 15466483569600.0, + "grad_norm": 2.364766278338106, + "language_loss": 0.75738263, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.77873409, + "num_input_tokens_seen": 347206030, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10473633, + "step": 16093, + "time_per_iteration": 2.5919861793518066 + }, + { + "auxiliary_loss_clip": 0.01107702, + "auxiliary_loss_mlp": 0.01026241, + "balance_loss_clip": 1.03673828, + "balance_loss_mlp": 1.01407003, + "epoch": 0.9676236284382985, + "flos": 30027573456480.0, + "grad_norm": 1.6019035984008834, + "language_loss": 0.69263476, + "learning_rate": 1.096571027726112e-08, + "loss": 0.71397412, + "num_input_tokens_seen": 347226250, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.1217041, + "step": 16094, + "time_per_iteration": 2.72727370262146 + }, + { + "auxiliary_loss_clip": 0.0111257, + "auxiliary_loss_mlp": 0.01027376, + "balance_loss_clip": 1.03774357, + "balance_loss_mlp": 1.01629591, + "epoch": 0.9676837516909664, + "flos": 28512888788160.0, + "grad_norm": 1.839925682532095, + "language_loss": 0.75715518, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.77855468, + "num_input_tokens_seen": 347247350, + "router_z_loss_clip": 0.74853516, + "router_z_loss_mlp": 0.11077881, + "step": 16095, + "time_per_iteration": 2.701137065887451 + }, + { + "auxiliary_loss_clip": 0.01114677, + "auxiliary_loss_mlp": 0.01031238, + "balance_loss_clip": 1.03889358, + "balance_loss_mlp": 1.02006841, + "epoch": 0.9677438749436345, + "flos": 24997989705120.0, + "grad_norm": 2.182907888270333, + "language_loss": 0.70381379, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.72527289, + "num_input_tokens_seen": 347266870, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11169434, + "step": 16096, + "time_per_iteration": 2.6108429431915283 + }, + { + "auxiliary_loss_clip": 0.01111019, + "auxiliary_loss_mlp": 0.0102646, + "balance_loss_clip": 1.03813171, + "balance_loss_mlp": 1.0150938, + "epoch": 0.9678039981963025, + "flos": 58029973853760.0, + "grad_norm": 1.9017065826454451, + "language_loss": 0.71604019, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.73741496, + "num_input_tokens_seen": 347290120, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11376953, + "step": 16097, + "time_per_iteration": 4.122883081436157 + }, + { + "auxiliary_loss_clip": 0.01108777, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.03717184, + "balance_loss_mlp": 1.01909614, + "epoch": 0.9678641214489704, + "flos": 30555275340960.0, + "grad_norm": 1.908663552643407, + "language_loss": 0.77908981, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.80047858, + "num_input_tokens_seen": 347308785, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.11004639, + "step": 16098, + "time_per_iteration": 2.6849379539489746 + }, + { + "auxiliary_loss_clip": 0.01107318, + "auxiliary_loss_mlp": 0.0102662, + "balance_loss_clip": 1.03692937, + "balance_loss_mlp": 1.01627302, + "epoch": 0.9679242447016384, + "flos": 23478078307680.0, + "grad_norm": 2.0018193818901318, + "language_loss": 0.90773749, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.92907691, + "num_input_tokens_seen": 347326375, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10345459, + "step": 16099, + "time_per_iteration": 2.6167757511138916 + }, + { + "auxiliary_loss_clip": 0.01110236, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.03637433, + "balance_loss_mlp": 1.01781321, + "epoch": 0.9679843679543063, + "flos": 40578429332160.0, + "grad_norm": 1.623314437347113, + "language_loss": 0.66144389, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.68284106, + "num_input_tokens_seen": 347348250, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11663818, + "step": 16100, + "time_per_iteration": 2.717280149459839 + }, + { + "auxiliary_loss_clip": 0.01110459, + "auxiliary_loss_mlp": 0.01028358, + "balance_loss_clip": 1.0382818, + "balance_loss_mlp": 1.01744461, + "epoch": 0.9680444912069743, + "flos": 27800818164000.0, + "grad_norm": 1.584135772399779, + "language_loss": 0.73375076, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.75513899, + "num_input_tokens_seen": 347367400, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10919189, + "step": 16101, + "time_per_iteration": 2.619624137878418 + }, + { + "auxiliary_loss_clip": 0.01109919, + "auxiliary_loss_mlp": 0.01027236, + "balance_loss_clip": 1.03783584, + "balance_loss_mlp": 1.01607776, + "epoch": 0.9681046144596422, + "flos": 29314530417600.0, + "grad_norm": 1.646153884575047, + "language_loss": 0.73088294, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.75225449, + "num_input_tokens_seen": 347387600, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11157227, + "step": 16102, + "time_per_iteration": 2.639160633087158 + }, + { + "auxiliary_loss_clip": 0.01112967, + "auxiliary_loss_mlp": 0.01033502, + "balance_loss_clip": 1.03965235, + "balance_loss_mlp": 1.02175355, + "epoch": 0.9681647377123103, + "flos": 28602946000800.0, + "grad_norm": 1.8592274237536714, + "language_loss": 0.77109247, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.79255718, + "num_input_tokens_seen": 347406915, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11743164, + "step": 16103, + "time_per_iteration": 2.6208386421203613 + }, + { + "auxiliary_loss_clip": 0.01108613, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.03724003, + "balance_loss_mlp": 1.01862264, + "epoch": 0.9682248609649782, + "flos": 27668264675040.0, + "grad_norm": 1.5753174965141303, + "language_loss": 0.80501175, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.82638919, + "num_input_tokens_seen": 347425140, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10510254, + "step": 16104, + "time_per_iteration": 2.622962474822998 + }, + { + "auxiliary_loss_clip": 0.01104562, + "auxiliary_loss_mlp": 0.0103164, + "balance_loss_clip": 1.03458321, + "balance_loss_mlp": 1.02181721, + "epoch": 0.9682849842176462, + "flos": 29807636895360.0, + "grad_norm": 1.5945934241030173, + "language_loss": 0.77588177, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.79724377, + "num_input_tokens_seen": 347446350, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.09820557, + "step": 16105, + "time_per_iteration": 2.6406571865081787 + }, + { + "auxiliary_loss_clip": 0.0102705, + "auxiliary_loss_mlp": 0.01001384, + "balance_loss_clip": 1.00470948, + "balance_loss_mlp": 1.0004797, + "epoch": 0.9683451074703141, + "flos": 73206155047680.0, + "grad_norm": 0.8220923040862632, + "language_loss": 0.56697053, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58725488, + "num_input_tokens_seen": 347510135, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.0090332, + "step": 16106, + "time_per_iteration": 3.2952916622161865 + }, + { + "auxiliary_loss_clip": 0.01027136, + "auxiliary_loss_mlp": 0.01001078, + "balance_loss_clip": 1.00483179, + "balance_loss_mlp": 1.00013566, + "epoch": 0.9684052307229821, + "flos": 64299539050560.0, + "grad_norm": 0.8768840136380216, + "language_loss": 0.61574435, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63602656, + "num_input_tokens_seen": 347562505, + "router_z_loss_clip": 0.22290039, + "router_z_loss_mlp": 0.00941467, + "step": 16107, + "time_per_iteration": 3.1245503425598145 + }, + { + "auxiliary_loss_clip": 0.01111281, + "auxiliary_loss_mlp": 0.0103814, + "balance_loss_clip": 1.03914785, + "balance_loss_mlp": 1.02587914, + "epoch": 0.96846535397565, + "flos": 27801061267680.0, + "grad_norm": 2.9773889556356155, + "language_loss": 0.73991311, + "learning_rate": 1.040291854638875e-08, + "loss": 0.76140732, + "num_input_tokens_seen": 347579150, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.12261963, + "step": 16108, + "time_per_iteration": 2.6221981048583984 + }, + { + "auxiliary_loss_clip": 0.01110356, + "auxiliary_loss_mlp": 0.01024709, + "balance_loss_clip": 1.03731263, + "balance_loss_mlp": 1.01345587, + "epoch": 0.968525477228318, + "flos": 28460830433760.0, + "grad_norm": 4.583660042973393, + "language_loss": 0.57166457, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.59301519, + "num_input_tokens_seen": 347596705, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11248779, + "step": 16109, + "time_per_iteration": 2.6952223777770996 + }, + { + "auxiliary_loss_clip": 0.01027331, + "auxiliary_loss_mlp": 0.01000644, + "balance_loss_clip": 1.0049988, + "balance_loss_mlp": 0.99969167, + "epoch": 0.9685856004809861, + "flos": 82830838226400.0, + "grad_norm": 0.6719666408581616, + "language_loss": 0.54280806, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.56308782, + "num_input_tokens_seen": 347661870, + "router_z_loss_clip": 0.22338867, + "router_z_loss_mlp": 0.00951385, + "step": 16110, + "time_per_iteration": 3.230988025665283 + }, + { + "auxiliary_loss_clip": 0.0111545, + "auxiliary_loss_mlp": 0.01038975, + "balance_loss_clip": 1.04042804, + "balance_loss_mlp": 1.0263567, + "epoch": 0.968645723733654, + "flos": 41424349998240.0, + "grad_norm": 1.4812012384943958, + "language_loss": 0.62590981, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.64745414, + "num_input_tokens_seen": 347684295, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12615967, + "step": 16111, + "time_per_iteration": 2.733618974685669 + }, + { + "auxiliary_loss_clip": 0.01107582, + "auxiliary_loss_mlp": 0.01026363, + "balance_loss_clip": 1.03672624, + "balance_loss_mlp": 1.01631379, + "epoch": 0.968705846986322, + "flos": 22636411956000.0, + "grad_norm": 2.2428889283158453, + "language_loss": 0.74555457, + "learning_rate": 1.024483677309118e-08, + "loss": 0.76689398, + "num_input_tokens_seen": 347702585, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10040283, + "step": 16112, + "time_per_iteration": 4.107001066207886 + }, + { + "auxiliary_loss_clip": 0.01105763, + "auxiliary_loss_mlp": 0.01024864, + "balance_loss_clip": 1.03575349, + "balance_loss_mlp": 1.01454067, + "epoch": 0.9687659702389899, + "flos": 21256752330720.0, + "grad_norm": 2.0683808712544614, + "language_loss": 0.66262829, + "learning_rate": 1.020550495531558e-08, + "loss": 0.68393463, + "num_input_tokens_seen": 347721810, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10327148, + "step": 16113, + "time_per_iteration": 2.582921266555786 + }, + { + "auxiliary_loss_clip": 0.01027152, + "auxiliary_loss_mlp": 0.01001121, + "balance_loss_clip": 1.00486183, + "balance_loss_mlp": 1.00016451, + "epoch": 0.9688260934916579, + "flos": 75709402673760.0, + "grad_norm": 0.6906648142494516, + "language_loss": 0.56532216, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.58560491, + "num_input_tokens_seen": 347782330, + "router_z_loss_clip": 0.22277832, + "router_z_loss_mlp": 0.009552, + "step": 16114, + "time_per_iteration": 3.2638297080993652 + }, + { + "auxiliary_loss_clip": 0.01109639, + "auxiliary_loss_mlp": 0.01033687, + "balance_loss_clip": 1.03804231, + "balance_loss_mlp": 1.02248144, + "epoch": 0.9688862167443258, + "flos": 18396800857440.0, + "grad_norm": 3.6390902921838495, + "language_loss": 0.82501352, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.84644675, + "num_input_tokens_seen": 347794835, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11199951, + "step": 16115, + "time_per_iteration": 2.640894889831543 + }, + { + "auxiliary_loss_clip": 0.01103923, + "auxiliary_loss_mlp": 0.01026841, + "balance_loss_clip": 1.03648901, + "balance_loss_mlp": 1.01683378, + "epoch": 0.9689463399969939, + "flos": 24328577426400.0, + "grad_norm": 2.2004912460491157, + "language_loss": 0.72088832, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.74219596, + "num_input_tokens_seen": 347814320, + "router_z_loss_clip": 0.67431641, + "router_z_loss_mlp": 0.10003662, + "step": 16116, + "time_per_iteration": 2.630166530609131 + }, + { + "auxiliary_loss_clip": 0.01112725, + "auxiliary_loss_mlp": 0.010315, + "balance_loss_clip": 1.03934431, + "balance_loss_mlp": 1.02037811, + "epoch": 0.9690064632496618, + "flos": 23882991367680.0, + "grad_norm": 2.0776814472063285, + "language_loss": 0.76261717, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.78405941, + "num_input_tokens_seen": 347832125, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11120605, + "step": 16117, + "time_per_iteration": 2.6459245681762695 + }, + { + "auxiliary_loss_clip": 0.01110633, + "auxiliary_loss_mlp": 0.01028625, + "balance_loss_clip": 1.03695643, + "balance_loss_mlp": 1.01708591, + "epoch": 0.9690665865023298, + "flos": 26688656036160.0, + "grad_norm": 1.9322909537786563, + "language_loss": 0.77340901, + "learning_rate": 1.000997769426548e-08, + "loss": 0.79480159, + "num_input_tokens_seen": 347850765, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11547852, + "step": 16118, + "time_per_iteration": 2.597491502761841 + }, + { + "auxiliary_loss_clip": 0.01113856, + "auxiliary_loss_mlp": 0.01030953, + "balance_loss_clip": 1.0409559, + "balance_loss_mlp": 1.01980686, + "epoch": 0.9691267097549977, + "flos": 25617693631680.0, + "grad_norm": 1.775477289438646, + "language_loss": 0.77930695, + "learning_rate": 9.971098618001272e-09, + "loss": 0.80075496, + "num_input_tokens_seen": 347870125, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.1114502, + "step": 16119, + "time_per_iteration": 2.6988525390625 + }, + { + "auxiliary_loss_clip": 0.01105959, + "auxiliary_loss_mlp": 0.010246, + "balance_loss_clip": 1.03685749, + "balance_loss_mlp": 1.01456249, + "epoch": 0.9691868330076657, + "flos": 29625780227040.0, + "grad_norm": 1.6340354342955024, + "language_loss": 0.75437462, + "learning_rate": 9.932295003832747e-09, + "loss": 0.77568018, + "num_input_tokens_seen": 347890615, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.1003418, + "step": 16120, + "time_per_iteration": 2.65937876701355 + }, + { + "auxiliary_loss_clip": 0.01108433, + "auxiliary_loss_mlp": 0.01028491, + "balance_loss_clip": 1.03702974, + "balance_loss_mlp": 1.01784515, + "epoch": 0.9692469562603336, + "flos": 21567718519200.0, + "grad_norm": 2.141824567427206, + "language_loss": 0.69422901, + "learning_rate": 9.89356685323095e-09, + "loss": 0.71559823, + "num_input_tokens_seen": 347908685, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10638428, + "step": 16121, + "time_per_iteration": 2.599937915802002 + }, + { + "auxiliary_loss_clip": 0.01106787, + "auxiliary_loss_mlp": 0.01028487, + "balance_loss_clip": 1.03600156, + "balance_loss_mlp": 1.01715016, + "epoch": 0.9693070795130017, + "flos": 32743788671520.0, + "grad_norm": 1.790101372101031, + "language_loss": 0.6905148, + "learning_rate": 9.854914167664486e-09, + "loss": 0.71186751, + "num_input_tokens_seen": 347926385, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11346436, + "step": 16122, + "time_per_iteration": 2.640317678451538 + }, + { + "auxiliary_loss_clip": 0.01107572, + "auxiliary_loss_mlp": 0.01026158, + "balance_loss_clip": 1.03596449, + "balance_loss_mlp": 1.01608443, + "epoch": 0.9693672027656697, + "flos": 22057745683680.0, + "grad_norm": 1.9635951583745648, + "language_loss": 0.75782549, + "learning_rate": 9.81633694859907e-09, + "loss": 0.77916276, + "num_input_tokens_seen": 347945290, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10076904, + "step": 16123, + "time_per_iteration": 4.100126266479492 + }, + { + "auxiliary_loss_clip": 0.01108386, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.0354948, + "balance_loss_mlp": 1.0176537, + "epoch": 0.9694273260183376, + "flos": 26555292201600.0, + "grad_norm": 1.786698888326262, + "language_loss": 0.74639094, + "learning_rate": 9.777835197497753e-09, + "loss": 0.76776862, + "num_input_tokens_seen": 347966330, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11743164, + "step": 16124, + "time_per_iteration": 2.6617348194122314 + }, + { + "auxiliary_loss_clip": 0.01109622, + "auxiliary_loss_mlp": 0.01034054, + "balance_loss_clip": 1.03664887, + "balance_loss_mlp": 1.02324164, + "epoch": 0.9694874492710056, + "flos": 29805043789440.0, + "grad_norm": 2.139879984175954, + "language_loss": 0.74286807, + "learning_rate": 9.739408915820258e-09, + "loss": 0.76430488, + "num_input_tokens_seen": 347982590, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.1081543, + "step": 16125, + "time_per_iteration": 4.124663352966309 + }, + { + "auxiliary_loss_clip": 0.01027032, + "auxiliary_loss_mlp": 0.01001144, + "balance_loss_clip": 1.00470793, + "balance_loss_mlp": 1.0002594, + "epoch": 0.9695475725236735, + "flos": 82548713990880.0, + "grad_norm": 0.8944552675552294, + "language_loss": 0.61412704, + "learning_rate": 9.70105810502364e-09, + "loss": 0.63440883, + "num_input_tokens_seen": 348043310, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.0088501, + "step": 16126, + "time_per_iteration": 3.2059125900268555 + }, + { + "auxiliary_loss_clip": 0.01108519, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.03872347, + "balance_loss_mlp": 1.02235675, + "epoch": 0.9696076957763415, + "flos": 23342080849920.0, + "grad_norm": 4.437911865144957, + "language_loss": 0.75300419, + "learning_rate": 9.662782766562738e-09, + "loss": 0.77441967, + "num_input_tokens_seen": 348062200, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10668945, + "step": 16127, + "time_per_iteration": 2.6252970695495605 + }, + { + "auxiliary_loss_clip": 0.01110739, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.03661847, + "balance_loss_mlp": 1.01924586, + "epoch": 0.9696678190290094, + "flos": 18896673720960.0, + "grad_norm": 1.7965319589206268, + "language_loss": 0.6902957, + "learning_rate": 9.62458290188839e-09, + "loss": 0.71171004, + "num_input_tokens_seen": 348080685, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.11450195, + "step": 16128, + "time_per_iteration": 2.6283326148986816 + }, + { + "auxiliary_loss_clip": 0.01110326, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.03829265, + "balance_loss_mlp": 1.02090478, + "epoch": 0.9697279422816775, + "flos": 44183264076000.0, + "grad_norm": 1.5744513028638487, + "language_loss": 0.65256089, + "learning_rate": 9.586458512449213e-09, + "loss": 0.67398626, + "num_input_tokens_seen": 348102500, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11303711, + "step": 16129, + "time_per_iteration": 2.7664546966552734 + }, + { + "auxiliary_loss_clip": 0.01113632, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.03809071, + "balance_loss_mlp": 1.01799178, + "epoch": 0.9697880655343454, + "flos": 31095983272320.0, + "grad_norm": 2.2151018072689577, + "language_loss": 0.63126326, + "learning_rate": 9.548409599691166e-09, + "loss": 0.65269887, + "num_input_tokens_seen": 348122515, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.11938477, + "step": 16130, + "time_per_iteration": 2.658189058303833 + }, + { + "auxiliary_loss_clip": 0.01112792, + "auxiliary_loss_mlp": 0.0102717, + "balance_loss_clip": 1.03832471, + "balance_loss_mlp": 1.01617861, + "epoch": 0.9698481887870134, + "flos": 18710279117280.0, + "grad_norm": 3.234214875536261, + "language_loss": 0.70071971, + "learning_rate": 9.510436165056867e-09, + "loss": 0.72211927, + "num_input_tokens_seen": 348138775, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.10992432, + "step": 16131, + "time_per_iteration": 2.589904308319092 + }, + { + "auxiliary_loss_clip": 0.01111073, + "auxiliary_loss_mlp": 0.01027449, + "balance_loss_clip": 1.03738832, + "balance_loss_mlp": 1.01633906, + "epoch": 0.9699083120396813, + "flos": 26822789699040.0, + "grad_norm": 2.37336885374372, + "language_loss": 0.76472938, + "learning_rate": 9.472538209986058e-09, + "loss": 0.78611457, + "num_input_tokens_seen": 348157115, + "router_z_loss_clip": 0.73779297, + "router_z_loss_mlp": 0.11108398, + "step": 16132, + "time_per_iteration": 2.62447190284729 + }, + { + "auxiliary_loss_clip": 0.01112176, + "auxiliary_loss_mlp": 0.01033087, + "balance_loss_clip": 1.03938866, + "balance_loss_mlp": 1.02157164, + "epoch": 0.9699684352923493, + "flos": 19113409416960.0, + "grad_norm": 2.9391438154036718, + "language_loss": 0.79223466, + "learning_rate": 9.434715735916477e-09, + "loss": 0.81368732, + "num_input_tokens_seen": 348173035, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11505127, + "step": 16133, + "time_per_iteration": 2.586233615875244 + }, + { + "auxiliary_loss_clip": 0.01105879, + "auxiliary_loss_mlp": 0.0102864, + "balance_loss_clip": 1.03634441, + "balance_loss_mlp": 1.01820898, + "epoch": 0.9700285585450172, + "flos": 26732732486400.0, + "grad_norm": 1.6361165092329133, + "language_loss": 0.64617366, + "learning_rate": 9.396968744281863e-09, + "loss": 0.66751885, + "num_input_tokens_seen": 348192960, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.10424805, + "step": 16134, + "time_per_iteration": 2.627906084060669 + }, + { + "auxiliary_loss_clip": 0.01108867, + "auxiliary_loss_mlp": 0.01031997, + "balance_loss_clip": 1.03671885, + "balance_loss_mlp": 1.02064264, + "epoch": 0.9700886817976853, + "flos": 29181247617600.0, + "grad_norm": 2.046456299956338, + "language_loss": 0.80428845, + "learning_rate": 9.359297236513519e-09, + "loss": 0.82569706, + "num_input_tokens_seen": 348212805, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11346436, + "step": 16135, + "time_per_iteration": 2.6618103981018066 + }, + { + "auxiliary_loss_clip": 0.01111792, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.03725398, + "balance_loss_mlp": 1.01912594, + "epoch": 0.9701488050503532, + "flos": 31051825787520.0, + "grad_norm": 1.7729457242664872, + "language_loss": 0.73169672, + "learning_rate": 9.321701214040079e-09, + "loss": 0.75312221, + "num_input_tokens_seen": 348232900, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11627197, + "step": 16136, + "time_per_iteration": 4.054363012313843 + }, + { + "auxiliary_loss_clip": 0.01108719, + "auxiliary_loss_mlp": 0.01026597, + "balance_loss_clip": 1.0383805, + "balance_loss_mlp": 1.01660705, + "epoch": 0.9702089283030212, + "flos": 25125356982240.0, + "grad_norm": 1.5743403096693214, + "language_loss": 0.76114506, + "learning_rate": 9.28418067828729e-09, + "loss": 0.78249824, + "num_input_tokens_seen": 348253065, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.09985352, + "step": 16137, + "time_per_iteration": 2.6398470401763916 + }, + { + "auxiliary_loss_clip": 0.01026919, + "auxiliary_loss_mlp": 0.01001313, + "balance_loss_clip": 1.00457656, + "balance_loss_mlp": 1.00037003, + "epoch": 0.9702690515556892, + "flos": 86209658817120.0, + "grad_norm": 0.7764245503880257, + "language_loss": 0.54866946, + "learning_rate": 9.246735630678015e-09, + "loss": 0.56895185, + "num_input_tokens_seen": 348316075, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00942993, + "step": 16138, + "time_per_iteration": 3.3650243282318115 + }, + { + "auxiliary_loss_clip": 0.01108674, + "auxiliary_loss_mlp": 0.01030981, + "balance_loss_clip": 1.03639102, + "balance_loss_mlp": 1.02029991, + "epoch": 0.9703291748083571, + "flos": 43783820848800.0, + "grad_norm": 2.0953969697711248, + "language_loss": 0.7074126, + "learning_rate": 9.209366072632007e-09, + "loss": 0.72880912, + "num_input_tokens_seen": 348337605, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10681152, + "step": 16139, + "time_per_iteration": 2.768155813217163 + }, + { + "auxiliary_loss_clip": 0.01112609, + "auxiliary_loss_mlp": 0.01027007, + "balance_loss_clip": 1.03971744, + "balance_loss_mlp": 1.01564085, + "epoch": 0.9703892980610251, + "flos": 29671274782080.0, + "grad_norm": 1.4737495052018574, + "language_loss": 0.72369665, + "learning_rate": 9.172072005566134e-09, + "loss": 0.74509275, + "num_input_tokens_seen": 348359430, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.11376953, + "step": 16140, + "time_per_iteration": 2.7722511291503906 + }, + { + "auxiliary_loss_clip": 0.01113963, + "auxiliary_loss_mlp": 0.01038696, + "balance_loss_clip": 1.03970373, + "balance_loss_mlp": 1.02736533, + "epoch": 0.970449421313693, + "flos": 21968053126560.0, + "grad_norm": 2.3743343636590795, + "language_loss": 0.68301332, + "learning_rate": 9.13485343089504e-09, + "loss": 0.7045399, + "num_input_tokens_seen": 348377890, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11340332, + "step": 16141, + "time_per_iteration": 2.6012134552001953 + }, + { + "auxiliary_loss_clip": 0.01105514, + "auxiliary_loss_mlp": 0.01026982, + "balance_loss_clip": 1.0366714, + "balance_loss_mlp": 1.01631856, + "epoch": 0.9705095445663611, + "flos": 30917813676480.0, + "grad_norm": 1.924025640816296, + "language_loss": 0.68788683, + "learning_rate": 9.097710350029597e-09, + "loss": 0.70921177, + "num_input_tokens_seen": 348396550, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.10668945, + "step": 16142, + "time_per_iteration": 2.6584393978118896 + }, + { + "auxiliary_loss_clip": 0.01105829, + "auxiliary_loss_mlp": 0.01027799, + "balance_loss_clip": 1.03461862, + "balance_loss_mlp": 1.01718926, + "epoch": 0.970569667819029, + "flos": 32742897291360.0, + "grad_norm": 1.9954090340738102, + "language_loss": 0.55868018, + "learning_rate": 9.060642764378457e-09, + "loss": 0.58001643, + "num_input_tokens_seen": 348417120, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.1060791, + "step": 16143, + "time_per_iteration": 2.6645262241363525 + }, + { + "auxiliary_loss_clip": 0.01110754, + "auxiliary_loss_mlp": 0.01030611, + "balance_loss_clip": 1.03719544, + "balance_loss_mlp": 1.02019215, + "epoch": 0.970629791071697, + "flos": 31540961571840.0, + "grad_norm": 2.4599492079131626, + "language_loss": 0.67784458, + "learning_rate": 9.023650675347382e-09, + "loss": 0.69925821, + "num_input_tokens_seen": 348437750, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10412598, + "step": 16144, + "time_per_iteration": 2.7284059524536133 + }, + { + "auxiliary_loss_clip": 0.01110264, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.03943181, + "balance_loss_mlp": 1.02661252, + "epoch": 0.9706899143243649, + "flos": 44587488342240.0, + "grad_norm": 2.078944255972681, + "language_loss": 0.71839279, + "learning_rate": 8.986734084339253e-09, + "loss": 0.73986453, + "num_input_tokens_seen": 348460935, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10296631, + "step": 16145, + "time_per_iteration": 2.8625195026397705 + }, + { + "auxiliary_loss_clip": 0.01111281, + "auxiliary_loss_mlp": 0.01025314, + "balance_loss_clip": 1.03618169, + "balance_loss_mlp": 1.01342916, + "epoch": 0.9707500375770329, + "flos": 14969811571200.0, + "grad_norm": 3.34476070922151, + "language_loss": 0.79557621, + "learning_rate": 8.949892992753395e-09, + "loss": 0.81694221, + "num_input_tokens_seen": 348474480, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11889648, + "step": 16146, + "time_per_iteration": 2.5887935161590576 + }, + { + "auxiliary_loss_clip": 0.01027128, + "auxiliary_loss_mlp": 0.01001217, + "balance_loss_clip": 1.00477695, + "balance_loss_mlp": 1.00030518, + "epoch": 0.9708101608297008, + "flos": 74251314295200.0, + "grad_norm": 0.80846832048715, + "language_loss": 0.54507113, + "learning_rate": 8.91312740198713e-09, + "loss": 0.56535459, + "num_input_tokens_seen": 348541220, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.0091095, + "step": 16147, + "time_per_iteration": 3.303260087966919 + }, + { + "auxiliary_loss_clip": 0.01112319, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.03809857, + "balance_loss_mlp": 1.0190711, + "epoch": 0.9708702840823689, + "flos": 33098264068320.0, + "grad_norm": 3.304997944215275, + "language_loss": 0.61864686, + "learning_rate": 8.876437313434682e-09, + "loss": 0.64007628, + "num_input_tokens_seen": 348559230, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11547852, + "step": 16148, + "time_per_iteration": 2.6408538818359375 + }, + { + "auxiliary_loss_clip": 0.01106757, + "auxiliary_loss_mlp": 0.01033585, + "balance_loss_clip": 1.03639352, + "balance_loss_mlp": 1.02231932, + "epoch": 0.9709304073350368, + "flos": 25352951309280.0, + "grad_norm": 3.9131912149610724, + "language_loss": 0.74024647, + "learning_rate": 8.839822728487155e-09, + "loss": 0.76164991, + "num_input_tokens_seen": 348577850, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.11279297, + "step": 16149, + "time_per_iteration": 2.6730170249938965 + }, + { + "auxiliary_loss_clip": 0.01108225, + "auxiliary_loss_mlp": 0.01034481, + "balance_loss_clip": 1.03505242, + "balance_loss_mlp": 1.02309036, + "epoch": 0.9709905305877048, + "flos": 51170606483040.0, + "grad_norm": 2.4213677394696087, + "language_loss": 0.75035739, + "learning_rate": 8.803283648533222e-09, + "loss": 0.77178442, + "num_input_tokens_seen": 348598345, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11383057, + "step": 16150, + "time_per_iteration": 2.790618419647217 + }, + { + "auxiliary_loss_clip": 0.01116725, + "auxiliary_loss_mlp": 0.01027262, + "balance_loss_clip": 1.03980851, + "balance_loss_mlp": 1.01420879, + "epoch": 0.9710506538403728, + "flos": 20945178383040.0, + "grad_norm": 2.2001178997748494, + "language_loss": 0.73783869, + "learning_rate": 8.766820074958214e-09, + "loss": 0.75927854, + "num_input_tokens_seen": 348616300, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.13031006, + "step": 16151, + "time_per_iteration": 4.194408416748047 + }, + { + "auxiliary_loss_clip": 0.01108079, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.03789949, + "balance_loss_mlp": 1.01937842, + "epoch": 0.9711107770930407, + "flos": 25836414674400.0, + "grad_norm": 2.1479307624265216, + "language_loss": 0.7487967, + "learning_rate": 8.730432009145027e-09, + "loss": 0.770181, + "num_input_tokens_seen": 348633845, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10968018, + "step": 16152, + "time_per_iteration": 2.629972219467163 + }, + { + "auxiliary_loss_clip": 0.01108813, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.03773904, + "balance_loss_mlp": 1.01891577, + "epoch": 0.9711709003457087, + "flos": 27133796404800.0, + "grad_norm": 2.1074501510020522, + "language_loss": 0.67146516, + "learning_rate": 8.694119452473448e-09, + "loss": 0.6928519, + "num_input_tokens_seen": 348653070, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10949707, + "step": 16153, + "time_per_iteration": 2.6013989448547363 + }, + { + "auxiliary_loss_clip": 0.01108054, + "auxiliary_loss_mlp": 0.01027335, + "balance_loss_clip": 1.03588021, + "balance_loss_mlp": 1.01693964, + "epoch": 0.9712310235983767, + "flos": 31986588147840.0, + "grad_norm": 1.6671385576264988, + "language_loss": 0.7050122, + "learning_rate": 8.65788240632037e-09, + "loss": 0.7263661, + "num_input_tokens_seen": 348672145, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10394287, + "step": 16154, + "time_per_iteration": 2.6750457286834717 + }, + { + "auxiliary_loss_clip": 0.011134, + "auxiliary_loss_mlp": 0.01028082, + "balance_loss_clip": 1.04041409, + "balance_loss_mlp": 1.01573217, + "epoch": 0.9712911468510447, + "flos": 25219303853760.0, + "grad_norm": 1.8376292813109814, + "language_loss": 0.80939239, + "learning_rate": 8.621720872059812e-09, + "loss": 0.83080721, + "num_input_tokens_seen": 348690615, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12347412, + "step": 16155, + "time_per_iteration": 2.6385200023651123 + }, + { + "auxiliary_loss_clip": 0.01112819, + "auxiliary_loss_mlp": 0.01035737, + "balance_loss_clip": 1.03779078, + "balance_loss_mlp": 1.02372706, + "epoch": 0.9713512701037126, + "flos": 16536757180320.0, + "grad_norm": 2.127787282802855, + "language_loss": 0.67513669, + "learning_rate": 8.58563485106334e-09, + "loss": 0.69662225, + "num_input_tokens_seen": 348708665, + "router_z_loss_clip": 0.74951172, + "router_z_loss_mlp": 0.12017822, + "step": 16156, + "time_per_iteration": 2.6172313690185547 + }, + { + "auxiliary_loss_clip": 0.01110821, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.03662515, + "balance_loss_mlp": 1.02208531, + "epoch": 0.9714113933563806, + "flos": 31541083123680.0, + "grad_norm": 2.618601127135708, + "language_loss": 0.91247225, + "learning_rate": 8.54962434469919e-09, + "loss": 0.93390709, + "num_input_tokens_seen": 348726105, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.10583496, + "step": 16157, + "time_per_iteration": 2.6241536140441895 + }, + { + "auxiliary_loss_clip": 0.01110934, + "auxiliary_loss_mlp": 0.01025578, + "balance_loss_clip": 1.03770781, + "balance_loss_mlp": 1.01481938, + "epoch": 0.9714715166090485, + "flos": 15549207154560.0, + "grad_norm": 2.061175056419638, + "language_loss": 0.72195429, + "learning_rate": 8.513689354332721e-09, + "loss": 0.74331939, + "num_input_tokens_seen": 348743360, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.10760498, + "step": 16158, + "time_per_iteration": 2.601086139678955 + }, + { + "auxiliary_loss_clip": 0.01107537, + "auxiliary_loss_mlp": 0.01035068, + "balance_loss_clip": 1.03698087, + "balance_loss_mlp": 1.02408886, + "epoch": 0.9715316398617165, + "flos": 22458323394720.0, + "grad_norm": 2.0641977811698937, + "language_loss": 0.5992862, + "learning_rate": 8.477829881326836e-09, + "loss": 0.62071222, + "num_input_tokens_seen": 348759045, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10974121, + "step": 16159, + "time_per_iteration": 2.612058639526367 + }, + { + "auxiliary_loss_clip": 0.01104108, + "auxiliary_loss_mlp": 0.0102382, + "balance_loss_clip": 1.03628659, + "balance_loss_mlp": 1.01378846, + "epoch": 0.9715917631143844, + "flos": 35281267048800.0, + "grad_norm": 1.7296453307146098, + "language_loss": 0.78909832, + "learning_rate": 8.44204592704112e-09, + "loss": 0.81037754, + "num_input_tokens_seen": 348779910, + "router_z_loss_clip": 0.67822266, + "router_z_loss_mlp": 0.10040283, + "step": 16160, + "time_per_iteration": 2.7465736865997314 + }, + { + "auxiliary_loss_clip": 0.01027124, + "auxiliary_loss_mlp": 0.01001646, + "balance_loss_clip": 1.0047735, + "balance_loss_mlp": 1.00070643, + "epoch": 0.9716518863670525, + "flos": 80460427710240.0, + "grad_norm": 0.9382070762362541, + "language_loss": 0.54207098, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56235868, + "num_input_tokens_seen": 348838995, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.00938416, + "step": 16161, + "time_per_iteration": 3.303577184677124 + }, + { + "auxiliary_loss_clip": 0.01107346, + "auxiliary_loss_mlp": 0.01029534, + "balance_loss_clip": 1.03809071, + "balance_loss_mlp": 1.01841724, + "epoch": 0.9717120096197204, + "flos": 21612888936000.0, + "grad_norm": 2.420834493379578, + "language_loss": 0.72089779, + "learning_rate": 8.3707045800554e-09, + "loss": 0.74226665, + "num_input_tokens_seen": 348858090, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.11120605, + "step": 16162, + "time_per_iteration": 2.6853809356689453 + }, + { + "auxiliary_loss_clip": 0.01106768, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.03617036, + "balance_loss_mlp": 1.01628423, + "epoch": 0.9717721328723884, + "flos": 29850497827200.0, + "grad_norm": 2.2166681696185515, + "language_loss": 0.78395569, + "learning_rate": 8.335147190060787e-09, + "loss": 0.8052963, + "num_input_tokens_seen": 348877885, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.11004639, + "step": 16163, + "time_per_iteration": 4.089768648147583 + }, + { + "auxiliary_loss_clip": 0.01107634, + "auxiliary_loss_mlp": 0.01025605, + "balance_loss_clip": 1.03711033, + "balance_loss_mlp": 1.01517415, + "epoch": 0.9718322561250564, + "flos": 25351573721760.0, + "grad_norm": 1.832932130181092, + "language_loss": 0.72596741, + "learning_rate": 8.299665324196903e-09, + "loss": 0.74729979, + "num_input_tokens_seen": 348897720, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.10430908, + "step": 16164, + "time_per_iteration": 2.6237101554870605 + }, + { + "auxiliary_loss_clip": 0.01110716, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.03782237, + "balance_loss_mlp": 1.02050066, + "epoch": 0.9718923793777243, + "flos": 23215321332000.0, + "grad_norm": 2.897890247772712, + "language_loss": 0.83774245, + "learning_rate": 8.264258983809114e-09, + "loss": 0.85916591, + "num_input_tokens_seen": 348915410, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11114502, + "step": 16165, + "time_per_iteration": 3.904500961303711 + }, + { + "auxiliary_loss_clip": 0.01107749, + "auxiliary_loss_mlp": 0.01023722, + "balance_loss_clip": 1.03684807, + "balance_loss_mlp": 1.01367271, + "epoch": 0.9719525026303923, + "flos": 26688007759680.0, + "grad_norm": 1.8065278980741821, + "language_loss": 0.79427189, + "learning_rate": 8.228928170240345e-09, + "loss": 0.81558657, + "num_input_tokens_seen": 348934335, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10046387, + "step": 16166, + "time_per_iteration": 2.6436402797698975 + }, + { + "auxiliary_loss_clip": 0.01109919, + "auxiliary_loss_mlp": 0.01023295, + "balance_loss_clip": 1.03832126, + "balance_loss_mlp": 1.01251221, + "epoch": 0.9720126258830603, + "flos": 17606139410880.0, + "grad_norm": 2.064552971748758, + "language_loss": 0.70887446, + "learning_rate": 8.193672884830195e-09, + "loss": 0.73020661, + "num_input_tokens_seen": 348952405, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10784912, + "step": 16167, + "time_per_iteration": 2.5754287242889404 + }, + { + "auxiliary_loss_clip": 0.01108943, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.03898215, + "balance_loss_mlp": 1.02149415, + "epoch": 0.9720727491357283, + "flos": 32032042185600.0, + "grad_norm": 2.577534151402848, + "language_loss": 0.75626481, + "learning_rate": 8.158493128915812e-09, + "loss": 0.77767688, + "num_input_tokens_seen": 348973580, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.10778809, + "step": 16168, + "time_per_iteration": 2.6643357276916504 + }, + { + "auxiliary_loss_clip": 0.01111246, + "auxiliary_loss_mlp": 0.01033216, + "balance_loss_clip": 1.03885138, + "balance_loss_mlp": 1.0216887, + "epoch": 0.9721328723883962, + "flos": 27667170708480.0, + "grad_norm": 2.288206966338517, + "language_loss": 0.72645891, + "learning_rate": 8.123388903830797e-09, + "loss": 0.74790347, + "num_input_tokens_seen": 348992035, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11529541, + "step": 16169, + "time_per_iteration": 2.6100552082061768 + }, + { + "auxiliary_loss_clip": 0.0111146, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.03596449, + "balance_loss_mlp": 1.01877367, + "epoch": 0.9721929956410642, + "flos": 34256731096800.0, + "grad_norm": 3.2128860466359934, + "language_loss": 0.57870841, + "learning_rate": 8.088360210906309e-09, + "loss": 0.60012865, + "num_input_tokens_seen": 349013160, + "router_z_loss_clip": 0.75585938, + "router_z_loss_mlp": 0.11785889, + "step": 16170, + "time_per_iteration": 2.6573374271392822 + }, + { + "auxiliary_loss_clip": 0.01109372, + "auxiliary_loss_mlp": 0.0102793, + "balance_loss_clip": 1.03673625, + "balance_loss_mlp": 1.01619995, + "epoch": 0.9722531188937321, + "flos": 25619678978400.0, + "grad_norm": 1.7616575296476575, + "language_loss": 0.71443081, + "learning_rate": 8.053407051471062e-09, + "loss": 0.73580384, + "num_input_tokens_seen": 349033485, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.11730957, + "step": 16171, + "time_per_iteration": 2.7329392433166504 + }, + { + "auxiliary_loss_clip": 0.01109819, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.0373826, + "balance_loss_mlp": 1.02425838, + "epoch": 0.9723132421464001, + "flos": 19609027966080.0, + "grad_norm": 1.762616341167006, + "language_loss": 0.6832431, + "learning_rate": 8.018529426850218e-09, + "loss": 0.70469356, + "num_input_tokens_seen": 349051705, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10968018, + "step": 16172, + "time_per_iteration": 2.6133830547332764 + }, + { + "auxiliary_loss_clip": 0.01108193, + "auxiliary_loss_mlp": 0.010253, + "balance_loss_clip": 1.03726292, + "balance_loss_mlp": 1.01432097, + "epoch": 0.972373365399068, + "flos": 33855991316640.0, + "grad_norm": 1.8791274653326553, + "language_loss": 0.86079836, + "learning_rate": 7.983727338366274e-09, + "loss": 0.88213325, + "num_input_tokens_seen": 349070825, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10961914, + "step": 16173, + "time_per_iteration": 2.754718542098999 + }, + { + "auxiliary_loss_clip": 0.01114075, + "auxiliary_loss_mlp": 0.01031341, + "balance_loss_clip": 1.03782845, + "balance_loss_mlp": 1.01903903, + "epoch": 0.9724334886517361, + "flos": 28068396696000.0, + "grad_norm": 2.677951730625783, + "language_loss": 0.64190626, + "learning_rate": 7.949000787339289e-09, + "loss": 0.66336048, + "num_input_tokens_seen": 349089730, + "router_z_loss_clip": 0.76318359, + "router_z_loss_mlp": 0.12286377, + "step": 16174, + "time_per_iteration": 2.6430985927581787 + }, + { + "auxiliary_loss_clip": 0.01107634, + "auxiliary_loss_mlp": 0.01026564, + "balance_loss_clip": 1.03718579, + "balance_loss_mlp": 1.01606739, + "epoch": 0.972493611904404, + "flos": 31050934407360.0, + "grad_norm": 1.7499382502224867, + "language_loss": 0.7778573, + "learning_rate": 7.914349775085538e-09, + "loss": 0.79919928, + "num_input_tokens_seen": 349111315, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.1050415, + "step": 16175, + "time_per_iteration": 3.9511616230010986 + }, + { + "auxiliary_loss_clip": 0.01110075, + "auxiliary_loss_mlp": 0.01030279, + "balance_loss_clip": 1.0382694, + "balance_loss_mlp": 1.01841187, + "epoch": 0.972553735157072, + "flos": 20722446129600.0, + "grad_norm": 5.945240412868303, + "language_loss": 0.56594765, + "learning_rate": 7.879774302919307e-09, + "loss": 0.5873512, + "num_input_tokens_seen": 349129495, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11853027, + "step": 16176, + "time_per_iteration": 2.5957791805267334 + }, + { + "auxiliary_loss_clip": 0.01109947, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.03897429, + "balance_loss_mlp": 1.01851106, + "epoch": 0.97261385840974, + "flos": 31853305347840.0, + "grad_norm": 2.5147081538190736, + "language_loss": 0.72141361, + "learning_rate": 7.845274372151545e-09, + "loss": 0.74280363, + "num_input_tokens_seen": 349148850, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10540771, + "step": 16177, + "time_per_iteration": 2.648127317428589 + }, + { + "auxiliary_loss_clip": 0.0110983, + "auxiliary_loss_mlp": 0.01026793, + "balance_loss_clip": 1.03665948, + "balance_loss_mlp": 1.01582551, + "epoch": 0.9726739816624079, + "flos": 31051866304800.0, + "grad_norm": 1.7225807035598921, + "language_loss": 0.68560266, + "learning_rate": 7.810849984090984e-09, + "loss": 0.7069689, + "num_input_tokens_seen": 349167620, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10974121, + "step": 16178, + "time_per_iteration": 2.683805227279663 + }, + { + "auxiliary_loss_clip": 0.0110962, + "auxiliary_loss_mlp": 0.01029849, + "balance_loss_clip": 1.03585768, + "balance_loss_mlp": 1.01845908, + "epoch": 0.972734104915076, + "flos": 35405433460800.0, + "grad_norm": 3.2168623075910903, + "language_loss": 0.67234939, + "learning_rate": 7.776501140042358e-09, + "loss": 0.69374412, + "num_input_tokens_seen": 349185845, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11383057, + "step": 16179, + "time_per_iteration": 2.651026725769043 + }, + { + "auxiliary_loss_clip": 0.01107563, + "auxiliary_loss_mlp": 0.01031964, + "balance_loss_clip": 1.03765023, + "balance_loss_mlp": 1.02127051, + "epoch": 0.9727942281677439, + "flos": 28599137376480.0, + "grad_norm": 1.7439306324886266, + "language_loss": 0.76887381, + "learning_rate": 7.742227841308624e-09, + "loss": 0.79026914, + "num_input_tokens_seen": 349204525, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10693359, + "step": 16180, + "time_per_iteration": 2.6280155181884766 + }, + { + "auxiliary_loss_clip": 0.01112392, + "auxiliary_loss_mlp": 0.0103117, + "balance_loss_clip": 1.03766036, + "balance_loss_mlp": 1.02011919, + "epoch": 0.9728543514204119, + "flos": 38711092544640.0, + "grad_norm": 1.5704582616634484, + "language_loss": 0.76539665, + "learning_rate": 7.708030089189188e-09, + "loss": 0.78683221, + "num_input_tokens_seen": 349228075, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11053467, + "step": 16181, + "time_per_iteration": 2.702396869659424 + }, + { + "auxiliary_loss_clip": 0.01108123, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.03684592, + "balance_loss_mlp": 1.01930892, + "epoch": 0.9729144746730798, + "flos": 19875998738880.0, + "grad_norm": 6.5418323684628925, + "language_loss": 0.63539159, + "learning_rate": 7.67390788498079e-09, + "loss": 0.6567775, + "num_input_tokens_seen": 349246990, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11169434, + "step": 16182, + "time_per_iteration": 2.6318891048431396 + }, + { + "auxiliary_loss_clip": 0.01111841, + "auxiliary_loss_mlp": 0.01036475, + "balance_loss_clip": 1.03858948, + "balance_loss_mlp": 1.02560925, + "epoch": 0.9729745979257478, + "flos": 30556328790240.0, + "grad_norm": 2.116536762542532, + "language_loss": 0.62017787, + "learning_rate": 7.639861229977507e-09, + "loss": 0.64166105, + "num_input_tokens_seen": 349265890, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.10858154, + "step": 16183, + "time_per_iteration": 2.629502058029175 + }, + { + "auxiliary_loss_clip": 0.01108253, + "auxiliary_loss_mlp": 0.01029922, + "balance_loss_clip": 1.0375185, + "balance_loss_mlp": 1.01857913, + "epoch": 0.9730347211784157, + "flos": 27623904603840.0, + "grad_norm": 1.7421427717329059, + "language_loss": 0.78065002, + "learning_rate": 7.605890125470527e-09, + "loss": 0.80203176, + "num_input_tokens_seen": 349285275, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11340332, + "step": 16184, + "time_per_iteration": 2.6559996604919434 + }, + { + "auxiliary_loss_clip": 0.0110476, + "auxiliary_loss_mlp": 0.01027163, + "balance_loss_clip": 1.03460431, + "balance_loss_mlp": 1.0163213, + "epoch": 0.9730948444310837, + "flos": 13419842702400.0, + "grad_norm": 2.440914953946686, + "language_loss": 0.79691648, + "learning_rate": 7.571994572747709e-09, + "loss": 0.81823575, + "num_input_tokens_seen": 349301515, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10845947, + "step": 16185, + "time_per_iteration": 2.6133997440338135 + }, + { + "auxiliary_loss_clip": 0.01110763, + "auxiliary_loss_mlp": 0.0103091, + "balance_loss_clip": 1.03786695, + "balance_loss_mlp": 1.02014565, + "epoch": 0.9731549676837516, + "flos": 20498538875040.0, + "grad_norm": 1.986738916868738, + "language_loss": 0.77572048, + "learning_rate": 7.538174573094469e-09, + "loss": 0.79713726, + "num_input_tokens_seen": 349319590, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.10766602, + "step": 16186, + "time_per_iteration": 2.6063520908355713 + }, + { + "auxiliary_loss_clip": 0.01108002, + "auxiliary_loss_mlp": 0.01027607, + "balance_loss_clip": 1.03737092, + "balance_loss_mlp": 1.0164851, + "epoch": 0.9732150909364197, + "flos": 25797281332320.0, + "grad_norm": 2.0361395588679927, + "language_loss": 0.65609217, + "learning_rate": 7.504430127793337e-09, + "loss": 0.67744827, + "num_input_tokens_seen": 349339230, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11120605, + "step": 16187, + "time_per_iteration": 2.7424585819244385 + }, + { + "auxiliary_loss_clip": 0.01107811, + "auxiliary_loss_mlp": 0.01027185, + "balance_loss_clip": 1.03689158, + "balance_loss_mlp": 1.01680148, + "epoch": 0.9732752141890876, + "flos": 41155353361440.0, + "grad_norm": 1.6778329453833438, + "language_loss": 0.8083992, + "learning_rate": 7.47076123812418e-09, + "loss": 0.82974917, + "num_input_tokens_seen": 349361155, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.1038208, + "step": 16188, + "time_per_iteration": 2.734165906906128 + }, + { + "auxiliary_loss_clip": 0.01105301, + "auxiliary_loss_mlp": 0.01027309, + "balance_loss_clip": 1.03635383, + "balance_loss_mlp": 1.01712263, + "epoch": 0.9733353374417556, + "flos": 28558261791360.0, + "grad_norm": 2.0220715277433032, + "language_loss": 0.7832818, + "learning_rate": 7.437167905363084e-09, + "loss": 0.80460787, + "num_input_tokens_seen": 349379335, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10180664, + "step": 16189, + "time_per_iteration": 2.6278467178344727 + }, + { + "auxiliary_loss_clip": 0.01107384, + "auxiliary_loss_mlp": 0.01025836, + "balance_loss_clip": 1.03623593, + "balance_loss_mlp": 1.01502931, + "epoch": 0.9733954606944236, + "flos": 47791583305920.0, + "grad_norm": 2.1186007340366744, + "language_loss": 0.50954592, + "learning_rate": 7.403650130784367e-09, + "loss": 0.53087819, + "num_input_tokens_seen": 349401575, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.10803223, + "step": 16190, + "time_per_iteration": 2.7860734462738037 + }, + { + "auxiliary_loss_clip": 0.01108342, + "auxiliary_loss_mlp": 0.0102757, + "balance_loss_clip": 1.03649759, + "balance_loss_mlp": 1.01640654, + "epoch": 0.9734555839470915, + "flos": 26822303491680.0, + "grad_norm": 1.6131833816057035, + "language_loss": 0.80727935, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.82863849, + "num_input_tokens_seen": 349420650, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11157227, + "step": 16191, + "time_per_iteration": 4.09703803062439 + }, + { + "auxiliary_loss_clip": 0.01106142, + "auxiliary_loss_mlp": 0.01026763, + "balance_loss_clip": 1.03454447, + "balance_loss_mlp": 1.0165056, + "epoch": 0.9735157071997596, + "flos": 20225733613920.0, + "grad_norm": 1.6835923398677406, + "language_loss": 0.82796371, + "learning_rate": 7.336841261255111e-09, + "loss": 0.84929276, + "num_input_tokens_seen": 349436830, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10253906, + "step": 16192, + "time_per_iteration": 2.572176456451416 + }, + { + "auxiliary_loss_clip": 0.01112738, + "auxiliary_loss_mlp": 0.01027003, + "balance_loss_clip": 1.04049492, + "balance_loss_mlp": 1.01632786, + "epoch": 0.9735758304524275, + "flos": 24677015748480.0, + "grad_norm": 1.9848937906220756, + "language_loss": 0.75237525, + "learning_rate": 7.303550168837658e-09, + "loss": 0.77377272, + "num_input_tokens_seen": 349454325, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.10668945, + "step": 16193, + "time_per_iteration": 2.631891965866089 + }, + { + "auxiliary_loss_clip": 0.01106946, + "auxiliary_loss_mlp": 0.01026941, + "balance_loss_clip": 1.03727698, + "balance_loss_mlp": 1.01704061, + "epoch": 0.9736359537050955, + "flos": 28862664180480.0, + "grad_norm": 2.6492250690524677, + "language_loss": 0.85322744, + "learning_rate": 7.270334639669417e-09, + "loss": 0.87456632, + "num_input_tokens_seen": 349470230, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.09899902, + "step": 16194, + "time_per_iteration": 2.605987071990967 + }, + { + "auxiliary_loss_clip": 0.01108042, + "auxiliary_loss_mlp": 0.0103327, + "balance_loss_clip": 1.03825009, + "balance_loss_mlp": 1.02219546, + "epoch": 0.9736960769577634, + "flos": 18986163691680.0, + "grad_norm": 1.670649609112807, + "language_loss": 0.75603813, + "learning_rate": 7.237194675009828e-09, + "loss": 0.77745128, + "num_input_tokens_seen": 349486250, + "router_z_loss_clip": 0.69824219, + "router_z_loss_mlp": 0.11065674, + "step": 16195, + "time_per_iteration": 2.6394457817077637 + }, + { + "auxiliary_loss_clip": 0.01027015, + "auxiliary_loss_mlp": 0.01001127, + "balance_loss_clip": 1.00471425, + "balance_loss_mlp": 1.00023341, + "epoch": 0.9737562002104314, + "flos": 79743373460640.0, + "grad_norm": 0.8057420172505406, + "language_loss": 0.52476811, + "learning_rate": 7.204130276115439e-09, + "loss": 0.54504955, + "num_input_tokens_seen": 349545865, + "router_z_loss_clip": 0.22302246, + "router_z_loss_mlp": 0.00892639, + "step": 16196, + "time_per_iteration": 3.207732915878296 + }, + { + "auxiliary_loss_clip": 0.01107759, + "auxiliary_loss_mlp": 0.01027806, + "balance_loss_clip": 1.03696287, + "balance_loss_mlp": 1.01676774, + "epoch": 0.9738163234630993, + "flos": 33186173865120.0, + "grad_norm": 1.7657839674585243, + "language_loss": 0.7629391, + "learning_rate": 7.171141444240136e-09, + "loss": 0.78429472, + "num_input_tokens_seen": 349566080, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11035156, + "step": 16197, + "time_per_iteration": 2.6706559658050537 + }, + { + "auxiliary_loss_clip": 0.01111682, + "auxiliary_loss_mlp": 0.01025504, + "balance_loss_clip": 1.03702307, + "balance_loss_mlp": 1.01435804, + "epoch": 0.9738764467157673, + "flos": 25708885328160.0, + "grad_norm": 1.9493239701855765, + "language_loss": 0.67538953, + "learning_rate": 7.13822818063492e-09, + "loss": 0.69676137, + "num_input_tokens_seen": 349585665, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.1114502, + "step": 16198, + "time_per_iteration": 2.692983627319336 + }, + { + "auxiliary_loss_clip": 0.01107535, + "auxiliary_loss_mlp": 0.01027895, + "balance_loss_clip": 1.03514671, + "balance_loss_mlp": 1.01627791, + "epoch": 0.9739365699684353, + "flos": 26065021933440.0, + "grad_norm": 1.8804941167744422, + "language_loss": 0.7781117, + "learning_rate": 7.10539048654768e-09, + "loss": 0.79946601, + "num_input_tokens_seen": 349605125, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.11627197, + "step": 16199, + "time_per_iteration": 2.6243534088134766 + }, + { + "auxiliary_loss_clip": 0.01109916, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.03803277, + "balance_loss_mlp": 1.0211364, + "epoch": 0.9739966932211033, + "flos": 26725601445120.0, + "grad_norm": 4.625799821395876, + "language_loss": 0.79659712, + "learning_rate": 7.072628363223865e-09, + "loss": 0.81802273, + "num_input_tokens_seen": 349623360, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.1151123, + "step": 16200, + "time_per_iteration": 2.660308361053467 + }, + { + "auxiliary_loss_clip": 0.01116154, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.03897762, + "balance_loss_mlp": 1.02267623, + "epoch": 0.9740568164737712, + "flos": 30294665781120.0, + "grad_norm": 2.3315420903872117, + "language_loss": 0.68692529, + "learning_rate": 7.039941811905592e-09, + "loss": 0.70842874, + "num_input_tokens_seen": 349644390, + "router_z_loss_clip": 0.77099609, + "router_z_loss_mlp": 0.11517334, + "step": 16201, + "time_per_iteration": 2.623721122741699 + }, + { + "auxiliary_loss_clip": 0.01109679, + "auxiliary_loss_mlp": 0.01026668, + "balance_loss_clip": 1.03675795, + "balance_loss_mlp": 1.01592159, + "epoch": 0.9741169397264392, + "flos": 28825718771520.0, + "grad_norm": 1.5024131409777697, + "language_loss": 0.72724527, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.74860877, + "num_input_tokens_seen": 349663200, + "router_z_loss_clip": 0.72900391, + "router_z_loss_mlp": 0.10754395, + "step": 16202, + "time_per_iteration": 4.153093576431274 + }, + { + "auxiliary_loss_clip": 0.01112952, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.03930521, + "balance_loss_mlp": 1.01757121, + "epoch": 0.9741770629791072, + "flos": 22990077007200.0, + "grad_norm": 2.1321478506755582, + "language_loss": 0.72849798, + "learning_rate": 6.974795430241265e-09, + "loss": 0.749919, + "num_input_tokens_seen": 349681975, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11572266, + "step": 16203, + "time_per_iteration": 2.654529094696045 + }, + { + "auxiliary_loss_clip": 0.01109259, + "auxiliary_loss_mlp": 0.01032319, + "balance_loss_clip": 1.03719831, + "balance_loss_mlp": 1.02150655, + "epoch": 0.9742371862317751, + "flos": 27267038687520.0, + "grad_norm": 1.8403966689649567, + "language_loss": 0.77290249, + "learning_rate": 6.942335602365235e-09, + "loss": 0.79431832, + "num_input_tokens_seen": 349701185, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.10803223, + "step": 16204, + "time_per_iteration": 4.087934970855713 + }, + { + "auxiliary_loss_clip": 0.01112328, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.0386579, + "balance_loss_mlp": 1.0233407, + "epoch": 0.9742973094844432, + "flos": 26555008580640.0, + "grad_norm": 1.9511978552613727, + "language_loss": 0.79305816, + "learning_rate": 6.909951351435905e-09, + "loss": 0.81453049, + "num_input_tokens_seen": 349720360, + "router_z_loss_clip": 0.73681641, + "router_z_loss_mlp": 0.11578369, + "step": 16205, + "time_per_iteration": 2.6694114208221436 + }, + { + "auxiliary_loss_clip": 0.01107848, + "auxiliary_loss_mlp": 0.01026201, + "balance_loss_clip": 1.03688097, + "balance_loss_mlp": 1.01545978, + "epoch": 0.9743574327371111, + "flos": 32029287010560.0, + "grad_norm": 3.02939055923005, + "language_loss": 0.74509609, + "learning_rate": 6.87764267868074e-09, + "loss": 0.76643658, + "num_input_tokens_seen": 349741040, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10742188, + "step": 16206, + "time_per_iteration": 2.701848030090332 + }, + { + "auxiliary_loss_clip": 0.01108492, + "auxiliary_loss_mlp": 0.01028647, + "balance_loss_clip": 1.03593826, + "balance_loss_mlp": 1.01718497, + "epoch": 0.9744175559897791, + "flos": 15068012757120.0, + "grad_norm": 2.3763734003589367, + "language_loss": 0.8447994, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.86617088, + "num_input_tokens_seen": 349758895, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11468506, + "step": 16207, + "time_per_iteration": 2.614617109298706 + }, + { + "auxiliary_loss_clip": 0.01107661, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.0370369, + "balance_loss_mlp": 1.01856351, + "epoch": 0.974477679242447, + "flos": 34652122596000.0, + "grad_norm": 2.0427672802815073, + "language_loss": 0.70879412, + "learning_rate": 6.813252072591425e-09, + "loss": 0.73016417, + "num_input_tokens_seen": 349779740, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10797119, + "step": 16208, + "time_per_iteration": 2.6921439170837402 + }, + { + "auxiliary_loss_clip": 0.01101666, + "auxiliary_loss_mlp": 0.0102412, + "balance_loss_clip": 1.03558469, + "balance_loss_mlp": 1.01468468, + "epoch": 0.974537802495115, + "flos": 21701528043840.0, + "grad_norm": 1.684841356594811, + "language_loss": 0.77225113, + "learning_rate": 6.781170141698878e-09, + "loss": 0.79350901, + "num_input_tokens_seen": 349796820, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 0.09436035, + "step": 16209, + "time_per_iteration": 2.620513916015625 + }, + { + "auxiliary_loss_clip": 0.01112942, + "auxiliary_loss_mlp": 0.01030006, + "balance_loss_clip": 1.0384686, + "balance_loss_mlp": 1.01900327, + "epoch": 0.9745979257477829, + "flos": 29092649027040.0, + "grad_norm": 1.7172524531970326, + "language_loss": 0.79019505, + "learning_rate": 6.749163793864144e-09, + "loss": 0.81162453, + "num_input_tokens_seen": 349816550, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11004639, + "step": 16210, + "time_per_iteration": 2.6578683853149414 + }, + { + "auxiliary_loss_clip": 0.01110287, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.03731775, + "balance_loss_mlp": 1.02184129, + "epoch": 0.9746580490004509, + "flos": 32965670062080.0, + "grad_norm": 2.3951162470227674, + "language_loss": 0.77858102, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.80000699, + "num_input_tokens_seen": 349834350, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.10467529, + "step": 16211, + "time_per_iteration": 2.640946388244629 + }, + { + "auxiliary_loss_clip": 0.01113644, + "auxiliary_loss_mlp": 0.01028358, + "balance_loss_clip": 1.03774285, + "balance_loss_mlp": 1.01645494, + "epoch": 0.9747181722531189, + "flos": 24150205244160.0, + "grad_norm": 2.2109204251476324, + "language_loss": 0.78673291, + "learning_rate": 6.685377852219787e-09, + "loss": 0.80815291, + "num_input_tokens_seen": 349853460, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11914062, + "step": 16212, + "time_per_iteration": 2.6509034633636475 + }, + { + "auxiliary_loss_clip": 0.01108578, + "auxiliary_loss_mlp": 0.01032758, + "balance_loss_clip": 1.03784418, + "balance_loss_mlp": 1.0221777, + "epoch": 0.9747782955057869, + "flos": 38352524902560.0, + "grad_norm": 1.647362703216973, + "language_loss": 0.79919124, + "learning_rate": 6.653598260829118e-09, + "loss": 0.82060462, + "num_input_tokens_seen": 349874830, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10583496, + "step": 16213, + "time_per_iteration": 2.680593490600586 + }, + { + "auxiliary_loss_clip": 0.01106469, + "auxiliary_loss_mlp": 0.01026995, + "balance_loss_clip": 1.03488696, + "balance_loss_mlp": 1.01612318, + "epoch": 0.9748384187584548, + "flos": 19475623614240.0, + "grad_norm": 2.32996326455174, + "language_loss": 0.66608095, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.6874156, + "num_input_tokens_seen": 349893690, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.10870361, + "step": 16214, + "time_per_iteration": 2.613640069961548 + }, + { + "auxiliary_loss_clip": 0.01112319, + "auxiliary_loss_mlp": 0.0102767, + "balance_loss_clip": 1.03859472, + "balance_loss_mlp": 1.01617229, + "epoch": 0.9748985420111228, + "flos": 25085737432800.0, + "grad_norm": 1.6032880333805846, + "language_loss": 0.74158752, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.76298743, + "num_input_tokens_seen": 349912480, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.1151123, + "step": 16215, + "time_per_iteration": 3.9815032482147217 + }, + { + "auxiliary_loss_clip": 0.01107569, + "auxiliary_loss_mlp": 0.01027716, + "balance_loss_clip": 1.03641093, + "balance_loss_mlp": 1.01687419, + "epoch": 0.9749586652637908, + "flos": 44808964560000.0, + "grad_norm": 2.7834199721908983, + "language_loss": 0.67234921, + "learning_rate": 6.558713018834483e-09, + "loss": 0.6937021, + "num_input_tokens_seen": 349932470, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10839844, + "step": 16216, + "time_per_iteration": 2.765371084213257 + }, + { + "auxiliary_loss_clip": 0.01111648, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.03763819, + "balance_loss_mlp": 1.01880705, + "epoch": 0.9750187885164587, + "flos": 13420531496160.0, + "grad_norm": 2.000189752682352, + "language_loss": 0.71714616, + "learning_rate": 6.527235786226937e-09, + "loss": 0.73856616, + "num_input_tokens_seen": 349949060, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11560059, + "step": 16217, + "time_per_iteration": 2.624687671661377 + }, + { + "auxiliary_loss_clip": 0.0110814, + "auxiliary_loss_mlp": 0.01025913, + "balance_loss_clip": 1.03700125, + "balance_loss_mlp": 1.01498747, + "epoch": 0.9750789117691268, + "flos": 31407759806400.0, + "grad_norm": 1.618144591932365, + "language_loss": 0.78363341, + "learning_rate": 6.495834146306167e-09, + "loss": 0.80497396, + "num_input_tokens_seen": 349968010, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10925293, + "step": 16218, + "time_per_iteration": 2.6595637798309326 + }, + { + "auxiliary_loss_clip": 0.01107341, + "auxiliary_loss_mlp": 0.01029069, + "balance_loss_clip": 1.0372777, + "balance_loss_mlp": 1.01783943, + "epoch": 0.9751390350217947, + "flos": 16269583821120.0, + "grad_norm": 2.66696953222105, + "language_loss": 0.77462792, + "learning_rate": 6.464508100263222e-09, + "loss": 0.79599208, + "num_input_tokens_seen": 349985270, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.11236572, + "step": 16219, + "time_per_iteration": 2.6622304916381836 + }, + { + "auxiliary_loss_clip": 0.01112031, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.03887439, + "balance_loss_mlp": 1.02014518, + "epoch": 0.9751991582744627, + "flos": 27845907546240.0, + "grad_norm": 1.835987902473497, + "language_loss": 0.81062138, + "learning_rate": 6.433257649285817e-09, + "loss": 0.83205116, + "num_input_tokens_seen": 350003935, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.10803223, + "step": 16220, + "time_per_iteration": 2.627988815307617 + }, + { + "auxiliary_loss_clip": 0.01107335, + "auxiliary_loss_mlp": 0.01028821, + "balance_loss_clip": 1.03657603, + "balance_loss_mlp": 1.01844978, + "epoch": 0.9752592815271306, + "flos": 23972238234720.0, + "grad_norm": 2.1116437678166062, + "language_loss": 0.7566607, + "learning_rate": 6.402082794559227e-09, + "loss": 0.77802229, + "num_input_tokens_seen": 350023595, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10369873, + "step": 16221, + "time_per_iteration": 2.7106239795684814 + }, + { + "auxiliary_loss_clip": 0.01105067, + "auxiliary_loss_mlp": 0.01025968, + "balance_loss_clip": 1.03560686, + "balance_loss_mlp": 1.01537061, + "epoch": 0.9753194047797986, + "flos": 32567969077920.0, + "grad_norm": 1.5622312860146312, + "language_loss": 0.66391325, + "learning_rate": 6.370983537265395e-09, + "loss": 0.68522364, + "num_input_tokens_seen": 350045920, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.1060791, + "step": 16222, + "time_per_iteration": 2.6550040245056152 + }, + { + "auxiliary_loss_clip": 0.0110731, + "auxiliary_loss_mlp": 0.01028021, + "balance_loss_clip": 1.03618765, + "balance_loss_mlp": 1.01719701, + "epoch": 0.9753795280324665, + "flos": 28335448503360.0, + "grad_norm": 1.8656868376904876, + "language_loss": 0.88459504, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.90594834, + "num_input_tokens_seen": 350063925, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.1083374, + "step": 16223, + "time_per_iteration": 2.642735719680786 + }, + { + "auxiliary_loss_clip": 0.01106315, + "auxiliary_loss_mlp": 0.01026781, + "balance_loss_clip": 1.03574049, + "balance_loss_mlp": 1.01623678, + "epoch": 0.9754396512851345, + "flos": 23749708567680.0, + "grad_norm": 2.3124551028930553, + "language_loss": 0.74656612, + "learning_rate": 6.309011819690457e-09, + "loss": 0.76789707, + "num_input_tokens_seen": 350080900, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10540771, + "step": 16224, + "time_per_iteration": 2.603011131286621 + }, + { + "auxiliary_loss_clip": 0.01027159, + "auxiliary_loss_mlp": 0.01001451, + "balance_loss_clip": 1.00479496, + "balance_loss_mlp": 1.00055134, + "epoch": 0.9754997745378025, + "flos": 83534845911840.0, + "grad_norm": 0.8050342818181446, + "language_loss": 0.5906347, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61092079, + "num_input_tokens_seen": 350144550, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.00900269, + "step": 16225, + "time_per_iteration": 3.2561252117156982 + }, + { + "auxiliary_loss_clip": 0.01108189, + "auxiliary_loss_mlp": 0.01034474, + "balance_loss_clip": 1.0379169, + "balance_loss_mlp": 1.02378702, + "epoch": 0.9755598977904705, + "flos": 32207942813760.0, + "grad_norm": 1.8821463548551802, + "language_loss": 0.68906885, + "learning_rate": 6.247342505960818e-09, + "loss": 0.71049547, + "num_input_tokens_seen": 350164050, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10687256, + "step": 16226, + "time_per_iteration": 2.6417768001556396 + }, + { + "auxiliary_loss_clip": 0.0110912, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.03723288, + "balance_loss_mlp": 1.02675867, + "epoch": 0.9756200210431384, + "flos": 20276616967200.0, + "grad_norm": 1.7835591342843296, + "language_loss": 0.82955384, + "learning_rate": 6.216621253462894e-09, + "loss": 0.85102558, + "num_input_tokens_seen": 350181350, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11291504, + "step": 16227, + "time_per_iteration": 2.609653949737549 + }, + { + "auxiliary_loss_clip": 0.01108041, + "auxiliary_loss_mlp": 0.01027474, + "balance_loss_clip": 1.03739631, + "balance_loss_mlp": 1.01707315, + "epoch": 0.9756801442958064, + "flos": 28825111012320.0, + "grad_norm": 1.776031591500618, + "language_loss": 0.7776913, + "learning_rate": 6.185975605430549e-09, + "loss": 0.79904646, + "num_input_tokens_seen": 350199765, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10400391, + "step": 16228, + "time_per_iteration": 2.721177577972412 + }, + { + "auxiliary_loss_clip": 0.01027019, + "auxiliary_loss_mlp": 0.01001484, + "balance_loss_clip": 1.00468755, + "balance_loss_mlp": 1.00057137, + "epoch": 0.9757402675484744, + "flos": 75197050488000.0, + "grad_norm": 0.8405733360531491, + "language_loss": 0.55775547, + "learning_rate": 6.155405563025962e-09, + "loss": 0.57804054, + "num_input_tokens_seen": 350256420, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.0091095, + "step": 16229, + "time_per_iteration": 3.199110984802246 + }, + { + "auxiliary_loss_clip": 0.01110388, + "auxiliary_loss_mlp": 0.01030401, + "balance_loss_clip": 1.03844714, + "balance_loss_mlp": 1.0188849, + "epoch": 0.9758003908011423, + "flos": 29357229280320.0, + "grad_norm": 1.8649631070452337, + "language_loss": 0.75069308, + "learning_rate": 6.124911127407984e-09, + "loss": 0.77210093, + "num_input_tokens_seen": 350276270, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11499023, + "step": 16230, + "time_per_iteration": 4.092018365859985 + }, + { + "auxiliary_loss_clip": 0.01104569, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.03622377, + "balance_loss_mlp": 1.01909065, + "epoch": 0.9758605140538104, + "flos": 21344905231200.0, + "grad_norm": 3.155497929826093, + "language_loss": 0.7173928, + "learning_rate": 6.094492299733245e-09, + "loss": 0.73873675, + "num_input_tokens_seen": 350295000, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.10723877, + "step": 16231, + "time_per_iteration": 2.5851616859436035 + }, + { + "auxiliary_loss_clip": 0.01112923, + "auxiliary_loss_mlp": 0.01026036, + "balance_loss_clip": 1.03898239, + "balance_loss_mlp": 1.01473498, + "epoch": 0.9759206373064783, + "flos": 30294220091040.0, + "grad_norm": 2.006400764229657, + "language_loss": 0.76457185, + "learning_rate": 6.064149081155267e-09, + "loss": 0.78596139, + "num_input_tokens_seen": 350314980, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11309814, + "step": 16232, + "time_per_iteration": 2.6578598022460938 + }, + { + "auxiliary_loss_clip": 0.01027255, + "auxiliary_loss_mlp": 0.01001168, + "balance_loss_clip": 1.00486994, + "balance_loss_mlp": 1.00022292, + "epoch": 0.9759807605591463, + "flos": 83171618782560.0, + "grad_norm": 0.7405429139421457, + "language_loss": 0.53784859, + "learning_rate": 6.033881472824465e-09, + "loss": 0.55813283, + "num_input_tokens_seen": 350371985, + "router_z_loss_clip": 0.22424316, + "router_z_loss_mlp": 0.00943756, + "step": 16233, + "time_per_iteration": 3.1497793197631836 + }, + { + "auxiliary_loss_clip": 0.01109719, + "auxiliary_loss_mlp": 0.01031925, + "balance_loss_clip": 1.0372051, + "balance_loss_mlp": 1.02109516, + "epoch": 0.9760408838118142, + "flos": 23170596605280.0, + "grad_norm": 1.8956143679711517, + "language_loss": 0.71500254, + "learning_rate": 6.003689475888807e-09, + "loss": 0.73641896, + "num_input_tokens_seen": 350390590, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.1083374, + "step": 16234, + "time_per_iteration": 2.5823233127593994 + }, + { + "auxiliary_loss_clip": 0.01112244, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.03707623, + "balance_loss_mlp": 1.01723123, + "epoch": 0.9761010070644822, + "flos": 20897131239360.0, + "grad_norm": 2.4790129455514274, + "language_loss": 0.78806973, + "learning_rate": 5.973573091493156e-09, + "loss": 0.80948162, + "num_input_tokens_seen": 350403770, + "router_z_loss_clip": 0.75195312, + "router_z_loss_mlp": 0.11730957, + "step": 16235, + "time_per_iteration": 2.6157965660095215 + }, + { + "auxiliary_loss_clip": 0.01109624, + "auxiliary_loss_mlp": 0.0103031, + "balance_loss_clip": 1.03831112, + "balance_loss_mlp": 1.01823378, + "epoch": 0.9761611303171501, + "flos": 26908876218240.0, + "grad_norm": 2.29899072817952, + "language_loss": 0.76800102, + "learning_rate": 5.943532320779265e-09, + "loss": 0.78940034, + "num_input_tokens_seen": 350421870, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.12072754, + "step": 16236, + "time_per_iteration": 2.628627061843872 + }, + { + "auxiliary_loss_clip": 0.01108248, + "auxiliary_loss_mlp": 0.01023133, + "balance_loss_clip": 1.03727078, + "balance_loss_mlp": 1.01238668, + "epoch": 0.9762212535698181, + "flos": 26549093057760.0, + "grad_norm": 1.6840335392412837, + "language_loss": 0.75412142, + "learning_rate": 5.913567164886446e-09, + "loss": 0.77543521, + "num_input_tokens_seen": 350440025, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10748291, + "step": 16237, + "time_per_iteration": 2.7282440662384033 + }, + { + "auxiliary_loss_clip": 0.01110645, + "auxiliary_loss_mlp": 0.01031968, + "balance_loss_clip": 1.03675175, + "balance_loss_mlp": 1.01927257, + "epoch": 0.9762813768224861, + "flos": 31630005852480.0, + "grad_norm": 2.8259924062598665, + "language_loss": 0.72852767, + "learning_rate": 5.8836776249509e-09, + "loss": 0.74995375, + "num_input_tokens_seen": 350459435, + "router_z_loss_clip": 0.73876953, + "router_z_loss_mlp": 0.12701416, + "step": 16238, + "time_per_iteration": 2.666445732116699 + }, + { + "auxiliary_loss_clip": 0.01108005, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.0369978, + "balance_loss_mlp": 1.01868391, + "epoch": 0.9763415000751541, + "flos": 29349206858880.0, + "grad_norm": 2.25504675382565, + "language_loss": 0.83525193, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.85663444, + "num_input_tokens_seen": 350472655, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11566162, + "step": 16239, + "time_per_iteration": 2.6586687564849854 + }, + { + "auxiliary_loss_clip": 0.01109824, + "auxiliary_loss_mlp": 0.01028831, + "balance_loss_clip": 1.03802276, + "balance_loss_mlp": 1.01751852, + "epoch": 0.976401623327822, + "flos": 20765995855200.0, + "grad_norm": 2.6095796865011205, + "language_loss": 0.60061729, + "learning_rate": 5.824125397483115e-09, + "loss": 0.62200391, + "num_input_tokens_seen": 350488160, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11309814, + "step": 16240, + "time_per_iteration": 2.5956387519836426 + }, + { + "auxiliary_loss_clip": 0.01109342, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.03911686, + "balance_loss_mlp": 1.02102256, + "epoch": 0.97646174658049, + "flos": 19653388037280.0, + "grad_norm": 2.0977817750865704, + "language_loss": 0.82410502, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.84551299, + "num_input_tokens_seen": 350506065, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10449219, + "step": 16241, + "time_per_iteration": 2.6057610511779785 + }, + { + "auxiliary_loss_clip": 0.0110969, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.03861511, + "balance_loss_mlp": 1.02381945, + "epoch": 0.9765218698331579, + "flos": 25932549479040.0, + "grad_norm": 1.9719169732448851, + "language_loss": 0.83347189, + "learning_rate": 5.764875647408463e-09, + "loss": 0.85491395, + "num_input_tokens_seen": 350524495, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.10699463, + "step": 16242, + "time_per_iteration": 4.0855865478515625 + }, + { + "auxiliary_loss_clip": 0.01110398, + "auxiliary_loss_mlp": 0.01027903, + "balance_loss_clip": 1.03827643, + "balance_loss_mlp": 1.01667976, + "epoch": 0.9765819930858259, + "flos": 22681015130880.0, + "grad_norm": 2.1752415712596043, + "language_loss": 0.75475228, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.77613533, + "num_input_tokens_seen": 350544185, + "router_z_loss_clip": 0.72070312, + "router_z_loss_mlp": 0.11218262, + "step": 16243, + "time_per_iteration": 2.615936756134033 + }, + { + "auxiliary_loss_clip": 0.01110541, + "auxiliary_loss_mlp": 0.01032105, + "balance_loss_clip": 1.03757238, + "balance_loss_mlp": 1.02046359, + "epoch": 0.976642116338494, + "flos": 24729560310240.0, + "grad_norm": 1.6288285259025908, + "language_loss": 0.70226598, + "learning_rate": 5.705928383713754e-09, + "loss": 0.72369242, + "num_input_tokens_seen": 350562675, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11645508, + "step": 16244, + "time_per_iteration": 3.9800472259521484 + }, + { + "auxiliary_loss_clip": 0.01113083, + "auxiliary_loss_mlp": 0.01028776, + "balance_loss_clip": 1.03986764, + "balance_loss_mlp": 1.01723051, + "epoch": 0.9767022395911619, + "flos": 31177693925280.0, + "grad_norm": 1.8819481454847045, + "language_loss": 0.83424401, + "learning_rate": 5.676568187055197e-09, + "loss": 0.85566258, + "num_input_tokens_seen": 350581535, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11535645, + "step": 16245, + "time_per_iteration": 2.650968074798584 + }, + { + "auxiliary_loss_clip": 0.01105457, + "auxiliary_loss_mlp": 0.01024325, + "balance_loss_clip": 1.0354991, + "balance_loss_mlp": 1.01365018, + "epoch": 0.9767623628438299, + "flos": 26554887028800.0, + "grad_norm": 1.3280264349782047, + "language_loss": 0.78562903, + "learning_rate": 5.647283615340726e-09, + "loss": 0.80692685, + "num_input_tokens_seen": 350601615, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10681152, + "step": 16246, + "time_per_iteration": 2.6149795055389404 + }, + { + "auxiliary_loss_clip": 0.01102285, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.03623736, + "balance_loss_mlp": 1.02004504, + "epoch": 0.9768224860964978, + "flos": 19341246847680.0, + "grad_norm": 1.4806022753160537, + "language_loss": 0.74141884, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.76273882, + "num_input_tokens_seen": 350619580, + "router_z_loss_clip": 0.66064453, + "router_z_loss_mlp": 0.09667969, + "step": 16247, + "time_per_iteration": 2.6523549556732178 + }, + { + "auxiliary_loss_clip": 0.01109481, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.03736484, + "balance_loss_mlp": 1.01783872, + "epoch": 0.9768826093491658, + "flos": 30690219349440.0, + "grad_norm": 1.640049548571647, + "language_loss": 0.79846632, + "learning_rate": 5.58894135118404e-09, + "loss": 0.81985164, + "num_input_tokens_seen": 350640015, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.11212158, + "step": 16248, + "time_per_iteration": 2.651881217956543 + }, + { + "auxiliary_loss_clip": 0.01117358, + "auxiliary_loss_mlp": 0.01043773, + "balance_loss_clip": 1.04199123, + "balance_loss_mlp": 1.03162563, + "epoch": 0.9769427326018337, + "flos": 28024522832160.0, + "grad_norm": 2.285135581789798, + "language_loss": 0.78917009, + "learning_rate": 5.559883660954278e-09, + "loss": 0.81078136, + "num_input_tokens_seen": 350659155, + "router_z_loss_clip": 0.75390625, + "router_z_loss_mlp": 0.12145996, + "step": 16249, + "time_per_iteration": 2.7330496311187744 + }, + { + "auxiliary_loss_clip": 0.01106631, + "auxiliary_loss_mlp": 0.0103249, + "balance_loss_clip": 1.03755569, + "balance_loss_mlp": 1.02145672, + "epoch": 0.9770028558545018, + "flos": 18625448633760.0, + "grad_norm": 2.348891378703474, + "language_loss": 0.66757774, + "learning_rate": 5.530901600093507e-09, + "loss": 0.68896896, + "num_input_tokens_seen": 350676615, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.11029053, + "step": 16250, + "time_per_iteration": 2.646174430847168 + }, + { + "auxiliary_loss_clip": 0.01027135, + "auxiliary_loss_mlp": 0.01001176, + "balance_loss_clip": 1.00474548, + "balance_loss_mlp": 1.00022268, + "epoch": 0.9770629791071697, + "flos": 87184810555200.0, + "grad_norm": 1.1262676270023728, + "language_loss": 0.59866846, + "learning_rate": 5.501995169700846e-09, + "loss": 0.61895156, + "num_input_tokens_seen": 350736805, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.00952148, + "step": 16251, + "time_per_iteration": 3.335550308227539 + }, + { + "auxiliary_loss_clip": 0.01108096, + "auxiliary_loss_mlp": 0.01029116, + "balance_loss_clip": 1.03658366, + "balance_loss_mlp": 1.01777291, + "epoch": 0.9771231023598377, + "flos": 27348911409600.0, + "grad_norm": 1.710109571775923, + "language_loss": 0.7843774, + "learning_rate": 5.473164370872307e-09, + "loss": 0.80574954, + "num_input_tokens_seen": 350753600, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11334229, + "step": 16252, + "time_per_iteration": 2.6124260425567627 + }, + { + "auxiliary_loss_clip": 0.01107546, + "auxiliary_loss_mlp": 0.01028516, + "balance_loss_clip": 1.0368042, + "balance_loss_mlp": 1.01738811, + "epoch": 0.9771832256125056, + "flos": 23215361849280.0, + "grad_norm": 2.915496346792537, + "language_loss": 0.65436327, + "learning_rate": 5.444409204701461e-09, + "loss": 0.67572391, + "num_input_tokens_seen": 350771225, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11114502, + "step": 16253, + "time_per_iteration": 2.601231575012207 + }, + { + "auxiliary_loss_clip": 0.01113671, + "auxiliary_loss_mlp": 0.01030594, + "balance_loss_clip": 1.04094052, + "balance_loss_mlp": 1.01788652, + "epoch": 0.9772433488651736, + "flos": 21746374322400.0, + "grad_norm": 2.9232115957127993, + "language_loss": 0.76593328, + "learning_rate": 5.415729672278324e-09, + "loss": 0.78737587, + "num_input_tokens_seen": 350789100, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.12701416, + "step": 16254, + "time_per_iteration": 4.0800745487213135 + }, + { + "auxiliary_loss_clip": 0.01111872, + "auxiliary_loss_mlp": 0.01030738, + "balance_loss_clip": 1.03797626, + "balance_loss_mlp": 1.01953852, + "epoch": 0.9773034721178415, + "flos": 45917561167200.0, + "grad_norm": 1.9167501764635582, + "language_loss": 0.63784897, + "learning_rate": 5.387125774690471e-09, + "loss": 0.65927511, + "num_input_tokens_seen": 350811085, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11199951, + "step": 16255, + "time_per_iteration": 2.745478391647339 + }, + { + "auxiliary_loss_clip": 0.01113966, + "auxiliary_loss_mlp": 0.01032482, + "balance_loss_clip": 1.03847694, + "balance_loss_mlp": 1.02008414, + "epoch": 0.9773635953705095, + "flos": 24773110035840.0, + "grad_norm": 2.3723892469149943, + "language_loss": 0.75808704, + "learning_rate": 5.358597513023033e-09, + "loss": 0.77955151, + "num_input_tokens_seen": 350831065, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12402344, + "step": 16256, + "time_per_iteration": 2.649350643157959 + }, + { + "auxiliary_loss_clip": 0.01107663, + "auxiliary_loss_mlp": 0.0103165, + "balance_loss_clip": 1.03870809, + "balance_loss_mlp": 1.02014065, + "epoch": 0.9774237186231776, + "flos": 27221949305280.0, + "grad_norm": 2.081532118316586, + "language_loss": 0.78162473, + "learning_rate": 5.330144888357369e-09, + "loss": 0.80301791, + "num_input_tokens_seen": 350849675, + "router_z_loss_clip": 0.68896484, + "router_z_loss_mlp": 0.11517334, + "step": 16257, + "time_per_iteration": 2.6379871368408203 + }, + { + "auxiliary_loss_clip": 0.01108785, + "auxiliary_loss_mlp": 0.01030679, + "balance_loss_clip": 1.0379796, + "balance_loss_mlp": 1.01934862, + "epoch": 0.9774838418758455, + "flos": 29535317841600.0, + "grad_norm": 1.9796389285873799, + "language_loss": 0.75431073, + "learning_rate": 5.301767901772391e-09, + "loss": 0.7757054, + "num_input_tokens_seen": 350868955, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.11334229, + "step": 16258, + "time_per_iteration": 2.7852628231048584 + }, + { + "auxiliary_loss_clip": 0.01027177, + "auxiliary_loss_mlp": 0.01001588, + "balance_loss_clip": 1.00481749, + "balance_loss_mlp": 1.00064349, + "epoch": 0.9775439651285135, + "flos": 80968322995200.0, + "grad_norm": 0.6717416645328683, + "language_loss": 0.59756422, + "learning_rate": 5.273466554344353e-09, + "loss": 0.61785191, + "num_input_tokens_seen": 350935110, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00942993, + "step": 16259, + "time_per_iteration": 3.3907814025878906 + }, + { + "auxiliary_loss_clip": 0.01113126, + "auxiliary_loss_mlp": 0.01029534, + "balance_loss_clip": 1.03808713, + "balance_loss_mlp": 1.01762486, + "epoch": 0.9776040883811814, + "flos": 27578612635200.0, + "grad_norm": 1.676660475366586, + "language_loss": 0.73411632, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.75554293, + "num_input_tokens_seen": 350953220, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11907959, + "step": 16260, + "time_per_iteration": 2.628465414047241 + }, + { + "auxiliary_loss_clip": 0.01111049, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.03777826, + "balance_loss_mlp": 1.02233744, + "epoch": 0.9776642116338494, + "flos": 22503007604160.0, + "grad_norm": 2.0634009276760032, + "language_loss": 0.79264498, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.81409276, + "num_input_tokens_seen": 350971915, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.1138916, + "step": 16261, + "time_per_iteration": 2.6628425121307373 + }, + { + "auxiliary_loss_clip": 0.01110024, + "auxiliary_loss_mlp": 0.0102579, + "balance_loss_clip": 1.03684282, + "balance_loss_mlp": 1.0145483, + "epoch": 0.9777243348865173, + "flos": 27623580465600.0, + "grad_norm": 17.416089990639502, + "language_loss": 0.7418164, + "learning_rate": 5.189016357718845e-09, + "loss": 0.76317453, + "num_input_tokens_seen": 350990470, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11236572, + "step": 16262, + "time_per_iteration": 2.6425023078918457 + }, + { + "auxiliary_loss_clip": 0.01110832, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.0378139, + "balance_loss_mlp": 1.01918864, + "epoch": 0.9777844581391854, + "flos": 38219242102560.0, + "grad_norm": 2.9394779817106875, + "language_loss": 0.6977337, + "learning_rate": 5.16101757762133e-09, + "loss": 0.71915698, + "num_input_tokens_seen": 351010755, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.12329102, + "step": 16263, + "time_per_iteration": 2.7328128814697266 + }, + { + "auxiliary_loss_clip": 0.01110917, + "auxiliary_loss_mlp": 0.01027713, + "balance_loss_clip": 1.03809512, + "balance_loss_mlp": 1.01704335, + "epoch": 0.9778445813918533, + "flos": 28112554180800.0, + "grad_norm": 1.8756270634159746, + "language_loss": 0.66249806, + "learning_rate": 5.133094442018038e-09, + "loss": 0.68388432, + "num_input_tokens_seen": 351029965, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.10668945, + "step": 16264, + "time_per_iteration": 2.634570360183716 + }, + { + "auxiliary_loss_clip": 0.01114994, + "auxiliary_loss_mlp": 0.01028443, + "balance_loss_clip": 1.03912592, + "balance_loss_mlp": 1.01647484, + "epoch": 0.9779047046445213, + "flos": 21434273650080.0, + "grad_norm": 2.8029334551266603, + "language_loss": 0.72981226, + "learning_rate": 5.105246951967679e-09, + "loss": 0.75124663, + "num_input_tokens_seen": 351046205, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.11962891, + "step": 16265, + "time_per_iteration": 2.610511064529419 + }, + { + "auxiliary_loss_clip": 0.01106605, + "auxiliary_loss_mlp": 0.01029488, + "balance_loss_clip": 1.03660035, + "balance_loss_mlp": 1.01828218, + "epoch": 0.9779648278971892, + "flos": 25308348134400.0, + "grad_norm": 2.3486518164163246, + "language_loss": 0.68232745, + "learning_rate": 5.077475108526297e-09, + "loss": 0.70368838, + "num_input_tokens_seen": 351065390, + "router_z_loss_clip": 0.70068359, + "router_z_loss_mlp": 0.11206055, + "step": 16266, + "time_per_iteration": 2.5984385013580322 + }, + { + "auxiliary_loss_clip": 0.01105727, + "auxiliary_loss_mlp": 0.01025499, + "balance_loss_clip": 1.03723049, + "balance_loss_mlp": 1.01550925, + "epoch": 0.9780249511498572, + "flos": 25657677836640.0, + "grad_norm": 1.62267883436185, + "language_loss": 0.86809129, + "learning_rate": 5.049778912747049e-09, + "loss": 0.88940358, + "num_input_tokens_seen": 351084355, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.09985352, + "step": 16267, + "time_per_iteration": 2.7002227306365967 + }, + { + "auxiliary_loss_clip": 0.01110974, + "auxiliary_loss_mlp": 0.01027328, + "balance_loss_clip": 1.0372895, + "balance_loss_mlp": 1.01550877, + "epoch": 0.9780850744025251, + "flos": 37551531549600.0, + "grad_norm": 1.9600324866432868, + "language_loss": 0.7013582, + "learning_rate": 5.022158365679985e-09, + "loss": 0.72274125, + "num_input_tokens_seen": 351105870, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.11816406, + "step": 16268, + "time_per_iteration": 2.6900362968444824 + }, + { + "auxiliary_loss_clip": 0.01110674, + "auxiliary_loss_mlp": 0.01027145, + "balance_loss_clip": 1.03818679, + "balance_loss_mlp": 1.01633251, + "epoch": 0.9781451976551931, + "flos": 24773717795040.0, + "grad_norm": 1.5217723841681712, + "language_loss": 0.73748475, + "learning_rate": 4.994613468372711e-09, + "loss": 0.75886297, + "num_input_tokens_seen": 351124760, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.1081543, + "step": 16269, + "time_per_iteration": 2.6246225833892822 + }, + { + "auxiliary_loss_clip": 0.01111764, + "auxiliary_loss_mlp": 0.01028973, + "balance_loss_clip": 1.03907883, + "balance_loss_mlp": 1.01692736, + "epoch": 0.9782053209078612, + "flos": 29671274782080.0, + "grad_norm": 1.7968041276338407, + "language_loss": 0.70759386, + "learning_rate": 4.967144221869501e-09, + "loss": 0.72900128, + "num_input_tokens_seen": 351142820, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.1204834, + "step": 16270, + "time_per_iteration": 4.0593421459198 + }, + { + "auxiliary_loss_clip": 0.01110951, + "auxiliary_loss_mlp": 0.01030799, + "balance_loss_clip": 1.03841567, + "balance_loss_mlp": 1.01979613, + "epoch": 0.9782654441605291, + "flos": 39644517834720.0, + "grad_norm": 1.8214407652245552, + "language_loss": 0.64150947, + "learning_rate": 4.939750627212191e-09, + "loss": 0.66292697, + "num_input_tokens_seen": 351164805, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10998535, + "step": 16271, + "time_per_iteration": 2.7385873794555664 + }, + { + "auxiliary_loss_clip": 0.01107007, + "auxiliary_loss_mlp": 0.01029958, + "balance_loss_clip": 1.03820705, + "balance_loss_mlp": 1.01844192, + "epoch": 0.9783255674131971, + "flos": 32920459128000.0, + "grad_norm": 2.0295856290700183, + "language_loss": 0.70416349, + "learning_rate": 4.912432685439505e-09, + "loss": 0.72553307, + "num_input_tokens_seen": 351187005, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 0.11517334, + "step": 16272, + "time_per_iteration": 2.6983141899108887 + }, + { + "auxiliary_loss_clip": 0.01112209, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.03937101, + "balance_loss_mlp": 1.02130866, + "epoch": 0.978385690665865, + "flos": 28201801047840.0, + "grad_norm": 1.7470793576212953, + "language_loss": 0.66809672, + "learning_rate": 4.88519039758728e-09, + "loss": 0.68954325, + "num_input_tokens_seen": 351208450, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11138916, + "step": 16273, + "time_per_iteration": 2.62813138961792 + }, + { + "auxiliary_loss_clip": 0.01109327, + "auxiliary_loss_mlp": 0.01021256, + "balance_loss_clip": 1.03742957, + "balance_loss_mlp": 1.00972867, + "epoch": 0.978445813918533, + "flos": 31005399335040.0, + "grad_norm": 1.8697403119788847, + "language_loss": 0.73930573, + "learning_rate": 4.85802376468869e-09, + "loss": 0.76061159, + "num_input_tokens_seen": 351229585, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11541748, + "step": 16274, + "time_per_iteration": 2.6730852127075195 + }, + { + "auxiliary_loss_clip": 0.01110487, + "auxiliary_loss_mlp": 0.01027724, + "balance_loss_clip": 1.0393424, + "balance_loss_mlp": 1.01728141, + "epoch": 0.9785059371712009, + "flos": 28736228800800.0, + "grad_norm": 1.6912881804457445, + "language_loss": 0.7753396, + "learning_rate": 4.830932787773579e-09, + "loss": 0.7967217, + "num_input_tokens_seen": 351249525, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10443115, + "step": 16275, + "time_per_iteration": 2.626640796661377 + }, + { + "auxiliary_loss_clip": 0.01112902, + "auxiliary_loss_mlp": 0.0102733, + "balance_loss_clip": 1.03952634, + "balance_loss_mlp": 1.01582658, + "epoch": 0.978566060423869, + "flos": 41916281474880.0, + "grad_norm": 1.9188621137220583, + "language_loss": 0.70535016, + "learning_rate": 4.803917467869567e-09, + "loss": 0.72675252, + "num_input_tokens_seen": 351272530, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.1149292, + "step": 16276, + "time_per_iteration": 2.7701034545898438 + }, + { + "auxiliary_loss_clip": 0.01105973, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.03598547, + "balance_loss_mlp": 1.01936853, + "epoch": 0.9786261836765369, + "flos": 14176719087840.0, + "grad_norm": 2.3181972763150602, + "language_loss": 0.85674465, + "learning_rate": 4.776977806000726e-09, + "loss": 0.87810278, + "num_input_tokens_seen": 351288530, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.10467529, + "step": 16277, + "time_per_iteration": 2.5855047702789307 + }, + { + "auxiliary_loss_clip": 0.01108406, + "auxiliary_loss_mlp": 0.01024367, + "balance_loss_clip": 1.03771627, + "balance_loss_mlp": 1.01339912, + "epoch": 0.9786863069292049, + "flos": 21256792848000.0, + "grad_norm": 1.8683778778331128, + "language_loss": 0.7070812, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.72840893, + "num_input_tokens_seen": 351305890, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10974121, + "step": 16278, + "time_per_iteration": 2.635801076889038 + }, + { + "auxiliary_loss_clip": 0.01105073, + "auxiliary_loss_mlp": 0.01026057, + "balance_loss_clip": 1.03454494, + "balance_loss_mlp": 1.01451719, + "epoch": 0.9787464301818728, + "flos": 25435512825120.0, + "grad_norm": 2.1000321421812593, + "language_loss": 0.84453064, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86584193, + "num_input_tokens_seen": 351325010, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.11535645, + "step": 16279, + "time_per_iteration": 2.6437597274780273 + }, + { + "auxiliary_loss_clip": 0.01108659, + "auxiliary_loss_mlp": 0.01028692, + "balance_loss_clip": 1.03630102, + "balance_loss_mlp": 1.01702094, + "epoch": 0.9788065534345408, + "flos": 22235510106720.0, + "grad_norm": 1.7561727623834247, + "language_loss": 0.79005009, + "learning_rate": 4.696612778808395e-09, + "loss": 0.81142354, + "num_input_tokens_seen": 351343060, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11676025, + "step": 16280, + "time_per_iteration": 2.6765987873077393 + }, + { + "auxiliary_loss_clip": 0.01105008, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.03732121, + "balance_loss_mlp": 1.02127504, + "epoch": 0.9788666766872087, + "flos": 26330007359520.0, + "grad_norm": 2.530262819816376, + "language_loss": 0.79576325, + "learning_rate": 4.669975759268085e-09, + "loss": 0.81712872, + "num_input_tokens_seen": 351363260, + "router_z_loss_clip": 0.67626953, + "router_z_loss_mlp": 0.10266113, + "step": 16281, + "time_per_iteration": 4.07382607460022 + }, + { + "auxiliary_loss_clip": 0.01110537, + "auxiliary_loss_mlp": 0.01029787, + "balance_loss_clip": 1.03711796, + "balance_loss_mlp": 1.01803279, + "epoch": 0.9789267999398767, + "flos": 30383912648160.0, + "grad_norm": 2.194188152227817, + "language_loss": 0.80392301, + "learning_rate": 4.643414402842216e-09, + "loss": 0.8253262, + "num_input_tokens_seen": 351382610, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11749268, + "step": 16282, + "time_per_iteration": 2.6582939624786377 + }, + { + "auxiliary_loss_clip": 0.01109862, + "auxiliary_loss_mlp": 0.01036784, + "balance_loss_clip": 1.03776968, + "balance_loss_mlp": 1.02583504, + "epoch": 0.9789869231925448, + "flos": 23883072402240.0, + "grad_norm": 2.1983887343064588, + "language_loss": 0.83385611, + "learning_rate": 4.616928710538204e-09, + "loss": 0.85532254, + "num_input_tokens_seen": 351401075, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10955811, + "step": 16283, + "time_per_iteration": 2.6964266300201416 + }, + { + "auxiliary_loss_clip": 0.01109882, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.03746974, + "balance_loss_mlp": 1.01909161, + "epoch": 0.9790470464452127, + "flos": 20495338009920.0, + "grad_norm": 1.8485686403966224, + "language_loss": 0.72061622, + "learning_rate": 4.590518683360134e-09, + "loss": 0.74201846, + "num_input_tokens_seen": 351419275, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11230469, + "step": 16284, + "time_per_iteration": 3.965088367462158 + }, + { + "auxiliary_loss_clip": 0.0110868, + "auxiliary_loss_mlp": 0.01030886, + "balance_loss_clip": 1.03876829, + "balance_loss_mlp": 1.02034795, + "epoch": 0.9791071696978807, + "flos": 22414125392640.0, + "grad_norm": 2.452395045074463, + "language_loss": 0.6425572, + "learning_rate": 4.56418432230965e-09, + "loss": 0.66395283, + "num_input_tokens_seen": 351437375, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 0.10540771, + "step": 16285, + "time_per_iteration": 2.607041120529175 + }, + { + "auxiliary_loss_clip": 0.01107513, + "auxiliary_loss_mlp": 0.01028013, + "balance_loss_clip": 1.03710485, + "balance_loss_mlp": 1.01711094, + "epoch": 0.9791672929505486, + "flos": 29492943117120.0, + "grad_norm": 1.5523995092405938, + "language_loss": 0.70682245, + "learning_rate": 4.537925628385286e-09, + "loss": 0.72817773, + "num_input_tokens_seen": 351457810, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10900879, + "step": 16286, + "time_per_iteration": 2.6745033264160156 + }, + { + "auxiliary_loss_clip": 0.01106218, + "auxiliary_loss_mlp": 0.01026113, + "balance_loss_clip": 1.03622401, + "balance_loss_mlp": 1.01571763, + "epoch": 0.9792274162032166, + "flos": 29717093475360.0, + "grad_norm": 1.6837929584994353, + "language_loss": 0.58377671, + "learning_rate": 4.511742602582691e-09, + "loss": 0.60510004, + "num_input_tokens_seen": 351478825, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10394287, + "step": 16287, + "time_per_iteration": 2.6527740955352783 + }, + { + "auxiliary_loss_clip": 0.01109251, + "auxiliary_loss_mlp": 0.01032797, + "balance_loss_clip": 1.03782582, + "balance_loss_mlp": 1.02152026, + "epoch": 0.9792875394558845, + "flos": 32208631607520.0, + "grad_norm": 1.5700146972336897, + "language_loss": 0.81556815, + "learning_rate": 4.485635245894626e-09, + "loss": 0.83698857, + "num_input_tokens_seen": 351498785, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11279297, + "step": 16288, + "time_per_iteration": 2.695185661315918 + }, + { + "auxiliary_loss_clip": 0.01109374, + "auxiliary_loss_mlp": 0.01024182, + "balance_loss_clip": 1.03634024, + "balance_loss_mlp": 1.01277983, + "epoch": 0.9793476627085526, + "flos": 34346747792160.0, + "grad_norm": 1.5598824215551128, + "language_loss": 0.71409786, + "learning_rate": 4.459603559311631e-09, + "loss": 0.7354334, + "num_input_tokens_seen": 351520235, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11401367, + "step": 16289, + "time_per_iteration": 2.69032883644104 + }, + { + "auxiliary_loss_clip": 0.01108176, + "auxiliary_loss_mlp": 0.01033167, + "balance_loss_clip": 1.03757417, + "balance_loss_mlp": 1.02168727, + "epoch": 0.9794077859612205, + "flos": 20455070184000.0, + "grad_norm": 7.670789829647185, + "language_loss": 0.74824864, + "learning_rate": 4.43364754382003e-09, + "loss": 0.76966202, + "num_input_tokens_seen": 351538900, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11486816, + "step": 16290, + "time_per_iteration": 2.710336685180664 + }, + { + "auxiliary_loss_clip": 0.01111331, + "auxiliary_loss_mlp": 0.01031187, + "balance_loss_clip": 1.03759646, + "balance_loss_mlp": 1.01929593, + "epoch": 0.9794679092138885, + "flos": 23526449589600.0, + "grad_norm": 1.5678722133479228, + "language_loss": 0.67521286, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.69663805, + "num_input_tokens_seen": 351558715, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11865234, + "step": 16291, + "time_per_iteration": 2.633681535720825 + }, + { + "auxiliary_loss_clip": 0.01112432, + "auxiliary_loss_mlp": 0.01025594, + "balance_loss_clip": 1.03773689, + "balance_loss_mlp": 1.0143292, + "epoch": 0.9795280324665564, + "flos": 39238834946400.0, + "grad_norm": 2.1354480289956665, + "language_loss": 0.63091749, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.65229774, + "num_input_tokens_seen": 351578450, + "router_z_loss_clip": 0.74707031, + "router_z_loss_mlp": 0.1126709, + "step": 16292, + "time_per_iteration": 2.735177993774414 + }, + { + "auxiliary_loss_clip": 0.01110355, + "auxiliary_loss_mlp": 0.01027849, + "balance_loss_clip": 1.03872085, + "balance_loss_mlp": 1.01718593, + "epoch": 0.9795881557192244, + "flos": 23258222781120.0, + "grad_norm": 2.359673359580551, + "language_loss": 0.73200893, + "learning_rate": 4.356233533724829e-09, + "loss": 0.75339097, + "num_input_tokens_seen": 351597195, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10656738, + "step": 16293, + "time_per_iteration": 4.16929817199707 + }, + { + "auxiliary_loss_clip": 0.01110181, + "auxiliary_loss_mlp": 0.01027928, + "balance_loss_clip": 1.03670585, + "balance_loss_mlp": 1.01677632, + "epoch": 0.9796482789718923, + "flos": 34568791251840.0, + "grad_norm": 2.0554852305059335, + "language_loss": 0.83920717, + "learning_rate": 4.330580212414503e-09, + "loss": 0.86058831, + "num_input_tokens_seen": 351617460, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11151123, + "step": 16294, + "time_per_iteration": 2.6675264835357666 + }, + { + "auxiliary_loss_clip": 0.01105512, + "auxiliary_loss_mlp": 0.01030097, + "balance_loss_clip": 1.03674662, + "balance_loss_mlp": 1.01979721, + "epoch": 0.9797084022245603, + "flos": 21924341331840.0, + "grad_norm": 5.854422983736405, + "language_loss": 0.71897972, + "learning_rate": 4.305002567088767e-09, + "loss": 0.74033582, + "num_input_tokens_seen": 351635900, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10302734, + "step": 16295, + "time_per_iteration": 2.6208877563476562 + }, + { + "auxiliary_loss_clip": 0.01113723, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.03867745, + "balance_loss_mlp": 1.02475286, + "epoch": 0.9797685254772284, + "flos": 24729438758400.0, + "grad_norm": 1.9832383832526683, + "language_loss": 0.8063364, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.82783258, + "num_input_tokens_seen": 351655400, + "router_z_loss_clip": 0.75146484, + "router_z_loss_mlp": 0.11132812, + "step": 16296, + "time_per_iteration": 2.6072909832000732 + }, + { + "auxiliary_loss_clip": 0.01107622, + "auxiliary_loss_mlp": 0.01032116, + "balance_loss_clip": 1.03700638, + "balance_loss_mlp": 1.02156639, + "epoch": 0.9798286487298963, + "flos": 32832589848480.0, + "grad_norm": 1.7865982226621129, + "language_loss": 0.75960338, + "learning_rate": 4.254074308266853e-09, + "loss": 0.78100079, + "num_input_tokens_seen": 351675505, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.10552979, + "step": 16297, + "time_per_iteration": 2.6793291568756104 + }, + { + "auxiliary_loss_clip": 0.01112507, + "auxiliary_loss_mlp": 0.01033299, + "balance_loss_clip": 1.03777909, + "balance_loss_mlp": 1.02224874, + "epoch": 0.9798887719825643, + "flos": 33142583622240.0, + "grad_norm": 1.72581891366255, + "language_loss": 0.78335059, + "learning_rate": 4.228723696702019e-09, + "loss": 0.80480862, + "num_input_tokens_seen": 351697920, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11053467, + "step": 16298, + "time_per_iteration": 2.680680513381958 + }, + { + "auxiliary_loss_clip": 0.01104634, + "auxiliary_loss_mlp": 0.01024197, + "balance_loss_clip": 1.03590298, + "balance_loss_mlp": 1.01356936, + "epoch": 0.9799488952352322, + "flos": 25219587474720.0, + "grad_norm": 1.6382088901176968, + "language_loss": 0.72494566, + "learning_rate": 4.203448764984019e-09, + "loss": 0.74623394, + "num_input_tokens_seen": 351717615, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.10626221, + "step": 16299, + "time_per_iteration": 2.6301982402801514 + }, + { + "auxiliary_loss_clip": 0.01111408, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.03722084, + "balance_loss_mlp": 1.01746464, + "epoch": 0.9800090184879002, + "flos": 26821614697920.0, + "grad_norm": 6.2322110745341455, + "language_loss": 0.8913812, + "learning_rate": 4.178249514071419e-09, + "loss": 0.9127847, + "num_input_tokens_seen": 351735260, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11480713, + "step": 16300, + "time_per_iteration": 2.6373164653778076 + }, + { + "auxiliary_loss_clip": 0.01111773, + "auxiliary_loss_mlp": 0.01029602, + "balance_loss_clip": 1.03753412, + "balance_loss_mlp": 1.01811028, + "epoch": 0.9800691417405681, + "flos": 25976180239200.0, + "grad_norm": 2.8137493559587217, + "language_loss": 0.78299034, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.80440414, + "num_input_tokens_seen": 351755800, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.1149292, + "step": 16301, + "time_per_iteration": 2.6485416889190674 + }, + { + "auxiliary_loss_clip": 0.01109922, + "auxiliary_loss_mlp": 0.01033508, + "balance_loss_clip": 1.03744555, + "balance_loss_mlp": 1.02239799, + "epoch": 0.9801292649932362, + "flos": 22500414498240.0, + "grad_norm": 2.2401524535246473, + "language_loss": 0.75181872, + "learning_rate": 4.128078058480921e-09, + "loss": 0.77325296, + "num_input_tokens_seen": 351774790, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.11114502, + "step": 16302, + "time_per_iteration": 2.61999773979187 + }, + { + "auxiliary_loss_clip": 0.01109161, + "auxiliary_loss_mlp": 0.01028624, + "balance_loss_clip": 1.03791571, + "balance_loss_mlp": 1.01707864, + "epoch": 0.9801893882459041, + "flos": 30561920174880.0, + "grad_norm": 2.134652902684229, + "language_loss": 0.79713476, + "learning_rate": 4.103105855705724e-09, + "loss": 0.81851262, + "num_input_tokens_seen": 351792855, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.11547852, + "step": 16303, + "time_per_iteration": 2.6431586742401123 + }, + { + "auxiliary_loss_clip": 0.01113199, + "auxiliary_loss_mlp": 0.01028508, + "balance_loss_clip": 1.0385716, + "balance_loss_mlp": 1.01670086, + "epoch": 0.9802495114985721, + "flos": 22587919122240.0, + "grad_norm": 2.9776496541969935, + "language_loss": 0.83792078, + "learning_rate": 4.078209337540883e-09, + "loss": 0.85933787, + "num_input_tokens_seen": 351811450, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11810303, + "step": 16304, + "time_per_iteration": 2.620410680770874 + }, + { + "auxiliary_loss_clip": 0.01102682, + "auxiliary_loss_mlp": 0.01025001, + "balance_loss_clip": 1.03519511, + "balance_loss_mlp": 1.01495123, + "epoch": 0.98030963475124, + "flos": 26197656456960.0, + "grad_norm": 3.0285187786918555, + "language_loss": 0.70542955, + "learning_rate": 4.053388504930089e-09, + "loss": 0.72670639, + "num_input_tokens_seen": 351831960, + "router_z_loss_clip": 0.67529297, + "router_z_loss_mlp": 0.1005249, + "step": 16305, + "time_per_iteration": 2.6386494636535645 + }, + { + "auxiliary_loss_clip": 0.01111477, + "auxiliary_loss_mlp": 0.0103106, + "balance_loss_clip": 1.03876019, + "balance_loss_mlp": 1.01991439, + "epoch": 0.980369758003908, + "flos": 24907081629600.0, + "grad_norm": 1.7050714613385254, + "language_loss": 0.7158711, + "learning_rate": 4.028643358815032e-09, + "loss": 0.73729652, + "num_input_tokens_seen": 351851585, + "router_z_loss_clip": 0.7265625, + "router_z_loss_mlp": 0.1114502, + "step": 16306, + "time_per_iteration": 2.753115653991699 + }, + { + "auxiliary_loss_clip": 0.01105154, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.03575087, + "balance_loss_mlp": 1.0216006, + "epoch": 0.9804298812565759, + "flos": 28552386785760.0, + "grad_norm": 1.5737161584110622, + "language_loss": 0.73482871, + "learning_rate": 4.00397390013385e-09, + "loss": 0.75620222, + "num_input_tokens_seen": 351871085, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.10583496, + "step": 16307, + "time_per_iteration": 2.6405255794525146 + }, + { + "auxiliary_loss_clip": 0.01103397, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.03635311, + "balance_loss_mlp": 1.01922524, + "epoch": 0.980490004509244, + "flos": 28422142781760.0, + "grad_norm": 1.738866518002676, + "language_loss": 0.75092775, + "learning_rate": 3.979380129822018e-09, + "loss": 0.77225566, + "num_input_tokens_seen": 351891775, + "router_z_loss_clip": 0.66992188, + "router_z_loss_mlp": 0.10168457, + "step": 16308, + "time_per_iteration": 2.659525156021118 + }, + { + "auxiliary_loss_clip": 0.01027131, + "auxiliary_loss_mlp": 0.01001231, + "balance_loss_clip": 1.00480127, + "balance_loss_mlp": 1.00030756, + "epoch": 0.980550127761912, + "flos": 68393590613280.0, + "grad_norm": 0.7528258488531752, + "language_loss": 0.57722539, + "learning_rate": 3.954862048811902e-09, + "loss": 0.59750903, + "num_input_tokens_seen": 351946770, + "router_z_loss_clip": 0.22314453, + "router_z_loss_mlp": 0.00921631, + "step": 16309, + "time_per_iteration": 4.533123254776001 + }, + { + "auxiliary_loss_clip": 0.01109045, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.03587866, + "balance_loss_mlp": 1.01853907, + "epoch": 0.9806102510145799, + "flos": 30912505912800.0, + "grad_norm": 1.9314727565435073, + "language_loss": 0.66547394, + "learning_rate": 3.930419658033646e-09, + "loss": 0.68686557, + "num_input_tokens_seen": 351966155, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11566162, + "step": 16310, + "time_per_iteration": 2.6491572856903076 + }, + { + "auxiliary_loss_clip": 0.01027117, + "auxiliary_loss_mlp": 0.01001397, + "balance_loss_clip": 1.00477576, + "balance_loss_mlp": 1.00041699, + "epoch": 0.9806703742672479, + "flos": 82087656681600.0, + "grad_norm": 1.023630358502157, + "language_loss": 0.54569554, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56598067, + "num_input_tokens_seen": 352031655, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00978851, + "step": 16311, + "time_per_iteration": 3.3203964233398438 + }, + { + "auxiliary_loss_clip": 0.01108159, + "auxiliary_loss_mlp": 0.01023461, + "balance_loss_clip": 1.03723669, + "balance_loss_mlp": 1.0128274, + "epoch": 0.9807304975199158, + "flos": 30784733462880.0, + "grad_norm": 1.6912442319985366, + "language_loss": 0.80214429, + "learning_rate": 3.881761950876638e-09, + "loss": 0.82346046, + "num_input_tokens_seen": 352051920, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10632324, + "step": 16312, + "time_per_iteration": 2.662057399749756 + }, + { + "auxiliary_loss_clip": 0.01108481, + "auxiliary_loss_mlp": 0.0102598, + "balance_loss_clip": 1.03816557, + "balance_loss_mlp": 1.01548994, + "epoch": 0.9807906207725838, + "flos": 21300139987200.0, + "grad_norm": 1.953079171758541, + "language_loss": 0.63883066, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.6601752, + "num_input_tokens_seen": 352069315, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.1048584, + "step": 16313, + "time_per_iteration": 2.6130199432373047 + }, + { + "auxiliary_loss_clip": 0.01107951, + "auxiliary_loss_mlp": 0.01030202, + "balance_loss_clip": 1.03751969, + "balance_loss_mlp": 1.01880622, + "epoch": 0.9808507440252517, + "flos": 25663836463200.0, + "grad_norm": 2.7491685401912624, + "language_loss": 0.72774643, + "learning_rate": 3.833407015731316e-09, + "loss": 0.74912792, + "num_input_tokens_seen": 352089480, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11395264, + "step": 16314, + "time_per_iteration": 2.7423083782196045 + }, + { + "auxiliary_loss_clip": 0.01027211, + "auxiliary_loss_mlp": 0.01001338, + "balance_loss_clip": 1.00486505, + "balance_loss_mlp": 1.00037909, + "epoch": 0.9809108672779198, + "flos": 78147545381280.0, + "grad_norm": 0.6862199475463624, + "language_loss": 0.51649392, + "learning_rate": 3.80934308995684e-09, + "loss": 0.5367794, + "num_input_tokens_seen": 352150000, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00958252, + "step": 16315, + "time_per_iteration": 3.2646937370300293 + }, + { + "auxiliary_loss_clip": 0.0110884, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.03608513, + "balance_loss_mlp": 1.01938391, + "epoch": 0.9809709905305877, + "flos": 27797414712480.0, + "grad_norm": 1.3818082857855132, + "language_loss": 0.6981777, + "learning_rate": 3.785354859932033e-09, + "loss": 0.71956372, + "num_input_tokens_seen": 352170990, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.1038208, + "step": 16316, + "time_per_iteration": 2.740565061569214 + }, + { + "auxiliary_loss_clip": 0.01109777, + "auxiliary_loss_mlp": 0.01025938, + "balance_loss_clip": 1.03626943, + "balance_loss_mlp": 1.01480997, + "epoch": 0.9810311137832557, + "flos": 45165060648000.0, + "grad_norm": 1.9774667602480525, + "language_loss": 0.55543876, + "learning_rate": 3.76144232656661e-09, + "loss": 0.57679588, + "num_input_tokens_seen": 352195335, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11132812, + "step": 16317, + "time_per_iteration": 2.740177631378174 + }, + { + "auxiliary_loss_clip": 0.01107622, + "auxiliary_loss_mlp": 0.01028559, + "balance_loss_clip": 1.0377394, + "balance_loss_mlp": 1.01836693, + "epoch": 0.9810912370359236, + "flos": 23081592841920.0, + "grad_norm": 1.9138626462946002, + "language_loss": 0.72872972, + "learning_rate": 3.737605490767404e-09, + "loss": 0.75009155, + "num_input_tokens_seen": 352214170, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.10192871, + "step": 16318, + "time_per_iteration": 2.639996290206909 + }, + { + "auxiliary_loss_clip": 0.01106491, + "auxiliary_loss_mlp": 0.01023933, + "balance_loss_clip": 1.03667736, + "balance_loss_mlp": 1.01360941, + "epoch": 0.9811513602885916, + "flos": 22502359327680.0, + "grad_norm": 2.106603091874176, + "language_loss": 0.82270402, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.84400821, + "num_input_tokens_seen": 352231470, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.10314941, + "step": 16319, + "time_per_iteration": 2.583981990814209 + }, + { + "auxiliary_loss_clip": 0.01027187, + "auxiliary_loss_mlp": 0.01000736, + "balance_loss_clip": 1.00483561, + "balance_loss_mlp": 0.99978936, + "epoch": 0.9812114835412595, + "flos": 83046196334880.0, + "grad_norm": 0.7214724228386055, + "language_loss": 0.53521574, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55549502, + "num_input_tokens_seen": 352291770, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00946808, + "step": 16320, + "time_per_iteration": 4.562175750732422 + }, + { + "auxiliary_loss_clip": 0.011102, + "auxiliary_loss_mlp": 0.01034644, + "balance_loss_clip": 1.03786969, + "balance_loss_mlp": 1.02370095, + "epoch": 0.9812716067939276, + "flos": 30960674608320.0, + "grad_norm": 1.7023185629081485, + "language_loss": 0.73127902, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.75272751, + "num_input_tokens_seen": 352310735, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.10943604, + "step": 16321, + "time_per_iteration": 2.6751859188079834 + }, + { + "auxiliary_loss_clip": 0.01110712, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.04102254, + "balance_loss_mlp": 1.02055442, + "epoch": 0.9813317300465956, + "flos": 27890105548320.0, + "grad_norm": 1.585564345486608, + "language_loss": 0.78312612, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.80454874, + "num_input_tokens_seen": 352329545, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10992432, + "step": 16322, + "time_per_iteration": 2.635507345199585 + }, + { + "auxiliary_loss_clip": 0.01109566, + "auxiliary_loss_mlp": 0.01032612, + "balance_loss_clip": 1.03782141, + "balance_loss_mlp": 1.02128077, + "epoch": 0.9813918532992635, + "flos": 28781034562080.0, + "grad_norm": 1.5597389993790285, + "language_loss": 0.80942398, + "learning_rate": 3.619556806799595e-09, + "loss": 0.83084571, + "num_input_tokens_seen": 352352080, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11334229, + "step": 16323, + "time_per_iteration": 4.136441946029663 + }, + { + "auxiliary_loss_clip": 0.01113359, + "auxiliary_loss_mlp": 0.01030604, + "balance_loss_clip": 1.03933513, + "balance_loss_mlp": 1.01994038, + "epoch": 0.9814519765519315, + "flos": 23923907470080.0, + "grad_norm": 2.204650809165582, + "language_loss": 0.84873694, + "learning_rate": 3.596174175278799e-09, + "loss": 0.87017655, + "num_input_tokens_seen": 352366455, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.10662842, + "step": 16324, + "time_per_iteration": 2.6692070960998535 + }, + { + "auxiliary_loss_clip": 0.01108794, + "auxiliary_loss_mlp": 0.01026027, + "balance_loss_clip": 1.03691006, + "balance_loss_mlp": 1.01449978, + "epoch": 0.9815120998045994, + "flos": 41422526720640.0, + "grad_norm": 1.5345585843557574, + "language_loss": 0.74751556, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.76886374, + "num_input_tokens_seen": 352386090, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11529541, + "step": 16325, + "time_per_iteration": 2.806136131286621 + }, + { + "auxiliary_loss_clip": 0.0110542, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.03697753, + "balance_loss_mlp": 1.02095985, + "epoch": 0.9815722230572674, + "flos": 25441266278880.0, + "grad_norm": 1.7860529301123664, + "language_loss": 0.76614881, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.78751385, + "num_input_tokens_seen": 352404000, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.10125732, + "step": 16326, + "time_per_iteration": 2.670438289642334 + }, + { + "auxiliary_loss_clip": 0.01111866, + "auxiliary_loss_mlp": 0.01028521, + "balance_loss_clip": 1.03860545, + "balance_loss_mlp": 1.01736927, + "epoch": 0.9816323463099353, + "flos": 27936005276160.0, + "grad_norm": 3.5281417658130216, + "language_loss": 0.67112857, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.69253242, + "num_input_tokens_seen": 352423540, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.11157227, + "step": 16327, + "time_per_iteration": 2.6386663913726807 + }, + { + "auxiliary_loss_clip": 0.01115293, + "auxiliary_loss_mlp": 0.01037639, + "balance_loss_clip": 1.03969991, + "balance_loss_mlp": 1.02609384, + "epoch": 0.9816924695626034, + "flos": 38485402529760.0, + "grad_norm": 1.5569829631506655, + "language_loss": 0.73709881, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.75862813, + "num_input_tokens_seen": 352445530, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11541748, + "step": 16328, + "time_per_iteration": 2.7224042415618896 + }, + { + "auxiliary_loss_clip": 0.01116796, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.03942919, + "balance_loss_mlp": 1.0233779, + "epoch": 0.9817525928152713, + "flos": 26242826873760.0, + "grad_norm": 1.7464748827219745, + "language_loss": 0.81029946, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.83182156, + "num_input_tokens_seen": 352466325, + "router_z_loss_clip": 0.7734375, + "router_z_loss_mlp": 0.1204834, + "step": 16329, + "time_per_iteration": 2.713425874710083 + }, + { + "auxiliary_loss_clip": 0.01112973, + "auxiliary_loss_mlp": 0.01032035, + "balance_loss_clip": 1.03750777, + "balance_loss_mlp": 1.01979232, + "epoch": 0.9818127160679393, + "flos": 31177734442560.0, + "grad_norm": 2.191840663621133, + "language_loss": 0.76067269, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.78212279, + "num_input_tokens_seen": 352485505, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12255859, + "step": 16330, + "time_per_iteration": 2.6786766052246094 + }, + { + "auxiliary_loss_clip": 0.01117922, + "auxiliary_loss_mlp": 0.01030895, + "balance_loss_clip": 1.03886735, + "balance_loss_mlp": 1.01714993, + "epoch": 0.9818728393206072, + "flos": 35146930799520.0, + "grad_norm": 2.3227879815333483, + "language_loss": 0.66613609, + "learning_rate": 3.434615511252126e-09, + "loss": 0.68762422, + "num_input_tokens_seen": 352505360, + "router_z_loss_clip": 0.79052734, + "router_z_loss_mlp": 0.13769531, + "step": 16331, + "time_per_iteration": 2.669862985610962 + }, + { + "auxiliary_loss_clip": 0.01106361, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.03554344, + "balance_loss_mlp": 1.01761222, + "epoch": 0.9819329625732752, + "flos": 28335975228000.0, + "grad_norm": 1.7561336447393616, + "language_loss": 0.73296952, + "learning_rate": 3.411838534981948e-09, + "loss": 0.754318, + "num_input_tokens_seen": 352524035, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10870361, + "step": 16332, + "time_per_iteration": 2.6730234622955322 + }, + { + "auxiliary_loss_clip": 0.01109457, + "auxiliary_loss_mlp": 0.01027072, + "balance_loss_clip": 1.03861451, + "balance_loss_mlp": 1.01718402, + "epoch": 0.9819930858259431, + "flos": 21390399786240.0, + "grad_norm": 2.032229937646965, + "language_loss": 0.76814544, + "learning_rate": 3.389137269534936e-09, + "loss": 0.78951073, + "num_input_tokens_seen": 352543210, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.09887695, + "step": 16333, + "time_per_iteration": 4.165971994400024 + }, + { + "auxiliary_loss_clip": 0.01107511, + "auxiliary_loss_mlp": 0.01024277, + "balance_loss_clip": 1.03664839, + "balance_loss_mlp": 1.01316094, + "epoch": 0.9820532090786112, + "flos": 15288557077440.0, + "grad_norm": 17.643988895265124, + "language_loss": 0.7343446, + "learning_rate": 3.366511715771958e-09, + "loss": 0.7556625, + "num_input_tokens_seen": 352559770, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.11114502, + "step": 16334, + "time_per_iteration": 2.643460273742676 + }, + { + "auxiliary_loss_clip": 0.01110335, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.03717542, + "balance_loss_mlp": 1.02836633, + "epoch": 0.9821133323312792, + "flos": 22988983040640.0, + "grad_norm": 2.19873722923414, + "language_loss": 0.78666538, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.80816269, + "num_input_tokens_seen": 352577690, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11029053, + "step": 16335, + "time_per_iteration": 2.741044044494629 + }, + { + "auxiliary_loss_clip": 0.01114708, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.03922224, + "balance_loss_mlp": 1.02241671, + "epoch": 0.9821734555839471, + "flos": 42493408090560.0, + "grad_norm": 5.487933691729378, + "language_loss": 0.64098895, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.66248113, + "num_input_tokens_seen": 352598850, + "router_z_loss_clip": 0.75488281, + "router_z_loss_mlp": 0.12109375, + "step": 16336, + "time_per_iteration": 2.7493956089019775 + }, + { + "auxiliary_loss_clip": 0.01112912, + "auxiliary_loss_mlp": 0.01036555, + "balance_loss_clip": 1.03698945, + "balance_loss_mlp": 1.0240674, + "epoch": 0.9822335788366151, + "flos": 20899602793440.0, + "grad_norm": 2.2441621441330493, + "language_loss": 0.73233616, + "learning_rate": 3.299089333152372e-09, + "loss": 0.75383079, + "num_input_tokens_seen": 352616130, + "router_z_loss_clip": 0.76025391, + "router_z_loss_mlp": 0.12493896, + "step": 16337, + "time_per_iteration": 2.6162452697753906 + }, + { + "auxiliary_loss_clip": 0.01111008, + "auxiliary_loss_mlp": 0.01026783, + "balance_loss_clip": 1.03746605, + "balance_loss_mlp": 1.01470113, + "epoch": 0.982293702089283, + "flos": 25396420000320.0, + "grad_norm": 2.0562584581829766, + "language_loss": 0.73168069, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.75305861, + "num_input_tokens_seen": 352636885, + "router_z_loss_clip": 0.73632812, + "router_z_loss_mlp": 0.12078857, + "step": 16338, + "time_per_iteration": 2.610705852508545 + }, + { + "auxiliary_loss_clip": 0.0110594, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.03498399, + "balance_loss_mlp": 1.0192579, + "epoch": 0.982353825341951, + "flos": 30115928943360.0, + "grad_norm": 1.9697448750806552, + "language_loss": 0.81273401, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.83409548, + "num_input_tokens_seen": 352657905, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10943604, + "step": 16339, + "time_per_iteration": 2.7085354328155518 + }, + { + "auxiliary_loss_clip": 0.01105135, + "auxiliary_loss_mlp": 0.01030308, + "balance_loss_clip": 1.03563213, + "balance_loss_mlp": 1.01978815, + "epoch": 0.982413948594619, + "flos": 25441793003520.0, + "grad_norm": 1.8910630859888864, + "language_loss": 0.62283474, + "learning_rate": 3.232348386403405e-09, + "loss": 0.64418924, + "num_input_tokens_seen": 352676320, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10516357, + "step": 16340, + "time_per_iteration": 2.6286866664886475 + }, + { + "auxiliary_loss_clip": 0.01112435, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.03814507, + "balance_loss_mlp": 1.01935363, + "epoch": 0.982474071847287, + "flos": 18763107300000.0, + "grad_norm": 2.4496304613808406, + "language_loss": 0.8594116, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.88084579, + "num_input_tokens_seen": 352692665, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11621094, + "step": 16341, + "time_per_iteration": 2.5897951126098633 + }, + { + "auxiliary_loss_clip": 0.01106259, + "auxiliary_loss_mlp": 0.01023962, + "balance_loss_clip": 1.03743339, + "balance_loss_mlp": 1.01312566, + "epoch": 0.9825341950999549, + "flos": 29009682338400.0, + "grad_norm": 1.4654263062445467, + "language_loss": 0.66938877, + "learning_rate": 3.188233008645014e-09, + "loss": 0.69069093, + "num_input_tokens_seen": 352716130, + "router_z_loss_clip": 0.68847656, + "router_z_loss_mlp": 0.10839844, + "step": 16342, + "time_per_iteration": 2.689936637878418 + }, + { + "auxiliary_loss_clip": 0.01109741, + "auxiliary_loss_mlp": 0.01023142, + "balance_loss_clip": 1.03657389, + "balance_loss_mlp": 1.01201987, + "epoch": 0.9825943183526229, + "flos": 27755323608960.0, + "grad_norm": 1.8243588769178214, + "language_loss": 0.7700628, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79139161, + "num_input_tokens_seen": 352734705, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11114502, + "step": 16343, + "time_per_iteration": 2.6303677558898926 + }, + { + "auxiliary_loss_clip": 0.01108231, + "auxiliary_loss_mlp": 0.01029754, + "balance_loss_clip": 1.0374589, + "balance_loss_mlp": 1.01953793, + "epoch": 0.9826544416052908, + "flos": 33812239004640.0, + "grad_norm": 2.1023246533978552, + "language_loss": 0.75394809, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.77532792, + "num_input_tokens_seen": 352756225, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.10211182, + "step": 16344, + "time_per_iteration": 2.7223565578460693 + }, + { + "auxiliary_loss_clip": 0.01108131, + "auxiliary_loss_mlp": 0.01029104, + "balance_loss_clip": 1.03611112, + "balance_loss_mlp": 1.01790452, + "epoch": 0.9827145648579588, + "flos": 32874883538400.0, + "grad_norm": 3.094462504195243, + "language_loss": 0.66401237, + "learning_rate": 3.122627838848313e-09, + "loss": 0.68538475, + "num_input_tokens_seen": 352776210, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.11193848, + "step": 16345, + "time_per_iteration": 2.6534762382507324 + }, + { + "auxiliary_loss_clip": 0.01103105, + "auxiliary_loss_mlp": 0.01025431, + "balance_loss_clip": 1.03504944, + "balance_loss_mlp": 1.01551902, + "epoch": 0.9827746881106267, + "flos": 26682173271360.0, + "grad_norm": 1.4969942677700034, + "language_loss": 0.7950778, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.81636322, + "num_input_tokens_seen": 352795455, + "router_z_loss_clip": 0.68115234, + "router_z_loss_mlp": 0.09906006, + "step": 16346, + "time_per_iteration": 2.6761436462402344 + }, + { + "auxiliary_loss_clip": 0.01114808, + "auxiliary_loss_mlp": 0.01031374, + "balance_loss_clip": 1.0379529, + "balance_loss_mlp": 1.01960218, + "epoch": 0.9828348113632948, + "flos": 25442117141760.0, + "grad_norm": 2.344398413671424, + "language_loss": 0.74689007, + "learning_rate": 3.079269666552031e-09, + "loss": 0.76835191, + "num_input_tokens_seen": 352812895, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11773682, + "step": 16347, + "time_per_iteration": 2.645069122314453 + }, + { + "auxiliary_loss_clip": 0.01106315, + "auxiliary_loss_mlp": 0.0103382, + "balance_loss_clip": 1.03633165, + "balance_loss_mlp": 1.02377701, + "epoch": 0.9828949346159628, + "flos": 42182441902080.0, + "grad_norm": 3.4694233199898186, + "language_loss": 0.66802007, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.68942142, + "num_input_tokens_seen": 352835470, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10040283, + "step": 16348, + "time_per_iteration": 2.7713382244110107 + }, + { + "auxiliary_loss_clip": 0.01109419, + "auxiliary_loss_mlp": 0.01026727, + "balance_loss_clip": 1.03786469, + "balance_loss_mlp": 1.01552725, + "epoch": 0.9829550578686307, + "flos": 29843771958720.0, + "grad_norm": 1.7867064167868378, + "language_loss": 0.69370115, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.71506268, + "num_input_tokens_seen": 352854295, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11199951, + "step": 16349, + "time_per_iteration": 4.150103807449341 + }, + { + "auxiliary_loss_clip": 0.01103827, + "auxiliary_loss_mlp": 0.01027961, + "balance_loss_clip": 1.0360949, + "balance_loss_mlp": 1.01767302, + "epoch": 0.9830151811212987, + "flos": 20633401848960.0, + "grad_norm": 2.1787038043835425, + "language_loss": 0.76226133, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.78357923, + "num_input_tokens_seen": 352869695, + "router_z_loss_clip": 0.67724609, + "router_z_loss_mlp": 0.10284424, + "step": 16350, + "time_per_iteration": 2.5997180938720703 + }, + { + "auxiliary_loss_clip": 0.01110899, + "auxiliary_loss_mlp": 0.01029565, + "balance_loss_clip": 1.03827202, + "balance_loss_mlp": 1.01861537, + "epoch": 0.9830753043739666, + "flos": 25976058687360.0, + "grad_norm": 2.5827387452280015, + "language_loss": 0.84225631, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.86366099, + "num_input_tokens_seen": 352887430, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10961914, + "step": 16351, + "time_per_iteration": 2.6541664600372314 + }, + { + "auxiliary_loss_clip": 0.01108305, + "auxiliary_loss_mlp": 0.01023265, + "balance_loss_clip": 1.03631818, + "balance_loss_mlp": 1.0122385, + "epoch": 0.9831354276266346, + "flos": 38709877026240.0, + "grad_norm": 1.739846745103381, + "language_loss": 0.68733895, + "learning_rate": 2.972199410170795e-09, + "loss": 0.70865464, + "num_input_tokens_seen": 352907555, + "router_z_loss_clip": 0.71923828, + "router_z_loss_mlp": 0.11035156, + "step": 16352, + "time_per_iteration": 2.7425341606140137 + }, + { + "auxiliary_loss_clip": 0.01107184, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.03749061, + "balance_loss_mlp": 1.01983178, + "epoch": 0.9831955508793025, + "flos": 26379432090720.0, + "grad_norm": 1.9857841515011658, + "language_loss": 0.66451609, + "learning_rate": 2.951012538143782e-09, + "loss": 0.68588877, + "num_input_tokens_seen": 352928670, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10253906, + "step": 16353, + "time_per_iteration": 2.6728463172912598 + }, + { + "auxiliary_loss_clip": 0.01105579, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.03499329, + "balance_loss_mlp": 1.01808774, + "epoch": 0.9832556741319706, + "flos": 28025859902400.0, + "grad_norm": 1.593733038469251, + "language_loss": 0.74650156, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.76784527, + "num_input_tokens_seen": 352948345, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.10699463, + "step": 16354, + "time_per_iteration": 2.6558849811553955 + }, + { + "auxiliary_loss_clip": 0.01107331, + "auxiliary_loss_mlp": 0.01027394, + "balance_loss_clip": 1.03630316, + "balance_loss_mlp": 1.01655185, + "epoch": 0.9833157973846385, + "flos": 26019567895680.0, + "grad_norm": 2.363220566033202, + "language_loss": 0.77556884, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.79691607, + "num_input_tokens_seen": 352967250, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10845947, + "step": 16355, + "time_per_iteration": 2.6557321548461914 + }, + { + "auxiliary_loss_clip": 0.01106071, + "auxiliary_loss_mlp": 0.01026419, + "balance_loss_clip": 1.03585577, + "balance_loss_mlp": 1.01495147, + "epoch": 0.9833759206373065, + "flos": 25708156017120.0, + "grad_norm": 2.038737717868336, + "language_loss": 0.73286939, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.75419426, + "num_input_tokens_seen": 352984725, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.11462402, + "step": 16356, + "time_per_iteration": 2.6557583808898926 + }, + { + "auxiliary_loss_clip": 0.01108122, + "auxiliary_loss_mlp": 0.01030386, + "balance_loss_clip": 1.03793323, + "balance_loss_mlp": 1.01924014, + "epoch": 0.9834360438899744, + "flos": 22814378965440.0, + "grad_norm": 1.711939078678114, + "language_loss": 0.75800884, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.77939397, + "num_input_tokens_seen": 353003480, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.11157227, + "step": 16357, + "time_per_iteration": 2.6296658515930176 + }, + { + "auxiliary_loss_clip": 0.01107584, + "auxiliary_loss_mlp": 0.01024333, + "balance_loss_clip": 1.03705454, + "balance_loss_mlp": 1.01288319, + "epoch": 0.9834961671426424, + "flos": 25752718674720.0, + "grad_norm": 1.904225052250329, + "language_loss": 0.80111009, + "learning_rate": 2.846214118442436e-09, + "loss": 0.8224293, + "num_input_tokens_seen": 353021425, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11437988, + "step": 16358, + "time_per_iteration": 2.8100483417510986 + }, + { + "auxiliary_loss_clip": 0.01106863, + "auxiliary_loss_mlp": 0.0102559, + "balance_loss_clip": 1.03565156, + "balance_loss_mlp": 1.01501036, + "epoch": 0.9835562903953103, + "flos": 32564768212800.0, + "grad_norm": 2.3596115926523433, + "language_loss": 0.67833757, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.69966209, + "num_input_tokens_seen": 353039870, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10577393, + "step": 16359, + "time_per_iteration": 2.676964521408081 + }, + { + "auxiliary_loss_clip": 0.01105351, + "auxiliary_loss_mlp": 0.01029751, + "balance_loss_clip": 1.03542233, + "balance_loss_mlp": 1.01940393, + "epoch": 0.9836164136479784, + "flos": 26954613876960.0, + "grad_norm": 1.950419278164543, + "language_loss": 0.69565588, + "learning_rate": 2.804824870920264e-09, + "loss": 0.71700686, + "num_input_tokens_seen": 353059750, + "router_z_loss_clip": 0.70019531, + "router_z_loss_mlp": 0.10351562, + "step": 16360, + "time_per_iteration": 4.091042995452881 + }, + { + "auxiliary_loss_clip": 0.01110612, + "auxiliary_loss_mlp": 0.010305, + "balance_loss_clip": 1.03746748, + "balance_loss_mlp": 1.01909745, + "epoch": 0.9836765369006463, + "flos": 29136725477280.0, + "grad_norm": 2.713108688881078, + "language_loss": 0.84370673, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.86511791, + "num_input_tokens_seen": 353079940, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11395264, + "step": 16361, + "time_per_iteration": 2.617312431335449 + }, + { + "auxiliary_loss_clip": 0.01109608, + "auxiliary_loss_mlp": 0.01026075, + "balance_loss_clip": 1.03724587, + "balance_loss_mlp": 1.01527452, + "epoch": 0.9837366601533143, + "flos": 31536869326560.0, + "grad_norm": 1.7516700784142363, + "language_loss": 0.76132929, + "learning_rate": 2.76373855876022e-09, + "loss": 0.78268611, + "num_input_tokens_seen": 353099990, + "router_z_loss_clip": 0.72363281, + "router_z_loss_mlp": 0.10803223, + "step": 16362, + "time_per_iteration": 4.153934001922607 + }, + { + "auxiliary_loss_clip": 0.01108982, + "auxiliary_loss_mlp": 0.01030665, + "balance_loss_clip": 1.03727174, + "balance_loss_mlp": 1.01944137, + "epoch": 0.9837967834059823, + "flos": 26061415895520.0, + "grad_norm": 2.120062517073155, + "language_loss": 0.71143717, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.73283362, + "num_input_tokens_seen": 353118710, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11218262, + "step": 16363, + "time_per_iteration": 2.625635862350464 + }, + { + "auxiliary_loss_clip": 0.01104096, + "auxiliary_loss_mlp": 0.01026203, + "balance_loss_clip": 1.03581476, + "balance_loss_mlp": 1.0157361, + "epoch": 0.9838569066586502, + "flos": 22592781195840.0, + "grad_norm": 1.7459820590836728, + "language_loss": 0.63116515, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.65246809, + "num_input_tokens_seen": 353136415, + "router_z_loss_clip": 0.68261719, + "router_z_loss_mlp": 0.10461426, + "step": 16364, + "time_per_iteration": 2.584744930267334 + }, + { + "auxiliary_loss_clip": 0.01108088, + "auxiliary_loss_mlp": 0.01028977, + "balance_loss_clip": 1.03675449, + "balance_loss_mlp": 1.01884985, + "epoch": 0.9839170299113182, + "flos": 27396877518720.0, + "grad_norm": 1.6064569898151173, + "language_loss": 0.75260848, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77397907, + "num_input_tokens_seen": 353154650, + "router_z_loss_clip": 0.71337891, + "router_z_loss_mlp": 0.10125732, + "step": 16365, + "time_per_iteration": 2.6395423412323 + }, + { + "auxiliary_loss_clip": 0.01106486, + "auxiliary_loss_mlp": 0.010269, + "balance_loss_clip": 1.03575206, + "balance_loss_mlp": 1.01545632, + "epoch": 0.9839771531639862, + "flos": 34034768671680.0, + "grad_norm": 2.2659797931380563, + "language_loss": 0.76169491, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.78302884, + "num_input_tokens_seen": 353174065, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.11450195, + "step": 16366, + "time_per_iteration": 2.664416551589966 + }, + { + "auxiliary_loss_clip": 0.01104908, + "auxiliary_loss_mlp": 0.01024734, + "balance_loss_clip": 1.03520751, + "balance_loss_mlp": 1.01435065, + "epoch": 0.9840372764166542, + "flos": 34428498962400.0, + "grad_norm": 1.785363955101005, + "language_loss": 0.77027482, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79157126, + "num_input_tokens_seen": 353193560, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10388184, + "step": 16367, + "time_per_iteration": 2.6378767490386963 + }, + { + "auxiliary_loss_clip": 0.01109071, + "auxiliary_loss_mlp": 0.01030949, + "balance_loss_clip": 1.03926635, + "balance_loss_mlp": 1.01976156, + "epoch": 0.9840973996693221, + "flos": 28557816101280.0, + "grad_norm": 2.2661845505571483, + "language_loss": 0.61782289, + "learning_rate": 2.642297296540974e-09, + "loss": 0.6392231, + "num_input_tokens_seen": 353213525, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11193848, + "step": 16368, + "time_per_iteration": 2.6230552196502686 + }, + { + "auxiliary_loss_clip": 0.01104687, + "auxiliary_loss_mlp": 0.01026763, + "balance_loss_clip": 1.03634572, + "balance_loss_mlp": 1.01680946, + "epoch": 0.9841575229219901, + "flos": 26107477692480.0, + "grad_norm": 1.6691719390386532, + "language_loss": 0.65845037, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.67976487, + "num_input_tokens_seen": 353234000, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.09954834, + "step": 16369, + "time_per_iteration": 2.6328043937683105 + }, + { + "auxiliary_loss_clip": 0.01108764, + "auxiliary_loss_mlp": 0.01034979, + "balance_loss_clip": 1.03723669, + "balance_loss_mlp": 1.02361858, + "epoch": 0.984217646174658, + "flos": 29851227138240.0, + "grad_norm": 2.0487905263276502, + "language_loss": 0.68733203, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.7087695, + "num_input_tokens_seen": 353254940, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.1137085, + "step": 16370, + "time_per_iteration": 2.6369152069091797 + }, + { + "auxiliary_loss_clip": 0.01109761, + "auxiliary_loss_mlp": 0.01029681, + "balance_loss_clip": 1.03624284, + "balance_loss_mlp": 1.01780176, + "epoch": 0.984277769427326, + "flos": 20053803679200.0, + "grad_norm": 1.909918480809965, + "language_loss": 0.73706967, + "learning_rate": 2.582599145159792e-09, + "loss": 0.7584641, + "num_input_tokens_seen": 353272590, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11883545, + "step": 16371, + "time_per_iteration": 2.6518638134002686 + }, + { + "auxiliary_loss_clip": 0.01026988, + "auxiliary_loss_mlp": 0.010015, + "balance_loss_clip": 1.0046463, + "balance_loss_mlp": 1.00055707, + "epoch": 0.9843378926799939, + "flos": 78739663390560.0, + "grad_norm": 0.775159219108615, + "language_loss": 0.65107989, + "learning_rate": 2.562851244898745e-09, + "loss": 0.67136472, + "num_input_tokens_seen": 353334380, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.00940704, + "step": 16372, + "time_per_iteration": 4.735153436660767 + }, + { + "auxiliary_loss_clip": 0.01107542, + "auxiliary_loss_mlp": 0.01027011, + "balance_loss_clip": 1.03649318, + "balance_loss_mlp": 1.01584077, + "epoch": 0.984398015932662, + "flos": 21210406912800.0, + "grad_norm": 2.136676731587753, + "language_loss": 0.71227843, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.73362392, + "num_input_tokens_seen": 353351640, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.1116333, + "step": 16373, + "time_per_iteration": 2.5840651988983154 + }, + { + "auxiliary_loss_clip": 0.01108466, + "auxiliary_loss_mlp": 0.01026696, + "balance_loss_clip": 1.03812361, + "balance_loss_mlp": 1.01582456, + "epoch": 0.9844581391853299, + "flos": 29137495305600.0, + "grad_norm": 2.0194372245654035, + "language_loss": 0.81799364, + "learning_rate": 2.523582674173186e-09, + "loss": 0.83934522, + "num_input_tokens_seen": 353372555, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10870361, + "step": 16374, + "time_per_iteration": 2.664083480834961 + }, + { + "auxiliary_loss_clip": 0.01112567, + "auxiliary_loss_mlp": 0.01031041, + "balance_loss_clip": 1.03949642, + "balance_loss_mlp": 1.02050316, + "epoch": 0.9845182624379979, + "flos": 24239654697600.0, + "grad_norm": 2.4462537078964357, + "language_loss": 0.69328713, + "learning_rate": 2.504062005197927e-09, + "loss": 0.71472323, + "num_input_tokens_seen": 353391385, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.10534668, + "step": 16375, + "time_per_iteration": 2.675901174545288 + }, + { + "auxiliary_loss_clip": 0.01111396, + "auxiliary_loss_mlp": 0.0103095, + "balance_loss_clip": 1.03731465, + "balance_loss_mlp": 1.01913023, + "epoch": 0.9845783856906659, + "flos": 34477153865280.0, + "grad_norm": 1.8968112056513882, + "language_loss": 0.8072418, + "learning_rate": 2.484617081468521e-09, + "loss": 0.82866526, + "num_input_tokens_seen": 353411630, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11834717, + "step": 16376, + "time_per_iteration": 2.6637723445892334 + }, + { + "auxiliary_loss_clip": 0.01107479, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.03699195, + "balance_loss_mlp": 1.02069783, + "epoch": 0.9846385089433338, + "flos": 34567049008800.0, + "grad_norm": 1.588637712181034, + "language_loss": 0.62264383, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.64403659, + "num_input_tokens_seen": 353432895, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.11096191, + "step": 16377, + "time_per_iteration": 2.6838057041168213 + }, + { + "auxiliary_loss_clip": 0.01110166, + "auxiliary_loss_mlp": 0.01029508, + "balance_loss_clip": 1.03712189, + "balance_loss_mlp": 1.0181179, + "epoch": 0.9846986321960018, + "flos": 29671598920320.0, + "grad_norm": 1.774271743378061, + "language_loss": 0.72989142, + "learning_rate": 2.445954472695133e-09, + "loss": 0.75128818, + "num_input_tokens_seen": 353454195, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.1138916, + "step": 16378, + "time_per_iteration": 2.6922385692596436 + }, + { + "auxiliary_loss_clip": 0.01107883, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.03622985, + "balance_loss_mlp": 1.02124047, + "epoch": 0.9847587554486698, + "flos": 33277811251680.0, + "grad_norm": 1.692925948344299, + "language_loss": 0.71173882, + "learning_rate": 2.426736789116868e-09, + "loss": 0.73313886, + "num_input_tokens_seen": 353475125, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10882568, + "step": 16379, + "time_per_iteration": 2.780911922454834 + }, + { + "auxiliary_loss_clip": 0.01110972, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.03822768, + "balance_loss_mlp": 1.0181061, + "epoch": 0.9848188787013378, + "flos": 20185344236160.0, + "grad_norm": 1.9757153786433184, + "language_loss": 0.68641639, + "learning_rate": 2.407594853716999e-09, + "loss": 0.70781898, + "num_input_tokens_seen": 353493265, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11181641, + "step": 16380, + "time_per_iteration": 2.6233325004577637 + }, + { + "auxiliary_loss_clip": 0.01113473, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.03882432, + "balance_loss_mlp": 1.02263713, + "epoch": 0.9848790019540057, + "flos": 24640272925920.0, + "grad_norm": 2.055972571929479, + "language_loss": 0.78921306, + "learning_rate": 2.38852866722139e-09, + "loss": 0.81068361, + "num_input_tokens_seen": 353511650, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.109375, + "step": 16381, + "time_per_iteration": 2.6363091468811035 + }, + { + "auxiliary_loss_clip": 0.01109841, + "auxiliary_loss_mlp": 0.01027302, + "balance_loss_clip": 1.03741574, + "balance_loss_mlp": 1.01626968, + "epoch": 0.9849391252066737, + "flos": 34482866801760.0, + "grad_norm": 1.4204195074521073, + "language_loss": 0.82588267, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.84725416, + "num_input_tokens_seen": 353534035, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.1104126, + "step": 16382, + "time_per_iteration": 2.6683762073516846 + }, + { + "auxiliary_loss_clip": 0.01112048, + "auxiliary_loss_mlp": 0.01028413, + "balance_loss_clip": 1.0369873, + "balance_loss_mlp": 1.01678979, + "epoch": 0.9849992484593416, + "flos": 27399754245600.0, + "grad_norm": 2.293061235817714, + "language_loss": 0.74862587, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.7700305, + "num_input_tokens_seen": 353549950, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.11633301, + "step": 16383, + "time_per_iteration": 2.617283582687378 + }, + { + "auxiliary_loss_clip": 0.01110983, + "auxiliary_loss_mlp": 0.0102738, + "balance_loss_clip": 1.03851724, + "balance_loss_mlp": 1.01665711, + "epoch": 0.9850593717120096, + "flos": 42094451070720.0, + "grad_norm": 1.668922029944383, + "language_loss": 0.66333479, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.68471843, + "num_input_tokens_seen": 353573745, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.1072998, + "step": 16384, + "time_per_iteration": 2.723153829574585 + }, + { + "auxiliary_loss_clip": 0.01114718, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.0398798, + "balance_loss_mlp": 1.01905847, + "epoch": 0.9851194949646775, + "flos": 47390721973920.0, + "grad_norm": 1.7719922646063484, + "language_loss": 0.70469093, + "learning_rate": 2.313021424697359e-09, + "loss": 0.72615522, + "num_input_tokens_seen": 353595335, + "router_z_loss_clip": 0.74804688, + "router_z_loss_mlp": 0.12652588, + "step": 16385, + "time_per_iteration": 2.7960755825042725 + }, + { + "auxiliary_loss_clip": 0.01113309, + "auxiliary_loss_mlp": 0.01031638, + "balance_loss_clip": 1.04054618, + "balance_loss_mlp": 1.02067637, + "epoch": 0.9851796182173456, + "flos": 21612524280480.0, + "grad_norm": 1.9615750650788408, + "language_loss": 0.81051159, + "learning_rate": 2.294333993509978e-09, + "loss": 0.83196104, + "num_input_tokens_seen": 353614270, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.10955811, + "step": 16386, + "time_per_iteration": 2.587402105331421 + }, + { + "auxiliary_loss_clip": 0.01109582, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.0370779, + "balance_loss_mlp": 1.0208683, + "epoch": 0.9852397414700135, + "flos": 33500300401440.0, + "grad_norm": 1.884521627437756, + "language_loss": 0.67874449, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.7001642, + "num_input_tokens_seen": 353634900, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.1151123, + "step": 16387, + "time_per_iteration": 2.6509673595428467 + }, + { + "auxiliary_loss_clip": 0.01101814, + "auxiliary_loss_mlp": 0.01025171, + "balance_loss_clip": 1.03439999, + "balance_loss_mlp": 1.01497316, + "epoch": 0.9852998647226815, + "flos": 22325445767520.0, + "grad_norm": 1.8206268888127215, + "language_loss": 0.73921299, + "learning_rate": 2.257186391438237e-09, + "loss": 0.76048291, + "num_input_tokens_seen": 353652890, + "router_z_loss_clip": 0.67480469, + "router_z_loss_mlp": 0.10205078, + "step": 16388, + "time_per_iteration": 4.020601749420166 + }, + { + "auxiliary_loss_clip": 0.01106873, + "auxiliary_loss_mlp": 0.01030055, + "balance_loss_clip": 1.03536284, + "balance_loss_mlp": 1.01924229, + "epoch": 0.9853599879753495, + "flos": 23968145989440.0, + "grad_norm": 1.8643543344803297, + "language_loss": 0.8214612, + "learning_rate": 2.238726221962528e-09, + "loss": 0.84283054, + "num_input_tokens_seen": 353671295, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10809326, + "step": 16389, + "time_per_iteration": 2.586721658706665 + }, + { + "auxiliary_loss_clip": 0.01109308, + "auxiliary_loss_mlp": 0.01026183, + "balance_loss_clip": 1.03815782, + "balance_loss_mlp": 1.0150069, + "epoch": 0.9854201112280174, + "flos": 29092243854240.0, + "grad_norm": 2.03324455666665, + "language_loss": 0.67155957, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.69291461, + "num_input_tokens_seen": 353690560, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11175537, + "step": 16390, + "time_per_iteration": 2.7090134620666504 + }, + { + "auxiliary_loss_clip": 0.01109166, + "auxiliary_loss_mlp": 0.01034003, + "balance_loss_clip": 1.03753352, + "balance_loss_mlp": 1.02207017, + "epoch": 0.9854802344806854, + "flos": 36705124676160.0, + "grad_norm": 1.8746104852598449, + "language_loss": 0.7735607, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.79499233, + "num_input_tokens_seen": 353710660, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.1194458, + "step": 16391, + "time_per_iteration": 2.697105646133423 + }, + { + "auxiliary_loss_clip": 0.01103286, + "auxiliary_loss_mlp": 0.01029808, + "balance_loss_clip": 1.03632116, + "balance_loss_mlp": 1.01944828, + "epoch": 0.9855403577333534, + "flos": 26731638519840.0, + "grad_norm": 2.427827312174541, + "language_loss": 0.68147361, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.70280457, + "num_input_tokens_seen": 353730440, + "router_z_loss_clip": 0.66992188, + "router_z_loss_mlp": 0.10345459, + "step": 16392, + "time_per_iteration": 2.6464297771453857 + }, + { + "auxiliary_loss_clip": 0.01112945, + "auxiliary_loss_mlp": 0.01028752, + "balance_loss_clip": 1.03811622, + "balance_loss_mlp": 1.01696801, + "epoch": 0.9856004809860214, + "flos": 18807548405760.0, + "grad_norm": 1.9644932974168567, + "language_loss": 0.55644971, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.57786667, + "num_input_tokens_seen": 353748360, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11785889, + "step": 16393, + "time_per_iteration": 2.6072781085968018 + }, + { + "auxiliary_loss_clip": 0.01114976, + "auxiliary_loss_mlp": 0.01028552, + "balance_loss_clip": 1.03887653, + "balance_loss_mlp": 1.01649952, + "epoch": 0.9856606042386893, + "flos": 16659991694880.0, + "grad_norm": 2.4974649550500767, + "language_loss": 0.78355479, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.80499005, + "num_input_tokens_seen": 353760880, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12054443, + "step": 16394, + "time_per_iteration": 2.5942790508270264 + }, + { + "auxiliary_loss_clip": 0.01111117, + "auxiliary_loss_mlp": 0.01033803, + "balance_loss_clip": 1.03639674, + "balance_loss_mlp": 1.02225804, + "epoch": 0.9857207274913573, + "flos": 28649331936000.0, + "grad_norm": 1.4773345765014354, + "language_loss": 0.75988388, + "learning_rate": 2.129556090869178e-09, + "loss": 0.78133309, + "num_input_tokens_seen": 353782255, + "router_z_loss_clip": 0.74658203, + "router_z_loss_mlp": 0.11553955, + "step": 16395, + "time_per_iteration": 2.6610658168792725 + }, + { + "auxiliary_loss_clip": 0.01107344, + "auxiliary_loss_mlp": 0.01027254, + "balance_loss_clip": 1.03696501, + "balance_loss_mlp": 1.01651287, + "epoch": 0.9857808507440252, + "flos": 25704509461920.0, + "grad_norm": 2.079805564315606, + "language_loss": 0.74929374, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.77063972, + "num_input_tokens_seen": 353803580, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.10742188, + "step": 16396, + "time_per_iteration": 2.6941335201263428 + }, + { + "auxiliary_loss_clip": 0.0110783, + "auxiliary_loss_mlp": 0.01024399, + "balance_loss_clip": 1.03682852, + "balance_loss_mlp": 1.01333606, + "epoch": 0.9858409739966932, + "flos": 30873129467040.0, + "grad_norm": 1.5000284842757634, + "language_loss": 0.70889688, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.73021924, + "num_input_tokens_seen": 353824200, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.1105957, + "step": 16397, + "time_per_iteration": 2.67212176322937 + }, + { + "auxiliary_loss_clip": 0.01106435, + "auxiliary_loss_mlp": 0.01030769, + "balance_loss_clip": 1.03890681, + "balance_loss_mlp": 1.02045751, + "epoch": 0.9859010972493611, + "flos": 25085777950080.0, + "grad_norm": 2.8532867090923753, + "language_loss": 0.71024859, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.73162067, + "num_input_tokens_seen": 353843350, + "router_z_loss_clip": 0.67529297, + "router_z_loss_mlp": 0.10302734, + "step": 16398, + "time_per_iteration": 2.633525848388672 + }, + { + "auxiliary_loss_clip": 0.01107115, + "auxiliary_loss_mlp": 0.01025292, + "balance_loss_clip": 1.03670394, + "balance_loss_mlp": 1.01473022, + "epoch": 0.9859612205020292, + "flos": 30206310294240.0, + "grad_norm": 1.6701486939187595, + "language_loss": 0.74140477, + "learning_rate": 2.058291183208771e-09, + "loss": 0.76272887, + "num_input_tokens_seen": 353864520, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.10571289, + "step": 16399, + "time_per_iteration": 2.6872005462646484 + }, + { + "auxiliary_loss_clip": 0.01109679, + "auxiliary_loss_mlp": 0.01027032, + "balance_loss_clip": 1.03666878, + "balance_loss_mlp": 1.01592135, + "epoch": 0.9860213437546971, + "flos": 25753407468480.0, + "grad_norm": 1.9203423811035585, + "language_loss": 0.57428128, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.59564841, + "num_input_tokens_seen": 353882240, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11108398, + "step": 16400, + "time_per_iteration": 4.053078651428223 + }, + { + "auxiliary_loss_clip": 0.01114888, + "auxiliary_loss_mlp": 0.01029633, + "balance_loss_clip": 1.03781128, + "balance_loss_mlp": 1.01769996, + "epoch": 0.9860814670073651, + "flos": 23349698098560.0, + "grad_norm": 1.7305522107379359, + "language_loss": 0.80897534, + "learning_rate": 2.023113299582491e-09, + "loss": 0.83042055, + "num_input_tokens_seen": 353901590, + "router_z_loss_clip": 0.77197266, + "router_z_loss_mlp": 0.11938477, + "step": 16401, + "time_per_iteration": 2.721219778060913 + }, + { + "auxiliary_loss_clip": 0.01107487, + "auxiliary_loss_mlp": 0.01028883, + "balance_loss_clip": 1.03731728, + "balance_loss_mlp": 1.01683116, + "epoch": 0.9861415902600331, + "flos": 21032520937920.0, + "grad_norm": 2.15755502412894, + "language_loss": 0.77942604, + "learning_rate": 2.005638002662069e-09, + "loss": 0.80078971, + "num_input_tokens_seen": 353918785, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.1204834, + "step": 16402, + "time_per_iteration": 3.8954591751098633 + }, + { + "auxiliary_loss_clip": 0.01110517, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.03798127, + "balance_loss_mlp": 1.02025759, + "epoch": 0.986201713512701, + "flos": 33900878112480.0, + "grad_norm": 1.787960987592959, + "language_loss": 0.69655615, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.71797061, + "num_input_tokens_seen": 353940390, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10675049, + "step": 16403, + "time_per_iteration": 2.653348207473755 + }, + { + "auxiliary_loss_clip": 0.01106222, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.03547049, + "balance_loss_mlp": 1.01837945, + "epoch": 0.986261836765369, + "flos": 35057562380640.0, + "grad_norm": 1.882567186480774, + "language_loss": 0.74429262, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.76564264, + "num_input_tokens_seen": 353962180, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.10400391, + "step": 16404, + "time_per_iteration": 2.7023346424102783 + }, + { + "auxiliary_loss_clip": 0.0110981, + "auxiliary_loss_mlp": 0.01027267, + "balance_loss_clip": 1.03760958, + "balance_loss_mlp": 1.01619852, + "epoch": 0.986321960018037, + "flos": 41870381747040.0, + "grad_norm": 1.7071730696396483, + "language_loss": 0.69790459, + "learning_rate": 1.953666699415768e-09, + "loss": 0.71927541, + "num_input_tokens_seen": 353984305, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11065674, + "step": 16405, + "time_per_iteration": 2.7010509967803955 + }, + { + "auxiliary_loss_clip": 0.01107762, + "auxiliary_loss_mlp": 0.01033943, + "balance_loss_clip": 1.03834999, + "balance_loss_mlp": 1.02386999, + "epoch": 0.986382083270705, + "flos": 30737132009280.0, + "grad_norm": 1.65498722302393, + "language_loss": 0.69439566, + "learning_rate": 1.93649446302846e-09, + "loss": 0.71581268, + "num_input_tokens_seen": 354004495, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10070801, + "step": 16406, + "time_per_iteration": 2.643014430999756 + }, + { + "auxiliary_loss_clip": 0.01107384, + "auxiliary_loss_mlp": 0.01031636, + "balance_loss_clip": 1.03794742, + "balance_loss_mlp": 1.02067518, + "epoch": 0.9864422065233729, + "flos": 13453992419040.0, + "grad_norm": 3.019370914103134, + "language_loss": 0.75062609, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.77201635, + "num_input_tokens_seen": 354015985, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10955811, + "step": 16407, + "time_per_iteration": 2.553065776824951 + }, + { + "auxiliary_loss_clip": 0.01106109, + "auxiliary_loss_mlp": 0.01029471, + "balance_loss_clip": 1.03521228, + "balance_loss_mlp": 1.01870024, + "epoch": 0.9865023297760409, + "flos": 20188585618560.0, + "grad_norm": 2.0297576464747062, + "language_loss": 0.77244461, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.79380041, + "num_input_tokens_seen": 354033260, + "router_z_loss_clip": 0.70898438, + "router_z_loss_mlp": 0.10766602, + "step": 16408, + "time_per_iteration": 2.614154815673828 + }, + { + "auxiliary_loss_clip": 0.01112894, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.03785026, + "balance_loss_mlp": 1.02030754, + "epoch": 0.9865624530287088, + "flos": 23037475874400.0, + "grad_norm": 2.171298483220812, + "language_loss": 0.68236601, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.70381635, + "num_input_tokens_seen": 354052825, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.1182251, + "step": 16409, + "time_per_iteration": 2.5808486938476562 + }, + { + "auxiliary_loss_clip": 0.01026998, + "auxiliary_loss_mlp": 0.01001257, + "balance_loss_clip": 1.00462937, + "balance_loss_mlp": 1.0003556, + "epoch": 0.9866225762813768, + "flos": 84058577102880.0, + "grad_norm": 0.7994268891957037, + "language_loss": 0.6103546, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.63063717, + "num_input_tokens_seen": 354113920, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.00901794, + "step": 16410, + "time_per_iteration": 3.3058888912200928 + }, + { + "auxiliary_loss_clip": 0.01110457, + "auxiliary_loss_mlp": 0.01026474, + "balance_loss_clip": 1.03753972, + "balance_loss_mlp": 1.01531637, + "epoch": 0.9866826995340447, + "flos": 35414671400640.0, + "grad_norm": 4.637638161605222, + "language_loss": 0.66016269, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.68153203, + "num_input_tokens_seen": 354134210, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.1116333, + "step": 16411, + "time_per_iteration": 2.69920015335083 + }, + { + "auxiliary_loss_clip": 0.01027043, + "auxiliary_loss_mlp": 0.01000763, + "balance_loss_clip": 1.00471032, + "balance_loss_mlp": 0.99981463, + "epoch": 0.9867428227867128, + "flos": 79771614004800.0, + "grad_norm": 0.7228322410603724, + "language_loss": 0.56200325, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.58228129, + "num_input_tokens_seen": 354198010, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00946808, + "step": 16412, + "time_per_iteration": 4.7275378704071045 + }, + { + "auxiliary_loss_clip": 0.01111679, + "auxiliary_loss_mlp": 0.01031987, + "balance_loss_clip": 1.03720689, + "balance_loss_mlp": 1.02055526, + "epoch": 0.9868029460393807, + "flos": 32343413546880.0, + "grad_norm": 1.8270275491484667, + "language_loss": 0.73372036, + "learning_rate": 1.818410313934926e-09, + "loss": 0.75515699, + "num_input_tokens_seen": 354220000, + "router_z_loss_clip": 0.74511719, + "router_z_loss_mlp": 0.11437988, + "step": 16413, + "time_per_iteration": 2.692514181137085 + }, + { + "auxiliary_loss_clip": 0.0110897, + "auxiliary_loss_mlp": 0.01027429, + "balance_loss_clip": 1.03572226, + "balance_loss_mlp": 1.01631236, + "epoch": 0.9868630692920487, + "flos": 28024806453120.0, + "grad_norm": 1.4955997525266274, + "language_loss": 0.71533841, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.73670232, + "num_input_tokens_seen": 354240910, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11114502, + "step": 16414, + "time_per_iteration": 2.6496994495391846 + }, + { + "auxiliary_loss_clip": 0.01108266, + "auxiliary_loss_mlp": 0.01031126, + "balance_loss_clip": 1.03983498, + "balance_loss_mlp": 1.0206002, + "epoch": 0.9869231925447167, + "flos": 24194929970880.0, + "grad_norm": 1.7007377522544433, + "language_loss": 0.70683867, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.72823262, + "num_input_tokens_seen": 354259430, + "router_z_loss_clip": 0.68408203, + "router_z_loss_mlp": 0.10516357, + "step": 16415, + "time_per_iteration": 2.627715587615967 + }, + { + "auxiliary_loss_clip": 0.01104017, + "auxiliary_loss_mlp": 0.01031341, + "balance_loss_clip": 1.03639913, + "balance_loss_mlp": 1.02133369, + "epoch": 0.9869833157973846, + "flos": 24642055686240.0, + "grad_norm": 1.9009747521806013, + "language_loss": 0.75707006, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.77842367, + "num_input_tokens_seen": 354279490, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 0.10003662, + "step": 16416, + "time_per_iteration": 2.6928727626800537 + }, + { + "auxiliary_loss_clip": 0.01108017, + "auxiliary_loss_mlp": 0.01027013, + "balance_loss_clip": 1.03754687, + "balance_loss_mlp": 1.01630163, + "epoch": 0.9870434390500527, + "flos": 19645000960320.0, + "grad_norm": 1.9330409383388312, + "language_loss": 0.70487356, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.72622383, + "num_input_tokens_seen": 354295080, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10705566, + "step": 16417, + "time_per_iteration": 2.580639600753784 + }, + { + "auxiliary_loss_clip": 0.01114966, + "auxiliary_loss_mlp": 0.01037262, + "balance_loss_clip": 1.0393132, + "balance_loss_mlp": 1.02580571, + "epoch": 0.9871035623027206, + "flos": 26552739612960.0, + "grad_norm": 1.5832802728544304, + "language_loss": 0.70495522, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.7264775, + "num_input_tokens_seen": 354314610, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.11456299, + "step": 16418, + "time_per_iteration": 2.775104522705078 + }, + { + "auxiliary_loss_clip": 0.0102699, + "auxiliary_loss_mlp": 0.01001755, + "balance_loss_clip": 1.00464034, + "balance_loss_mlp": 1.00085092, + "epoch": 0.9871636855553886, + "flos": 85682037967200.0, + "grad_norm": 0.6603359058113172, + "language_loss": 0.53711426, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55740166, + "num_input_tokens_seen": 354383115, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 0.0090332, + "step": 16419, + "time_per_iteration": 3.3849594593048096 + }, + { + "auxiliary_loss_clip": 0.0111236, + "auxiliary_loss_mlp": 0.01032551, + "balance_loss_clip": 1.03650188, + "balance_loss_mlp": 1.02014697, + "epoch": 0.9872238088080565, + "flos": 30561312415680.0, + "grad_norm": 5.722340127447327, + "language_loss": 0.78052306, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.80197215, + "num_input_tokens_seen": 354403115, + "router_z_loss_clip": 0.7578125, + "router_z_loss_mlp": 0.12414551, + "step": 16420, + "time_per_iteration": 2.667001485824585 + }, + { + "auxiliary_loss_clip": 0.01110078, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.04045224, + "balance_loss_mlp": 1.01845169, + "epoch": 0.9872839320607245, + "flos": 23751977535360.0, + "grad_norm": 1.5607823157407914, + "language_loss": 0.70873749, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.73013306, + "num_input_tokens_seen": 354424520, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.11035156, + "step": 16421, + "time_per_iteration": 2.856403350830078 + }, + { + "auxiliary_loss_clip": 0.01110948, + "auxiliary_loss_mlp": 0.01031201, + "balance_loss_clip": 1.03683054, + "balance_loss_mlp": 1.01941109, + "epoch": 0.9873440553133924, + "flos": 32876301643200.0, + "grad_norm": 2.567720385390718, + "language_loss": 0.82338452, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.84480602, + "num_input_tokens_seen": 354444800, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11791992, + "step": 16422, + "time_per_iteration": 2.7858378887176514 + }, + { + "auxiliary_loss_clip": 0.0110611, + "auxiliary_loss_mlp": 0.01023517, + "balance_loss_clip": 1.03640854, + "balance_loss_mlp": 1.01298451, + "epoch": 0.9874041785660604, + "flos": 23259640885920.0, + "grad_norm": 1.8775398019745095, + "language_loss": 0.86287022, + "learning_rate": 1.656159280223779e-09, + "loss": 0.88416648, + "num_input_tokens_seen": 354464590, + "router_z_loss_clip": 0.69775391, + "router_z_loss_mlp": 0.10534668, + "step": 16423, + "time_per_iteration": 2.626343250274658 + }, + { + "auxiliary_loss_clip": 0.01112462, + "auxiliary_loss_mlp": 0.01027361, + "balance_loss_clip": 1.03907084, + "balance_loss_mlp": 1.01645947, + "epoch": 0.9874643018187284, + "flos": 25753366951200.0, + "grad_norm": 2.2197249151212843, + "language_loss": 0.70371115, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.72510934, + "num_input_tokens_seen": 354484145, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.10906982, + "step": 16424, + "time_per_iteration": 2.624298572540283 + }, + { + "auxiliary_loss_clip": 0.01109321, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.0367471, + "balance_loss_mlp": 1.01862979, + "epoch": 0.9875244250713964, + "flos": 29804962754880.0, + "grad_norm": 3.3113448413203943, + "language_loss": 0.80284798, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.82424128, + "num_input_tokens_seen": 354502475, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.1138916, + "step": 16425, + "time_per_iteration": 2.674518346786499 + }, + { + "auxiliary_loss_clip": 0.01111192, + "auxiliary_loss_mlp": 0.01028113, + "balance_loss_clip": 1.03840816, + "balance_loss_mlp": 1.01637721, + "epoch": 0.9875845483240643, + "flos": 30649019626080.0, + "grad_norm": 3.3807410286418667, + "language_loss": 0.79959923, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.82099223, + "num_input_tokens_seen": 354521855, + "router_z_loss_clip": 0.72753906, + "router_z_loss_mlp": 0.11737061, + "step": 16426, + "time_per_iteration": 2.6628639698028564 + }, + { + "auxiliary_loss_clip": 0.01111487, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.03977346, + "balance_loss_mlp": 1.01935422, + "epoch": 0.9876446715767323, + "flos": 20232783620640.0, + "grad_norm": 1.8943602937808581, + "language_loss": 0.84620839, + "learning_rate": 1.593380599750338e-09, + "loss": 0.8676284, + "num_input_tokens_seen": 354539535, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.11151123, + "step": 16427, + "time_per_iteration": 4.07166051864624 + }, + { + "auxiliary_loss_clip": 0.01107045, + "auxiliary_loss_mlp": 0.01029838, + "balance_loss_clip": 1.03709149, + "balance_loss_mlp": 1.01878715, + "epoch": 0.9877047948294003, + "flos": 26376150191040.0, + "grad_norm": 1.9798195538481733, + "language_loss": 0.70350754, + "learning_rate": 1.577875377599458e-09, + "loss": 0.7248764, + "num_input_tokens_seen": 354557430, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.11065674, + "step": 16428, + "time_per_iteration": 2.609204053878784 + }, + { + "auxiliary_loss_clip": 0.01106477, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.03684616, + "balance_loss_mlp": 1.01975155, + "epoch": 0.9877649180820682, + "flos": 25842087093600.0, + "grad_norm": 2.917639833927349, + "language_loss": 0.80301863, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.82439184, + "num_input_tokens_seen": 354574735, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.11096191, + "step": 16429, + "time_per_iteration": 2.6648521423339844 + }, + { + "auxiliary_loss_clip": 0.01108095, + "auxiliary_loss_mlp": 0.01027312, + "balance_loss_clip": 1.03705549, + "balance_loss_mlp": 1.01657128, + "epoch": 0.9878250413347363, + "flos": 48503572895520.0, + "grad_norm": 1.7054533511154675, + "language_loss": 0.62232268, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.64367676, + "num_input_tokens_seen": 354597050, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.10742188, + "step": 16430, + "time_per_iteration": 2.8033668994903564 + }, + { + "auxiliary_loss_clip": 0.01110143, + "auxiliary_loss_mlp": 0.01030718, + "balance_loss_clip": 1.03763795, + "balance_loss_mlp": 1.02000082, + "epoch": 0.9878851645874042, + "flos": 35905752014400.0, + "grad_norm": 1.3279156381886097, + "language_loss": 0.73039997, + "learning_rate": 1.531814395687725e-09, + "loss": 0.75180852, + "num_input_tokens_seen": 354619095, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10717773, + "step": 16431, + "time_per_iteration": 2.701331377029419 + }, + { + "auxiliary_loss_clip": 0.01109924, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.03872073, + "balance_loss_mlp": 1.01987791, + "epoch": 0.9879452878400722, + "flos": 19287284181120.0, + "grad_norm": 2.230954223043708, + "language_loss": 0.80574083, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.82715034, + "num_input_tokens_seen": 354633790, + "router_z_loss_clip": 0.71142578, + "router_z_loss_mlp": 0.11151123, + "step": 16432, + "time_per_iteration": 2.667362928390503 + }, + { + "auxiliary_loss_clip": 0.01105442, + "auxiliary_loss_mlp": 0.0102533, + "balance_loss_clip": 1.03539133, + "balance_loss_mlp": 1.01522648, + "epoch": 0.9880054110927401, + "flos": 27130230884160.0, + "grad_norm": 1.5520586645179564, + "language_loss": 0.80732965, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.82863736, + "num_input_tokens_seen": 354653180, + "router_z_loss_clip": 0.69970703, + "router_z_loss_mlp": 0.10107422, + "step": 16433, + "time_per_iteration": 2.6334474086761475 + }, + { + "auxiliary_loss_clip": 0.01106975, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.03748989, + "balance_loss_mlp": 1.01916456, + "epoch": 0.9880655343454081, + "flos": 35098640552160.0, + "grad_norm": 2.298922475926436, + "language_loss": 0.65168512, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.67305851, + "num_input_tokens_seen": 354669900, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.11187744, + "step": 16434, + "time_per_iteration": 2.6849915981292725 + }, + { + "auxiliary_loss_clip": 0.01109853, + "auxiliary_loss_mlp": 0.01027481, + "balance_loss_clip": 1.03580809, + "balance_loss_mlp": 1.01646006, + "epoch": 0.988125657598076, + "flos": 40088726305920.0, + "grad_norm": 1.4732840969430085, + "language_loss": 0.69431764, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.71569097, + "num_input_tokens_seen": 354693165, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11016846, + "step": 16435, + "time_per_iteration": 2.6865017414093018 + }, + { + "auxiliary_loss_clip": 0.0110984, + "auxiliary_loss_mlp": 0.0103041, + "balance_loss_clip": 1.03864217, + "balance_loss_mlp": 1.0194788, + "epoch": 0.988185780850744, + "flos": 23660461700640.0, + "grad_norm": 1.6856099286335617, + "language_loss": 0.75499654, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.77639908, + "num_input_tokens_seen": 354711915, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10931396, + "step": 16436, + "time_per_iteration": 2.636482000350952 + }, + { + "auxiliary_loss_clip": 0.01108466, + "auxiliary_loss_mlp": 0.01027162, + "balance_loss_clip": 1.03707981, + "balance_loss_mlp": 1.01578379, + "epoch": 0.988245904103412, + "flos": 27489365768160.0, + "grad_norm": 2.7105061438582188, + "language_loss": 0.74144328, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.7627995, + "num_input_tokens_seen": 354729135, + "router_z_loss_clip": 0.71435547, + "router_z_loss_mlp": 0.11364746, + "step": 16437, + "time_per_iteration": 2.646209239959717 + }, + { + "auxiliary_loss_clip": 0.011048, + "auxiliary_loss_mlp": 0.01030644, + "balance_loss_clip": 1.035568, + "balance_loss_mlp": 1.01986742, + "epoch": 0.98830602735608, + "flos": 34970503446720.0, + "grad_norm": 1.9113938729434385, + "language_loss": 0.60518706, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.62654155, + "num_input_tokens_seen": 354752530, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.10784912, + "step": 16438, + "time_per_iteration": 2.762112617492676 + }, + { + "auxiliary_loss_clip": 0.01108773, + "auxiliary_loss_mlp": 0.01027902, + "balance_loss_clip": 1.03769481, + "balance_loss_mlp": 1.01661301, + "epoch": 0.9883661506087479, + "flos": 25619435874720.0, + "grad_norm": 1.9306628250312385, + "language_loss": 0.72318351, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.74455023, + "num_input_tokens_seen": 354771135, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.112854, + "step": 16439, + "time_per_iteration": 4.1262171268463135 + }, + { + "auxiliary_loss_clip": 0.01106846, + "auxiliary_loss_mlp": 0.01028095, + "balance_loss_clip": 1.03617382, + "balance_loss_mlp": 1.01695454, + "epoch": 0.9884262738614159, + "flos": 39909138605280.0, + "grad_norm": 1.9548375826951143, + "language_loss": 0.60260534, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.62395477, + "num_input_tokens_seen": 354791800, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 0.11138916, + "step": 16440, + "time_per_iteration": 2.745481491088867 + }, + { + "auxiliary_loss_clip": 0.01108777, + "auxiliary_loss_mlp": 0.0102854, + "balance_loss_clip": 1.03518629, + "balance_loss_mlp": 1.01785898, + "epoch": 0.9884863971140839, + "flos": 21432409855200.0, + "grad_norm": 2.350941408123709, + "language_loss": 0.75701511, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.77838832, + "num_input_tokens_seen": 354809200, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.10675049, + "step": 16441, + "time_per_iteration": 2.683138370513916 + }, + { + "auxiliary_loss_clip": 0.01108776, + "auxiliary_loss_mlp": 0.01025075, + "balance_loss_clip": 1.03706336, + "balance_loss_mlp": 1.01438236, + "epoch": 0.9885465203667518, + "flos": 49483748776320.0, + "grad_norm": 1.993474022360969, + "language_loss": 0.67714405, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.69848257, + "num_input_tokens_seen": 354829945, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10693359, + "step": 16442, + "time_per_iteration": 4.3227622509002686 + }, + { + "auxiliary_loss_clip": 0.01106379, + "auxiliary_loss_mlp": 0.01030576, + "balance_loss_clip": 1.03560328, + "balance_loss_mlp": 1.01960254, + "epoch": 0.9886066436194199, + "flos": 16848047507040.0, + "grad_norm": 2.32107874275121, + "language_loss": 0.74439728, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.76576686, + "num_input_tokens_seen": 354845055, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.10980225, + "step": 16443, + "time_per_iteration": 2.6566150188446045 + }, + { + "auxiliary_loss_clip": 0.01111045, + "auxiliary_loss_mlp": 0.01026512, + "balance_loss_clip": 1.03774774, + "balance_loss_mlp": 1.01466811, + "epoch": 0.9886667668720878, + "flos": 28462126986720.0, + "grad_norm": 2.1067561840176996, + "language_loss": 0.7339648, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.75534034, + "num_input_tokens_seen": 354864680, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11859131, + "step": 16444, + "time_per_iteration": 2.697760581970215 + }, + { + "auxiliary_loss_clip": 0.01110315, + "auxiliary_loss_mlp": 0.01027838, + "balance_loss_clip": 1.04024804, + "balance_loss_mlp": 1.01704919, + "epoch": 0.9887268901247558, + "flos": 27706547154240.0, + "grad_norm": 2.527837475503798, + "language_loss": 0.69483691, + "learning_rate": 1.325881465858547e-09, + "loss": 0.71621841, + "num_input_tokens_seen": 354885685, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10784912, + "step": 16445, + "time_per_iteration": 2.635434865951538 + }, + { + "auxiliary_loss_clip": 0.0111193, + "auxiliary_loss_mlp": 0.01025168, + "balance_loss_clip": 1.0395298, + "balance_loss_mlp": 1.01403403, + "epoch": 0.9887870133774237, + "flos": 15912515318400.0, + "grad_norm": 3.172744559202246, + "language_loss": 0.61122799, + "learning_rate": 1.311740377491155e-09, + "loss": 0.632599, + "num_input_tokens_seen": 354901505, + "router_z_loss_clip": 0.72412109, + "router_z_loss_mlp": 0.11138916, + "step": 16446, + "time_per_iteration": 2.6247751712799072 + }, + { + "auxiliary_loss_clip": 0.01108347, + "auxiliary_loss_mlp": 0.01028061, + "balance_loss_clip": 1.03720999, + "balance_loss_mlp": 1.0175643, + "epoch": 0.9888471366300917, + "flos": 18496420148160.0, + "grad_norm": 2.5922624379069616, + "language_loss": 0.70956057, + "learning_rate": 1.297675079582783e-09, + "loss": 0.73092461, + "num_input_tokens_seen": 354920060, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10498047, + "step": 16447, + "time_per_iteration": 2.593322515487671 + }, + { + "auxiliary_loss_clip": 0.01107663, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.03709865, + "balance_loss_mlp": 1.01902997, + "epoch": 0.9889072598827596, + "flos": 30650356696320.0, + "grad_norm": 2.21618135884544, + "language_loss": 0.83804405, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.85941505, + "num_input_tokens_seen": 354938690, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10400391, + "step": 16448, + "time_per_iteration": 2.656862497329712 + }, + { + "auxiliary_loss_clip": 0.01105436, + "auxiliary_loss_mlp": 0.0102391, + "balance_loss_clip": 1.03614974, + "balance_loss_mlp": 1.01408732, + "epoch": 0.9889673831354276, + "flos": 20411925631200.0, + "grad_norm": 1.472383933544256, + "language_loss": 0.69778383, + "learning_rate": 1.26977185727406e-09, + "loss": 0.71907729, + "num_input_tokens_seen": 354956955, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.0982666, + "step": 16449, + "time_per_iteration": 2.6029059886932373 + }, + { + "auxiliary_loss_clip": 0.01110435, + "auxiliary_loss_mlp": 0.01026654, + "balance_loss_clip": 1.03694558, + "balance_loss_mlp": 1.01549029, + "epoch": 0.9890275063880956, + "flos": 43422700618080.0, + "grad_norm": 2.377293702923003, + "language_loss": 0.73785287, + "learning_rate": 1.25593393393153e-09, + "loss": 0.75922382, + "num_input_tokens_seen": 354976800, + "router_z_loss_clip": 0.73486328, + "router_z_loss_mlp": 0.11169434, + "step": 16450, + "time_per_iteration": 2.7388319969177246 + }, + { + "auxiliary_loss_clip": 0.01109471, + "auxiliary_loss_mlp": 0.01028617, + "balance_loss_clip": 1.03525615, + "balance_loss_mlp": 1.01757812, + "epoch": 0.9890876296407636, + "flos": 23125628774880.0, + "grad_norm": 3.094380186460157, + "language_loss": 0.7938965, + "learning_rate": 1.242171803164549e-09, + "loss": 0.8152774, + "num_input_tokens_seen": 354996625, + "router_z_loss_clip": 0.7421875, + "router_z_loss_mlp": 0.11035156, + "step": 16451, + "time_per_iteration": 4.107515096664429 + }, + { + "auxiliary_loss_clip": 0.01108241, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.0350306, + "balance_loss_mlp": 1.02114189, + "epoch": 0.9891477528934315, + "flos": 29003564229120.0, + "grad_norm": 3.11316991726688, + "language_loss": 0.70603067, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.72743911, + "num_input_tokens_seen": 355014535, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11462402, + "step": 16452, + "time_per_iteration": 2.6237943172454834 + }, + { + "auxiliary_loss_clip": 0.0110587, + "auxiliary_loss_mlp": 0.01024607, + "balance_loss_clip": 1.03740466, + "balance_loss_mlp": 1.01478434, + "epoch": 0.9892078761460995, + "flos": 25347562511040.0, + "grad_norm": 1.663620134685855, + "language_loss": 0.73738897, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.7586937, + "num_input_tokens_seen": 355033280, + "router_z_loss_clip": 0.68505859, + "router_z_loss_mlp": 0.09820557, + "step": 16453, + "time_per_iteration": 2.678572654724121 + }, + { + "auxiliary_loss_clip": 0.01109187, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.03660119, + "balance_loss_mlp": 1.02278972, + "epoch": 0.9892679993987675, + "flos": 28513658616480.0, + "grad_norm": 2.087474228985757, + "language_loss": 0.69672, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.71814519, + "num_input_tokens_seen": 355053320, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.10546875, + "step": 16454, + "time_per_iteration": 2.6401419639587402 + }, + { + "auxiliary_loss_clip": 0.01105467, + "auxiliary_loss_mlp": 0.01028892, + "balance_loss_clip": 1.03703475, + "balance_loss_mlp": 1.01847351, + "epoch": 0.9893281226514354, + "flos": 27705858360480.0, + "grad_norm": 1.8865333457068207, + "language_loss": 0.75941575, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.78075933, + "num_input_tokens_seen": 355070230, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.10418701, + "step": 16455, + "time_per_iteration": 2.732355833053589 + }, + { + "auxiliary_loss_clip": 0.01106337, + "auxiliary_loss_mlp": 0.01025314, + "balance_loss_clip": 1.03676319, + "balance_loss_mlp": 1.01431704, + "epoch": 0.9893882459041035, + "flos": 26596613476800.0, + "grad_norm": 2.835136553624787, + "language_loss": 0.65590966, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.67722625, + "num_input_tokens_seen": 355090125, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10998535, + "step": 16456, + "time_per_iteration": 2.628950357437134 + }, + { + "auxiliary_loss_clip": 0.0111107, + "auxiliary_loss_mlp": 0.01028166, + "balance_loss_clip": 1.03856921, + "balance_loss_mlp": 1.01706803, + "epoch": 0.9894483691567714, + "flos": 22102267824000.0, + "grad_norm": 1.7759109916875255, + "language_loss": 0.73999488, + "learning_rate": 1.161190691666203e-09, + "loss": 0.76138717, + "num_input_tokens_seen": 355107890, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11108398, + "step": 16457, + "time_per_iteration": 2.6439552307128906 + }, + { + "auxiliary_loss_clip": 0.01110465, + "auxiliary_loss_mlp": 0.01024627, + "balance_loss_clip": 1.0387907, + "balance_loss_mlp": 1.01359475, + "epoch": 0.9895084924094394, + "flos": 38085311026080.0, + "grad_norm": 2.1964001057441758, + "language_loss": 0.68889296, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.71024388, + "num_input_tokens_seen": 355126340, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11022949, + "step": 16458, + "time_per_iteration": 2.685117244720459 + }, + { + "auxiliary_loss_clip": 0.01106783, + "auxiliary_loss_mlp": 0.01028617, + "balance_loss_clip": 1.03708124, + "balance_loss_mlp": 1.01767385, + "epoch": 0.9895686156621073, + "flos": 24011533645920.0, + "grad_norm": 1.7152301159203545, + "language_loss": 0.79306734, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.8144213, + "num_input_tokens_seen": 355144025, + "router_z_loss_clip": 0.69677734, + "router_z_loss_mlp": 0.10949707, + "step": 16459, + "time_per_iteration": 2.6241841316223145 + }, + { + "auxiliary_loss_clip": 0.01108033, + "auxiliary_loss_mlp": 0.01029441, + "balance_loss_clip": 1.03567231, + "balance_loss_mlp": 1.01863432, + "epoch": 0.9896287389147753, + "flos": 28775078521920.0, + "grad_norm": 2.4298585222890576, + "language_loss": 0.70599234, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.72736704, + "num_input_tokens_seen": 355163125, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.1081543, + "step": 16460, + "time_per_iteration": 2.6561594009399414 + }, + { + "auxiliary_loss_clip": 0.01111645, + "auxiliary_loss_mlp": 0.01026624, + "balance_loss_clip": 1.03760815, + "balance_loss_mlp": 1.01543593, + "epoch": 0.9896888621674432, + "flos": 36125283402720.0, + "grad_norm": 1.6141863844420676, + "language_loss": 0.87556034, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.89694303, + "num_input_tokens_seen": 355184060, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11181641, + "step": 16461, + "time_per_iteration": 2.696953535079956 + }, + { + "auxiliary_loss_clip": 0.01107665, + "auxiliary_loss_mlp": 0.01028753, + "balance_loss_clip": 1.036479, + "balance_loss_mlp": 1.01702237, + "epoch": 0.9897489854201112, + "flos": 28913061326400.0, + "grad_norm": 1.8953591520187336, + "language_loss": 0.63153541, + "learning_rate": 1.09579082189315e-09, + "loss": 0.6528995, + "num_input_tokens_seen": 355204505, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.11730957, + "step": 16462, + "time_per_iteration": 2.696256637573242 + }, + { + "auxiliary_loss_clip": 0.01111165, + "auxiliary_loss_mlp": 0.01030248, + "balance_loss_clip": 1.03962708, + "balance_loss_mlp": 1.01948893, + "epoch": 0.9898091086727792, + "flos": 16136746711200.0, + "grad_norm": 1.864099473819449, + "language_loss": 0.72949088, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.75090504, + "num_input_tokens_seen": 355223055, + "router_z_loss_clip": 0.71582031, + "router_z_loss_mlp": 0.10754395, + "step": 16463, + "time_per_iteration": 2.5846774578094482 + }, + { + "auxiliary_loss_clip": 0.01108926, + "auxiliary_loss_mlp": 0.01029321, + "balance_loss_clip": 1.0371387, + "balance_loss_mlp": 1.01733446, + "epoch": 0.9898692319254472, + "flos": 27979838622720.0, + "grad_norm": 1.8030538620965368, + "language_loss": 0.69969368, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.72107613, + "num_input_tokens_seen": 355242000, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.11981201, + "step": 16464, + "time_per_iteration": 2.7367796897888184 + }, + { + "auxiliary_loss_clip": 0.01110975, + "auxiliary_loss_mlp": 0.01029936, + "balance_loss_clip": 1.03773189, + "balance_loss_mlp": 1.01855731, + "epoch": 0.9899293551781151, + "flos": 15201092970720.0, + "grad_norm": 3.4597886220970167, + "language_loss": 0.72906744, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.75047654, + "num_input_tokens_seen": 355260175, + "router_z_loss_clip": 0.73291016, + "router_z_loss_mlp": 0.11383057, + "step": 16465, + "time_per_iteration": 2.74831485748291 + }, + { + "auxiliary_loss_clip": 0.01105675, + "auxiliary_loss_mlp": 0.01031404, + "balance_loss_clip": 1.03497767, + "balance_loss_mlp": 1.02145624, + "epoch": 0.9899894784307831, + "flos": 32788594432800.0, + "grad_norm": 1.6650616034703263, + "language_loss": 0.86463416, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.88600487, + "num_input_tokens_seen": 355281930, + "router_z_loss_clip": 0.70751953, + "router_z_loss_mlp": 0.09942627, + "step": 16466, + "time_per_iteration": 2.7011003494262695 + }, + { + "auxiliary_loss_clip": 0.01108653, + "auxiliary_loss_mlp": 0.01023999, + "balance_loss_clip": 1.037269, + "balance_loss_mlp": 1.01335406, + "epoch": 0.990049601683451, + "flos": 26285890392000.0, + "grad_norm": 2.1256435635414648, + "language_loss": 0.71816993, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.73949641, + "num_input_tokens_seen": 355301555, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10638428, + "step": 16467, + "time_per_iteration": 4.1038055419921875 + }, + { + "auxiliary_loss_clip": 0.01108256, + "auxiliary_loss_mlp": 0.010304, + "balance_loss_clip": 1.03730178, + "balance_loss_mlp": 1.0196116, + "epoch": 0.990109724936119, + "flos": 35107635388320.0, + "grad_norm": 1.4581708653042569, + "language_loss": 0.64950371, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67089021, + "num_input_tokens_seen": 355324925, + "router_z_loss_clip": 0.70947266, + "router_z_loss_mlp": 0.10791016, + "step": 16468, + "time_per_iteration": 2.7524945735931396 + }, + { + "auxiliary_loss_clip": 0.01112221, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.03726101, + "balance_loss_mlp": 1.01850402, + "epoch": 0.9901698481887871, + "flos": 35721545343840.0, + "grad_norm": 4.419465138209186, + "language_loss": 0.6200152, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.64143336, + "num_input_tokens_seen": 355343875, + "router_z_loss_clip": 0.74902344, + "router_z_loss_mlp": 0.11096191, + "step": 16469, + "time_per_iteration": 2.7409932613372803 + }, + { + "auxiliary_loss_clip": 0.01111594, + "auxiliary_loss_mlp": 0.01028511, + "balance_loss_clip": 1.03808713, + "balance_loss_mlp": 1.01717424, + "epoch": 0.990229971441455, + "flos": 19473071025600.0, + "grad_norm": 2.3225919585748054, + "language_loss": 0.70359641, + "learning_rate": 9.950925847685976e-10, + "loss": 0.7249974, + "num_input_tokens_seen": 355358835, + "router_z_loss_clip": 0.73583984, + "router_z_loss_mlp": 0.11334229, + "step": 16470, + "time_per_iteration": 2.583190441131592 + }, + { + "auxiliary_loss_clip": 0.0102691, + "auxiliary_loss_mlp": 0.01001016, + "balance_loss_clip": 1.00458109, + "balance_loss_mlp": 1.00012302, + "epoch": 0.990290094694123, + "flos": 85146111074880.0, + "grad_norm": 0.6772053339538322, + "language_loss": 0.55459499, + "learning_rate": 9.828464112755509e-10, + "loss": 0.57487422, + "num_input_tokens_seen": 355431225, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00892639, + "step": 16471, + "time_per_iteration": 3.431962490081787 + }, + { + "auxiliary_loss_clip": 0.01110282, + "auxiliary_loss_mlp": 0.01029604, + "balance_loss_clip": 1.03869653, + "balance_loss_mlp": 1.01851201, + "epoch": 0.9903502179467909, + "flos": 19830990391200.0, + "grad_norm": 2.1574993672418676, + "language_loss": 0.83602744, + "learning_rate": 9.706760407131032e-10, + "loss": 0.85742629, + "num_input_tokens_seen": 355448250, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11096191, + "step": 16472, + "time_per_iteration": 2.600372552871704 + }, + { + "auxiliary_loss_clip": 0.01109978, + "auxiliary_loss_mlp": 0.01025829, + "balance_loss_clip": 1.03820348, + "balance_loss_mlp": 1.01467156, + "epoch": 0.9904103411994589, + "flos": 26464384126080.0, + "grad_norm": 2.4858553853069347, + "language_loss": 0.86016011, + "learning_rate": 9.585814735431075e-10, + "loss": 0.88151824, + "num_input_tokens_seen": 355467040, + "router_z_loss_clip": 0.71728516, + "router_z_loss_mlp": 0.11157227, + "step": 16473, + "time_per_iteration": 2.6351001262664795 + }, + { + "auxiliary_loss_clip": 0.01106468, + "auxiliary_loss_mlp": 0.01027572, + "balance_loss_clip": 1.03537786, + "balance_loss_mlp": 1.0173434, + "epoch": 0.9904704644521268, + "flos": 31407678771840.0, + "grad_norm": 1.6115281728003565, + "language_loss": 0.842345, + "learning_rate": 9.465627102240859e-10, + "loss": 0.86368537, + "num_input_tokens_seen": 355487825, + "router_z_loss_clip": 0.71044922, + "router_z_loss_mlp": 0.10235596, + "step": 16474, + "time_per_iteration": 2.656940460205078 + }, + { + "auxiliary_loss_clip": 0.01104501, + "auxiliary_loss_mlp": 0.01030501, + "balance_loss_clip": 1.03330052, + "balance_loss_mlp": 1.02018964, + "epoch": 0.9905305877047949, + "flos": 26732691969120.0, + "grad_norm": 6.177966103212802, + "language_loss": 0.76361448, + "learning_rate": 9.346197512116738e-10, + "loss": 0.78496456, + "num_input_tokens_seen": 355507445, + "router_z_loss_clip": 0.71289062, + "router_z_loss_mlp": 0.10302734, + "step": 16475, + "time_per_iteration": 2.6383907794952393 + }, + { + "auxiliary_loss_clip": 0.01107857, + "auxiliary_loss_mlp": 0.0103195, + "balance_loss_clip": 1.03499782, + "balance_loss_mlp": 1.02091122, + "epoch": 0.9905907109574628, + "flos": 26103588033600.0, + "grad_norm": 1.652747858681304, + "language_loss": 0.76018298, + "learning_rate": 9.227525969588423e-10, + "loss": 0.78158104, + "num_input_tokens_seen": 355527205, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11035156, + "step": 16476, + "time_per_iteration": 2.611443519592285 + }, + { + "auxiliary_loss_clip": 0.01113142, + "auxiliary_loss_mlp": 0.01027225, + "balance_loss_clip": 1.03748751, + "balance_loss_mlp": 1.01477969, + "epoch": 0.9906508342101308, + "flos": 25040526498720.0, + "grad_norm": 6.254247868214429, + "language_loss": 0.67385155, + "learning_rate": 9.109612479154538e-10, + "loss": 0.69525528, + "num_input_tokens_seen": 355544740, + "router_z_loss_clip": 0.75732422, + "router_z_loss_mlp": 0.12451172, + "step": 16477, + "time_per_iteration": 2.667201519012451 + }, + { + "auxiliary_loss_clip": 0.01114262, + "auxiliary_loss_mlp": 0.01031128, + "balance_loss_clip": 1.03928351, + "balance_loss_mlp": 1.01933837, + "epoch": 0.9907109574627987, + "flos": 26065021933440.0, + "grad_norm": 2.1732433706108467, + "language_loss": 0.71689403, + "learning_rate": 8.992457045289282e-10, + "loss": 0.73834795, + "num_input_tokens_seen": 355564385, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.11791992, + "step": 16478, + "time_per_iteration": 4.1898534297943115 + }, + { + "auxiliary_loss_clip": 0.01110182, + "auxiliary_loss_mlp": 0.01035748, + "balance_loss_clip": 1.03771615, + "balance_loss_mlp": 1.02398789, + "epoch": 0.9907710807154667, + "flos": 21157173557280.0, + "grad_norm": 2.1971632425546024, + "language_loss": 0.80605793, + "learning_rate": 8.876059672433545e-10, + "loss": 0.82751715, + "num_input_tokens_seen": 355579260, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11767578, + "step": 16479, + "time_per_iteration": 2.578523635864258 + }, + { + "auxiliary_loss_clip": 0.01110356, + "auxiliary_loss_mlp": 0.01032213, + "balance_loss_clip": 1.03839135, + "balance_loss_mlp": 1.02199066, + "epoch": 0.9908312039681346, + "flos": 34925049408960.0, + "grad_norm": 1.5967269411618261, + "language_loss": 0.66256267, + "learning_rate": 8.760420364999355e-10, + "loss": 0.68398833, + "num_input_tokens_seen": 355599790, + "router_z_loss_clip": 0.71826172, + "router_z_loss_mlp": 0.10223389, + "step": 16480, + "time_per_iteration": 2.700172185897827 + }, + { + "auxiliary_loss_clip": 0.01106516, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.0360775, + "balance_loss_mlp": 1.01918745, + "epoch": 0.9908913272208026, + "flos": 43648755288480.0, + "grad_norm": 1.8848816193425615, + "language_loss": 0.72283959, + "learning_rate": 8.645539127374313e-10, + "loss": 0.74420375, + "num_input_tokens_seen": 355620925, + "router_z_loss_clip": 0.70458984, + "router_z_loss_mlp": 0.10723877, + "step": 16481, + "time_per_iteration": 4.26386570930481 + }, + { + "auxiliary_loss_clip": 0.01106383, + "auxiliary_loss_mlp": 0.01020425, + "balance_loss_clip": 1.03687227, + "balance_loss_mlp": 1.00979769, + "epoch": 0.9909514504734707, + "flos": 24284582010720.0, + "grad_norm": 1.799002815034142, + "language_loss": 0.77890074, + "learning_rate": 8.531415963912713e-10, + "loss": 0.80016887, + "num_input_tokens_seen": 355639165, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.10626221, + "step": 16482, + "time_per_iteration": 2.6470417976379395 + }, + { + "auxiliary_loss_clip": 0.01110393, + "auxiliary_loss_mlp": 0.01028057, + "balance_loss_clip": 1.03713882, + "balance_loss_mlp": 1.01691699, + "epoch": 0.9910115737261386, + "flos": 24410409631200.0, + "grad_norm": 2.0779287305477308, + "language_loss": 0.75550067, + "learning_rate": 8.418050878944427e-10, + "loss": 0.77688515, + "num_input_tokens_seen": 355657320, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11132812, + "step": 16483, + "time_per_iteration": 2.6650164127349854 + }, + { + "auxiliary_loss_clip": 0.0102699, + "auxiliary_loss_mlp": 0.01001531, + "balance_loss_clip": 1.00463438, + "balance_loss_mlp": 1.00061071, + "epoch": 0.9910716969788066, + "flos": 86254667164800.0, + "grad_norm": 0.6773496146121382, + "language_loss": 0.53639752, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55668271, + "num_input_tokens_seen": 355726370, + "router_z_loss_clip": 0.22338867, + "router_z_loss_mlp": 0.00920868, + "step": 16484, + "time_per_iteration": 3.3871748447418213 + }, + { + "auxiliary_loss_clip": 0.01105811, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.03731334, + "balance_loss_mlp": 1.02003074, + "epoch": 0.9911318202314745, + "flos": 26153904144960.0, + "grad_norm": 1.8424256583796688, + "language_loss": 0.81853247, + "learning_rate": 8.19359496165184e-10, + "loss": 0.83989781, + "num_input_tokens_seen": 355745840, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.10699463, + "step": 16485, + "time_per_iteration": 2.634061098098755 + }, + { + "auxiliary_loss_clip": 0.01109901, + "auxiliary_loss_mlp": 0.01031685, + "balance_loss_clip": 1.03999913, + "balance_loss_mlp": 1.02007437, + "epoch": 0.9911919434841425, + "flos": 24192823072320.0, + "grad_norm": 1.7438172207219578, + "language_loss": 0.81292248, + "learning_rate": 8.082504137836288e-10, + "loss": 0.83433837, + "num_input_tokens_seen": 355763385, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.1161499, + "step": 16486, + "time_per_iteration": 2.6128664016723633 + }, + { + "auxiliary_loss_clip": 0.01110492, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.03798485, + "balance_loss_mlp": 1.01622391, + "epoch": 0.9912520667368104, + "flos": 50907687438240.0, + "grad_norm": 1.4859566431943279, + "language_loss": 0.6599763, + "learning_rate": 7.972171409538209e-10, + "loss": 0.68135238, + "num_input_tokens_seen": 355786075, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10888672, + "step": 16487, + "time_per_iteration": 2.801917314529419 + }, + { + "auxiliary_loss_clip": 0.0110582, + "auxiliary_loss_mlp": 0.01026469, + "balance_loss_clip": 1.03657401, + "balance_loss_mlp": 1.01642013, + "epoch": 0.9913121899894785, + "flos": 29003969401920.0, + "grad_norm": 1.5952388118079532, + "language_loss": 0.76874542, + "learning_rate": 7.862596780936481e-10, + "loss": 0.79006827, + "num_input_tokens_seen": 355806295, + "router_z_loss_clip": 0.69238281, + "router_z_loss_mlp": 0.10058594, + "step": 16488, + "time_per_iteration": 2.6709325313568115 + }, + { + "auxiliary_loss_clip": 0.0111368, + "auxiliary_loss_mlp": 0.01030471, + "balance_loss_clip": 1.03763938, + "balance_loss_mlp": 1.01907444, + "epoch": 0.9913723132421464, + "flos": 29003969401920.0, + "grad_norm": 2.0397840505708973, + "language_loss": 0.690207, + "learning_rate": 7.753780256190001e-10, + "loss": 0.71164846, + "num_input_tokens_seen": 355825730, + "router_z_loss_clip": 0.75976562, + "router_z_loss_mlp": 0.11407471, + "step": 16489, + "time_per_iteration": 2.624833345413208 + }, + { + "auxiliary_loss_clip": 0.0102707, + "auxiliary_loss_mlp": 0.01001081, + "balance_loss_clip": 1.0047226, + "balance_loss_mlp": 1.0001545, + "epoch": 0.9914324364948144, + "flos": 86961632611680.0, + "grad_norm": 0.6066768377778952, + "language_loss": 0.52564657, + "learning_rate": 7.645721839424357e-10, + "loss": 0.54592812, + "num_input_tokens_seen": 355891545, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00924683, + "step": 16490, + "time_per_iteration": 4.758643388748169 + }, + { + "auxiliary_loss_clip": 0.01113894, + "auxiliary_loss_mlp": 0.01035896, + "balance_loss_clip": 1.03828645, + "balance_loss_mlp": 1.023772, + "epoch": 0.9914925597474823, + "flos": 28913507016480.0, + "grad_norm": 3.3588964196393785, + "language_loss": 0.75422126, + "learning_rate": 7.538421534734052e-10, + "loss": 0.77571917, + "num_input_tokens_seen": 355909920, + "router_z_loss_clip": 0.75634766, + "router_z_loss_mlp": 0.12121582, + "step": 16491, + "time_per_iteration": 2.624025821685791 + }, + { + "auxiliary_loss_clip": 0.01115002, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.04064965, + "balance_loss_mlp": 1.01623285, + "epoch": 0.9915526830001503, + "flos": 16392332128320.0, + "grad_norm": 2.739368182617376, + "language_loss": 0.7068367, + "learning_rate": 7.431879346191383e-10, + "loss": 0.72826749, + "num_input_tokens_seen": 355923130, + "router_z_loss_clip": 0.74316406, + "router_z_loss_mlp": 0.11846924, + "step": 16492, + "time_per_iteration": 2.5728557109832764 + }, + { + "auxiliary_loss_clip": 0.01108925, + "auxiliary_loss_mlp": 0.01027799, + "balance_loss_clip": 1.03702033, + "balance_loss_mlp": 1.01640224, + "epoch": 0.9916128062528182, + "flos": 25308145548000.0, + "grad_norm": 2.088422013466341, + "language_loss": 0.67482603, + "learning_rate": 7.326095277837563e-10, + "loss": 0.69619334, + "num_input_tokens_seen": 355941960, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.11413574, + "step": 16493, + "time_per_iteration": 2.6487812995910645 + }, + { + "auxiliary_loss_clip": 0.0111189, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.03740025, + "balance_loss_mlp": 1.02264297, + "epoch": 0.9916729295054862, + "flos": 27439373795040.0, + "grad_norm": 1.769514747420674, + "language_loss": 0.7148639, + "learning_rate": 7.221069333678276e-10, + "loss": 0.73632193, + "num_input_tokens_seen": 355961640, + "router_z_loss_clip": 0.74414062, + "router_z_loss_mlp": 0.11273193, + "step": 16494, + "time_per_iteration": 2.627514362335205 + }, + { + "auxiliary_loss_clip": 0.01110658, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.03765821, + "balance_loss_mlp": 1.01852989, + "epoch": 0.9917330527581543, + "flos": 18049456501920.0, + "grad_norm": 4.349873775016051, + "language_loss": 0.68361855, + "learning_rate": 7.116801517701443e-10, + "loss": 0.7050299, + "num_input_tokens_seen": 355977980, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11956787, + "step": 16495, + "time_per_iteration": 2.6024160385131836 + }, + { + "auxiliary_loss_clip": 0.01026981, + "auxiliary_loss_mlp": 0.01000899, + "balance_loss_clip": 1.00463533, + "balance_loss_mlp": 0.99999237, + "epoch": 0.9917931760108222, + "flos": 72225695545920.0, + "grad_norm": 0.7577152712551185, + "language_loss": 0.53403211, + "learning_rate": 7.013291833859458e-10, + "loss": 0.55431092, + "num_input_tokens_seen": 356042900, + "router_z_loss_clip": 0.22351074, + "router_z_loss_mlp": 0.00905609, + "step": 16496, + "time_per_iteration": 3.3585431575775146 + }, + { + "auxiliary_loss_clip": 0.01110668, + "auxiliary_loss_mlp": 0.01029926, + "balance_loss_clip": 1.03744125, + "balance_loss_mlp": 1.01797521, + "epoch": 0.9918532992634902, + "flos": 32654987494560.0, + "grad_norm": 1.7055759591069615, + "language_loss": 0.71399224, + "learning_rate": 6.91054028607585e-10, + "loss": 0.73539817, + "num_input_tokens_seen": 356063000, + "router_z_loss_clip": 0.73144531, + "router_z_loss_mlp": 0.11956787, + "step": 16497, + "time_per_iteration": 2.7100272178649902 + }, + { + "auxiliary_loss_clip": 0.01113091, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.03801012, + "balance_loss_mlp": 1.02069688, + "epoch": 0.9919134225161581, + "flos": 18272999100960.0, + "grad_norm": 3.527219732681712, + "language_loss": 0.8215031, + "learning_rate": 6.808546878249721e-10, + "loss": 0.8429603, + "num_input_tokens_seen": 356078130, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.1192627, + "step": 16498, + "time_per_iteration": 2.6078808307647705 + }, + { + "auxiliary_loss_clip": 0.01112134, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.03874183, + "balance_loss_mlp": 1.02352464, + "epoch": 0.9919735457688261, + "flos": 33944427838080.0, + "grad_norm": 1.4985192980141657, + "language_loss": 0.67960495, + "learning_rate": 6.707311614246869e-10, + "loss": 0.7010743, + "num_input_tokens_seen": 356101655, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11279297, + "step": 16499, + "time_per_iteration": 2.6894354820251465 + }, + { + "auxiliary_loss_clip": 0.01111377, + "auxiliary_loss_mlp": 0.01028696, + "balance_loss_clip": 1.03850412, + "balance_loss_mlp": 1.01778889, + "epoch": 0.992033669021494, + "flos": 27531497388960.0, + "grad_norm": 2.193155438763601, + "language_loss": 0.82030153, + "learning_rate": 6.606834497904223e-10, + "loss": 0.84170222, + "num_input_tokens_seen": 356121425, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.10906982, + "step": 16500, + "time_per_iteration": 2.6458117961883545 + }, + { + "auxiliary_loss_clip": 0.01112502, + "auxiliary_loss_mlp": 0.0102867, + "balance_loss_clip": 1.03836012, + "balance_loss_mlp": 1.01737547, + "epoch": 0.9920937922741621, + "flos": 30962457368640.0, + "grad_norm": 1.9174352918447102, + "language_loss": 0.8181783, + "learning_rate": 6.507115533036511e-10, + "loss": 0.83959001, + "num_input_tokens_seen": 356140710, + "router_z_loss_clip": 0.74121094, + "router_z_loss_mlp": 0.11297607, + "step": 16501, + "time_per_iteration": 2.6747400760650635 + }, + { + "auxiliary_loss_clip": 0.01110721, + "auxiliary_loss_mlp": 0.01028072, + "balance_loss_clip": 1.03812277, + "balance_loss_mlp": 1.01673484, + "epoch": 0.99215391552683, + "flos": 26910983116800.0, + "grad_norm": 1.9630109338500292, + "language_loss": 0.77088922, + "learning_rate": 6.408154723420711e-10, + "loss": 0.7922771, + "num_input_tokens_seen": 356159835, + "router_z_loss_clip": 0.72558594, + "router_z_loss_mlp": 0.11328125, + "step": 16502, + "time_per_iteration": 2.6759817600250244 + }, + { + "auxiliary_loss_clip": 0.01110601, + "auxiliary_loss_mlp": 0.01026432, + "balance_loss_clip": 1.03706717, + "balance_loss_mlp": 1.01454699, + "epoch": 0.992214038779498, + "flos": 18807710474880.0, + "grad_norm": 2.2744716420071054, + "language_loss": 0.72041839, + "learning_rate": 6.309952072811597e-10, + "loss": 0.74178869, + "num_input_tokens_seen": 356177555, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11901855, + "step": 16503, + "time_per_iteration": 2.591050148010254 + }, + { + "auxiliary_loss_clip": 0.01027066, + "auxiliary_loss_mlp": 0.01001059, + "balance_loss_clip": 1.00466967, + "balance_loss_mlp": 1.00011694, + "epoch": 0.9922741620321659, + "flos": 75669175365120.0, + "grad_norm": 0.63654708662066, + "language_loss": 0.55069184, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57097304, + "num_input_tokens_seen": 356244975, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.00939941, + "step": 16504, + "time_per_iteration": 3.3417584896087646 + }, + { + "auxiliary_loss_clip": 0.01106166, + "auxiliary_loss_mlp": 0.0102259, + "balance_loss_clip": 1.03528619, + "balance_loss_mlp": 1.01283884, + "epoch": 0.9923342852848339, + "flos": 20945056831200.0, + "grad_norm": 1.9581746567389573, + "language_loss": 0.69502389, + "learning_rate": 6.115821263481536e-10, + "loss": 0.71631145, + "num_input_tokens_seen": 356262605, + "router_z_loss_clip": 0.70800781, + "router_z_loss_mlp": 0.09753418, + "step": 16505, + "time_per_iteration": 2.592357635498047 + }, + { + "auxiliary_loss_clip": 0.01111754, + "auxiliary_loss_mlp": 0.0102704, + "balance_loss_clip": 1.03634191, + "balance_loss_mlp": 1.01483321, + "epoch": 0.9923944085375018, + "flos": 28288576360800.0, + "grad_norm": 2.367170968841848, + "language_loss": 0.65769792, + "learning_rate": 6.019893112119146e-10, + "loss": 0.67908585, + "num_input_tokens_seen": 356278935, + "router_z_loss_clip": 0.75439453, + "router_z_loss_mlp": 0.12219238, + "step": 16506, + "time_per_iteration": 4.093010663986206 + }, + { + "auxiliary_loss_clip": 0.01107544, + "auxiliary_loss_mlp": 0.01028999, + "balance_loss_clip": 1.03627002, + "balance_loss_mlp": 1.01734662, + "epoch": 0.9924545317901698, + "flos": 25397108794080.0, + "grad_norm": 2.8718688804424666, + "language_loss": 0.62827122, + "learning_rate": 5.924723134487219e-10, + "loss": 0.64963663, + "num_input_tokens_seen": 356295675, + "router_z_loss_clip": 0.71240234, + "router_z_loss_mlp": 0.11651611, + "step": 16507, + "time_per_iteration": 2.6230735778808594 + }, + { + "auxiliary_loss_clip": 0.01109672, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.03688359, + "balance_loss_mlp": 1.0217073, + "epoch": 0.9925146550428379, + "flos": 24505855642080.0, + "grad_norm": 2.7088731125695737, + "language_loss": 0.72972667, + "learning_rate": 5.830311334193983e-10, + "loss": 0.75115561, + "num_input_tokens_seen": 356312885, + "router_z_loss_clip": 0.72851562, + "router_z_loss_mlp": 0.11529541, + "step": 16508, + "time_per_iteration": 2.571793556213379 + }, + { + "auxiliary_loss_clip": 0.01110286, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.03759408, + "balance_loss_mlp": 1.01704621, + "epoch": 0.9925747782955058, + "flos": 30473240549760.0, + "grad_norm": 1.5952744630162332, + "language_loss": 0.70518309, + "learning_rate": 5.736657714818793e-10, + "loss": 0.72657299, + "num_input_tokens_seen": 356334070, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11663818, + "step": 16509, + "time_per_iteration": 2.674044370651245 + }, + { + "auxiliary_loss_clip": 0.01109301, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.03655398, + "balance_loss_mlp": 1.01929021, + "epoch": 0.9926349015481738, + "flos": 73791783941760.0, + "grad_norm": 1.6994971596603552, + "language_loss": 0.68422091, + "learning_rate": 5.643762279912146e-10, + "loss": 0.70562357, + "num_input_tokens_seen": 356359410, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11663818, + "step": 16510, + "time_per_iteration": 2.983825206756592 + }, + { + "auxiliary_loss_clip": 0.01111948, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.03808045, + "balance_loss_mlp": 1.02166438, + "epoch": 0.9926950248008417, + "flos": 25308996410880.0, + "grad_norm": 4.479741957101587, + "language_loss": 0.80914581, + "learning_rate": 5.551625032997886e-10, + "loss": 0.83059621, + "num_input_tokens_seen": 356378345, + "router_z_loss_clip": 0.73828125, + "router_z_loss_mlp": 0.11431885, + "step": 16511, + "time_per_iteration": 2.6244068145751953 + }, + { + "auxiliary_loss_clip": 0.01107182, + "auxiliary_loss_mlp": 0.0102845, + "balance_loss_clip": 1.03631234, + "balance_loss_mlp": 1.01790619, + "epoch": 0.9927551480535097, + "flos": 29715796922400.0, + "grad_norm": 2.024930065345239, + "language_loss": 0.91615534, + "learning_rate": 5.460245977570998e-10, + "loss": 0.93751168, + "num_input_tokens_seen": 356397345, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10540771, + "step": 16512, + "time_per_iteration": 2.641488790512085 + }, + { + "auxiliary_loss_clip": 0.0102707, + "auxiliary_loss_mlp": 0.01001498, + "balance_loss_clip": 1.00472665, + "balance_loss_mlp": 1.0005759, + "epoch": 0.9928152713061776, + "flos": 86970830034240.0, + "grad_norm": 0.7024405696304967, + "language_loss": 0.55165774, + "learning_rate": 5.369625117095378e-10, + "loss": 0.5719434, + "num_input_tokens_seen": 356459160, + "router_z_loss_clip": 0.2232666, + "router_z_loss_mlp": 0.00921631, + "step": 16513, + "time_per_iteration": 3.3531901836395264 + }, + { + "auxiliary_loss_clip": 0.01108662, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.03717875, + "balance_loss_mlp": 1.01751685, + "epoch": 0.9928753945588457, + "flos": 70542113388480.0, + "grad_norm": 1.464596686305068, + "language_loss": 0.65143967, + "learning_rate": 5.279762455006054e-10, + "loss": 0.67281324, + "num_input_tokens_seen": 356486405, + "router_z_loss_clip": 0.71533203, + "router_z_loss_mlp": 0.11175537, + "step": 16514, + "time_per_iteration": 2.984793186187744 + }, + { + "auxiliary_loss_clip": 0.01111721, + "auxiliary_loss_mlp": 0.01025397, + "balance_loss_clip": 1.0385977, + "balance_loss_mlp": 1.01347637, + "epoch": 0.9929355178115136, + "flos": 23878412915040.0, + "grad_norm": 1.8766615747336557, + "language_loss": 0.72885406, + "learning_rate": 5.190657994713632e-10, + "loss": 0.75022525, + "num_input_tokens_seen": 356502905, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 0.11920166, + "step": 16515, + "time_per_iteration": 2.5795538425445557 + }, + { + "auxiliary_loss_clip": 0.01110881, + "auxiliary_loss_mlp": 0.0102967, + "balance_loss_clip": 1.03864121, + "balance_loss_mlp": 1.01873231, + "epoch": 0.9929956410641816, + "flos": 28021848691680.0, + "grad_norm": 1.6583130952088263, + "language_loss": 0.77136588, + "learning_rate": 5.102311739593191e-10, + "loss": 0.79277146, + "num_input_tokens_seen": 356523830, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10943604, + "step": 16516, + "time_per_iteration": 2.660562515258789 + }, + { + "auxiliary_loss_clip": 0.01106302, + "auxiliary_loss_mlp": 0.01027824, + "balance_loss_clip": 1.03538656, + "balance_loss_mlp": 1.01759577, + "epoch": 0.9930557643168495, + "flos": 27535589634240.0, + "grad_norm": 1.6721181609229148, + "language_loss": 0.78034568, + "learning_rate": 5.014723692997602e-10, + "loss": 0.801687, + "num_input_tokens_seen": 356543965, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.10223389, + "step": 16517, + "time_per_iteration": 2.637488603591919 + }, + { + "auxiliary_loss_clip": 0.01114424, + "auxiliary_loss_mlp": 0.01035729, + "balance_loss_clip": 1.03941035, + "balance_loss_mlp": 1.02331972, + "epoch": 0.9931158875695175, + "flos": 20988525522240.0, + "grad_norm": 2.2099562737167404, + "language_loss": 0.6703524, + "learning_rate": 4.927893858248655e-10, + "loss": 0.691854, + "num_input_tokens_seen": 356561530, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 0.12402344, + "step": 16518, + "time_per_iteration": 4.082259893417358 + }, + { + "auxiliary_loss_clip": 0.01027031, + "auxiliary_loss_mlp": 0.01001222, + "balance_loss_clip": 1.00466955, + "balance_loss_mlp": 1.00031304, + "epoch": 0.9931760108221854, + "flos": 77741295251040.0, + "grad_norm": 0.7308471476881744, + "language_loss": 0.53439945, + "learning_rate": 4.84182223863483e-10, + "loss": 0.55468196, + "num_input_tokens_seen": 356616845, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.00908661, + "step": 16519, + "time_per_iteration": 3.1756057739257812 + }, + { + "auxiliary_loss_clip": 0.01107818, + "auxiliary_loss_mlp": 0.01028962, + "balance_loss_clip": 1.03736365, + "balance_loss_mlp": 1.01810229, + "epoch": 0.9932361340748534, + "flos": 18674508709440.0, + "grad_norm": 1.7715416044561032, + "language_loss": 0.60007739, + "learning_rate": 4.756508837426842e-10, + "loss": 0.62144524, + "num_input_tokens_seen": 356633560, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10858154, + "step": 16520, + "time_per_iteration": 2.5858044624328613 + }, + { + "auxiliary_loss_clip": 0.0110959, + "auxiliary_loss_mlp": 0.01029673, + "balance_loss_clip": 1.03794813, + "balance_loss_mlp": 1.01869941, + "epoch": 0.9932962573275215, + "flos": 44007606551520.0, + "grad_norm": 1.6366445145190265, + "language_loss": 0.61568487, + "learning_rate": 4.671953657853223e-10, + "loss": 0.63707751, + "num_input_tokens_seen": 356657600, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.10980225, + "step": 16521, + "time_per_iteration": 4.096699953079224 + }, + { + "auxiliary_loss_clip": 0.01112399, + "auxiliary_loss_mlp": 0.01030686, + "balance_loss_clip": 1.03868473, + "balance_loss_mlp": 1.01895571, + "epoch": 0.9933563805801894, + "flos": 26198426285280.0, + "grad_norm": 4.3925345313170014, + "language_loss": 0.74394822, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.76537913, + "num_input_tokens_seen": 356675880, + "router_z_loss_clip": 0.73730469, + "router_z_loss_mlp": 0.11737061, + "step": 16522, + "time_per_iteration": 2.6304728984832764 + }, + { + "auxiliary_loss_clip": 0.01108465, + "auxiliary_loss_mlp": 0.01027182, + "balance_loss_clip": 1.03777838, + "balance_loss_mlp": 1.01624513, + "epoch": 0.9934165038328574, + "flos": 29268792758880.0, + "grad_norm": 1.8734872581256252, + "language_loss": 0.73363376, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.75499028, + "num_input_tokens_seen": 356696000, + "router_z_loss_clip": 0.70605469, + "router_z_loss_mlp": 0.10943604, + "step": 16523, + "time_per_iteration": 2.6581339836120605 + }, + { + "auxiliary_loss_clip": 0.01109867, + "auxiliary_loss_mlp": 0.01027991, + "balance_loss_clip": 1.03764224, + "balance_loss_mlp": 1.01671398, + "epoch": 0.9934766270855253, + "flos": 26732084209920.0, + "grad_norm": 2.2807029488466375, + "language_loss": 0.71443635, + "learning_rate": 4.422837480875241e-10, + "loss": 0.73581493, + "num_input_tokens_seen": 356716845, + "router_z_loss_clip": 0.72314453, + "router_z_loss_mlp": 0.11273193, + "step": 16524, + "time_per_iteration": 2.6794745922088623 + }, + { + "auxiliary_loss_clip": 0.01109689, + "auxiliary_loss_mlp": 0.01028433, + "balance_loss_clip": 1.03823459, + "balance_loss_mlp": 1.01772165, + "epoch": 0.9935367503381933, + "flos": 20900696760000.0, + "grad_norm": 1.9522473763876074, + "language_loss": 0.79217923, + "learning_rate": 4.341315219624775e-10, + "loss": 0.81356049, + "num_input_tokens_seen": 356732100, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10717773, + "step": 16525, + "time_per_iteration": 2.6044504642486572 + }, + { + "auxiliary_loss_clip": 0.01107999, + "auxiliary_loss_mlp": 0.01023312, + "balance_loss_clip": 1.03780341, + "balance_loss_mlp": 1.01239777, + "epoch": 0.9935968735908612, + "flos": 27267038687520.0, + "grad_norm": 1.6282669296767518, + "language_loss": 0.74747592, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.76878905, + "num_input_tokens_seen": 356751480, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.10913086, + "step": 16526, + "time_per_iteration": 2.6775434017181396 + }, + { + "auxiliary_loss_clip": 0.01105588, + "auxiliary_loss_mlp": 0.010257, + "balance_loss_clip": 1.03586924, + "balance_loss_mlp": 1.01520324, + "epoch": 0.9936569968435293, + "flos": 35948491394400.0, + "grad_norm": 1.6075641267927605, + "language_loss": 0.72606695, + "learning_rate": 4.180545412333369e-10, + "loss": 0.74737978, + "num_input_tokens_seen": 356772650, + "router_z_loss_clip": 0.69726562, + "router_z_loss_mlp": 0.10491943, + "step": 16527, + "time_per_iteration": 2.679553270339966 + }, + { + "auxiliary_loss_clip": 0.01108732, + "auxiliary_loss_mlp": 0.01026918, + "balance_loss_clip": 1.03574669, + "balance_loss_mlp": 1.01582003, + "epoch": 0.9937171200961972, + "flos": 20186843375520.0, + "grad_norm": 2.228130937181352, + "language_loss": 0.76175833, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.78311479, + "num_input_tokens_seen": 356788510, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11102295, + "step": 16528, + "time_per_iteration": 2.605738639831543 + }, + { + "auxiliary_loss_clip": 0.01109764, + "auxiliary_loss_mlp": 0.01029015, + "balance_loss_clip": 1.03645504, + "balance_loss_mlp": 1.01695096, + "epoch": 0.9937772433488652, + "flos": 29761250960160.0, + "grad_norm": 2.6067815348710415, + "language_loss": 0.67939234, + "learning_rate": 4.022808578922898e-10, + "loss": 0.70078009, + "num_input_tokens_seen": 356809115, + "router_z_loss_clip": 0.73339844, + "router_z_loss_mlp": 0.12072754, + "step": 16529, + "time_per_iteration": 4.15263295173645 + }, + { + "auxiliary_loss_clip": 0.01114683, + "auxiliary_loss_mlp": 0.01034474, + "balance_loss_clip": 1.03907943, + "balance_loss_mlp": 1.0219574, + "epoch": 0.9938373666015331, + "flos": 19120013733600.0, + "grad_norm": 2.392991916181526, + "language_loss": 0.65744591, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.67893744, + "num_input_tokens_seen": 356826410, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.12512207, + "step": 16530, + "time_per_iteration": 2.5857861042022705 + }, + { + "auxiliary_loss_clip": 0.0111156, + "auxiliary_loss_mlp": 0.01026987, + "balance_loss_clip": 1.0390234, + "balance_loss_mlp": 1.01662183, + "epoch": 0.9938974898542011, + "flos": 23789571220800.0, + "grad_norm": 2.200433632561264, + "language_loss": 0.71398246, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.73536789, + "num_input_tokens_seen": 356844990, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10357666, + "step": 16531, + "time_per_iteration": 2.6433353424072266 + }, + { + "auxiliary_loss_clip": 0.01112656, + "auxiliary_loss_mlp": 0.01026885, + "balance_loss_clip": 1.03801584, + "balance_loss_mlp": 1.01514935, + "epoch": 0.993957613106869, + "flos": 32832306227520.0, + "grad_norm": 1.62918452048075, + "language_loss": 0.74285591, + "learning_rate": 3.791890207045512e-10, + "loss": 0.76425135, + "num_input_tokens_seen": 356866530, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11743164, + "step": 16532, + "time_per_iteration": 2.642195463180542 + }, + { + "auxiliary_loss_clip": 0.01103956, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.03683782, + "balance_loss_mlp": 1.02025151, + "epoch": 0.994017736359537, + "flos": 18228882133440.0, + "grad_norm": 1.747705665357834, + "language_loss": 0.70870066, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.73004711, + "num_input_tokens_seen": 356884660, + "router_z_loss_clip": 0.67089844, + "router_z_loss_mlp": 0.10437012, + "step": 16533, + "time_per_iteration": 2.625316858291626 + }, + { + "auxiliary_loss_clip": 0.01112923, + "auxiliary_loss_mlp": 0.01028639, + "balance_loss_clip": 1.03891659, + "balance_loss_mlp": 1.01698077, + "epoch": 0.9940778596122051, + "flos": 18763674541920.0, + "grad_norm": 2.496634050300578, + "language_loss": 0.83931911, + "learning_rate": 3.641735912007782e-10, + "loss": 0.86073482, + "num_input_tokens_seen": 356900895, + "router_z_loss_clip": 0.74023438, + "router_z_loss_mlp": 0.11657715, + "step": 16534, + "time_per_iteration": 2.650202512741089 + }, + { + "auxiliary_loss_clip": 0.01104989, + "auxiliary_loss_mlp": 0.01027071, + "balance_loss_clip": 1.03699648, + "balance_loss_mlp": 1.01656318, + "epoch": 0.994137982864873, + "flos": 30783761048160.0, + "grad_norm": 1.4167915912669684, + "language_loss": 0.65822005, + "learning_rate": 3.567796158934211e-10, + "loss": 0.67954063, + "num_input_tokens_seen": 356920985, + "router_z_loss_clip": 0.68066406, + "router_z_loss_mlp": 0.1050415, + "step": 16535, + "time_per_iteration": 2.6973888874053955 + }, + { + "auxiliary_loss_clip": 0.01108652, + "auxiliary_loss_mlp": 0.01026827, + "balance_loss_clip": 1.03936756, + "balance_loss_mlp": 1.01674819, + "epoch": 0.994198106117541, + "flos": 22502967086880.0, + "grad_norm": 1.6441731519881202, + "language_loss": 0.64730084, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.66865557, + "num_input_tokens_seen": 356939800, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.10083008, + "step": 16536, + "time_per_iteration": 2.6018519401550293 + }, + { + "auxiliary_loss_clip": 0.01107825, + "auxiliary_loss_mlp": 0.01031118, + "balance_loss_clip": 1.03718996, + "balance_loss_mlp": 1.01981068, + "epoch": 0.9942582293702089, + "flos": 20321098590240.0, + "grad_norm": 2.1214937567277627, + "language_loss": 0.78955698, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.81094646, + "num_input_tokens_seen": 356957780, + "router_z_loss_clip": 0.70654297, + "router_z_loss_mlp": 0.11315918, + "step": 16537, + "time_per_iteration": 2.617177724838257 + }, + { + "auxiliary_loss_clip": 0.01114607, + "auxiliary_loss_mlp": 0.01029453, + "balance_loss_clip": 1.03778601, + "balance_loss_mlp": 1.01799726, + "epoch": 0.9943183526228769, + "flos": 26776930488480.0, + "grad_norm": 1.7908072344466026, + "language_loss": 0.68880677, + "learning_rate": 3.35052651107004e-10, + "loss": 0.7102474, + "num_input_tokens_seen": 356979185, + "router_z_loss_clip": 0.76806641, + "router_z_loss_mlp": 0.11456299, + "step": 16538, + "time_per_iteration": 2.625900983810425 + }, + { + "auxiliary_loss_clip": 0.01104279, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.03469706, + "balance_loss_mlp": 1.01854396, + "epoch": 0.9943784758755448, + "flos": 29092932648000.0, + "grad_norm": 3.1601028531621833, + "language_loss": 0.75254333, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.77387905, + "num_input_tokens_seen": 356997735, + "router_z_loss_clip": 0.69628906, + "router_z_loss_mlp": 0.10748291, + "step": 16539, + "time_per_iteration": 2.6571125984191895 + }, + { + "auxiliary_loss_clip": 0.01110793, + "auxiliary_loss_mlp": 0.01030971, + "balance_loss_clip": 1.03802085, + "balance_loss_mlp": 1.01935434, + "epoch": 0.9944385991282129, + "flos": 26197818526080.0, + "grad_norm": 2.0238676065456915, + "language_loss": 0.70483613, + "learning_rate": 3.209471449341361e-10, + "loss": 0.72625387, + "num_input_tokens_seen": 357015660, + "router_z_loss_clip": 0.72705078, + "router_z_loss_mlp": 0.11608887, + "step": 16540, + "time_per_iteration": 2.6201834678649902 + }, + { + "auxiliary_loss_clip": 0.01106018, + "auxiliary_loss_mlp": 0.01024183, + "balance_loss_clip": 1.03554988, + "balance_loss_mlp": 1.01430082, + "epoch": 0.9944987223808808, + "flos": 27976029998400.0, + "grad_norm": 2.287150085714004, + "language_loss": 0.75312239, + "learning_rate": 3.140081337600353e-10, + "loss": 0.77442437, + "num_input_tokens_seen": 357034800, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.09881592, + "step": 16541, + "time_per_iteration": 2.714416980743408 + }, + { + "auxiliary_loss_clip": 0.0110784, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.03589189, + "balance_loss_mlp": 1.01804566, + "epoch": 0.9945588456335488, + "flos": 27311884966080.0, + "grad_norm": 2.0095304896764294, + "language_loss": 0.76368624, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.78505462, + "num_input_tokens_seen": 357053785, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10961914, + "step": 16542, + "time_per_iteration": 2.6308064460754395 + }, + { + "auxiliary_loss_clip": 0.01112645, + "auxiliary_loss_mlp": 0.01027877, + "balance_loss_clip": 1.03801489, + "balance_loss_mlp": 1.01637328, + "epoch": 0.9946189688862167, + "flos": 26109341487360.0, + "grad_norm": 2.1323776755129686, + "language_loss": 0.74506044, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.76646566, + "num_input_tokens_seen": 357072025, + "router_z_loss_clip": 0.74609375, + "router_z_loss_mlp": 0.11505127, + "step": 16543, + "time_per_iteration": 2.6634254455566406 + }, + { + "auxiliary_loss_clip": 0.01112673, + "auxiliary_loss_mlp": 0.01027905, + "balance_loss_clip": 1.03817844, + "balance_loss_mlp": 1.01638377, + "epoch": 0.9946790921388847, + "flos": 15150574272960.0, + "grad_norm": 2.183800385952965, + "language_loss": 0.81499648, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.8364023, + "num_input_tokens_seen": 357086960, + "router_z_loss_clip": 0.74462891, + "router_z_loss_mlp": 0.11517334, + "step": 16544, + "time_per_iteration": 2.5675413608551025 + }, + { + "auxiliary_loss_clip": 0.011066, + "auxiliary_loss_mlp": 0.0102763, + "balance_loss_clip": 1.03577399, + "balance_loss_mlp": 1.01653743, + "epoch": 0.9947392153915526, + "flos": 23255224502400.0, + "grad_norm": 1.784868448333381, + "language_loss": 0.7834906, + "learning_rate": 2.870103745831187e-10, + "loss": 0.80483288, + "num_input_tokens_seen": 357105095, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11090088, + "step": 16545, + "time_per_iteration": 2.629758596420288 + }, + { + "auxiliary_loss_clip": 0.01112308, + "auxiliary_loss_mlp": 0.01027733, + "balance_loss_clip": 1.03833461, + "balance_loss_mlp": 1.01631284, + "epoch": 0.9947993386442207, + "flos": 33322698047520.0, + "grad_norm": 2.201727700293355, + "language_loss": 0.72492194, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.74632233, + "num_input_tokens_seen": 357125065, + "router_z_loss_clip": 0.73925781, + "router_z_loss_mlp": 0.11419678, + "step": 16546, + "time_per_iteration": 4.122064828872681 + }, + { + "auxiliary_loss_clip": 0.01106704, + "auxiliary_loss_mlp": 0.01032097, + "balance_loss_clip": 1.03647566, + "balance_loss_mlp": 1.02129102, + "epoch": 0.9948594618968887, + "flos": 25388802751680.0, + "grad_norm": 2.306120724364073, + "language_loss": 0.77455819, + "learning_rate": 2.739664698798716e-10, + "loss": 0.79594618, + "num_input_tokens_seen": 357141600, + "router_z_loss_clip": 0.70214844, + "router_z_loss_mlp": 0.10803223, + "step": 16547, + "time_per_iteration": 2.611268997192383 + }, + { + "auxiliary_loss_clip": 0.01107985, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.03597951, + "balance_loss_mlp": 1.01866055, + "epoch": 0.9949195851495566, + "flos": 28422466920000.0, + "grad_norm": 2.9856135019108305, + "language_loss": 0.70111758, + "learning_rate": 2.67558262122769e-10, + "loss": 0.72249079, + "num_input_tokens_seen": 357157880, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.10668945, + "step": 16548, + "time_per_iteration": 2.636888265609741 + }, + { + "auxiliary_loss_clip": 0.0110974, + "auxiliary_loss_mlp": 0.01029327, + "balance_loss_clip": 1.03793311, + "balance_loss_mlp": 1.01847315, + "epoch": 0.9949797084022246, + "flos": 22592416540320.0, + "grad_norm": 8.478554915099258, + "language_loss": 0.75250852, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.77389926, + "num_input_tokens_seen": 357176705, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10858154, + "step": 16549, + "time_per_iteration": 2.587148427963257 + }, + { + "auxiliary_loss_clip": 0.01113095, + "auxiliary_loss_mlp": 0.01032241, + "balance_loss_clip": 1.03887343, + "balance_loss_mlp": 1.02090466, + "epoch": 0.9950398316548925, + "flos": 37104446351520.0, + "grad_norm": 2.1341681734904836, + "language_loss": 0.7497564, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.77120978, + "num_input_tokens_seen": 357197630, + "router_z_loss_clip": 0.74267578, + "router_z_loss_mlp": 0.11334229, + "step": 16550, + "time_per_iteration": 2.667468547821045 + }, + { + "auxiliary_loss_clip": 0.01108358, + "auxiliary_loss_mlp": 0.01025624, + "balance_loss_clip": 1.03696728, + "balance_loss_mlp": 1.01540732, + "epoch": 0.9950999549075605, + "flos": 24283082871360.0, + "grad_norm": 1.5437459345854148, + "language_loss": 0.77972442, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.80106425, + "num_input_tokens_seen": 357215445, + "router_z_loss_clip": 0.71386719, + "router_z_loss_mlp": 0.10205078, + "step": 16551, + "time_per_iteration": 2.600647211074829 + }, + { + "auxiliary_loss_clip": 0.01103754, + "auxiliary_loss_mlp": 0.0103013, + "balance_loss_clip": 1.03633809, + "balance_loss_mlp": 1.02042091, + "epoch": 0.9951600781602284, + "flos": 21479606136000.0, + "grad_norm": 1.7037889328654228, + "language_loss": 0.66768599, + "learning_rate": 2.426837340270271e-10, + "loss": 0.68902481, + "num_input_tokens_seen": 357234285, + "router_z_loss_clip": 0.67382812, + "router_z_loss_mlp": 0.09710693, + "step": 16552, + "time_per_iteration": 2.6330785751342773 + }, + { + "auxiliary_loss_clip": 0.01110847, + "auxiliary_loss_mlp": 0.01027759, + "balance_loss_clip": 1.03777397, + "balance_loss_mlp": 1.01608229, + "epoch": 0.9952202014128965, + "flos": 35326275396480.0, + "grad_norm": 1.6795349668910875, + "language_loss": 0.8158567, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.83724272, + "num_input_tokens_seen": 357257565, + "router_z_loss_clip": 0.73095703, + "router_z_loss_mlp": 0.11688232, + "step": 16553, + "time_per_iteration": 2.6676185131073 + }, + { + "auxiliary_loss_clip": 0.01027022, + "auxiliary_loss_mlp": 0.01001494, + "balance_loss_clip": 1.00464082, + "balance_loss_mlp": 1.00057304, + "epoch": 0.9952803246655644, + "flos": 86402333599200.0, + "grad_norm": 0.7310524448610864, + "language_loss": 0.57328296, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59356809, + "num_input_tokens_seen": 357320205, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.00920105, + "step": 16554, + "time_per_iteration": 3.349942922592163 + }, + { + "auxiliary_loss_clip": 0.01109975, + "auxiliary_loss_mlp": 0.01032974, + "balance_loss_clip": 1.03769875, + "balance_loss_mlp": 1.02245963, + "epoch": 0.9953404479182324, + "flos": 26599328134560.0, + "grad_norm": 2.774704912867087, + "language_loss": 0.76623267, + "learning_rate": 2.24824062597051e-10, + "loss": 0.78766215, + "num_input_tokens_seen": 357340695, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10522461, + "step": 16555, + "time_per_iteration": 2.701045513153076 + }, + { + "auxiliary_loss_clip": 0.01109548, + "auxiliary_loss_mlp": 0.01031111, + "balance_loss_clip": 1.03695011, + "balance_loss_mlp": 1.02035272, + "epoch": 0.9954005711709003, + "flos": 26768827032480.0, + "grad_norm": 2.065495143159917, + "language_loss": 0.8636533, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.88505995, + "num_input_tokens_seen": 357357505, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.10766602, + "step": 16556, + "time_per_iteration": 2.7017273902893066 + }, + { + "auxiliary_loss_clip": 0.01107238, + "auxiliary_loss_mlp": 0.01028771, + "balance_loss_clip": 1.03710377, + "balance_loss_mlp": 1.01770878, + "epoch": 0.9954606944235683, + "flos": 23616344733120.0, + "grad_norm": 1.8379046789932822, + "language_loss": 0.72776699, + "learning_rate": 2.132967729762125e-10, + "loss": 0.74912709, + "num_input_tokens_seen": 357375395, + "router_z_loss_clip": 0.70166016, + "router_z_loss_mlp": 0.11077881, + "step": 16557, + "time_per_iteration": 4.243994951248169 + }, + { + "auxiliary_loss_clip": 0.01105928, + "auxiliary_loss_mlp": 0.01030399, + "balance_loss_clip": 1.03652167, + "balance_loss_mlp": 1.02011764, + "epoch": 0.9955208176762362, + "flos": 37239633463680.0, + "grad_norm": 1.7924724534088883, + "language_loss": 0.76441085, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.78577417, + "num_input_tokens_seen": 357397375, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.10284424, + "step": 16558, + "time_per_iteration": 2.7190628051757812 + }, + { + "auxiliary_loss_clip": 0.01108435, + "auxiliary_loss_mlp": 0.01028363, + "balance_loss_clip": 1.03612101, + "balance_loss_mlp": 1.01709783, + "epoch": 0.9955809409289043, + "flos": 36616809706560.0, + "grad_norm": 4.193655435136463, + "language_loss": 0.63833785, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.65970588, + "num_input_tokens_seen": 357418880, + "router_z_loss_clip": 0.72216797, + "router_z_loss_mlp": 0.11260986, + "step": 16559, + "time_per_iteration": 2.719820499420166 + }, + { + "auxiliary_loss_clip": 0.0110781, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.03744268, + "balance_loss_mlp": 1.01624787, + "epoch": 0.9956410641815723, + "flos": 26287227462240.0, + "grad_norm": 2.0350126001846554, + "language_loss": 0.74787569, + "learning_rate": 1.965745799148433e-10, + "loss": 0.76922488, + "num_input_tokens_seen": 357438310, + "router_z_loss_clip": 0.70263672, + "router_z_loss_mlp": 0.10858154, + "step": 16560, + "time_per_iteration": 4.18878698348999 + }, + { + "auxiliary_loss_clip": 0.01107598, + "auxiliary_loss_mlp": 0.01029156, + "balance_loss_clip": 1.03707838, + "balance_loss_mlp": 1.01846254, + "epoch": 0.9957011874342402, + "flos": 26465599644480.0, + "grad_norm": 1.8252773986871476, + "language_loss": 0.78812945, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.80949694, + "num_input_tokens_seen": 357457155, + "router_z_loss_clip": 0.70507812, + "router_z_loss_mlp": 0.10693359, + "step": 16561, + "time_per_iteration": 2.6347782611846924 + }, + { + "auxiliary_loss_clip": 0.01106853, + "auxiliary_loss_mlp": 0.01028728, + "balance_loss_clip": 1.03785062, + "balance_loss_mlp": 1.01811886, + "epoch": 0.9957613106869082, + "flos": 21601503580320.0, + "grad_norm": 2.7306922856094187, + "language_loss": 0.65701783, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.67837358, + "num_input_tokens_seen": 357468060, + "router_z_loss_clip": 0.68945312, + "router_z_loss_mlp": 0.10614014, + "step": 16562, + "time_per_iteration": 2.5853209495544434 + }, + { + "auxiliary_loss_clip": 0.01112403, + "auxiliary_loss_mlp": 0.01038417, + "balance_loss_clip": 1.03729498, + "balance_loss_mlp": 1.02634084, + "epoch": 0.9958214339395761, + "flos": 37285168536000.0, + "grad_norm": 1.8945681523865736, + "language_loss": 0.64296925, + "learning_rate": 1.805348815528962e-10, + "loss": 0.66447747, + "num_input_tokens_seen": 357489665, + "router_z_loss_clip": 0.75097656, + "router_z_loss_mlp": 0.12084961, + "step": 16563, + "time_per_iteration": 2.7140352725982666 + }, + { + "auxiliary_loss_clip": 0.01108102, + "auxiliary_loss_mlp": 0.01028454, + "balance_loss_clip": 1.03728068, + "balance_loss_mlp": 1.01708198, + "epoch": 0.9958815571922441, + "flos": 29491646564160.0, + "grad_norm": 2.0323168802039504, + "language_loss": 0.64756984, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.66893542, + "num_input_tokens_seen": 357511975, + "router_z_loss_clip": 0.70849609, + "router_z_loss_mlp": 0.11358643, + "step": 16564, + "time_per_iteration": 2.657444953918457 + }, + { + "auxiliary_loss_clip": 0.01108008, + "auxiliary_loss_mlp": 0.01025088, + "balance_loss_clip": 1.03769064, + "balance_loss_mlp": 1.01393604, + "epoch": 0.995941680444912, + "flos": 18896876307360.0, + "grad_norm": 1.9156398010727844, + "language_loss": 0.74394774, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.7652787, + "num_input_tokens_seen": 357529345, + "router_z_loss_clip": 0.70361328, + "router_z_loss_mlp": 0.1114502, + "step": 16565, + "time_per_iteration": 2.608294725418091 + }, + { + "auxiliary_loss_clip": 0.01109323, + "auxiliary_loss_mlp": 0.01027506, + "balance_loss_clip": 1.03616822, + "balance_loss_mlp": 1.016801, + "epoch": 0.9960018036975801, + "flos": 22721809681440.0, + "grad_norm": 1.767908477441342, + "language_loss": 0.79448509, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.81585342, + "num_input_tokens_seen": 357547615, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.10705566, + "step": 16566, + "time_per_iteration": 2.6314280033111572 + }, + { + "auxiliary_loss_clip": 0.01105545, + "auxiliary_loss_mlp": 0.0102667, + "balance_loss_clip": 1.03601754, + "balance_loss_mlp": 1.01685286, + "epoch": 0.996061926950248, + "flos": 24506625470400.0, + "grad_norm": 1.7425181194323256, + "language_loss": 0.70775759, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.72907972, + "num_input_tokens_seen": 357567380, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 0.09814453, + "step": 16567, + "time_per_iteration": 2.639892578125 + }, + { + "auxiliary_loss_clip": 0.01112131, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.03865647, + "balance_loss_mlp": 1.01853418, + "epoch": 0.996122050202916, + "flos": 29710002951360.0, + "grad_norm": 2.22335684271087, + "language_loss": 0.7825762, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.80399907, + "num_input_tokens_seen": 357586435, + "router_z_loss_clip": 0.73388672, + "router_z_loss_mlp": 0.11621094, + "step": 16568, + "time_per_iteration": 4.1152870655059814 + }, + { + "auxiliary_loss_clip": 0.01104202, + "auxiliary_loss_mlp": 0.01026434, + "balance_loss_clip": 1.03576231, + "balance_loss_mlp": 1.01643276, + "epoch": 0.9961821734555839, + "flos": 29528997145920.0, + "grad_norm": 1.70267745230357, + "language_loss": 0.81987941, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.84118581, + "num_input_tokens_seen": 357604720, + "router_z_loss_clip": 0.68457031, + "router_z_loss_mlp": 0.10015869, + "step": 16569, + "time_per_iteration": 2.6468374729156494 + }, + { + "auxiliary_loss_clip": 0.01107104, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.03803098, + "balance_loss_mlp": 1.01933742, + "epoch": 0.9962422967082519, + "flos": 27618272701920.0, + "grad_norm": 1.8277031575274838, + "language_loss": 0.70362967, + "learning_rate": 1.457630950747468e-10, + "loss": 0.72499913, + "num_input_tokens_seen": 357622345, + "router_z_loss_clip": 0.69091797, + "router_z_loss_mlp": 0.10516357, + "step": 16570, + "time_per_iteration": 2.672116994857788 + }, + { + "auxiliary_loss_clip": 0.01109383, + "auxiliary_loss_mlp": 0.01026542, + "balance_loss_clip": 1.0381937, + "balance_loss_mlp": 1.01546717, + "epoch": 0.9963024199609198, + "flos": 32208915228480.0, + "grad_norm": 1.6449084046063842, + "language_loss": 0.74802095, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.76938021, + "num_input_tokens_seen": 357642710, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11077881, + "step": 16571, + "time_per_iteration": 2.65283203125 + }, + { + "auxiliary_loss_clip": 0.01107037, + "auxiliary_loss_mlp": 0.01030354, + "balance_loss_clip": 1.03553867, + "balance_loss_mlp": 1.01892149, + "epoch": 0.9963625432135879, + "flos": 20232499999680.0, + "grad_norm": 1.799024918315967, + "language_loss": 0.7947036, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.81607747, + "num_input_tokens_seen": 357659870, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.11419678, + "step": 16572, + "time_per_iteration": 2.590052366256714 + }, + { + "auxiliary_loss_clip": 0.01109778, + "auxiliary_loss_mlp": 0.01033357, + "balance_loss_clip": 1.03792691, + "balance_loss_mlp": 1.0219841, + "epoch": 0.9964226664662559, + "flos": 32297432784480.0, + "grad_norm": 1.9158466449609717, + "language_loss": 0.70373464, + "learning_rate": 1.3199841727074e-10, + "loss": 0.72516602, + "num_input_tokens_seen": 357677075, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 0.1137085, + "step": 16573, + "time_per_iteration": 2.649793863296509 + }, + { + "auxiliary_loss_clip": 0.01112116, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.03735662, + "balance_loss_mlp": 1.0198245, + "epoch": 0.9964827897189238, + "flos": 33493534015680.0, + "grad_norm": 1.6954443941803423, + "language_loss": 0.63083154, + "learning_rate": 1.275618614968721e-10, + "loss": 0.65226662, + "num_input_tokens_seen": 357696715, + "router_z_loss_clip": 0.74755859, + "router_z_loss_mlp": 0.11584473, + "step": 16574, + "time_per_iteration": 2.6560890674591064 + }, + { + "auxiliary_loss_clip": 0.01116522, + "auxiliary_loss_mlp": 0.01029863, + "balance_loss_clip": 1.04032862, + "balance_loss_mlp": 1.017555, + "epoch": 0.9965429129715918, + "flos": 14302668260160.0, + "grad_norm": 3.11984599325285, + "language_loss": 0.76027739, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.78174126, + "num_input_tokens_seen": 357712345, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 0.12304688, + "step": 16575, + "time_per_iteration": 2.5689287185668945 + }, + { + "auxiliary_loss_clip": 0.01110203, + "auxiliary_loss_mlp": 0.0102754, + "balance_loss_clip": 1.03848028, + "balance_loss_mlp": 1.01606631, + "epoch": 0.9966030362242597, + "flos": 24106209828480.0, + "grad_norm": 1.740506677049405, + "language_loss": 0.70302415, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.72440159, + "num_input_tokens_seen": 357731815, + "router_z_loss_clip": 0.71679688, + "router_z_loss_mlp": 0.11468506, + "step": 16576, + "time_per_iteration": 2.607362747192383 + }, + { + "auxiliary_loss_clip": 0.01107237, + "auxiliary_loss_mlp": 0.01023707, + "balance_loss_clip": 1.03734398, + "balance_loss_mlp": 1.01212609, + "epoch": 0.9966631594769277, + "flos": 29181571755840.0, + "grad_norm": 1.7370886989644507, + "language_loss": 0.71925598, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.74056542, + "num_input_tokens_seen": 357751640, + "router_z_loss_clip": 0.69873047, + "router_z_loss_mlp": 0.11578369, + "step": 16577, + "time_per_iteration": 2.6346988677978516 + }, + { + "auxiliary_loss_clip": 0.01108945, + "auxiliary_loss_mlp": 0.01034778, + "balance_loss_clip": 1.03663194, + "balance_loss_mlp": 1.02426398, + "epoch": 0.9967232827295956, + "flos": 18985474897920.0, + "grad_norm": 1.8868257771623618, + "language_loss": 0.78797507, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.80941236, + "num_input_tokens_seen": 357769850, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.10516357, + "step": 16578, + "time_per_iteration": 2.627131462097168 + }, + { + "auxiliary_loss_clip": 0.01111128, + "auxiliary_loss_mlp": 0.01031, + "balance_loss_clip": 1.03853059, + "balance_loss_mlp": 1.02021718, + "epoch": 0.9967834059822637, + "flos": 25396176896640.0, + "grad_norm": 1.7689486714882068, + "language_loss": 0.76024812, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.78166938, + "num_input_tokens_seen": 357789550, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.10778809, + "step": 16579, + "time_per_iteration": 2.5947768688201904 + }, + { + "auxiliary_loss_clip": 0.01112763, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.03979027, + "balance_loss_mlp": 1.02240467, + "epoch": 0.9968435292349316, + "flos": 44806290419520.0, + "grad_norm": 2.6744774457189764, + "language_loss": 0.69531739, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.71679407, + "num_input_tokens_seen": 357809525, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.125, + "step": 16580, + "time_per_iteration": 2.7669482231140137 + }, + { + "auxiliary_loss_clip": 0.01108669, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.03684664, + "balance_loss_mlp": 1.01905465, + "epoch": 0.9969036524875996, + "flos": 32654177148960.0, + "grad_norm": 2.090906527147561, + "language_loss": 0.80237913, + "learning_rate": 9.862937031113184e-11, + "loss": 0.82376182, + "num_input_tokens_seen": 357829795, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.10546875, + "step": 16581, + "time_per_iteration": 2.652764320373535 + }, + { + "auxiliary_loss_clip": 0.01105398, + "auxiliary_loss_mlp": 0.01022492, + "balance_loss_clip": 1.03606606, + "balance_loss_mlp": 1.0126276, + "epoch": 0.9969637757402675, + "flos": 30294544229280.0, + "grad_norm": 2.7595553289520645, + "language_loss": 0.803662, + "learning_rate": 9.479950191249031e-11, + "loss": 0.82494092, + "num_input_tokens_seen": 357851655, + "router_z_loss_clip": 0.69287109, + "router_z_loss_mlp": 0.09863281, + "step": 16582, + "time_per_iteration": 2.6863820552825928 + }, + { + "auxiliary_loss_clip": 0.01105875, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.03750145, + "balance_loss_mlp": 1.01903749, + "epoch": 0.9970238989929355, + "flos": 28112149008000.0, + "grad_norm": 1.602280598288843, + "language_loss": 0.60580838, + "learning_rate": 9.104547011951069e-11, + "loss": 0.62716424, + "num_input_tokens_seen": 357871205, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 0.10681152, + "step": 16583, + "time_per_iteration": 2.6278438568115234 + }, + { + "auxiliary_loss_clip": 0.01109872, + "auxiliary_loss_mlp": 0.01030775, + "balance_loss_clip": 1.0377773, + "balance_loss_mlp": 1.01973593, + "epoch": 0.9970840222456034, + "flos": 30870252740160.0, + "grad_norm": 1.9523623771402294, + "language_loss": 0.77483791, + "learning_rate": 8.736727507452357e-11, + "loss": 0.79624438, + "num_input_tokens_seen": 357892145, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.11035156, + "step": 16584, + "time_per_iteration": 2.761094808578491 + }, + { + "auxiliary_loss_clip": 0.01103856, + "auxiliary_loss_mlp": 0.01028246, + "balance_loss_clip": 1.03526044, + "balance_loss_mlp": 1.01810074, + "epoch": 0.9971441454982715, + "flos": 26375542431840.0, + "grad_norm": 1.545631505322263, + "language_loss": 0.69497085, + "learning_rate": 8.376491691697297e-11, + "loss": 0.7162919, + "num_input_tokens_seen": 357911205, + "router_z_loss_clip": 0.68701172, + "router_z_loss_mlp": 0.1015625, + "step": 16585, + "time_per_iteration": 2.6784310340881348 + }, + { + "auxiliary_loss_clip": 0.01109631, + "auxiliary_loss_mlp": 0.01029215, + "balance_loss_clip": 1.03903711, + "balance_loss_mlp": 1.01815271, + "epoch": 0.9972042687509394, + "flos": 18272877549120.0, + "grad_norm": 2.5294190100248337, + "language_loss": 0.82020986, + "learning_rate": 8.023839578363834e-11, + "loss": 0.84159839, + "num_input_tokens_seen": 357928190, + "router_z_loss_clip": 0.70556641, + "router_z_loss_mlp": 0.11071777, + "step": 16586, + "time_per_iteration": 4.109133005142212 + }, + { + "auxiliary_loss_clip": 0.01109633, + "auxiliary_loss_mlp": 0.01030773, + "balance_loss_clip": 1.03721964, + "balance_loss_mlp": 1.02031267, + "epoch": 0.9972643920036074, + "flos": 31490240287680.0, + "grad_norm": 1.6606570695793867, + "language_loss": 0.77924502, + "learning_rate": 7.678771180796851e-11, + "loss": 0.80064905, + "num_input_tokens_seen": 357946985, + "router_z_loss_clip": 0.72460938, + "router_z_loss_mlp": 0.10461426, + "step": 16587, + "time_per_iteration": 2.653456926345825 + }, + { + "auxiliary_loss_clip": 0.01111928, + "auxiliary_loss_mlp": 0.01031995, + "balance_loss_clip": 1.03845489, + "balance_loss_mlp": 1.02080727, + "epoch": 0.9973245152562754, + "flos": 28463383022400.0, + "grad_norm": 1.7534124895072334, + "language_loss": 0.72818387, + "learning_rate": 7.341286512074773e-11, + "loss": 0.74962318, + "num_input_tokens_seen": 357966720, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 0.11181641, + "step": 16588, + "time_per_iteration": 2.647043228149414 + }, + { + "auxiliary_loss_clip": 0.01115417, + "auxiliary_loss_mlp": 0.01026367, + "balance_loss_clip": 1.03966534, + "balance_loss_mlp": 1.0153048, + "epoch": 0.9973846385089433, + "flos": 14844632227200.0, + "grad_norm": 2.680374393697472, + "language_loss": 0.82714707, + "learning_rate": 7.011385585031781e-11, + "loss": 0.84856486, + "num_input_tokens_seen": 357981375, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.1105957, + "step": 16589, + "time_per_iteration": 2.596012592315674 + }, + { + "auxiliary_loss_clip": 0.01113648, + "auxiliary_loss_mlp": 0.01031591, + "balance_loss_clip": 1.03862095, + "balance_loss_mlp": 1.01910424, + "epoch": 0.9974447617616113, + "flos": 24459307637760.0, + "grad_norm": 2.1249614505869747, + "language_loss": 0.70565832, + "learning_rate": 6.689068412168986e-11, + "loss": 0.72711074, + "num_input_tokens_seen": 358000290, + "router_z_loss_clip": 0.75048828, + "router_z_loss_mlp": 0.12475586, + "step": 16590, + "time_per_iteration": 2.5971217155456543 + }, + { + "auxiliary_loss_clip": 0.01112552, + "auxiliary_loss_mlp": 0.01026924, + "balance_loss_clip": 1.03932333, + "balance_loss_mlp": 1.0152235, + "epoch": 0.9975048850142793, + "flos": 39065162768640.0, + "grad_norm": 1.7809644810341652, + "language_loss": 0.63259447, + "learning_rate": 6.374335005676634e-11, + "loss": 0.65398926, + "num_input_tokens_seen": 358022075, + "router_z_loss_clip": 0.73242188, + "router_z_loss_mlp": 0.11712646, + "step": 16591, + "time_per_iteration": 2.6976475715637207 + }, + { + "auxiliary_loss_clip": 0.01106687, + "auxiliary_loss_mlp": 0.01026726, + "balance_loss_clip": 1.03444541, + "balance_loss_mlp": 1.01630747, + "epoch": 0.9975650082669473, + "flos": 45068439636000.0, + "grad_norm": 3.163184761557634, + "language_loss": 0.73061371, + "learning_rate": 6.067185377522933e-11, + "loss": 0.75194776, + "num_input_tokens_seen": 358043940, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10424805, + "step": 16592, + "time_per_iteration": 2.7214841842651367 + }, + { + "auxiliary_loss_clip": 0.01109332, + "auxiliary_loss_mlp": 0.01027419, + "balance_loss_clip": 1.03675914, + "balance_loss_mlp": 1.01561737, + "epoch": 0.9976251315196152, + "flos": 20098974096000.0, + "grad_norm": 2.049757213491613, + "language_loss": 0.85134864, + "learning_rate": 5.767619539343016e-11, + "loss": 0.87271619, + "num_input_tokens_seen": 358062720, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11798096, + "step": 16593, + "time_per_iteration": 2.5945630073547363 + }, + { + "auxiliary_loss_clip": 0.01106565, + "auxiliary_loss_mlp": 0.01029154, + "balance_loss_clip": 1.03769684, + "balance_loss_mlp": 1.01858068, + "epoch": 0.9976852547722832, + "flos": 23972967545760.0, + "grad_norm": 1.7798242812735152, + "language_loss": 0.69839257, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.71974975, + "num_input_tokens_seen": 358081560, + "router_z_loss_clip": 0.68798828, + "router_z_loss_mlp": 0.10577393, + "step": 16594, + "time_per_iteration": 2.613800525665283 + }, + { + "auxiliary_loss_clip": 0.01112542, + "auxiliary_loss_mlp": 0.010257, + "balance_loss_clip": 1.0385735, + "balance_loss_mlp": 1.01466155, + "epoch": 0.9977453780249511, + "flos": 24951239114400.0, + "grad_norm": 2.7149142241993567, + "language_loss": 0.72801095, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.7493934, + "num_input_tokens_seen": 358099065, + "router_z_loss_clip": 0.73974609, + "router_z_loss_mlp": 0.1104126, + "step": 16595, + "time_per_iteration": 2.610635280609131 + }, + { + "auxiliary_loss_clip": 0.01026978, + "auxiliary_loss_mlp": 0.01001355, + "balance_loss_clip": 1.00458765, + "balance_loss_mlp": 1.00046039, + "epoch": 0.9978055012776191, + "flos": 79870011426720.0, + "grad_norm": 0.784577034449336, + "language_loss": 0.60325539, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.62353873, + "num_input_tokens_seen": 358156095, + "router_z_loss_clip": 0.22399902, + "router_z_loss_mlp": 0.00893402, + "step": 16596, + "time_per_iteration": 3.1714131832122803 + }, + { + "auxiliary_loss_clip": 0.01110305, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.03926802, + "balance_loss_mlp": 1.02137232, + "epoch": 0.997865624530287, + "flos": 25174579127040.0, + "grad_norm": 1.7647333829627452, + "language_loss": 0.77498132, + "learning_rate": 4.645194309227385e-11, + "loss": 0.79641294, + "num_input_tokens_seen": 358175230, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11462402, + "step": 16597, + "time_per_iteration": 4.153491258621216 + }, + { + "auxiliary_loss_clip": 0.01110776, + "auxiliary_loss_mlp": 0.01027563, + "balance_loss_clip": 1.03789544, + "balance_loss_mlp": 1.01629221, + "epoch": 0.9979257477829551, + "flos": 35858920389120.0, + "grad_norm": 3.5443529524161486, + "language_loss": 0.82055342, + "learning_rate": 4.383547585562475e-11, + "loss": 0.84193683, + "num_input_tokens_seen": 358197075, + "router_z_loss_clip": 0.72802734, + "router_z_loss_mlp": 0.11260986, + "step": 16598, + "time_per_iteration": 2.6727688312530518 + }, + { + "auxiliary_loss_clip": 0.01113549, + "auxiliary_loss_mlp": 0.01040661, + "balance_loss_clip": 1.03786683, + "balance_loss_mlp": 1.02875233, + "epoch": 0.997985871035623, + "flos": 27619771841280.0, + "grad_norm": 1.8598299961541054, + "language_loss": 0.64389777, + "learning_rate": 4.129484715709175e-11, + "loss": 0.6654399, + "num_input_tokens_seen": 358215925, + "router_z_loss_clip": 0.75683594, + "router_z_loss_mlp": 0.11907959, + "step": 16599, + "time_per_iteration": 2.6381986141204834 + }, + { + "auxiliary_loss_clip": 0.01027046, + "auxiliary_loss_mlp": 0.01001668, + "balance_loss_clip": 1.00467443, + "balance_loss_mlp": 1.00073302, + "epoch": 0.998045994288291, + "flos": 75418243084800.0, + "grad_norm": 0.850306371521773, + "language_loss": 0.62208271, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.64236987, + "num_input_tokens_seen": 358269035, + "router_z_loss_clip": 0.22387695, + "router_z_loss_mlp": 0.00933838, + "step": 16600, + "time_per_iteration": 4.525591135025024 + }, + { + "auxiliary_loss_clip": 0.01107316, + "auxiliary_loss_mlp": 0.01028092, + "balance_loss_clip": 1.03693974, + "balance_loss_mlp": 1.01820338, + "epoch": 0.998106117540959, + "flos": 23483264519520.0, + "grad_norm": 2.50435478261627, + "language_loss": 0.7856952, + "learning_rate": 3.644110575717896e-11, + "loss": 0.80704927, + "num_input_tokens_seen": 358287680, + "router_z_loss_clip": 0.70410156, + "router_z_loss_mlp": 0.09887695, + "step": 16601, + "time_per_iteration": 2.612734317779541 + }, + { + "auxiliary_loss_clip": 0.01113828, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.03823102, + "balance_loss_mlp": 1.02015352, + "epoch": 0.9981662407936269, + "flos": 25753812641280.0, + "grad_norm": 2.518273981456983, + "language_loss": 0.82217389, + "learning_rate": 3.412799323987414e-11, + "loss": 0.84362519, + "num_input_tokens_seen": 358304080, + "router_z_loss_clip": 0.75537109, + "router_z_loss_mlp": 0.1114502, + "step": 16602, + "time_per_iteration": 2.6888346672058105 + }, + { + "auxiliary_loss_clip": 0.01110446, + "auxiliary_loss_mlp": 0.01032549, + "balance_loss_clip": 1.03842008, + "balance_loss_mlp": 1.02136135, + "epoch": 0.998226364046295, + "flos": 29671679954880.0, + "grad_norm": 2.0435431886107622, + "language_loss": 0.62717164, + "learning_rate": 3.189071962883538e-11, + "loss": 0.64860159, + "num_input_tokens_seen": 358323670, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11193848, + "step": 16603, + "time_per_iteration": 2.6409811973571777 + }, + { + "auxiliary_loss_clip": 0.01108762, + "auxiliary_loss_mlp": 0.01028759, + "balance_loss_clip": 1.03664708, + "balance_loss_mlp": 1.01748192, + "epoch": 0.9982864872989629, + "flos": 29085963675840.0, + "grad_norm": 2.075138598643699, + "language_loss": 0.71041286, + "learning_rate": 2.972928500866168e-11, + "loss": 0.7317881, + "num_input_tokens_seen": 358341980, + "router_z_loss_clip": 0.72119141, + "router_z_loss_mlp": 0.112854, + "step": 16604, + "time_per_iteration": 2.679455041885376 + }, + { + "auxiliary_loss_clip": 0.01108401, + "auxiliary_loss_mlp": 0.01025456, + "balance_loss_clip": 1.03610063, + "balance_loss_mlp": 1.01382732, + "epoch": 0.9983466105516309, + "flos": 22369886873280.0, + "grad_norm": 2.4082764722024157, + "language_loss": 0.64335644, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.66469502, + "num_input_tokens_seen": 358360400, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 0.11633301, + "step": 16605, + "time_per_iteration": 2.5970916748046875 + }, + { + "auxiliary_loss_clip": 0.01107836, + "auxiliary_loss_mlp": 0.01027754, + "balance_loss_clip": 1.03751826, + "balance_loss_mlp": 1.01725757, + "epoch": 0.9984067338042988, + "flos": 21032196799680.0, + "grad_norm": 2.525299251711579, + "language_loss": 0.71666384, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.7380197, + "num_input_tokens_seen": 358378990, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 0.1050415, + "step": 16606, + "time_per_iteration": 2.6978139877319336 + }, + { + "auxiliary_loss_clip": 0.01108193, + "auxiliary_loss_mlp": 0.01025116, + "balance_loss_clip": 1.03651547, + "balance_loss_mlp": 1.01423264, + "epoch": 0.9984668570569668, + "flos": 25218979715520.0, + "grad_norm": 3.267059549950028, + "language_loss": 0.81954056, + "learning_rate": 2.370001590090709e-11, + "loss": 0.8408736, + "num_input_tokens_seen": 358395970, + "router_z_loss_clip": 0.71630859, + "router_z_loss_mlp": 0.10888672, + "step": 16607, + "time_per_iteration": 2.6098835468292236 + }, + { + "auxiliary_loss_clip": 0.01109629, + "auxiliary_loss_mlp": 0.01029278, + "balance_loss_clip": 1.03518772, + "balance_loss_mlp": 1.01773858, + "epoch": 0.9985269803096347, + "flos": 36927208653120.0, + "grad_norm": 1.61754019209382, + "language_loss": 0.67105329, + "learning_rate": 2.184193803622669e-11, + "loss": 0.69244242, + "num_input_tokens_seen": 358417355, + "router_z_loss_clip": 0.74365234, + "router_z_loss_mlp": 0.11535645, + "step": 16608, + "time_per_iteration": 4.147508859634399 + }, + { + "auxiliary_loss_clip": 0.01112438, + "auxiliary_loss_mlp": 0.01028445, + "balance_loss_clip": 1.03990495, + "balance_loss_mlp": 1.01714945, + "epoch": 0.9985871035623027, + "flos": 12885779604960.0, + "grad_norm": 2.0038058031696715, + "language_loss": 0.8109448, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.83235359, + "num_input_tokens_seen": 358434345, + "router_z_loss_clip": 0.72509766, + "router_z_loss_mlp": 0.11303711, + "step": 16609, + "time_per_iteration": 2.6068079471588135 + }, + { + "auxiliary_loss_clip": 0.01108937, + "auxiliary_loss_mlp": 0.0103334, + "balance_loss_clip": 1.03588009, + "balance_loss_mlp": 1.02197325, + "epoch": 0.9986472268149706, + "flos": 20588717639520.0, + "grad_norm": 1.584553844906512, + "language_loss": 0.63449192, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.65591466, + "num_input_tokens_seen": 358452870, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.1137085, + "step": 16610, + "time_per_iteration": 2.581821918487549 + }, + { + "auxiliary_loss_clip": 0.01107225, + "auxiliary_loss_mlp": 0.01031128, + "balance_loss_clip": 1.03512144, + "balance_loss_mlp": 1.02056611, + "epoch": 0.9987073500676387, + "flos": 26911064151360.0, + "grad_norm": 2.460637489290875, + "language_loss": 0.67092264, + "learning_rate": 1.672274094288717e-11, + "loss": 0.69230616, + "num_input_tokens_seen": 358472210, + "router_z_loss_clip": 0.72167969, + "router_z_loss_mlp": 0.10571289, + "step": 16611, + "time_per_iteration": 2.656344413757324 + }, + { + "auxiliary_loss_clip": 0.01109659, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.03761959, + "balance_loss_mlp": 1.02213979, + "epoch": 0.9987674733203066, + "flos": 37195962186240.0, + "grad_norm": 1.5399682786814382, + "language_loss": 0.69706112, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.71849579, + "num_input_tokens_seen": 358493840, + "router_z_loss_clip": 0.72021484, + "router_z_loss_mlp": 0.11669922, + "step": 16612, + "time_per_iteration": 2.6917483806610107 + }, + { + "auxiliary_loss_clip": 0.01106656, + "auxiliary_loss_mlp": 0.01029403, + "balance_loss_clip": 1.03723145, + "balance_loss_mlp": 1.01891828, + "epoch": 0.9988275965729746, + "flos": 33855383557440.0, + "grad_norm": 1.4912875030389279, + "language_loss": 0.73972118, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.76108176, + "num_input_tokens_seen": 358515060, + "router_z_loss_clip": 0.69335938, + "router_z_loss_mlp": 0.10491943, + "step": 16613, + "time_per_iteration": 2.67545485496521 + }, + { + "auxiliary_loss_clip": 0.01110007, + "auxiliary_loss_mlp": 0.0102511, + "balance_loss_clip": 1.03716588, + "balance_loss_mlp": 1.0135529, + "epoch": 0.9988877198256426, + "flos": 21383349779520.0, + "grad_norm": 2.1180010898552957, + "language_loss": 0.73731339, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.75866461, + "num_input_tokens_seen": 358528200, + "router_z_loss_clip": 0.72949219, + "router_z_loss_mlp": 0.11547852, + "step": 16614, + "time_per_iteration": 2.5513672828674316 + }, + { + "auxiliary_loss_clip": 0.01109496, + "auxiliary_loss_mlp": 0.01028971, + "balance_loss_clip": 1.03809333, + "balance_loss_mlp": 1.01841533, + "epoch": 0.9989478430783105, + "flos": 25619962599360.0, + "grad_norm": 2.246411487178756, + "language_loss": 0.72976393, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.75114858, + "num_input_tokens_seen": 358548360, + "router_z_loss_clip": 0.71484375, + "router_z_loss_mlp": 0.10552979, + "step": 16615, + "time_per_iteration": 2.6255266666412354 + }, + { + "auxiliary_loss_clip": 0.01113348, + "auxiliary_loss_mlp": 0.01027147, + "balance_loss_clip": 1.03927267, + "balance_loss_mlp": 1.01608479, + "epoch": 0.9990079663309785, + "flos": 16314308547840.0, + "grad_norm": 6.048387489138262, + "language_loss": 0.77376276, + "learning_rate": 9.70753783247069e-12, + "loss": 0.79516768, + "num_input_tokens_seen": 358566270, + "router_z_loss_clip": 0.74072266, + "router_z_loss_mlp": 0.1105957, + "step": 16616, + "time_per_iteration": 2.59051775932312 + }, + { + "auxiliary_loss_clip": 0.0111138, + "auxiliary_loss_mlp": 0.01028322, + "balance_loss_clip": 1.03934288, + "balance_loss_mlp": 1.01756358, + "epoch": 0.9990680895836465, + "flos": 21121808322240.0, + "grad_norm": 1.9508918039782206, + "language_loss": 0.8330754, + "learning_rate": 8.532016508855378e-12, + "loss": 0.85447246, + "num_input_tokens_seen": 358584710, + "router_z_loss_clip": 0.71972656, + "router_z_loss_mlp": 0.10760498, + "step": 16617, + "time_per_iteration": 2.6545283794403076 + }, + { + "auxiliary_loss_clip": 0.01108519, + "auxiliary_loss_mlp": 0.01027367, + "balance_loss_clip": 1.03728604, + "balance_loss_mlp": 1.01689434, + "epoch": 0.9991282128363145, + "flos": 29538802327680.0, + "grad_norm": 1.5601858701009295, + "language_loss": 0.78761268, + "learning_rate": 7.43233506206309e-12, + "loss": 0.80897152, + "num_input_tokens_seen": 358606750, + "router_z_loss_clip": 0.71191406, + "router_z_loss_mlp": 0.10467529, + "step": 16618, + "time_per_iteration": 2.6299824714660645 + }, + { + "auxiliary_loss_clip": 0.01107555, + "auxiliary_loss_mlp": 0.01028162, + "balance_loss_clip": 1.03647137, + "balance_loss_mlp": 1.01714087, + "epoch": 0.9991883360889824, + "flos": 25838481055680.0, + "grad_norm": 2.660872960675552, + "language_loss": 0.74672532, + "learning_rate": 6.408493534060255e-12, + "loss": 0.7680825, + "num_input_tokens_seen": 358624675, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 0.11029053, + "step": 16619, + "time_per_iteration": 2.6093525886535645 + }, + { + "auxiliary_loss_clip": 0.01106448, + "auxiliary_loss_mlp": 0.01026862, + "balance_loss_clip": 1.03694046, + "balance_loss_mlp": 1.01693249, + "epoch": 0.9992484593416504, + "flos": 24283407009600.0, + "grad_norm": 2.259296578218354, + "language_loss": 0.86479086, + "learning_rate": 5.460491963260594e-12, + "loss": 0.8861239, + "num_input_tokens_seen": 358640715, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.0993042, + "step": 16620, + "time_per_iteration": 2.6134767532348633 + }, + { + "auxiliary_loss_clip": 0.01104194, + "auxiliary_loss_mlp": 0.01021833, + "balance_loss_clip": 1.03472459, + "balance_loss_mlp": 1.01164055, + "epoch": 0.9993085825943183, + "flos": 30332421535680.0, + "grad_norm": 2.098115932257004, + "language_loss": 0.72419775, + "learning_rate": 4.58833038607942e-12, + "loss": 0.74545807, + "num_input_tokens_seen": 358659630, + "router_z_loss_clip": 0.69433594, + "router_z_loss_mlp": 0.10192871, + "step": 16621, + "time_per_iteration": 2.646101951599121 + }, + { + "auxiliary_loss_clip": 0.01026797, + "auxiliary_loss_mlp": 0.01001258, + "balance_loss_clip": 1.0044241, + "balance_loss_mlp": 1.00034571, + "epoch": 0.9993687058469863, + "flos": 86980878319680.0, + "grad_norm": 0.7356939109811609, + "language_loss": 0.56446683, + "learning_rate": 3.79200883515729e-12, + "loss": 0.58474737, + "num_input_tokens_seen": 358727840, + "router_z_loss_clip": 0.22375488, + "router_z_loss_mlp": 0.0091095, + "step": 16622, + "time_per_iteration": 3.422578811645508 + }, + { + "auxiliary_loss_clip": 0.01109772, + "auxiliary_loss_mlp": 0.01026555, + "balance_loss_clip": 1.03668022, + "balance_loss_mlp": 1.01533759, + "epoch": 0.9994288290996542, + "flos": 14885629364160.0, + "grad_norm": 4.767292472951718, + "language_loss": 0.70596039, + "learning_rate": 3.071527340914315e-12, + "loss": 0.72732365, + "num_input_tokens_seen": 358744125, + "router_z_loss_clip": 0.72998047, + "router_z_loss_mlp": 0.11224365, + "step": 16623, + "time_per_iteration": 2.5638232231140137 + }, + { + "auxiliary_loss_clip": 0.01107912, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.03688717, + "balance_loss_mlp": 1.01806712, + "epoch": 0.9994889523523223, + "flos": 21829584114720.0, + "grad_norm": 1.7628118887004813, + "language_loss": 0.74689186, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.76826882, + "num_input_tokens_seen": 358761420, + "router_z_loss_clip": 0.70996094, + "router_z_loss_mlp": 0.11700439, + "step": 16624, + "time_per_iteration": 2.627880096435547 + }, + { + "auxiliary_loss_clip": 0.01106616, + "auxiliary_loss_mlp": 0.01028409, + "balance_loss_clip": 1.0348506, + "balance_loss_mlp": 1.01657176, + "epoch": 0.9995490756049902, + "flos": 32432012137440.0, + "grad_norm": 1.8711804894545114, + "language_loss": 0.73681307, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.75816333, + "num_input_tokens_seen": 358782600, + "router_z_loss_clip": 0.71777344, + "router_z_loss_mlp": 0.11834717, + "step": 16625, + "time_per_iteration": 2.6549885272979736 + }, + { + "auxiliary_loss_clip": 0.01106231, + "auxiliary_loss_mlp": 0.01027272, + "balance_loss_clip": 1.0367173, + "balance_loss_mlp": 1.01671648, + "epoch": 0.9996091988576582, + "flos": 27088666505280.0, + "grad_norm": 2.5038964404997275, + "language_loss": 0.77293921, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.79427421, + "num_input_tokens_seen": 358801220, + "router_z_loss_clip": 0.69482422, + "router_z_loss_mlp": 0.10559082, + "step": 16626, + "time_per_iteration": 4.104433059692383 + }, + { + "auxiliary_loss_clip": 0.01107298, + "auxiliary_loss_mlp": 0.0103465, + "balance_loss_clip": 1.03778028, + "balance_loss_mlp": 1.02374792, + "epoch": 0.9996693221103262, + "flos": 33402058698240.0, + "grad_norm": 2.2007935549704203, + "language_loss": 0.82248032, + "learning_rate": 9.480024334429515e-13, + "loss": 0.84389973, + "num_input_tokens_seen": 358819190, + "router_z_loss_clip": 0.69580078, + "router_z_loss_mlp": 0.10888672, + "step": 16627, + "time_per_iteration": 2.6719980239868164 + }, + { + "auxiliary_loss_clip": 0.01113659, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.03902161, + "balance_loss_mlp": 1.02177036, + "epoch": 0.9997294453629941, + "flos": 32787784087200.0, + "grad_norm": 2.1944181780123246, + "language_loss": 0.70713496, + "learning_rate": 6.067215747584952e-13, + "loss": 0.72860706, + "num_input_tokens_seen": 358839850, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11779785, + "step": 16628, + "time_per_iteration": 2.704735279083252 + }, + { + "auxiliary_loss_clip": 0.01108047, + "auxiliary_loss_mlp": 0.01026962, + "balance_loss_clip": 1.03546333, + "balance_loss_mlp": 1.01591754, + "epoch": 0.9997895686156621, + "flos": 28646900899200.0, + "grad_norm": 1.5215038619721915, + "language_loss": 0.75488216, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.77623224, + "num_input_tokens_seen": 358859805, + "router_z_loss_clip": 0.72607422, + "router_z_loss_mlp": 0.1105957, + "step": 16629, + "time_per_iteration": 2.658212184906006 + }, + { + "auxiliary_loss_clip": 0.0111228, + "auxiliary_loss_mlp": 0.01030835, + "balance_loss_clip": 1.03779387, + "balance_loss_mlp": 1.01939678, + "epoch": 0.9998496918683301, + "flos": 24678960577920.0, + "grad_norm": 1.5874081487539107, + "language_loss": 0.60415173, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.62558287, + "num_input_tokens_seen": 358877900, + "router_z_loss_clip": 0.74560547, + "router_z_loss_mlp": 0.11444092, + "step": 16630, + "time_per_iteration": 2.60664701461792 + }, + { + "auxiliary_loss_clip": 0.01112625, + "auxiliary_loss_mlp": 0.01026769, + "balance_loss_clip": 1.0394361, + "balance_loss_mlp": 1.01545024, + "epoch": 0.9999098151209981, + "flos": 26420996469600.0, + "grad_norm": 2.0567041856100188, + "language_loss": 0.60408854, + "learning_rate": 3.792010017100722e-14, + "loss": 0.6254825, + "num_input_tokens_seen": 358897285, + "router_z_loss_clip": 0.73193359, + "router_z_loss_mlp": 0.11328125, + "step": 16631, + "time_per_iteration": 2.6676905155181885 + }, + { + "auxiliary_loss_clip": 0.01106861, + "auxiliary_loss_mlp": 0.0102663, + "balance_loss_clip": 1.03738904, + "balance_loss_mlp": 1.01649094, + "epoch": 0.999969938373666, + "flos": 14086216185120.0, + "grad_norm": 2.1586823119794545, + "language_loss": 0.72689283, + "learning_rate": 0.0, + "loss": 0.74822772, + "num_input_tokens_seen": 358911570, + "router_z_loss_clip": 0.69384766, + "router_z_loss_mlp": 0.10144043, + "step": 16632, + "time_per_iteration": 2.5574147701263428 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.7073231260035318e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}